regex for groups
Browse files
app.py
CHANGED
@@ -2,6 +2,7 @@ from concurrent.futures import ThreadPoolExecutor
|
|
2 |
import enum
|
3 |
from functools import partial
|
4 |
import json
|
|
|
5 |
from pathlib import Path
|
6 |
import re
|
7 |
import tempfile
|
@@ -26,6 +27,8 @@ LOG_SCALE_STATS = {
|
|
26 |
"pages_with_lorem_ipsum",
|
27 |
}
|
28 |
|
|
|
|
|
29 |
|
30 |
def find_folders(base_folder, path):
|
31 |
base_folder = get_datafolder(base_folder)
|
@@ -145,11 +148,12 @@ def prepare_non_grouped_data(dataset_path, base_folder, grouping, stat_name, nor
|
|
145 |
return stats_rounded
|
146 |
|
147 |
|
148 |
-
def prepare_grouped_data(dataset_path, base_folder, grouping, stat_name, top_k, direction: PARTITION_OPTIONS):
|
149 |
import heapq
|
|
|
150 |
|
151 |
stats = load_stats(base_folder, dataset_path, stat_name, grouping)
|
152 |
-
|
153 |
means = {key: value.mean for key, value in stats.items()}
|
154 |
|
155 |
# Use heap to get top_k keys
|
@@ -254,6 +258,7 @@ def update_graph(
|
|
254 |
normalization,
|
255 |
top_k,
|
256 |
direction,
|
|
|
257 |
progress=gr.Progress(),
|
258 |
):
|
259 |
if len(datasets) <= 0 or not stat_name or not grouping:
|
@@ -262,7 +267,7 @@ def update_graph(
|
|
262 |
prepare_fc = (
|
263 |
partial(prepare_non_grouped_data, normalization=normalization)
|
264 |
if grouping == "histogram"
|
265 |
-
else partial(prepare_grouped_data, top_k=top_k, direction=direction)
|
266 |
)
|
267 |
graph_fc = (
|
268 |
partial(plot_scatter, normalization=normalization)
|
@@ -348,20 +353,27 @@ Groupings:
|
|
348 |
)
|
349 |
|
350 |
with gr.Row(visible=False) as group_choices:
|
351 |
-
|
352 |
-
|
353 |
-
|
354 |
-
|
355 |
-
|
356 |
-
|
357 |
-
|
358 |
-
|
359 |
-
|
360 |
-
|
361 |
-
|
362 |
-
|
363 |
-
|
364 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
365 |
|
366 |
update_button = gr.Button("Update Graph", variant="primary")
|
367 |
with gr.Row():
|
@@ -413,6 +425,7 @@ Groupings:
|
|
413 |
normalization_checkbox,
|
414 |
top_select,
|
415 |
direction_checkbox,
|
|
|
416 |
],
|
417 |
outputs=[graph_output, exported_data, export_data_button],
|
418 |
)
|
|
|
2 |
import enum
|
3 |
from functools import partial
|
4 |
import json
|
5 |
+
import os
|
6 |
from pathlib import Path
|
7 |
import re
|
8 |
import tempfile
|
|
|
27 |
"pages_with_lorem_ipsum",
|
28 |
}
|
29 |
|
30 |
+
# Read from the STATS_LOCATION_DEFAULT environment variable; falls back to
# "s3://" when that variable is unset.
# NOTE(review): the consumer of this constant is not visible in this hunk —
# confirm where it is used before changing the fallback.
STATS_LOCATION_DEFAULT = os.getenv("STATS_LOCATION_DEFAULT", "s3://")
|
31 |
+
|
32 |
|
33 |
def find_folders(base_folder, path):
|
34 |
base_folder = get_datafolder(base_folder)
|
|
|
148 |
return stats_rounded
|
149 |
|
150 |
|
151 |
+
def prepare_grouped_data(dataset_path, base_folder, grouping, stat_name, top_k, direction: PARTITION_OPTIONS, regex):
|
152 |
import heapq
|
153 |
+
regex_compiled = re.compile(regex) if regex else None
|
154 |
|
155 |
stats = load_stats(base_folder, dataset_path, stat_name, grouping)
|
156 |
+
stats = {key: value for key, value in stats.items() if not regex or regex_compiled.match(key)}
|
157 |
means = {key: value.mean for key, value in stats.items()}
|
158 |
|
159 |
# Use heap to get top_k keys
|
|
|
258 |
normalization,
|
259 |
top_k,
|
260 |
direction,
|
261 |
+
regex,
|
262 |
progress=gr.Progress(),
|
263 |
):
|
264 |
if len(datasets) <= 0 or not stat_name or not grouping:
|
|
|
267 |
prepare_fc = (
|
268 |
partial(prepare_non_grouped_data, normalization=normalization)
|
269 |
if grouping == "histogram"
|
270 |
+
else partial(prepare_grouped_data, top_k=top_k, direction=direction, regex=regex)
|
271 |
)
|
272 |
graph_fc = (
|
273 |
partial(plot_scatter, normalization=normalization)
|
|
|
353 |
)
|
354 |
|
355 |
with gr.Row(visible=False) as group_choices:
|
356 |
+
with gr.Column(scale=2):
|
357 |
+
group_regex = gr.Text(
|
358 |
+
label="Group Regex",
|
359 |
+
value=None,
|
360 |
+
)
|
361 |
+
with gr.Row():
|
362 |
+
top_select = gr.Number(
|
363 |
+
label="N Groups",
|
364 |
+
value=100,
|
365 |
+
interactive=True,
|
366 |
+
)
|
367 |
+
|
368 |
+
direction_checkbox = gr.Radio(
|
369 |
+
label="Partition",
|
370 |
+
choices=[
|
371 |
+
"Top",
|
372 |
+
"Bottom",
|
373 |
+
"Most frequent (n_docs)",
|
374 |
+
],
|
375 |
+
value="Most frequent (n_docs)",
|
376 |
+
)
|
377 |
|
378 |
update_button = gr.Button("Update Graph", variant="primary")
|
379 |
with gr.Row():
|
|
|
425 |
normalization_checkbox,
|
426 |
top_select,
|
427 |
direction_checkbox,
|
428 |
+
group_regex,
|
429 |
],
|
430 |
outputs=[graph_output, exported_data, export_data_button],
|
431 |
)
|