regex for groups
Browse files
app.py
CHANGED
|
@@ -2,6 +2,7 @@ from concurrent.futures import ThreadPoolExecutor
|
|
| 2 |
import enum
|
| 3 |
from functools import partial
|
| 4 |
import json
|
|
|
|
| 5 |
from pathlib import Path
|
| 6 |
import re
|
| 7 |
import tempfile
|
|
@@ -26,6 +27,8 @@ LOG_SCALE_STATS = {
|
|
| 26 |
"pages_with_lorem_ipsum",
|
| 27 |
}
|
| 28 |
|
|
|
|
|
|
|
| 29 |
|
| 30 |
def find_folders(base_folder, path):
|
| 31 |
base_folder = get_datafolder(base_folder)
|
|
@@ -145,11 +148,12 @@ def prepare_non_grouped_data(dataset_path, base_folder, grouping, stat_name, nor
|
|
| 145 |
return stats_rounded
|
| 146 |
|
| 147 |
|
| 148 |
-
def prepare_grouped_data(dataset_path, base_folder, grouping, stat_name, top_k, direction: PARTITION_OPTIONS):
|
| 149 |
import heapq
|
|
|
|
| 150 |
|
| 151 |
stats = load_stats(base_folder, dataset_path, stat_name, grouping)
|
| 152 |
-
|
| 153 |
means = {key: value.mean for key, value in stats.items()}
|
| 154 |
|
| 155 |
# Use heap to get top_k keys
|
|
@@ -254,6 +258,7 @@ def update_graph(
|
|
| 254 |
normalization,
|
| 255 |
top_k,
|
| 256 |
direction,
|
|
|
|
| 257 |
progress=gr.Progress(),
|
| 258 |
):
|
| 259 |
if len(datasets) <= 0 or not stat_name or not grouping:
|
|
@@ -262,7 +267,7 @@ def update_graph(
|
|
| 262 |
prepare_fc = (
|
| 263 |
partial(prepare_non_grouped_data, normalization=normalization)
|
| 264 |
if grouping == "histogram"
|
| 265 |
-
else partial(prepare_grouped_data, top_k=top_k, direction=direction)
|
| 266 |
)
|
| 267 |
graph_fc = (
|
| 268 |
partial(plot_scatter, normalization=normalization)
|
|
@@ -348,20 +353,27 @@ Groupings:
|
|
| 348 |
)
|
| 349 |
|
| 350 |
with gr.Row(visible=False) as group_choices:
|
| 351 |
-
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
|
| 355 |
-
|
| 356 |
-
|
| 357 |
-
|
| 358 |
-
|
| 359 |
-
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 365 |
|
| 366 |
update_button = gr.Button("Update Graph", variant="primary")
|
| 367 |
with gr.Row():
|
|
@@ -413,6 +425,7 @@ Groupings:
|
|
| 413 |
normalization_checkbox,
|
| 414 |
top_select,
|
| 415 |
direction_checkbox,
|
|
|
|
| 416 |
],
|
| 417 |
outputs=[graph_output, exported_data, export_data_button],
|
| 418 |
)
|
|
|
|
| 2 |
import enum
|
| 3 |
from functools import partial
|
| 4 |
import json
|
| 5 |
+
import os
|
| 6 |
from pathlib import Path
|
| 7 |
import re
|
| 8 |
import tempfile
|
|
|
|
| 27 |
"pages_with_lorem_ipsum",
|
| 28 |
}
|
| 29 |
|
| 30 |
+
STATS_LOCATION_DEFAULT = os.getenv("STATS_LOCATION_DEFAULT", "s3://")
|
| 31 |
+
|
| 32 |
|
| 33 |
def find_folders(base_folder, path):
|
| 34 |
base_folder = get_datafolder(base_folder)
|
|
|
|
| 148 |
return stats_rounded
|
| 149 |
|
| 150 |
|
| 151 |
+
def prepare_grouped_data(dataset_path, base_folder, grouping, stat_name, top_k, direction: PARTITION_OPTIONS, regex):
|
| 152 |
import heapq
|
| 153 |
+
regex_compiled = re.compile(regex) if regex else None
|
| 154 |
|
| 155 |
stats = load_stats(base_folder, dataset_path, stat_name, grouping)
|
| 156 |
+
stats = {key: value for key, value in stats.items() if not regex or regex_compiled.match(key)}
|
| 157 |
means = {key: value.mean for key, value in stats.items()}
|
| 158 |
|
| 159 |
# Use heap to get top_k keys
|
|
|
|
| 258 |
normalization,
|
| 259 |
top_k,
|
| 260 |
direction,
|
| 261 |
+
regex,
|
| 262 |
progress=gr.Progress(),
|
| 263 |
):
|
| 264 |
if len(datasets) <= 0 or not stat_name or not grouping:
|
|
|
|
| 267 |
prepare_fc = (
|
| 268 |
partial(prepare_non_grouped_data, normalization=normalization)
|
| 269 |
if grouping == "histogram"
|
| 270 |
+
else partial(prepare_grouped_data, top_k=top_k, direction=direction, regex=regex)
|
| 271 |
)
|
| 272 |
graph_fc = (
|
| 273 |
partial(plot_scatter, normalization=normalization)
|
|
|
|
| 353 |
)
|
| 354 |
|
| 355 |
with gr.Row(visible=False) as group_choices:
|
| 356 |
+
with gr.Column(scale=2):
|
| 357 |
+
group_regex = gr.Text(
|
| 358 |
+
label="Group Regex",
|
| 359 |
+
value=None,
|
| 360 |
+
)
|
| 361 |
+
with gr.Row():
|
| 362 |
+
top_select = gr.Number(
|
| 363 |
+
label="N Groups",
|
| 364 |
+
value=100,
|
| 365 |
+
interactive=True,
|
| 366 |
+
)
|
| 367 |
+
|
| 368 |
+
direction_checkbox = gr.Radio(
|
| 369 |
+
label="Partition",
|
| 370 |
+
choices=[
|
| 371 |
+
"Top",
|
| 372 |
+
"Bottom",
|
| 373 |
+
"Most frequent (n_docs)",
|
| 374 |
+
],
|
| 375 |
+
value="Most frequent (n_docs)",
|
| 376 |
+
)
|
| 377 |
|
| 378 |
update_button = gr.Button("Update Graph", variant="primary")
|
| 379 |
with gr.Row():
|
|
|
|
| 425 |
normalization_checkbox,
|
| 426 |
top_select,
|
| 427 |
direction_checkbox,
|
| 428 |
+
group_regex,
|
| 429 |
],
|
| 430 |
outputs=[graph_output, exported_data, export_data_button],
|
| 431 |
)
|