hynky HF Staff committed on
Commit
f6ba6f9
·
1 Parent(s): bd8b366

regex for groups

Browse files
Files changed (1) hide show
  1. app.py +30 -17
app.py CHANGED
@@ -2,6 +2,7 @@ from concurrent.futures import ThreadPoolExecutor
2
  import enum
3
  from functools import partial
4
  import json
 
5
  from pathlib import Path
6
  import re
7
  import tempfile
@@ -26,6 +27,8 @@ LOG_SCALE_STATS = {
26
  "pages_with_lorem_ipsum",
27
  }
28
 
 
 
29
 
30
  def find_folders(base_folder, path):
31
  base_folder = get_datafolder(base_folder)
@@ -145,11 +148,12 @@ def prepare_non_grouped_data(dataset_path, base_folder, grouping, stat_name, nor
145
  return stats_rounded
146
 
147
 
148
- def prepare_grouped_data(dataset_path, base_folder, grouping, stat_name, top_k, direction: PARTITION_OPTIONS):
149
  import heapq
 
150
 
151
  stats = load_stats(base_folder, dataset_path, stat_name, grouping)
152
-
153
  means = {key: value.mean for key, value in stats.items()}
154
 
155
  # Use heap to get top_k keys
@@ -254,6 +258,7 @@ def update_graph(
254
  normalization,
255
  top_k,
256
  direction,
 
257
  progress=gr.Progress(),
258
  ):
259
  if len(datasets) <= 0 or not stat_name or not grouping:
@@ -262,7 +267,7 @@ def update_graph(
262
  prepare_fc = (
263
  partial(prepare_non_grouped_data, normalization=normalization)
264
  if grouping == "histogram"
265
- else partial(prepare_grouped_data, top_k=top_k, direction=direction)
266
  )
267
  graph_fc = (
268
  partial(plot_scatter, normalization=normalization)
@@ -348,20 +353,27 @@ Groupings:
348
  )
349
 
350
  with gr.Row(visible=False) as group_choices:
351
- top_select = gr.Number(
352
- label="K",
353
- value=100,
354
- interactive=True,
355
- )
356
- direction_checkbox = gr.Radio(
357
- label="Partition",
358
- choices=[
359
- "Top",
360
- "Bottom",
361
- "Most frequent (n_docs)",
362
- ],
363
- value="Top",
364
- )
 
 
 
 
 
 
 
365
 
366
  update_button = gr.Button("Update Graph", variant="primary")
367
  with gr.Row():
@@ -413,6 +425,7 @@ Groupings:
413
  normalization_checkbox,
414
  top_select,
415
  direction_checkbox,
 
416
  ],
417
  outputs=[graph_output, exported_data, export_data_button],
418
  )
 
2
  import enum
3
  from functools import partial
4
  import json
5
+ import os
6
  from pathlib import Path
7
  import re
8
  import tempfile
 
27
  "pages_with_lorem_ipsum",
28
  }
29
 
30
+ STATS_LOCATION_DEFAULT = os.getenv("STATS_LOCATION_DEFAULT", "s3://")
31
+
32
 
33
  def find_folders(base_folder, path):
34
  base_folder = get_datafolder(base_folder)
 
148
  return stats_rounded
149
 
150
 
151
+ def prepare_grouped_data(dataset_path, base_folder, grouping, stat_name, top_k, direction: PARTITION_OPTIONS, regex):
152
  import heapq
153
+ regex_compiled = re.compile(regex) if regex else None
154
 
155
  stats = load_stats(base_folder, dataset_path, stat_name, grouping)
156
+ stats = {key: value for key, value in stats.items() if not regex or regex_compiled.match(key)}
157
  means = {key: value.mean for key, value in stats.items()}
158
 
159
  # Use heap to get top_k keys
 
258
  normalization,
259
  top_k,
260
  direction,
261
+ regex,
262
  progress=gr.Progress(),
263
  ):
264
  if len(datasets) <= 0 or not stat_name or not grouping:
 
267
  prepare_fc = (
268
  partial(prepare_non_grouped_data, normalization=normalization)
269
  if grouping == "histogram"
270
+ else partial(prepare_grouped_data, top_k=top_k, direction=direction, regex=regex)
271
  )
272
  graph_fc = (
273
  partial(plot_scatter, normalization=normalization)
 
353
  )
354
 
355
  with gr.Row(visible=False) as group_choices:
356
+ with gr.Column(scale=2):
357
+ group_regex = gr.Text(
358
+ label="Group Regex",
359
+ value=None,
360
+ )
361
+ with gr.Row():
362
+ top_select = gr.Number(
363
+ label="N Groups",
364
+ value=100,
365
+ interactive=True,
366
+ )
367
+
368
+ direction_checkbox = gr.Radio(
369
+ label="Partition",
370
+ choices=[
371
+ "Top",
372
+ "Bottom",
373
+ "Most frequent (n_docs)",
374
+ ],
375
+ value="Most frequent (n_docs)",
376
+ )
377
 
378
  update_button = gr.Button("Update Graph", variant="primary")
379
  with gr.Row():
 
425
  normalization_checkbox,
426
  top_select,
427
  direction_checkbox,
428
+ group_regex,
429
  ],
430
  outputs=[graph_output, exported_data, export_data_button],
431
  )