hynky HF Staff committed on
Commit
f8b7659
·
1 Parent(s): 4668859

reverse search

Browse files
Files changed (1) hide show
  1. app.py +98 -26
app.py CHANGED
@@ -29,6 +29,8 @@ LOG_SCALE_STATS = {
29
 
30
  def find_folders(base_folder, path):
31
  base_folder = get_datafolder(base_folder)
 
 
32
  return sorted(
33
  [
34
  folder["name"]
@@ -49,9 +51,9 @@ def find_stats_folders(base_folder: str):
49
  return sorted(list(set(stats_folders)))
50
 
51
 
52
- def fetch_runs(base_folder: str):
53
- runs = sorted(find_stats_folders(base_folder))
54
- return runs, gr.update(choices=runs, value=None)
55
 
56
 
57
  def export_data(exported_data):
@@ -64,14 +66,19 @@ def export_data(exported_data):
64
  return gr.update(visible=True, value=temp_path)
65
 
66
 
67
- def fetch_groups(base_folder, datasets, old_groups):
68
- GROUPS = [
69
- [Path(x).name for x in find_folders(base_folder, run)] for run in datasets
70
- ]
 
 
71
  if len(GROUPS) == 0:
72
  return gr.update(choices=[], value=None)
73
 
74
- new_choices = set.intersection(*(set(g) for g in GROUPS))
 
 
 
75
  value = None
76
  if old_groups:
77
  value = list(set.intersection(new_choices, {old_groups}))
@@ -81,15 +88,17 @@ def fetch_groups(base_folder, datasets, old_groups):
81
  return gr.update(choices=sorted(list(new_choices)), value=value)
82
 
83
 
84
- def fetch_stats(base_folder, datasets, group, old_stats):
85
- STATS = [
86
- [Path(x).name for x in find_folders(base_folder, f"{run}/{group}")]
87
- for run in datasets
88
- ]
89
  if len(STATS) == 0:
90
  return gr.update(choices=[], value=None)
91
 
92
- new_possibles_choices = set.intersection(*(set(s) for s in STATS))
 
 
 
93
  value = None
94
  if old_stats:
95
  value = list(set.intersection(new_possibles_choices, {old_stats}))
@@ -98,6 +107,23 @@ def fetch_stats(base_folder, datasets, group, old_stats):
98
  return gr.update(choices=sorted(list(new_possibles_choices)), value=value)
99
 
100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  def load_stats(base_folder, path, stat_name, group_by):
102
  base_folder = get_datafolder(base_folder)
103
  with base_folder.open(
@@ -265,12 +291,13 @@ def update_graph(
265
  with gr.Blocks() as demo:
266
  datasets = gr.State([])
267
  exported_data = gr.State([])
 
268
  with gr.Row():
269
  with gr.Column(scale=2):
270
  # Define the multiselect for crawls
271
  with gr.Row():
272
  with gr.Column(scale=1):
273
- stats_folder = gr.Textbox(
274
  label="Stats Location",
275
  value="s3://fineweb-stats/summary/",
276
  )
@@ -298,10 +325,6 @@ Groupings:
298
  * k: the number of groups to show
299
  * Top/Bottom: the top/bottom k groups are shown
300
  - summary: simply shows the average value of given stat for selected crawls
301
-
302
-
303
-
304
-
305
  """,
306
  )
307
  with gr.Column(scale=1):
@@ -348,11 +371,41 @@ Groupings:
348
  with gr.Row():
349
  # Define the graph output
350
  graph_output = gr.Plot(label="Graph")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
351
 
352
  update_button.click(
353
  fn=update_graph,
354
  inputs=[
355
- stats_folder,
356
  datasets_selected,
357
  stat_name_dropdown,
358
  grouping_dropdown,
@@ -369,22 +422,41 @@ Groupings:
369
  outputs=export_data_json,
370
  )
371
 
372
- datasets_selected.select(
373
  fn=fetch_groups,
374
- inputs=[stats_folder, datasets_selected, grouping_dropdown],
375
  outputs=grouping_dropdown,
376
  )
377
 
378
  grouping_dropdown.select(
379
  fn=fetch_stats,
380
- inputs=[stats_folder, datasets_selected, grouping_dropdown, stat_name_dropdown],
381
  outputs=stat_name_dropdown,
382
  )
383
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
384
  datasets_refetch.click(
385
- fn=fetch_runs,
386
- inputs=[stats_folder],
387
- outputs=[datasets, datasets_selected],
388
  )
389
 
390
  def update_datasets_with_regex(regex, selected_runs, all_runs):
 
29
 
30
  def find_folders(base_folder, path):
31
  base_folder = get_datafolder(base_folder)
32
+ if not base_folder.exists(path):
33
+ return []
34
  return sorted(
35
  [
36
  folder["name"]
 
51
  return sorted(list(set(stats_folders)))
52
 
53
 
54
+ def fetch_datasets(base_folder: str):
55
+ datasets = sorted(find_stats_folders(base_folder))
56
+ return datasets, gr.update(choices=datasets, value=None), fetch_groups(base_folder, datasets, None, "union")
57
 
58
 
59
  def export_data(exported_data):
 
66
  return gr.update(visible=True, value=temp_path)
67
 
68
 
69
+ def fetch_groups(base_folder, datasets, old_groups, type="intersection"):
70
+ if not datasets:
71
+ return gr.update(choices=[], value=None)
72
+
73
+ with ThreadPoolExecutor() as executor:
74
+ GROUPS = list(executor.map(lambda run: [Path(x).name for x in find_folders(base_folder, run)], datasets))
75
  if len(GROUPS) == 0:
76
  return gr.update(choices=[], value=None)
77
 
78
+ if type == "intersection":
79
+ new_choices = set.intersection(*(set(g) for g in GROUPS))
80
+ elif type == "union":
81
+ new_choices = set.union(*(set(g) for g in GROUPS))
82
  value = None
83
  if old_groups:
84
  value = list(set.intersection(new_choices, {old_groups}))
 
88
  return gr.update(choices=sorted(list(new_choices)), value=value)
89
 
90
 
91
+ def fetch_stats(base_folder, datasets, group, old_stats, type="intersection"):
92
+ print("Fetching stats")
93
+ with ThreadPoolExecutor() as executor:
94
+ STATS = list(executor.map(lambda run: [Path(x).name for x in find_folders(base_folder, f"{run}/{group}")], datasets))
 
95
  if len(STATS) == 0:
96
  return gr.update(choices=[], value=None)
97
 
98
+ if type == "intersection":
99
+ new_possibles_choices = set.intersection(*(set(s) for s in STATS))
100
+ elif type == "union":
101
+ new_possibles_choices = set.union(*(set(s) for s in STATS))
102
  value = None
103
  if old_stats:
104
  value = list(set.intersection(new_possibles_choices, {old_stats}))
 
107
  return gr.update(choices=sorted(list(new_possibles_choices)), value=value)
108
 
109
 
110
+ def reverse_search(base_folder, possible_datasets, grouping, stat_name):
111
+ with ThreadPoolExecutor() as executor:
112
+ found_datasets = list(executor.map(lambda dataset: dataset if stat_exists(base_folder, dataset, stat_name, grouping) else None, possible_datasets))
113
+ found_datasets = [dataset for dataset in found_datasets if dataset is not None]
114
+ return "\n".join(found_datasets)
115
+
116
+
117
+ def reverse_search_add(datasets, reverse_search_results):
118
+ datasets = datasets or []
119
+ return sorted(list(set(datasets + reverse_search_results.strip().split("\n"))))
120
+
121
+
122
+
123
+ def stat_exists(base_folder, path, stat_name, group_by):
124
+ base_folder = get_datafolder(base_folder)
125
+ return base_folder.exists(f"{path}/{group_by}/{stat_name}/stats-merged.json")
126
+
127
  def load_stats(base_folder, path, stat_name, group_by):
128
  base_folder = get_datafolder(base_folder)
129
  with base_folder.open(
 
291
  with gr.Blocks() as demo:
292
  datasets = gr.State([])
293
  exported_data = gr.State([])
294
+ stats_headline = gr.Markdown(value="# Stats Exploration")
295
  with gr.Row():
296
  with gr.Column(scale=2):
297
  # Define the multiselect for crawls
298
  with gr.Row():
299
  with gr.Column(scale=1):
300
+ base_folder = gr.Textbox(
301
  label="Stats Location",
302
  value="s3://fineweb-stats/summary/",
303
  )
 
325
  * k: the number of groups to show
326
  * Top/Bottom: the top/bottom k groups are shown
327
  - summary: simply shows the average value of given stat for selected crawls
 
 
 
 
328
  """,
329
  )
330
  with gr.Column(scale=1):
 
371
  with gr.Row():
372
  # Define the graph output
373
  graph_output = gr.Plot(label="Graph")
374
+
375
+ with gr.Row():
376
+ reverse_search_headline = gr.Markdown(value="# Reverse stats search")
377
+
378
+ with gr.Row():
379
+ with gr.Column(scale=1):
380
+ # Define the dropdown for grouping
381
+ reverse_grouping_dropdown = gr.Dropdown(
382
+ choices=[],
383
+ label="Grouping",
384
+ multiselect=False,
385
+ )
386
+ # Define the dropdown for stat_name
387
+ reverse_stat_name_dropdown = gr.Dropdown(
388
+ choices=[],
389
+ label="Stat name",
390
+ multiselect=False,
391
+ )
392
+
393
+ with gr.Column(scale=1):
394
+ reverse_search_button = gr.Button("Search")
395
+ reverse_search_add_button = gr.Button("Add to selection")
396
+
397
+ with gr.Column(scale=2):
398
+ reverse_search_results = gr.Textbox(
399
+ label="Found datasets",
400
+ lines=10,
401
+ )
402
+
403
+
404
 
405
  update_button.click(
406
  fn=update_graph,
407
  inputs=[
408
+ base_folder,
409
  datasets_selected,
410
  stat_name_dropdown,
411
  grouping_dropdown,
 
422
  outputs=export_data_json,
423
  )
424
 
425
+ datasets_selected.change(
426
  fn=fetch_groups,
427
+ inputs=[base_folder, datasets_selected, grouping_dropdown],
428
  outputs=grouping_dropdown,
429
  )
430
 
431
  grouping_dropdown.select(
432
  fn=fetch_stats,
433
+ inputs=[base_folder, datasets_selected, grouping_dropdown, stat_name_dropdown],
434
  outputs=stat_name_dropdown,
435
  )
436
 
437
+ reverse_grouping_dropdown.select(
438
+ fn=partial(fetch_stats, type="union"),
439
+ inputs=[base_folder, datasets, reverse_grouping_dropdown, reverse_stat_name_dropdown],
440
+ outputs=reverse_stat_name_dropdown,
441
+ )
442
+
443
+ reverse_search_button.click(
444
+ fn=reverse_search,
445
+ inputs=[base_folder, datasets, reverse_grouping_dropdown, reverse_stat_name_dropdown],
446
+ outputs=reverse_search_results,
447
+ )
448
+
449
+ reverse_search_add_button.click(
450
+ fn=reverse_search_add,
451
+ inputs=[datasets_selected, reverse_search_results],
452
+ outputs=datasets_selected,
453
+ )
454
+
455
+
456
  datasets_refetch.click(
457
+ fn=fetch_datasets,
458
+ inputs=[base_folder],
459
+ outputs=[datasets, datasets_selected, reverse_grouping_dropdown],
460
  )
461
 
462
  def update_datasets_with_regex(regex, selected_runs, all_runs):