guipenedo (HF Staff) committed
Commit a5f2bd2 · unverified · 1 Parent(s): 13ccbad
Files changed (1):
  1. app.py +82 -79
app.py CHANGED
@@ -1,25 +1,23 @@
-from concurrent.futures import ThreadPoolExecutor
-import enum
-from functools import partial
+import heapq
 import json
 import os
-from pathlib import Path
 import re
-import heapq
 import tempfile
+from collections import defaultdict
+from concurrent.futures import ThreadPoolExecutor
+from functools import partial
+from pathlib import Path
 from typing import Literal
-import gradio as gr
 
-from collections import defaultdict
-from datatrove.io import get_datafolder
-import plotly.graph_objects as go
-from datatrove.utils.stats import MetricStats, MetricStatsDict
+import gradio as gr
 import plotly.express as px
+import plotly.graph_objects as go
 import tenacity
+from datatrove.io import get_datafolder
+from datatrove.utils.stats import MetricStatsDict
 
-import gradio as gr
-PARTITION_OPTIONS = Literal[ "Top", "Bottom", "Most frequent (n_docs)"]
-METRICS_LOCATION_DEFAULT = os.getenv("METRICS_LOCATION_DEFAULT", "s3://fineweb-stats/summary/")
+PARTITION_OPTIONS = Literal["Top", "Bottom", "Most frequent (n_docs)"]
+METRICS_LOCATION_DEFAULT = os.getenv("METRICS_LOCATION_DEFAULT", "hf://datasets/HuggingFaceFW-Dev/summary-stats-files")
 
 
 def find_folders(base_folder, path):
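The two constants above drive the rest of the app: PARTITION_OPTIONS is a typing.Literal, so type checkers restrict the later `direction` argument to those three strings, and the metrics location can be overridden through the METRICS_LOCATION_DEFAULT environment variable. A standalone sketch of both mechanisms (the s3:// override value is invented for illustration):

import os
from typing import Literal

PARTITION_OPTIONS = Literal["Top", "Bottom", "Most frequent (n_docs)"]

def describe(direction: PARTITION_OPTIONS) -> str:
    # a type checker flags any call site passing a string outside the Literal
    return f"partition: {direction}"

# hypothetical override; otherwise the hf:// default from the diff is used
os.environ["METRICS_LOCATION_DEFAULT"] = "s3://my-bucket/summary/"
print(os.getenv("METRICS_LOCATION_DEFAULT", "hf://datasets/HuggingFaceFW-Dev/summary-stats-files"))
print(describe("Top"))       # fine
# describe("Sideways")       # rejected by a type checker, though not at runtime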
@@ -74,7 +72,7 @@ def fetch_groups(base_folder, datasets, old_groups, type="intersection"):
         return gr.update(choices=[], value=None)
 
     if type == "intersection":
-        new_choices = set.intersection(*(set(g) for g in GROUPS))
+        new_choices = set.intersection(*(set(g) for g in GROUPS))
     else:
         new_choices = set.union(*(set(g) for g in GROUPS))
     value = None
@@ -88,7 +86,8 @@ def fetch_groups(base_folder, datasets, old_groups, type="intersection"):
 
 def fetch_metrics(base_folder, datasets, group, old_metrics, type="intersection"):
     with ThreadPoolExecutor() as executor:
-        metrics = list(executor.map(lambda run: [Path(x).name for x in find_folders(base_folder, f"{run}/{group}")], datasets))
+        metrics = list(
+            executor.map(lambda run: [Path(x).name for x in find_folders(base_folder, f"{run}/{group}")], datasets))
     if len(metrics) == 0:
         return gr.update(choices=[], value=None)
 
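fetch_metrics (like fetch_groups above it) fans the per-run listing out over a thread pool. executor.map returns results in input order even though the calls run concurrently, which is what later makes zip(datasets, data) in update_graph line up. A minimal sketch with an invented stand-in for find_folders:

from concurrent.futures import ThreadPoolExecutor

def list_metric_folders(run: str) -> list[str]:
    # hypothetical stand-in for find_folders(base_folder, f"{run}/{group}")
    return [f"{run}/line_length", f"{run}/word_count"]

with ThreadPoolExecutor() as executor:
    metrics = list(executor.map(list_metric_folders, ["run_a", "run_b"]))

print(metrics)  # results come back in the same order as the inputs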
@@ -106,7 +105,9 @@ def fetch_metrics(base_folder, datasets, group, old_metrics, type="intersection"
 
 def reverse_search(base_folder, possible_datasets, grouping, metric_name):
     with ThreadPoolExecutor() as executor:
-        found_datasets = list(executor.map(lambda dataset: dataset if metric_exists(base_folder, dataset, metric_name, grouping) else None, possible_datasets))
+        found_datasets = list(executor.map(
+            lambda dataset: dataset if metric_exists(base_folder, dataset, metric_name, grouping) else None,
+            possible_datasets))
     found_datasets = [dataset for dataset in found_datasets if dataset is not None]
     return "\n".join(found_datasets)
 
@@ -116,16 +117,16 @@ def reverse_search_add(datasets, reverse_search_results):
     return sorted(list(set(datasets + reverse_search_results.strip().split("\n"))))
 
 
-
 def metric_exists(base_folder, path, metric_name, group_by):
     base_folder = get_datafolder(base_folder)
     return base_folder.exists(f"{path}/{group_by}/{metric_name}/metric.json")
 
+
 @tenacity.retry(stop=tenacity.stop_after_attempt(5))
 def load_metrics(base_folder, path, metric_name, group_by):
     base_folder = get_datafolder(base_folder)
     with base_folder.open(
-        f"{path}/{group_by}/{metric_name}/metric.json",
+        f"{path}/{group_by}/{metric_name}/metric.json",
     ) as f:
         json_metric = json.load(f)
     # No idea why this is necessary, but it is, otherwise the MetricStatsDict is malformed
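load_metrics wraps a remote read in tenacity's retry decorator: any exception triggers a re-invocation, and stop_after_attempt(5) gives up after five tries (tenacity then raises a RetryError). A self-contained sketch of the same pattern, with an invented flaky reader in place of the real remote open:

import tenacity

calls = {"n": 0}

@tenacity.retry(stop=tenacity.stop_after_attempt(5))
def flaky_read() -> str:
    # hypothetical: fails twice, then succeeds, like a transient remote-storage error
    calls["n"] += 1
    if calls["n"] < 3:
        raise IOError("transient read error")
    return '{"some_key": {"total": 1}}'

print(flaky_read(), "after", calls["n"], "attempts")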
@@ -149,6 +150,7 @@ def load_data(dataset_path, base_folder, grouping, metric_name):
     metrics = load_metrics(base_folder, dataset_path, metric_name, grouping)
     return metrics
 
+
 def prepare_for_group_plotting(metric, top_k, direction: PARTITION_OPTIONS, regex: str | None, rounding: int):
     regex_compiled = re.compile(regex) if regex else None
     metric = {key: value for key, value in metric.items() if not regex or regex_compiled.match(key)}
@@ -162,7 +164,6 @@ def prepare_for_group_plotting(metric, top_k, direction: PARTITION_OPTIONS, rege
     else:
         keys = heapq.nsmallest(top_k, means, key=means.get)
 
-
     means = [means[key] for key in keys]
     stds = [metric[key].standard_deviation for key in keys]
     return keys, means, stds
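The Top/Bottom partitioning above is heapq doing the work: nlargest/nsmallest iterate over the dict's keys with means.get as the key function, avoiding a full sort of a potentially huge grouping. A sketch with toy values:

import heapq

means = {"en": 0.91, "fr": 0.85, "de": 0.72, "it": 0.64}  # invented per-group means
top_k = 2

print(heapq.nlargest(top_k, means, key=means.get))   # ['en', 'fr']  ("Top")
print(heapq.nsmallest(top_k, means, key=means.get))  # ['it', 'de']  ("Bottom")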
@@ -181,13 +182,13 @@ def set_alpha(color, alpha):
 
 
 def plot_scatter(
-        data: dict[str, dict[float, float]],
-        metric_name: str,
-        log_scale_x: bool,
-        log_scale_y: bool,
-        normalization: bool,
-        rounding: int,
-        progress: gr.Progress,
+        data: dict[str, dict[float, float]],
+        metric_name: str,
+        log_scale_x: bool,
+        log_scale_y: bool,
+        normalization: bool,
+        rounding: int,
+        progress: gr.Progress,
 ):
     fig = go.Figure()
 
@@ -225,15 +226,15 @@
 
 
 def plot_bars(
-        data: dict[str, list[dict[str, float]]],
-        metric_name: str,
-        top_k: int,
-        direction: PARTITION_OPTIONS,
-        regex: str | None,
-        rounding: int,
-        log_scale_x: bool,
-        log_scale_y: bool,
-        progress: gr.Progress,
+        data: dict[str, list[dict[str, float]]],
+        metric_name: str,
+        top_k: int,
+        direction: PARTITION_OPTIONS,
+        regex: str | None,
+        rounding: int,
+        log_scale_x: bool,
+        log_scale_y: bool,
+        progress: gr.Progress,
 ):
     fig = go.Figure()
     x = []
@@ -243,9 +244,9 @@
     x, y, stds = prepare_for_group_plotting(histogram, top_k, direction, regex, rounding)
 
     fig.add_trace(go.Bar(
-        x=x,
-        y=y,
-        name=f"{name} Mean",
+        x=x,
+        y=y,
+        name=f"{name} Mean",
         marker=dict(color=set_alpha(px.colors.qualitative.Plotly[i % len(px.colors.qualitative.Plotly)], 0.5)),
         error_y=dict(type='data', array=stds, visible=True)
     ))
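The trace above is a standard Plotly bar with data-type error bars, colored with a translucent entry from the qualitative palette. A runnable sketch with invented values and a simplified set_alpha (the real helper is defined earlier in app.py and may differ):

import plotly.express as px
import plotly.graph_objects as go

def set_alpha(color: str, alpha: float) -> str:
    # simplified stand-in: '#RRGGBB' -> 'rgba(r, g, b, a)'
    r, g, b = (int(color[i:i + 2], 16) for i in (1, 3, 5))
    return f"rgba({r}, {g}, {b}, {alpha})"

fig = go.Figure(go.Bar(
    x=["en", "fr", "de"],
    y=[0.91, 0.85, 0.72],
    name="toy-dataset Mean",
    marker=dict(color=set_alpha(px.colors.qualitative.Plotly[0], 0.5)),
    error_y=dict(type="data", array=[0.05, 0.04, 0.08], visible=True),
))
fig.show()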
@@ -266,18 +267,18 @@
 
 
 def update_graph(
-        base_folder,
-        datasets,
-        metric_name,
-        grouping,
-        log_scale_x,
-        log_scale_y,
-        rounding,
-        normalization,
-        top_k,
-        direction,
-        regex,
-        progress=gr.Progress(),
+        base_folder,
+        datasets,
+        metric_name,
+        grouping,
+        log_scale_x,
+        log_scale_y,
+        rounding,
+        normalization,
+        top_k,
+        direction,
+        regex,
+        progress=gr.Progress(),
 ):
     if len(datasets) <= 0 or not metric_name or not grouping:
         return None
@@ -296,9 +297,12 @@ def update_graph(
     )
 
     data = {path: result for path, result in zip(datasets, data)}
-    return plot_data(data, metric_name, normalization, rounding, grouping, top_k, direction, regex, log_scale_x, log_scale_y, progress), data, export_data(data, metric_name)
+    return plot_data(data, metric_name, normalization, rounding, grouping, top_k, direction, regex, log_scale_x,
+                     log_scale_y, progress), data, export_data(data, metric_name)
+
 
-def plot_data(data, metric_name, normalization, rounding, grouping, top_k, direction, regex, log_scale_x, log_scale_y, progress=gr.Progress()):
+def plot_data(data, metric_name, normalization, rounding, grouping, top_k, direction, regex, log_scale_x, log_scale_y,
+              progress=gr.Progress()):
     if rounding is None or top_k is None:
         return None
     graph_fc = (
@@ -306,8 +310,8 @@ def plot_data(data, metric_name, normalization, rounding, grouping, top_k, direc
         if grouping == "histogram"
         else partial(plot_bars, top_k=top_k, direction=direction, regex=regex, rounding=rounding)
     )
-    return graph_fc(data=data, metric_name=metric_name, progress=progress, log_scale_x=log_scale_x, log_scale_y=log_scale_y)
-
+    return graph_fc(data=data, metric_name=metric_name, progress=progress, log_scale_x=log_scale_x,
+                    log_scale_y=log_scale_y)
 
 
 # Create the Gradio interface
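plot_data dispatches through functools.partial: the kwargs that differ between the two plot functions are bound up front, and the shared ones are passed once at the single call site. A sketch of the same shape with toy signatures (the grouping value is just any non-histogram example):

from functools import partial

def plot_scatter(data, normalization):
    return f"scatter(normalization={normalization})"

def plot_bars(data, top_k, direction):
    return f"bars(top_k={top_k}, direction={direction})"

grouping = "fqdn"  # any non-histogram grouping takes the bar path
graph_fc = (
    partial(plot_scatter, normalization=True)
    if grouping == "histogram"
    else partial(plot_bars, top_k=100, direction="Top")
)
print(graph_fc(data={}))  # bars(top_k=100, direction=Top)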
@@ -376,7 +380,6 @@ The data might not be 100% representative, due to the sampling and optimistic me
             multiselect=False,
         )
 
-
         update_button = gr.Button("Update Graph", variant="primary")
 
         with gr.Row():
@@ -414,7 +417,7 @@ The data might not be 100% representative, due to the sampling and optimistic me
             value=100,
             interactive=True,
         )
-
+
         direction_checkbox = gr.Radio(
             label="Partition",
             choices=[
@@ -423,14 +426,14 @@ The data might not be 100% representative, due to the sampling and optimistic me
                 "Most frequent (n_docs)",
             ],
             value="Most frequent (n_docs)",
-        )
+        )
     # Define the graph output
     with gr.Row():
         graph_output = gr.Plot(label="Graph")
-
+
     with gr.Row():
         reverse_search_headline = gr.Markdown(value="# Reverse metrics search")
-
+
     with gr.Row():
         with gr.Column(scale=1):
             # Define the dropdown for grouping
@@ -445,7 +448,7 @@ The data might not be 100% representative, due to the sampling and optimistic me
                 label="Stat name",
                 multiselect=False,
             )
-
+
         with gr.Column(scale=1):
             reverse_search_button = gr.Button("Search")
             reverse_search_add_button = gr.Button("Add to selection")
@@ -457,7 +460,6 @@ The data might not be 100% representative, due to the sampling and optimistic me
                 placeholder="Found datasets containing the group/metric name. You can modify the selection after search by removing unwanted lines and clicking Add to selection"
             )
 
-
     update_button.click(
         fn=update_graph,
         inputs=[
@@ -476,25 +478,24 @@ The data might not be 100% representative, due to the sampling and optimistic me
         outputs=[graph_output, exported_data, export_data_json],
     )
 
-    for inp in [normalization_checkbox, rounding, group_regex, direction_checkbox, top_select, log_scale_x_checkbox, log_scale_y_checkbox]:
+    for inp in [normalization_checkbox, rounding, group_regex, direction_checkbox, top_select, log_scale_x_checkbox,
+                log_scale_y_checkbox]:
         inp.change(
             fn=plot_data,
             inputs=[
-                exported_data,
-                metric_name_dropdown,
-                normalization_checkbox,
-                rounding,
-                grouping_dropdown,
-                top_select,
-                direction_checkbox,
-                group_regex,
-                log_scale_x_checkbox,
-                log_scale_y_checkbox,
-            ],
-            outputs=[graph_output],
-        )
-
-
+                exported_data,
+                metric_name_dropdown,
+                normalization_checkbox,
+                rounding,
+                grouping_dropdown,
+                top_select,
+                direction_checkbox,
+                group_regex,
+                log_scale_x_checkbox,
+                log_scale_y_checkbox,
+            ],
+            outputs=[graph_output],
+        )
 
     datasets_selected.change(
         fn=fetch_groups,
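The loop above wires every display-only control to plot_data, so toggling log scales, rounding, or the partition re-renders from the cached exported_data instead of refetching metrics; only the Update Graph button runs the full update_graph fetch. A minimal sketch of the same wiring in a toy Blocks app (component names here are invented):

import gradio as gr

def replot(cached, log_x):
    return f"replotting {cached} with log_x={log_x}"

with gr.Blocks() as demo:
    cached_data = gr.State({"toy": 1})
    log_x = gr.Checkbox(label="Log scale x")
    out = gr.Textbox()
    for inp in [log_x]:
        inp.change(fn=replot, inputs=[cached_data, inp], outputs=[out])

# demo.launch()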
@@ -526,13 +527,13 @@ The data might not be 100% representative, due to the sampling and optimistic me
         outputs=datasets_selected,
     )
 
-
     datasets_refetch.click(
         fn=fetch_datasets,
         inputs=[base_folder],
         outputs=[datasets, datasets_selected, reverse_grouping_dropdown],
     )
 
+
     def update_datasets_with_regex(regex, selected_runs, all_runs):
         if not regex:
             return
@@ -542,12 +543,14 @@ The data might not be 100% representative, due to the sampling and optimistic me
         dst_union = new_dsts.union(selected_runs or [])
         return gr.update(value=sorted(list(dst_union)))
 
+
     regex_button.click(
         fn=update_datasets_with_regex,
         inputs=[regex_select, datasets_selected, datasets],
         outputs=datasets_selected,
     )
 
+
     def update_grouping_options(grouping):
         if grouping == "histogram":
             return {
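update_datasets_with_regex (only partially visible in the hunks above) matches the entered pattern against all known runs and unions the hits with the current selection. A sketch of that logic, assuming a re.match-based filter and with invented run names:

import re

all_runs = ["cc-2023-50", "cc-2024-10", "redpajama-v2"]
selected = ["redpajama-v2"]
pattern = r"cc-2024-.*"

new_dsts = {run for run in all_runs if re.match(pattern, run)}
print(sorted(new_dsts.union(selected)))  # ['cc-2024-10', 'redpajama-v2']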
@@ -560,13 +563,13 @@ The data might not be 100% representative, due to the sampling and optimistic me
             group_choices: gr.Column(visible=True),
         }
 
+
    grouping_dropdown.select(
        fn=update_grouping_options,
        inputs=[grouping_dropdown],
        outputs=[normalization_checkbox, group_choices],
    )
 
-
# Launch the application
if __name__ == "__main__":
    demo.launch()
 