Datasets-Metrics-Viewer

Running

App Files Files Community

hynky HF Staff committed on Apr 6, 2024

Commit

c76a4d8

1 Parent(s): 6c72e3f

update for new structure

Browse files

Files changed (1) hide show

app.py +65 -29

app.py CHANGED Viewed

@@ -14,9 +14,36 @@ LOG_SCALE_STATS = {
     "length",
     "n_lines",
     "n_docs",
     "avg_words_per_line",
     "pages_with_lorem_ipsum",
 }
 def find_folders(base_folder, path):
@@ -36,7 +63,7 @@ def find_stats_folders(base_folder: DataFolder):
     # Then, for each stats.merged path, drop its last three components (grouping/stat_name/file)
     stats_folders = [str(Path(x).parent.parent.parent) for x in stats_merged]
     # Finally get the unique paths
-    return list(set(stats_folders))
 RUNS = sorted(find_stats_folders(BASE_DATA_FOLDER))
@@ -87,7 +114,8 @@ def load_stats(path, stat_name, group_by):
         return MetricStatsDict() + MetricStatsDict(init=json_stat)
-def prepare_non_grouped_data(stats: MetricStatsDict, normalization):
     stats_rounded = defaultdict(lambda: 0)
     for key, value in stats.items():
         stats_rounded[float(key)] += value.total
@@ -97,19 +125,29 @@ def prepare_non_grouped_data(stats: MetricStatsDict, normalization):
     return stats_rounded
-def prepare_grouped_data(stats: MetricStatsDict, top_k, direction):
     import heapq
     means = {key: value.mean for key, value in stats.items()}
     # Use heap to get top_k keys
     if direction == "Top":
         keys = heapq.nlargest(top_k, means, key=means.get)
     else:
         keys = heapq.nsmallest(top_k, means, key=means.get)
-    print(keys)
-    return {key: means[key] for key in keys}
 import math
@@ -122,21 +160,7 @@ def plot_scatter(
 ):
     fig = go.Figure()
-    colors = iter(
-        [
-            "rgba(31, 119, 180, 0.5)",
-            "rgba(255, 127, 14, 0.5)",
-            "rgba(44, 160, 44, 0.5)",
-            "rgba(214, 39, 40, 0.5)",
-            "rgba(148, 103, 189, 0.5)",
-            "rgba(227, 119, 194, 0.5)",
-            "rgba(127, 127, 127, 0.5)",
-            "rgba(188, 189, 34, 0.5)",
-            "rgba(23, 190, 207, 0.5)",
-        ]
-    )
-    for name, histogram in histograms.items():
         if all(isinstance(k, str) for k in histogram.keys()):
             x = [k for k, v in sorted(histogram.items(), key=lambda item: item[1])]
         else:
@@ -145,7 +169,13 @@ def plot_scatter(
         y = [histogram[k] for k in x]
         fig.add_trace(
-            go.Scatter(x=x, y=y, mode="lines", name=name, line=dict(color=next(colors)))
         )
     xaxis_scale = "log" if stat_name in LOG_SCALE_STATS else "linear"
@@ -158,19 +188,20 @@ def plot_scatter(
         xaxis_type=xaxis_scale,
         width=1200,
         height=600,
     )
     return fig
-def plot_bars(histograms: dict[str, dict[float, float]], stat_name: str):
     fig = go.Figure()
-    for name, histogram in histograms.items():
-        x = [k for k, v in sorted(histogram.items(), key=lambda item: item[1])]
-        y = [histogram[k] for k in x]
-        fig.add_trace(go.Bar(x=x, y=y, name=name))
     fig.update_layout(
         title=f"Bar Plots for {stat_name}",
@@ -179,6 +210,7 @@ def plot_bars(histograms: dict[str, dict[float, float]], stat_name: str):
         autosize=True,
         width=1200,
         height=600,
     )
     return fig
@@ -203,8 +235,7 @@ def update_graph(
     print("Loading stats")
     histograms = {
-        path: prepare_fc(load_stats(path, stat_name, grouping))
-        for path in multiselect_crawls
     }
     print("Plotting")
@@ -266,7 +297,12 @@ Groupings:
                 )
                 direction_checkbox = gr.Radio(
                     label="Partition",
-                    choices=["Top", "Bottom"],
                 )
             update_button = gr.Button("Update Graph", variant="primary")

     "length",
     "n_lines",
     "n_docs",
+    "n_words",
     "avg_words_per_line",
     "pages_with_lorem_ipsum",
 }
+colors = list(
+    [
+        "rgba(31, 119, 180, 0.5)",
+        "rgba(255, 127, 14, 0.5)",
+        "rgba(44, 160, 44, 0.5)",
+        "rgba(214, 39, 40, 0.5)",
+        "rgba(148, 103, 189, 0.5)",
+        "rgba(227, 119, 194, 0.5)",
+        "rgba(127, 127, 127, 0.5)",
+        "rgba(188, 189, 34, 0.5)",
+        "rgba(23, 190, 207, 0.5)",
+        "rgba(255, 193, 7, 0.5)",
+        "rgba(40, 167, 69, 0.5)",
+        "rgba(23, 162, 184, 0.5)",
+        "rgba(108, 117, 125, 0.5)",
+        "rgba(0, 123, 255, 0.5)",
+        "rgba(220, 53, 69, 0.5)",
+        "rgba(255, 159, 67, 0.5)",
+        "rgba(255, 87, 34, 0.5)",
+        "rgba(41, 182, 246, 0.5)",
+        "rgba(142, 36, 170, 0.5)",
+        "rgba(0, 188, 212, 0.5)",
+        "rgba(255, 235, 59, 0.5)",
+        "rgba(156, 39, 176, 0.5)",
+    ]
+)
 def find_folders(base_folder, path):
     # Then, for each stats.merged path, drop its last three components (grouping/stat_name/file)
     stats_folders = [str(Path(x).parent.parent.parent) for x in stats_merged]
     # Finally get the unique paths
+    return sorted(list(set(stats_folders)))
 RUNS = sorted(find_stats_folders(BASE_DATA_FOLDER))
         return MetricStatsDict() + MetricStatsDict(init=json_stat)
+def prepare_non_grouped_data(path, stat_name, grouping, normalization):
+    stats = load_stats(path, stat_name, grouping)
     stats_rounded = defaultdict(lambda: 0)
     for key, value in stats.items():
         stats_rounded[float(key)] += value.total
     return stats_rounded
+def prepare_grouped_data(path, stat_name, grouping, top_k, direction):
     import heapq
+    stats = load_stats(path, stat_name, grouping)
     means = {key: value.mean for key, value in stats.items()}
     # Use heap to get top_k keys
     if direction == "Top":
         keys = heapq.nlargest(top_k, means, key=means.get)
+    elif direction == "Most frequent (n_docs)":
+        n_docs = load_stats(path, "n_docs", grouping)
+        totals = {key: value.total for key, value in n_docs.items()}
+        keys = heapq.nlargest(top_k, totals, key=totals.get)
+    elif direction == "Most frequent (length)":
+        n_docs = load_stats(path, "n_docs", grouping)
+        totals = {key: value.total for key, value in n_docs.items()}
+        keys = heapq.nlargest(top_k, totals, key=totals.get)
     else:
         keys = heapq.nsmallest(top_k, means, key=means.get)
+    return [(key, means[key]) for key in keys]
 import math
 ):
     fig = go.Figure()
+    for i, (name, histogram) in enumerate(histograms.items()):
         if all(isinstance(k, str) for k in histogram.keys()):
             x = [k for k, v in sorted(histogram.items(), key=lambda item: item[1])]
         else:
         y = [histogram[k] for k in x]
         fig.add_trace(
+            go.Scatter(
+                x=x,
+                y=y,
+                mode="lines",
+                name=name,
+                line=dict(color=colors[i % len(colors)]),
+            )
         )
     xaxis_scale = "log" if stat_name in LOG_SCALE_STATS else "linear"
         xaxis_type=xaxis_scale,
         width=1200,
         height=600,
+        showlegend=True,
     )
     return fig
+def plot_bars(histograms: dict[str, list[tuple[str, float]]], stat_name: str):
     fig = go.Figure()
+    for i, (name, histogram) in enumerate(histograms.items()):
+        x = [k for k, v in histogram]
+        y = [v for k, v in histogram]
+        fig.add_trace(go.Bar(x=x, y=y, name=name, marker_color=colors[i % len(colors)]))
     fig.update_layout(
         title=f"Bar Plots for {stat_name}",
         autosize=True,
         width=1200,
         height=600,
+        showlegend=True,
     )
     return fig
     print("Loading stats")
     histograms = {
+        path: prepare_fc(path, stat_name, grouping) for path in multiselect_crawls
     }
     print("Plotting")
                 )
                 direction_checkbox = gr.Radio(
                     label="Partition",
+                    choices=[
+                        "Top",
+                        "Bottom",
+                        "Most frequent (n_docs)",
+                        "Most frequent (length)",
+                    ],
                 )
             update_button = gr.Button("Update Graph", variant="primary")