update for new structure
Browse files
app.py
CHANGED
@@ -14,9 +14,36 @@ LOG_SCALE_STATS = {
|
|
14 |
"length",
|
15 |
"n_lines",
|
16 |
"n_docs",
|
|
|
17 |
"avg_words_per_line",
|
18 |
"pages_with_lorem_ipsum",
|
19 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
|
21 |
|
22 |
def find_folders(base_folder, path):
|
@@ -36,7 +63,7 @@ def find_stats_folders(base_folder: DataFolder):
|
|
36 |
# Then for each of stats.merged take the all but last two parts of the path (grouping/stat_name)
|
37 |
stats_folders = [str(Path(x).parent.parent.parent) for x in stats_merged]
|
38 |
# Finally get the unique paths
|
39 |
-
return list(set(stats_folders))
|
40 |
|
41 |
|
42 |
RUNS = sorted(find_stats_folders(BASE_DATA_FOLDER))
|
@@ -87,7 +114,8 @@ def load_stats(path, stat_name, group_by):
|
|
87 |
return MetricStatsDict() + MetricStatsDict(init=json_stat)
|
88 |
|
89 |
|
90 |
-
def prepare_non_grouped_data(
|
|
|
91 |
stats_rounded = defaultdict(lambda: 0)
|
92 |
for key, value in stats.items():
|
93 |
stats_rounded[float(key)] += value.total
|
@@ -97,19 +125,29 @@ def prepare_non_grouped_data(stats: MetricStatsDict, normalization):
|
|
97 |
return stats_rounded
|
98 |
|
99 |
|
100 |
-
def prepare_grouped_data(
|
101 |
import heapq
|
102 |
|
|
|
|
|
103 |
means = {key: value.mean for key, value in stats.items()}
|
104 |
|
105 |
# Use heap to get top_k keys
|
106 |
if direction == "Top":
|
107 |
keys = heapq.nlargest(top_k, means, key=means.get)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
108 |
else:
|
109 |
keys = heapq.nsmallest(top_k, means, key=means.get)
|
110 |
-
print(keys)
|
111 |
|
112 |
-
return
|
113 |
|
114 |
|
115 |
import math
|
@@ -122,21 +160,7 @@ def plot_scatter(
|
|
122 |
):
|
123 |
fig = go.Figure()
|
124 |
|
125 |
-
|
126 |
-
[
|
127 |
-
"rgba(31, 119, 180, 0.5)",
|
128 |
-
"rgba(255, 127, 14, 0.5)",
|
129 |
-
"rgba(44, 160, 44, 0.5)",
|
130 |
-
"rgba(214, 39, 40, 0.5)",
|
131 |
-
"rgba(148, 103, 189, 0.5)",
|
132 |
-
"rgba(227, 119, 194, 0.5)",
|
133 |
-
"rgba(127, 127, 127, 0.5)",
|
134 |
-
"rgba(188, 189, 34, 0.5)",
|
135 |
-
"rgba(23, 190, 207, 0.5)",
|
136 |
-
]
|
137 |
-
)
|
138 |
-
|
139 |
-
for name, histogram in histograms.items():
|
140 |
if all(isinstance(k, str) for k in histogram.keys()):
|
141 |
x = [k for k, v in sorted(histogram.items(), key=lambda item: item[1])]
|
142 |
else:
|
@@ -145,7 +169,13 @@ def plot_scatter(
|
|
145 |
y = [histogram[k] for k in x]
|
146 |
|
147 |
fig.add_trace(
|
148 |
-
go.Scatter(
|
|
|
|
|
|
|
|
|
|
|
|
|
149 |
)
|
150 |
|
151 |
xaxis_scale = "log" if stat_name in LOG_SCALE_STATS else "linear"
|
@@ -158,19 +188,20 @@ def plot_scatter(
|
|
158 |
xaxis_type=xaxis_scale,
|
159 |
width=1200,
|
160 |
height=600,
|
|
|
161 |
)
|
162 |
|
163 |
return fig
|
164 |
|
165 |
|
166 |
-
def plot_bars(histograms: dict[str,
|
167 |
fig = go.Figure()
|
168 |
|
169 |
-
for name, histogram in histograms.items():
|
170 |
-
x = [k for k, v in
|
171 |
-
y = [
|
172 |
|
173 |
-
fig.add_trace(go.Bar(x=x, y=y, name=name))
|
174 |
|
175 |
fig.update_layout(
|
176 |
title=f"Bar Plots for {stat_name}",
|
@@ -179,6 +210,7 @@ def plot_bars(histograms: dict[str, dict[float, float]], stat_name: str):
|
|
179 |
autosize=True,
|
180 |
width=1200,
|
181 |
height=600,
|
|
|
182 |
)
|
183 |
|
184 |
return fig
|
@@ -203,8 +235,7 @@ def update_graph(
|
|
203 |
|
204 |
print("Loading stats")
|
205 |
histograms = {
|
206 |
-
path: prepare_fc(
|
207 |
-
for path in multiselect_crawls
|
208 |
}
|
209 |
|
210 |
print("Plotting")
|
@@ -266,7 +297,12 @@ Groupings:
|
|
266 |
)
|
267 |
direction_checkbox = gr.Radio(
|
268 |
label="Partition",
|
269 |
-
choices=[
|
|
|
|
|
|
|
|
|
|
|
270 |
)
|
271 |
|
272 |
update_button = gr.Button("Update Graph", variant="primary")
|
|
|
14 |
"length",
|
15 |
"n_lines",
|
16 |
"n_docs",
|
17 |
+
"n_words",
|
18 |
"avg_words_per_line",
|
19 |
"pages_with_lorem_ipsum",
|
20 |
}
|
21 |
+
# Semi-transparent RGBA palette shared by the plotting helpers, which pick a
# colour per trace with ``colors[i % len(colors)]`` so it wraps around when
# there are more traces than entries.
colors = [
    "rgba(31, 119, 180, 0.5)",
    "rgba(255, 127, 14, 0.5)",
    "rgba(44, 160, 44, 0.5)",
    "rgba(214, 39, 40, 0.5)",
    "rgba(148, 103, 189, 0.5)",
    "rgba(227, 119, 194, 0.5)",
    "rgba(127, 127, 127, 0.5)",
    "rgba(188, 189, 34, 0.5)",
    "rgba(23, 190, 207, 0.5)",
    "rgba(255, 193, 7, 0.5)",
    "rgba(40, 167, 69, 0.5)",
    "rgba(23, 162, 184, 0.5)",
    "rgba(108, 117, 125, 0.5)",
    "rgba(0, 123, 255, 0.5)",
    "rgba(220, 53, 69, 0.5)",
    "rgba(255, 159, 67, 0.5)",
    "rgba(255, 87, 34, 0.5)",
    "rgba(41, 182, 246, 0.5)",
    "rgba(142, 36, 170, 0.5)",
    "rgba(0, 188, 212, 0.5)",
    "rgba(255, 235, 59, 0.5)",
    "rgba(156, 39, 176, 0.5)",
]
|
47 |
|
48 |
|
49 |
def find_folders(base_folder, path):
|
|
|
63 |
# Then for each of stats.merged take the all but last two parts of the path (grouping/stat_name)
|
64 |
stats_folders = [str(Path(x).parent.parent.parent) for x in stats_merged]
|
65 |
# Finally get the unique paths
|
66 |
+
return sorted(list(set(stats_folders)))
|
67 |
|
68 |
|
69 |
RUNS = sorted(find_stats_folders(BASE_DATA_FOLDER))
|
|
|
114 |
return MetricStatsDict() + MetricStatsDict(init=json_stat)
|
115 |
|
116 |
|
117 |
+
def prepare_non_grouped_data(path, stat_name, grouping, normalization):
|
118 |
+
stats = load_stats(path, stat_name, grouping)
|
119 |
stats_rounded = defaultdict(lambda: 0)
|
120 |
for key, value in stats.items():
|
121 |
stats_rounded[float(key)] += value.total
|
|
|
125 |
return stats_rounded
|
126 |
|
127 |
|
128 |
+
def prepare_grouped_data(path, stat_name, grouping, top_k, direction):
    """Select up to ``top_k`` groups of a grouped stat and return their means.

    Args:
        path: Run location, passed through unchanged to ``load_stats``.
        stat_name: Name of the stat whose per-group means are returned.
        grouping: Grouping name, passed through unchanged to ``load_stats``.
        top_k: Maximum number of groups to return.
        direction: Selection mode. ``"Top"`` picks the groups with the
            largest means; ``"Most frequent (n_docs)"`` and
            ``"Most frequent (length)"`` pick the groups with the largest
            ``n_docs`` / ``length`` totals; any other value (e.g.
            ``"Bottom"``) picks the groups with the smallest means.

    Returns:
        A list of ``(group_key, mean)`` tuples, ordered as produced by
        ``heapq.nlargest`` / ``heapq.nsmallest`` for the chosen mode.
    """
    import heapq

    # Stat used for ranking in each "Most frequent (...)" mode. Previously the
    # "length" mode also ranked by "n_docs", making both options identical.
    frequency_stats = {
        "Most frequent (n_docs)": "n_docs",
        "Most frequent (length)": "length",
    }

    stats = load_stats(path, stat_name, grouping)
    means = {key: value.mean for key, value in stats.items()}

    if direction == "Top":
        keys = heapq.nlargest(top_k, means, key=means.get)
    elif direction in frequency_stats:
        ranking = load_stats(path, frequency_stats[direction], grouping)
        # Rank only groups that also have a mean for ``stat_name``; a group
        # present solely in the ranking stat would otherwise raise KeyError
        # in the lookup below.
        totals = {
            key: value.total for key, value in ranking.items() if key in means
        }
        keys = heapq.nlargest(top_k, totals, key=totals.get)
    else:
        keys = heapq.nsmallest(top_k, means, key=means.get)

    return [(key, means[key]) for key in keys]
|
151 |
|
152 |
|
153 |
import math
|
|
|
160 |
):
|
161 |
fig = go.Figure()
|
162 |
|
163 |
+
for i, (name, histogram) in enumerate(histograms.items()):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
164 |
if all(isinstance(k, str) for k in histogram.keys()):
|
165 |
x = [k for k, v in sorted(histogram.items(), key=lambda item: item[1])]
|
166 |
else:
|
|
|
169 |
y = [histogram[k] for k in x]
|
170 |
|
171 |
fig.add_trace(
|
172 |
+
go.Scatter(
|
173 |
+
x=x,
|
174 |
+
y=y,
|
175 |
+
mode="lines",
|
176 |
+
name=name,
|
177 |
+
line=dict(color=colors[i % len(colors)]),
|
178 |
+
)
|
179 |
)
|
180 |
|
181 |
xaxis_scale = "log" if stat_name in LOG_SCALE_STATS else "linear"
|
|
|
188 |
xaxis_type=xaxis_scale,
|
189 |
width=1200,
|
190 |
height=600,
|
191 |
+
showlegend=True,
|
192 |
)
|
193 |
|
194 |
return fig
|
195 |
|
196 |
|
197 |
+
def plot_bars(histograms: dict[str, list[tuple[str, float]]], stat_name: str):
|
198 |
fig = go.Figure()
|
199 |
|
200 |
+
for i, (name, histogram) in enumerate(histograms.items()):
|
201 |
+
x = [k for k, v in histogram]
|
202 |
+
y = [v for k, v in histogram]
|
203 |
|
204 |
+
fig.add_trace(go.Bar(x=x, y=y, name=name, marker_color=colors[i % len(colors)]))
|
205 |
|
206 |
fig.update_layout(
|
207 |
title=f"Bar Plots for {stat_name}",
|
|
|
210 |
autosize=True,
|
211 |
width=1200,
|
212 |
height=600,
|
213 |
+
showlegend=True,
|
214 |
)
|
215 |
|
216 |
return fig
|
|
|
235 |
|
236 |
print("Loading stats")
|
237 |
histograms = {
|
238 |
+
path: prepare_fc(path, stat_name, grouping) for path in multiselect_crawls
|
|
|
239 |
}
|
240 |
|
241 |
print("Plotting")
|
|
|
297 |
)
|
298 |
direction_checkbox = gr.Radio(
|
299 |
label="Partition",
|
300 |
+
choices=[
|
301 |
+
"Top",
|
302 |
+
"Bottom",
|
303 |
+
"Most frequent (n_docs)",
|
304 |
+
"Most frequent (length)",
|
305 |
+
],
|
306 |
)
|
307 |
|
308 |
update_button = gr.Button("Update Graph", variant="primary")
|