hynky HF Staff committed on
Commit
c76a4d8
·
1 Parent(s): 6c72e3f

update for new structure

Browse files
Files changed (1) hide show
  1. app.py +65 -29
app.py CHANGED
@@ -14,9 +14,36 @@ LOG_SCALE_STATS = {
14
  "length",
15
  "n_lines",
16
  "n_docs",
 
17
  "avg_words_per_line",
18
  "pages_with_lorem_ipsum",
19
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
 
22
  def find_folders(base_folder, path):
@@ -36,7 +63,7 @@ def find_stats_folders(base_folder: DataFolder):
36
  # Then, for each stats.merged path, drop the file name and the last two directories (grouping/stat_name)
37
  stats_folders = [str(Path(x).parent.parent.parent) for x in stats_merged]
38
  # Finally get the unique paths
39
- return list(set(stats_folders))
40
 
41
 
42
  RUNS = sorted(find_stats_folders(BASE_DATA_FOLDER))
@@ -87,7 +114,8 @@ def load_stats(path, stat_name, group_by):
87
  return MetricStatsDict() + MetricStatsDict(init=json_stat)
88
 
89
 
90
- def prepare_non_grouped_data(stats: MetricStatsDict, normalization):
 
91
  stats_rounded = defaultdict(lambda: 0)
92
  for key, value in stats.items():
93
  stats_rounded[float(key)] += value.total
@@ -97,19 +125,29 @@ def prepare_non_grouped_data(stats: MetricStatsDict, normalization):
97
  return stats_rounded
98
 
99
 
100
- def prepare_grouped_data(stats: MetricStatsDict, top_k, direction):
101
  import heapq
102
 
 
 
103
  means = {key: value.mean for key, value in stats.items()}
104
 
105
  # Use heap to get top_k keys
106
  if direction == "Top":
107
  keys = heapq.nlargest(top_k, means, key=means.get)
 
 
 
 
 
 
 
 
 
108
  else:
109
  keys = heapq.nsmallest(top_k, means, key=means.get)
110
- print(keys)
111
 
112
- return {key: means[key] for key in keys}
113
 
114
 
115
  import math
@@ -122,21 +160,7 @@ def plot_scatter(
122
  ):
123
  fig = go.Figure()
124
 
125
- colors = iter(
126
- [
127
- "rgba(31, 119, 180, 0.5)",
128
- "rgba(255, 127, 14, 0.5)",
129
- "rgba(44, 160, 44, 0.5)",
130
- "rgba(214, 39, 40, 0.5)",
131
- "rgba(148, 103, 189, 0.5)",
132
- "rgba(227, 119, 194, 0.5)",
133
- "rgba(127, 127, 127, 0.5)",
134
- "rgba(188, 189, 34, 0.5)",
135
- "rgba(23, 190, 207, 0.5)",
136
- ]
137
- )
138
-
139
- for name, histogram in histograms.items():
140
  if all(isinstance(k, str) for k in histogram.keys()):
141
  x = [k for k, v in sorted(histogram.items(), key=lambda item: item[1])]
142
  else:
@@ -145,7 +169,13 @@ def plot_scatter(
145
  y = [histogram[k] for k in x]
146
 
147
  fig.add_trace(
148
- go.Scatter(x=x, y=y, mode="lines", name=name, line=dict(color=next(colors)))
 
 
 
 
 
 
149
  )
150
 
151
  xaxis_scale = "log" if stat_name in LOG_SCALE_STATS else "linear"
@@ -158,19 +188,20 @@ def plot_scatter(
158
  xaxis_type=xaxis_scale,
159
  width=1200,
160
  height=600,
 
161
  )
162
 
163
  return fig
164
 
165
 
166
- def plot_bars(histograms: dict[str, dict[float, float]], stat_name: str):
167
  fig = go.Figure()
168
 
169
- for name, histogram in histograms.items():
170
- x = [k for k, v in sorted(histogram.items(), key=lambda item: item[1])]
171
- y = [histogram[k] for k in x]
172
 
173
- fig.add_trace(go.Bar(x=x, y=y, name=name))
174
 
175
  fig.update_layout(
176
  title=f"Bar Plots for {stat_name}",
@@ -179,6 +210,7 @@ def plot_bars(histograms: dict[str, dict[float, float]], stat_name: str):
179
  autosize=True,
180
  width=1200,
181
  height=600,
 
182
  )
183
 
184
  return fig
@@ -203,8 +235,7 @@ def update_graph(
203
 
204
  print("Loading stats")
205
  histograms = {
206
- path: prepare_fc(load_stats(path, stat_name, grouping))
207
- for path in multiselect_crawls
208
  }
209
 
210
  print("Plotting")
@@ -266,7 +297,12 @@ Groupings:
266
  )
267
  direction_checkbox = gr.Radio(
268
  label="Partition",
269
- choices=["Top", "Bottom"],
 
 
 
 
 
270
  )
271
 
272
  update_button = gr.Button("Update Graph", variant="primary")
 
14
  "length",
15
  "n_lines",
16
  "n_docs",
17
+ "n_words",
18
  "avg_words_per_line",
19
  "pages_with_lorem_ipsum",
20
  }
21
+ colors = list(
22
+ [
23
+ "rgba(31, 119, 180, 0.5)",
24
+ "rgba(255, 127, 14, 0.5)",
25
+ "rgba(44, 160, 44, 0.5)",
26
+ "rgba(214, 39, 40, 0.5)",
27
+ "rgba(148, 103, 189, 0.5)",
28
+ "rgba(227, 119, 194, 0.5)",
29
+ "rgba(127, 127, 127, 0.5)",
30
+ "rgba(188, 189, 34, 0.5)",
31
+ "rgba(23, 190, 207, 0.5)",
32
+ "rgba(255, 193, 7, 0.5)",
33
+ "rgba(40, 167, 69, 0.5)",
34
+ "rgba(23, 162, 184, 0.5)",
35
+ "rgba(108, 117, 125, 0.5)",
36
+ "rgba(0, 123, 255, 0.5)",
37
+ "rgba(220, 53, 69, 0.5)",
38
+ "rgba(255, 159, 67, 0.5)",
39
+ "rgba(255, 87, 34, 0.5)",
40
+ "rgba(41, 182, 246, 0.5)",
41
+ "rgba(142, 36, 170, 0.5)",
42
+ "rgba(0, 188, 212, 0.5)",
43
+ "rgba(255, 235, 59, 0.5)",
44
+ "rgba(156, 39, 176, 0.5)",
45
+ ]
46
+ )
47
 
48
 
49
  def find_folders(base_folder, path):
 
63
  # Then, for each stats.merged path, drop the file name and the last two directories (grouping/stat_name)
64
  stats_folders = [str(Path(x).parent.parent.parent) for x in stats_merged]
65
  # Finally get the unique paths
66
+ return sorted(list(set(stats_folders)))
67
 
68
 
69
  RUNS = sorted(find_stats_folders(BASE_DATA_FOLDER))
 
114
  return MetricStatsDict() + MetricStatsDict(init=json_stat)
115
 
116
 
117
+ def prepare_non_grouped_data(path, stat_name, grouping, normalization):
118
+ stats = load_stats(path, stat_name, grouping)
119
  stats_rounded = defaultdict(lambda: 0)
120
  for key, value in stats.items():
121
  stats_rounded[float(key)] += value.total
 
125
  return stats_rounded
126
 
127
 
128
+ def prepare_grouped_data(path, stat_name, grouping, top_k, direction):
129
  import heapq
130
 
131
+ stats = load_stats(path, stat_name, grouping)
132
+
133
  means = {key: value.mean for key, value in stats.items()}
134
 
135
  # Use heap to get top_k keys
136
  if direction == "Top":
137
  keys = heapq.nlargest(top_k, means, key=means.get)
138
+ elif direction == "Most frequent (n_docs)":
139
+ n_docs = load_stats(path, "n_docs", grouping)
140
+ totals = {key: value.total for key, value in n_docs.items()}
141
+ keys = heapq.nlargest(top_k, totals, key=totals.get)
142
+
143
+ elif direction == "Most frequent (length)":
144
+ n_docs = load_stats(path, "n_docs", grouping)
145
+ totals = {key: value.total for key, value in n_docs.items()}
146
+ keys = heapq.nlargest(top_k, totals, key=totals.get)
147
  else:
148
  keys = heapq.nsmallest(top_k, means, key=means.get)
 
149
 
150
+ return [(key, means[key]) for key in keys]
151
 
152
 
153
  import math
 
160
  ):
161
  fig = go.Figure()
162
 
163
+ for i, (name, histogram) in enumerate(histograms.items()):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
164
  if all(isinstance(k, str) for k in histogram.keys()):
165
  x = [k for k, v in sorted(histogram.items(), key=lambda item: item[1])]
166
  else:
 
169
  y = [histogram[k] for k in x]
170
 
171
  fig.add_trace(
172
+ go.Scatter(
173
+ x=x,
174
+ y=y,
175
+ mode="lines",
176
+ name=name,
177
+ line=dict(color=colors[i % len(colors)]),
178
+ )
179
  )
180
 
181
  xaxis_scale = "log" if stat_name in LOG_SCALE_STATS else "linear"
 
188
  xaxis_type=xaxis_scale,
189
  width=1200,
190
  height=600,
191
+ showlegend=True,
192
  )
193
 
194
  return fig
195
 
196
 
197
+ def plot_bars(histograms: dict[str, list[tuple[str, float]]], stat_name: str):
198
  fig = go.Figure()
199
 
200
+ for i, (name, histogram) in enumerate(histograms.items()):
201
+ x = [k for k, v in histogram]
202
+ y = [v for k, v in histogram]
203
 
204
+ fig.add_trace(go.Bar(x=x, y=y, name=name, marker_color=colors[i % len(colors)]))
205
 
206
  fig.update_layout(
207
  title=f"Bar Plots for {stat_name}",
 
210
  autosize=True,
211
  width=1200,
212
  height=600,
213
+ showlegend=True,
214
  )
215
 
216
  return fig
 
235
 
236
  print("Loading stats")
237
  histograms = {
238
+ path: prepare_fc(path, stat_name, grouping) for path in multiselect_crawls
 
239
  }
240
 
241
  print("Plotting")
 
297
  )
298
  direction_checkbox = gr.Radio(
299
  label="Partition",
300
+ choices=[
301
+ "Top",
302
+ "Bottom",
303
+ "Most frequent (n_docs)",
304
+ "Most frequent (length)",
305
+ ],
306
  )
307
 
308
  update_button = gr.Button("Update Graph", variant="primary")