victormiller committed on
Commit
0fdc018
1 Parent(s): f9cb337

Update curated.py

Browse files
Files changed (1) hide show
  1. curated.py +633 -0
curated.py CHANGED
@@ -57,6 +57,638 @@ fig = px.treemap(treemap_data, path=['Category', 'Source'], values='Count', hove
57
  # Display treemap; to change its size, call fig.update_layout(width=800, height=600)
58
  treemap_chart = fig
59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
 
61
 
62
  filtering_process = Div(
@@ -65,6 +697,7 @@ filtering_process = Div(
65
  ),
66
  Section(
67
  H3("Wikipedia"),
 
68
  H4("Download and Extraction"),
69
  Ol(
70
  Li("Downloaded from Wikimedia official dump of wikipedia on huggingface https://huggingface.co/datasets/wikimedia/wikipedia/tree/main"),
 
57
  # Display treemap; to change its size, call fig.update_layout(width=800, height=600)
58
  treemap_chart = fig
59
 
60
+ data = {
61
+ 'Filter': ['Downloaded Lines', 'Language Filter', 'Min Word Count', 'Unigram Log Probability'],
62
+ 'Wikipedia': [61614907, 61614907, 60468491, 60468491]
63
+ }
64
+
65
+ # Creating a dataframe
66
+ df = pd.DataFrame(data)
67
+
68
+ # Creating the grouped bar chart
69
+ fig = go.Figure()
70
+
71
+ # Add trace for each dataset
72
+ for dataset in df.columns[1:]:
73
+ fig.add_trace(go.Bar(
74
+ name=dataset,
75
+ x=df['Filter'],
76
+ y=df[dataset]
77
+ ))
78
+
79
+ # Update the layout
80
+ fig.update_layout(
81
+ barmode='group',
82
+ title='Wikipedia Bar Chart of Line Reductions by Filter for Each Dataset',
83
+ xaxis_title='Filter',
84
+ yaxis_title='Number of Lines',
85
+ legend_title='Dataset',
86
+ height=600,
87
+ width=1000
88
+ )
89
+
90
+ # Keep the figure so it can be embedded in the page layout
91
+ wikipedia_bar = fig
92
+
93
+ data = {
94
+ 'Filter': ['Downloaded Lines', 'Language Filter', 'Min Word Count', 'Unigram Log Probability'],
95
+ 'Wikipedia': [61614907, 61614907, 60468491, 60468491],
96
+ 'Freelaw': [75971288, 73690766, 68171834, 68123174],
97
+ 'DM Maths': [112559888, 112559888, 112559888, 112559888],
98
+ 'USPTO': [6880276, 6878964, 6749922, 6749389],
99
+ 'PG19': [28752, 28683, 28682, 28632],
100
+ 'Hackernews': [2064931, 2010802, 2010488, 2003636],
101
+ 'Ubuntu IRC': [37966, 23501, 23468, 23205],
102
+ 'Europarl': [69814, 69814, 69814, 69814],
103
+ 'StackExchange': [23246548, 23246548, 23246352, 23246352],
104
+ 'Arxiv': [1911867, 1869441, 1763840, 1762661],
105
+ 'S2ORC': [12963563, 12963563, 12963563, 12963563],
106
+ 'S2ORC Abstract': [102324176, 83867601, 82889293, 82777912],
107
+ 'Pubmed Central': [5230932, 4830486, 4768310, 4767474],
108
+ 'Pubmed Abstract': [25787474, 25784374, 25747955, 25746724],
109
+ 'Phil Papers': [49389, 39175, 39175, 39128]
110
+ }
111
+
112
+ # Creating a dataframe
113
+ df = pd.DataFrame(data)
114
+
115
+ # Creating the grouped bar chart
116
+ fig = go.Figure()
117
+
118
+ # Add trace for each dataset
119
+ for dataset in df.columns[1:]:
120
+ fig.add_trace(go.Bar(
121
+ name=dataset,
122
+ x=df['Filter'],
123
+ y=df[dataset]
124
+ ))
125
+
126
+ # Update the layout
127
+ fig.update_layout(
128
+ barmode='group',
129
+ title='Stacked Bar Chart of Line Reductions by Filter for Each Dataset',
130
+ xaxis_title='Filter',
131
+ yaxis_title='Number of Lines',
132
+ legend_title='Dataset',
133
+ height=600,
134
+ width=1000
135
+ )
136
+
137
+ # Show the plot
138
+ freelaw_bar = fig
139
+
140
+ data = {
141
+ 'Filter': ['Downloaded Lines', 'Language Filter', 'Min Word Count', 'Unigram Log Probability'],
142
+ 'Wikipedia': [61614907, 61614907, 60468491, 60468491],
143
+ 'Freelaw': [75971288, 73690766, 68171834, 68123174],
144
+ 'DM Maths': [112559888, 112559888, 112559888, 112559888],
145
+ 'USPTO': [6880276, 6878964, 6749922, 6749389],
146
+ 'PG19': [28752, 28683, 28682, 28632],
147
+ 'Hackernews': [2064931, 2010802, 2010488, 2003636],
148
+ 'Ubuntu IRC': [37966, 23501, 23468, 23205],
149
+ 'Europarl': [69814, 69814, 69814, 69814],
150
+ 'StackExchange': [23246548, 23246548, 23246352, 23246352],
151
+ 'Arxiv': [1911867, 1869441, 1763840, 1762661],
152
+ 'S2ORC': [12963563, 12963563, 12963563, 12963563],
153
+ 'S2ORC Abstract': [102324176, 83867601, 82889293, 82777912],
154
+ 'Pubmed Central': [5230932, 4830486, 4768310, 4767474],
155
+ 'Pubmed Abstract': [25787474, 25784374, 25747955, 25746724],
156
+ 'Phil Papers': [49389, 39175, 39175, 39128]
157
+ }
158
+
159
+ # Creating a dataframe
160
+ df = pd.DataFrame(data)
161
+
162
+ # Creating the stacked bar chart
163
+ fig = go.Figure()
164
+
165
+ # Add trace for each dataset
166
+ for dataset in df.columns[1:]:
167
+ fig.add_trace(go.Bar(
168
+ name=dataset,
169
+ x=df['Filter'],
170
+ y=df[dataset]
171
+ ))
172
+
173
+ # Update the layout
174
+ fig.update_layout(
175
+ barmode='stack',
176
+ title='Stacked Bar Chart of Line Reductions by Filter for Each Dataset',
177
+ xaxis_title='Filter',
178
+ yaxis_title='Number of Lines',
179
+ legend_title='Dataset',
180
+ height=600,
181
+ width=1000
182
+ )
183
+
184
+ # Show the plot
185
+ diff2_stacked_bar = fig
186
+ data = {
187
+ 'Filter': ['Downloaded Lines', 'Language Filter', 'Min Word Count', 'Unigram Log Probability'],
188
+ 'Wikipedia': [61614907, 61614907, 60468491, 60468491],
189
+ 'Freelaw': [75971288, 73690766, 68171834, 68123174],
190
+ 'DM Maths': [112559888, 112559888, 112559888, 112559888],
191
+ 'USPTO': [6880276, 6878964, 6749922, 6749389],
192
+ 'PG19': [28752, 28683, 28682, 28632],
193
+ 'Hackernews': [2064931, 2010802, 2010488, 2003636],
194
+ 'Ubuntu IRC': [37966, 23501, 23468, 23205],
195
+ 'Europarl': [69814, 69814, 69814, 69814],
196
+ 'StackExchange': [23246548, 23246548, 23246352, 23246352],
197
+ 'Arxiv': [1911867, 1869441, 1763840, 1762661],
198
+ 'S2ORC': [12963563, 12963563, 12963563, 12963563],
199
+ 'S2ORC Abstract': [102324176, 83867601, 82889293, 82777912],
200
+ 'Pubmed Central': [5230932, 4830486, 4768310, 4767474],
201
+ 'Pubmed Abstract': [25787474, 25784374, 25747955, 25746724],
202
+ 'Phil Papers': [49389, 39175, 39175, 39128]
203
+ }
204
+
205
+ # Creating a dataframe
206
+ df = pd.DataFrame(data)
207
+
208
+ # Creating the stacked bar chart
209
+ fig = go.Figure()
210
+
211
+ # Add trace for each dataset
212
+ for dataset in df.columns[1:]:
213
+ fig.add_trace(go.Bar(
214
+ name=dataset,
215
+ x=df['Filter'],
216
+ y=df[dataset]
217
+ ))
218
+
219
+ # Update the layout
220
+ fig.update_layout(
221
+ barmode='stack',
222
+ title='Stacked Bar Chart of Line Reductions by Filter for Each Dataset',
223
+ xaxis_title='Filter',
224
+ yaxis_title='Number of Lines',
225
+ legend_title='Dataset',
226
+ height=600,
227
+ width=1000
228
+ )
229
+
230
+ # Show the plot
231
+ diff2_stacked_bar = fig
232
+ data = {
233
+ 'Filter': ['Downloaded Lines', 'Language Filter', 'Min Word Count', 'Unigram Log Probability'],
234
+ 'Wikipedia': [61614907, 61614907, 60468491, 60468491],
235
+ 'Freelaw': [75971288, 73690766, 68171834, 68123174],
236
+ 'DM Maths': [112559888, 112559888, 112559888, 112559888],
237
+ 'USPTO': [6880276, 6878964, 6749922, 6749389],
238
+ 'PG19': [28752, 28683, 28682, 28632],
239
+ 'Hackernews': [2064931, 2010802, 2010488, 2003636],
240
+ 'Ubuntu IRC': [37966, 23501, 23468, 23205],
241
+ 'Europarl': [69814, 69814, 69814, 69814],
242
+ 'StackExchange': [23246548, 23246548, 23246352, 23246352],
243
+ 'Arxiv': [1911867, 1869441, 1763840, 1762661],
244
+ 'S2ORC': [12963563, 12963563, 12963563, 12963563],
245
+ 'S2ORC Abstract': [102324176, 83867601, 82889293, 82777912],
246
+ 'Pubmed Central': [5230932, 4830486, 4768310, 4767474],
247
+ 'Pubmed Abstract': [25787474, 25784374, 25747955, 25746724],
248
+ 'Phil Papers': [49389, 39175, 39175, 39128]
249
+ }
250
+
251
+ # Creating a dataframe
252
+ df = pd.DataFrame(data)
253
+
254
+ # Creating the stacked bar chart
255
+ fig = go.Figure()
256
+
257
+ # Add trace for each dataset
258
+ for dataset in df.columns[1:]:
259
+ fig.add_trace(go.Bar(
260
+ name=dataset,
261
+ x=df['Filter'],
262
+ y=df[dataset]
263
+ ))
264
+
265
+ # Update the layout
266
+ fig.update_layout(
267
+ barmode='stack',
268
+ title='Stacked Bar Chart of Line Reductions by Filter for Each Dataset',
269
+ xaxis_title='Filter',
270
+ yaxis_title='Number of Lines',
271
+ legend_title='Dataset',
272
+ height=600,
273
+ width=1000
274
+ )
275
+
276
+ # Show the plot
277
+ diff2_stacked_bar = fig
278
+ data = {
279
+ 'Filter': ['Downloaded Lines', 'Language Filter', 'Min Word Count', 'Unigram Log Probability'],
280
+ 'Wikipedia': [61614907, 61614907, 60468491, 60468491],
281
+ 'Freelaw': [75971288, 73690766, 68171834, 68123174],
282
+ 'DM Maths': [112559888, 112559888, 112559888, 112559888],
283
+ 'USPTO': [6880276, 6878964, 6749922, 6749389],
284
+ 'PG19': [28752, 28683, 28682, 28632],
285
+ 'Hackernews': [2064931, 2010802, 2010488, 2003636],
286
+ 'Ubuntu IRC': [37966, 23501, 23468, 23205],
287
+ 'Europarl': [69814, 69814, 69814, 69814],
288
+ 'StackExchange': [23246548, 23246548, 23246352, 23246352],
289
+ 'Arxiv': [1911867, 1869441, 1763840, 1762661],
290
+ 'S2ORC': [12963563, 12963563, 12963563, 12963563],
291
+ 'S2ORC Abstract': [102324176, 83867601, 82889293, 82777912],
292
+ 'Pubmed Central': [5230932, 4830486, 4768310, 4767474],
293
+ 'Pubmed Abstract': [25787474, 25784374, 25747955, 25746724],
294
+ 'Phil Papers': [49389, 39175, 39175, 39128]
295
+ }
296
+
297
+ # Creating a dataframe
298
+ df = pd.DataFrame(data)
299
+
300
+ # Creating the stacked bar chart
301
+ fig = go.Figure()
302
+
303
+ # Add trace for each dataset
304
+ for dataset in df.columns[1:]:
305
+ fig.add_trace(go.Bar(
306
+ name=dataset,
307
+ x=df['Filter'],
308
+ y=df[dataset]
309
+ ))
310
+
311
+ # Update the layout
312
+ fig.update_layout(
313
+ barmode='stack',
314
+ title='Stacked Bar Chart of Line Reductions by Filter for Each Dataset',
315
+ xaxis_title='Filter',
316
+ yaxis_title='Number of Lines',
317
+ legend_title='Dataset',
318
+ height=600,
319
+ width=1000
320
+ )
321
+
322
+ # Show the plot
323
+ diff2_stacked_bar = fig
324
+ data = {
325
+ 'Filter': ['Downloaded Lines', 'Language Filter', 'Min Word Count', 'Unigram Log Probability'],
326
+ 'Wikipedia': [61614907, 61614907, 60468491, 60468491],
327
+ 'Freelaw': [75971288, 73690766, 68171834, 68123174],
328
+ 'DM Maths': [112559888, 112559888, 112559888, 112559888],
329
+ 'USPTO': [6880276, 6878964, 6749922, 6749389],
330
+ 'PG19': [28752, 28683, 28682, 28632],
331
+ 'Hackernews': [2064931, 2010802, 2010488, 2003636],
332
+ 'Ubuntu IRC': [37966, 23501, 23468, 23205],
333
+ 'Europarl': [69814, 69814, 69814, 69814],
334
+ 'StackExchange': [23246548, 23246548, 23246352, 23246352],
335
+ 'Arxiv': [1911867, 1869441, 1763840, 1762661],
336
+ 'S2ORC': [12963563, 12963563, 12963563, 12963563],
337
+ 'S2ORC Abstract': [102324176, 83867601, 82889293, 82777912],
338
+ 'Pubmed Central': [5230932, 4830486, 4768310, 4767474],
339
+ 'Pubmed Abstract': [25787474, 25784374, 25747955, 25746724],
340
+ 'Phil Papers': [49389, 39175, 39175, 39128]
341
+ }
342
+
343
+ # Creating a dataframe
344
+ df = pd.DataFrame(data)
345
+
346
+ # Creating the stacked bar chart
347
+ fig = go.Figure()
348
+
349
+ # Add trace for each dataset
350
+ for dataset in df.columns[1:]:
351
+ fig.add_trace(go.Bar(
352
+ name=dataset,
353
+ x=df['Filter'],
354
+ y=df[dataset]
355
+ ))
356
+
357
+ # Update the layout
358
+ fig.update_layout(
359
+ barmode='stack',
360
+ title='Stacked Bar Chart of Line Reductions by Filter for Each Dataset',
361
+ xaxis_title='Filter',
362
+ yaxis_title='Number of Lines',
363
+ legend_title='Dataset',
364
+ height=600,
365
+ width=1000
366
+ )
367
+
368
+ # Show the plot
369
+ diff2_stacked_bar = fig
370
+ data = {
371
+ 'Filter': ['Downloaded Lines', 'Language Filter', 'Min Word Count', 'Unigram Log Probability'],
372
+ 'Wikipedia': [61614907, 61614907, 60468491, 60468491],
373
+ 'Freelaw': [75971288, 73690766, 68171834, 68123174],
374
+ 'DM Maths': [112559888, 112559888, 112559888, 112559888],
375
+ 'USPTO': [6880276, 6878964, 6749922, 6749389],
376
+ 'PG19': [28752, 28683, 28682, 28632],
377
+ 'Hackernews': [2064931, 2010802, 2010488, 2003636],
378
+ 'Ubuntu IRC': [37966, 23501, 23468, 23205],
379
+ 'Europarl': [69814, 69814, 69814, 69814],
380
+ 'StackExchange': [23246548, 23246548, 23246352, 23246352],
381
+ 'Arxiv': [1911867, 1869441, 1763840, 1762661],
382
+ 'S2ORC': [12963563, 12963563, 12963563, 12963563],
383
+ 'S2ORC Abstract': [102324176, 83867601, 82889293, 82777912],
384
+ 'Pubmed Central': [5230932, 4830486, 4768310, 4767474],
385
+ 'Pubmed Abstract': [25787474, 25784374, 25747955, 25746724],
386
+ 'Phil Papers': [49389, 39175, 39175, 39128]
387
+ }
388
+
389
+ # Creating a dataframe
390
+ df = pd.DataFrame(data)
391
+
392
+ # Creating the stacked bar chart
393
+ fig = go.Figure()
394
+
395
+ # Add trace for each dataset
396
+ for dataset in df.columns[1:]:
397
+ fig.add_trace(go.Bar(
398
+ name=dataset,
399
+ x=df['Filter'],
400
+ y=df[dataset]
401
+ ))
402
+
403
+ # Update the layout
404
+ fig.update_layout(
405
+ barmode='stack',
406
+ title='Stacked Bar Chart of Line Reductions by Filter for Each Dataset',
407
+ xaxis_title='Filter',
408
+ yaxis_title='Number of Lines',
409
+ legend_title='Dataset',
410
+ height=600,
411
+ width=1000
412
+ )
413
+
414
+ # Show the plot
415
+ diff2_stacked_bar = fig
416
+ data = {
417
+ 'Filter': ['Downloaded Lines', 'Language Filter', 'Min Word Count', 'Unigram Log Probability'],
418
+ 'Wikipedia': [61614907, 61614907, 60468491, 60468491],
419
+ 'Freelaw': [75971288, 73690766, 68171834, 68123174],
420
+ 'DM Maths': [112559888, 112559888, 112559888, 112559888],
421
+ 'USPTO': [6880276, 6878964, 6749922, 6749389],
422
+ 'PG19': [28752, 28683, 28682, 28632],
423
+ 'Hackernews': [2064931, 2010802, 2010488, 2003636],
424
+ 'Ubuntu IRC': [37966, 23501, 23468, 23205],
425
+ 'Europarl': [69814, 69814, 69814, 69814],
426
+ 'StackExchange': [23246548, 23246548, 23246352, 23246352],
427
+ 'Arxiv': [1911867, 1869441, 1763840, 1762661],
428
+ 'S2ORC': [12963563, 12963563, 12963563, 12963563],
429
+ 'S2ORC Abstract': [102324176, 83867601, 82889293, 82777912],
430
+ 'Pubmed Central': [5230932, 4830486, 4768310, 4767474],
431
+ 'Pubmed Abstract': [25787474, 25784374, 25747955, 25746724],
432
+ 'Phil Papers': [49389, 39175, 39175, 39128]
433
+ }
434
+
435
+ # Creating a dataframe
436
+ df = pd.DataFrame(data)
437
+
438
+ # Creating the stacked bar chart
439
+ fig = go.Figure()
440
+
441
+ # Add trace for each dataset
442
+ for dataset in df.columns[1:]:
443
+ fig.add_trace(go.Bar(
444
+ name=dataset,
445
+ x=df['Filter'],
446
+ y=df[dataset]
447
+ ))
448
+
449
+ # Update the layout
450
+ fig.update_layout(
451
+ barmode='stack',
452
+ title='Stacked Bar Chart of Line Reductions by Filter for Each Dataset',
453
+ xaxis_title='Filter',
454
+ yaxis_title='Number of Lines',
455
+ legend_title='Dataset',
456
+ height=600,
457
+ width=1000
458
+ )
459
+
460
+ # Show the plot
461
+ diff2_stacked_bar = fig
462
+ data = {
463
+ 'Filter': ['Downloaded Lines', 'Language Filter', 'Min Word Count', 'Unigram Log Probability'],
464
+ 'Wikipedia': [61614907, 61614907, 60468491, 60468491],
465
+ 'Freelaw': [75971288, 73690766, 68171834, 68123174],
466
+ 'DM Maths': [112559888, 112559888, 112559888, 112559888],
467
+ 'USPTO': [6880276, 6878964, 6749922, 6749389],
468
+ 'PG19': [28752, 28683, 28682, 28632],
469
+ 'Hackernews': [2064931, 2010802, 2010488, 2003636],
470
+ 'Ubuntu IRC': [37966, 23501, 23468, 23205],
471
+ 'Europarl': [69814, 69814, 69814, 69814],
472
+ 'StackExchange': [23246548, 23246548, 23246352, 23246352],
473
+ 'Arxiv': [1911867, 1869441, 1763840, 1762661],
474
+ 'S2ORC': [12963563, 12963563, 12963563, 12963563],
475
+ 'S2ORC Abstract': [102324176, 83867601, 82889293, 82777912],
476
+ 'Pubmed Central': [5230932, 4830486, 4768310, 4767474],
477
+ 'Pubmed Abstract': [25787474, 25784374, 25747955, 25746724],
478
+ 'Phil Papers': [49389, 39175, 39175, 39128]
479
+ }
480
+
481
+ # Creating a dataframe
482
+ df = pd.DataFrame(data)
483
+
484
+ # Creating the stacked bar chart
485
+ fig = go.Figure()
486
+
487
+ # Add trace for each dataset
488
+ for dataset in df.columns[1:]:
489
+ fig.add_trace(go.Bar(
490
+ name=dataset,
491
+ x=df['Filter'],
492
+ y=df[dataset]
493
+ ))
494
+
495
+ # Update the layout
496
+ fig.update_layout(
497
+ barmode='stack',
498
+ title='Stacked Bar Chart of Line Reductions by Filter for Each Dataset',
499
+ xaxis_title='Filter',
500
+ yaxis_title='Number of Lines',
501
+ legend_title='Dataset',
502
+ height=600,
503
+ width=1000
504
+ )
505
+
506
+ # Show the plot
507
+ diff2_stacked_bar = fig
508
+ data = {
509
+ 'Filter': ['Downloaded Lines', 'Language Filter', 'Min Word Count', 'Unigram Log Probability'],
510
+ 'Wikipedia': [61614907, 61614907, 60468491, 60468491],
511
+ 'Freelaw': [75971288, 73690766, 68171834, 68123174],
512
+ 'DM Maths': [112559888, 112559888, 112559888, 112559888],
513
+ 'USPTO': [6880276, 6878964, 6749922, 6749389],
514
+ 'PG19': [28752, 28683, 28682, 28632],
515
+ 'Hackernews': [2064931, 2010802, 2010488, 2003636],
516
+ 'Ubuntu IRC': [37966, 23501, 23468, 23205],
517
+ 'Europarl': [69814, 69814, 69814, 69814],
518
+ 'StackExchange': [23246548, 23246548, 23246352, 23246352],
519
+ 'Arxiv': [1911867, 1869441, 1763840, 1762661],
520
+ 'S2ORC': [12963563, 12963563, 12963563, 12963563],
521
+ 'S2ORC Abstract': [102324176, 83867601, 82889293, 82777912],
522
+ 'Pubmed Central': [5230932, 4830486, 4768310, 4767474],
523
+ 'Pubmed Abstract': [25787474, 25784374, 25747955, 25746724],
524
+ 'Phil Papers': [49389, 39175, 39175, 39128]
525
+ }
526
+
527
+ # Creating a dataframe
528
+ df = pd.DataFrame(data)
529
+
530
+ # Creating the stacked bar chart
531
+ fig = go.Figure()
532
+
533
+ # Add trace for each dataset
534
+ for dataset in df.columns[1:]:
535
+ fig.add_trace(go.Bar(
536
+ name=dataset,
537
+ x=df['Filter'],
538
+ y=df[dataset]
539
+ ))
540
+
541
+ # Update the layout
542
+ fig.update_layout(
543
+ barmode='stack',
544
+ title='Stacked Bar Chart of Line Reductions by Filter for Each Dataset',
545
+ xaxis_title='Filter',
546
+ yaxis_title='Number of Lines',
547
+ legend_title='Dataset',
548
+ height=600,
549
+ width=1000
550
+ )
551
+
552
+ # Show the plot
553
+ diff2_stacked_bar = fig
554
+ data = {
555
+ 'Filter': ['Downloaded Lines', 'Language Filter', 'Min Word Count', 'Unigram Log Probability'],
556
+ 'Wikipedia': [61614907, 61614907, 60468491, 60468491],
557
+ 'Freelaw': [75971288, 73690766, 68171834, 68123174],
558
+ 'DM Maths': [112559888, 112559888, 112559888, 112559888],
559
+ 'USPTO': [6880276, 6878964, 6749922, 6749389],
560
+ 'PG19': [28752, 28683, 28682, 28632],
561
+ 'Hackernews': [2064931, 2010802, 2010488, 2003636],
562
+ 'Ubuntu IRC': [37966, 23501, 23468, 23205],
563
+ 'Europarl': [69814, 69814, 69814, 69814],
564
+ 'StackExchange': [23246548, 23246548, 23246352, 23246352],
565
+ 'Arxiv': [1911867, 1869441, 1763840, 1762661],
566
+ 'S2ORC': [12963563, 12963563, 12963563, 12963563],
567
+ 'S2ORC Abstract': [102324176, 83867601, 82889293, 82777912],
568
+ 'Pubmed Central': [5230932, 4830486, 4768310, 4767474],
569
+ 'Pubmed Abstract': [25787474, 25784374, 25747955, 25746724],
570
+ 'Phil Papers': [49389, 39175, 39175, 39128]
571
+ }
572
+
573
+ # Creating a dataframe
574
+ df = pd.DataFrame(data)
575
+
576
+ # Creating the stacked bar chart
577
+ fig = go.Figure()
578
+
579
+ # Add trace for each dataset
580
+ for dataset in df.columns[1:]:
581
+ fig.add_trace(go.Bar(
582
+ name=dataset,
583
+ x=df['Filter'],
584
+ y=df[dataset]
585
+ ))
586
+
587
+ # Update the layout
588
+ fig.update_layout(
589
+ barmode='stack',
590
+ title='Stacked Bar Chart of Line Reductions by Filter for Each Dataset',
591
+ xaxis_title='Filter',
592
+ yaxis_title='Number of Lines',
593
+ legend_title='Dataset',
594
+ height=600,
595
+ width=1000
596
+ )
597
+
598
+ # Show the plot
599
+ diff2_stacked_bar = fig
600
+ data = {
601
+ 'Filter': ['Downloaded Lines', 'Language Filter', 'Min Word Count', 'Unigram Log Probability'],
602
+ 'Wikipedia': [61614907, 61614907, 60468491, 60468491],
603
+ 'Freelaw': [75971288, 73690766, 68171834, 68123174],
604
+ 'DM Maths': [112559888, 112559888, 112559888, 112559888],
605
+ 'USPTO': [6880276, 6878964, 6749922, 6749389],
606
+ 'PG19': [28752, 28683, 28682, 28632],
607
+ 'Hackernews': [2064931, 2010802, 2010488, 2003636],
608
+ 'Ubuntu IRC': [37966, 23501, 23468, 23205],
609
+ 'Europarl': [69814, 69814, 69814, 69814],
610
+ 'StackExchange': [23246548, 23246548, 23246352, 23246352],
611
+ 'Arxiv': [1911867, 1869441, 1763840, 1762661],
612
+ 'S2ORC': [12963563, 12963563, 12963563, 12963563],
613
+ 'S2ORC Abstract': [102324176, 83867601, 82889293, 82777912],
614
+ 'Pubmed Central': [5230932, 4830486, 4768310, 4767474],
615
+ 'Pubmed Abstract': [25787474, 25784374, 25747955, 25746724],
616
+ 'Phil Papers': [49389, 39175, 39175, 39128]
617
+ }
618
+
619
+ # Creating a dataframe
620
+ df = pd.DataFrame(data)
621
+
622
+ # Creating the stacked bar chart
623
+ fig = go.Figure()
624
+
625
+ # Add trace for each dataset
626
+ for dataset in df.columns[1:]:
627
+ fig.add_trace(go.Bar(
628
+ name=dataset,
629
+ x=df['Filter'],
630
+ y=df[dataset]
631
+ ))
632
+
633
+ # Update the layout
634
+ fig.update_layout(
635
+ barmode='stack',
636
+ title='Stacked Bar Chart of Line Reductions by Filter for Each Dataset',
637
+ xaxis_title='Filter',
638
+ yaxis_title='Number of Lines',
639
+ legend_title='Dataset',
640
+ height=600,
641
+ width=1000
642
+ )
643
+
644
+ # Show the plot
645
+ diff2_stacked_bar = fig
646
+ data = {
647
+ 'Filter': ['Downloaded Lines', 'Language Filter', 'Min Word Count', 'Unigram Log Probability'],
648
+ 'Wikipedia': [61614907, 61614907, 60468491, 60468491],
649
+ 'Freelaw': [75971288, 73690766, 68171834, 68123174],
650
+ 'DM Maths': [112559888, 112559888, 112559888, 112559888],
651
+ 'USPTO': [6880276, 6878964, 6749922, 6749389],
652
+ 'PG19': [28752, 28683, 28682, 28632],
653
+ 'Hackernews': [2064931, 2010802, 2010488, 2003636],
654
+ 'Ubuntu IRC': [37966, 23501, 23468, 23205],
655
+ 'Europarl': [69814, 69814, 69814, 69814],
656
+ 'StackExchange': [23246548, 23246548, 23246352, 23246352],
657
+ 'Arxiv': [1911867, 1869441, 1763840, 1762661],
658
+ 'S2ORC': [12963563, 12963563, 12963563, 12963563],
659
+ 'S2ORC Abstract': [102324176, 83867601, 82889293, 82777912],
660
+ 'Pubmed Central': [5230932, 4830486, 4768310, 4767474],
661
+ 'Pubmed Abstract': [25787474, 25784374, 25747955, 25746724],
662
+ 'Phil Papers': [49389, 39175, 39175, 39128]
663
+ }
664
+
665
+ # Creating a dataframe
666
+ df = pd.DataFrame(data)
667
+
668
+ # Creating the stacked bar chart
669
+ fig = go.Figure()
670
+
671
+ # Add trace for each dataset
672
+ for dataset in df.columns[1:]:
673
+ fig.add_trace(go.Bar(
674
+ name=dataset,
675
+ x=df['Filter'],
676
+ y=df[dataset]
677
+ ))
678
+
679
+ # Update the layout
680
+ fig.update_layout(
681
+ barmode='stack',
682
+ title='Stacked Bar Chart of Line Reductions by Filter for Each Dataset',
683
+ xaxis_title='Filter',
684
+ yaxis_title='Number of Lines',
685
+ legend_title='Dataset',
686
+ height=600,
687
+ width=1000
688
+ )
689
+
690
+ # Show the plot
691
+ diff2_stacked_bar = fig
692
 
693
 
694
  filtering_process = Div(
 
697
  ),
698
  Section(
699
  H3("Wikipedia"),
700
+ wikipedia_bar,
701
  H4("Download and Extraction"),
702
  Ol(
703
  Li("Downloaded from Wikimedia official dump of wikipedia on huggingface https://huggingface.co/datasets/wikimedia/wikipedia/tree/main"),