Spaces:
Running
Running
Merge conflicts
Browse files- curated.py +37 -63
- main.py +2 -2
- web.py +1 -0
curated.py
CHANGED
@@ -131,18 +131,16 @@ wikipedia_filter = pd.DataFrame(
|
|
131 |
"0.00%",
|
132 |
],
|
133 |
"Percent Removed After Local Dedup": [
|
134 |
-
"",
|
135 |
],
|
136 |
"Total Percentage Remaining": [
|
137 |
-
"",
|
138 |
],
|
139 |
}
|
140 |
)
|
141 |
|
142 |
table_html_wikipedia = wikipedia_filter.to_html(index=False, border=0)
|
143 |
-
table_div_wikipedia = Div(
|
144 |
-
NotStr(table_html_wikipedia), style="margin-left: auto; width: 80%; align: center;"
|
145 |
-
)
|
146 |
|
147 |
freelaw_filter = pd.DataFrame(
|
148 |
{
|
@@ -171,9 +169,7 @@ freelaw_filter = pd.DataFrame(
|
|
171 |
)
|
172 |
|
173 |
table_html_freelaw = freelaw_filter.to_html(index=False, border=0)
|
174 |
-
table_div_freelaw = Div(
|
175 |
-
NotStr(table_html_freelaw), style="margin-left: auto; width: 80%; align: center;"
|
176 |
-
)
|
177 |
|
178 |
dmm_filter = pd.DataFrame(
|
179 |
{
|
@@ -193,18 +189,16 @@ dmm_filter = pd.DataFrame(
|
|
193 |
"0.00%",
|
194 |
],
|
195 |
"Percent Removed After Local Dedup": [
|
196 |
-
"",
|
197 |
],
|
198 |
"Total Percentage Remaining": [
|
199 |
-
"%",
|
200 |
],
|
201 |
}
|
202 |
)
|
203 |
|
204 |
table_html_dmm = dmm_filter.to_html(index=False, border=0)
|
205 |
-
table_div_dmm = Div(
|
206 |
-
NotStr(table_html_dmm), style="margin-left: auto; width: 80%; align: center;"
|
207 |
-
)
|
208 |
|
209 |
|
210 |
uspto_filter = pd.DataFrame(
|
@@ -225,18 +219,16 @@ uspto_filter = pd.DataFrame(
|
|
225 |
"0.01%",
|
226 |
],
|
227 |
"Percent Removed After Local Dedup": [
|
228 |
-
"",
|
229 |
],
|
230 |
"Total Percentage Remaining": [
|
231 |
-
"%",
|
232 |
],
|
233 |
}
|
234 |
)
|
235 |
|
236 |
table_html_uspto = uspto_filter.to_html(index=False, border=0)
|
237 |
-
table_div_uspto = Div(
|
238 |
-
NotStr(table_html_uspto), style="margin-left: auto; width: 80%; align: center;"
|
239 |
-
)
|
240 |
|
241 |
pg19_filter = pd.DataFrame(
|
242 |
{
|
@@ -256,18 +248,16 @@ pg19_filter = pd.DataFrame(
|
|
256 |
"0.17%",
|
257 |
],
|
258 |
"Percent Removed After Local Dedup": [
|
259 |
-
"",
|
260 |
],
|
261 |
"Total Percentage Remaining": [
|
262 |
-
"%",
|
263 |
],
|
264 |
}
|
265 |
)
|
266 |
|
267 |
table_html_pg19 = pg19_filter.to_html(index=False, border=0)
|
268 |
-
table_div_pg19 = Div(
|
269 |
-
NotStr(table_html_pg19), style="margin-left: auto; width: 80%; align: center;"
|
270 |
-
)
|
271 |
|
272 |
|
273 |
hn_filter = pd.DataFrame(
|
@@ -288,18 +278,16 @@ hn_filter = pd.DataFrame(
|
|
288 |
"0.34%",
|
289 |
],
|
290 |
"Percent Removed After Local Dedup": [
|
291 |
-
"",
|
292 |
],
|
293 |
"Total Percentage Remaining": [
|
294 |
-
"%",
|
295 |
],
|
296 |
}
|
297 |
)
|
298 |
|
299 |
table_html_hn = hn_filter.to_html(index=False, border=0)
|
300 |
-
table_div_hn = Div(
|
301 |
-
NotStr(table_html_hn), style="margin-left: auto; width: 80%; align: center;"
|
302 |
-
)
|
303 |
|
304 |
|
305 |
uirc_filter = pd.DataFrame(
|
@@ -320,18 +308,16 @@ uirc_filter = pd.DataFrame(
|
|
320 |
"1.12%",
|
321 |
],
|
322 |
"Percent Removed After Local Dedup": [
|
323 |
-
"",
|
324 |
],
|
325 |
"Total Percentage Remaining": [
|
326 |
-
"%",
|
327 |
],
|
328 |
}
|
329 |
)
|
330 |
|
331 |
table_html_uirc = uirc_filter.to_html(index=False, border=0)
|
332 |
-
table_div_uirc = Div(
|
333 |
-
NotStr(table_html_uirc), style="margin-left: auto; width: 80%; align: center;"
|
334 |
-
)
|
335 |
|
336 |
up_filter = pd.DataFrame(
|
337 |
{
|
@@ -351,18 +337,16 @@ up_filter = pd.DataFrame(
|
|
351 |
"0.00%",
|
352 |
],
|
353 |
"Percent Removed After Local Dedup": [
|
354 |
-
"",
|
355 |
],
|
356 |
"Total Percentage Remaining": [
|
357 |
-
"%",
|
358 |
],
|
359 |
}
|
360 |
)
|
361 |
|
362 |
table_html_up = up_filter.to_html(index=False, border=0)
|
363 |
-
table_div_up = Div(
|
364 |
-
NotStr(table_html_up), style="margin-left: auto; width: 80%; align: center;"
|
365 |
-
)
|
366 |
|
367 |
se_filter = pd.DataFrame(
|
368 |
{
|
@@ -382,18 +366,16 @@ se_filter = pd.DataFrame(
|
|
382 |
"0.00%",
|
383 |
],
|
384 |
"Percent Removed After Local Dedup": [
|
385 |
-
"",
|
386 |
],
|
387 |
"Total Percentage Remaining": [
|
388 |
-
"%",
|
389 |
],
|
390 |
}
|
391 |
)
|
392 |
|
393 |
table_html_se = se_filter.to_html(index=False, border=0)
|
394 |
-
table_div_se = Div(
|
395 |
-
NotStr(table_html_se), style="margin-left: auto; width: 80%; align: center;"
|
396 |
-
)
|
397 |
|
398 |
arx_filter = pd.DataFrame(
|
399 |
{
|
@@ -413,18 +395,16 @@ arx_filter = pd.DataFrame(
|
|
413 |
"0.07%",
|
414 |
],
|
415 |
"Percent Removed After Local Dedup": [
|
416 |
-
"",
|
417 |
],
|
418 |
"Total Percentage Remaining": [
|
419 |
-
"%",
|
420 |
],
|
421 |
}
|
422 |
)
|
423 |
|
424 |
table_html_arx = arx_filter.to_html(index=False, border=0)
|
425 |
-
table_div_arx = Div(
|
426 |
-
NotStr(table_html_arx), style="margin-left: auto; width: 80%; align: center;"
|
427 |
-
)
|
428 |
|
429 |
s2o_filter = pd.DataFrame(
|
430 |
{
|
@@ -444,18 +424,16 @@ s2o_filter = pd.DataFrame(
|
|
444 |
"0.00%",
|
445 |
],
|
446 |
"Percent Removed After Local Dedup": [
|
447 |
-
"",
|
448 |
],
|
449 |
"Total Percentage Remaining": [
|
450 |
-
"%",
|
451 |
],
|
452 |
}
|
453 |
)
|
454 |
|
455 |
table_html_s2o = s2o_filter.to_html(index=False, border=0)
|
456 |
-
table_div_s2o = Div(
|
457 |
-
NotStr(table_html_s2o), style="margin-left: auto; width: 80%; align: center;"
|
458 |
-
)
|
459 |
|
460 |
med_filter = pd.DataFrame(
|
461 |
{
|
@@ -475,18 +453,16 @@ med_filter = pd.DataFrame(
|
|
475 |
"0.02%",
|
476 |
],
|
477 |
"Percent Removed After Local Dedup": [
|
478 |
-
"",
|
479 |
],
|
480 |
"Total Percentage Remaining": [
|
481 |
-
"%",
|
482 |
],
|
483 |
}
|
484 |
)
|
485 |
|
486 |
table_html_med = med_filter.to_html(index=False, border=0)
|
487 |
-
table_div_med = Div(
|
488 |
-
NotStr(table_html_med), style="margin-left: auto; width: 80%; align: center;"
|
489 |
-
)
|
490 |
|
491 |
phil_filter = pd.DataFrame(
|
492 |
{
|
@@ -506,18 +482,16 @@ phil_filter = pd.DataFrame(
|
|
506 |
"0.12%",
|
507 |
],
|
508 |
"Percent Removed After Local Dedup": [
|
509 |
-
"",
|
510 |
],
|
511 |
"Total Percentage Remaining": [
|
512 |
-
"%",
|
513 |
],
|
514 |
}
|
515 |
)
|
516 |
|
517 |
table_html_phil = phil_filter.to_html(index=False, border=0)
|
518 |
-
table_div_phil = Div(
|
519 |
-
NotStr(table_html_phil), style="margin-left: auto; width: 80%; align: center;"
|
520 |
-
)
|
521 |
## end individual tables showing filterin
|
522 |
|
523 |
|
|
|
131 |
"0.00%",
|
132 |
],
|
133 |
"Percent Removed After Local Dedup": [
|
134 |
+
"0.31%",
|
135 |
],
|
136 |
"Total Percentage Remaining": [
|
137 |
+
"97.84%",
|
138 |
],
|
139 |
}
|
140 |
)
|
141 |
|
142 |
table_html_wikipedia = wikipedia_filter.to_html(index=False, border=0)
|
143 |
+
table_div_wikipedia = Div(NotStr(table_html_wikipedia))
|
|
|
|
|
144 |
|
145 |
freelaw_filter = pd.DataFrame(
|
146 |
{
|
|
|
169 |
)
|
170 |
|
171 |
table_html_freelaw = freelaw_filter.to_html(index=False, border=0)
|
172 |
+
table_div_freelaw = Div(NotStr(table_html_freelaw))
|
|
|
|
|
173 |
|
174 |
dmm_filter = pd.DataFrame(
|
175 |
{
|
|
|
189 |
"0.00%",
|
190 |
],
|
191 |
"Percent Removed After Local Dedup": [
|
192 |
+
"0.00%",
|
193 |
],
|
194 |
"Total Percentage Remaining": [
|
195 |
+
"100.00%",
|
196 |
],
|
197 |
}
|
198 |
)
|
199 |
|
200 |
table_html_dmm = dmm_filter.to_html(index=False, border=0)
|
201 |
+
table_div_dmm = Div(NotStr(table_html_dmm))
|
|
|
|
|
202 |
|
203 |
|
204 |
uspto_filter = pd.DataFrame(
|
|
|
219 |
"0.01%",
|
220 |
],
|
221 |
"Percent Removed After Local Dedup": [
|
222 |
+
"22.94%",
|
223 |
],
|
224 |
"Total Percentage Remaining": [
|
225 |
+
"75.60%",
|
226 |
],
|
227 |
}
|
228 |
)
|
229 |
|
230 |
table_html_uspto = uspto_filter.to_html(index=False, border=0)
|
231 |
+
table_div_uspto = Div(NotStr(table_html_uspto))
|
|
|
|
|
232 |
|
233 |
pg19_filter = pd.DataFrame(
|
234 |
{
|
|
|
248 |
"0.17%",
|
249 |
],
|
250 |
"Percent Removed After Local Dedup": [
|
251 |
+
"0.80%",
|
252 |
],
|
253 |
"Total Percentage Remaining": [
|
254 |
+
"98.78%",
|
255 |
],
|
256 |
}
|
257 |
)
|
258 |
|
259 |
table_html_pg19 = pg19_filter.to_html(index=False, border=0)
|
260 |
+
table_div_pg19 = Div(NotStr(table_html_pg19))
|
|
|
|
|
261 |
|
262 |
|
263 |
hn_filter = pd.DataFrame(
|
|
|
278 |
"0.34%",
|
279 |
],
|
280 |
"Percent Removed After Local Dedup": [
|
281 |
+
"61.84%",
|
282 |
],
|
283 |
"Total Percentage Remaining": [
|
284 |
+
"37.03%",
|
285 |
],
|
286 |
}
|
287 |
)
|
288 |
|
289 |
table_html_hn = hn_filter.to_html(index=False, border=0)
|
290 |
+
table_div_hn = Div(NotStr(table_html_hn))
|
|
|
|
|
291 |
|
292 |
|
293 |
uirc_filter = pd.DataFrame(
|
|
|
308 |
"1.12%",
|
309 |
],
|
310 |
"Percent Removed After Local Dedup": [
|
311 |
+
"0.66%",
|
312 |
],
|
313 |
"Total Percentage Remaining": [
|
314 |
+
"60.72%",
|
315 |
],
|
316 |
}
|
317 |
)
|
318 |
|
319 |
table_html_uirc = uirc_filter.to_html(index=False, border=0)
|
320 |
+
table_div_uirc = Div(NotStr(table_html_uirc))
|
|
|
|
|
321 |
|
322 |
up_filter = pd.DataFrame(
|
323 |
{
|
|
|
337 |
"0.00%",
|
338 |
],
|
339 |
"Percent Removed After Local Dedup": [
|
340 |
+
"1.00%",
|
341 |
],
|
342 |
"Total Percentage Remaining": [
|
343 |
+
"99.00%",
|
344 |
],
|
345 |
}
|
346 |
)
|
347 |
|
348 |
table_html_up = up_filter.to_html(index=False, border=0)
|
349 |
+
table_div_up = Div(NotStr(table_html_up))
|
|
|
|
|
350 |
|
351 |
se_filter = pd.DataFrame(
|
352 |
{
|
|
|
366 |
"0.00%",
|
367 |
],
|
368 |
"Percent Removed After Local Dedup": [
|
369 |
+
"0.00%",
|
370 |
],
|
371 |
"Total Percentage Remaining": [
|
372 |
+
"100.00%",
|
373 |
],
|
374 |
}
|
375 |
)
|
376 |
|
377 |
table_html_se = se_filter.to_html(index=False, border=0)
|
378 |
+
table_div_se = Div(NotStr(table_html_se))
|
|
|
|
|
379 |
|
380 |
arx_filter = pd.DataFrame(
|
381 |
{
|
|
|
395 |
"0.07%",
|
396 |
],
|
397 |
"Percent Removed After Local Dedup": [
|
398 |
+
"0.00%",
|
399 |
],
|
400 |
"Total Percentage Remaining": [
|
401 |
+
"92.20%",
|
402 |
],
|
403 |
}
|
404 |
)
|
405 |
|
406 |
table_html_arx = arx_filter.to_html(index=False, border=0)
|
407 |
+
table_div_arx = Div(NotStr(table_html_arx))
|
|
|
|
|
408 |
|
409 |
s2o_filter = pd.DataFrame(
|
410 |
{
|
|
|
424 |
"0.00%",
|
425 |
],
|
426 |
"Percent Removed After Local Dedup": [
|
427 |
+
"0.00%",
|
428 |
],
|
429 |
"Total Percentage Remaining": [
|
430 |
+
"100.00%",
|
431 |
],
|
432 |
}
|
433 |
)
|
434 |
|
435 |
table_html_s2o = s2o_filter.to_html(index=False, border=0)
|
436 |
+
table_div_s2o = Div(NotStr(table_html_s2o))
|
|
|
|
|
437 |
|
438 |
med_filter = pd.DataFrame(
|
439 |
{
|
|
|
453 |
"0.02%",
|
454 |
],
|
455 |
"Percent Removed After Local Dedup": [
|
456 |
+
"0.00%",
|
457 |
],
|
458 |
"Total Percentage Remaining": [
|
459 |
+
"91.14%",
|
460 |
],
|
461 |
}
|
462 |
)
|
463 |
|
464 |
table_html_med = med_filter.to_html(index=False, border=0)
|
465 |
+
table_div_med = Div(NotStr(table_html_med))
|
|
|
|
|
466 |
|
467 |
phil_filter = pd.DataFrame(
|
468 |
{
|
|
|
482 |
"0.12%",
|
483 |
],
|
484 |
"Percent Removed After Local Dedup": [
|
485 |
+
"0.00%",
|
486 |
],
|
487 |
"Total Percentage Remaining": [
|
488 |
+
"79.22%",
|
489 |
],
|
490 |
}
|
491 |
)
|
492 |
|
493 |
table_html_phil = phil_filter.to_html(index=False, border=0)
|
494 |
+
table_div_phil = Div(NotStr(table_html_phil))
|
|
|
|
|
495 |
## end individual tables showing filterin
|
496 |
|
497 |
|
main.py
CHANGED
@@ -757,7 +757,7 @@ dataset_sources = pd.DataFrame(
|
|
757 |
"StackExchange",
|
758 |
],
|
759 |
"Raw Data Size": [
|
760 |
-
"
|
761 |
"712 GB",
|
762 |
"210 GB",
|
763 |
"23 GB",
|
@@ -770,7 +770,7 @@ dataset_sources = pd.DataFrame(
|
|
770 |
"45 GB",
|
771 |
],
|
772 |
"Token Count": [
|
773 |
-
"
|
774 |
"154.96B",
|
775 |
"4.75B",
|
776 |
"7.34B",
|
|
|
757 |
"StackExchange",
|
758 |
],
|
759 |
"Raw Data Size": [
|
760 |
+
"9.2 TB",
|
761 |
"712 GB",
|
762 |
"210 GB",
|
763 |
"23 GB",
|
|
|
770 |
"45 GB",
|
771 |
],
|
772 |
"Token Count": [
|
773 |
+
"4.83T",
|
774 |
"154.96B",
|
775 |
"4.75B",
|
776 |
"7.34B",
|
web.py
CHANGED
@@ -376,6 +376,7 @@ def web_data():
|
|
376 |
return Div(
|
377 |
Section(
|
378 |
Div(
|
|
|
379 |
H2("Common Crawl Snapshot Processing"),
|
380 |
H3("What This Section Contains"),
|
381 |
P("This section provides a complete discussion on the filtering applied to the 99 Common Crawl snapshots that comprise the web data section of TxT360. The section is split into the following topic areas: "),
|
|
|
376 |
return Div(
|
377 |
Section(
|
378 |
Div(
|
379 |
+
H1("Web Data Processing"),
|
380 |
H2("Common Crawl Snapshot Processing"),
|
381 |
H3("What This Section Contains"),
|
382 |
P("This section provides a complete discussion on the filtering applied to the 99 Common Crawl snapshots that comprise the web data section of TxT360. The section is split into the following topic areas: "),
|