Spaces:
Sleeping
Sleeping
⚰️ 🎨 clean up and rm verbose testing code
Browse files
Signed-off-by: peter szemraj <[email protected]>
- aggregate.py +1 -1
- app.py +2 -23
- utils.py +1 -1
aggregate.py
CHANGED
@@ -7,8 +7,8 @@ How it works:
|
|
7 |
2. The language model does it.
|
8 |
3. Yaay!
|
9 |
"""
|
10 |
-
import pprint as pp
|
11 |
import logging
|
|
|
12 |
import time
|
13 |
|
14 |
import torch
|
|
|
7 |
2. The language model does it.
|
8 |
3. Yaay!
|
9 |
"""
|
|
|
10 |
import logging
|
11 |
+
import pprint as pp
|
12 |
import time
|
13 |
|
14 |
import torch
|
app.py
CHANGED
@@ -19,9 +19,9 @@ import contextlib
|
|
19 |
import gc
|
20 |
import logging
|
21 |
import os
|
|
|
22 |
import random
|
23 |
import re
|
24 |
-
import pprint as pp
|
25 |
import sys
|
26 |
import time
|
27 |
from pathlib import Path
|
@@ -47,13 +47,12 @@ from summarize import load_model_and_tokenizer, summarize_via_tokenbatches
|
|
47 |
from utils import (
|
48 |
contraction_aware_tokenize,
|
49 |
extract_batches,
|
50 |
-
extract_keywords,
|
51 |
load_example_filenames,
|
52 |
remove_stagnant_files,
|
|
|
53 |
saves_summary,
|
54 |
textlist2html,
|
55 |
truncate_word_count,
|
56 |
-
remove_stopwords,
|
57 |
)
|
58 |
|
59 |
_here = Path(__file__).parent
|
@@ -268,22 +267,6 @@ def proc_submission(
|
|
268 |
model_input_text = truncation_validated["processed_text"]
|
269 |
msg = None
|
270 |
|
271 |
-
if predrop_stopwords:
|
272 |
-
# TODO: remove this
|
273 |
-
|
274 |
-
outdir = Path.cwd() / "scratch" / "predrop_stopwords-v4"
|
275 |
-
outdir.mkdir(parents=True, exist_ok=True)
|
276 |
-
keywords_cln = " ".join(extract_keywords(cln_text, kw_max_len=4))
|
277 |
-
keywords_sw_removed = "_".join(extract_keywords(model_input_text, kw_max_len=4))
|
278 |
-
cln_filename = f"{keywords_cln}_{len(cln_text)}.txt"
|
279 |
-
cln_outdir = outdir.parent / "source-text"
|
280 |
-
cln_outdir.mkdir(parents=True, exist_ok=True)
|
281 |
-
with open(cln_outdir / cln_filename, "w", encoding="utf-8") as f:
|
282 |
-
f.write(cln_text)
|
283 |
-
sw_rm_filename = f"{keywords_sw_removed}_{len(model_input_text)}.txt"
|
284 |
-
with open(outdir / sw_rm_filename, "w", encoding="utf-8") as f:
|
285 |
-
f.write(model_input_text)
|
286 |
-
logging.info(f"saved predrop_stopwords file to {outdir / sw_rm_filename}")
|
287 |
if len(input_text) < 50:
|
288 |
# this is essentially a different case from the above
|
289 |
msg = f"""
|
@@ -326,7 +309,6 @@ def proc_submission(
|
|
326 |
|
327 |
html += ""
|
328 |
|
329 |
-
# save to file
|
330 |
settings["remove_stopwords"] = predrop_stopwords
|
331 |
settings["model_name"] = model_name
|
332 |
saved_file = saves_summary(summarize_output=_summaries, outpath=None, **settings)
|
@@ -460,9 +442,6 @@ def parse_args():
|
|
460 |
choices=["DEBUG", "INFO", "WARNING", "ERROR"],
|
461 |
help="Set the logging level",
|
462 |
)
|
463 |
-
# if "--help" in sys.argv or "-h" in sys.argv:
|
464 |
-
# parser.print_help()
|
465 |
-
# sys.exit(0)
|
466 |
|
467 |
return parser.parse_args()
|
468 |
|
|
|
19 |
import gc
|
20 |
import logging
|
21 |
import os
|
22 |
+
import pprint as pp
|
23 |
import random
|
24 |
import re
|
|
|
25 |
import sys
|
26 |
import time
|
27 |
from pathlib import Path
|
|
|
47 |
from utils import (
|
48 |
contraction_aware_tokenize,
|
49 |
extract_batches,
|
|
|
50 |
load_example_filenames,
|
51 |
remove_stagnant_files,
|
52 |
+
remove_stopwords,
|
53 |
saves_summary,
|
54 |
textlist2html,
|
55 |
truncate_word_count,
|
|
|
56 |
)
|
57 |
|
58 |
_here = Path(__file__).parent
|
|
|
267 |
model_input_text = truncation_validated["processed_text"]
|
268 |
msg = None
|
269 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
270 |
if len(input_text) < 50:
|
271 |
# this is essentially a different case from the above
|
272 |
msg = f"""
|
|
|
309 |
|
310 |
html += ""
|
311 |
|
|
|
312 |
settings["remove_stopwords"] = predrop_stopwords
|
313 |
settings["model_name"] = model_name
|
314 |
saved_file = saves_summary(summarize_output=_summaries, outpath=None, **settings)
|
|
|
442 |
choices=["DEBUG", "INFO", "WARNING", "ERROR"],
|
443 |
help="Set the logging level",
|
444 |
)
|
|
|
|
|
|
|
445 |
|
446 |
return parser.parse_args()
|
447 |
|
utils.py
CHANGED
@@ -19,7 +19,7 @@ logging.basicConfig(
|
|
19 |
|
20 |
import torch
|
21 |
from natsort import natsorted
|
22 |
-
from nltk.tokenize import
|
23 |
from rapidfuzz import fuzz
|
24 |
|
25 |
STOPWORDS = set(
|
|
|
19 |
|
20 |
import torch
|
21 |
from natsort import natsorted
|
22 |
+
from nltk.tokenize import WhitespaceTokenizer, sent_tokenize, word_tokenize
|
23 |
from rapidfuzz import fuzz
|
24 |
|
25 |
STOPWORDS = set(
|