Spaces:
Sleeping
Sleeping
✨ option to drop stopwords pre-summ
Browse files
Signed-off-by: peter szemraj <[email protected]>
app.py
CHANGED
@@ -32,6 +32,7 @@ os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
|
32 |
logging.basicConfig(
|
33 |
level=logging.INFO,
|
34 |
format="%(asctime)s [%(levelname)s] %(name)s - %(message)s",
|
|
|
35 |
)
|
36 |
|
37 |
import gradio as gr
|
@@ -50,6 +51,7 @@ from utils import (
|
|
50 |
saves_summary,
|
51 |
textlist2html,
|
52 |
truncate_word_count,
|
|
|
53 |
)
|
54 |
|
55 |
_here = Path(__file__).parent
|
@@ -194,6 +196,7 @@ def proc_submission(
|
|
194 |
length_penalty: float,
|
195 |
repetition_penalty: float,
|
196 |
no_repeat_ngram_size: int,
|
|
|
197 |
max_input_length: int = 6144,
|
198 |
):
|
199 |
"""
|
@@ -230,11 +233,14 @@ def proc_submission(
|
|
230 |
"do_sample": False,
|
231 |
}
|
232 |
max_input_length = int(os.environ.get("APP_MAX_WORDS", max_input_length))
|
233 |
-
logging.info(
|
|
|
|
|
234 |
|
235 |
st = time.perf_counter()
|
236 |
history = {}
|
237 |
clean_text = clean(input_text, lower=False)
|
|
|
238 |
processed = truncate_word_count(clean_text, max_words=max_input_length)
|
239 |
|
240 |
if processed["was_truncated"]:
|
@@ -296,6 +302,7 @@ def proc_submission(
|
|
296 |
html += ""
|
297 |
|
298 |
# save to file
|
|
|
299 |
settings["model_name"] = model_name
|
300 |
saved_file = saves_summary(summarize_output=_summaries, outpath=None, **settings)
|
301 |
return html, full_summary, scores_out, saved_file
|
@@ -607,6 +614,10 @@ if __name__ == "__main__":
|
|
607 |
label="no repeat ngram size",
|
608 |
value=3,
|
609 |
)
|
|
|
|
|
|
|
|
|
610 |
with gr.Column():
|
611 |
gr.Markdown("## About")
|
612 |
gr.Markdown(
|
@@ -638,6 +649,7 @@ if __name__ == "__main__":
|
|
638 |
length_penalty,
|
639 |
repetition_penalty,
|
640 |
no_repeat_ngram_size,
|
|
|
641 |
],
|
642 |
outputs=[output_text, summary_text, summary_scores, text_file],
|
643 |
)
|
|
|
32 |
logging.basicConfig(
|
33 |
level=logging.INFO,
|
34 |
format="%(asctime)s [%(levelname)s] %(name)s - %(message)s",
|
35 |
+
datefmt="%Y-%b-%d %H:%M:%S",
|
36 |
)
|
37 |
|
38 |
import gradio as gr
|
|
|
51 |
saves_summary,
|
52 |
textlist2html,
|
53 |
truncate_word_count,
|
54 |
+
remove_stopwords,
|
55 |
)
|
56 |
|
57 |
_here = Path(__file__).parent
|
|
|
196 |
length_penalty: float,
|
197 |
repetition_penalty: float,
|
198 |
no_repeat_ngram_size: int,
|
199 |
+
predrop_stopwords: bool,
|
200 |
max_input_length: int = 6144,
|
201 |
):
|
202 |
"""
|
|
|
233 |
"do_sample": False,
|
234 |
}
|
235 |
max_input_length = int(os.environ.get("APP_MAX_WORDS", max_input_length))
|
236 |
+
logging.info(
|
237 |
+
f"max_input_length set to: {max_input_length}. pre-drop stopwords: {predrop_stopwords}"
|
238 |
+
)
|
239 |
|
240 |
st = time.perf_counter()
|
241 |
history = {}
|
242 |
clean_text = clean(input_text, lower=False)
|
243 |
+
clean_text = remove_stopwords(clean_text) if predrop_stopwords else clean_text
|
244 |
processed = truncate_word_count(clean_text, max_words=max_input_length)
|
245 |
|
246 |
if processed["was_truncated"]:
|
|
|
302 |
html += ""
|
303 |
|
304 |
# save to file
|
305 |
+
settings["remove_stopwords"] = predrop_stopwords
|
306 |
settings["model_name"] = model_name
|
307 |
saved_file = saves_summary(summarize_output=_summaries, outpath=None, **settings)
|
308 |
return html, full_summary, scores_out, saved_file
|
|
|
614 |
label="no repeat ngram size",
|
615 |
value=3,
|
616 |
)
|
617 |
+
predrop_stopwords = gr.Checkbox(
|
618 |
+
label="Drop Stopwords (Pre-Truncation)",
|
619 |
+
value=False,
|
620 |
+
)
|
621 |
with gr.Column():
|
622 |
gr.Markdown("## About")
|
623 |
gr.Markdown(
|
|
|
649 |
length_penalty,
|
650 |
repetition_penalty,
|
651 |
no_repeat_ngram_size,
|
652 |
+
predrop_stopwords,
|
653 |
],
|
654 |
outputs=[output_text, summary_text, summary_scores, text_file],
|
655 |
)
|
utils.py
CHANGED
@@ -19,34 +19,74 @@ logging.basicConfig(
|
|
19 |
|
20 |
import torch
|
21 |
from natsort import natsorted
|
22 |
-
from nltk.tokenize import word_tokenize
|
23 |
from rapidfuzz import fuzz
|
24 |
|
25 |
-
# Define stopwords
|
26 |
STOPWORDS = set(
|
27 |
"a about above after again against all am an and any are aren't as at be because been before being below between both but by can't cannot could couldn't did didn't do does doesn't doing don't down during each few for from further had hadn't has hasn't have haven't having he he'd he'll he's her here here's hers herself him himself his how how's i i'd i'll i'm i've if in into is isn't it it's its itself let's me more most mustn't my myself no nor not of off on once only or other ought our ours ourselves out over own same shan't she she'd she'll she's should shouldn't so some such than that that's the their theirs them themselves then there there's these they they'd they'll they're they've this those through to too under until up very was wasn't we we'd we'll we're we've were weren't what what's when when's where where's which while who who's whom why why's with won't would wouldn't you you'd you'll you're you've your yours yourself yourselves".split()
|
28 |
)
|
29 |
|
30 |
|
31 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
"""
|
33 |
-
remove_stopwords - Remove stopwords from
|
34 |
|
35 |
-
:param str text: text
|
36 |
-
:param
|
37 |
-
:
|
|
|
38 |
"""
|
39 |
-
words = word_tokenize(text)
|
40 |
-
filtered_words = []
|
41 |
|
|
|
42 |
for word in words:
|
43 |
-
|
|
|
44 |
|
45 |
if word.lower() not in stopwords:
|
46 |
filtered_words.append(word)
|
47 |
|
48 |
filtered_text = " ".join(filtered_words)
|
49 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
return filtered_text
|
51 |
|
52 |
|
|
|
19 |
|
20 |
import torch
|
21 |
from natsort import natsorted
|
22 |
+
from nltk.tokenize import word_tokenize, WhitespaceTokenizer
|
23 |
from rapidfuzz import fuzz
|
24 |
|
|
|
25 |
STOPWORDS = set(
|
26 |
"a about above after again against all am an and any are aren't as at be because been before being below between both but by can't cannot could couldn't did didn't do does doesn't doing don't down during each few for from further had hadn't has hasn't have haven't having he he'd he'll he's her here here's hers herself him himself his how how's i i'd i'll i'm i've if in into is isn't it it's its itself let's me more most mustn't my myself no nor not of off on once only or other ought our ours ourselves out over own same shan't she she'd she'll she's should shouldn't so some such than that that's the their theirs them themselves then there there's these they they'd they'll they're they've this those through to too under until up very was wasn't we we'd we'll we're we've were weren't what what's when when's where where's which while who who's whom why why's with won't would wouldn't you you'd you'll you're you've your yours yourself yourselves".split()
|
27 |
)
|
28 |
|
29 |
|
30 |
+
def custom_tokenize(text: str) -> List[str]:
    """
    custom_tokenize - split text on whitespace, keeping words that contain
    apostrophes (e.g. "don't", "you're") as single tokens.

    The earlier implementation accumulated every apostrophe-containing token
    into one buffer, so consecutive contractions were glued together without
    a separator ("you're we're" -> "you'rewe're"). Splitting on whitespace
    already leaves each contraction intact, so no merge step is needed.

    :param str text: input text
    :return List[str]: tokens in their original order; empty list for empty
        or whitespace-only input
    """
    # str.split() with no arguments splits on runs of whitespace and never
    # yields empty strings, so leading/trailing/multiple spaces are harmless.
    return text.split()
|
56 |
+
|
57 |
+
|
58 |
+
def remove_stopwords(
    text: str, stopwords: List[str] = STOPWORDS, use_custom_tokenize: bool = True
) -> str:
    """
    remove_stopwords - Drop stopwords from a piece of text.

    Each token has leading/trailing punctuation trimmed and is then compared,
    lowercased, against ``stopwords``; tokens that match are discarded.

    :param str text: input text
    :param List[str] stopwords: stopwords to drop (matched against the
        lowercased token), defaults to STOPWORDS
    :param bool use_custom_tokenize: tokenize with custom_tokenize when True,
        otherwise with word_tokenize, defaults to True
    :return str: text with stopwords removed
    """
    if use_custom_tokenize:
        tokens = custom_tokenize(text)
    else:
        tokens = word_tokenize(text)

    # Trim punctuation from both ends of every token before the lookup
    trimmed = (token.strip(string.punctuation) for token in tokens)
    kept = [token for token in trimmed if token.lower() not in stopwords]

    # Re-join, squeeze whitespace runs down to one space, and trim the ends
    result = re.sub(r"\s+", " ", " ".join(kept)).strip()

    # Remove any whitespace sitting directly next to a punctuation character
    punct_pattern = r"\s*([{}])\s*".format(re.escape(string.punctuation))
    return re.sub(punct_pattern, r"\1", result)
|
91 |
|
92 |
|