Spaces:
Sleeping
Sleeping
docs
Browse files
Signed-off-by: peter szemraj <[email protected]>
app.py
CHANGED
@@ -212,6 +212,7 @@ def proc_submission(
|
|
212 |
length_penalty (float): the length penalty to use
|
213 |
repetition_penalty (float): the repetition penalty to use
|
214 |
no_repeat_ngram_size (int): the no repeat ngram size to use
|
|
|
215 |
max_input_length (int, optional): the maximum input length to use. Defaults to 6144.
|
216 |
|
217 |
Note:
|
@@ -219,7 +220,7 @@ def proc_submission(
|
|
219 |
environment variable APP_MAX_WORDS to a different value.
|
220 |
|
221 |
Returns:
|
222 |
-
|
223 |
"""
|
224 |
|
225 |
remove_stagnant_files() # clean up old files
|
@@ -257,7 +258,7 @@ def proc_submission(
|
|
257 |
msg = f"""
|
258 |
<div style="background-color: #FFA500; color: white; padding: 20px;">
|
259 |
<h3>Warning</h3>
|
260 |
-
<p>Input text was truncated to {max_input_length} words. That's about {100*max_input_length/
|
261 |
<p>Dropping stopwords is set to {predrop_stopwords}. If this is not what you intended, please validate the advanced settings.</p>
|
262 |
</div>
|
263 |
"""
|
@@ -267,6 +268,22 @@ def proc_submission(
|
|
267 |
model_input_text = truncation_validated["processed_text"]
|
268 |
msg = None
|
269 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
270 |
if len(input_text) < 50:
|
271 |
# this is essentially a different case from the above
|
272 |
msg = f"""
|
@@ -589,8 +606,8 @@ if __name__ == "__main__":
|
|
589 |
)
|
590 |
gr.Markdown(
|
591 |
f"""Aggregate the above batches into a cohesive summary.
|
592 |
-
-
|
593 |
-
-
|
594 |
"""
|
595 |
)
|
596 |
with gr.Column(variant="panel"):
|
|
|
212 |
length_penalty (float): the length penalty to use
|
213 |
repetition_penalty (float): the repetition penalty to use
|
214 |
no_repeat_ngram_size (int): the no repeat ngram size to use
|
215 |
+
predrop_stopwords (bool): whether to pre-drop stopwords before truncating/summarizing
|
216 |
max_input_length (int, optional): the maximum input length to use. Defaults to 6144.
|
217 |
|
218 |
Note:
|
|
|
220 |
environment variable APP_MAX_WORDS to a different value.
|
221 |
|
222 |
Returns:
|
223 |
+
tuple (4): a tuple containing the following:
|
224 |
"""
|
225 |
|
226 |
remove_stagnant_files() # clean up old files
|
|
|
258 |
msg = f"""
|
259 |
<div style="background-color: #FFA500; color: white; padding: 20px;">
|
260 |
<h3>Warning</h3>
|
261 |
+
<p>Input text was truncated to {max_input_length} words. That's about {100*max_input_length/input_wc:.2f}% of the original text.</p>
|
262 |
<p>Dropping stopwords is set to {predrop_stopwords}. If this is not what you intended, please validate the advanced settings.</p>
|
263 |
</div>
|
264 |
"""
|
|
|
268 |
model_input_text = truncation_validated["processed_text"]
|
269 |
msg = None
|
270 |
|
271 |
+
if predrop_stopwords:
|
272 |
+
# TODO: remove this
|
273 |
+
|
274 |
+
outdir = Path.cwd() / "scratch" / "predrop_stopwords-v4"
|
275 |
+
outdir.mkdir(parents=True, exist_ok=True)
|
276 |
+
keywords_cln = " ".join(extract_keywords(cln_text, kw_max_len=4))
|
277 |
+
keywords_sw_removed = "_".join(extract_keywords(model_input_text, kw_max_len=4))
|
278 |
+
cln_filename = f"{keywords_cln}_{len(cln_text)}.txt"
|
279 |
+
cln_outdir = outdir.parent / "source-text"
|
280 |
+
cln_outdir.mkdir(parents=True, exist_ok=True)
|
281 |
+
with open(cln_outdir / cln_filename, "w", encoding="utf-8") as f:
|
282 |
+
f.write(cln_text)
|
283 |
+
sw_rm_filename = f"{keywords_sw_removed}_{len(model_input_text)}.txt"
|
284 |
+
with open(outdir / sw_rm_filename, "w", encoding="utf-8") as f:
|
285 |
+
f.write(model_input_text)
|
286 |
+
logging.info(f"saved predrop_stopwords file to {outdir / sw_rm_filename}")
|
287 |
if len(input_text) < 50:
|
288 |
# this is essentially a different case from the above
|
289 |
msg = f"""
|
|
|
606 |
)
|
607 |
gr.Markdown(
|
608 |
f"""Aggregate the above batches into a cohesive summary.
|
609 |
+
- A secondary instruct-tuned LM consolidates info
|
610 |
+
- Current model: [{AGGREGATE_MODEL}](https://hf.co/{AGGREGATE_MODEL})
|
611 |
"""
|
612 |
)
|
613 |
with gr.Column(variant="panel"):
|