Spaces:
Sleeping
Sleeping
✨ enable new checkpoints
Browse files
Signed-off-by: peter szemraj <[email protected]>
app.py
CHANGED
@@ -1,33 +1,70 @@
|
|
1 |
-
import os
|
2 |
import contextlib
|
3 |
import logging
|
|
|
4 |
import random
|
5 |
import re
|
6 |
import time
|
7 |
from pathlib import Path
|
8 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
import gradio as gr
|
10 |
import nltk
|
|
|
11 |
from cleantext import clean
|
12 |
from doctr.io import DocumentFile
|
13 |
from doctr.models import ocr_predictor
|
14 |
-
from pdf2text import convert_PDF_to_Text
|
15 |
|
|
|
16 |
from summarize import load_model_and_tokenizer, summarize_via_tokenbatches
|
17 |
-
from utils import load_example_filenames,
|
18 |
|
19 |
_here = Path(__file__).parent
|
20 |
|
21 |
nltk.download("stopwords") # TODO=find where this requirement originates from
|
22 |
|
23 |
-
|
24 |
-
|
25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
|
27 |
|
28 |
def proc_submission(
|
29 |
input_text: str,
|
30 |
-
|
31 |
num_beams,
|
32 |
token_batch_length,
|
33 |
length_penalty,
|
@@ -40,7 +77,7 @@ def proc_submission(
|
|
40 |
|
41 |
Args:
|
42 |
input_text (str): the input text to summarize
|
43 |
-
|
44 |
num_beams (int): the number of beams to use
|
45 |
token_batch_length (int): the length of the token batches to use
|
46 |
length_penalty (float): the length penalty to use
|
@@ -66,7 +103,7 @@ def proc_submission(
|
|
66 |
st = time.perf_counter()
|
67 |
history = {}
|
68 |
clean_text = clean(input_text, lower=False)
|
69 |
-
max_input_length = 2048 if "base" in
|
70 |
processed = truncate_word_count(clean_text, max_input_length)
|
71 |
|
72 |
if processed["was_truncated"]:
|
@@ -100,14 +137,13 @@ def proc_submission(
|
|
100 |
|
101 |
return msg, "", []
|
102 |
|
103 |
-
_summaries =
|
104 |
-
tr_in,
|
105 |
-
|
106 |
-
|
107 |
-
batch_length=token_batch_length,
|
108 |
**settings,
|
109 |
)
|
110 |
-
sum_text = [f"Section {i}
|
111 |
sum_scores = [
|
112 |
f" - Section {i}: {round(s['summary_score'],4)}"
|
113 |
for i, s in enumerate(_summaries)
|
@@ -204,18 +240,6 @@ def load_uploaded_file(file_obj, max_pages=20):
|
|
204 |
|
205 |
if __name__ == "__main__":
|
206 |
logging.info("Starting app instance")
|
207 |
-
os.environ[
|
208 |
-
"TOKENIZERS_PARALLELISM"
|
209 |
-
] = "false" # parallelism on tokenizers is buggy with gradio
|
210 |
-
logging.info("Loading summ models")
|
211 |
-
with contextlib.redirect_stdout(None):
|
212 |
-
model, tokenizer = load_model_and_tokenizer(
|
213 |
-
"pszemraj/pegasus-x-large-book-summary"
|
214 |
-
)
|
215 |
-
model_sm, tokenizer_sm = load_model_and_tokenizer(
|
216 |
-
"pszemraj/long-t5-tglobal-base-16384-book-summary"
|
217 |
-
)
|
218 |
-
|
219 |
logging.info("Loading OCR model")
|
220 |
with contextlib.redirect_stdout(None):
|
221 |
ocr_model = ocr_predictor(
|
@@ -229,24 +253,19 @@ if __name__ == "__main__":
|
|
229 |
demo = gr.Blocks()
|
230 |
_examples = list(name_to_path.keys())
|
231 |
with demo:
|
232 |
-
|
233 |
gr.Markdown("# Document Summarization with Long-Document Transformers")
|
234 |
gr.Markdown(
|
235 |
"This is an example use case for fine-tuned long document transformers. The model is trained on book summaries (via the BookSum dataset). The models in this demo are [LongT5-base](https://huggingface.co/pszemraj/long-t5-tglobal-base-16384-book-summary) and [Pegasus-X-Large](https://huggingface.co/pszemraj/pegasus-x-large-book-summary)."
|
236 |
)
|
237 |
with gr.Column():
|
238 |
-
|
239 |
gr.Markdown("## Load Inputs & Select Parameters")
|
240 |
gr.Markdown(
|
241 |
"Enter text below in the text area. The text will be summarized [using the selected parameters](https://huggingface.co/blog/how-to-generate). Optionally load an example below or upload a file. (`.txt` or `.pdf` - _[link to guide](https://i.imgur.com/c6Cs9ly.png)_)"
|
242 |
)
|
243 |
with gr.Row(variant="compact"):
|
244 |
with gr.Column(scale=0.5, variant="compact"):
|
245 |
-
|
246 |
-
|
247 |
-
choices=["LongT5-base", "Pegasus-X-large"],
|
248 |
-
label="Model Variant",
|
249 |
-
value="LongT5-base",
|
250 |
)
|
251 |
num_beams = gr.Radio(
|
252 |
choices=[2, 3, 4],
|
@@ -336,7 +355,7 @@ if __name__ == "__main__":
|
|
336 |
value=3,
|
337 |
)
|
338 |
with gr.Column():
|
339 |
-
gr.Markdown("### About
|
340 |
gr.Markdown(
|
341 |
"These models are fine-tuned on the [BookSum dataset](https://arxiv.org/abs/2105.08209).The goal was to create a model that can generalize well and is useful in summarizing lots of text in academic and daily usage."
|
342 |
)
|
@@ -354,7 +373,7 @@ if __name__ == "__main__":
|
|
354 |
fn=proc_submission,
|
355 |
inputs=[
|
356 |
input_text,
|
357 |
-
|
358 |
num_beams,
|
359 |
token_batch_length,
|
360 |
length_penalty,
|
|
|
|
|
1 |
import contextlib
|
2 |
import logging
|
3 |
+
import os
|
4 |
import random
|
5 |
import re
|
6 |
import time
|
7 |
from pathlib import Path
|
8 |
|
9 |
+
os.environ["USE_TORCH"] = "1"
|
10 |
+
os.environ[
|
11 |
+
"TOKENIZERS_PARALLELISM"
|
12 |
+
] = "false" # parallelism on tokenizers is buggy with gradio
|
13 |
+
|
14 |
+
logging.basicConfig(
|
15 |
+
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
|
16 |
+
)
|
17 |
+
|
18 |
import gradio as gr
|
19 |
import nltk
|
20 |
+
import torch
|
21 |
from cleantext import clean
|
22 |
from doctr.io import DocumentFile
|
23 |
from doctr.models import ocr_predictor
|
|
|
24 |
|
25 |
+
from pdf2text import convert_PDF_to_Text
|
26 |
from summarize import load_model_and_tokenizer, summarize_via_tokenbatches
|
27 |
+
from utils import load_example_filenames, saves_summary, truncate_word_count
|
28 |
|
29 |
_here = Path(__file__).parent
|
30 |
|
31 |
nltk.download("stopwords") # TODO=find where this requirement originates from
|
32 |
|
33 |
+
|
34 |
+
MODEL_OPTIONS = [
|
35 |
+
"pszemraj/long-t5-tglobal-base-16384-book-summary",
|
36 |
+
"pszemraj/long-t5-tglobal-base-sci-simplify",
|
37 |
+
"pszemraj/long-t5-tglobal-base-sci-simplify-elife",
|
38 |
+
"pszemraj/long-t5-tglobal-base-16384-booksci-summary-v1",
|
39 |
+
"pszemraj/pegasus-x-large-book-summary",
|
40 |
+
]
|
41 |
+
|
42 |
+
|
43 |
+
def predict(
    input_text: str,
    model_name: str,
    token_batch_length: int = 1024,
    empty_cache: bool = True,
    **settings,
):
    """Summarize ``input_text`` with the checkpoint identified by ``model_name``.

    The model and tokenizer are obtained on every call through
    ``load_model_and_tokenizer(model_name)``, which lets one entry point serve
    several checkpoints.

    Args:
        input_text (str): the text to summarize
        model_name (str): identifier passed to ``load_model_and_tokenizer``
        token_batch_length (int): forwarded to ``summarize_via_tokenbatches``
            as ``batch_length``
        empty_cache (bool): when True and CUDA is available, call
            ``torch.cuda.empty_cache()`` before loading the model
        **settings: extra keyword arguments passed through unchanged to
            ``summarize_via_tokenbatches``

    Returns:
        whatever ``summarize_via_tokenbatches`` returns
        (NOTE(review): the caller indexes each item with ``"summary"`` and
        ``"summary_score"`` - confirm against ``summarize.py``)
    """
    # Probe CUDA first, then honour the flag (same evaluation order as before).
    if torch.cuda.is_available():
        if empty_cache:
            torch.cuda.empty_cache()

    loaded_pair = load_model_and_tokenizer(model_name)
    model, tokenizer = loaded_pair

    return summarize_via_tokenbatches(
        input_text,
        model,
        tokenizer,
        batch_length=token_batch_length,
        **settings,
    )
|
63 |
|
64 |
|
65 |
def proc_submission(
|
66 |
input_text: str,
|
67 |
+
model_name: str,
|
68 |
num_beams,
|
69 |
token_batch_length,
|
70 |
length_penalty,
|
|
|
77 |
|
78 |
Args:
|
79 |
input_text (str): the input text to summarize
|
80 |
+
model_name (str): the hf model tag of the model to use
|
81 |
num_beams (int): the number of beams to use
|
82 |
token_batch_length (int): the length of the token batches to use
|
83 |
length_penalty (float): the length penalty to use
|
|
|
103 |
st = time.perf_counter()
|
104 |
history = {}
|
105 |
clean_text = clean(input_text, lower=False)
|
106 |
+
max_input_length = 2048 if "base" in model_name.lower() else max_input_length
|
107 |
processed = truncate_word_count(clean_text, max_input_length)
|
108 |
|
109 |
if processed["was_truncated"]:
|
|
|
137 |
|
138 |
return msg, "", []
|
139 |
|
140 |
+
_summaries = predict(
|
141 |
+
input_text=tr_in,
|
142 |
+
model_name=model_name,
|
143 |
+
token_batch_length=token_batch_length,
|
|
|
144 |
**settings,
|
145 |
)
|
146 |
+
sum_text = [f"Section {i}:\n\t" + s["summary"][0] for i, s in enumerate(_summaries)]
|
147 |
sum_scores = [
|
148 |
f" - Section {i}: {round(s['summary_score'],4)}"
|
149 |
for i, s in enumerate(_summaries)
|
|
|
240 |
|
241 |
if __name__ == "__main__":
|
242 |
logging.info("Starting app instance")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
243 |
logging.info("Loading OCR model")
|
244 |
with contextlib.redirect_stdout(None):
|
245 |
ocr_model = ocr_predictor(
|
|
|
253 |
demo = gr.Blocks()
|
254 |
_examples = list(name_to_path.keys())
|
255 |
with demo:
|
|
|
256 |
gr.Markdown("# Document Summarization with Long-Document Transformers")
|
257 |
gr.Markdown(
|
258 |
"This is an example use case for fine-tuned long document transformers. The model is trained on book summaries (via the BookSum dataset). The models in this demo are [LongT5-base](https://huggingface.co/pszemraj/long-t5-tglobal-base-16384-book-summary) and [Pegasus-X-Large](https://huggingface.co/pszemraj/pegasus-x-large-book-summary)."
|
259 |
)
|
260 |
with gr.Column():
|
|
|
261 |
gr.Markdown("## Load Inputs & Select Parameters")
|
262 |
gr.Markdown(
|
263 |
"Enter text below in the text area. The text will be summarized [using the selected parameters](https://huggingface.co/blog/how-to-generate). Optionally load an example below or upload a file. (`.txt` or `.pdf` - _[link to guide](https://i.imgur.com/c6Cs9ly.png)_)"
|
264 |
)
|
265 |
with gr.Row(variant="compact"):
|
266 |
with gr.Column(scale=0.5, variant="compact"):
|
267 |
+
model_name = gr.Dropdown(
|
268 |
+
choices=MODEL_OPTIONS, value=MODEL_OPTIONS[0], label="Model"
|
|
|
|
|
|
|
269 |
)
|
270 |
num_beams = gr.Radio(
|
271 |
choices=[2, 3, 4],
|
|
|
355 |
value=3,
|
356 |
)
|
357 |
with gr.Column():
|
358 |
+
gr.Markdown("### About")
|
359 |
gr.Markdown(
|
360 |
"These models are fine-tuned on the [BookSum dataset](https://arxiv.org/abs/2105.08209).The goal was to create a model that can generalize well and is useful in summarizing lots of text in academic and daily usage."
|
361 |
)
|
|
|
373 |
fn=proc_submission,
|
374 |
inputs=[
|
375 |
input_text,
|
376 |
+
model_name,
|
377 |
num_beams,
|
378 |
token_batch_length,
|
379 |
length_penalty,
|