Spaces:
Sleeping
Sleeping
🚸 🎨
Browse files
Signed-off-by: peter szemraj <[email protected]>
- app.py +1 -0
- summarize.py +2 -0
- utils.py +3 -3
app.py
CHANGED
@@ -334,6 +334,7 @@ if __name__ == "__main__":
|
|
334 |
uploaded_file = gr.File(
|
335 |
label="File Upload",
|
336 |
file_count="single",
|
|
|
337 |
type="file",
|
338 |
)
|
339 |
with gr.Row():
|
|
|
334 |
uploaded_file = gr.File(
|
335 |
label="File Upload",
|
336 |
file_count="single",
|
337 |
+
file_types=[".txt", ".md", ".pdf"],
|
338 |
type="file",
|
339 |
)
|
340 |
with gr.Row():
|
summarize.py
CHANGED
@@ -114,7 +114,9 @@ def summarize_via_tokenbatches(
|
|
114 |
tokenizer (): the tokenizer to use for summarization
|
115 |
batch_length (int, optional): the length of each batch. Defaults to 2048.
|
116 |
batch_stride (int, optional): the stride of each batch. Defaults to 16. The stride is the number of tokens that overlap between batches.
|
|
|
117 |
|
|
|
118 |
Returns:
|
119 |
list: a list of dictionaries containing the input tokens, the summary, and the summary score
|
120 |
"""
|
|
|
114 |
tokenizer (): the tokenizer to use for summarization
|
115 |
batch_length (int, optional): the length of each batch. Defaults to 2048.
|
116 |
batch_stride (int, optional): the stride of each batch. Defaults to 16. The stride is the number of tokens that overlap between batches.
|
117 |
+
min_batch_length (int, optional): the minimum length of each batch. Defaults to 512.
|
118 |
|
119 |
+
**kwargs: any additional arguments to pass to the model for inference
|
120 |
Returns:
|
121 |
list: a list of dictionaries containing the input tokens, the summary, and the summary score
|
122 |
"""
|
utils.py
CHANGED
@@ -156,7 +156,7 @@ def extract_keywords(
|
|
156 |
for keyword in keywords:
|
157 |
if not any(fuzz.ratio(keyword, other) > 70 for other in final_keywords):
|
158 |
final_keywords.append(keyword)
|
159 |
-
logger.
|
160 |
return final_keywords
|
161 |
|
162 |
|
@@ -178,9 +178,9 @@ def saves_summary(
|
|
178 |
full_summary = "\n".join(sum_text)
|
179 |
|
180 |
keywords = "_".join(extract_keywords(full_summary))
|
181 |
-
logger.
|
182 |
outpath = (
|
183 |
-
Path.cwd() / f"document_summary_{get_timestamp()}
|
184 |
if outpath is None
|
185 |
else Path(outpath)
|
186 |
)
|
|
|
156 |
for keyword in keywords:
|
157 |
if not any(fuzz.ratio(keyword, other) > 70 for other in final_keywords):
|
158 |
final_keywords.append(keyword)
|
159 |
+
logger.debug(f"Keywords (final):\t{final_keywords}")
|
160 |
return final_keywords
|
161 |
|
162 |
|
|
|
178 |
full_summary = "\n".join(sum_text)
|
179 |
|
180 |
keywords = "_".join(extract_keywords(full_summary))
|
181 |
+
logger.debug(f"kw:\t{keywords}")
|
182 |
outpath = (
|
183 |
+
Path.cwd() / f"document_summary_{keywords}_{get_timestamp()}.txt"
|
184 |
if outpath is None
|
185 |
else Path(outpath)
|
186 |
)
|