Spaces:
Sleeping
Sleeping
Commit
·
df82c16
0
Parent(s):
Duplicate from Gladiator/Text-Summarizer
Browse filesCo-authored-by: Atharva Ingle <[email protected]>
- .github/workflows/push_code_to_hf.yml +19 -0
- .gitignore +145 -0
- README.md +15 -0
- app.py +130 -0
- examples/tfile.txt +55 -0
- extractive_summarizer/bert_parent.py +176 -0
- extractive_summarizer/cluster_features.py +165 -0
- extractive_summarizer/model_processors.py +401 -0
- extractive_summarizer/sentence_handler.py +73 -0
- requirements.txt +12 -0
- utils.py +137 -0
.github/workflows/push_code_to_hf.yml
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name: Sync to Hugging Face hub
|
2 |
+
on:
|
3 |
+
push:
|
4 |
+
branches: [main]
|
5 |
+
|
6 |
+
# to run this workflow manually from the Actions tab
|
7 |
+
workflow_dispatch:
|
8 |
+
|
9 |
+
jobs:
|
10 |
+
sync-to-hub:
|
11 |
+
runs-on: ubuntu-latest
|
12 |
+
steps:
|
13 |
+
- uses: actions/checkout@v2
|
14 |
+
with:
|
15 |
+
fetch-depth: 0
|
16 |
+
- name: Push to hub
|
17 |
+
env:
|
18 |
+
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
19 |
+
run: git push --force https://Gladiator:$HF_TOKEN@huggingface.co/spaces/Gladiator/Text-Summarizer main
|
.gitignore
ADDED
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Byte-compiled / optimized / DLL files
|
2 |
+
__pycache__/
|
3 |
+
*.py[cod]
|
4 |
+
*$py.class
|
5 |
+
|
6 |
+
# C extensions
|
7 |
+
*.so
|
8 |
+
|
9 |
+
# Distribution / packaging
|
10 |
+
.Python
|
11 |
+
build/
|
12 |
+
develop-eggs/
|
13 |
+
dist/
|
14 |
+
downloads/
|
15 |
+
eggs/
|
16 |
+
.eggs/
|
17 |
+
lib/
|
18 |
+
lib64/
|
19 |
+
parts/
|
20 |
+
sdist/
|
21 |
+
var/
|
22 |
+
wheels/
|
23 |
+
share/python-wheels/
|
24 |
+
*.egg-info/
|
25 |
+
.installed.cfg
|
26 |
+
*.egg
|
27 |
+
MANIFEST
|
28 |
+
|
29 |
+
# PyInstaller
|
30 |
+
# Usually these files are written by a python script from a template
|
31 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
32 |
+
*.manifest
|
33 |
+
*.spec
|
34 |
+
|
35 |
+
# Installer logs
|
36 |
+
pip-log.txt
|
37 |
+
pip-delete-this-directory.txt
|
38 |
+
|
39 |
+
# Unit test / coverage reports
|
40 |
+
htmlcov/
|
41 |
+
.tox/
|
42 |
+
.nox/
|
43 |
+
.coverage
|
44 |
+
.coverage.*
|
45 |
+
.cache
|
46 |
+
nosetests.xml
|
47 |
+
coverage.xml
|
48 |
+
*.cover
|
49 |
+
*.py,cover
|
50 |
+
.hypothesis/
|
51 |
+
.pytest_cache/
|
52 |
+
cover/
|
53 |
+
|
54 |
+
# Translations
|
55 |
+
*.mo
|
56 |
+
*.pot
|
57 |
+
|
58 |
+
# Django stuff:
|
59 |
+
*.log
|
60 |
+
local_settings.py
|
61 |
+
db.sqlite3
|
62 |
+
db.sqlite3-journal
|
63 |
+
|
64 |
+
# Flask stuff:
|
65 |
+
instance/
|
66 |
+
.webassets-cache
|
67 |
+
|
68 |
+
# Scrapy stuff:
|
69 |
+
.scrapy
|
70 |
+
|
71 |
+
# Sphinx documentation
|
72 |
+
docs/_build/
|
73 |
+
|
74 |
+
# PyBuilder
|
75 |
+
.pybuilder/
|
76 |
+
target/
|
77 |
+
|
78 |
+
# Jupyter Notebook
|
79 |
+
.ipynb_checkpoints
|
80 |
+
|
81 |
+
# IPython
|
82 |
+
profile_default/
|
83 |
+
ipython_config.py
|
84 |
+
|
85 |
+
# pyenv
|
86 |
+
# For a library or package, you might want to ignore these files since the code is
|
87 |
+
# intended to run in multiple environments; otherwise, check them in:
|
88 |
+
# .python-version
|
89 |
+
|
90 |
+
# pipenv
|
91 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
92 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
93 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
94 |
+
# install all needed dependencies.
|
95 |
+
#Pipfile.lock
|
96 |
+
|
97 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
|
98 |
+
__pypackages__/
|
99 |
+
|
100 |
+
# Celery stuff
|
101 |
+
celerybeat-schedule
|
102 |
+
celerybeat.pid
|
103 |
+
|
104 |
+
# SageMath parsed files
|
105 |
+
*.sage.py
|
106 |
+
|
107 |
+
# Environments
|
108 |
+
.env
|
109 |
+
.venv
|
110 |
+
env/
|
111 |
+
venv/
|
112 |
+
ENV/
|
113 |
+
env.bak/
|
114 |
+
venv.bak/
|
115 |
+
|
116 |
+
# Spyder project settings
|
117 |
+
.spyderproject
|
118 |
+
.spyproject
|
119 |
+
|
120 |
+
# Rope project settings
|
121 |
+
.ropeproject
|
122 |
+
|
123 |
+
# mkdocs documentation
|
124 |
+
/site
|
125 |
+
|
126 |
+
# mypy
|
127 |
+
.mypy_cache/
|
128 |
+
.dmypy.json
|
129 |
+
dmypy.json
|
130 |
+
|
131 |
+
# Pyre type checker
|
132 |
+
.pyre/
|
133 |
+
|
134 |
+
# pytype static type analyzer
|
135 |
+
.pytype/
|
136 |
+
|
137 |
+
# Cython debug symbols
|
138 |
+
cython_debug/
|
139 |
+
|
140 |
+
# local stuff
|
141 |
+
Docs/
|
142 |
+
.DS_Store
|
143 |
+
.vscode/
|
144 |
+
test.ipynb
|
145 |
+
test.py
|
README.md
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: Text Summarizer
|
3 |
+
emoji: 🌍
|
4 |
+
colorFrom: blue
|
5 |
+
colorTo: yellow
|
6 |
+
sdk: streamlit
|
7 |
+
app_file: app.py
|
8 |
+
pinned: false
|
9 |
+
duplicated_from: Gladiator/Text-Summarizer
|
10 |
+
---
|
11 |
+
|
12 |
+
# Text Summarizer
|
13 |
+
Text summarizer using Transformers
|
14 |
+
|
15 |
+
### This app is deployed on HuggingFace 🤗 Spaces [here](https://huggingface.co/spaces/Gladiator/Text-Summarizer)
|
app.py
ADDED
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import nltk
|
2 |
+
import validators
|
3 |
+
import streamlit as st
|
4 |
+
from transformers import AutoTokenizer, pipeline
|
5 |
+
|
6 |
+
# local modules
|
7 |
+
from extractive_summarizer.model_processors import Summarizer
|
8 |
+
from utils import (
|
9 |
+
clean_text,
|
10 |
+
fetch_article_text,
|
11 |
+
preprocess_text_for_abstractive_summarization,
|
12 |
+
read_text_from_file,
|
13 |
+
)
|
14 |
+
|
15 |
+
from rouge import Rouge
|
16 |
+
|
17 |
+
if __name__ == "__main__":
    # ---------------------------------
    # Main Application
    # ---------------------------------
    # Streamlit re-executes this whole script on every widget interaction,
    # so everything below runs top-to-bottom each time the user types,
    # uploads a file or presses the button.
    st.title("Text Summarizer 📝")

    st.markdown("Creator: [Atharva Ingle](https://github.com/Gladiator07)")
    st.markdown(
        "Source code: [GitHub Repository](https://github.com/Gladiator07/Text-Summarizer)"
    )
    summarize_type = st.sidebar.selectbox(
        "Summarization type", options=["Extractive", "Abstractive"]
    )

    st.markdown(
        "Enter a text or a url to get a concise summary of the article while conserving the overall meaning. This app supports text in the following formats:"
    )
    st.markdown(
        """- Raw text in text box
- URL of article/news to be summarized
- .txt, .pdf, .docx file formats"""
    )
    st.markdown(
        """This app supports two type of summarization:

1. **Extractive Summarization**: The extractive approach involves picking up the most important phrases and lines from the documents. It then combines all the important lines to create the summary. So, in this case, every line and word of the summary actually belongs to the original document which is summarized.
2. **Abstractive Summarization**: The abstractive approach involves rephrasing the complete document while capturing the complete meaning of the document. This type of summarization provides more human-like summary"""
    )
    st.markdown("---")

    # ---------------------------
    # SETUP & Constants
    nltk.download("punkt")
    abs_tokenizer_name = "facebook/bart-large-cnn"
    abs_model_name = "facebook/bart-large-cnn"
    abs_tokenizer = AutoTokenizer.from_pretrained(abs_tokenizer_name)
    abs_max_length = 90
    abs_min_length = 30
    # ---------------------------

    inp_text = st.text_input("Enter text or a url here")
    st.markdown(
        "<h3 style='text-align: center; color: green;'>OR</h3>",
        unsafe_allow_html=True,
    )
    uploaded_file = st.file_uploader(
        "Upload a .txt, .pdf, .docx file for summarization"
    )

    # `validators.url` returns True on success and a *falsy failure object*
    # (not the literal False) otherwise. Normalise to a real bool so every
    # later check behaves the same way.
    is_url = bool(validators.url(inp_text))

    # Input precedence: URL typed in the text box > uploaded file > raw text.
    if is_url:
        # complete text, chunks to summarize (list of sentences for long docs)
        # NOTE(review): `cleaned_txt` is treated below as a sequence of text
        # chunks in the URL case -- confirm against utils.fetch_article_text.
        text, cleaned_txt = fetch_article_text(url=inp_text)
    elif uploaded_file:
        cleaned_txt = read_text_from_file(uploaded_file)
        cleaned_txt = clean_text(cleaned_txt)
    else:
        cleaned_txt = clean_text(inp_text)

    # view summarized text (expander)
    with st.expander("View input text"):
        if is_url:
            st.write(cleaned_txt[0])
        else:
            st.write(cleaned_txt)
    summarize = st.button("Summarize")

    # called on toggle button [summarize]
    if summarize:
        if summarize_type == "Extractive":
            if is_url:
                # Re-assemble the chunks into one document for the
                # sentence-level extractive model.
                text_to_summarize = " ".join(cleaned_txt)
            else:
                text_to_summarize = cleaned_txt

            # extractive summarizer
            with st.spinner(
                text="Creating extractive summary. This might take a few seconds ..."
            ):
                ext_model = Summarizer()
                summarized_text = ext_model(text_to_summarize, num_sentences=5)

        elif summarize_type == "Abstractive":
            with st.spinner(
                text="Creating abstractive summary. This might take a few seconds ..."
            ):
                text_to_summarize = cleaned_txt
                abs_summarizer = pipeline(
                    "summarization", model=abs_model_name, tokenizer=abs_tokenizer_name
                )

                # Fix: this used to be `is_url is False`, which is never true
                # because `validators.url` does not return the literal False.
                # As a result raw/uploaded text was sent to the model in one
                # piece instead of being split into chunks first.
                if not is_url:
                    # list of chunks
                    text_to_summarize = preprocess_text_for_abstractive_summarization(
                        tokenizer=abs_tokenizer, text=cleaned_txt
                    )

                tmp_sum = abs_summarizer(
                    text_to_summarize,
                    max_length=abs_max_length,
                    min_length=abs_min_length,
                    do_sample=False,
                )

                # One summary per chunk; stitch them into a single string.
                summarized_text = " ".join([summ["summary_text"] for summ in tmp_sum])

        # final summarized output
        st.subheader("Summarized text")
        st.info(summarized_text)

        # ROUGE of the summary against the input it was generated from.
        st.subheader("ROUGE Scores")
        rouge_sc = Rouge()
        ground_truth = cleaned_txt[0] if is_url else cleaned_txt
        score = rouge_sc.get_scores(summarized_text, ground_truth, avg=True)
        st.code(score)
|
examples/tfile.txt
ADDED
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
New York: Gun violence has rocked the first three weeks of Eric Adams' tenure as mayor of New York, piling pressure on the ex-cop to deliver on his promise to improve public safety in America's largest city.
|
2 |
+
A shooting Friday in the neighborhood of Harlem left one police officer dead and another in critical condition.
|
3 |
+
|
4 |
+
It was the latest flashpoint in the Democratic mayor's nascent rule, in which he has yet to present a comprehensive plan to rein in the crime he has decried.
|
5 |
+
|
6 |
+
"It is our city against the killers," said Adams, a retired police captain, on Friday night at Harlem Hospital, where the officers -- who had been responding to a domestic disturbance -- were taken following the incident.
|
7 |
+
|
8 |
+
The recent shootings also include a shocking incident in which an 11-month-old girl was hit in the cheek by a stray bullet in the Bronx as she was in a parked car with her mother.
|
9 |
+
|
10 |
+
They are seen as part of a broader trend of gun violence fueled by the accessibility of firearms, against the backdrop of the social and economic toll of the Covid-19 pandemic.
|
11 |
+
|
12 |
+
And they're testing the new mayor's tough-on-crime campaign message, while setting up a potential showdown with the left flank of his party over police funding and crime reduction strategies.
|
13 |
+
|
14 |
+
"This is a sea of crime that's been fed by many rivers. We have to dam each one of those rivers," Adams told CNN's "State of the Union" talk show Sunday.
|
15 |
+
|
16 |
+
"These crimes did not start during my administration," he added. "They have been here for far too long in many parts of our community."
|
17 |
+
|
18 |
+
Earlier, Adams urged federal action on gun control while calling on New Yorkers to work with the police to stem violence.
|
19 |
+
|
20 |
+
"No matter how painful this moment is, don't give up on these people in this city," he said Friday.
|
21 |
+
|
22 |
+
Budget negotiations
|
23 |
+
|
24 |
+
Adams, 61, has clashed with his leftist critics, many of whom are vocal online and have pushed to "defund" the New York Police Department, the nation's largest.
|
25 |
+
|
26 |
+
Now that call may be coming to a head as Adams, whose position on policing has long rankled New Yorkers on the left, prepares to negotiate a new city budget.
|
27 |
+
|
28 |
+
He said recently he would consider exempting the police force, with a budget exceeding $5 billion, from citywide cost-cutting measures.
|
29 |
+
|
30 |
+
It was not clear whether those details would be part of the "real plan" for the city Adams said Sunday he would roll out this week.
|
31 |
+
|
32 |
+
Politicians who use "defund the police" as a rallying cry appear unlikely to give any leeway to Adams, who has already aggravated progressives over issues including remote learning.
|
33 |
+
|
34 |
+
Kristin Richardson Jordan, a leftist city council member, won her Harlem district on a "defund" platform, which advocates replacing policing with alternative public safety systems.
|
35 |
+
|
36 |
+
She expressed sadness over the killing of the police officer Friday, but added: "To be clear, the death of police officers is not what abolition is. Abolition is an end to violence altogether."
|
37 |
+
|
38 |
+
Blueprint for safety
|
39 |
+
|
40 |
+
Last year, police recorded 488 homicides in the city of nine million people, up 4.3 percent from 2020 -- though Jeffrey Butts, director of the research and evaluation center at John Jay College of Criminal Justice, points out that 25 years ago New York experienced four times the number of homicides it sees today.
|
41 |
+
|
42 |
+
While saying he disagrees with the notion of "defunding the police," Butts also told AFP "more police funding is not an appropriate response."
|
43 |
+
|
44 |
+
"How are those resources used? To what end? What's the strategy?" he said. "The foundation of our approach has to be economic well-being, health and the well-being of communities, which is a much broader public policy conversation."
|
45 |
+
|
46 |
+
Adriano Espaillat, a congressman whose district includes Harlem and parts of the Bronx, said Saturday "the federal government must play a pivotal role" in stemming the violence, citing a need for legislation mandating stronger background checks and accountability of gun manufacturers.
|
47 |
+
|
48 |
+
Ken Sherrill, a professor emeritus of political science at Hunter College, expressed surprise that Adams has not yet unveiled his pitch to tackle crime -- but said this is the moment to "mold public opinion."
|
49 |
+
|
50 |
+
"This hands the mayor an immense opportunity, and if he doesn't seize it I'm sure he will regret it," Sherrill told AFP.
|
51 |
+
|
52 |
+
Adams offered scant details about his upcoming public safety blueprint, but he said Sunday it would include the reinstitution of a "plainclothes anti-gun unit" and a bolstered police presence in the city's sprawling subway system.
|
53 |
+
|
54 |
+
|
55 |
+
But a top priority will be firearms: "We have to stop the flow of guns," Adams said.
|
extractive_summarizer/bert_parent.py
ADDED
@@ -0,0 +1,176 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import List, Union
|
2 |
+
|
3 |
+
import torch
|
4 |
+
import streamlit as st
|
5 |
+
import numpy as np
|
6 |
+
from numpy import ndarray
|
7 |
+
from transformers import (AlbertModel, AlbertTokenizer, BertModel,
|
8 |
+
BertTokenizer, DistilBertModel, DistilBertTokenizer,
|
9 |
+
PreTrainedModel, PreTrainedTokenizer, XLMModel,
|
10 |
+
XLMTokenizer, XLNetModel, XLNetTokenizer)
|
11 |
+
|
12 |
+
@st.cache()
def load_hf_model(base_model, model_name, device):
    """
    Load a pretrained model and move it to the requested device.

    Wrapped in ``st.cache`` so the load is not repeated on every script run.

    :param base_model: Model class exposing ``from_pretrained``.
    :param model_name: Name or path handed to ``from_pretrained``.
    :param device: Target device passed to ``.to``.
    :return: The loaded model, configured to also output its hidden states.
    """
    pretrained = base_model.from_pretrained(model_name, output_hidden_states=True)
    return pretrained.to(device)
|
16 |
+
|
17 |
+
class BertParent(object):
    """
    Base handler for BERT models.

    Wraps a transformer model and its tokenizer and turns sentences into
    fixed-size embedding vectors read from the model's hidden states.
    """

    # Supported model keywords mapped to their (model class, tokenizer class).
    MODELS = {
        'bert-base-uncased': (BertModel, BertTokenizer),
        'bert-large-uncased': (BertModel, BertTokenizer),
        'xlnet-base-cased': (XLNetModel, XLNetTokenizer),
        'xlm-mlm-enfr-1024': (XLMModel, XLMTokenizer),
        'distilbert-base-uncased': (DistilBertModel, DistilBertTokenizer),
        'albert-base-v1': (AlbertModel, AlbertTokenizer),
        'albert-large-v1': (AlbertModel, AlbertTokenizer)
    }

    def __init__(
        self,
        model: str,
        custom_model: PreTrainedModel = None,
        custom_tokenizer: PreTrainedTokenizer = None,
        gpu_id: int = 0,
    ):
        """
        :param model: Model is the string path for the bert weights. If given a keyword, the s3 path will be used.
        :param custom_model: This is optional if a custom bert model is used.
        :param custom_tokenizer: Place to use custom tokenizer.
        :param gpu_id: Index of the CUDA device to use; only consulted when CUDA is available.
        :raises ValueError: If `model` is not a known keyword and the matching
            custom model / tokenizer was not supplied.
        """
        base_model, base_tokenizer = self.MODELS.get(model, (None, None))

        self.device = torch.device("cpu")
        if torch.cuda.is_available():
            assert (
                isinstance(gpu_id, int) and (0 <= gpu_id and gpu_id < torch.cuda.device_count())
            ), f"`gpu_id` must be an integer between 0 to {torch.cuda.device_count() - 1}. But got: {gpu_id}"

            self.device = torch.device(f"cuda:{gpu_id}")

        if custom_model:
            self.model = custom_model.to(self.device)
        else:
            if base_model is None:
                # Previously surfaced as an opaque AttributeError on NoneType.
                raise ValueError(
                    f"Unknown model '{model}': pass `custom_model` or use one of {sorted(self.MODELS)}"
                )
            self.model = load_hf_model(base_model, model, self.device)

        if custom_tokenizer:
            self.tokenizer = custom_tokenizer
        else:
            if base_tokenizer is None:
                raise ValueError(
                    f"Unknown model '{model}': pass `custom_tokenizer` or use one of {sorted(self.MODELS)}"
                )
            self.tokenizer = base_tokenizer.from_pretrained(model)

        # Inference only: put the model in evaluation mode.
        self.model.eval()

    def tokenize_input(self, text: str) -> torch.Tensor:
        """
        Tokenizes the text input.

        :param text: Text to tokenize.
        :return: Returns a torch tensor of token ids with a leading batch
            dimension of 1, placed on `self.device`.
        """
        tokenized_text = self.tokenizer.tokenize(text)
        indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text)
        return torch.tensor([indexed_tokens]).to(self.device)

    def _pooled_handler(self, hidden: torch.Tensor,
                        reduce_option: str) -> torch.Tensor:
        """
        Reduces a hidden-state tensor over dim 1.

        :param hidden: The hidden torch tensor to process.
        :param reduce_option: 'max' or 'median'; any other value falls back to the mean.
        :return: Returns the reduced, squeezed torch tensor.
        """
        if reduce_option == 'max':
            return hidden.max(dim=1)[0].squeeze()

        elif reduce_option == 'median':
            return hidden.median(dim=1)[0].squeeze()

        return hidden.mean(dim=1).squeeze()

    def extract_embeddings(
        self,
        text: str,
        hidden: Union[List[int], int] = -2,
        reduce_option: str = 'mean',
        hidden_concat: bool = False,
    ) -> torch.Tensor:
        """
        Extracts the embeddings for the given text.

        :param text: The text to extract embeddings for.
        :param hidden: The hidden layer(s) to use for a readout handler.
        :param reduce_option: How we should reduce the items.
        :param hidden_concat: Whether or not to concat multiple hidden layers.
        :return: A torch vector.
        """
        tokens_tensor = self.tokenize_input(text)

        # Embeddings are only read, never trained on, so skip autograd
        # bookkeeping for the forward pass.
        with torch.no_grad():
            # The last two model outputs are unpacked; only the final one
            # (indexed below per layer) is used.
            _, hidden_states = self.model(tokens_tensor)[-2:]

        # deprecated temporary keyword functions.
        if reduce_option == 'concat_last_4':
            last_4 = [hidden_states[i] for i in (-1, -2, -3, -4)]
            cat_hidden_states = torch.cat(tuple(last_4), dim=-1)
            return torch.mean(cat_hidden_states, dim=1).squeeze()

        elif reduce_option == 'reduce_last_4':
            last_4 = [hidden_states[i] for i in (-1, -2, -3, -4)]
            return torch.cat(tuple(last_4), dim=1).mean(dim=1).squeeze()

        elif isinstance(hidden, int):
            # Single layer: reduce it directly.
            hidden_s = hidden_states[hidden]
            return self._pooled_handler(hidden_s, reduce_option)

        elif hidden_concat:
            # Several layers joined along the last dim, then averaged over dim 1.
            last_states = [hidden_states[i] for i in hidden]
            cat_hidden_states = torch.cat(tuple(last_states), dim=-1)
            return torch.mean(cat_hidden_states, dim=1).squeeze()

        # Several layers stacked along dim 1, then reduced as requested.
        last_states = [hidden_states[i] for i in hidden]
        hidden_s = torch.cat(tuple(last_states), dim=1)

        return self._pooled_handler(hidden_s, reduce_option)

    def create_matrix(
        self,
        content: List[str],
        hidden: Union[List[int], int] = -2,
        reduce_option: str = 'mean',
        hidden_concat: bool = False,
    ) -> ndarray:
        """
        Create matrix from the embeddings.

        :param content: The list of sentences.
        :param hidden: Which hidden layer to use.
        :param reduce_option: The reduce option to run.
        :param hidden_concat: Whether or not to concat multiple hidden layers.
        :return: A numpy array matrix of the given content, one row per sentence.
        """
        return np.asarray([
            np.squeeze(self.extract_embeddings(
                t, hidden=hidden, reduce_option=reduce_option, hidden_concat=hidden_concat
            ).data.cpu().numpy()) for t in content
        ])

    def __call__(
        self,
        content: List[str],
        hidden: int = -2,
        reduce_option: str = 'mean',
        hidden_concat: bool = False,
    ) -> ndarray:
        """
        Create matrix from the embeddings (shorthand for `create_matrix`).

        :param content: The list of sentences.
        :param hidden: Which hidden layer to use.
        :param reduce_option: The reduce option to run.
        :param hidden_concat: Whether or not to concat multiple hidden layers.
        :return: A numpy array matrix of the given content.
        """
        return self.create_matrix(content, hidden, reduce_option, hidden_concat)
|
extractive_summarizer/cluster_features.py
ADDED
@@ -0,0 +1,165 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Dict, List
|
2 |
+
|
3 |
+
import numpy as np
|
4 |
+
from numpy import ndarray
|
5 |
+
from sklearn.cluster import KMeans
|
6 |
+
from sklearn.decomposition import PCA
|
7 |
+
from sklearn.mixture import GaussianMixture
|
8 |
+
|
9 |
+
|
10 |
+
class ClusterFeatures(object):
    """
    Basic handling of clustering features.

    Clusters sentence embeddings and picks, for every cluster centre, the
    closest sentence as a summary candidate.
    """

    def __init__(
        self,
        features: ndarray,
        algorithm: str = 'kmeans',
        pca_k: int = None,
        random_state: int = 12345,
    ):
        """
        :param features: the embedding matrix created by bert parent.
        :param algorithm: Which clustering algorithm to use ('gmm', anything else means k-means).
        :param pca_k: If you want the features to be ran through pca, this is the components number.
        :param random_state: Random state.
        """
        if pca_k:
            self.features = PCA(n_components=pca_k).fit_transform(features)
        else:
            self.features = features

        self.algorithm = algorithm
        self.pca_k = pca_k
        self.random_state = random_state

    def __get_model(self, k: int):
        """
        Retrieve clustering model.

        :param k: amount of clusters.
        :return: Clustering model (unfitted).
        """
        if self.algorithm == 'gmm':
            return GaussianMixture(n_components=k, random_state=self.random_state)
        return KMeans(n_clusters=k, random_state=self.random_state)

    def __get_centroids(self, model):
        """
        Retrieve centroids of model.

        :param model: Fitted clustering model.
        :return: Centroids.
        """
        if self.algorithm == 'gmm':
            return model.means_
        return model.cluster_centers_

    def __find_closest_args(self, centroids: np.ndarray) -> Dict:
        """
        Find the closest arguments to centroid.

        Each feature index is handed out at most once, so two centroids never
        map to the same sentence.

        :param centroids: Centroids to find closest.
        :return: Mapping of centroid index to the index of its closest unused
            feature (-1 when no unused feature is left).
        """
        args = {}
        used_idx = set()

        for j, centroid in enumerate(centroids):
            centroid_min = 1e10
            cur_arg = -1

            for i, feature in enumerate(self.features):
                value = np.linalg.norm(feature - centroid)

                if value < centroid_min and i not in used_idx:
                    cur_arg = i
                    centroid_min = value

            used_idx.add(cur_arg)
            args[j] = cur_arg

        return args

    def calculate_elbow(self, k_max: int) -> List[float]:
        """
        Calculates elbow up to the provided k_max.

        NOTE(review): reads `inertia_` from the fitted model, which looks
        k-means specific -- confirm before using this with algorithm='gmm'.

        :param k_max: K_max to calculate elbow for.
        :return: The inertias for k = 1 .. min(k_max, number of features) - 1.
        """
        inertias = []

        for k in range(1, min(k_max, len(self.features))):
            model = self.__get_model(k).fit(self.features)

            inertias.append(model.inertia_)

        return inertias

    def calculate_optimal_cluster(self, k_max: int):
        """
        Calculates the optimal cluster based on Elbow.

        :param k_max: The max k to search elbow for.
        :return: The optimal cluster size (1 when no elbow is found).
        """
        delta_1 = []
        delta_2 = []

        max_strength = 0
        k = 1

        inertias = self.calculate_elbow(k_max)

        # First and second order differences of the inertia curve.
        for i in range(len(inertias)):
            delta_1.append(inertias[i] - inertias[i - 1] if i > 0 else 0.0)
            delta_2.append(delta_1[i] - delta_1[i - 1] if i > 1 else 0.0)

        for j in range(len(inertias)):
            # The ends of the curve cannot be an elbow; their strength is 0.
            strength = 0 if j <= 1 or j == len(inertias) - 1 else delta_2[j + 1] - delta_1[j + 1]

            if strength > max_strength:
                max_strength = strength
                k = j + 1

        return k

    def cluster(self, ratio: float = 0.1, num_sentences: int = None) -> List[int]:
        """
        Clusters sentences based on the ratio.

        :param ratio: Ratio to use for clustering.
        :param num_sentences: Number of sentences. Overrides ratio.
        :return: Sentences index that qualify for summary, in ascending order.
        """
        if num_sentences is not None:
            if num_sentences == 0:
                return []

            # Never ask for more clusters than there are sentences.
            k = min(num_sentences, len(self.features))
        else:
            # Always produce at least one cluster.
            k = max(int(len(self.features) * ratio), 1)

        model = self.__get_model(k).fit(self.features)

        centroids = self.__get_centroids(model)
        cluster_args = self.__find_closest_args(centroids)

        sorted_values = sorted(cluster_args.values())
        return sorted_values

    def __call__(self, ratio: float = 0.1, num_sentences: int = None) -> List[int]:
        """
        Clusters sentences based on the ratio (shorthand for `cluster`).

        :param ratio: Ratio to use for clustering.
        :param num_sentences: Number of sentences. Overrides ratio.
        :return: Sentences index that qualify for summary.
        """
        # Fix: `num_sentences` used to be dropped here, so calling the
        # instance always fell back to the ratio.
        return self.cluster(ratio, num_sentences)
|
extractive_summarizer/model_processors.py
ADDED
@@ -0,0 +1,401 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import List, Optional, Tuple, Union
|
2 |
+
|
3 |
+
import numpy as np
|
4 |
+
from transformers import (AlbertModel, AlbertTokenizer, BartModel,
|
5 |
+
BartTokenizer, BertModel, BertTokenizer,
|
6 |
+
CamembertModel, CamembertTokenizer, CTRLModel,
|
7 |
+
CTRLTokenizer, DistilBertModel, DistilBertTokenizer,
|
8 |
+
GPT2Model, GPT2Tokenizer, LongformerModel,
|
9 |
+
LongformerTokenizer, OpenAIGPTModel,
|
10 |
+
OpenAIGPTTokenizer, PreTrainedModel,
|
11 |
+
PreTrainedTokenizer, RobertaModel, RobertaTokenizer,
|
12 |
+
TransfoXLModel, TransfoXLTokenizer, XLMModel,
|
13 |
+
XLMTokenizer, XLNetModel, XLNetTokenizer)
|
14 |
+
|
15 |
+
from extractive_summarizer.bert_parent import BertParent
|
16 |
+
from extractive_summarizer.cluster_features import ClusterFeatures
|
17 |
+
from extractive_summarizer.sentence_handler import SentenceHandler
|
18 |
+
|
19 |
+
|
20 |
+
class ModelProcessor(object):
    """
    Shared pipeline behind the extractive summarizers: split the body into
    sentences, embed them with ``BertParent``, cluster the embeddings with
    ``ClusterFeatures`` and return the selected sentences (or their embeddings).
    """

    # Reducers accepted by ``run_embeddings(aggregate=...)``; applied on axis 0.
    aggregate_map = {
        'mean': np.mean,
        'min': np.min,
        'median': np.median,
        'max': np.max,
    }

    def __init__(
        self,
        model: str = 'bert-large-uncased',
        custom_model: PreTrainedModel = None,
        custom_tokenizer: PreTrainedTokenizer = None,
        hidden: Union[List[int], int] = -2,
        reduce_option: str = 'mean',
        sentence_handler: Optional[SentenceHandler] = None,
        random_state: int = 12345,
        hidden_concat: bool = False,
        gpu_id: int = 0,
    ):
        """
        This is the parent Bert Summarizer model. New methods should implement this class.

        :param model: This parameter is associated with the inherit string parameters from the transformers library.
        :param custom_model: If you have a pre-trained model, you can add the model class here.
        :param custom_tokenizer: If you have a custom tokenizer, you can add the tokenizer here.
        :param hidden: This signifies which layer(s) of the BERT model you would like to use as embeddings.
        :param reduce_option: Given the output of the bert model, this param determines how you want to reduce results.
        :param sentence_handler: The handler to process sentences. Defaults to a new ``SentenceHandler()``
            per instance. If want to use coreference, instantiate and pass a CoreferenceHandler instance.
        :param random_state: The random state to reproduce summarizations.
        :param hidden_concat: Whether or not to concat multiple hidden layers.
        :param gpu_id: GPU device index if CUDA is available.
        """
        # Build the default handler here rather than in the signature: a default
        # expression is evaluated once at definition time and would be shared by
        # every instance (and would construct the spaCy pipeline on import).
        if sentence_handler is None:
            sentence_handler = SentenceHandler()

        np.random.seed(random_state)
        self.model = BertParent(model, custom_model, custom_tokenizer, gpu_id)
        self.hidden = hidden
        self.reduce_option = reduce_option
        self.sentence_handler = sentence_handler
        self.random_state = random_state
        self.hidden_concat = hidden_concat

    def cluster_runner(
        self,
        content: List[str],
        ratio: float = 0.2,
        algorithm: str = 'kmeans',
        use_first: bool = True,
        num_sentences: int = None
    ) -> Tuple[List[str], np.ndarray]:
        """
        Runs the cluster algorithm based on the hidden state. Returns both the embeddings and sentences.

        :param content: Content list of sentences.
        :param ratio: The ratio to use for clustering.
        :param algorithm: Type of algorithm to use for clustering.
        :param use_first: Return the first sentence in the output (helpful for news stories, etc).
        :param num_sentences: Number of sentences to use for summarization.
        :return: A tuple of summarized sentences and embeddings
        """
        # NOTE: a former ``num_sentences = num_sentences if use_first else
        # num_sentences`` statement was a no-op and has been removed;
        # num_sentences is passed to the clusterer unchanged.
        hidden = self.model(
            content, self.hidden, self.reduce_option, hidden_concat=self.hidden_concat)
        hidden_args = ClusterFeatures(
            hidden, algorithm, random_state=self.random_state).cluster(ratio, num_sentences)

        if use_first:
            # Guarantee that index 0 (the first sentence) leads the selection.
            if not hidden_args:
                hidden_args.append(0)
            elif hidden_args[0] != 0:
                hidden_args.insert(0, 0)

        sentences = [content[j] for j in hidden_args]
        embeddings = np.asarray([hidden[j] for j in hidden_args])

        return sentences, embeddings

    def __run_clusters(
        self,
        content: List[str],
        ratio: float = 0.2,
        algorithm: str = 'kmeans',
        use_first: bool = True,
        num_sentences: int = None
    ) -> List[str]:
        """
        Runs clusters and returns sentences.

        :param content: The content of sentences.
        :param ratio: Ratio to use for clustering.
        :param algorithm: Algorithm selection for clustering.
        :param use_first: Whether to use first sentence
        :param num_sentences: Number of sentences. Overrides ratio.
        :return: summarized sentences
        """
        sentences, _ = self.cluster_runner(
            content, ratio, algorithm, use_first, num_sentences)
        return sentences

    def __retrieve_summarized_embeddings(
        self,
        content: List[str],
        ratio: float = 0.2,
        algorithm: str = 'kmeans',
        use_first: bool = True,
        num_sentences: int = None
    ) -> np.ndarray:
        """
        Retrieves embeddings of the summarized sentences.

        :param content: The content of sentences.
        :param ratio: Ratio to use for clustering.
        :param algorithm: Algorithm selection for clustering.
        :param use_first: Whether to use first sentence
        :param num_sentences: Number of sentences. Overrides ratio.
        :return: Summarized embeddings
        """
        _, embeddings = self.cluster_runner(
            content, ratio, algorithm, use_first, num_sentences)
        return embeddings

    def calculate_elbow(
        self,
        body: str,
        algorithm: str = 'kmeans',
        min_length: int = 40,
        max_length: int = 600,
        k_max: int = None,
    ) -> List[float]:
        """
        Calculates elbow across the clusters.

        :param body: The input body to summarize.
        :param algorithm: The algorithm to use for clustering.
        :param min_length: The min length to use.
        :param max_length: The max length to use.
        :param k_max: The maximum number of clusters to search. Defaults to
            one less than the number of retained sentences.
        :return: List of elbow inertia values.
        """
        sentences = self.sentence_handler(body, min_length, max_length)

        if k_max is None:
            k_max = len(sentences) - 1

        hidden = self.model(sentences, self.hidden,
                            self.reduce_option, hidden_concat=self.hidden_concat)
        elbow = ClusterFeatures(
            hidden, algorithm, random_state=self.random_state).calculate_elbow(k_max)

        return elbow

    def calculate_optimal_k(
        self,
        body: str,
        algorithm: str = 'kmeans',
        min_length: int = 40,
        max_length: int = 600,
        k_max: int = None,
    ):
        """
        Calculates the optimal Elbow K.

        :param body: The input body to summarize.
        :param algorithm: The algorithm to use for clustering.
        :param min_length: The min length to use.
        :param max_length: The max length to use.
        :param k_max: The maximum number of clusters to search. Defaults to
            one less than the number of retained sentences.
        :return: Whatever ``ClusterFeatures.calculate_optimal_cluster`` returns for ``k_max``.
        """
        sentences = self.sentence_handler(body, min_length, max_length)

        if k_max is None:
            k_max = len(sentences) - 1

        hidden = self.model(sentences, self.hidden,
                            self.reduce_option, hidden_concat=self.hidden_concat)
        optimal_k = ClusterFeatures(
            hidden, algorithm, random_state=self.random_state).calculate_optimal_cluster(k_max)

        return optimal_k

    def run_embeddings(
        self,
        body: str,
        ratio: float = 0.2,
        min_length: int = 40,
        max_length: int = 600,
        use_first: bool = True,
        algorithm: str = 'kmeans',
        num_sentences: int = None,
        aggregate: str = None,
    ) -> Optional[np.ndarray]:
        """
        Preprocesses the sentences, runs the clusters to find the centroids, then combines the embeddings.

        :param body: The raw string body to process
        :param ratio: Ratio of sentences to use
        :param min_length: Minimum length of sentence candidates to utilize for the summary.
        :param max_length: Maximum length of sentence candidates to utilize for the summary
        :param use_first: Whether or not to use the first sentence
        :param algorithm: Which clustering algorithm to use. (kmeans, gmm)
        :param num_sentences: Number of sentences to use. Overrides ratio.
        :param aggregate: One of mean, median, max, min. Applied on zero axis
        :return: A summary embedding, or None when no sentence survives the length filter.
        """
        sentences = self.sentence_handler(body, min_length, max_length)

        if sentences:
            embeddings = self.__retrieve_summarized_embeddings(
                sentences, ratio, algorithm, use_first, num_sentences)

            if aggregate is not None:
                assert aggregate in [
                    'mean', 'median', 'max', 'min'], "aggregate must be mean, min, max, or median"
                embeddings = self.aggregate_map[aggregate](embeddings, axis=0)

            return embeddings

        return None

    def run(
        self,
        body: str,
        ratio: float = 0.2,
        min_length: int = 40,
        max_length: int = 600,
        use_first: bool = True,
        algorithm: str = 'kmeans',
        num_sentences: int = None,
        return_as_list: bool = False
    ) -> Union[List, str]:
        """
        Preprocesses the sentences, runs the clusters to find the centroids, then combines the sentences.

        :param body: The raw string body to process
        :param ratio: Ratio of sentences to use
        :param min_length: Minimum length of sentence candidates to utilize for the summary.
        :param max_length: Maximum length of sentence candidates to utilize for the summary
        :param use_first: Whether or not to use the first sentence
        :param algorithm: Which clustering algorithm to use. (kmeans, gmm)
        :param num_sentences: Number of sentences to use (overrides ratio).
        :param return_as_list: Whether or not to return sentences as list.
        :return: The selected sentences as a list, or joined by single spaces
            into one string. Empty list / empty string when nothing qualifies.
        """
        sentences = self.sentence_handler(body, min_length, max_length)

        if sentences:
            sentences = self.__run_clusters(
                sentences, ratio, algorithm, use_first, num_sentences)

        if return_as_list:
            return sentences
        else:
            return ' '.join(sentences)

    def __call__(
        self,
        body: str,
        ratio: float = 0.2,
        min_length: int = 40,
        max_length: int = 600,
        use_first: bool = True,
        algorithm: str = 'kmeans',
        num_sentences: int = None,
        return_as_list: bool = False,
    ) -> Union[List, str]:
        """
        (utility that wraps around the run function)
        Preprocesses the sentences, runs the clusters to find the centroids, then combines the sentences.

        :param body: The raw string body to process.
        :param ratio: Ratio of sentences to use.
        :param min_length: Minimum length of sentence candidates to utilize for the summary.
        :param max_length: Maximum length of sentence candidates to utilize for the summary.
        :param use_first: Whether or not to use the first sentence.
        :param algorithm: Which clustering algorithm to use. (kmeans, gmm)
        :param num_sentences: Number of sentences to use (overrides ratio).
        :param return_as_list: Whether or not to return sentences as list.
        :return: A summary string, or a list of sentences when ``return_as_list`` is set.
        """
        return self.run(
            body, ratio, min_length, max_length, algorithm=algorithm, use_first=use_first, num_sentences=num_sentences,
            return_as_list=return_as_list
        )
|
307 |
+
|
308 |
+
|
309 |
+
class Summarizer(ModelProcessor):
    """Default summarizer: a ``ModelProcessor`` configured by model name or custom model/tokenizer."""

    def __init__(
        self,
        model: str = 'bert-large-uncased',
        custom_model: PreTrainedModel = None,
        custom_tokenizer: PreTrainedTokenizer = None,
        hidden: Union[List[int], int] = -2,
        reduce_option: str = 'mean',
        sentence_handler: Optional[SentenceHandler] = None,
        random_state: int = 12345,
        hidden_concat: bool = False,
        gpu_id: int = 0,
    ):
        """
        This is the main Bert Summarizer class.

        :param model: This parameter is associated with the inherit string parameters from the transformers library.
        :param custom_model: If you have a pre-trained model, you can add the model class here.
        :param custom_tokenizer: If you have a custom tokenizer, you can add the tokenizer here.
        :param hidden: This signifies which layer of the BERT model you would like to use as embeddings.
        :param reduce_option: Given the output of the bert model, this param determines how you want to reduce results.
        :param sentence_handler: The handler used to split the body into sentences.
            Defaults to a new ``SentenceHandler()`` per instance.
        :param random_state: The random state to reproduce summarizations.
        :param hidden_concat: Whether or not to concat multiple hidden layers.
        :param gpu_id: GPU device index if CUDA is available.
        """
        # Resolve the default here so the parent always receives a real handler
        # and no instance is shared through a definition-time default.
        if sentence_handler is None:
            sentence_handler = SentenceHandler()

        super(Summarizer, self).__init__(
            model, custom_model, custom_tokenizer, hidden, reduce_option, sentence_handler, random_state, hidden_concat, gpu_id
        )
|
341 |
+
|
342 |
+
|
343 |
+
class TransformerSummarizer(ModelProcessor):
    """
    Another type of Summarizer class to choose keyword based model and tokenizer
    """

    # transformer_type keyword -> (model class, tokenizer class).
    # All classes are imported unconditionally at module top, so the table is
    # complete at class-definition time; it is no longer patched per instance.
    MODEL_DICT = {
        'Bert': (BertModel, BertTokenizer),
        'OpenAIGPT': (OpenAIGPTModel, OpenAIGPTTokenizer),
        'GPT2': (GPT2Model, GPT2Tokenizer),
        'CTRL': (CTRLModel, CTRLTokenizer),
        'TransfoXL': (TransfoXLModel, TransfoXLTokenizer),
        'XLNet': (XLNetModel, XLNetTokenizer),
        'XLM': (XLMModel, XLMTokenizer),
        'DistilBert': (DistilBertModel, DistilBertTokenizer),
        'Roberta': (RobertaModel, RobertaTokenizer),
        'Albert': (AlbertModel, AlbertTokenizer),
        'Camembert': (CamembertModel, CamembertTokenizer),
        'Bart': (BartModel, BartTokenizer),
        'Longformer': (LongformerModel, LongformerTokenizer),
    }

    def __init__(
        self,
        transformer_type: str = 'Bert',
        transformer_model_key: str = 'bert-base-uncased',
        transformer_tokenizer_key: str = None,
        hidden: Union[List[int], int] = -2,
        reduce_option: str = 'mean',
        sentence_handler: Optional[SentenceHandler] = None,
        random_state: int = 12345,
        hidden_concat: bool = False,
        gpu_id: int = 0,
    ):
        """
        :param transformer_type: The Transformer type, such as Bert, GPT2, DistilBert, etc.
            Must be a key of ``MODEL_DICT``.
        :param transformer_model_key: The transformer model key. This is the directory for the model.
        :param transformer_tokenizer_key: The transformer tokenizer key. This is the tokenizer directory.
            Falls back to ``transformer_model_key`` when omitted.
        :param hidden: The hidden output layers to use for the summarization.
        :param reduce_option: The reduce option, such as mean, max, min, median, etc.
        :param sentence_handler: The sentence handler class to process the raw text.
            Defaults to a new ``SentenceHandler()`` per instance.
        :param random_state: The random state to use.
        :param hidden_concat: Deprecated hidden concat option.
        :param gpu_id: GPU device index if CUDA is available.
        :raises KeyError: If ``transformer_type`` is not a key of ``MODEL_DICT``.
        """
        try:
            model_clz, tokenizer_clz = self.MODEL_DICT[transformer_type]
        except KeyError:
            # Same exception type as before, but with an actionable message.
            raise KeyError(
                "Unknown transformer_type {!r}; expected one of: {}".format(
                    transformer_type, ', '.join(sorted(self.MODEL_DICT)))
            ) from None

        model = model_clz.from_pretrained(
            transformer_model_key, output_hidden_states=True)

        tokenizer = tokenizer_clz.from_pretrained(
            transformer_tokenizer_key if transformer_tokenizer_key is not None else transformer_model_key
        )

        # Resolve the default here so no handler instance is shared through a
        # definition-time default argument.
        if sentence_handler is None:
            sentence_handler = SentenceHandler()

        # The model name is None because the loaded model/tokenizer objects are
        # passed as the custom model and tokenizer.
        super().__init__(
            None, model, tokenizer, hidden, reduce_option, sentence_handler, random_state, hidden_concat, gpu_id
        )
|
extractive_summarizer/sentence_handler.py
ADDED
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import List
|
2 |
+
|
3 |
+
from spacy.lang.en import English
|
4 |
+
|
5 |
+
|
6 |
+
class SentenceHandler(object):
    """Splits raw text into sentences with a spaCy sentencizer and filters them by length."""

    def __init__(self, language=English):
        """
        Base Sentence Handler with Spacy support.

        :param language: Determines the language to use with spacy.
        """
        self.nlp = language()

        # Try the spaCy 2 registration style first; if it raises, fall back to
        # the spaCy 3 string-name style and remember which one worked.
        try:
            sentencizer = self.nlp.create_pipe('sentencizer')
            self.nlp.add_pipe(sentencizer)
            self.is_spacy_3 = False
        except Exception:
            self.nlp.add_pipe("sentencizer")
            self.is_spacy_3 = True

    def sentence_processor(self, doc,
                           min_length: int = 40,
                           max_length: int = 600) -> List[str]:
        """
        Turns a spacy document into a list of stripped sentence strings.

        :param doc: The document to use from spacy.
        :param min_length: Sentences must be strictly longer than this (after stripping).
        :param max_length: Sentences must be strictly shorter than this (after stripping).
        :return: Sentences.
        """
        kept = []

        for span in doc.sents:
            size = len(span.text.strip())
            if not (min_length < size < max_length):
                continue

            # spaCy 2 spans are read through ``.string``, spaCy 3 through ``.text``.
            raw = span.text if self.is_spacy_3 else span.string
            kept.append(raw.strip())

        return kept

    def process(self, body: str,
                min_length: int = 40,
                max_length: int = 600) -> List[str]:
        """
        Processes the content sentences.

        :param body: The raw string body to process
        :param min_length: Minimum length that the sentences must be
        :param max_length: Max length that the sentences must fall under
        :return: Returns a list of sentences.
        """
        return self.sentence_processor(self.nlp(body), min_length, max_length)

    def __call__(self, body: str,
                 min_length: int = 40,
                 max_length: int = 600) -> List[str]:
        """
        Callable shortcut for ``process``.

        :param body: The raw string body to process
        :param min_length: Minimum length that the sentences must be
        :param max_length: Max length that the sentences must fall under
        :return: Returns a list of sentences.
        """
        return self.process(body, min_length, max_length)
|
requirements.txt
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
numpy
|
2 |
+
torch
|
3 |
+
spacy
|
4 |
+
scikit-learn
|
5 |
+
transformers
|
6 |
+
streamlit
|
7 |
+
sentencepiece
|
8 |
+
beautifulsoup4
|
9 |
+
nltk
|
10 |
+
PyPDF2
|
11 |
+
docx2txt
|
12 |
+
rouge
|
utils.py
ADDED
@@ -0,0 +1,137 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import requests
|
3 |
+
import docx2txt
|
4 |
+
from io import StringIO
|
5 |
+
from PyPDF2 import PdfFileReader
|
6 |
+
|
7 |
+
from bs4 import BeautifulSoup
|
8 |
+
from nltk.tokenize import sent_tokenize
|
9 |
+
|
10 |
+
# One or more consecutive characters from the listed code-point ranges.
emoji_pattern = re.compile(
    "".join(
        (
            "[",
            "\U0001F600-\U0001F64F",  # emoticons
            "\U0001F300-\U0001F5FF",  # symbols & pictographs
            "\U0001F680-\U0001F6FF",  # transport & map symbols
            "\U0001F1E0-\U0001F1FF",  # flags (iOS)
            "\U00002702-\U000027B0",
            "\U000024C2-\U0001F251",
            "]+",
        )
    ),
    flags=re.UNICODE,
)


def clean_text(x):
    """Normalise raw text before summarization.

    Steps, in order: drop every non-ASCII character; replace URLs,
    @mentions and #hashtags with a space; collapse whitespace runs; strip
    emoji; finally replace every run of characters other than letters,
    digits and ``.,!?`` with a single space.

    :param x: The raw text.
    :return: The cleaned text.
    """
    cleaned = x.encode("ascii", "ignore").decode()

    # url, mentions, hashtags, repeated whitespace -- each becomes one space
    for pattern in (r"https*\S+", r"@\S+", r"#\S+", r"\s{2,}"):
        cleaned = re.sub(pattern, " ", cleaned)

    cleaned = emoji_pattern.sub(r"", cleaned)

    # everything except alphanumerics and .,!? (spaces included) -> one space
    return re.sub("[^.,!?A-Za-z0-9]+", " ", cleaned)
|
37 |
+
|
38 |
+
|
39 |
+
def fetch_article_text(url: str, timeout: float = 30.0, max_chunk_words: int = 500):
    """Download a web page and split its headline/paragraph text into chunks.

    The text of every ``<h1>`` and ``<p>`` element is joined with spaces, an
    ``<eos>`` marker is inserted after each ``.``, ``!`` and ``?``, and the
    resulting sentences are packed greedily into chunks.

    :param url: Address of the page to download.
    :param timeout: Seconds to wait for the server before ``requests`` gives up
        (previously the request could block indefinitely).
    :param max_chunk_words: Maximum number of space-separated words per chunk.
        A single sentence longer than this still gets a chunk of its own.
    :return: Tuple ``(article, chunks)``: the joined article text, which still
        contains the inserted ``<eos>`` markers, and the list of chunk strings.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")
    article = " ".join(node.text for node in soup.find_all(["h1", "p"]))

    # Mark sentence boundaries so the text can be split on them below.
    for mark in ".!?":
        article = article.replace(mark, mark + "<eos>")

    chunks = []
    for sentence in article.split("<eos>"):
        words = sentence.split(" ")
        if chunks and len(chunks[-1]) + len(words) <= max_chunk_words:
            chunks[-1].extend(words)
        else:
            # First sentence, or the current chunk is full: start a new one.
            chunks.append(words)

    return article, [" ".join(words) for words in chunks]
|
67 |
+
|
68 |
+
|
69 |
+
def preprocess_text_for_abstractive_summarization(tokenizer, text):
    """Pack the sentences of ``text`` into chunks that fit the tokenizer limit.

    Sentences are added to the current chunk while the running token count
    stays within ``tokenizer.max_len_single_sentence``; the sentence that would
    overflow starts the next chunk. A single sentence longer than the limit
    becomes a chunk of its own.

    :param tokenizer: Object providing ``tokenize(str)`` and the attribute
        ``max_len_single_sentence``.
    :param text: The raw text to split.
    :return: List of chunk strings; empty when ``text`` has no sentences.
    """
    token_limit = tokenizer.max_len_single_sentence

    chunks = []
    current = []  # sentences collected for the chunk being built
    length = 0    # token count of ``current``

    for sentence in sent_tokenize(text):
        n_tokens = len(tokenizer.tokenize(sentence))

        # Flush only a non-empty chunk, so an over-long first sentence does not
        # emit an empty string.
        if current and length + n_tokens > token_limit:
            chunks.append(" ".join(current).strip())
            current = []
            length = 0

        current.append(sentence)
        length += n_tokens

    # Always flush the tail: previously a final sentence that overflowed the
    # preceding chunk was moved to a new chunk that was never saved.
    if current:
        chunks.append(" ".join(current).strip())

    return chunks
|
103 |
+
|
104 |
+
|
105 |
+
def read_pdf(file):
    """Extract the text of every page of a PDF and concatenate it.

    :param file: Whatever the PyPDF2 reader accepts as its source
        (presumably a path or binary file-like object -- confirm against callers).
    :return: The text of all pages joined without a separator.
    """
    # PyPDF2 3.x removed PdfFileReader/numPages/getPage/extractText in favour of
    # PdfReader/pages/extract_text. Prefer the current API and fall back to the
    # legacy one only when the installed release predates PdfReader.
    try:
        from PyPDF2 import PdfReader
    except ImportError:
        PdfReader = None

    if PdfReader is not None:
        reader = PdfReader(file)
        # ``or ""`` guards against a page for which no text is returned.
        return "".join(page.extract_text() or "" for page in reader.pages)

    legacy_reader = PdfFileReader(file)
    return "".join(
        legacy_reader.getPage(index).extractText()
        for index in range(legacy_reader.numPages)
    )
|
114 |
+
|
115 |
+
|
116 |
+
def read_text_from_file(file):
    """Return the text content of an uploaded file, dispatching on its MIME type.

    Supported types: ``text/plain`` (decoded as UTF-8), ``application/pdf``
    (via ``read_pdf``) and the ``.docx`` MIME type (via ``docx2txt``).

    :param file: Uploaded-file object exposing ``type`` and, for plain text,
        ``getvalue()`` returning bytes (presumably a Streamlit upload -- confirm
        against the caller).
    :return: The extracted text.
    :raises ValueError: If ``file.type`` is not one of the supported types
        (this previously surfaced as an ``UnboundLocalError``).
    """
    mime_type = file.type

    if mime_type == "text/plain":
        return file.getvalue().decode("utf-8")

    if mime_type == "application/pdf":
        return read_pdf(file)

    if mime_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        return docx2txt.process(file)

    raise ValueError("Unsupported file type: {!r}".format(mime_type))
|