Spaces:
Configuration error
Configuration error
Commit
·
3ae0207
1
Parent(s):
35d8fdb
Upload 5 files
Browse files- .gitignore +152 -0
- README.md +24 -13
- app.py +77 -0
- requirements.txt +5 -0
- utils.py +138 -0
.gitignore
ADDED
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Byte-compiled / optimized / DLL files
|
2 |
+
__pycache__/
|
3 |
+
*.py[cod]
|
4 |
+
*$py.class
|
5 |
+
|
6 |
+
# C extensions
|
7 |
+
*.so
|
8 |
+
|
9 |
+
# Distribution / packaging
|
10 |
+
.Python
|
11 |
+
build/
|
12 |
+
develop-eggs/
|
13 |
+
dist/
|
14 |
+
downloads/
|
15 |
+
eggs/
|
16 |
+
.eggs/
|
17 |
+
lib/
|
18 |
+
lib64/
|
19 |
+
parts/
|
20 |
+
sdist/
|
21 |
+
var/
|
22 |
+
wheels/
|
23 |
+
share/python-wheels/
|
24 |
+
*.egg-info/
|
25 |
+
.installed.cfg
|
26 |
+
*.egg
|
27 |
+
MANIFEST
|
28 |
+
|
29 |
+
# PyInstaller
|
30 |
+
# Usually these files are written by a python script from a template
|
31 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
32 |
+
*.manifest
|
33 |
+
*.spec
|
34 |
+
|
35 |
+
# Installer logs
|
36 |
+
pip-log.txt
|
37 |
+
pip-delete-this-directory.txt
|
38 |
+
|
39 |
+
# Unit test / coverage reports
|
40 |
+
htmlcov/
|
41 |
+
.tox/
|
42 |
+
.nox/
|
43 |
+
.coverage
|
44 |
+
.coverage.*
|
45 |
+
.cache
|
46 |
+
nosetests.xml
|
47 |
+
coverage.xml
|
48 |
+
*.cover
|
49 |
+
*.py,cover
|
50 |
+
.hypothesis/
|
51 |
+
.pytest_cache/
|
52 |
+
cover/
|
53 |
+
|
54 |
+
# Translations
|
55 |
+
*.mo
|
56 |
+
*.pot
|
57 |
+
|
58 |
+
# Django stuff:
|
59 |
+
*.log
|
60 |
+
local_settings.py
|
61 |
+
db.sqlite3
|
62 |
+
db.sqlite3-journal
|
63 |
+
|
64 |
+
# Flask stuff:
|
65 |
+
instance/
|
66 |
+
.webassets-cache
|
67 |
+
|
68 |
+
# Scrapy stuff:
|
69 |
+
.scrapy
|
70 |
+
|
71 |
+
# Sphinx documentation
|
72 |
+
docs/_build/
|
73 |
+
|
74 |
+
# PyBuilder
|
75 |
+
.pybuilder/
|
76 |
+
target/
|
77 |
+
|
78 |
+
# Jupyter Notebook
|
79 |
+
.ipynb_checkpoints
|
80 |
+
|
81 |
+
# IPython
|
82 |
+
profile_default/
|
83 |
+
ipython_config.py
|
84 |
+
|
85 |
+
# pyenv
|
86 |
+
# For a library or package, you might want to ignore these files since the code is
|
87 |
+
# intended to run in multiple environments; otherwise, check them in:
|
88 |
+
# .python-version
|
89 |
+
|
90 |
+
# pipenv
|
91 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
92 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
93 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
94 |
+
# install all needed dependencies.
|
95 |
+
#Pipfile.lock
|
96 |
+
|
97 |
+
# poetry
|
98 |
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
99 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
100 |
+
# commonly ignored for libraries.
|
101 |
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
102 |
+
#poetry.lock
|
103 |
+
|
104 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
|
105 |
+
__pypackages__/
|
106 |
+
|
107 |
+
# Celery stuff
|
108 |
+
celerybeat-schedule
|
109 |
+
celerybeat.pid
|
110 |
+
|
111 |
+
# SageMath parsed files
|
112 |
+
*.sage.py
|
113 |
+
|
114 |
+
# Environments
|
115 |
+
.env
|
116 |
+
.venv
|
117 |
+
env/
|
118 |
+
venv/
|
119 |
+
ENV/
|
120 |
+
env.bak/
|
121 |
+
venv.bak/
|
122 |
+
|
123 |
+
# Spyder project settings
|
124 |
+
.spyderproject
|
125 |
+
.spyproject
|
126 |
+
|
127 |
+
# Rope project settings
|
128 |
+
.ropeproject
|
129 |
+
|
130 |
+
# mkdocs documentation
|
131 |
+
/site
|
132 |
+
|
133 |
+
# mypy
|
134 |
+
.mypy_cache/
|
135 |
+
.dmypy.json
|
136 |
+
dmypy.json
|
137 |
+
|
138 |
+
# Pyre type checker
|
139 |
+
.pyre/
|
140 |
+
|
141 |
+
# pytype static type analyzer
|
142 |
+
.pytype/
|
143 |
+
|
144 |
+
# Cython debug symbols
|
145 |
+
cython_debug/
|
146 |
+
|
147 |
+
# PyCharm
|
148 |
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
149 |
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
150 |
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
151 |
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
152 |
+
#.idea/
|
README.md
CHANGED
@@ -1,13 +1,24 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
# Text Summarizer
|
3 |
+
|
4 |
+
This Project provides you with a brief summary of the given Text.
|
5 |
+
The project allows you to paste text or upload a PDF file to summarize it. It also allows you to customize the summarization % of the final summary!
|
6 |
+
|
7 |
+
## Concept
|
8 |
+
The Project is based on the concept of Extractive Text Summarization.
|
9 |
+
In this concept we read all the sentences given in the text and then separate each word in them.
|
10 |
+
Then we grade each unique word with a value that correlates to the number of times it has occurred in the text.
|
11 |
+
|
12 |
+
Then we normalize these grades and reassign them to all the words in the sentences. Then we calculate the grades for the whole sentences and rank the sentences on the grades they get.
|
13 |
+
Then we summarize the top "n%" of the sentences as output.
|
14 |
+
<img src='https://imgs.search.brave.com/m5dNQYnKvcBHxQBBwzS_lUNmEKAEMcA3WvZZ9EYvQSM/rs:fit:1200:1016:1/g:ce/aHR0cHM6Ly9taXJv/Lm1lZGl1bS5jb20v/bWF4LzI5MjAvMSo1/X3Q0RUpsMUl5OUIx/dzVFdFgxWm9nLmpw/ZWc'>
|
15 |
+
## Deployment
|
16 |
+
|
17 |
+
The Project is Deployed on Hugging face
|
18 |
+
|
19 |
+
```bash
|
20 |
+
https://huggingface.co/spaces/SRDdev/Summarize
|
21 |
+
```
|
22 |
+
|
23 |
+
|
24 |
+
|
app.py
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from spacy import tokens  # NOTE(review): unused; kept to avoid changing file-level imports
import streamlit as st
from heapq import nlargest
import subprocess
# NOTE(review): installing dependencies at runtime is fragile; prefer listing
# PyPDF2 and the spaCy model in requirements.txt / packaging config.
subprocess.run("pip3 install PyPDF2".split())
subprocess.run("python3 -m spacy download en_core_web_sm".split())
import PyPDF2
from utils import (
    clean_text,
    fetch_article_text,
    preprocess_text_for_abstractive_summarization,
    read_text_from_file,
)
#---------------------Pre-Requiste------------------------#
stopwords = STOP_WORDS
punctuation = punctuation + '\n'


if __name__ == "__main__":
    st.title("Text Summarizer 📝")
    st.subheader("Creator: Shreyas Dixit")

    # Fraction of sentences to keep in the final summary (10%..90%).
    n = st.sidebar.slider('Summarization %', 10, 90, step=10)
    n = n / 100
    input_type = st.selectbox('Pick one', ['PDF', 'Text'])
    text = ""
    if input_type == "PDF":
        # Upload file
        uploaded_file = st.file_uploader("Choose a file", type=['pdf', 'txt', 'docx'])
        # Bug fix: read_text_from_file crashed with AttributeError when no
        # file had been uploaded yet; only read once a file is present.
        if uploaded_file is not None:
            text = read_text_from_file(uploaded_file)
    elif input_type == "Text":
        # Text
        text = st.text_area("Input text !")

    if st.button('Summarize') and text:
        nlp = spacy.load('en_core_web_sm')
        doc = nlp(text)
        # Word-frequency table over non-stopword, non-punctuation tokens.
        # Bug fix: keys are lowercased on insertion so the lowercased
        # lookups during sentence scoring actually match; the original
        # stored mixed-case keys and dropped every capitalized word's score.
        word_frequency = {}
        for word in doc:
            key = word.text.lower()
            if key not in stopwords and key not in punctuation:
                word_frequency[key] = word_frequency.get(key, 0) + 1

        if word_frequency:
            # Normalize the frequencies to [0, 1].
            max_count = max(word_frequency.values())
            for word in word_frequency:
                word_frequency[word] = word_frequency[word] / max_count
            # Sentence tokenization and scoring: a sentence's score is the
            # sum of the normalized frequencies of its words.
            sentence_token = [sent for sent in doc.sents]
            sentence_score = {}
            for sent in sentence_token:
                for word in sent:
                    score = word_frequency.get(word.text.lower())
                    if score is not None:
                        sentence_score[sent] = sentence_score.get(sent, 0) + score
            # Creating a summary from the top n% highest-scoring sentences.
            select_length = int(len(sentence_token) * n)
            summary = nlargest(select_length, sentence_score, key=sentence_score.get)
            summary = ' '.join(sent.text for sent in summary)
            st.markdown(summary)
|
requirements.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
heapq-max==0.21
|
2 |
+
spacy==2.2.4
|
3 |
+
spacy-loggers==1.0.2
|
4 |
+
streamlit==1.9.0
|
5 |
+
PyPDF2
|
utils.py
ADDED
@@ -0,0 +1,138 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import requests
|
3 |
+
import docx2txt
|
4 |
+
from io import StringIO
|
5 |
+
import PyPDF2
|
6 |
+
from PyPDF2 import PdfFileReader
|
7 |
+
|
8 |
+
from bs4 import BeautifulSoup
|
9 |
+
from nltk.tokenize import sent_tokenize
|
10 |
+
|
11 |
+
# Matches common emoji / pictograph codepoint ranges so they can be stripped.
emoji_pattern = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U00002702-\U000027B0"
    u"\U000024C2-\U0001F251"
    "]+",
    flags=re.UNICODE,
)


def clean_text(x):
    """Normalize raw text for summarization.

    Drops non-ASCII characters, URLs, @mentions, #hashtags and emojis,
    collapses runs of whitespace, and replaces every character outside
    ``. , ! ?`` / letters / digits with a single space.
    """
    cleaned = x.encode("ascii", "ignore").decode()  # strip non-ASCII
    # URLs, then mentions, then hashtags — each becomes a single space.
    for pattern in (r"https*\S+", r"@\S+", r"#\S+"):
        cleaned = re.sub(pattern, " ", cleaned)
    cleaned = re.sub(r"\s{2,}", " ", cleaned)  # collapse repeated whitespace
    cleaned = emoji_pattern.sub(r"", cleaned)  # remove emojis
    # Keep only letters, digits and .,!? — everything else becomes a space.
    cleaned = re.sub("[^.,!?A-Za-z0-9]+", " ", cleaned)
    return cleaned
|
38 |
+
|
39 |
+
|
40 |
+
def fetch_article_text(url: str):
    """Download an article and split it into word-bounded chunks.

    Fetches *url*, concatenates the text of every ``<h1>`` and ``<p>``
    element, tags sentence ends (``.``, ``!``, ``?``) with ``<eos>``
    markers, then groups sentences into chunks of at most ~500 words.

    Returns:
        tuple: ``(ARTICLE, chunks)`` — the joined text (still containing
        the ``<eos>`` markers) and the list of chunk strings.
    """
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "html.parser")
    results = soup.find_all(["h1", "p"])
    text = [result.text for result in results]
    ARTICLE = " ".join(text)
    # Tag sentence boundaries so splitting keeps the punctuation attached.
    ARTICLE = ARTICLE.replace(".", ".<eos>")
    ARTICLE = ARTICLE.replace("!", "!<eos>")
    ARTICLE = ARTICLE.replace("?", "?<eos>")
    sentences = ARTICLE.split("<eos>")
    current_chunk = 0
    chunks = []  # each chunk is built as a list of words, joined at the end
    for sentence in sentences:
        if len(chunks) == current_chunk + 1:
            # Current chunk exists: append if it stays within ~500 words.
            if len(chunks[current_chunk]) + len(sentence.split(" ")) <= 500:
                chunks[current_chunk].extend(sentence.split(" "))
            else:
                current_chunk += 1
                chunks.append(sentence.split(" "))
        else:
            # First sentence starts the initial chunk.
            # (Bug fix: removed a stray debug print of current_chunk.)
            chunks.append(sentence.split(" "))

    for chunk_id in range(len(chunks)):
        chunks[chunk_id] = " ".join(chunks[chunk_id])

    return ARTICLE, chunks
|
68 |
+
|
69 |
+
|
70 |
+
def preprocess_text_for_abstractive_summarization(tokenizer, text):
    """Split *text* into chunks that fit the tokenizer's sentence limit.

    Sentences (via ``sent_tokenize``) are accumulated in order into a
    chunk until adding the next sentence would exceed
    ``tokenizer.max_len_single_sentence``; the chunk is then flushed and
    the overflowing sentence starts a new chunk.

    Args:
        tokenizer: object exposing ``tokenize(str) -> list`` and an
            integer ``max_len_single_sentence`` attribute.
        text: the input document.

    Returns:
        list[str]: whitespace-stripped chunks covering every sentence.
    """
    sentences = sent_tokenize(text)

    # initialize
    length = 0  # token count of the chunk being built
    chunk = ""
    chunks = []
    count = -1
    for sentence in sentences:
        count += 1
        combined_length = (
            len(tokenizer.tokenize(sentence)) + length
        )  # add the no. of sentence tokens to the length counter

        if combined_length <= tokenizer.max_len_single_sentence:  # fits
            chunk += sentence + " "  # add the sentence to the chunk
            length = combined_length  # update the length counter

            # if it is the last sentence, save the chunk
            if count == len(sentences) - 1:
                chunks.append(chunk.strip())
        else:
            # Flush the full chunk (bug fix: skip when it is still empty,
            # which previously appended an empty string).
            if chunk:
                chunks.append(chunk.strip())

            # reset, carrying the overflow sentence into a new chunk
            chunk = sentence + " "
            length = len(tokenizer.tokenize(sentence))

            # Bug fix: the original dropped the overflowing sentence when
            # it happened to be the last one; flush it here as well.
            if count == len(sentences) - 1:
                chunks.append(chunk.strip())

    return chunks
|
104 |
+
|
105 |
+
|
106 |
+
def read_pdf(file):
    """Extract and concatenate the text of every page of a PDF file."""
    reader = PdfFileReader(file)
    num_pages = reader.numPages
    pages_text = []
    for page_index in range(num_pages):
        pages_text.append(reader.getPage(page_index).extractText())

    return "".join(pages_text)
|
115 |
+
|
116 |
+
|
117 |
+
def read_text_from_file(file):
    """Return the text content of an uploaded file.

    Dispatches on the upload's MIME ``type`` attribute and supports
    plain text, PDF and .docx uploads.

    Raises:
        ValueError: for any other MIME type (bug fix: the original fell
            through and hit an UnboundLocalError on the return).
    """
    # read text file
    if file.type == "text/plain":
        # To convert to a string based IO:
        stringio = StringIO(file.getvalue().decode("utf-8"))

        # To read file as string:
        file_content = stringio.read()

    # read pdf file
    elif file.type == "application/pdf":
        file_content = read_pdf(file)

    # read docx file
    elif (
        file.type
        == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    ):
        file_content = docx2txt.process(file)

    else:
        raise ValueError(f"Unsupported file type: {file.type}")

    return file_content
|