girijareddy committed
Commit f0c69ca · 1 Parent(s): b20680e

Upload 4 files

Files changed (4):
  1. .gitignore +152 -0
  2. app.py +77 -0
  3. requirements.txt +5 -0
  4. utils.py +138 -0
.gitignore ADDED
@@ -0,0 +1,152 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
app.py ADDED
@@ -0,0 +1,77 @@
+ import spacy
+ from spacy.lang.en.stop_words import STOP_WORDS
+ from string import punctuation
+ import streamlit as st
+ from heapq import nlargest
+ import subprocess
+ subprocess.run("pip3 install PyPDF2".split())
+ subprocess.run("python3 -m spacy download en_core_web_sm".split())
+ import PyPDF2
+ from utils import (
+     clean_text,
+     fetch_article_text,
+     preprocess_text_for_abstractive_summarization,
+     read_text_from_file,
+ )
+
+ # --------------------- Prerequisites ------------------------ #
+ stopwords = STOP_WORDS
+ punctuation = punctuation + '\n'
+
+
+ if __name__ == "__main__":
+     st.title("Text Summarizer 📝")
+     st.subheader("Creator: Shreyas Dixit")
+
+     n = st.sidebar.slider('Summarization %', 10, 90, step=10)
+     n = n / 100
+     input_type = st.selectbox('Pick one', ['PDF', 'Text'])
+     text = ""
+     if input_type == "PDF":
+         # Uploaded file (pdf, txt, or docx) is read via utils.read_text_from_file
+         uploaded_file = st.file_uploader("Choose a file", type=['pdf', 'txt', 'docx'])
+         if uploaded_file is not None:
+             text = read_text_from_file(uploaded_file)
+     elif input_type == "Text":
+         # Free-form text input
+         text = st.text_area("Input text!")
+
+     if st.button('Summarize'):
+         nlp = spacy.load('en_core_web_sm')
+         doc = nlp(text)
+         # Word tokenization
+         tokens = [token.text for token in doc]
+         # Count how often each non-stopword, non-punctuation token appears
+         word_frequency = {}
+         for word in doc:
+             if word.text.lower() not in stopwords and word.text.lower() not in punctuation:
+                 word_frequency[word.text.lower()] = word_frequency.get(word.text.lower(), 0) + 1
+         # Normalize the counts by the most frequent word
+         max_frequency = max(word_frequency.values())
+         for word in word_frequency.keys():
+             word_frequency[word] = word_frequency[word] / max_frequency
+         # Sentence tokenization and scoring: a sentence scores the sum of its word frequencies
+         sentence_token = [sent for sent in doc.sents]
+         sentence_score = {}
+         for sent in sentence_token:
+             for word in sent:
+                 if word.text.lower() in word_frequency:
+                     sentence_score[sent] = sentence_score.get(sent, 0) + word_frequency[word.text.lower()]
+         # Create the summary from the top-scoring sentences
+         select_length = int(len(sentence_token) * n)
+         summary = nlargest(select_length, sentence_score, key=sentence_score.get)
+         summary = [sent.text for sent in summary]
+         summary = ' '.join(summary)
+         st.markdown(summary)
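
The block above is plain frequency-based extractive summarization with spaCy. As a rough sanity check outside the Streamlit UI, the same scoring logic can be exercised as a standalone function; this is a minimal sketch, the summarize() helper is hypothetical (not part of this upload), and it assumes en_core_web_sm is already downloaded.

    from heapq import nlargest
    from string import punctuation

    import spacy
    from spacy.lang.en.stop_words import STOP_WORDS


    def summarize(text: str, ratio: float = 0.3) -> str:
        # Hypothetical helper mirroring app.py's scoring; not part of the commit.
        nlp = spacy.load("en_core_web_sm")
        doc = nlp(text)
        # Score words by normalized frequency, ignoring stopwords and punctuation
        freq = {}
        for token in doc:
            word = token.text.lower()
            if word not in STOP_WORDS and word not in punctuation:
                freq[word] = freq.get(word, 0) + 1
        if not freq:
            return ""
        top = max(freq.values())
        freq = {word: count / top for word, count in freq.items()}
        # Score sentences by the summed frequency of their words
        scores = {}
        for sent in doc.sents:
            for token in sent:
                if token.text.lower() in freq:
                    scores[sent] = scores.get(sent, 0) + freq[token.text.lower()]
        # Keep the highest-scoring sentences up to the requested ratio
        n_sentences = max(1, int(len(list(doc.sents)) * ratio))
        best = nlargest(n_sentences, scores, key=scores.get)
        return " ".join(sent.text for sent in best)


    print(summarize("Some long input text goes here. " * 20, ratio=0.2))
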
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ heapq-max==0.21
+ spacy==2.2.4
+ spacy-loggers==1.0.2
+ streamlit==1.9.0
+ PyPDF2
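
Note that utils.py below also imports requests, beautifulsoup4 (bs4), nltk, and docx2txt, which are not listed here. If the app is meant to run from this file alone, a plausible extension (versions left unpinned as an assumption) would be:

    requests
    beautifulsoup4
    nltk
    docx2txt
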
utils.py ADDED
@@ -0,0 +1,138 @@
+ import re
+ import requests
+ import docx2txt
+ from io import StringIO
+ import PyPDF2
+ from PyPDF2 import PdfFileReader
+
+ from bs4 import BeautifulSoup
+ from nltk.tokenize import sent_tokenize
+
+ emoji_pattern = re.compile(
+     "["
+     u"\U0001F600-\U0001F64F"  # emoticons
+     u"\U0001F300-\U0001F5FF"  # symbols & pictographs
+     u"\U0001F680-\U0001F6FF"  # transport & map symbols
+     u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
+     u"\U00002702-\U000027B0"
+     u"\U000024C2-\U0001F251"
+     "]+",
+     flags=re.UNICODE,
+ )
+
+
+ def clean_text(x):
+     # x = x.lower()  # lowercase
+     x = x.encode("ascii", "ignore").decode()  # drop non-ASCII characters
+     x = re.sub(r"https*\S+", " ", x)  # URLs
+     x = re.sub(r"@\S+", " ", x)  # mentions
+     x = re.sub(r"#\S+", " ", x)  # hashtags
+     # x = x.replace("'", "")  # remove ticks
+     # x = re.sub("[%s]" % re.escape(string.punctuation), " ", x)  # punctuation
+     # x = re.sub(r"\w*\d+\w*", "", x)  # numbers
+     x = re.sub(r"\s{2,}", " ", x)  # extra whitespace
+     x = emoji_pattern.sub(r"", x)  # emojis
+     x = re.sub("[^.,!?A-Za-z0-9]+", " ", x)  # special characters except .,!?
+
+     return x
+
+
+ def fetch_article_text(url: str):
+     # Fetch the page and keep only headings and paragraphs
+     r = requests.get(url)
+     soup = BeautifulSoup(r.text, "html.parser")
+     results = soup.find_all(["h1", "p"])
+     text = [result.text for result in results]
+     ARTICLE = " ".join(text)
+     # Mark sentence boundaries, then split into sentences
+     ARTICLE = ARTICLE.replace(".", ".<eos>")
+     ARTICLE = ARTICLE.replace("!", "!<eos>")
+     ARTICLE = ARTICLE.replace("?", "?<eos>")
+     sentences = ARTICLE.split("<eos>")
+     # Group sentences into chunks of at most 500 words
+     current_chunk = 0
+     chunks = []
+     for sentence in sentences:
+         if len(chunks) == current_chunk + 1:
+             if len(chunks[current_chunk]) + len(sentence.split(" ")) <= 500:
+                 chunks[current_chunk].extend(sentence.split(" "))
+             else:
+                 current_chunk += 1
+                 chunks.append(sentence.split(" "))
+         else:
+             chunks.append(sentence.split(" "))
+
+     for chunk_id in range(len(chunks)):
+         chunks[chunk_id] = " ".join(chunks[chunk_id])
+
+     return ARTICLE, chunks
+
+
+ def preprocess_text_for_abstractive_summarization(tokenizer, text):
+     sentences = sent_tokenize(text)
+
+     # initialize
+     length = 0
+     chunk = ""
+     chunks = []
+     count = -1
+     for sentence in sentences:
+         count += 1
+         combined_length = (
+             len(tokenizer.tokenize(sentence)) + length
+         )  # add the no. of sentence tokens to the length counter
+
+         if combined_length <= tokenizer.max_len_single_sentence:  # if it doesn't exceed
+             chunk += sentence + " "  # add the sentence to the chunk
+             length = combined_length  # update the length counter
+
+             # if it is the last sentence
+             if count == len(sentences) - 1:
+                 chunks.append(chunk.strip())  # save the chunk
+
+         else:
+             chunks.append(chunk.strip())  # save the chunk
+
+             # reset
+             length = 0
+             chunk = ""
+
+             # take care of the overflow sentence
+             chunk += sentence + " "
+             length = len(tokenizer.tokenize(sentence))
+
+     return chunks
+
+
+ def read_pdf(file):
+     pdfReader = PdfFileReader(file)
+     count = pdfReader.numPages
+     all_page_text = ""
+     for i in range(count):
+         page = pdfReader.getPage(i)
+         all_page_text += page.extractText()
+
+     return all_page_text
+
+
+ def read_text_from_file(file):
+     # read text file
+     if file.type == "text/plain":
+         # To convert to a string based IO:
+         stringio = StringIO(file.getvalue().decode("utf-8"))
+
+         # To read file as string:
+         file_content = stringio.read()
+
+     # read pdf file
+     elif file.type == "application/pdf":
+         file_content = read_pdf(file)
+
+     # read docx file
+     elif (
+         file.type
+         == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+     ):
+         file_content = docx2txt.process(file)
+
+     return file_content
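
preprocess_text_for_abstractive_summarization only assumes a tokenizer object that exposes tokenize() and a max_len_single_sentence attribute. A minimal way to exercise it is sketched below, assuming a Hugging Face transformers tokenizer; transformers is not in requirements.txt, and the model name and URL are placeholders chosen for illustration only.

    import nltk
    from transformers import AutoTokenizer

    from utils import fetch_article_text, preprocess_text_for_abstractive_summarization

    nltk.download("punkt")  # sent_tokenize needs the punkt sentence model

    # Any tokenizer with tokenize() and max_len_single_sentence would do here
    tokenizer = AutoTokenizer.from_pretrained("t5-small")

    article, _ = fetch_article_text("https://en.wikipedia.org/wiki/Text_summarization")
    chunks = preprocess_text_for_abstractive_summarization(tokenizer, article)
    for chunk in chunks:
        # Each chunk should stay at or near the tokenizer's single-sequence limit
        print(len(tokenizer.tokenize(chunk)))
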