girijareddy committed on
Commit 3ae0207 · 1 Parent(s): 35d8fdb

Upload 5 files

Files changed (5)
  1. .gitignore +152 -0
  2. README.md +24 -13
  3. app.py +77 -0
  4. requirements.txt +5 -0
  5. utils.py +138 -0
.gitignore ADDED
@@ -0,0 +1,152 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
README.md CHANGED
@@ -1,13 +1,24 @@
- ---
- title: Documentsummary
- emoji: 📈
- colorFrom: purple
- colorTo: gray
- sdk: streamlit
- sdk_version: 1.17.0
- app_file: app.py
- pinned: false
- license: unlicense
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+ # Text Summarizer
+
+ This project gives you a brief summary of a given text.
+ You can paste text or upload a PDF file to summarize, and you can also customize the summarization % of the final summary!
+
+ ## Concept
+ The project is based on extractive text summarization.
+ We read all the sentences in the given text and separate each word in them.
+ We then grade each unique word with a value that correlates with the number of times it occurs in the text.
+
+ Next, we normalize these grades and reassign them to all the words in the sentences. We then compute grades for whole sentences and rank the sentences by the grades they receive.
+ Finally, the top "n%" of the sentences are returned as the summary.
+ <img src='https://imgs.search.brave.com/m5dNQYnKvcBHxQBBwzS_lUNmEKAEMcA3WvZZ9EYvQSM/rs:fit:1200:1016:1/g:ce/aHR0cHM6Ly9taXJv/Lm1lZGl1bS5jb20v/bWF4LzI5MjAvMSo1/X3Q0RUpsMUl5OUIx/dzVFdFgxWm9nLmpw/ZWc'>
+ ## Deployment
+
+ The project is deployed on Hugging Face:
+
+ ```bash
+ https://huggingface.co/spaces/SRDdev/Summarize
+ ```
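The frequency-grading pipeline described in the Concept section maps to just a few lines of spaCy. Below is a minimal standalone sketch of the same idea, separate from the Streamlit app in app.py; it assumes the `en_core_web_sm` model is installed, and the `summarize` helper name is illustrative rather than part of the uploaded files:

```python
from heapq import nlargest
from string import punctuation

import spacy
from spacy.lang.en.stop_words import STOP_WORDS


def summarize(text: str, ratio: float = 0.3) -> str:
    """Return the top `ratio` fraction of sentences, ranked by word-frequency score."""
    doc = spacy.load("en_core_web_sm")(text)

    # Grade each unique word by its occurrence count, skipping stop words and punctuation.
    freq = {}
    for token in doc:
        word = token.text.lower()
        if word not in STOP_WORDS and word not in punctuation:
            freq[word] = freq.get(word, 0) + 1

    # Normalize grades to [0, 1] by the most frequent word's count.
    top = max(freq.values())
    freq = {word: count / top for word, count in freq.items()}

    # A sentence's grade is the sum of its words' grades.
    scores = {sent: sum(freq.get(t.text.lower(), 0.0) for t in sent) for sent in doc.sents}

    # Keep the highest-graded sentences as the summary.
    n = max(1, int(len(scores) * ratio))
    return " ".join(sent.text for sent in nlargest(n, scores, key=scores.get))
```

This is the same scoring loop that app.py runs behind the Streamlit UI.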
app.py ADDED
@@ -0,0 +1,77 @@
+ import subprocess
+
+ # Install runtime dependencies that are missing from the Space image
+ # (utils imports PyPDF2, so it must be installed before `from utils import ...`).
+ subprocess.run("pip3 install PyPDF2".split())
+ subprocess.run("python3 -m spacy download en_core_web_sm".split())
+
+ from heapq import nlargest
+ from string import punctuation
+
+ import spacy
+ import streamlit as st
+ from spacy.lang.en.stop_words import STOP_WORDS
+
+ from utils import read_text_from_file
+
+ # --------------------- Prerequisites ------------------------ #
+ stopwords = STOP_WORDS
+ punctuation = punctuation + "\n"
+
+
+ if __name__ == "__main__":
+     st.title("Text Summarizer 📝")
+     st.subheader("Creator: Shreyas Dixit")
+
+     n = st.sidebar.slider("Summarization %", 10, 90, step=10)
+     n = n / 100
+     input_type = st.selectbox("Pick one", ["PDF", "Text"])
+     if input_type == "PDF":
+         # Upload a file
+         uploaded_file = st.file_uploader("Choose a file", type=["pdf", "txt", "docx"])
+         text = read_text_from_file(uploaded_file)
+     elif input_type == "Text":
+         # Paste text
+         text = st.text_area("Input text !")
+
+     if st.button("Summarize"):
+         nlp = spacy.load("en_core_web_sm")
+         doc = nlp(text)
+         # Word frequencies, excluding stop words and punctuation;
+         # keys are lowercased so the sentence-scoring lookup below matches.
+         word_frequency = {}
+         for word in doc:
+             lower = word.text.lower()
+             if lower not in stopwords and lower not in punctuation:
+                 word_frequency[lower] = word_frequency.get(lower, 0) + 1
+         # Normalize the values by the most frequent word's count
+         max_count = max(word_frequency.values())
+         for word in word_frequency:
+             word_frequency[word] = word_frequency[word] / max_count
+         # Sentence tokenization and scoring
+         sentence_tokens = [sent for sent in doc.sents]
+         sentence_score = {}
+         for sent in sentence_tokens:
+             for word in sent:
+                 if word.text.lower() in word_frequency:
+                     sentence_score[sent] = (
+                         sentence_score.get(sent, 0) + word_frequency[word.text.lower()]
+                     )
+         # Create the summary from the top-ranked sentences
+         select_length = int(len(sentence_tokens) * n)
+         summary = nlargest(select_length, sentence_score, key=sentence_score.get)
+         summary = " ".join(sent.text for sent in summary)
+         st.markdown(summary)
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ heapq-max==0.21
+ spacy==2.2.4
+ spacy-loggers==1.0.2
+ streamlit==1.9.0
+ PyPDF2
utils.py ADDED
@@ -0,0 +1,138 @@
+ import re
+ from io import StringIO
+
+ import docx2txt
+ import requests
+ from bs4 import BeautifulSoup
+ from nltk.tokenize import sent_tokenize  # needs the NLTK "punkt" data: nltk.download("punkt")
+ from PyPDF2 import PdfFileReader
+
+ emoji_pattern = re.compile(
+     "["
+     u"\U0001F600-\U0001F64F"  # emoticons
+     u"\U0001F300-\U0001F5FF"  # symbols & pictographs
+     u"\U0001F680-\U0001F6FF"  # transport & map symbols
+     u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
+     u"\U00002702-\U000027B0"
+     u"\U000024C2-\U0001F251"
+     "]+",
+     flags=re.UNICODE,
+ )
+
+
+ def clean_text(x):
+     # x = x.lower()  # lowercase
+     x = x.encode("ascii", "ignore").decode()  # drop non-ASCII characters
+     x = re.sub(r"https*\S+", " ", x)  # URLs
+     x = re.sub(r"@\S+", " ", x)  # mentions
+     x = re.sub(r"#\S+", " ", x)  # hashtags
+     # x = x.replace("'", "")  # remove ticks
+     # x = re.sub("[%s]" % re.escape(string.punctuation), " ", x)  # punctuation
+     # x = re.sub(r"\w*\d+\w*", "", x)  # numbers
+     x = re.sub(r"\s{2,}", " ", x)  # extra spaces
+     x = emoji_pattern.sub(r"", x)  # emojis
+     x = re.sub("[^.,!?A-Za-z0-9]+", " ", x)  # special characters except .,!?
+
+     return x
+
+
+ def fetch_article_text(url: str):
+     # Scrape the page's headings and paragraphs, then split the article
+     # into chunks of at most ~500 words without breaking sentences.
+     r = requests.get(url)
+     soup = BeautifulSoup(r.text, "html.parser")
+     results = soup.find_all(["h1", "p"])
+     text = [result.text for result in results]
+     ARTICLE = " ".join(text)
+     ARTICLE = ARTICLE.replace(".", ".<eos>")
+     ARTICLE = ARTICLE.replace("!", "!<eos>")
+     ARTICLE = ARTICLE.replace("?", "?<eos>")
+     sentences = ARTICLE.split("<eos>")
+     current_chunk = 0
+     chunks = []
+     for sentence in sentences:
+         if len(chunks) == current_chunk + 1:
+             if len(chunks[current_chunk]) + len(sentence.split(" ")) <= 500:
+                 chunks[current_chunk].extend(sentence.split(" "))
+             else:
+                 current_chunk += 1
+                 chunks.append(sentence.split(" "))
+         else:
+             chunks.append(sentence.split(" "))
+
+     for chunk_id in range(len(chunks)):
+         chunks[chunk_id] = " ".join(chunks[chunk_id])
+
+     return ARTICLE, chunks
+
+
+ def preprocess_text_for_abstractive_summarization(tokenizer, text):
+     # Group sentences into chunks whose token counts fit within the
+     # tokenizer's single-sequence limit.
+     sentences = sent_tokenize(text)
+
+     # initialize
+     length = 0
+     chunk = ""
+     chunks = []
+     count = -1
+     for sentence in sentences:
+         count += 1
+         combined_length = (
+             len(tokenizer.tokenize(sentence)) + length
+         )  # add the sentence's token count to the length counter
+
+         if combined_length <= tokenizer.max_len_single_sentence:  # if it fits
+             chunk += sentence + " "  # add the sentence to the chunk
+             length = combined_length  # update the length counter
+
+             # if it is the last sentence
+             if count == len(sentences) - 1:
+                 chunks.append(chunk.strip())  # save the chunk
+
+         else:
+             chunks.append(chunk.strip())  # save the chunk
+
+             # reset
+             length = 0
+             chunk = ""
+
+             # take care of the overflow sentence
+             chunk += sentence + " "
+             length = len(tokenizer.tokenize(sentence))
+
+             # if the overflow sentence is also the last one, save it as well
+             if count == len(sentences) - 1:
+                 chunks.append(chunk.strip())
+
+     return chunks
+
+
+ def read_pdf(file):
+     pdfReader = PdfFileReader(file)
+     count = pdfReader.numPages
+     all_page_text = ""
+     for i in range(count):
+         page = pdfReader.getPage(i)
+         all_page_text += page.extractText()
+
+     return all_page_text
+
+
+ def read_text_from_file(file):
+     # read a plain-text file
+     if file.type == "text/plain":
+         # convert to a string-based IO and read it back as a string
+         stringio = StringIO(file.getvalue().decode("utf-8"))
+         file_content = stringio.read()
+
+     # read a PDF file
+     elif file.type == "application/pdf":
+         file_content = read_pdf(file)
+
+     # read a docx file
+     elif (
+         file.type
+         == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+     ):
+         file_content = docx2txt.process(file)
+
+     return file_content
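For the chunking helper in utils.py, any tokenizer that exposes `.tokenize()` and `.max_len_single_sentence` will do; Hugging Face `transformers` tokenizers provide both. A usage sketch under that assumption (`transformers` is not in requirements.txt, so this is illustrative only; the URL is a placeholder article):

```python
# Illustrative usage; assumes the `transformers` package, which requirements.txt does not pin.
from transformers import AutoTokenizer

from utils import fetch_article_text, preprocess_text_for_abstractive_summarization

tokenizer = AutoTokenizer.from_pretrained("t5-small")
article, _ = fetch_article_text("https://en.wikipedia.org/wiki/Automatic_summarization")
chunks = preprocess_text_for_abstractive_summarization(tokenizer, article)
for chunk in chunks:
    # Each chunk's token count should stay within the single-sequence limit.
    print(len(tokenizer.tokenize(chunk)), "<=", tokenizer.max_len_single_sentence)
```

Note that `sent_tokenize` requires the NLTK `punkt` data (`nltk.download("punkt")`), which neither app.py nor requirements.txt installs.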