Spaces:
Runtime error
Runtime error
olusegun.odewole
committed on
Commit
•
8749106
1
Parent(s):
da6a72e
first commit
Browse files- app.py +21 -0
- apps/__pycache__/paraphraseApp.cpython-38.pyc +0 -0
- apps/__pycache__/summarizeApp.cpython-38.pyc +0 -0
- apps/createPickle.py +10 -0
- apps/paraphraseApp.py +14 -0
- apps/summarizeApp.py +11 -0
- multiapp.py +48 -0
- oneliner.txt +0 -0
- paraphraser.py +20 -0
- requirements.txt +0 -0
- summarizer.py +239 -0
app.py
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from multiapp import MultiApp
|
3 |
+
# import your app modules here
|
4 |
+
from apps import paraphraseApp, summarizeApp
|
5 |
+
|
6 |
+
# Router object that lets the user switch between the registered pages.
app = MultiApp()

# Landing text rendered above the page selector.
st.markdown("""
# ArticleHelp

ArticleHelp provides two services - Paraphrasing and Summarizing. It utilizes TF-IDF Algorithm for summarization and transformer models for paraphrasing.

## Enter your text and see the magic!

""")

# Register each page under the title shown in the selector, in display order.
for _page_title, _page_func in (
    ("Paraphraser", paraphraseApp.app),
    ("Summarizer", summarizeApp.app),
):
    app.add_app(_page_title, _page_func)

# Draw the selector and run whichever page is currently chosen.
app.run()
|
apps/__pycache__/paraphraseApp.cpython-38.pyc
ADDED
Binary file (775 Bytes). View file
|
|
apps/__pycache__/summarizeApp.cpython-38.pyc
ADDED
Binary file (598 Bytes). View file
|
|
apps/createPickle.py
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
2 |
+
import torch
|
3 |
+
import pickle
|
4 |
+
|
5 |
+
# Checkpoint shared by the model and its tokenizer.
_CHECKPOINT = "ramsrigouthamg/t5-large-paraphraser-diverse-high-quality"

model = AutoModelForSeq2SeqLM.from_pretrained(_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT)

# Serialize each object to its own file; the context managers make sure the
# handles are flushed and closed even if pickling fails part-way.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    # tokenizer.pkl must hold the tokenizer, not a second copy of the model.
    pickle.dump(tokenizer, tokenizer_file)

# Move the in-memory model to the GPU when one is available, else keep it on CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
|
apps/paraphraseApp.py
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from paraphraser import get_paraphrased_sentences, model, tokenizer
|
3 |
+
|
4 |
+
def app():
    """Render the paraphraser page.

    Shows a text area and a slider for the number of paraphrases; when the
    'Paraphrase' button is pressed, the text is sent to
    ``get_paraphrased_sentences`` and the returned sentences are written to
    the page.
    """
    st.title('Paraphraser')
    st.write('Please provide the text to be paraphrased')
    user_input = st.text_area('Enter text', '')
    # Keyword arguments keep min/max/default unambiguous: the default value has
    # to lie inside [min_value, max_value] or Streamlit rejects the widget.
    paraphraseNo = st.slider('Number of Paraphrases', min_value=1, max_value=10, value=2)
    if st.button('Paraphrase'):
        if not user_input.strip():
            # Nothing to paraphrase: avoid running the model on empty text.
            st.warning('Please enter some text to paraphrase.')
            return
        # num_beams matches the slider maximum so every requested count is valid.
        output = get_paraphrased_sentences(
            model,
            tokenizer,
            user_input,
            num_beams=10,
            num_return_sequences=paraphraseNo,
        )
        st.write("Paraphrased Text: ")
        st.write(output)
|
13 |
+
|
14 |
+
##get_paraphrased_sentences(model, tokenizer, sentence, num_beams=10, num_return_sequences=10)
|
apps/summarizeApp.py
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from summarizer import run_summarization
|
3 |
+
|
4 |
+
def app():
    """Render the summarizer page.

    Shows a text area; when the 'Summarize' button is pressed, the text is
    passed to ``run_summarization`` and the resulting summary is written to
    the page.
    """
    st.title('Summarizer')
    st.write('Please provide the text to be summarized')
    user_input = st.text_area('Enter text', '')
    if st.button('Summarize'):
        text = str(user_input)
        if not text.strip():
            # Nothing to summarize: skip the pipeline instead of feeding it
            # text that contains no sentences.
            st.warning('Please enter some text to summarize.')
            return
        output1 = run_summarization(text)
        st.write("Text Summary: ")
        st.write(output1)
|
multiapp.py
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Frameworks for running multiple Streamlit applications as a single app.
|
2 |
+
"""
|
3 |
+
import streamlit as st
|
4 |
+
|
5 |
+
class MultiApp:
    """Combine several Streamlit page functions behind one selector.

    Each page is a plain callable registered under a title. ``run`` draws a
    'Navigation' select box listing the titles and calls the chosen page.

    Usage:
        def foo():
            st.title("Hello Foo")

        def bar():
            st.title("Hello Bar")

        app = MultiApp()
        app.add_app("Foo", foo)
        app.add_app("Bar", bar)
        app.run()

    Pages can also live in separate modules that each expose an ``app``
    function:
        import foo
        import bar

        app = MultiApp()
        app.add_app("Foo", foo.app)
        app.add_app("Bar", bar.app)
        app.run()
    """

    def __init__(self):
        # Registered pages, in the order they were added; each entry is a
        # dict with the keys "title" and "function".
        self.apps = []

    def add_app(self, title, func):
        """Register a page.

        Parameters
        ----------
        title:
            label shown for this page in the 'Navigation' select box.
        func:
            callable that renders the page; invoked with no arguments.
        """
        entry = dict(title=title, function=func)
        self.apps.append(entry)

    def run(self):
        """Draw the page selector and render the page that is selected."""

        def _label(entry):
            return entry['title']

        chosen = st.selectbox('Navigation', self.apps, format_func=_label)
        render = chosen['function']
        render()
|
oneliner.txt
ADDED
Binary file (244 Bytes). View file
|
|
paraphraser.py
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import *
|
2 |
+
|
3 |
+
# Checkpoint shared by the paraphrasing model and its tokenizer; both are
# loaded once at import time and re-exported for the Streamlit page.
_PARAPHRASE_CHECKPOINT = "tuner007/pegasus_paraphrase"

tokenizer = PegasusTokenizerFast.from_pretrained(_PARAPHRASE_CHECKPOINT)
model = PegasusForConditionalGeneration.from_pretrained(_PARAPHRASE_CHECKPOINT)
|
5 |
+
|
6 |
+
|
7 |
+
def get_paraphrased_sentences(model, tokenizer, sentence, num_return_sequences=5, num_beams=5):
    """Generate paraphrases of one sentence with a seq2seq model.

    :param model: object exposing ``generate(**inputs, num_beams=..., num_return_sequences=...)``
    :param tokenizer: callable that encodes a list of strings and exposes ``batch_decode``
    :param sentence: text to paraphrase
    :param num_return_sequences: how many paraphrases to return
    :param num_beams: beam width; raised to ``num_return_sequences`` when smaller
    :return: whatever ``tokenizer.batch_decode`` returns for the generated ids
    """
    # Beam search cannot return more sequences than it keeps beams, so widen
    # the beam instead of letting generation fail on such a request.
    num_beams = max(num_beams, num_return_sequences)

    # Encode the sentence as a one-item batch of token ids.
    inputs = tokenizer([sentence], truncation=True, padding="longest", return_tensors="pt")

    # Generate the candidate paraphrases.
    outputs = model.generate(
        **inputs,
        num_beams=num_beams,
        num_return_sequences=num_return_sequences,
    )

    # Turn the generated token ids back into text, dropping special tokens.
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)
|
18 |
+
|
19 |
+
#sentence = "Learning is the process of acquiring new understanding, knowledge, behaviors, skills, values, attitudes, and preferences."
|
20 |
+
#get_paraphrased_sentences(model, tokenizer, sentence, num_beams=10, num_return_sequences=10)
|
requirements.txt
ADDED
Binary file (172 Bytes). View file
|
|
summarizer.py
ADDED
@@ -0,0 +1,239 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import nltk
|
2 |
+
nltk.download('punkt')
|
3 |
+
nltk.download('stopwords')
|
4 |
+
|
5 |
+
import math
|
6 |
+
|
7 |
+
from nltk import sent_tokenize, word_tokenize, PorterStemmer
|
8 |
+
from nltk.corpus import stopwords
|
9 |
+
|
10 |
+
|
11 |
+
def _create_frequency_table(text_string) -> dict:
    """Count how often each stemmed, non-stop word occurs in a text.

    The text is split with ``word_tokenize``. A token is skipped when either
    its lower-cased form or its Porter stem is an English stop word; every
    other token is counted under its stem.

    :param text_string: text to analyse
    :return: mapping of stemmed token -> number of occurrences
    :rtype: dict
    """
    stop_words = set(stopwords.words("english"))
    stemmer = PorterStemmer()

    freq_table = {}
    for token in word_tokenize(text_string):
        lowered = token.lower()
        stem = stemmer.stem(lowered)
        # Check the surface form as well as the stem: stemming can change a
        # stop word (e.g. "this" -> "thi") so that it no longer matches the
        # stop-word list.
        if lowered in stop_words or stem in stop_words:
            continue
        freq_table[stem] = freq_table.get(stem, 0) + 1

    return freq_table
|
34 |
+
|
35 |
+
|
36 |
+
|
37 |
+
def _create_frequency_matrix(sentences):
    """Build a per-sentence table of stemmed, non-stop word counts.

    Each sentence is split with ``word_tokenize``. A token is skipped when
    either its lower-cased form or its Porter stem is an English stop word;
    every other token is counted under its stem.

    :param sentences: iterable of sentence strings
    :return: mapping of the first 15 characters of each sentence -> that
        sentence's ``{stem: count}`` table
    :rtype: dict
    """
    frequency_matrix = {}
    stop_words = set(stopwords.words("english"))
    stemmer = PorterStemmer()

    for sent in sentences:
        freq_table = {}
        for token in word_tokenize(sent):
            lowered = token.lower()
            stem = stemmer.stem(lowered)
            # Check the surface form as well as the stem: stemming can change
            # a stop word (e.g. "this" -> "thi") so that it no longer matches
            # the stop-word list.
            if lowered in stop_words or stem in stop_words:
                continue
            freq_table[stem] = freq_table.get(stem, 0) + 1

        # NOTE: the 15-character prefix is the key the rest of the pipeline
        # looks sentences up by; sentences sharing that prefix overwrite
        # each other here.
        frequency_matrix[sent[:15]] = freq_table

    return frequency_matrix
|
59 |
+
|
60 |
+
|
61 |
+
|
62 |
+
|
63 |
+
def _create_tf_matrix(freq_matrix):
    """Convert per-sentence word counts into term frequencies.

    The term frequency of a word is its count divided by the total number of
    counted words in the same sentence, so the values of each non-empty
    sentence table sum to 1.

    :param freq_matrix: mapping of sentence key -> ``{word: count}``
    :return: mapping of sentence key -> ``{word: term frequency}``; a sentence
        with an empty count table maps to an empty table
    :rtype: dict
    """
    tf_matrix = {}

    for sent, f_table in freq_matrix.items():
        # Total occurrences, not the number of distinct words: a repeated
        # word must not push a frequency above its real share.
        total_words_in_sentence = sum(f_table.values())
        tf_matrix[sent] = {
            word: count / total_words_in_sentence
            for word, count in f_table.items()
        }

    return tf_matrix
|
76 |
+
|
77 |
+
|
78 |
+
|
79 |
+
def _create_documents_per_words(freq_matrix):
    """Count in how many sentences each word appears.

    :param freq_matrix: mapping of sentence key -> ``{word: count}``
    :return: mapping of word -> number of sentence tables containing it
    :rtype: dict
    """
    sentences_per_word = {}

    for word_counts in freq_matrix.values():
        # Each table contributes at most one to a word's tally, regardless of
        # how often the word occurs inside that sentence.
        for term in word_counts:
            sentences_per_word[term] = sentences_per_word.get(term, 0) + 1

    return sentences_per_word
|
90 |
+
|
91 |
+
|
92 |
+
def _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents):
    """Compute the inverse document frequency of every word in every sentence.

    The IDF of a word is ``log10(total_documents / sentences containing it)``.

    :param freq_matrix: mapping of sentence key -> ``{word: count}``
    :param count_doc_per_words: mapping of word -> number of sentences containing it
    :param total_documents: total number of sentences
    :return: mapping of sentence key -> ``{word: idf}``
    :rtype: dict
    """
    return {
        sentence_key: {
            term: math.log10(total_documents / float(count_doc_per_words[term]))
            for term in word_counts
        }
        for sentence_key, word_counts in freq_matrix.items()
    }
|
104 |
+
|
105 |
+
|
106 |
+
|
107 |
+
def _create_tf_idf_matrix(tf_matrix, idf_matrix):
    """Multiply each word's term frequency by its inverse document frequency.

    Values are matched by sentence key and word, so the result does not depend
    on the two matrices listing their entries in the same order.

    :param tf_matrix: mapping of sentence key -> ``{word: tf}``
    :param idf_matrix: mapping of sentence key -> ``{word: idf}``
    :return: mapping of sentence key -> ``{word: tf * idf}``; a word without
        an IDF entry scores 0.0
    :rtype: dict
    """
    tf_idf_matrix = {}

    for sent, tf_table in tf_matrix.items():
        idf_table = idf_matrix.get(sent, {})
        tf_idf_matrix[sent] = {
            word: float(tf_value * idf_table.get(word, 0.0))
            for word, tf_value in tf_table.items()
        }

    return tf_idf_matrix
|
121 |
+
|
122 |
+
|
123 |
+
def _score_sentences(tf_idf_matrix) -> dict:
    """Score each sentence as the mean TF-IDF value of its words.

    :param tf_idf_matrix: mapping of sentence key -> ``{word: tf-idf}``
    :return: mapping of sentence key -> average TF-IDF; a sentence whose
        table is empty scores 0.0
    :rtype: dict
    """
    sentenceValue = {}

    for sent, f_table in tf_idf_matrix.items():
        if not f_table:
            # Every token was filtered out, so there is nothing to average;
            # score it 0.0 rather than dividing by zero.
            sentenceValue[sent] = 0.0
            continue
        sentenceValue[sent] = sum(f_table.values()) / len(f_table)

    return sentenceValue
|
142 |
+
|
143 |
+
def _find_average_score(sentenceValue) -> float:
    """Return the mean of the sentence scores.

    :param sentenceValue: mapping of sentence key -> score
    :return: arithmetic mean of the scores, or 0.0 when there are none
    :rtype: float
    """
    if not sentenceValue:
        # No sentences were scored; report a neutral average instead of
        # dividing by zero.
        return 0.0

    return sum(sentenceValue.values()) / len(sentenceValue)
|
156 |
+
|
157 |
+
|
158 |
+
def _generate_summary(sentences, sentenceValue, threshold):
    """Join the sentences whose score reaches the threshold.

    Sentences are looked up in ``sentenceValue`` by their first 15 characters
    and kept in their original order.

    :param sentences: list of sentence strings
    :param sentenceValue: mapping of 15-character sentence prefix -> score
    :param threshold: minimum score a sentence needs to be included
    :return: the selected sentences, each preceded by a single space
    :rtype: str
    """
    picked = []

    for sentence in sentences:
        key = sentence[:15]
        if key not in sentenceValue:
            continue
        if sentenceValue[key] >= threshold:
            picked.append(" " + sentence)

    return "".join(picked)
|
168 |
+
|
169 |
+
|
170 |
+
def run_summarization(text, threshold_factor=1.3):
    """Produce an extractive summary of a text using TF-IDF sentence scores.

    The text is split into sentences, every sentence is scored by the average
    TF-IDF of its words, and the sentences scoring at least
    ``threshold_factor`` times the average sentence score are returned in
    their original order.

    :param text: plain text of the article to summarize
    :param threshold_factor: multiplier applied to the average sentence score
        to obtain the inclusion threshold
    :return: the selected sentences joined into one string; an empty string
        when the text contains no sentences
    :rtype: str
    """
    # 1 Sentence tokenize
    sentences = sent_tokenize(text)
    if not sentences:
        # Empty or whitespace-only input: there is nothing to score, and the
        # averaging step below cannot handle zero sentences.
        return ''
    total_documents = len(sentences)

    # 2 Create the frequency matrix of the words in each sentence.
    freq_matrix = _create_frequency_matrix(sentences)

    # 3 Term frequency (TF): how often a word appears in a sentence relative
    #   to the words in that sentence.
    tf_matrix = _create_tf_matrix(freq_matrix)

    # 4 Count in how many sentences each word appears.
    count_doc_per_words = _create_documents_per_words(freq_matrix)

    # 5 Inverse document frequency (IDF): how unique or rare a word is.
    idf_matrix = _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents)

    # 6 Combine TF and IDF.
    tf_idf_matrix = _create_tf_idf_matrix(tf_matrix, idf_matrix)

    # 7 Score the sentences.
    sentence_scores = _score_sentences(tf_idf_matrix)

    # 8 The average sentence score is the base for the threshold.
    threshold = _find_average_score(sentence_scores)

    # 9 Keep the sentences that score at or above the scaled threshold.
    summary = _generate_summary(sentences, sentence_scores, threshold_factor * threshold)
    return summary
|
222 |
+
|
223 |
+
|
224 |
+
|
225 |
+
#usage = run_summarization(text_str)
|
226 |
+
|
227 |
+
|
228 |
+
|
229 |
+
|
230 |
+
|
231 |
+
|
232 |
+
|
233 |
+
|
234 |
+
|
235 |
+
|
236 |
+
# def text_summarize(ARTICLE, maxLength, minLength):
|
237 |
+
# output = summarizer(ARTICLE)[0]['summary_text']
|
238 |
+
# ans = text_paraphrase(output)
|
239 |
+
# return ans
|