olusegun.odewole committed
Commit 8749106
Parent: da6a72e

first commit

app.py ADDED
@@ -0,0 +1,21 @@
+ import streamlit as st
+ from multiapp import MultiApp
+ # import your app modules here
+ from apps import paraphraseApp, summarizeApp
+
+ app = MultiApp()
+
+ st.markdown("""
+ # ArticleHelp
+
+ ArticleHelp provides two services: paraphrasing and summarizing. It uses the TF-IDF algorithm for summarization and transformer models for paraphrasing.
+
+ ## Enter your text and see the magic!
+
+ """)
+
+ # Add all your applications here
+ app.add_app("Paraphraser", paraphraseApp.app)
+ app.add_app("Summarizer", summarizeApp.app)
+ # Run the combined app
+ app.run()
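A quick note on this entry point: assuming a standard Streamlit installation, the combined app is started with `streamlit run app.py`; `MultiApp.run()` then renders whichever sub-app is selected in the Navigation dropdown.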
apps/__pycache__/paraphraseApp.cpython-38.pyc ADDED
Binary file (775 Bytes).
 
apps/__pycache__/summarizeApp.cpython-38.pyc ADDED
Binary file (598 Bytes).
 
apps/createPickle.py ADDED
@@ -0,0 +1,10 @@
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+ import torch
+ import pickle
+
+ model = AutoModelForSeq2SeqLM.from_pretrained("ramsrigouthamg/t5-large-paraphraser-diverse-high-quality")
+ tokenizer = AutoTokenizer.from_pretrained("ramsrigouthamg/t5-large-paraphraser-diverse-high-quality")
+ pickle.dump(model, open('model.pkl', 'wb'))
+ pickle.dump(tokenizer, open('tokenizer.pkl', 'wb'))  # dump the tokenizer (the original dumped the model twice)
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ model = model.to(device)
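For completeness, a minimal sketch of how these pickles might be loaded back elsewhere (an assumption; no loader ships in this commit, and paraphraser.py notably reloads its model with from_pretrained rather than from these files):

    import pickle
    # load the serialized model and tokenizer written by createPickle.py
    model = pickle.load(open('model.pkl', 'rb'))
    tokenizer = pickle.load(open('tokenizer.pkl', 'rb'))

Pickling a large transformer works, but the more common pattern is model.save_pretrained(...) paired with from_pretrained(...).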
apps/paraphraseApp.py ADDED
@@ -0,0 +1,14 @@
+ import streamlit as st
+ from paraphraser import get_paraphrased_sentences, model, tokenizer
+
+ def app():
+     st.title('Paraphraser')
+     st.write('Please provide the text to be paraphrased')
+     user_input = st.text_area('Enter text', '')
+     # st.slider(label, min, max, default): 1 to 10 paraphrases (max matches num_beams below), default 2
+     paraphraseNo = st.slider('Number of paraphrases', 1, 10, 2)
+     if st.button('Paraphrase'):
+         output = get_paraphrased_sentences(model, tokenizer, user_input, num_beams=10, num_return_sequences=paraphraseNo)
+         st.write("Paraphrased Text: ")
+         st.write(output)
+
+ # get_paraphrased_sentences(model, tokenizer, sentence, num_beams=10, num_return_sequences=10)
apps/summarizeApp.py ADDED
@@ -0,0 +1,11 @@
+ import streamlit as st
+ from summarizer import run_summarization
+
+ def app():
+     st.title('Summarizer')
+     st.write('Please provide the text to be summarized')
+     user_input = st.text_area('Enter text', '')
+     if st.button('Summarize'):
+         output1 = run_summarization(str(user_input))  # ,minLength,maxLength)
+         st.write("Text Summary: ")
+         st.write(output1)
multiapp.py ADDED
@@ -0,0 +1,48 @@
+ """Framework for running multiple Streamlit applications as a single app.
+ """
+ import streamlit as st
+
+ class MultiApp:
+     """Framework for combining multiple Streamlit applications.
+     Usage:
+         def foo():
+             st.title("Hello Foo")
+         def bar():
+             st.title("Hello Bar")
+         app = MultiApp()
+         app.add_app("Foo", foo)
+         app.add_app("Bar", bar)
+         app.run()
+     It is also possible to keep each application in a separate file:
+         import foo
+         import bar
+         app = MultiApp()
+         app.add_app("Foo", foo.app)
+         app.add_app("Bar", bar.app)
+         app.run()
+     """
+     def __init__(self):
+         self.apps = []
+
+     def add_app(self, title, func):
+         """Adds a new application.
+         Parameters
+         ----------
+         func:
+             the Python function to render this app.
+         title:
+             title of the app. Appears in the navigation dropdown.
+         """
+         self.apps.append({
+             "title": title,
+             "function": func
+         })
+
+     def run(self):
+         # app = st.sidebar.radio(
+         app = st.selectbox(
+             'Navigation',
+             self.apps,
+             format_func=lambda app: app['title'])
+
+         app['function']()
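The commented-out st.sidebar.radio line hints at an alternative layout; a sketch of that variant of the line inside run() (same semantics, the navigation widget simply moves to the sidebar):

    app = st.sidebar.radio(
        'Navigation',
        self.apps,
        format_func=lambda app: app['title'])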
oneliner.txt ADDED
Binary file (244 Bytes).
 
paraphraser.py ADDED
@@ -0,0 +1,20 @@
+ from transformers import PegasusForConditionalGeneration, PegasusTokenizerFast
+
+ model = PegasusForConditionalGeneration.from_pretrained("tuner007/pegasus_paraphrase")
+ tokenizer = PegasusTokenizerFast.from_pretrained("tuner007/pegasus_paraphrase")
+
+
+ def get_paraphrased_sentences(model, tokenizer, sentence, num_return_sequences=5, num_beams=5):
+     # tokenize the text into a list of token IDs
+     inputs = tokenizer([sentence], truncation=True, padding="longest", return_tensors="pt")
+     # generate the paraphrased sentences with beam search
+     outputs = model.generate(
+         **inputs,
+         num_beams=num_beams,
+         num_return_sequences=num_return_sequences,
+     )
+     # decode the generated token IDs back into text
+     return tokenizer.batch_decode(outputs, skip_special_tokens=True)
+
+ # sentence = "Learning is the process of acquiring new understanding, knowledge, behaviors, skills, values, attitudes, and preferences."
+ # get_paraphrased_sentences(model, tokenizer, sentence, num_beams=10, num_return_sequences=10)
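A minimal usage sketch (hypothetical input sentence; assumes the Pegasus checkpoint above has downloaded):

    paraphrases = get_paraphrased_sentences(
        model, tokenizer,
        "The weather is lovely today.",
        num_return_sequences=3, num_beams=5)
    for p in paraphrases:
        print(p)

With beam search, num_return_sequences must not exceed num_beams, so requesting 3 of 5 beams is safe; this is also why the slider in paraphraseApp.py tops out at 10.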
requirements.txt ADDED
Binary file (172 Bytes).
 
summarizer.py ADDED
@@ -0,0 +1,239 @@
+ import nltk
+ nltk.download('punkt')
+ nltk.download('stopwords')
+
+ import math
+
+ from nltk import sent_tokenize, word_tokenize, PorterStemmer
+ from nltk.corpus import stopwords
+
+
+ def _create_frequency_table(text_string) -> dict:
+     """
+     Build a word-frequency table for the whole text: skip stop words and
+     stem each remaining word to its root with the Porter stemmer.
+     :rtype: dict
+     """
+     stopWords = set(stopwords.words("english"))
+     words = word_tokenize(text_string)
+     ps = PorterStemmer()
+
+     freqTable = dict()
+     for word in words:
+         # check against the stop-word list before stemming, since
+         # stemming can alter a stop word (e.g. "was" -> "wa")
+         if word.lower() in stopWords:
+             continue
+         word = ps.stem(word)
+         if word in freqTable:
+             freqTable[word] += 1
+         else:
+             freqTable[word] = 1
+
+     return freqTable
+
+
+ def _create_frequency_matrix(sentences):
+     # one frequency table per sentence, keyed by the sentence's first 15 characters
+     frequency_matrix = {}
+     stopWords = set(stopwords.words("english"))
+     ps = PorterStemmer()
+
+     for sent in sentences:
+         freq_table = {}
+         words = word_tokenize(sent)
+         for word in words:
+             word = word.lower()
+             if word in stopWords:
+                 continue
+             word = ps.stem(word)
+
+             if word in freq_table:
+                 freq_table[word] += 1
+             else:
+                 freq_table[word] = 1
+
+         frequency_matrix[sent[:15]] = freq_table
+
+     return frequency_matrix
+
+
+ def _create_tf_matrix(freq_matrix):
+     tf_matrix = {}
+
+     for sent, f_table in freq_matrix.items():
+         tf_table = {}
+
+         # note: len(f_table) is the number of *distinct* non-stop stems in the sentence
+         count_words_in_sentence = len(f_table)
+         for word, count in f_table.items():
+             tf_table[word] = count / count_words_in_sentence
+
+         tf_matrix[sent] = tf_table
+
+     return tf_matrix
+
+
+ def _create_documents_per_words(freq_matrix):
+     # for each word, count how many sentences ("documents") contain it
+     word_per_doc_table = {}
+
+     for sent, f_table in freq_matrix.items():
+         for word, count in f_table.items():
+             if word in word_per_doc_table:
+                 word_per_doc_table[word] += 1
+             else:
+                 word_per_doc_table[word] = 1
+
+     return word_per_doc_table
+
+
+ def _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents):
+     # idf(w) = log10(N / n_w), with N = total sentences and n_w = sentences containing w
+     idf_matrix = {}
+
+     for sent, f_table in freq_matrix.items():
+         idf_table = {}
+
+         for word in f_table.keys():
+             idf_table[word] = math.log10(total_documents / float(count_doc_per_words[word]))
+
+         idf_matrix[sent] = idf_table
+
+     return idf_matrix
+
+
+ def _create_tf_idf_matrix(tf_matrix, idf_matrix):
+     tf_idf_matrix = {}
+
+     for (sent1, f_table1), (sent2, f_table2) in zip(tf_matrix.items(), idf_matrix.items()):
+
+         tf_idf_table = {}
+
+         for (word1, value1), (word2, value2) in zip(f_table1.items(),
+                                                     f_table2.items()):  # the keys are the same in both tables
+             tf_idf_table[word1] = float(value1 * value2)
+
+         tf_idf_matrix[sent1] = tf_idf_table
+
+     return tf_idf_matrix
+
+
+ def _score_sentences(tf_idf_matrix) -> dict:
+     """
+     Score a sentence by its words' TF-IDF values:
+     sum the TF-IDF of every word in the sentence, divided by the
+     number of distinct words in the sentence.
+     :rtype: dict
+     """
+     sentenceValue = {}
+
+     for sent, f_table in tf_idf_matrix.items():
+         total_score_per_sentence = 0
+
+         count_words_in_sentence = len(f_table)
+         for word, score in f_table.items():
+             total_score_per_sentence += score
+
+         sentenceValue[sent] = total_score_per_sentence / count_words_in_sentence
+
+     return sentenceValue
+
+
+ def _find_average_score(sentenceValue) -> float:
+     """
+     Find the average score from the sentence value dictionary
+     :rtype: float
+     """
+     sumValues = 0
+     for entry in sentenceValue:
+         sumValues += sentenceValue[entry]
+
+     # average score of a sentence in the original text
+     average = (sumValues / len(sentenceValue))
+
+     return average
+
+
+ def _generate_summary(sentences, sentenceValue, threshold):
+     # keep every sentence whose score clears the threshold
+     sentence_count = 0
+     summary = ''
+
+     for sentence in sentences:
+         if sentence[:15] in sentenceValue and sentenceValue[sentence[:15]] >= (threshold):
+             summary += " " + sentence
+             sentence_count += 1
+
+     return summary
+
+
+ def run_summarization(text):
+     """
+     :param text: plain text of a long article
+     :return: summarized text
+     """
+
+     '''
+     We already have a sentence tokenizer, so we just need
+     to run the sent_tokenize() method to create the array of sentences.
+     '''
+     # 1 Sentence tokenize
+     sentences = sent_tokenize(text)
+     total_documents = len(sentences)
+     #print(sentences)
+
+     # 2 Create the frequency matrix of the words in each sentence
+     freq_matrix = _create_frequency_matrix(sentences)
+     #print(freq_matrix)
+
+     '''
+     Term frequency (TF) is how often a word appears in a document,
+     divided by the number of words in that document.
+     '''
+     # 3 Calculate term frequency and generate a matrix
+     tf_matrix = _create_tf_matrix(freq_matrix)
+     #print(tf_matrix)
+
+     # 4 Create a table of documents per word
+     count_doc_per_words = _create_documents_per_words(freq_matrix)
+     #print(count_doc_per_words)
+
+     '''
+     Inverse document frequency (IDF) is how unique or rare a word is.
+     '''
+     # 5 Calculate IDF and generate a matrix
+     idf_matrix = _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents)
+     #print(idf_matrix)
+
+     # 6 Calculate TF-IDF and generate a matrix
+     tf_idf_matrix = _create_tf_idf_matrix(tf_matrix, idf_matrix)
+     #print(tf_idf_matrix)
+
+     # 7 Score the sentences
+     sentence_scores = _score_sentences(tf_idf_matrix)
+     #print(sentence_scores)
+
+     # 8 Find the threshold
+     threshold = _find_average_score(sentence_scores)
+     #print(threshold)
+
+     # 9 Generate the summary: keep sentences scoring at least 1.3x the average
+     summary = _generate_summary(sentences, sentence_scores, 1.3 * threshold)
+     return summary
+
+
+ #usage = run_summarization(text_str)
+
+
+ # def text_summarize(ARTICLE, maxLength, minLength):
+ #     output = summarizer(ARTICLE)[0]['summary_text']
+ #     ans = text_paraphrase(output)
+ #     return ans
+ # return ans