toeknee432 committed
Commit e22a4b9 · 1 Parent(s): b73340a

Upload 10 files
.gitattributes CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+ test_sample.csv filter=lfs diff=lfs merge=lfs -text
my_app.py ADDED
@@ -0,0 +1,23 @@
+ import streamlit as st
+
+ st.set_page_config(
+     page_title="Hello",
+     page_icon="👋",
+ )
+
+ st.markdown("# Welcome to the Text Summarization Demo! 👋")
+
+ st.sidebar.success("Select a demo above.")
+
+ st.markdown("By Aaron Tae, Kevin Hamakawa, Emily Gong, Emily Huang, Tony Lei, Ved Phadke, Vivian Lee, Victor Shi")
+
+ st.markdown("## Introduction")
+ st.markdown('''
+ For our project, we looked into text summarization. Based on our research, there are two main categories of text summarization techniques: *extractive* and *abstractive*. As the names suggest,
+ the *extractive summarization* method directly extracts information from the original text, whereas the *abstractive summarization*
+ method uses abstraction to produce a high-level summary, much as a human would.
+ ''')
+ st.markdown("## Feel free to visit the tabs on the left to learn more")
+ st.markdown('Find us on [GitHub](https://github.com/the-data-science-union/DSU-S2023-Text-Summarization) or [Medium](https://www.google.com)')
+
+ st.balloons()
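
A quick illustration of the two families described in the introduction above. This is a standalone, hypothetical sketch (the toy sentences and the length-based `score_sentence` are made up; the app's real extractive scorer and Pegasus model live in the `pages/` files below): extractive summarization keeps the highest-scoring original sentences, while abstractive summarization generates new text with a seq2seq model.

```python
from transformers import pipeline

sentences = [
    "The bill establishes a grant program for rural broadband.",
    "Funds are appropriated for fiscal years 2024 through 2026.",
    "The Secretary shall submit an annual report to Congress.",
]

# Extractive: select existing sentences by some score (a toy length score here,
# standing in for the TF-IDF scoring implemented in pages/2).
def score_sentence(s):
    return len(s.split())

extractive = " ".join(sorted(sentences, key=score_sentence, reverse=True)[:1])

# Abstractive: let a seq2seq model write new text (downloads a large checkpoint on first run).
summarizer = pipeline("summarization", model="google/pegasus-billsum")
abstractive = summarizer(" ".join(sentences))[0]["summary_text"]

print("extractive: ", extractive)
print("abstractive:", abstractive)
```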
pages/.DS_Store ADDED
Binary file (6.15 kB)
pages/.ipynb_checkpoints/1_👉_Understand_the_Data-checkpoint.py ADDED
@@ -0,0 +1,31 @@
+ import streamlit as st
+ import pandas as pd
+ import random
+ st.set_page_config(page_title="Understand the Data", layout="wide")
+
+ st.markdown("# Understand the Data")
+ dataset = pd.read_csv("test_sample.csv")
+
+ txt = dataset.iat[0, 0]
+ original_summary = dataset.iat[0, 1]
+
+ col1, col2 = st.columns(2)
+ with col1:
+     st.header("Original `Billsum` test set:")
+     st.write(dataset.head(10))
+     avg_len_text = dataset['text'].str.len().mean()
+     avg_len_summary = dataset['summary'].str.len().mean()
+     avg_len_title = dataset['title'].str.len().mean()
+     st.write("Average length of a Bill:", avg_len_text)
+     st.write("Average length of a Summary:", avg_len_summary)
+     st.write("Average length of a Title:", avg_len_title)
+ with col2:
+     st.header("Example:")
+     if st.button('Randomly generate a Bill Example'):
+         my_num = random.randrange(len(dataset))
+         txt = dataset.iat[my_num, 0]
+         original_summary = dataset.iat[my_num, 1]
+     else:
+         pass
+     txt = st.text_area('Text', txt, height=250)
+     original_summary = st.text_area('Corresponding summary', original_summary, height=250)
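
A note on `test_sample.csv`, which the page above reads and which this commit tracks with Git LFS: it appears to be a dump of the BillSum test split with `text`, `summary`, and `title` columns (the column order matters, since the pages index it with `iat[row, 0]` and `iat[row, 1]`). A hedged sketch of how such a file could be regenerated, mirroring the commented-out `load_dataset("billsum", split="test")` lines in the comparison page:

```python
import pandas as pd
from datasets import load_dataset

# Rebuild a CSV with the column order the pages expect: text first, summary second.
test = load_dataset("billsum", split="test")
df = pd.DataFrame(test)[["text", "summary", "title"]]
df.to_csv("test_sample.csv", index=False)
```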
pages/.ipynb_checkpoints/2_🆚_Extractive_vs_Abstractive-checkpoint.py ADDED
@@ -0,0 +1,328 @@
+ import streamlit as st
+ import pandas as pd
+ import numpy as np
+ import nltk
+ # Download for first time
+ nltk.download('stopwords')
+ nltk.download('punkt')
+ from nltk.tokenize import sent_tokenize, word_tokenize
+ from nltk.corpus import stopwords
+ import regex as re
+ from nltk.stem.snowball import SnowballStemmer, PorterStemmer
+ from datasets import load_dataset
+ import copy
+ from rouge import Rouge
+ import random
+ from sklearn.metrics.pairwise import cosine_similarity
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from heapq import nlargest
+ from transformers import PegasusForConditionalGeneration, PegasusTokenizer
+ import torch
+
+ @st.cache_resource(show_spinner=False)
+ def load_model():
+     #model_name = 'google/pegasus-large'
+     model_name = 'google/pegasus-billsum'
+     torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
+     #run using local model
+     #tokenizer = PegasusTokenizer.from_pretrained(model_name, use_auth_token=True)
+     #model = PegasusForConditionalGeneration.from_pretrained(model_name, max_position_embeddings=2000, use_auth_token=True).to(torch_device)
+     tokenizer = PegasusTokenizer.from_pretrained("local_pegasus-billsum_tokenizer", use_auth_token=True)
+     model = PegasusForConditionalGeneration.from_pretrained("local_pegasus-billsum_tokenizer_model", max_position_embeddings=2000, use_auth_token=True).to(torch_device)
+     #tokenizer = PegasusTokenizer.from_pretrained("local_pegasus-billsum_tokenizer")
+     #model = PegasusForConditionalGeneration.from_pretrained("local_pegasus-billsum_tokenizer_model", max_position_embeddings=2000).to(torch_device)
+     return model, tokenizer
+
+ model, tokenizer = load_model()
+
+ #run this the first time and use the local model for faster runtime
+ #tokenizer.save_pretrained("local_pegasus-billsum_tokenizer")
+ #model.save_pretrained("local_pegasus-billsum_tokenizer_model")
+
+
+ en_stopwords = nltk.corpus.stopwords.words('english')
+ stemmer = SnowballStemmer("english")
+
+ def preprocessing(string):
+     '''
+     Given 1 single str,
+     returns a cleaned sentence
+     '''
+     # take out symbols
+     string = re.sub(r'\([^)]*\)', '', string)
+     string = re.sub('\n', '', string)
+     string = re.sub('<n>', '', string)
+     string = re.sub(' +', ' ', string)
+     string = re.sub(r'[^\w\s\.\,]', '', string)
+     string = re.sub('\.(?!\s|\d|$)', '. ', string)
+     string = string.lower()
+     return string
+
+ def delete_leading_white_spaces(string):
+     return re.sub(r'^[ \t]+', '', string)
+
+ def clear_leading_white_tab(string):
+     '''
+     Given 1 single string, clean out all the tabs (4 white spaces)
+     '''
+     if len(string) == 0: return ""
+     if string[:4] == '    ':
+         return clear_leading_white_tab(string[4:])
+     else:
+         return string[:4] + clear_leading_white_tab(string[4:])
+
+ def further_split(ugly_string):
+     '''
+     Given a string with newlines \n in it,
+     returns a list of actual sentences
+     '''
+     lines = ugly_string.split('\n')
+     cleaned = []
+     for line in lines:
+         cleaned.append(clear_leading_white_tab(line))
+     condensed = []
+     for i in range(len(cleaned)):
+         p = cleaned[i][0] == '(' and cleaned[i][2] == ')'
+         if p or cleaned[i][:3] == '``(':
+             condensed.append(cleaned[i])
+         elif len(condensed) == 0:
+             condensed.append(cleaned[i])
+         else:
+             condensed[-1] += cleaned[i]
+     return condensed
+
+ def split_right(long_string):
+     '''
+     Given a long string (a whole bill),
+     performs sentence tokenization (rather than tokenizing based on periods)
+     '''
+     result = []
+     paragraphs = long_string.split('\n\n')
+     for paragraph in paragraphs:
+         if '\n' in paragraph:
+             split_ps = further_split(paragraph)
+             for sent in split_ps:
+                 result.append(sent)
+         else:
+             result.append(paragraph)
+     return result
+
+
+ def stemming(list_of_tokenized_strings):
+     '''
+     Given tokenized sentences as a list,
+     returns a 2d list of stemmed sentences
+     '''
+     processed_sentences = []
+     for i in range(len(list_of_tokenized_strings)):
+         words = word_tokenize(list_of_tokenized_strings[i])
+         stemmed_words = []
+         for j in range(len(words)):
+             word = stemmer.stem(words[j])
+             if word not in en_stopwords:
+                 stemmed_words.append(word)
+         processed_sentences.append(stemmed_words)
+     return processed_sentences
+
+ def create_freq_matrix(preprocessed_sentences, stemmed_sentences):
+     '''
+     Given two 2d arrays preprocessed_sentences and stemmed_sentences,
+     returns a nested frequency matrix in the form of
+     {'sent' : {'word1': freq1, 'word2': freq2}}
+     '''
+     freq_matrix = {}
+     for i in range(len(stemmed_sentences)):
+         freq_table = {}
+         for j in range(len(stemmed_sentences[i])):
+             word = stemmed_sentences[i][j]
+             if word in freq_table:
+                 freq_table[word] += 1
+             else:
+                 freq_table[word] = 1
+         sent = preprocessed_sentences[i]
+         freq_matrix[sent] = freq_table
+     return freq_matrix
+
+ def tf(freq_matrix):
+     # value is the frequency dictionary
+     tf_matrix = copy.deepcopy(freq_matrix)
+     for sent, freq_dict in tf_matrix.items():
+         for key, value in freq_dict.items():
+             freq_dict[key] = value / len(freq_dict)
+     return tf_matrix
+
+ def num_sent_per_word(stemmed_sentences):
+     '''
+     Given a 2d array stemmed_sentences, returns a dict mapping each word to the number of sentences containing it
+     '''
+     num_sent_per_word = {}
+     for i in range(len(stemmed_sentences)):
+         for j in range(len(stemmed_sentences[i])):
+             word = stemmed_sentences[i][j]
+             if word in num_sent_per_word:
+                 num_sent_per_word[word] += 1
+             else:
+                 num_sent_per_word[word] = 1
+     return num_sent_per_word
+
+ def idf(freq_matrix, num_sent_per_word, num_sent):
+     idf = copy.deepcopy(freq_matrix)
+     for sent, freq_dict in idf.items():
+         for key, value in freq_dict.items():
+             freq_dict[key] = np.log(num_sent / num_sent_per_word[key])
+     return idf
+
+ def tf_idf(tf, idf):
+     tf_idf = {}
+     for (k, v), (k2, v2) in zip(tf.items(), idf.items()):
+         tf_idf_table = {}
+         for (key, tf_v), (key2, idf_v) in zip(v.items(), v2.items()):
+             tf_idf_table[key] = tf_v * idf_v
+         tf_idf[k] = tf_idf_table
+     return tf_idf
+
+ def score_sentences(tf_idf_matrix):
+     sent_scores = {}
+
+     for sent, tf_idf in tf_idf_matrix.items():
+         sent_score = 0
+         sent_len = len(tf_idf)
+         for word, tf_idf_score in tf_idf.items():
+             sent_score += tf_idf_score
+         sent_scores[sent] = sent_score / sent_len
+     return sent_scores
+
+ def average_sent_score(sentences_score):
+     total = 0
+     for sent, sent_score in sentences_score.items():
+         total += sent_score
+     avg = total / len(sentences_score)
+     return avg
+
+ def generate_summary(sentences, sentenceValue, threshold):
+     sentence_count = 0
+     summary = ''
+
+     for sentence in sentences:
+         if sentence in sentenceValue and sentenceValue[sentence] >= (threshold):
+             summary += " " + sentence
+             sentence_count += 1
+
+     return summary
+
+ def everything_generate_summary(original_string, multiplier):
+     '''
+     Given a string of a bill and a multiplier for generating the summary,
+     returns a summary
+     '''
+     # tokenize
+     example_sentences = split_right(original_string)
+     # preprocess
+     cleaned_sentences = []
+     for i in range(len(example_sentences)):
+         cleaned_sentences.append(preprocessing(example_sentences[i]))
+     for i in range(len(cleaned_sentences)):
+         cleaned_sentences[i] = delete_leading_white_spaces(cleaned_sentences[i])
+     # stem
+     stemmed_sentences = stemming(example_sentences)
+     # calculate tf-idf
+     freq_matrix = create_freq_matrix(example_sentences, stemmed_sentences)
+     tf_matrix = tf(freq_matrix)
+     nums_sent_per_word = num_sent_per_word(stemmed_sentences)
+     idf_matrix = idf(freq_matrix, nums_sent_per_word, len(stemmed_sentences))
+     tf_idf_matrix = tf_idf(tf_matrix, idf_matrix)
+     # setting a metric for generating the summary
+     sentences_score = score_sentences(tf_idf_matrix)
+     threshold = average_sent_score(sentences_score)
+     summary = generate_summary(example_sentences, sentences_score, multiplier * threshold)
+     return summary
+
+ def get_rouge_scores(final_summary, original_text):
+     rouge = Rouge()
+     scores = rouge.get_scores(final_summary, original_text)
+     df = pd.DataFrame.from_dict(scores[0])
+     return df
+
+ def sklearn_generate_summary(original_string, n):
+     # tokenize
+     example_sentences = split_right(original_string)
+     # preprocess
+     cleaned_sentences = []
+     for i in range(len(example_sentences)):
+         cleaned_sentences.append(preprocessing(example_sentences[i]))
+     for i in range(len(cleaned_sentences)):
+         cleaned_sentences[i] = delete_leading_white_spaces(cleaned_sentences[i])
+     # vectorize
+     vectorizer = TfidfVectorizer(stop_words='english')
+     tfidf_matrix = vectorizer.fit_transform(cleaned_sentences)
+     # score
+     scores = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1])[0]
+     summary_sentences = nlargest(n, range(len(scores)), key=scores.__getitem__)
+     result_vector = []
+     for i in sorted(summary_sentences):
+         result_vector.append(example_sentences[i])
+     result = " ".join(result_vector)
+
+     return result
+
+
+ # The actual app
+ # dataset = load_dataset("billsum", split = "test")
+ # dataset = pd.DataFrame(dataset)
+ dataset = pd.read_csv("test_sample.csv")
+ txt = dataset.iat[0, 0]
+ original_summary = dataset.iat[0, 1]
+
+ st.set_page_config(page_title="Text Summarizations Side by Side", layout="wide")
+ st.markdown("# Text Summarizations Side by Side")
+
+ if st.button('Randomly generate a Bill Example'):
+     my_num = random.randrange(len(dataset))
+     txt = dataset.iat[my_num, 0]
+     original_summary = dataset.iat[my_num, 1]
+ else:
+     pass
+
+ column1, column2 = st.columns(2)
+ with column1:
+     txt = st.text_area('Text', txt, height=250)
+ with column2:
+     original_summary = st.text_area('Corresponding summary', original_summary, height=250)
+
+
+
+ # txt = st.text_area('Text', txt, height=250)
+ # original_summary = st.text_area('Corresponding summary', original_summary, height=250)
+
+
+ col1, col2, col3 = st.columns(3)
+ with col1:
+     st.header("TF-IDF from scratch:")
+     my_multiplier = st.slider('Please input a multiplier value:', 1.0, 1.5)
+     first_summary = everything_generate_summary(txt, my_multiplier)
+     st.write("#### Summary:")
+     st.write(first_summary)
+     st.write("#### Rouge score:", get_rouge_scores(first_summary, txt))
+
+ with col2:
+     st.header("TF-IDF from Sklearn:")
+     num_of_sentences = st.number_input('How many sentences do you want to generate?', 1)
+     second_summary = sklearn_generate_summary(txt, num_of_sentences)
+     st.write("#### Summary:")
+     st.write(second_summary)
+     st.write("#### Rouge score:", get_rouge_scores(second_summary, txt))
+
+ with col3:
+     st.header("Abstractive summary:")
+     min_l = st.slider('Please input a minimum length (words) for the summary:', 1, 50, step=1, value=20)
+     if(st.button("Generate")):
+         txt_pre = preprocessing(txt)
+         txt_cleaned = delete_leading_white_spaces(txt_pre)
+         batch = tokenizer.prepare_seq2seq_batch(txt_cleaned, truncation=True, padding='longest', return_tensors='pt')
+         translated = model.generate(**batch, min_length=min_l, max_new_tokens=100)
+         abs_summary = tokenizer.batch_decode(translated, skip_special_tokens=True)
+         st.write("#### Summary:")
+         st.write(abs_summary[0], height=400, placeholder="Abstractive Summary", unsafe_allow_html=True)
+         st.write("#### Rouge score:", get_rouge_scores(abs_summary[0], txt))
+     else:
+         pass
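
The from-scratch scorer in the file above reduces to the standard TF-IDF formulas, averaged per sentence. For a stemmed, stopword-filtered word $w$ in sentence $s$ of a bill with $N$ sentences, writing $|s|$ for the number of distinct such words in $s$ and $n_w$ for the number of sentences containing $w$:

$$\mathrm{tf}(w,s)=\frac{\mathrm{count}(w,s)}{|s|},\qquad \mathrm{idf}(w)=\ln\frac{N}{n_w},\qquad \mathrm{score}(s)=\frac{1}{|s|}\sum_{w\in s}\mathrm{tf}(w,s)\,\mathrm{idf}(w)$$

A sentence is kept in the summary when $\mathrm{score}(s) \ge \text{multiplier} \times \overline{\mathrm{score}}$, where $\overline{\mathrm{score}}$ is the mean sentence score and the multiplier is the slider value on the page.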
pages/.ipynb_checkpoints/3_Abstractive Summary-checkpoint.py ADDED
@@ -0,0 +1,35 @@
+ import streamlit as st
+
+ from transformers import PegasusForConditionalGeneration, PegasusTokenizer
+ import torch
+
+ @st.cache_resource(show_spinner=False)
+ def load_model():
+     model_name = 'google/pegasus-large'
+     torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
+     tokenizer = PegasusTokenizer.from_pretrained(model_name)
+     model = PegasusForConditionalGeneration.from_pretrained(model_name, max_position_embeddings=2000).to(torch_device)
+     #tokenizer = PegasusTokenizer.from_pretrained(model_name, use_auth_token=True)
+     #model = PegasusForConditionalGeneration.from_pretrained(model_name, max_position_embeddings=2000, use_auth_token=True).to(torch_device)
+     #run using local model
+     #tokenizer = PegasusTokenizer.from_pretrained("local_pegasus-large_tokenizer")
+     #model = PegasusForConditionalGeneration.from_pretrained("local_pegasus-large_tokenizer_model", max_position_embeddings=2000).to(torch_device)
+     return model, tokenizer
+
+ #run this the first time and use the local model for faster runtime
+ #tokenizer.save_pretrained("local_pegasus-large_tokenizer")
+ #model.save_pretrained("local_pegasus-large_tokenizer_model")
+ model, tokenizer = load_model()
+
+ st.header("Abstractive Summarization with PEGASUS-LARGE")
+ st.text("Try inputting a prompt below!")
+ src_text = st.text_input(placeholder='In mathematics, a metric space is a set together with a notion of distance between its elements, usually called points. The distance is measured by a function called a metric or distance function. Metric spaces are the most general setting for studying many of the concepts of mathematical analysis and geometry. The most familiar example of a metric space is 3-dimensional Euclidean space with its usual notion of distance. Other well-known examples are a sphere equipped with the angular distance and the hyperbolic plane. A metric may correspond to a metaphorical, rather than physical, notion of distance: for example, the set of 100-character Unicode strings can be equipped with the Hamming distance, which measures the number of characters that need to be changed to get from one string to another. Since they are very general, metric spaces are a tool used in many different branches of mathematics. Many types of mathematical objects have a natural notion of distance and therefore admit the structure of a metric space, including Riemannian manifolds, normed vector spaces, and graphs. In abstract algebra, the p-adic numbers arise as elements of the completion of a metric structure on the rational numbers. Metric spaces are also studied in their own right in metric geometry and analysis on metric spaces. Many of the basic notions of mathematical analysis, including balls, completeness, as well as uniform, Lipschitz, and Hölder continuity, can be defined in the setting of metric spaces. Other notions, such as continuity, compactness, and open and closed sets, can be defined for metric spaces, but also in the even more general setting of topological spaces.', label="Input Text")
+
+ if(st.button("Generate")):
+     batch = tokenizer.prepare_seq2seq_batch(src_text, truncation=True, padding='longest', return_tensors='pt')
+     translated = model.generate(**batch, min_length=50)
+     tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
+     st.write("#### Summary:")
+     st.write(tgt_text[0], unsafe_allow_html=True)
+ else:
+     pass
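
The commented-out `save_pretrained` lines in the file above describe the intended workflow: download the checkpoint once, save it next to the app, and reload it from disk on later runs (the BillSum pages already point at such local directories). A minimal sketch of that round trip, using the directory names from the comments:

```python
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

# One-time download and local save; run once, then switch the loaders to the local paths.
model_name = "google/pegasus-large"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name, max_position_embeddings=2000)

tokenizer.save_pretrained("local_pegasus-large_tokenizer")
model.save_pretrained("local_pegasus-large_tokenizer_model")

# Later runs: load from disk instead of the Hub.
tokenizer = PegasusTokenizer.from_pretrained("local_pegasus-large_tokenizer")
model = PegasusForConditionalGeneration.from_pretrained("local_pegasus-large_tokenizer_model", max_position_embeddings=2000)
```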
pages/1_👉_Understand_the_Data.py ADDED
@@ -0,0 +1,31 @@
+ import streamlit as st
+ import pandas as pd
+ import random
+ st.set_page_config(page_title="Understand the Data", layout="wide")
+
+ st.markdown("# Understand the Data")
+ dataset = pd.read_csv("test_sample.csv")
+
+ txt = dataset.iat[0, 0]
+ original_summary = dataset.iat[0, 1]
+
+ col1, col2 = st.columns(2)
+ with col1:
+     st.header("Original `Billsum` test set:")
+     st.write(dataset.head(10))
+     avg_len_text = dataset['text'].str.len().mean()
+     avg_len_summary = dataset['summary'].str.len().mean()
+     avg_len_title = dataset['title'].str.len().mean()
+     st.write("Average length of a Bill:", avg_len_text)
+     st.write("Average length of a Summary:", avg_len_summary)
+     st.write("Average length of a Title:", avg_len_title)
+ with col2:
+     st.header("Example:")
+     if st.button('Randomly generate a Bill Example'):
+         my_num = random.randrange(len(dataset))
+         txt = dataset.iat[my_num, 0]
+         original_summary = dataset.iat[my_num, 1]
+     else:
+         pass
+     txt = st.text_area('Text', txt, height=250)
+     original_summary = st.text_area('Corresponding summary', original_summary, height=250)
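
Since `test_sample.csv` weighs in around 23 MB, every rerun of the page above re-reads the whole file. Assuming a Streamlit release that provides `st.cache_data` (1.18+, the same floor that `st.cache_resource` in the other pages implies), the read could be cached; a sketch, not part of the uploaded file:

```python
import pandas as pd
import streamlit as st

@st.cache_data(show_spinner=False)
def load_sample(path: str = "test_sample.csv") -> pd.DataFrame:
    # Re-executed only when the arguments change; otherwise served from cache.
    return pd.read_csv(path)

dataset = load_sample()
```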
pages/2_🆚_Extractive_vs_Abstractive.py ADDED
@@ -0,0 +1,328 @@
+ import streamlit as st
+ import pandas as pd
+ import numpy as np
+ import nltk
+ # Download for first time
+ nltk.download('stopwords')
+ nltk.download('punkt')
+ from nltk.tokenize import sent_tokenize, word_tokenize
+ from nltk.corpus import stopwords
+ import regex as re
+ from nltk.stem.snowball import SnowballStemmer, PorterStemmer
+ from datasets import load_dataset
+ import copy
+ from rouge import Rouge
+ import random
+ from sklearn.metrics.pairwise import cosine_similarity
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from heapq import nlargest
+ from transformers import PegasusForConditionalGeneration, PegasusTokenizer
+ import torch
+
+ @st.cache_resource(show_spinner=False)
+ def load_model():
+     #model_name = 'google/pegasus-large'
+     model_name = 'google/pegasus-billsum'
+     torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
+     #run using local model
+     #tokenizer = PegasusTokenizer.from_pretrained(model_name, use_auth_token=True)
+     #model = PegasusForConditionalGeneration.from_pretrained(model_name, max_position_embeddings=2000, use_auth_token=True).to(torch_device)
+     tokenizer = PegasusTokenizer.from_pretrained("local_pegasus-billsum_tokenizer", use_auth_token=True)
+     model = PegasusForConditionalGeneration.from_pretrained("local_pegasus-billsum_tokenizer_model", max_position_embeddings=2000, use_auth_token=True).to(torch_device)
+     #tokenizer = PegasusTokenizer.from_pretrained("local_pegasus-billsum_tokenizer")
+     #model = PegasusForConditionalGeneration.from_pretrained("local_pegasus-billsum_tokenizer_model", max_position_embeddings=2000).to(torch_device)
+     return model, tokenizer
+
+ model, tokenizer = load_model()
+
+ #run this the first time and use the local model for faster runtime
+ #tokenizer.save_pretrained("local_pegasus-billsum_tokenizer")
+ #model.save_pretrained("local_pegasus-billsum_tokenizer_model")
+
+
+ en_stopwords = nltk.corpus.stopwords.words('english')
+ stemmer = SnowballStemmer("english")
+
+ def preprocessing(string):
+     '''
+     Given 1 single str,
+     returns a cleaned sentence
+     '''
+     # take out symbols
+     string = re.sub(r'\([^)]*\)', '', string)
+     string = re.sub('\n', '', string)
+     string = re.sub('<n>', '', string)
+     string = re.sub(' +', ' ', string)
+     string = re.sub(r'[^\w\s\.\,]', '', string)
+     string = re.sub('\.(?!\s|\d|$)', '. ', string)
+     string = string.lower()
+     return string
+
+ def delete_leading_white_spaces(string):
+     return re.sub(r'^[ \t]+', '', string)
+
+ def clear_leading_white_tab(string):
+     '''
+     Given 1 single string, clean out all the tabs (4 white spaces)
+     '''
+     if len(string) == 0: return ""
+     if string[:4] == '    ':
+         return clear_leading_white_tab(string[4:])
+     else:
+         return string[:4] + clear_leading_white_tab(string[4:])
+
+ def further_split(ugly_string):
+     '''
+     Given a string with newlines \n in it,
+     returns a list of actual sentences
+     '''
+     lines = ugly_string.split('\n')
+     cleaned = []
+     for line in lines:
+         cleaned.append(clear_leading_white_tab(line))
+     condensed = []
+     for i in range(len(cleaned)):
+         p = cleaned[i][0] == '(' and cleaned[i][2] == ')'
+         if p or cleaned[i][:3] == '``(':
+             condensed.append(cleaned[i])
+         elif len(condensed) == 0:
+             condensed.append(cleaned[i])
+         else:
+             condensed[-1] += cleaned[i]
+     return condensed
+
+ def split_right(long_string):
+     '''
+     Given a long string (a whole bill),
+     performs sentence tokenization (rather than tokenizing based on periods)
+     '''
+     result = []
+     paragraphs = long_string.split('\n\n')
+     for paragraph in paragraphs:
+         if '\n' in paragraph:
+             split_ps = further_split(paragraph)
+             for sent in split_ps:
+                 result.append(sent)
+         else:
+             result.append(paragraph)
+     return result
+
+
+ def stemming(list_of_tokenized_strings):
+     '''
+     Given tokenized sentences as a list,
+     returns a 2d list of stemmed sentences
+     '''
+     processed_sentences = []
+     for i in range(len(list_of_tokenized_strings)):
+         words = word_tokenize(list_of_tokenized_strings[i])
+         stemmed_words = []
+         for j in range(len(words)):
+             word = stemmer.stem(words[j])
+             if word not in en_stopwords:
+                 stemmed_words.append(word)
+         processed_sentences.append(stemmed_words)
+     return processed_sentences
+
+ def create_freq_matrix(preprocessed_sentences, stemmed_sentences):
+     '''
+     Given two 2d arrays preprocessed_sentences and stemmed_sentences,
+     returns a nested frequency matrix in the form of
+     {'sent' : {'word1': freq1, 'word2': freq2}}
+     '''
+     freq_matrix = {}
+     for i in range(len(stemmed_sentences)):
+         freq_table = {}
+         for j in range(len(stemmed_sentences[i])):
+             word = stemmed_sentences[i][j]
+             if word in freq_table:
+                 freq_table[word] += 1
+             else:
+                 freq_table[word] = 1
+         sent = preprocessed_sentences[i]
+         freq_matrix[sent] = freq_table
+     return freq_matrix
+
+ def tf(freq_matrix):
+     # value is the frequency dictionary
+     tf_matrix = copy.deepcopy(freq_matrix)
+     for sent, freq_dict in tf_matrix.items():
+         for key, value in freq_dict.items():
+             freq_dict[key] = value / len(freq_dict)
+     return tf_matrix
+
+ def num_sent_per_word(stemmed_sentences):
+     '''
+     Given a 2d array stemmed_sentences, returns a dict mapping each word to the number of sentences containing it
+     '''
+     num_sent_per_word = {}
+     for i in range(len(stemmed_sentences)):
+         for j in range(len(stemmed_sentences[i])):
+             word = stemmed_sentences[i][j]
+             if word in num_sent_per_word:
+                 num_sent_per_word[word] += 1
+             else:
+                 num_sent_per_word[word] = 1
+     return num_sent_per_word
+
+ def idf(freq_matrix, num_sent_per_word, num_sent):
+     idf = copy.deepcopy(freq_matrix)
+     for sent, freq_dict in idf.items():
+         for key, value in freq_dict.items():
+             freq_dict[key] = np.log(num_sent / num_sent_per_word[key])
+     return idf
+
+ def tf_idf(tf, idf):
+     tf_idf = {}
+     for (k, v), (k2, v2) in zip(tf.items(), idf.items()):
+         tf_idf_table = {}
+         for (key, tf_v), (key2, idf_v) in zip(v.items(), v2.items()):
+             tf_idf_table[key] = tf_v * idf_v
+         tf_idf[k] = tf_idf_table
+     return tf_idf
+
+ def score_sentences(tf_idf_matrix):
+     sent_scores = {}
+
+     for sent, tf_idf in tf_idf_matrix.items():
+         sent_score = 0
+         sent_len = len(tf_idf)
+         for word, tf_idf_score in tf_idf.items():
+             sent_score += tf_idf_score
+         sent_scores[sent] = sent_score / sent_len
+     return sent_scores
+
+ def average_sent_score(sentences_score):
+     total = 0
+     for sent, sent_score in sentences_score.items():
+         total += sent_score
+     avg = total / len(sentences_score)
+     return avg
+
+ def generate_summary(sentences, sentenceValue, threshold):
+     sentence_count = 0
+     summary = ''
+
+     for sentence in sentences:
+         if sentence in sentenceValue and sentenceValue[sentence] >= (threshold):
+             summary += " " + sentence
+             sentence_count += 1
+
+     return summary
+
+ def everything_generate_summary(original_string, multiplier):
+     '''
+     Given a string of a bill and a multiplier for generating the summary,
+     returns a summary
+     '''
+     # tokenize
+     example_sentences = split_right(original_string)
+     # preprocess
+     cleaned_sentences = []
+     for i in range(len(example_sentences)):
+         cleaned_sentences.append(preprocessing(example_sentences[i]))
+     for i in range(len(cleaned_sentences)):
+         cleaned_sentences[i] = delete_leading_white_spaces(cleaned_sentences[i])
+     # stem
+     stemmed_sentences = stemming(example_sentences)
+     # calculate tf-idf
+     freq_matrix = create_freq_matrix(example_sentences, stemmed_sentences)
+     tf_matrix = tf(freq_matrix)
+     nums_sent_per_word = num_sent_per_word(stemmed_sentences)
+     idf_matrix = idf(freq_matrix, nums_sent_per_word, len(stemmed_sentences))
+     tf_idf_matrix = tf_idf(tf_matrix, idf_matrix)
+     # setting a metric for generating the summary
+     sentences_score = score_sentences(tf_idf_matrix)
+     threshold = average_sent_score(sentences_score)
+     summary = generate_summary(example_sentences, sentences_score, multiplier * threshold)
+     return summary
+
+ def get_rouge_scores(final_summary, original_text):
+     rouge = Rouge()
+     scores = rouge.get_scores(final_summary, original_text)
+     df = pd.DataFrame.from_dict(scores[0])
+     return df
+
+ def sklearn_generate_summary(original_string, n):
+     # tokenize
+     example_sentences = split_right(original_string)
+     # preprocess
+     cleaned_sentences = []
+     for i in range(len(example_sentences)):
+         cleaned_sentences.append(preprocessing(example_sentences[i]))
+     for i in range(len(cleaned_sentences)):
+         cleaned_sentences[i] = delete_leading_white_spaces(cleaned_sentences[i])
+     # vectorize
+     vectorizer = TfidfVectorizer(stop_words='english')
+     tfidf_matrix = vectorizer.fit_transform(cleaned_sentences)
+     # score
+     scores = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1])[0]
+     summary_sentences = nlargest(n, range(len(scores)), key=scores.__getitem__)
+     result_vector = []
+     for i in sorted(summary_sentences):
+         result_vector.append(example_sentences[i])
+     result = " ".join(result_vector)
+
+     return result
+
+
+ # The actual app
+ # dataset = load_dataset("billsum", split = "test")
+ # dataset = pd.DataFrame(dataset)
+ dataset = pd.read_csv("test_sample.csv")
+ txt = dataset.iat[0, 0]
+ original_summary = dataset.iat[0, 1]
+
+ st.set_page_config(page_title="Text Summarizations Side by Side", layout="wide")
+ st.markdown("# Text Summarizations Side by Side")
+
+ if st.button('Randomly generate a Bill Example'):
+     my_num = random.randrange(len(dataset))
+     txt = dataset.iat[my_num, 0]
+     original_summary = dataset.iat[my_num, 1]
+ else:
+     pass
+
+ column1, column2 = st.columns(2)
+ with column1:
+     txt = st.text_area('Text', txt, height=250)
+ with column2:
+     original_summary = st.text_area('Corresponding summary', original_summary, height=250)
+
+
+
+ # txt = st.text_area('Text', txt, height=250)
+ # original_summary = st.text_area('Corresponding summary', original_summary, height=250)
+
+
+ col1, col2, col3 = st.columns(3)
+ with col1:
+     st.header("TF-IDF from scratch:")
+     my_multiplier = st.slider('Please input a multiplier value:', 1.0, 1.5)
+     first_summary = everything_generate_summary(txt, my_multiplier)
+     st.write("#### Summary:")
+     st.write(first_summary)
+     st.write("#### Rouge score:", get_rouge_scores(first_summary, txt))
+
+ with col2:
+     st.header("TF-IDF from Sklearn:")
+     num_of_sentences = st.number_input('How many sentences do you want to generate?', 1)
+     second_summary = sklearn_generate_summary(txt, num_of_sentences)
+     st.write("#### Summary:")
+     st.write(second_summary)
+     st.write("#### Rouge score:", get_rouge_scores(second_summary, txt))
+
+ with col3:
+     st.header("Abstractive summary:")
+     min_l = st.slider('Please input a minimum length (words) for the summary:', 1, 50, step=1, value=20)
+     if(st.button("Generate")):
+         txt_pre = preprocessing(txt)
+         txt_cleaned = delete_leading_white_spaces(txt_pre)
+         batch = tokenizer.prepare_seq2seq_batch(txt_cleaned, truncation=True, padding='longest', return_tensors='pt')
+         translated = model.generate(**batch, min_length=min_l, max_new_tokens=100)
+         abs_summary = tokenizer.batch_decode(translated, skip_special_tokens=True)
+         st.write("#### Summary:")
+         st.write(abs_summary[0], height=400, placeholder="Abstractive Summary", unsafe_allow_html=True)
+         st.write("#### Rouge score:", get_rouge_scores(abs_summary[0], txt))
+     else:
+         pass
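
`PegasusTokenizer.prepare_seq2seq_batch`, used in the Generate branch above, is deprecated in recent transformers releases in favor of calling the tokenizer directly. An equivalent form of that branch (reusing `txt_cleaned`, `tokenizer`, `model`, and `min_l` from the file) would look roughly like this:

```python
# Same truncation/padding/generation settings as above, without the deprecated helper.
batch = tokenizer(txt_cleaned, truncation=True, padding="longest", return_tensors="pt").to(model.device)
translated = model.generate(**batch, min_length=min_l, max_new_tokens=100)
abs_summary = tokenizer.batch_decode(translated, skip_special_tokens=True)
```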
pages/3_Abstractive Summary.py ADDED
@@ -0,0 +1,35 @@
+ import streamlit as st
+
+ from transformers import PegasusForConditionalGeneration, PegasusTokenizer
+ import torch
+
+ @st.cache_resource(show_spinner=False)
+ def load_model():
+     model_name = 'google/pegasus-large'
+     torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
+     tokenizer = PegasusTokenizer.from_pretrained(model_name)
+     model = PegasusForConditionalGeneration.from_pretrained(model_name, max_position_embeddings=2000).to(torch_device)
+     #tokenizer = PegasusTokenizer.from_pretrained(model_name, use_auth_token=True)
+     #model = PegasusForConditionalGeneration.from_pretrained(model_name, max_position_embeddings=2000, use_auth_token=True).to(torch_device)
+     #run using local model
+     #tokenizer = PegasusTokenizer.from_pretrained("local_pegasus-large_tokenizer")
+     #model = PegasusForConditionalGeneration.from_pretrained("local_pegasus-large_tokenizer_model", max_position_embeddings=2000).to(torch_device)
+     return model, tokenizer
+
+ #run this the first time and use the local model for faster runtime
+ #tokenizer.save_pretrained("local_pegasus-large_tokenizer")
+ #model.save_pretrained("local_pegasus-large_tokenizer_model")
+ model, tokenizer = load_model()
+
+ st.header("Abstractive Summarization with PEGASUS-LARGE")
+ st.text("Try inputting a prompt below!")
+ src_text = st.text_input(placeholder='In mathematics, a metric space is a set together with a notion of distance between its elements, usually called points. The distance is measured by a function called a metric or distance function. Metric spaces are the most general setting for studying many of the concepts of mathematical analysis and geometry. The most familiar example of a metric space is 3-dimensional Euclidean space with its usual notion of distance. Other well-known examples are a sphere equipped with the angular distance and the hyperbolic plane. A metric may correspond to a metaphorical, rather than physical, notion of distance: for example, the set of 100-character Unicode strings can be equipped with the Hamming distance, which measures the number of characters that need to be changed to get from one string to another. Since they are very general, metric spaces are a tool used in many different branches of mathematics. Many types of mathematical objects have a natural notion of distance and therefore admit the structure of a metric space, including Riemannian manifolds, normed vector spaces, and graphs. In abstract algebra, the p-adic numbers arise as elements of the completion of a metric structure on the rational numbers. Metric spaces are also studied in their own right in metric geometry and analysis on metric spaces. Many of the basic notions of mathematical analysis, including balls, completeness, as well as uniform, Lipschitz, and Hölder continuity, can be defined in the setting of metric spaces. Other notions, such as continuity, compactness, and open and closed sets, can be defined for metric spaces, but also in the even more general setting of topological spaces.', label="Input Text")
+
+ if(st.button("Generate")):
+     batch = tokenizer.prepare_seq2seq_batch(src_text, truncation=True, padding='longest', return_tensors='pt')
+     translated = model.generate(**batch, min_length=50)
+     tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
+     st.write("#### Summary:")
+     st.write(tgt_text[0], unsafe_allow_html=True)
+ else:
+     pass
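
The load-and-generate flow in the page above can also be exercised outside Streamlit as a quick sanity check. A minimal standalone sketch with the model name and generation settings taken from the file (the input text is arbitrary):

```python
import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-large")
model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-large").to(device)

text = "A metric space is a set together with a notion of distance between its elements."
batch = tokenizer(text, truncation=True, padding="longest", return_tensors="pt").to(device)
summary_ids = model.generate(**batch, min_length=50)
print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True)[0])
```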
requirements.txt ADDED
@@ -0,0 +1,12 @@
+ streamlit==1.12.0
+ pandas==1.3.4
+ nltk==3.8.1
+ regex==2023.5.5
+ datasets==2.12.0
+ rouge==1.0.1
+ scikit-learn==1.2.2
+ torch==2.0.1
+ torchaudio==2.0.2
+ transformers==4.29.2
+ sentencepiece==0.1.99
+ huggingface-hub==0.15.1
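
One pin worth double-checking against the code in this commit: `st.cache_resource`, used by the model-loading pages, first shipped in Streamlit 1.18, while the file above pins `streamlit==1.12.0`. If the Space raises an `AttributeError` on `st.cache_resource`, either raising the pin or falling back to the 1.12-era decorator should work; both are shown below as assumptions, not something verified against this Space:

```python
# Option 1 (requirements.txt): raise the pin so st.cache_resource exists
#     streamlit>=1.18.0
# Option 2 (keep streamlit==1.12.0): use the older singleton cache for the loaders
@st.experimental_singleton
def load_model():
    ...
```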
test_sample.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:319b8953a0e775b2c55979a8f0416646cd794553f92d6b820c6b15bec9bd71f0
+ size 23165295