import streamlit as st
import pandas as pd
import numpy as np
import nltk
# Download the required NLTK data (no-op after the first run)
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import regex as re
from nltk.stem.snowball import SnowballStemmer, PorterStemmer
from datasets import load_dataset
import copy
from rouge import Rouge
import random
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from heapq import nlargest
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import torch

@st.cache_resource(show_spinner=False)
def load_model():
    '''
    Loads the Pegasus model and tokenizer once per session and caches them
    '''
    # model_name = 'google/pegasus-large'
    model_name = 'google/pegasus-billsum'
    torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # first run: download from the Hugging Face Hub
    # tokenizer = PegasusTokenizer.from_pretrained(model_name)
    # model = PegasusForConditionalGeneration.from_pretrained(model_name, max_position_embeddings=2000).to(torch_device)
    # later runs: load the locally saved copies (see save_pretrained below)
    tokenizer = PegasusTokenizer.from_pretrained("local_pegasus-billsum_tokenizer")
    model = PegasusForConditionalGeneration.from_pretrained("local_pegasus-billsum_tokenizer_model", max_position_embeddings=2000).to(torch_device)
    return model, tokenizer

model,tokenizer = load_model()

# Run the two lines below once (with the Hub download path above) and load the local copies afterwards for faster startup
#tokenizer.save_pretrained("local_pegasus-billsum_tokenizer")
#model.save_pretrained("local_pegasus-billsum_tokenizer_model")




en_stopwords = nltk.corpus.stopwords.words('english')
stemmer = SnowballStemmer("english")

def preprocessing(string):
    '''
    Given a single string,
    returns a cleaned, lower-cased sentence
    '''
    # drop parentheticals, newlines, '<n>' markers, repeated spaces and stray symbols
    string = re.sub(r'\([^)]*\)', '', string)
    string = re.sub(r'\n', '', string)
    string = re.sub('<n>', '', string)
    string = re.sub(' +', ' ', string)
    string = re.sub(r'[^\w\s\.\,]', '', string)
    string = re.sub(r'\.(?!\s|\d|$)', '. ', string)
    string = string.lower()
    return string
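
# Illustrative example (hypothetical input): preprocessing("The Secretary (or a designee) shall--\n  act NOW!")
# drops the parenthetical, the newline, the dashes and the exclamation mark and lower-cases
# the rest, giving roughly "the secretary shall act now".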

def delete_leading_white_spaces(string):
    return re.sub(r'^[ \t]+', '', string)

def clear_leading_white_tab(string):
    '''
    Given a single string, strips out the 4-space tab blocks (indentation)
    '''
    if len(string) == 0 : return ""
    if string[:4] == '    ':
        return clear_leading_white_tab(string[4:])
    else:
        return string[:4] + clear_leading_white_tab(string[4:])

def further_split(ugly_string):
    '''
    Given a string with newlines (\n) in it,
    returns a list of actual sentences
    '''
    lines = ugly_string.split('\n')
    cleaned = []
    for line in lines:
        cleaned.append(clear_leading_white_tab(line))
    condensed = []
    for i in range(len(cleaned)):
        p = len(cleaned[i]) > 2 and cleaned[i][0] == '(' and cleaned[i][2] == ')'
        if p or cleaned[i][:3] == '``(':
            condensed.append(cleaned[i])
        elif len(condensed) == 0:
            condensed.append(cleaned[i])
        else:
            # continuation line: re-insert the space that was lost at the line break
            condensed[-1] += ' ' + cleaned[i]
    return condensed

def split_right(long_string):
    '''
    Given a long string (a whole bill),
    Performs sentence tokenization (rather than tokenizing based on period)
    '''
    result = []
    paragraphs = long_string.split('\n\n')
    for paragraph in paragraphs:
        if '\n' in paragraph:
            split_ps = further_split(paragraph)
            for sent in split_ps:
                result.append(sent)
        else:
            result.append(paragraph)
    return result
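
# Illustrative example (hypothetical bill fragment): with the two helpers above,
# split_right("SECTION 1. SHORT TITLE.\n\n    (a) In General.--This Act may be\n    cited as the Example Act.")
# returns roughly
#   ['SECTION 1. SHORT TITLE.', '(a) In General.--This Act may be cited as the Example Act.']
# i.e. blank lines separate paragraphs, "(a)"-style clauses start new sentences,
# and wrapped lines are folded back into the clause they belong to.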


def stemming(list_of_tokenized_strings):
    '''
    Given a list of sentences,
    returns a 2D list of stemmed words with English stopwords removed
    '''
    processed_sentences = []
    for i in range(len(list_of_tokenized_strings)):
        words = word_tokenize(list_of_tokenized_strings[i])
        stemmed_words = []
        for j in range(len(words)):
            word = stemmer.stem(words[j])
            if word not in en_stopwords:
                stemmed_words.append(word)
        processed_sentences.append(stemmed_words) 
    return processed_sentences
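
# Illustrative example (hypothetical input, exact stems depend on the Snowball rules):
# stemming(["the secretary shall transport goods"]) returns roughly
#   [['secretari', 'shall', 'transport', 'good']]
# with "the" dropped as a stopword.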

def create_freq_matrix(preprocessed_sentences, stemmed_sentences):
    '''
    Given two 2D arrays preprocessed_sentences and stemmed_sentences,
    returns a nested frequency matrix in the form of
    {'sent' : {'word1': freq1, 'word2': freq2}}
    '''
    freq_matrix = {}
    for i in range(len(stemmed_sentences)):
        freq_table = {}
        for j in range(len(stemmed_sentences[i])):
            word = stemmed_sentences[i][j]
            if word in freq_table:
                freq_table[word] += 1
            else:
                freq_table[word] = 1
        sent = preprocessed_sentences[i]
        freq_matrix[sent] = freq_table
    return freq_matrix

def tf(freq_matrix):
    '''
    Term frequency: each word count is divided by the number of distinct terms in its sentence
    '''
    tf_matrix = copy.deepcopy(freq_matrix)
    for sent, freq_dict in tf_matrix.items():
        for key, value in freq_dict.items():
            freq_dict[key] = value / len(freq_dict)
    return tf_matrix

def num_sent_per_word(stemmed_sentences):
    '''
    Given a 2D array stemmed_sentences, returns a dict mapping each word to
    the number of sentences that contain it (its document frequency)
    '''
    num_sent_per_word = {}
    for i in range(len(stemmed_sentences)):
        # count each word at most once per sentence
        for word in set(stemmed_sentences[i]):
            if word in num_sent_per_word:
                num_sent_per_word[word] += 1
            else:
                num_sent_per_word[word] = 1
    return num_sent_per_word

def idf(freq_matrix, num_sent_per_word, num_sent):
    '''
    Inverse document frequency: log(total sentences / sentences containing the word)
    '''
    idf = copy.deepcopy(freq_matrix)
    for sent, freq_dict in idf.items():
        for key, value in freq_dict.items():
            freq_dict[key] = np.log(num_sent / num_sent_per_word[key])
    return idf

def tf_idf(tf, idf):
    tf_idf = {}
    for (k,v), (k2,v2) in zip(tf.items(), idf.items()):
        tf_idf_table = {}
        for (key, tf_v), (key2, idf_v) in zip(v.items(), v2.items()):
            tf_idf_table[key] = tf_v * idf_v
        tf_idf[k] = tf_idf_table
    return tf_idf
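
# Worked example (illustrative numbers, not taken from the app): in a 10-sentence bill
# where the stemmed word 'transport' appears once in a sentence with 5 distinct terms
# and occurs in 2 sentences overall, the functions above give
#   tf     = 1 / 5        = 0.2
#   idf    = ln(10 / 2)   ~ 1.609
#   tf-idf = 0.2 * 1.609  ~ 0.322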

def score_sentences(tf_idf_matrix):
    '''
    Scores each sentence as the mean tf-idf of its words
    '''
    sent_scores = {}

    for sent, tf_idf in tf_idf_matrix.items():
        sent_len = len(tf_idf)
        if sent_len == 0:
            # sentence had no non-stopword tokens
            sent_scores[sent] = 0
            continue
        sent_score = 0
        for word, tf_idf_score in tf_idf.items():
            sent_score += tf_idf_score
        sent_scores[sent] = sent_score / sent_len
    return sent_scores

def average_sent_score(sentences_score):
    total = 0
    for sent, sent_score in sentences_score.items():
        total += sent_score
    avg = total/len(sentences_score)
    return avg

def generate_summary(sentences, sentenceValue, threshold):
    sentence_count = 0
    summary = ''

    for sentence in sentences:
        if sentence in sentenceValue and sentenceValue[sentence] >= (threshold):
            summary += " " + sentence
            sentence_count += 1

    return summary

def everything_generate_summary(original_string, multiplier):
    '''
    Given a string of a bill and a multiplier for generating the summary,
    returns a summary
    '''
    # tokenize 
    example_sentences = split_right(original_string)
    # preprocess
    cleaned_sentences = []
    for i in range(len(example_sentences)):
        cleaned_sentences.append(preprocessing(example_sentences[i]))
    for i in range(len(cleaned_sentences)):
        cleaned_sentences[i] = delete_leading_white_spaces(cleaned_sentences[i])
    # stem the cleaned sentences (example_sentences keep the original wording for the final summary)
    stemmed_sentences = stemming(cleaned_sentences)
    # calculate tf-idf
    freq_matrix = create_freq_matrix(example_sentences, stemmed_sentences)
    tf_matrix = tf(freq_matrix)
    nums_sent_per_word = num_sent_per_word(stemmed_sentences)
    idf_matrix = idf(freq_matrix, nums_sent_per_word, len(stemmed_sentences))
    tf_idf_matrix = tf_idf(tf_matrix, idf_matrix)
    # setting a metric for generating summary 
    sentences_score = score_sentences(tf_idf_matrix)
    threshold = average_sent_score(sentences_score)
    summary = generate_summary(example_sentences, sentences_score, multiplier * threshold)
    return summary
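
# Illustrative usage (hypothetical variable): with bill_text holding the full text of one bill,
# everything_generate_summary(bill_text, 1.0) keeps every sentence scoring at or above the
# average tf-idf sentence score; a larger multiplier (e.g. 1.2) keeps fewer, higher-scoring sentences.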

def get_rouge_scores(final_summary, original_text):
    # Rouge raises on an empty hypothesis, so return an empty frame instead
    if not final_summary.strip():
        return pd.DataFrame()
    rouge = Rouge()
    scores = rouge.get_scores(final_summary, original_text)
    df = pd.DataFrame.from_dict(scores[0])
    return df

def sklearn_generate_summary(original_string, n):
    '''
    Given a string of a bill and a number of sentences n,
    returns an extractive summary built with sklearn's TfidfVectorizer
    '''
    # tokenize
    example_sentences = split_right(original_string)
    # preprocess
    cleaned_sentences = []
    for i in range(len(example_sentences)):
        cleaned_sentences.append(preprocessing(example_sentences[i]))
    for i in range(len(cleaned_sentences)):
        cleaned_sentences[i] = delete_leading_white_spaces(cleaned_sentences[i])
    # vectorize
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(cleaned_sentences)
    # score each remaining sentence by its cosine similarity to the bill's last sentence
    scores = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1])[0]
    summary_sentences = nlargest(n, range(len(scores)), key=scores.__getitem__)
    result_vector = []
    for i in sorted(summary_sentences):
        result_vector.append(example_sentences[i])
    result = " ".join(result_vector)
    
    return result
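
# Illustrative usage (hypothetical variable): sklearn_generate_summary(bill_text, 5)
# returns the 5 sentences most similar to the bill's final sentence, in document order.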


# The actual app
# dataset = load_dataset("billsum", split = "test")
# dataset = pd.DataFrame(dataset)
dataset = pd.read_csv("test_sample.csv")
txt = dataset.iat[0, 0]
original_summary = dataset.iat[0, 1]

st.set_page_config(page_title="Text Summarizations Side by Side", layout="wide")
st.markdown("# Text Summarizations Side by Side")

if st.button('Randomly generate a Bill Example'):
    my_num = random.randrange(len(dataset))
    txt = dataset.iat[my_num, 0]
    original_summary = dataset.iat[my_num, 1]

column1, column2 = st.columns(2)
with column1:
    txt = st.text_area('Text', txt, height = 250)
with column2:
    original_summary = st.text_area('Corresponding summary', original_summary, height = 250)





col1, col2, col3 = st.columns(3)
with col1:
    st.header("TF-IDF from scratch:")
    my_multiplier = st.slider('Please input a multiplier value:', 1.0, 1.5)
    first_summary = everything_generate_summary(txt, my_multiplier)
    st.write("#### Summary:")
    st.write(first_summary)
    st.write("#### Rouge score:", get_rouge_scores(first_summary, txt))

with col2:
    st.header("TF-IDF from Sklearn:")
    num_of_sentences = st.number_input('How many sentences do you want to generate?', 1)
    second_summary = sklearn_generate_summary(txt, num_of_sentences)
    st.write("#### Summary:")
    st.write(second_summary)
    st.write("#### Rouge score:", get_rouge_scores(second_summary, txt))

with col3:
    st.header("Abstractive summary:")
    min_l = st.slider('Please input a minimum length (tokens) for the summary:', 1, 50, step=1, value=20)
    if st.button("Generate"):
        txt_pre = preprocessing(txt)
        txt_cleaned = delete_leading_white_spaces(txt_pre)
        batch = tokenizer(txt_cleaned, truncation=True, padding='longest', return_tensors='pt').to(model.device)
        translated = model.generate(**batch, min_length=min_l, max_new_tokens=100)
        abs_summary = tokenizer.batch_decode(translated, skip_special_tokens=True)
        st.write("#### Summary:")
        st.write(abs_summary[0])
        st.write("#### Rouge score:", get_rouge_scores(abs_summary[0], txt))