import streamlit as st
import pandas as pd
import numpy as np
import nltk
# Download the required NLTK data on first run
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import regex as re
from nltk.stem.snowball import SnowballStemmer, PorterStemmer
from datasets import load_dataset
import copy
from rouge import Rouge
import random
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from heapq import nlargest
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import torch
@st.cache_resource(show_spinner=False)
def load_model():
    # model_name = 'google/pegasus-large'
    model_name = 'google/pegasus-billsum'
    torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # To download from the Hub instead of using the local copies:
    # tokenizer = PegasusTokenizer.from_pretrained(model_name)
    # model = PegasusForConditionalGeneration.from_pretrained(model_name, max_position_embeddings=2000).to(torch_device)
    tokenizer = PegasusTokenizer.from_pretrained("local_pegasus-billsum_tokenizer")
    model = PegasusForConditionalGeneration.from_pretrained("local_pegasus-billsum_tokenizer_model", max_position_embeddings=2000).to(torch_device)
    return model, tokenizer
model, tokenizer = load_model()
# Run these once, then load the saved local copies above for faster startup:
# tokenizer.save_pretrained("local_pegasus-billsum_tokenizer")
# model.save_pretrained("local_pegasus-billsum_tokenizer_model")
en_stopwords = nltk.corpus.stopwords.words('english')
stemmer = SnowballStemmer("english")
def preprocessing(string):
    '''
    Given a single str,
    returns a cleaned sentence.
    '''
    # strip parentheticals, newlines, markup remnants, and stray symbols
    string = re.sub(r'\([^)]*\)', '', string)
    string = re.sub(r'\n', '', string)
    string = re.sub(r'<n>', '', string)
    string = re.sub(r' +', ' ', string)
    string = re.sub(r'[^\w\s\.\,]', '', string)
    # make sure every sentence-ending period is followed by a space
    string = re.sub(r'\.(?!\s|\d|$)', '. ', string)
    string = string.lower()
    return string
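# A quick illustration (the example input is mine, not from the original app):
#   preprocessing("Hello,\n(see note) World!!  Test.") -> "hello, world test."
# i.e. parentheticals, newlines, and symbols other than '.' and ',' are
# stripped, runs of spaces are collapsed, and the text is lowercased.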
def delete_leading_white_spaces(string):
    return re.sub(r'^[ \t]+', '', string)
def clear_leading_white_tab(string):
    '''
    Given a single string, clean out all the tabs (runs of 4 white spaces).
    '''
    if len(string) == 0:
        return ""
    if string[:4] == '    ':
        return clear_leading_white_tab(string[4:])
    else:
        return string[:4] + clear_leading_white_tab(string[4:])
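# Note the recursion walks the string in aligned 4-character chunks, so it
# also drops any 4-space run that starts on a 4-character boundary inside
# the string. A worked example (input is mine):
#   clear_leading_white_tab("        (a) in general") -> "(a) in general"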
def further_split(ugly_string):
    '''
    Given a string with newlines (\n) in it,
    returns a list of actual sentences.
    '''
    lines = ugly_string.split('\n')
    cleaned = []
    for line in lines:
        cleaned.append(clear_leading_white_tab(line))
    condensed = []
    for i in range(len(cleaned)):
        # lines that open a new clause, e.g. "(a) ..." or "``(a) ..."
        # (the length guard avoids an IndexError on short or empty lines)
        p = len(cleaned[i]) > 2 and cleaned[i][0] == '(' and cleaned[i][2] == ')'
        if p or cleaned[i][:3] == '``(':
            condensed.append(cleaned[i])
        elif len(condensed) == 0:
            condensed.append(cleaned[i])
        else:
            condensed[-1] += cleaned[i]
    return condensed
def split_right(long_string):
    '''
    Given a long string (a whole bill),
    performs sentence tokenization (based on the bill's layout
    rather than tokenizing on periods).
    '''
    result = []
    paragraphs = long_string.split('\n\n')
    for paragraph in paragraphs:
        if '\n' in paragraph:
            split_ps = further_split(paragraph)
            for sent in split_ps:
                result.append(sent)
        else:
            result.append(paragraph)
    return result
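# Sketch of the two-level split above: blank lines separate paragraphs;
# inside a multi-line paragraph, lines that open a clause like "(a) ..."
# or "``(a) ..." start a new sentence, and every other line is appended
# to the sentence before it.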
def stemming(list_of_tokenized_strings):
    '''
    Given tokenized sentences as a list,
    returns a 2d list of stemmed sentences with stopwords removed.
    '''
    processed_sentences = []
    for i in range(len(list_of_tokenized_strings)):
        words = word_tokenize(list_of_tokenized_strings[i])
        stemmed_words = []
        for j in range(len(words)):
            word = stemmer.stem(words[j])
            if word not in en_stopwords:
                stemmed_words.append(word)
        processed_sentences.append(stemmed_words)
    return processed_sentences
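# Rough illustration (expected output, assuming NLTK's standard English
# stopword list): stemming(["The Secretary shall establish a program"])
# -> [['secretari', 'shall', 'establish', 'program']]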
def create_freq_matrix(preprocessed_sentences, stemmed_sentences):
    '''
    Given two 2d arrays preprocessed_sentences and stemmed_sentences,
    returns a nested frequency matrix of the form
    {'sent': {'word1': freq1, 'word2': freq2}}
    '''
    freq_matrix = {}
    for i in range(len(stemmed_sentences)):
        freq_table = {}
        for j in range(len(stemmed_sentences[i])):
            word = stemmed_sentences[i][j]
            if word in freq_table:
                freq_table[word] += 1
            else:
                freq_table[word] = 1
        sent = preprocessed_sentences[i]
        freq_matrix[sent] = freq_table
    return freq_matrix
def tf(freq_matrix):
    '''
    Term frequency: each word count is normalized by the number of
    distinct stemmed words in its sentence.
    '''
    tf_matrix = copy.deepcopy(freq_matrix)
    for sent, freq_dict in tf_matrix.items():
        for key, value in freq_dict.items():
            freq_dict[key] = value / len(freq_dict)
    return tf_matrix
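# Worked example (numbers are mine): for a sentence whose freq_table is
# {'fund': 2, 'grant': 1}, len(freq_dict) is 2 distinct words, so
# tf = {'fund': 1.0, 'grant': 0.5}.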
def num_sent_per_word(stemmed_sentences):
    '''
    Given a 2d array stemmed_sentences, returns a dict mapping each word
    to the number of sentences containing it (its document frequency).
    '''
    num_sent_per_word = {}
    for i in range(len(stemmed_sentences)):
        # iterate over a set so a word repeated within one sentence
        # is counted only once
        for word in set(stemmed_sentences[i]):
            if word in num_sent_per_word:
                num_sent_per_word[word] += 1
            else:
                num_sent_per_word[word] = 1
    return num_sent_per_word
def idf(freq_matrix, num_sent_per_word, num_sent):
    '''
    Inverse document frequency: idf(w) = ln(num_sent / df(w)).
    '''
    idf = copy.deepcopy(freq_matrix)
    for sent, freq_dict in idf.items():
        for key, value in freq_dict.items():
            freq_dict[key] = np.log(num_sent / num_sent_per_word[key])
    return idf
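# Worked example (numbers are mine): a word appearing in 2 of 10 sentences
# scores idf = ln(10 / 2) ≈ 1.61 for every sentence that contains it.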
def tf_idf(tf, idf):
    '''
    Element-wise product of the tf and idf matrices. Both are deep copies
    of the same freq_matrix, so their keys iterate in the same insertion
    order and zip pairs them up correctly.
    '''
    tf_idf = {}
    for (k, v), (k2, v2) in zip(tf.items(), idf.items()):
        tf_idf_table = {}
        for (key, tf_v), (key2, idf_v) in zip(v.items(), v2.items()):
            tf_idf_table[key] = tf_v * idf_v
        tf_idf[k] = tf_idf_table
    return tf_idf
def score_sentences(tf_idf_matrix):
    '''
    Score each sentence by the average tf-idf of its words.
    '''
    sent_scores = {}
    for sent, tf_idf in tf_idf_matrix.items():
        sent_score = 0
        sent_len = len(tf_idf)
        for word, tf_idf_score in tf_idf.items():
            sent_score += tf_idf_score
        # a sentence of only stopwords has no scored words; give it 0
        sent_scores[sent] = sent_score / sent_len if sent_len else 0
    return sent_scores
def average_sent_score(sentences_score):
    total = 0
    for sent, sent_score in sentences_score.items():
        total += sent_score
    avg = total / len(sentences_score)
    return avg
def generate_summary(sentences, sentenceValue, threshold):
    '''
    Keep, in original order, every sentence whose score meets the threshold.
    '''
    summary = ''
    for sentence in sentences:
        if sentence in sentenceValue and sentenceValue[sentence] >= threshold:
            summary += " " + sentence
    return summary
def everything_generate_summary(original_string, multiplier):
    '''
    Given a string of a bill and a multiplier for the score threshold,
    returns an extractive summary.
    '''
    # tokenize into sentences
    example_sentences = split_right(original_string)
    # preprocess (note: the raw sentences, not the cleaned ones, are fed to
    # the steps below so that generate_summary can look them up as dict keys)
    cleaned_sentences = []
    for i in range(len(example_sentences)):
        cleaned_sentences.append(preprocessing(example_sentences[i]))
    for i in range(len(cleaned_sentences)):
        cleaned_sentences[i] = delete_leading_white_spaces(cleaned_sentences[i])
    # stem
    stemmed_sentences = stemming(example_sentences)
    # calculate tf-idf
    freq_matrix = create_freq_matrix(example_sentences, stemmed_sentences)
    tf_matrix = tf(freq_matrix)
    nums_sent_per_word = num_sent_per_word(stemmed_sentences)
    idf_matrix = idf(freq_matrix, nums_sent_per_word, len(stemmed_sentences))
    tf_idf_matrix = tf_idf(tf_matrix, idf_matrix)
    # set the threshold for keeping a sentence in the summary
    sentences_score = score_sentences(tf_idf_matrix)
    threshold = average_sent_score(sentences_score)
    summary = generate_summary(example_sentences, sentences_score, multiplier * threshold)
    return summary
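# Usage sketch (the multiplier value is illustrative): with multiplier = 1.2
# the summary keeps every sentence whose average tf-idf score is at least
# 1.2x the mean sentence score of the bill.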
def get_rouge_scores(final_summary, original_text):
    rouge = Rouge()
    scores = rouge.get_scores(final_summary, original_text)
    df = pd.DataFrame.from_dict(scores[0])
    return df
def sklearn_generate_summary(original_string, n):
    '''
    Extractive summary using sklearn's TfidfVectorizer: each sentence is
    scored by cosine similarity to the last sentence, and the top n
    sentences are returned in document order.
    '''
    # tokenize
    example_sentences = split_right(original_string)
    # preprocess
    cleaned_sentences = []
    for i in range(len(example_sentences)):
        cleaned_sentences.append(preprocessing(example_sentences[i]))
    for i in range(len(cleaned_sentences)):
        cleaned_sentences[i] = delete_leading_white_spaces(cleaned_sentences[i])
    # vectorize
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(cleaned_sentences)
    # score every sentence against the last one
    scores = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1])[0]
    summary_sentences = nlargest(n, range(len(scores)), key=scores.__getitem__)
    result_vector = []
    for i in sorted(summary_sentences):
        result_vector.append(example_sentences[i])
    result = " ".join(result_vector)
    return result
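# Usage sketch (n is illustrative): sklearn_generate_summary(txt, 3) returns,
# in document order, the 3 sentences most similar to the bill's closing
# sentence, which serves as the query vector here.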
# The actual app
# dataset = load_dataset("billsum", split = "test")
# dataset = pd.DataFrame(dataset)
dataset = pd.read_csv("test_sample.csv")
txt = dataset.iat[0, 0]
original_summary = dataset.iat[0, 1]
st.set_page_config(page_title="Text Summarizations Side by Side", layout="wide")
st.markdown("# Text Summarizations Side by Side")
if st.button('Randomly generate a Bill Example'):
    my_num = random.randrange(len(dataset))
    txt = dataset.iat[my_num, 0]
    original_summary = dataset.iat[my_num, 1]
column1, column2 = st.columns(2)
with column1:
    txt = st.text_area('Text', txt, height=250)
with column2:
    original_summary = st.text_area('Corresponding summary', original_summary, height=250)
col1, col2, col3 = st.columns(3)
with col1:
    st.header("TF-IDF from scratch:")
    my_multiplier = st.slider('Please input a multiplier value:', 1.0, 1.5)
    first_summary = everything_generate_summary(txt, my_multiplier)
    st.write("#### Summary:")
    st.write(first_summary)
    st.write("#### Rouge score:", get_rouge_scores(first_summary, txt))
with col2:
    st.header("TF-IDF from Sklearn:")
    num_of_sentences = st.number_input('How many sentences do you want to generate?', 1)
    second_summary = sklearn_generate_summary(txt, num_of_sentences)
    st.write("#### Summary:")
    st.write(second_summary)
    st.write("#### Rouge score:", get_rouge_scores(second_summary, txt))
with col3:
    st.header("Abstractive summary:")
    min_l = st.slider('Please input a minimum length (words) for the summary:', 1, 50, step=1, value=20)
    if st.button("Generate"):
        txt_pre = preprocessing(txt)
        txt_cleaned = delete_leading_white_spaces(txt_pre)
        # prepare_seq2seq_batch is deprecated in recent transformers releases,
        # so call the tokenizer directly; min_length/max_new_tokens count
        # tokens, used here as a rough proxy for words
        batch = tokenizer(txt_cleaned, truncation=True, padding='longest', return_tensors='pt').to(model.device)
        translated = model.generate(**batch, min_length=min_l, max_new_tokens=100)
        abs_summary = tokenizer.batch_decode(translated, skip_special_tokens=True)
        st.write("#### Summary:")
        st.write(abs_summary[0])
        st.write("#### Rouge score:", get_rouge_scores(abs_summary[0], txt))