# text-summurization/pages/2_🆚_Extractive_vs_Abstractive.py
import streamlit as st
import pandas as pd
import numpy as np
import nltk
import regex as re
import copy
import random
import torch
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from datasets import load_dataset
from rouge import Rouge
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from heapq import nlargest
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
# Download the required NLTK data on the first run (no-op afterwards)
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
@st.cache_resource(show_spinner=False)
def load_model():
    '''Loads the PEGASUS-BillSum tokenizer and model from the local copies saved below.'''
    model_name = 'google/pegasus-billsum'  # alternative: 'google/pegasus-large'
    torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # To download from the Hugging Face Hub instead of the local copies, use:
    # tokenizer = PegasusTokenizer.from_pretrained(model_name)
    # model = PegasusForConditionalGeneration.from_pretrained(model_name, max_position_embeddings=2000).to(torch_device)
    tokenizer = PegasusTokenizer.from_pretrained("local_pegasus-billsum_tokenizer")
    model = PegasusForConditionalGeneration.from_pretrained("local_pegasus-billsum_tokenizer_model", max_position_embeddings=2000).to(torch_device)
    return model, tokenizer
model, tokenizer = load_model()
# Run these once to save local copies; load_model() then reuses them for faster startup:
# tokenizer.save_pretrained("local_pegasus-billsum_tokenizer")
# model.save_pretrained("local_pegasus-billsum_tokenizer_model")
en_stopwords = stopwords.words('english')
stemmer = SnowballStemmer("english")
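# Illustration (not executed): the Snowball stemmer maps inflected forms to a
# common stem, e.g. stemmer.stem("provisions") -> "provis" (approximately).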
def preprocessing(string):
    '''
    Given a single string, returns a cleaned, lower-cased version of it.
    '''
    string = re.sub(r'\([^)]*\)', '', string)        # drop parenthesized asides
    string = re.sub(r'\n', ' ', string)              # replace newlines with spaces so adjacent lines don't merge into one word
    string = re.sub(r'<n>', '', string)              # drop literal <n> markers
    string = re.sub(r' +', ' ', string)              # collapse repeated spaces
    string = re.sub(r'[^\w\s\.\,]', '', string)      # keep only word chars, whitespace, periods, commas
    string = re.sub(r'\.(?!\s|\d|$)', '. ', string)  # ensure a space after sentence-ending periods
    string = string.lower()
    return string
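# Illustration (not executed), assuming a typical bill snippet:
#   preprocessing("The Act (Public Law 1) applies.\nSee SEC. 2.")
#   -> "the act applies. see sec. 2."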
def delete_leading_white_spaces(string):
    '''Strips leading spaces and tabs from a string.'''
    return re.sub(r'^[ \t]+', '', string)
def clear_leading_white_tab(string):
    '''
    Given a single string, removes every aligned "tab" (a run of 4 spaces),
    scanning the string 4 characters at a time.
    '''
    if len(string) == 0:
        return ""
    if string[:4] == '    ':
        return clear_leading_white_tab(string[4:])
    else:
        return string[:4] + clear_leading_white_tab(string[4:])
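# Illustration (not executed):
#   clear_leading_white_tab("        (a) In general")   # two 4-space tabs
#   -> "(a) In general"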
def further_split(ugly_string):
    '''
    Given a string with newlines (\\n) in it,
    returns a list of actual sentences.
    '''
    lines = ugly_string.split('\n')
    cleaned = [clear_leading_white_tab(line) for line in lines]
    condensed = []
    for line in cleaned:
        # A line starting a new clause looks like "(a) ..." or "``(a) ...";
        # guard against short or empty lines before indexing into them.
        is_clause_start = len(line) >= 3 and line[0] == '(' and line[2] == ')'
        if is_clause_start or line[:3] == '``(':
            condensed.append(line)
        elif len(condensed) == 0:
            condensed.append(line)
        else:
            condensed[-1] += ' ' + line  # join continuation lines with a space
    return condensed
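# Illustration (not executed):
#   further_split("(a) Short title.\n    This Act may be\n    cited as the Act.")
#   -> ["(a) Short title. This Act may be cited as the Act."]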
def split_right(long_string):
    '''
    Given a long string (a whole bill),
    performs sentence tokenization based on the bill's layout
    (rather than splitting on periods).
    '''
    result = []
    paragraphs = long_string.split('\n\n')
    for paragraph in paragraphs:
        if '\n' in paragraph:
            result.extend(further_split(paragraph))
        else:
            result.append(paragraph)
    return result
def stemming(list_of_strings):
    '''
    Given a list of sentence strings,
    returns a 2-D list of stemmed tokens with English stopwords removed.
    '''
    processed_sentences = []
    for sentence in list_of_strings:
        words = word_tokenize(sentence)
        stemmed_words = []
        for word in words:
            # Filter stopwords on the original word: stemmed forms
            # (e.g. "veri" for "very") would not match the stopword list.
            if word.lower() not in en_stopwords:
                stemmed_words.append(stemmer.stem(word))
        processed_sentences.append(stemmed_words)
    return processed_sentences
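# Illustration (not executed):
#   stemming(["The committee is reviewing the provisions."])
#   -> [['committe', 'review', 'provis', '.']]   (stems are approximate)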
def create_freq_matrix(preprocessed_sentences, stemmed_sentences):
    '''
    Given two parallel lists (sentence strings and their stemmed tokens),
    returns a nested frequency matrix in the form of
    {'sent': {'word1': freq1, 'word2': freq2}}
    '''
    freq_matrix = {}
    for i in range(len(stemmed_sentences)):
        freq_table = {}
        for word in stemmed_sentences[i]:
            freq_table[word] = freq_table.get(word, 0) + 1
        # keyed by the original sentence so scores can be mapped back to it
        freq_matrix[preprocessed_sentences[i]] = freq_table
    return freq_matrix
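# Illustration (not executed):
#   create_freq_matrix(["the cat sat on the cat"], [['cat', 'sat', 'cat']])
#   -> {'the cat sat on the cat': {'cat': 2, 'sat': 1}}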
def tf(freq_matrix):
    '''
    Converts raw counts to term frequencies. Each count is normalized by
    the number of *distinct* words in its sentence (len(freq_dict)).
    '''
    tf_matrix = copy.deepcopy(freq_matrix)
    for sent, freq_dict in tf_matrix.items():
        for key, value in freq_dict.items():
            freq_dict[key] = value / len(freq_dict)
    return tf_matrix
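# Illustration (not executed): tf(w, s) = count(w in s) / |distinct words in s|, so
#   tf({'s': {'cat': 2, 'sat': 1}}) -> {'s': {'cat': 1.0, 'sat': 0.5}}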
def num_sent_per_word(stemmed_sentences):
    '''
    Given a 2-D list of stemmed sentences, returns a dict mapping each word
    to the number of sentences it appears in (its document frequency).
    '''
    counts = {}
    for sentence in stemmed_sentences:
        # use a set so a word repeated within one sentence is counted once
        for word in set(sentence):
            counts[word] = counts.get(word, 0) + 1
    return counts
def idf(freq_matrix, num_sent_per_word, num_sent):
    '''
    Returns a matrix of the same shape as freq_matrix where each word is
    mapped to its inverse document frequency: log(N / df(word)).
    '''
    idf_matrix = copy.deepcopy(freq_matrix)
    for sent, freq_dict in idf_matrix.items():
        for key in freq_dict:
            freq_dict[key] = np.log(num_sent / num_sent_per_word[key])
    return idf_matrix
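# Illustration (not executed): with 10 sentences total and a word that appears
# in 2 of them, idf = np.log(10 / 2) ≈ 1.609 (natural log).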
def tf_idf(tf, idf):
    '''
    Multiplies the matching TF and IDF entries element-wise. Both matrices
    are deep copies of the same freq_matrix, so their iteration orders line up.
    '''
    tf_idf = {}
    for (sent, tf_dict), (_, idf_dict) in zip(tf.items(), idf.items()):
        tf_idf_table = {}
        for (word, tf_v), (_, idf_v) in zip(tf_dict.items(), idf_dict.items()):
            tf_idf_table[word] = tf_v * idf_v
        tf_idf[sent] = tf_idf_table
    return tf_idf
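# Illustration (not executed):
#   tf_idf({'s': {'cat': 0.5}}, {'s': {'cat': 1.6}}) -> {'s': {'cat': 0.8}}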
def score_sentences(tf_idf_matrix):
    '''Scores each sentence as the mean TF-IDF of its words.'''
    sent_scores = {}
    for sent, tf_idf in tf_idf_matrix.items():
        sent_len = len(tf_idf)
        if sent_len == 0:  # skip empty sentences to avoid dividing by zero
            continue
        sent_scores[sent] = sum(tf_idf.values()) / sent_len
    return sent_scores
def average_sent_score(sentences_score):
    '''Returns the mean sentence score, used as the summary threshold.'''
    return sum(sentences_score.values()) / len(sentences_score)
def generate_summary(sentences, sentence_scores, threshold):
    '''Keeps every sentence whose score meets the threshold, in original order.'''
    summary = ''
    for sentence in sentences:
        if sentence in sentence_scores and sentence_scores[sentence] >= threshold:
            summary += " " + sentence
    return summary
def everything_generate_summary(original_string, multiplier):
    '''
    Given the text of a bill and a threshold multiplier,
    returns an extractive summary from the from-scratch TF-IDF pipeline.
    '''
    # tokenize into sentences based on the bill's layout
    example_sentences = split_right(original_string)
    # stem and remove stopwords (the raw sentences stay as matrix keys so the
    # selected ones can be returned verbatim)
    stemmed_sentences = stemming(example_sentences)
    # calculate tf-idf per word per sentence
    freq_matrix = create_freq_matrix(example_sentences, stemmed_sentences)
    tf_matrix = tf(freq_matrix)
    nums_sent_per_word = num_sent_per_word(stemmed_sentences)
    idf_matrix = idf(freq_matrix, nums_sent_per_word, len(stemmed_sentences))
    tf_idf_matrix = tf_idf(tf_matrix, idf_matrix)
    # keep sentences scoring above a multiple of the average sentence score
    sentences_score = score_sentences(tf_idf_matrix)
    threshold = average_sent_score(sentences_score)
    summary = generate_summary(example_sentences, sentences_score, multiplier * threshold)
    return summary
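# Illustration (not executed), assuming `bill_text` holds the text of one bill:
#   summary = everything_generate_summary(bill_text, 1.2)
# keeps every sentence scoring at least 1.2x the average sentence score.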
def get_rouge_scores(final_summary, original_text):
    '''Returns ROUGE-1/2/L scores of the summary against the reference text as a DataFrame.'''
    rouge = Rouge()
    scores = rouge.get_scores(final_summary, original_text)
    df = pd.DataFrame.from_dict(scores[0])
    return df
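# Illustration (not executed): scores[0] from the rouge package has the shape
#   {'rouge-1': {'r': ..., 'p': ..., 'f': ...}, 'rouge-2': {...}, 'rouge-l': {...}}
# so the resulting DataFrame has rows r/p/f and columns rouge-1/rouge-2/rouge-l.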
def sklearn_generate_summary(original_string, n):
    '''
    Extractive summary using scikit-learn's TfidfVectorizer: picks the n
    sentences most similar to the final sentence, in original order.
    '''
    # tokenize into sentences based on the bill's layout
    example_sentences = split_right(original_string)
    # preprocess
    cleaned_sentences = []
    for sentence in example_sentences:
        cleaned_sentences.append(delete_leading_white_spaces(preprocessing(sentence)))
    # vectorize
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(cleaned_sentences)
    # score every other sentence by cosine similarity to the final sentence
    scores = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1])[0]
    # take the n highest-scoring sentences, restored to document order
    summary_indices = nlargest(n, range(len(scores)), key=scores.__getitem__)
    result = " ".join(example_sentences[i] for i in sorted(summary_indices))
    return result
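# Note: unlike the from-scratch pipeline above, this ranks sentences against the
# bill's final sentence and takes exactly n of them, so `n` directly controls
# the summary length instead of a score threshold.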
# The actual app
# To rebuild the sample from the full dataset instead of the bundled CSV:
# dataset = load_dataset("billsum", split="test")
# dataset = pd.DataFrame(dataset)
dataset = pd.read_csv("test_sample.csv")
txt = dataset.iat[0, 0]
original_summary = dataset.iat[0, 1]
st.set_page_config(page_title="Text Summarizations Side by Side", layout="wide")
st.markdown("# Text Summarizations Side by Side")
if st.button('Randomly generate a Bill Example'):
    my_num = random.randrange(len(dataset))
    txt = dataset.iat[my_num, 0]
    original_summary = dataset.iat[my_num, 1]
column1, column2 = st.columns(2)
with column1:
    txt = st.text_area('Text', txt, height=250)
with column2:
    original_summary = st.text_area('Corresponding summary', original_summary, height=250)
col1, col2, col3 = st.columns(3)
with col1:
    st.header("TF-IDF from scratch:")
    my_multiplier = st.slider('Please input a multiplier value:', 1.0, 1.5)
    first_summary = everything_generate_summary(txt, my_multiplier)
    st.write("#### Summary:")
    st.write(first_summary)
    st.write("#### Rouge score:", get_rouge_scores(first_summary, txt))
with col2:
    st.header("TF-IDF from Sklearn:")
    num_of_sentences = st.number_input('How many sentences do you want to generate?', 1)
    second_summary = sklearn_generate_summary(txt, num_of_sentences)
    st.write("#### Summary:")
    st.write(second_summary)
    st.write("#### Rouge score:", get_rouge_scores(second_summary, txt))
with col3:
    st.header("Abstractive summary:")
    min_l = st.slider('Please input a minimum length (tokens) for the summary:', 1, 50, step=1, value=20)
    if st.button("Generate"):
        txt_pre = preprocessing(txt)
        txt_cleaned = delete_leading_white_spaces(txt_pre)
        # tokenize the cleaned bill; truncation keeps it within the model's input
        # limit, and .to(model.device) avoids a CPU/GPU tensor mismatch
        batch = tokenizer(txt_cleaned, truncation=True, padding='longest', return_tensors='pt').to(model.device)
        translated = model.generate(**batch, min_length=min_l, max_new_tokens=100)
        abs_summary = tokenizer.batch_decode(translated, skip_special_tokens=True)
        st.write("#### Summary:")
        st.write(abs_summary[0])
        st.write("#### Rouge score:", get_rouge_scores(abs_summary[0], txt))