import streamlit as st
import pandas as pd
import numpy as np
import nltk

# Download NLTK data on first run
nltk.download('stopwords')
nltk.download('punkt')

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import regex as re
from nltk.stem.snowball import SnowballStemmer, PorterStemmer
from datasets import load_dataset
import copy
from rouge import Rouge
import random
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from heapq import nlargest
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import torch
# Module-level so the generation step below can move inputs to the same device
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'

@st.cache_resource  # cache the model across Streamlit reruns so it loads only once
def load_model():
    #model_name = 'google/pegasus-large'
    model_name = 'google/pegasus-billsum'
    # First run: load from the Hub, then save locally (see below) for faster startup
    #tokenizer = PegasusTokenizer.from_pretrained(model_name)
    #model = PegasusForConditionalGeneration.from_pretrained(model_name, max_position_embeddings=2000).to(torch_device)
    tokenizer = PegasusTokenizer.from_pretrained("local_pegasus-billsum_tokenizer")
    model = PegasusForConditionalGeneration.from_pretrained("local_pegasus-billsum_tokenizer_model", max_position_embeddings=2000).to(torch_device)
    return model, tokenizer

model, tokenizer = load_model()
# Run these once, then use the local copies above for faster startup
#tokenizer.save_pretrained("local_pegasus-billsum_tokenizer")
#model.save_pretrained("local_pegasus-billsum_tokenizer_model")
en_stopwords = nltk.corpus.stopwords.words('english')
stemmer = SnowballStemmer("english")
def preprocessing(string):
    '''
    Given a single str,
    returns a cleaned sentence
    '''
    string = re.sub(r'\([^)]*\)', '', string)        # drop parenthesized markers like (a)
    string = re.sub(r'\n', '', string)               # drop newlines
    string = re.sub(r'<n>', '', string)              # drop literal <n> tokens
    string = re.sub(r' +', ' ', string)              # collapse runs of spaces
    string = re.sub(r'[^\w\s\.\,]', '', string)      # keep only word chars, whitespace, . and ,
    string = re.sub(r'\.(?!\s|\d|$)', '. ', string)  # ensure a space after sentence-final periods
    string = string.lower()
    return string
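# Illustrative behavior (example input made up, not taken from the dataset):
#   preprocessing("SEC. 2. (a) IN GENERAL.--This Act may be cited as the Test Act.")
#   -> 'sec. 2. in general. this act may be cited as the test act.'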
def delete_leading_white_spaces(string):
    return re.sub(r'^[ \t]+', '', string)
def clear_leading_white_tab(string):
    '''
    Given a single string, clean out all the tabs (4 white spaces)
    '''
    if len(string) == 0:
        return ""
    if string[:4] == '    ':
        return clear_leading_white_tab(string[4:])
    else:
        return string[:4] + clear_leading_white_tab(string[4:])
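# Illustrative: clear_leading_white_tab('        (a) Findings.') -> '(a) Findings.'
# Any 4-space chunk aligned to the 4-character scan is dropped, not only leading ones.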
def further_split(ugly_string):
    '''
    Given a string with newlines \n in it,
    returns a list of actual sentences
    '''
    lines = ugly_string.split('\n')
    cleaned = []
    for line in lines:
        cleaned.append(clear_leading_white_tab(line))
    condensed = []
    for i in range(len(cleaned)):
        # Lines that open a new clause, e.g. '(a) ...' or '``(a) ...', start a new
        # entry; everything else is appended to the previous entry.
        # (Length check avoids an IndexError on short or empty lines.)
        starts_clause = len(cleaned[i]) >= 3 and cleaned[i][0] == '(' and cleaned[i][2] == ')'
        if starts_clause or cleaned[i][:3] == '``(':
            condensed.append(cleaned[i])
        elif len(condensed) == 0:
            condensed.append(cleaned[i])
        else:
            condensed[-1] += cleaned[i]
    return condensed
def split_right(long_string):
    '''
    Given a long string (a whole bill),
    performs sentence tokenization (rather than tokenizing on periods)
    '''
    result = []
    paragraphs = long_string.split('\n\n')
    for paragraph in paragraphs:
        if '\n' in paragraph:
            split_ps = further_split(paragraph)
            for sent in split_ps:
                result.append(sent)
        else:
            result.append(paragraph)
    return result
def stemming(list_of_tokenized_strings):
    '''
    Given tokenized sentences as a list,
    returns a 2d list of stemmed sentences
    '''
    processed_sentences = []
    for i in range(len(list_of_tokenized_strings)):
        words = word_tokenize(list_of_tokenized_strings[i])
        stemmed_words = []
        for j in range(len(words)):
            word = stemmer.stem(words[j])
            if word not in en_stopwords:
                stemmed_words.append(word)
        processed_sentences.append(stemmed_words)
    return processed_sentences
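# Illustrative: stemming(['The cats are running']) -> [['cat', 'run']]
# (the stemmer lowercases tokens, and stopwords are dropped after stemming)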
def create_freq_matrix(preprocessed_sentences, stemmed_sentences):
    '''
    Given two 2d arrays preprocessed_sentences and stemmed_sentences,
    returns a nested frequency matrix in the form of
    {'sent': {'word1': freq1, 'word2': freq2}}
    '''
    freq_matrix = {}
    for i in range(len(stemmed_sentences)):
        freq_table = {}
        for j in range(len(stemmed_sentences[i])):
            word = stemmed_sentences[i][j]
            if word in freq_table:
                freq_table[word] += 1
            else:
                freq_table[word] = 1
        sent = preprocessed_sentences[i]
        freq_matrix[sent] = freq_table
    return freq_matrix
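# Illustrative shape, keyed by sentence string:
#   {'<sentence text>': {'<stemmed word>': count, ...}, ...}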
def tf(freq_matrix):
    # Normalize each word count by the number of *distinct* stemmed words
    # in the sentence (not the total token count)
    tf_matrix = copy.deepcopy(freq_matrix)
    for sent, freq_dict in tf_matrix.items():
        for key, value in freq_dict.items():
            freq_dict[key] = value / len(freq_dict)
    return tf_matrix
def num_sent_per_word(stemmed_sentences):
    '''
    Given a 2d array stemmed_sentences, returns a dict mapping each word
    to the number of sentences it appears in
    '''
    counts = {}
    for i in range(len(stemmed_sentences)):
        # Iterate over a set so a word repeated within one sentence counts once
        for word in set(stemmed_sentences[i]):
            if word in counts:
                counts[word] += 1
            else:
                counts[word] = 1
    return counts
def idf(freq_matrix, num_sent_per_word, num_sent):
    idf_matrix = copy.deepcopy(freq_matrix)
    for sent, freq_dict in idf_matrix.items():
        for key, value in freq_dict.items():
            freq_dict[key] = np.log(num_sent / num_sent_per_word[key])
    return idf_matrix
def tf_idf(tf, idf):
    tf_idf = {}
    for (k, v), (k2, v2) in zip(tf.items(), idf.items()):
        tf_idf_table = {}
        for (key, tf_v), (key2, idf_v) in zip(v.items(), v2.items()):
            tf_idf_table[key] = tf_v * idf_v
        tf_idf[k] = tf_idf_table
    return tf_idf
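# For a word w in sentence s, the combined score computed above is
#   tf_idf(w, s) = (count of w in s / distinct words in s)
#                  * ln(total sentences / sentences containing w)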
def score_sentences(tf_idf_matrix):
    # Score each sentence by the mean TF-IDF of its words
    sent_scores = {}
    for sent, tf_idf in tf_idf_matrix.items():
        sent_len = len(tf_idf)
        if sent_len == 0:
            # sentence contained only stopwords; avoid division by zero
            sent_scores[sent] = 0
            continue
        sent_score = 0
        for word, tf_idf_score in tf_idf.items():
            sent_score += tf_idf_score
        sent_scores[sent] = sent_score / sent_len
    return sent_scores
def average_sent_score(sentences_score):
    total = 0
    for sent, sent_score in sentences_score.items():
        total += sent_score
    avg = total / len(sentences_score)
    return avg
def generate_summary(sentences, sentenceValue, threshold):
    summary = ''
    for sentence in sentences:
        if sentence in sentenceValue and sentenceValue[sentence] >= threshold:
            summary += " " + sentence
    return summary
def everything_generate_summary(original_string, multiplier):
    '''
    Given a string of a bill and a multiplier for generating the summary,
    returns a summary
    '''
    # tokenize
    example_sentences = split_right(original_string)
    # preprocess
    cleaned_sentences = []
    for i in range(len(example_sentences)):
        cleaned_sentences.append(preprocessing(example_sentences[i]))
    for i in range(len(cleaned_sentences)):
        cleaned_sentences[i] = delete_leading_white_spaces(cleaned_sentences[i])
    # stem the cleaned sentences; the frequency matrix stays keyed by the
    # original sentences so the final summary keeps their original wording
    stemmed_sentences = stemming(cleaned_sentences)
    # calculate tf-idf
    freq_matrix = create_freq_matrix(example_sentences, stemmed_sentences)
    tf_matrix = tf(freq_matrix)
    nums_sent_per_word = num_sent_per_word(stemmed_sentences)
    idf_matrix = idf(freq_matrix, nums_sent_per_word, len(stemmed_sentences))
    tf_idf_matrix = tf_idf(tf_matrix, idf_matrix)
    # set a threshold for keeping sentences
    sentences_score = score_sentences(tf_idf_matrix)
    threshold = average_sent_score(sentences_score)
    summary = generate_summary(example_sentences, sentences_score, multiplier * threshold)
    return summary
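# Illustrative usage (bill_text stands for any bill string):
#   summary = everything_generate_summary(bill_text, 1.2)
# keeps every sentence scoring at least 1.2x the average sentence score.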
def get_rouge_scores(final_summary, original_text):
    rouge = Rouge()
    scores = rouge.get_scores(final_summary, original_text)
    df = pd.DataFrame.from_dict(scores[0])
    return df
def sklearn_generate_summary(original_string, n):
    # tokenize
    example_sentences = split_right(original_string)
    # preprocess
    cleaned_sentences = []
    for i in range(len(example_sentences)):
        cleaned_sentences.append(preprocessing(example_sentences[i]))
    for i in range(len(cleaned_sentences)):
        cleaned_sentences[i] = delete_leading_white_spaces(cleaned_sentences[i])
    # vectorize
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(cleaned_sentences)
    # score: rank every sentence by cosine similarity to the *last* sentence's
    # TF-IDF vector (the last sentence itself is excluded from the ranking)
    scores = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1])[0]
    # pick the n highest-scoring sentences, then restore document order
    summary_sentences = nlargest(n, range(len(scores)), key=scores.__getitem__)
    result_vector = []
    for i in sorted(summary_sentences):
        result_vector.append(example_sentences[i])
    result = " ".join(result_vector)
    return result
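# Illustrative usage (bill_text stands for any bill string):
#   summary = sklearn_generate_summary(bill_text, 5)
# returns the 5 sentences most similar to the bill's final sentence, in document order.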
# The actual app
# dataset = load_dataset("billsum", split="test")
# dataset = pd.DataFrame(dataset)
dataset = pd.read_csv("test_sample.csv")
txt = dataset.iat[0, 0]
original_summary = dataset.iat[0, 1]

st.set_page_config(page_title="Text Summarizations Side by Side", layout="wide")
st.markdown("# Text Summarizations Side by Side")

if st.button('Randomly generate a Bill Example'):
    my_num = random.randrange(len(dataset))
    txt = dataset.iat[my_num, 0]
    original_summary = dataset.iat[my_num, 1]

column1, column2 = st.columns(2)
with column1:
    txt = st.text_area('Text', txt, height=250)
with column2:
    original_summary = st.text_area('Corresponding summary', original_summary, height=250)
col1, col2, col3 = st.columns(3)
with col1:
    st.header("TF-IDF from scratch:")
    my_multiplier = st.slider('Please input a multiplier value:', 1.0, 1.5)
    first_summary = everything_generate_summary(txt, my_multiplier)
    st.write("#### Summary:")
    st.write(first_summary)
    st.write("#### Rouge score:", get_rouge_scores(first_summary, txt))
with col2:
    st.header("TF-IDF from Sklearn:")
    num_of_sentences = st.number_input('How many sentences do you want to generate?', 1)
    second_summary = sklearn_generate_summary(txt, num_of_sentences)
    st.write("#### Summary:")
    st.write(second_summary)
    st.write("#### Rouge score:", get_rouge_scores(second_summary, txt))
with col3:
    st.header("Abstractive summary:")
    min_l = st.slider('Please input a minimum length (words) for the summary:', 1, 50, step=1, value=20)
    if st.button("Generate"):
        txt_pre = preprocessing(txt)
        txt_cleaned = delete_leading_white_spaces(txt_pre)
        # prepare_seq2seq_batch is deprecated in recent transformers releases;
        # calling the tokenizer directly is equivalent here, and the inputs
        # must be moved to the same device as the model
        batch = tokenizer(txt_cleaned, truncation=True, padding='longest', return_tensors='pt').to(torch_device)
        translated = model.generate(**batch, min_length=min_l, max_new_tokens=100)
        abs_summary = tokenizer.batch_decode(translated, skip_special_tokens=True)
        st.write("#### Summary:")
        st.write(abs_summary[0])
        st.write("#### Rouge score:", get_rouge_scores(abs_summary[0], txt))