import streamlit as st
import pandas as pd
import numpy as np
import nltk
# Download the required NLTK data (no-op after the first run)
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import regex as re
from nltk.stem.snowball import SnowballStemmer, PorterStemmer
from datasets import load_dataset
import copy
from rouge import Rouge
import random
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from heapq import nlargest
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import torch

@st.cache_resource(show_spinner=False)
def load_model():
    '''
    Loads the Pegasus model and tokenizer once per session and caches them
    '''
    # model_name = 'google/pegasus-large'
    model_name = 'google/pegasus-billsum'
    torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # first run: download from the Hugging Face Hub
    # tokenizer = PegasusTokenizer.from_pretrained(model_name)
    # model = PegasusForConditionalGeneration.from_pretrained(model_name, max_position_embeddings=2000).to(torch_device)
    # later runs: load the locally saved copies (see save_pretrained below)
    tokenizer = PegasusTokenizer.from_pretrained("local_pegasus-billsum_tokenizer")
    model = PegasusForConditionalGeneration.from_pretrained("local_pegasus-billsum_tokenizer_model", max_position_embeddings=2000).to(torch_device)
    return model, tokenizer

model,tokenizer = load_model()

# Run the two lines below once (with the Hub download path above) and load the local copies afterwards for faster startup
#tokenizer.save_pretrained("local_pegasus-billsum_tokenizer")
#model.save_pretrained("local_pegasus-billsum_tokenizer_model")




en_stopwords = nltk.corpus.stopwords.words('english')
stemmer = SnowballStemmer("english")

def preprocessing(string):
    '''
    Given a single string,
    returns a cleaned, lower-cased sentence
    '''
    # drop parentheticals, newlines, '<n>' markers, repeated spaces and stray symbols
    string = re.sub(r'\([^)]*\)', '', string)
    string = re.sub(r'\n', '', string)
    string = re.sub('<n>', '', string)
    string = re.sub(' +', ' ', string)
    string = re.sub(r'[^\w\s\.\,]', '', string)
    string = re.sub(r'\.(?!\s|\d|$)', '. ', string)
    string = string.lower()
    return string
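
# Illustrative example (hypothetical input): preprocessing("The Secretary (or a designee) shall--\n  act NOW!")
# drops the parenthetical, the newline, the dashes and the exclamation mark and lower-cases
# the rest, giving roughly "the secretary shall act now".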

def delete_leading_white_spaces(string):
    return re.sub(r'^[ \t]+', '', string)

def clear_leading_white_tab(string):
    '''
    Given a single string, strips out the 4-space tab blocks (indentation)
    '''
    if len(string) == 0 : return ""
    if string[:4] == '    ':
        return clear_leading_white_tab(string[4:])
    else:
        return string[:4] + clear_leading_white_tab(string[4:])

def further_split(ugly_string):
    '''
    Given a string with newlines (\n) in it,
    returns a list of actual sentences
    '''
    lines = ugly_string.split('\n')
    cleaned = []
    for line in lines:
        cleaned.append(clear_leading_white_tab(line))
    condensed = []
    for i in range(len(cleaned)):
        p = len(cleaned[i]) > 2 and cleaned[i][0] == '(' and cleaned[i][2] == ')'
        if p or cleaned[i][:3] == '``(':
            condensed.append(cleaned[i])
        elif len(condensed) == 0:
            condensed.append(cleaned[i])
        else:
            # continuation line: re-insert the space that was lost at the line break
            condensed[-1] += ' ' + cleaned[i]
    return condensed

def split_right(long_string):
    '''
    Given a long string (a whole bill),
    Performs sentence tokenization (rather than tokenizing based on period)
    '''
    result = []
    paragraphs = long_string.split('\n\n')
    for paragraph in paragraphs:
        if '\n' in paragraph:
            split_ps = further_split(paragraph)
            for sent in split_ps:
                result.append(sent)
        else:
            result.append(paragraph)
    return result
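
# Illustrative example (hypothetical bill fragment): with the two helpers above,
# split_right("SECTION 1. SHORT TITLE.\n\n    (a) In General.--This Act may be\n    cited as the Example Act.")
# returns roughly
#   ['SECTION 1. SHORT TITLE.', '(a) In General.--This Act may be cited as the Example Act.']
# i.e. blank lines separate paragraphs, "(a)"-style clauses start new sentences,
# and wrapped lines are folded back into the clause they belong to.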


def stemming(list_of_tokenized_strings):
    '''
    Given a list of sentences,
    returns a 2D list of stemmed words with English stopwords removed
    '''
    processed_sentences = []
    for i in range(len(list_of_tokenized_strings)):
        words = word_tokenize(list_of_tokenized_strings[i])
        stemmed_words = []
        for j in range(len(words)):
            word = stemmer.stem(words[j])
            if word not in en_stopwords:
                stemmed_words.append(word)
        processed_sentences.append(stemmed_words) 
    return processed_sentences
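
# Illustrative example (hypothetical input, exact stems depend on the Snowball rules):
# stemming(["the secretary shall transport goods"]) returns roughly
#   [['secretari', 'shall', 'transport', 'good']]
# with "the" dropped as a stopword.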

def create_freq_matrix(preprocessed_sentences, stemmed_sentences):
    '''
    Given two 2D arrays preprocessed_sentences and stemmed_sentences,
    returns a nested frequency matrix in the form of
    {'sent' : {'word1': freq1, 'word2': freq2}}
    '''
    freq_matrix = {}
    for i in range(len(stemmed_sentences)):
        freq_table = {}
        for j in range(len(stemmed_sentences[i])):
            word = stemmed_sentences[i][j]
            if word in freq_table:
                freq_table[word] += 1
            else:
                freq_table[word] = 1
        sent = preprocessed_sentences[i]
        freq_matrix[sent] = freq_table
    return freq_matrix

def tf(freq_matrix):
    '''
    Term frequency: each word count is divided by the number of distinct terms in its sentence
    '''
    tf_matrix = copy.deepcopy(freq_matrix)
    for sent, freq_dict in tf_matrix.items():
        for key, value in freq_dict.items():
            freq_dict[key] = value / len(freq_dict)
    return tf_matrix

def num_sent_per_word(stemmed_sentences):
    '''
    Given a 2D array stemmed_sentences, returns a dict mapping each word to
    the number of sentences that contain it (its document frequency)
    '''
    num_sent_per_word = {}
    for i in range(len(stemmed_sentences)):
        # count each word at most once per sentence
        for word in set(stemmed_sentences[i]):
            if word in num_sent_per_word:
                num_sent_per_word[word] += 1
            else:
                num_sent_per_word[word] = 1
    return num_sent_per_word

def idf(freq_matrix, num_sent_per_word, num_sent):
    '''
    Inverse document frequency: log(total sentences / sentences containing the word)
    '''
    idf = copy.deepcopy(freq_matrix)
    for sent, freq_dict in idf.items():
        for key, value in freq_dict.items():
            freq_dict[key] = np.log(num_sent / num_sent_per_word[key])
    return idf

def tf_idf(tf, idf):
    tf_idf = {}
    for (k,v), (k2,v2) in zip(tf.items(), idf.items()):
        tf_idf_table = {}
        for (key, tf_v), (key2, idf_v) in zip(v.items(), v2.items()):
            tf_idf_table[key] = tf_v * idf_v
        tf_idf[k] = tf_idf_table
    return tf_idf
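
# Worked example (illustrative numbers, not taken from the app): in a 10-sentence bill
# where the stemmed word 'transport' appears once in a sentence with 5 distinct terms
# and occurs in 2 sentences overall, the functions above give
#   tf     = 1 / 5        = 0.2
#   idf    = ln(10 / 2)   ~ 1.609
#   tf-idf = 0.2 * 1.609  ~ 0.322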

def score_sentences(tf_idf_matrix):
    '''
    Scores each sentence as the mean tf-idf of its words
    '''
    sent_scores = {}

    for sent, tf_idf in tf_idf_matrix.items():
        sent_len = len(tf_idf)
        if sent_len == 0:
            # sentence had no non-stopword tokens
            sent_scores[sent] = 0
            continue
        sent_score = 0
        for word, tf_idf_score in tf_idf.items():
            sent_score += tf_idf_score
        sent_scores[sent] = sent_score / sent_len
    return sent_scores

def average_sent_score(sentences_score):
    total = 0
    for sent, sent_score in sentences_score.items():
        total += sent_score
    avg = total/len(sentences_score)
    return avg

def generate_summary(sentences, sentenceValue, threshold):
    sentence_count = 0
    summary = ''

    for sentence in sentences:
        if sentence in sentenceValue and sentenceValue[sentence] >= (threshold):
            summary += " " + sentence
            sentence_count += 1

    return summary

def everything_generate_summary(original_string, multiplier):
    '''
    Given a string of a bill and a multiplier for generating the summary,
    returns a summary
    '''
    # tokenize 
    example_sentences = split_right(original_string)
    # preprocess
    cleaned_sentences = []
    for i in range(len(example_sentences)):
        cleaned_sentences.append(preprocessing(example_sentences[i]))
    for i in range(len(cleaned_sentences)):
        cleaned_sentences[i] = delete_leading_white_spaces(cleaned_sentences[i])
    # stem the cleaned sentences (example_sentences keep the original wording for the final summary)
    stemmed_sentences = stemming(cleaned_sentences)
    # calculate tf-idf
    freq_matrix = create_freq_matrix(example_sentences, stemmed_sentences)
    tf_matrix = tf(freq_matrix)
    nums_sent_per_word = num_sent_per_word(stemmed_sentences)
    idf_matrix = idf(freq_matrix, nums_sent_per_word, len(stemmed_sentences))
    tf_idf_matrix = tf_idf(tf_matrix, idf_matrix)
    # setting a metric for generating summary 
    sentences_score = score_sentences(tf_idf_matrix)
    threshold = average_sent_score(sentences_score)
    summary = generate_summary(example_sentences, sentences_score, multiplier * threshold)
    return summary
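
# Illustrative usage (hypothetical variable): with bill_text holding the full text of one bill,
# everything_generate_summary(bill_text, 1.0) keeps every sentence scoring at or above the
# average tf-idf sentence score; a larger multiplier (e.g. 1.2) keeps fewer, higher-scoring sentences.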

def get_rouge_scores(final_summary, original_text):
    # Rouge raises on an empty hypothesis, so return an empty frame instead
    if not final_summary.strip():
        return pd.DataFrame()
    rouge = Rouge()
    scores = rouge.get_scores(final_summary, original_text)
    df = pd.DataFrame.from_dict(scores[0])
    return df

def sklearn_generate_summary(original_string, n):
    '''
    Given a string of a bill and a number of sentences n,
    returns an extractive summary built with sklearn's TfidfVectorizer
    '''
    # tokenize
    example_sentences = split_right(original_string)
    # preprocess
    cleaned_sentences = []
    for i in range(len(example_sentences)):
        cleaned_sentences.append(preprocessing(example_sentences[i]))
    for i in range(len(cleaned_sentences)):
        cleaned_sentences[i] = delete_leading_white_spaces(cleaned_sentences[i])
    # vectorize
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(cleaned_sentences)
    # score each remaining sentence by its cosine similarity to the bill's last sentence
    scores = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1])[0]
    summary_sentences = nlargest(n, range(len(scores)), key=scores.__getitem__)
    result_vector = []
    for i in sorted(summary_sentences):
        result_vector.append(example_sentences[i])
    result = " ".join(result_vector)
    
    return result
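
# Illustrative usage (hypothetical variable): sklearn_generate_summary(bill_text, 5)
# returns the 5 sentences most similar to the bill's final sentence, in document order.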


# The actual app
# dataset = load_dataset("billsum", split = "test")
# dataset = pd.DataFrame(dataset)
dataset = pd.read_csv("test_sample.csv")
txt = dataset.iat[0, 0]
original_summary = dataset.iat[0, 1]

st.set_page_config(page_title="Text Summarizations Side by Side", layout="wide")
st.markdown("# Text Summarizations Side by Side")

if st.button('Randomly generate a Bill Example'):
    my_num = random.randrange(len(dataset))
    txt = dataset.iat[my_num, 0]
    original_summary = dataset.iat[my_num, 1]

column1, column2 = st.columns(2)
with column1:
    txt = st.text_area('Text', txt, height = 250)
with column2:
    original_summary = st.text_area('Corresponding summary', original_summary, height = 250)





col1, col2, col3 = st.columns(3)
with col1:
    st.header("TF-IDF from scratch:")
    my_multiplier = st.slider('Please input a multiplier value:', 1.0, 1.5)
    first_summary = everything_generate_summary(txt, my_multiplier)
    st.write("#### Summary:")
    st.write(first_summary)
    st.write("#### Rouge score:", get_rouge_scores(first_summary, txt))

with col2:
    st.header("TF-IDF from Sklearn:")
    num_of_sentences = st.number_input('How many sentences do you want to generate?', 1)
    second_summary = sklearn_generate_summary(txt, num_of_sentences)
    st.write("#### Summary:")
    st.write(second_summary)
    st.write("#### Rouge score:", get_rouge_scores(second_summary, txt))

with col3:
    st.header("Abstractive summary:")
    min_l = st.slider('Please input a minimum length (tokens) for the summary:', 1, 50, step=1, value=20)
    if st.button("Generate"):
        txt_pre = preprocessing(txt)
        txt_cleaned = delete_leading_white_spaces(txt_pre)
        batch = tokenizer(txt_cleaned, truncation=True, padding='longest', return_tensors='pt').to(model.device)
        translated = model.generate(**batch, min_length=min_l, max_new_tokens=100)
        abs_summary = tokenizer.batch_decode(translated, skip_special_tokens=True)
        st.write("#### Summary:")
        st.write(abs_summary[0])
        st.write("#### Rouge score:", get_rouge_scores(abs_summary[0], txt))