Spaces:

segestic
/

ArticlePara

Runtime error

App Files Files Community

olusegun.odewole committed on Dec 12, 2022

Commit

8749106

1 Parent(s): da6a72e

first commit

Browse files

Files changed (11) hide show

app.py +21 -0
apps/__pycache__/paraphraseApp.cpython-38.pyc +0 -0
apps/__pycache__/summarizeApp.cpython-38.pyc +0 -0
apps/createPickle.py +10 -0
apps/paraphraseApp.py +14 -0
apps/summarizeApp.py +11 -0
multiapp.py +48 -0
oneliner.txt +0 -0
paraphraser.py +20 -0
requirements.txt +0 -0
summarizer.py +239 -0

app.py ADDED Viewed

	@@ -0,0 +1,21 @@

+import streamlit as st
+from multiapp import MultiApp
+# import your app modules here
+from apps import paraphraseApp, summarizeApp
+app = MultiApp()
+st.markdown("""
+# ArticleHelp
+ArticleHelp provides two services - Paraphrasing and Summarizing. It utilizes TF-IDF Algorithm for summarization and transformer models for paraphrasing.
+## Enter your text and see the magic!
+""")
+# Add all your application here
+app.add_app("Paraphraser", paraphraseApp.app)
+app.add_app("Summarizer", summarizeApp.app)
+# The main app
+app.run()

apps/__pycache__/paraphraseApp.cpython-38.pyc ADDED Viewed

Binary file (775 Bytes). View file

apps/__pycache__/summarizeApp.cpython-38.pyc ADDED Viewed

Binary file (598 Bytes). View file

apps/createPickle.py ADDED Viewed

	@@ -0,0 +1,10 @@

+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+import torch
+import pickle
+model = AutoModelForSeq2SeqLM.from_pretrained("ramsrigouthamg/t5-large-paraphraser-diverse-high-quality")
+tokenizer = AutoTokenizer.from_pretrained("ramsrigouthamg/t5-large-paraphraser-diverse-high-quality")
+pickle.dump(model, open('model.pkl', 'wb'))
+pickle.dump(model, open('tokenizer.pkl', 'wb'))
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model = model.to(device)

apps/paraphraseApp.py ADDED Viewed

	@@ -0,0 +1,14 @@

+import streamlit as st
+from paraphraser import get_paraphrased_sentences, model, tokenizer
+def app():
+    st.title('Paraphraser')
+    st.write('Please provide the text to be paraphrased')
+    user_input = st.text_area('Enter text','')
+    paraphraseNo = st.slider('Number of Parapharases',1,2,10)
+    if st.button('Paraphrase'):
+        output = get_paraphrased_sentences(model, tokenizer, user_input, num_beams=10, num_return_sequences=paraphraseNo)
+        st.write("Paraphrased Text: ")
+        st.write(output)
+##get_paraphrased_sentences(model, tokenizer, sentence, num_beams=10, num_return_sequences=10)

apps/summarizeApp.py ADDED Viewed

	@@ -0,0 +1,11 @@

+import streamlit as st
+from summarizer import run_summarization
+def app():
+    st.title('Summarizer')
+    st.write('Please provide the text to be summarized')
+    user_input = st.text_area('Enter text','')
+    if st.button('Summarize'):
+        output1 = run_summarization(str(user_input))#,minLength,maxLength)
+        st.write("Text Summary: ")
+        st.write(output1)

multiapp.py ADDED Viewed

	@@ -0,0 +1,48 @@

+"""Frameworks for running multiple Streamlit applications as a single app.
+"""
+import streamlit as st
+class MultiApp:
+    """Framework for combining multiple streamlit applications.
+    Usage:
+        def foo():
+            st.title("Hello Foo")
+        def bar():
+            st.title("Hello Bar")
+        app = MultiApp()
+        app.add_app("Foo", foo)
+        app.add_app("Bar", bar)
+        app.run()
+    It is also possible keep each application in a separate file.
+        import foo
+        import bar
+        app = MultiApp()
+        app.add_app("Foo", foo.app)
+        app.add_app("Bar", bar.app)
+        app.run()
+    """
+    def __init__(self):
+        self.apps = []
+    def add_app(self, title, func):
+        """Adds a new application.
+        Parameters
+        ----------
+        func:
+            the python function to render this app.
+        title:
+            title of the app. Appears in the dropdown in the sidebar.
+        """
+        self.apps.append({
+            "title": title,
+            "function": func
+        })
+    def run(self):
+        # app = st.sidebar.radio(
+        app = st.selectbox(
+            'Navigation',
+            self.apps,
+            format_func=lambda app: app['title'])
+        app['function']()

oneliner.txt ADDED Viewed

Binary file (244 Bytes). View file

paraphraser.py ADDED Viewed

	@@ -0,0 +1,20 @@

+from transformers import *
+model = PegasusForConditionalGeneration.from_pretrained("tuner007/pegasus_paraphrase")  # module-level: loaded once when this module is imported
+tokenizer = PegasusTokenizerFast.from_pretrained("tuner007/pegasus_paraphrase")  # tokenizer loaded from the same checkpoint name as the model
+def get_paraphrased_sentences(model, tokenizer, sentence, num_return_sequences=5, num_beams=5):
+  # tokenize the text to be form of a list of token IDs
+  inputs = tokenizer([sentence], truncation=True, padding="longest", return_tensors="pt")
+  # generate the paraphrased sentences
+  outputs = model.generate(
+    **inputs,
+    num_beams=num_beams,
+    num_return_sequences=num_return_sequences,
+  )
+  # decode the generated sentences using the tokenizer to get them back to text
+  return tokenizer.batch_decode(outputs, skip_special_tokens=True)
+#sentence = "Learning is the process of acquiring new understanding, knowledge, behaviors, skills, values, attitudes, and preferences."
+#get_paraphrased_sentences(model, tokenizer, sentence, num_beams=10, num_return_sequences=10)

requirements.txt ADDED Viewed

Binary file (172 Bytes). View file

summarizer.py ADDED Viewed

	@@ -0,0 +1,239 @@

+import nltk
+nltk.download('punkt')
+nltk.download('stopwords')
+import math
+from nltk import sent_tokenize, word_tokenize, PorterStemmer
+from nltk.corpus import stopwords
+def _create_frequency_table(text_string) -> dict:
+    """
+    we create a dictionary for the word frequency table.
+    For this, we should only use the words that are not part of the stopWords array.
+    Removing stop words and making frequency table
+    Stemmer - an algorithm to bring words to its root word.
+    :rtype: dict
+    """
+    stopWords = set(stopwords.words("english"))
+    words = word_tokenize(text_string)
+    ps = PorterStemmer()
+    freqTable = dict()
+    for word in words:
+        word = ps.stem(word)
+        if word in stopWords:
+            continue
+        if word in freqTable:
+            freqTable[word] += 1
+        else:
+            freqTable[word] = 1
+    return freqTable
+def _create_frequency_matrix(sentences):
+    frequency_matrix = {}
+    stopWords = set(stopwords.words("english"))
+    ps = PorterStemmer()
+    for sent in sentences:
+        freq_table = {}
+        words = word_tokenize(sent)
+        for word in words:
+            word = word.lower()
+            word = ps.stem(word)
+            if word in stopWords:
+                continue
+            if word in freq_table:
+                freq_table[word] += 1
+            else:
+                freq_table[word] = 1
+        frequency_matrix[sent[:15]] = freq_table
+    return frequency_matrix
+def _create_tf_matrix(freq_matrix):
+    tf_matrix = {}
+    for sent, f_table in freq_matrix.items():
+        tf_table = {}
+        count_words_in_sentence = len(f_table)
+        for word, count in f_table.items():
+            tf_table[word] = count / count_words_in_sentence
+        tf_matrix[sent] = tf_table
+    return tf_matrix
+def _create_documents_per_words(freq_matrix):
+    word_per_doc_table = {}
+    for sent, f_table in freq_matrix.items():
+        for word, count in f_table.items():
+            if word in word_per_doc_table:
+                word_per_doc_table[word] += 1
+            else:
+                word_per_doc_table[word] = 1
+    return word_per_doc_table
+def _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents):
+    idf_matrix = {}
+    for sent, f_table in freq_matrix.items():
+        idf_table = {}
+        for word in f_table.keys():
+            idf_table[word] = math.log10(total_documents / float(count_doc_per_words[word]))
+        idf_matrix[sent] = idf_table
+    return idf_matrix
+def _create_tf_idf_matrix(tf_matrix, idf_matrix):
+    tf_idf_matrix = {}
+    for (sent1, f_table1), (sent2, f_table2) in zip(tf_matrix.items(), idf_matrix.items()):
+        tf_idf_table = {}
+        for (word1, value1), (word2, value2) in zip(f_table1.items(),
+                                                    f_table2.items()):  # here, keys are the same in both the table
+            tf_idf_table[word1] = float(value1 * value2)
+        tf_idf_matrix[sent1] = tf_idf_table
+    return tf_idf_matrix
+def _score_sentences(tf_idf_matrix) -> dict:
+    """
+    score a sentence by its word's TF
+    Basic algorithm: adding the TF frequency of every non-stop word in a sentence divided by total no of words in a sentence.
+    :rtype: dict
+    """
+    sentenceValue = {}
+    for sent, f_table in tf_idf_matrix.items():
+        total_score_per_sentence = 0
+        count_words_in_sentence = len(f_table)
+        for word, score in f_table.items():
+            total_score_per_sentence += score
+        sentenceValue[sent] = total_score_per_sentence / count_words_in_sentence
+    return sentenceValue
+def _find_average_score(sentenceValue) -> int:
+    """
+    Find the average score from the sentence value dictionary
+    :rtype: int
+    """
+    sumValues = 0
+    for entry in sentenceValue:
+        sumValues += sentenceValue[entry]
+    # Average value of a sentence from original summary_text
+    average = (sumValues / len(sentenceValue))
+    return average
+def _generate_summary(sentences, sentenceValue, threshold):
+    sentence_count = 0
+    summary = ''
+    for sentence in sentences:
+        if sentence[:15] in sentenceValue and sentenceValue[sentence[:15]] >= (threshold):
+            summary += " " + sentence
+            sentence_count += 1
+    return summary
+def run_summarization(text):
+    """
+    :param text: Plain summary_text of long article
+    :return: summarized summary_text
+    """
+    '''
+    We already have a sentence tokenizer, so we just need
+    to run the sent_tokenize() method to create the array of sentences.
+    '''
+    # 1 Sentence Tokenize
+    sentences = sent_tokenize(text)
+    total_documents = len(sentences)
+    #print(sentences)
+    # 2 Create the Frequency matrix of the words in each sentence.
+    freq_matrix = _create_frequency_matrix(sentences)
+    #print(freq_matrix)
+    '''
+    Term frequency (TF) is how often a word appears in a document, divided by how many words are there in a document.
+    '''
+    # 3 Calculate TermFrequency and generate a matrix
+    tf_matrix = _create_tf_matrix(freq_matrix)
+    #print(tf_matrix)
+    # 4 creating table for documents per words
+    count_doc_per_words = _create_documents_per_words(freq_matrix)
+    #print(count_doc_per_words)
+    '''
+    Inverse document frequency (IDF) is how unique or rare a word is.
+    '''
+    # 5 Calculate IDF and generate a matrix
+    idf_matrix = _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents)
+    #print(idf_matrix)
+    # 6 Calculate TF-IDF and generate a matrix
+    tf_idf_matrix = _create_tf_idf_matrix(tf_matrix, idf_matrix)
+    #print(tf_idf_matrix)
+    # 7 Important Algorithm: score the sentences
+    sentence_scores = _score_sentences(tf_idf_matrix)
+    #print(sentence_scores)
+    # 8 Find the threshold
+    threshold = _find_average_score(sentence_scores)
+    #print(threshold)
+    # 9 Important Algorithm: Generate the summary
+    summary = _generate_summary(sentences, sentence_scores, 1.3 * threshold)
+    return summary
+#usage  = run_summarization(text_str)
+# def text_summarize(ARTICLE, maxLength, minLength):
+#   output = summarizer(ARTICLE)[0]['summary_text']
+#   ans = text_paraphrase(output)
+#   return ans