File size: 2,849 Bytes
cfdc85e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from spacy import tokens
import streamlit as st
from heapq import nlargest
import importlib.util
import subprocess
import sys

# Streamlit re-executes this script on every widget interaction, so only
# install what is actually missing instead of shelling out on each rerun.
# sys.executable makes the install target the interpreter running the app.
if importlib.util.find_spec("PyPDF2") is None:
    subprocess.run([sys.executable, "-m", "pip", "install", "PyPDF2"])
if importlib.util.find_spec("en_core_web_sm") is None:
    subprocess.run([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])
# Make any freshly installed package visible to the import system.
importlib.invalidate_caches()
import PyPDF2
from utils import (
    clean_text,
    fetch_article_text,
    preprocess_text_for_abstractive_summarization,
    read_text_from_file,
)
#---------------------Prerequisites------------------------#
# Expose spaCy's English stop-word set under the name used by the summarizer.
stopwords = STOP_WORDS
# Extend the punctuation characters with a newline so that newline tokens are
# skipped together with ordinary punctuation when words are counted.
punctuation += '\n'



def word_frequencies(words, stop_words, punct_chars):
    """Return max-normalised frequencies of the content words in *words*.

    Words are lower-cased before counting so the resulting table can be
    queried with lower-cased tokens. Stop words and tokens that consist
    solely of punctuation/whitespace characters are ignored.

    Args:
        words: Iterable of token strings.
        stop_words: Container of lower-case words to ignore.
        punct_chars: Container (or string) of single punctuation characters.

    Returns:
        Dict mapping each lower-cased word to ``count / highest_count``;
        empty when *words* holds no content words.
    """
    from collections import Counter  # local import keeps the block self-contained

    counts = Counter()
    for word in words:
        key = word.lower()
        if key in stop_words:
            continue
        # Character-wise test so multi-character tokens such as "..." or
        # "\n\n" are rejected too (a substring test would let them through).
        if all(ch in punct_chars or ch.isspace() for ch in key):
            continue
        counts[key] += 1
    if not counts:
        return {}
    peak = max(counts.values())
    return {word: count / peak for word, count in counts.items()}


def rank_sentences(sentences, frequencies, ratio):
    """Return the indices of the highest-scoring sentences, best first.

    A sentence's score is the sum of the frequencies of its words; sentences
    without any scored word are never selected.

    Args:
        sentences: Sequence of sentences, each an iterable of token strings.
        frequencies: Lower-cased word -> weight table (see word_frequencies).
        ratio: Fraction (0-1) of the sentences to keep. At least one sentence
            is kept whenever any sentence has a score.

    Returns:
        List of indices into *sentences*, ordered by descending score.
    """
    scores = {}
    for index, sentence in enumerate(sentences):
        weights = [
            frequencies[word.lower()]
            for word in sentence
            if word.lower() in frequencies
        ]
        if weights:
            scores[index] = sum(weights)
    if not scores:
        return []
    # Never round down to zero sentences for short inputs.
    keep = max(1, int(len(sentences) * ratio))
    return nlargest(keep, scores, key=scores.get)


if __name__ == "__main__":
    st.title("Text Summarizer 📝")
    st.subheader("Creator: Shreyas Dixit")

    # The slider yields a percentage (10-90); convert it to a 0-1 fraction.
    ratio = st.sidebar.slider('Summarization %', 10, 90, step=10) / 100
    source_kind = st.selectbox('Pick one', ['PDF', 'Text'])
    if source_kind == "PDF":
        uploaded_file = st.file_uploader("Choose a file", type=['pdf', 'txt', 'docx'])
        # Nothing is uploaded on the first render; only parse once a file exists.
        text = read_text_from_file(uploaded_file) if uploaded_file is not None else ""
    else:
        text = st.text_area("Input text !")

    if st.button('Summarize'):
        if not text or not text.strip():
            st.warning("Please provide some text to summarize.")
        else:
            nlp = spacy.load('en_core_web_sm')
            doc = nlp(text)
            frequencies = word_frequencies(
                (token.text for token in doc), stopwords, punctuation
            )
            sentences = list(doc.sents)
            ranked = rank_sentences(
                [[token.text for token in sent] for sent in sentences],
                frequencies,
                ratio,
            )
            if ranked:
                st.markdown(' '.join(sentences[i].text for i in ranked))
            else:
                st.warning("The text does not contain enough content to summarize.")