|
import spacy |
|
from spacy.lang.en.stop_words import STOP_WORDS |
|
from string import punctuation |
|
from spacy import tokens |
|
import streamlit as st |
|
from heapq import nlargest |
|
import importlib.util
import subprocess

# Streamlit re-executes this whole script on every UI interaction, so the
# installers below must only run when the dependency is actually missing —
# otherwise every button click pays for a pip install and a model download.
if importlib.util.find_spec("PyPDF2") is None:
    subprocess.run("pip3 install PyPDF2".split(), check=True)

# spacy.util.is_package reports whether the pipeline is installed without
# paying the cost of fully loading it here (it is loaded again on demand
# inside the summarizer).
if not spacy.util.is_package("en_core_web_sm"):
    subprocess.run("python3 -m spacy download en_core_web_sm".split(), check=True)

import PyPDF2
|
from utils import ( |
|
clean_text, |
|
fetch_article_text, |
|
preprocess_text_for_abstractive_summarization, |
|
read_text_from_file, |
|
) |
|
|
|
# Lookup tables shared by the summarizer: spaCy's built-in English stop-word
# set, and the punctuation characters with a trailing newline appended so
# line breaks are filtered out like any other punctuation.
stopwords = STOP_WORDS
punctuation = f"{punctuation}\n"
|
|
|
|
|
|
|
def _extractive_summary(text, ratio):
    """Return an extractive summary of `text` keeping roughly `ratio` of its sentences.

    Scores each sentence by the normalized frequency of its non-stopword,
    non-punctuation tokens and returns the top-scoring sentences joined with
    spaces (in score order, matching the original behavior). Returns "" when
    the text yields no scoreable words.
    """
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(text)

    # Count token frequencies case-folded so e.g. "The" and "the" pool into
    # one entry. (The original keyed counts by original case but looked them
    # up lower-cased, so capitalized words never contributed to scores.)
    word_frequency = {}
    for token in doc:
        lowered = token.text.lower()
        if lowered not in stopwords and lowered not in punctuation:
            word_frequency[lowered] = word_frequency.get(lowered, 0) + 1

    # Guard: empty input (or all stop words/punctuation) would otherwise
    # raise ValueError in max() below.
    if not word_frequency:
        return ""

    # Normalize counts to [0, 1] by the most frequent word.
    max_count = max(word_frequency.values())
    for word in word_frequency:
        word_frequency[word] = word_frequency[word] / max_count

    # Sentence score = sum of its tokens' normalized frequencies.
    sentence_token = list(doc.sents)
    sentence_score = {}
    for sent in sentence_token:
        for token in sent:
            freq = word_frequency.get(token.text.lower())
            if freq is not None:
                sentence_score[sent] = sentence_score.get(sent, 0) + freq

    select_length = int(len(sentence_token) * ratio)
    summary = nlargest(select_length, sentence_score, key=sentence_score.get)
    return ' '.join(sent.text for sent in summary)


if __name__ == "__main__":
    st.title("Text Summarizer ๐")
    st.subheader("Creator: Shreyas Dixit")

    # Slider gives 10..90 in steps of 10; convert to a 0.1..0.9 ratio.
    n = st.sidebar.slider('Summarization %', 10, 90, step=10)
    n = n / 100

    # `doc_type` instead of `type` to avoid shadowing the builtin.
    doc_type = st.selectbox('Pick one', ['PDF', 'Text'])

    # Pre-initialize so `text` is always defined even before the user has
    # uploaded a file or typed anything (original raised NameError here).
    text = ""
    if doc_type == "PDF":
        uploaded_file = st.file_uploader("Choose a file", type=['pdf', 'txt', 'docx'])
        if uploaded_file is not None:
            text = read_text_from_file(uploaded_file)
    elif doc_type == "Text":
        text = st.text_area("Input text !")

    if st.button('Summarize'):
        if not text:
            st.warning("Please provide some text or upload a file first.")
        else:
            st.markdown(_extractive_summary(text, n))