# Text-Summarizer / app.py
# aminaj's picture
# Create app.py
# 6b0baa7 verified
import logging
import os
import tempfile
import textwrap

import fitz  # PyMuPDF
import streamlit as st
from docx import Document
from transformers import BartForConditionalGeneration, BartTokenizer, pipeline
# Functions for file reading
def read_txt(file):
    """Return the text content of an uploaded plain-text file.

    Args:
        file: An in-memory upload whose ``getvalue()`` returns the raw bytes
            (for example an ``io.BytesIO``).

    Returns:
        The decoded text. A leading UTF-8 byte-order mark is dropped, and
        bytes that are not valid UTF-8 are replaced with U+FFFD rather than
        raising ``UnicodeDecodeError``.
    """
    # "utf-8-sig" behaves like "utf-8" but strips a BOM if one is present;
    # errors="replace" keeps a mis-encoded upload from crashing the caller.
    return file.getvalue().decode("utf-8-sig", errors="replace")
def read_docx(file):
    """Return the text of a .docx file as one space-separated string.

    Args:
        file: Whatever ``docx.Document`` accepts as its source (the caller
            passes the uploaded file object).

    Returns:
        The ``text`` of every paragraph in ``Document(file).paragraphs``,
        in order, joined with a single space.
    """
    paragraphs = Document(file).paragraphs
    return " ".join(paragraph.text for paragraph in paragraphs)
def extract_text_from_pdf(file_path):
    """Extract the text of every page of a PDF.

    Args:
        file_path: Path of the PDF file, opened with PyMuPDF (``fitz``).

    Returns:
        The ``get_text()`` output of all pages concatenated in page order,
        with no extra separator inserted between pages.
    """
    # The context manager closes the document even when a page fails to
    # parse; the previous explicit close() was skipped on any exception.
    with fitz.open(file_path) as doc:
        return "".join(page.get_text() for page in doc)
def read_pdf(file):
    """Copy an uploaded PDF to a temporary file and extract its text.

    The upload is written to disk first because ``extract_text_from_pdf``
    takes a file path.

    Args:
        file: A binary file-like object whose ``read()`` returns the PDF
            bytes.

    Returns:
        A ``(file_path, text)`` tuple: the path of the temporary copy and the
        text extracted from it. The temporary file is created with
        ``delete=False`` and is still on disk when this function returns
        successfully, so the caller is responsible for removing it.

    Raises:
        Exception: Whatever ``extract_text_from_pdf`` raises; the temporary
            file is removed before the error propagates.
    """
    # The with-block guarantees the handle is flushed and closed (even if the
    # write fails) before the file is reopened by path below.
    with tempfile.NamedTemporaryFile(delete=False) as temp_file:
        temp_file.write(file.read())
    file_path = temp_file.name

    try:
        text = extract_text_from_pdf(file_path)
    except Exception:
        # Nobody receives the path on failure, so do not leave the file behind.
        try:
            os.remove(file_path)
        except OSError:
            pass  # best-effort cleanup; the extraction error matters more
        raise
    return file_path, text
# Function for text summarization from pdf
def text_summarizer_from_pdf(pdf_path):
    """Summarize the text of a PDF with the ``facebook/bart-large-cnn`` model.

    Args:
        pdf_path: Path of the PDF file to summarize.

    Returns:
        The generated summary, wrapped to 80 columns with the lines joined by
        newline characters.

    Raises:
        ValueError: If no text could be extracted from the PDF (for example a
            scan without a text layer), since there is nothing to summarize.
    """
    pdf_text = extract_text_from_pdf(pdf_path)
    if not pdf_text.strip():
        raise ValueError("No extractable text found in the PDF.")

    model_name = "facebook/bart-large-cnn"
    # NOTE(review): model and tokenizer are loaded on every call; consider
    # loading them once if this function is called repeatedly.
    model = BartForConditionalGeneration.from_pretrained(model_name)
    tokenizer = BartTokenizer.from_pretrained(model_name)

    # The document text is passed as-is: BART has no task prefixes, so the
    # former T5-style "summarize: " prefix was just extra words in the input
    # (and the txt/docx path already feeds the raw text to the same model).
    # Input is truncated to 1024 tokens.
    inputs = tokenizer.encode(
        pdf_text, return_tensors="pt", max_length=1024, truncation=True
    )
    summary_ids = model.generate(
        inputs,
        max_length=150,
        min_length=50,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return "\n".join(textwrap.wrap(summary, width=80))
# Summarizer pipeline for txt and docx files
logger = logging.getLogger(__name__)

# NOTE(review): the pipeline is built at module level, i.e. each time this
# script is executed, even when only a PDF is summarized. Consider caching it
# if the installed Streamlit version offers a resource cache.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

st.title("Text Summarizer")
st.subheader("📝 Upload a pdf, docx or text file to generate a short summary")

# Sidebar to upload file
uploaded_file = st.sidebar.file_uploader("Choose a file", type=["txt", "pdf", "docx"])

if uploaded_file:
    file_details = {
        "FileName:": uploaded_file.name,
        "FileType:": uploaded_file.type,
        "FileSize:": uploaded_file.size,
    }
    for key, value in file_details.items():
        st.sidebar.write(key, value)

    # Only the PDF branch creates a temporary file; None means nothing to clean up.
    temp_path = None

    # Check the file type and read the file
    if uploaded_file.type == "text/plain":
        text = read_txt(uploaded_file)
    elif uploaded_file.type == "application/pdf":
        temp_path, text = read_pdf(uploaded_file)
    elif uploaded_file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        text = read_docx(uploaded_file)
    else:
        st.error("File type not supported. Please upload a txt, pdf or docx file.")
        st.stop()

    try:
        # Generate summary
        if st.button('Generate Summary'):
            if not text.strip():
                # Nothing to summarize (e.g. an empty file or a scanned PDF).
                st.warning("No text could be extracted from the uploaded file.")
            else:
                with st.spinner("Generating summary..."):
                    try:
                        if uploaded_file.type == "application/pdf":
                            summary = text_summarizer_from_pdf(temp_path)
                            st.success(summary)
                        else:
                            # truncation=True clips inputs longer than the
                            # model's input limit instead of failing on them.
                            summary = summarizer(
                                text,
                                max_length=1000,
                                min_length=30,
                                do_sample=False,
                                truncation=True,
                            )
                            st.success(summary[0]['summary_text'])
                    except Exception:
                        # Top-level boundary: keep the app alive, but record
                        # the real cause for whoever reads the logs.
                        logger.exception("Failed to summarize %s", uploaded_file.name)
                        st.error("Failed to generate summary. Your file may have some problem. Please try again!")
    finally:
        # read_pdf leaves its temporary copy on disk (delete=False); remove it
        # once this run no longer needs it so uploads do not pile up.
        if temp_path is not None:
            try:
                os.remove(temp_path)
            except OSError:
                pass  # already gone or not removable; nothing more to do