Spaces:
Sleeping
Sleeping
import os
import tempfile
import textwrap

import fitz  # PyMuPDF
import streamlit as st
from docx import Document
from transformers import BartForConditionalGeneration, BartTokenizer, pipeline
# Functions for file reading
def read_txt(file):
    """Return the content of an uploaded plain-text file as a string.

    Args:
        file: Object whose ``getvalue()`` returns the raw file bytes
            (presumably the Streamlit upload object -- verify against caller).

    Returns:
        The bytes decoded as UTF-8, without a leading byte-order mark.

    Raises:
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    # "utf-8-sig" decodes exactly like "utf-8" but drops a leading BOM, so
    # files saved with a UTF-8 signature do not start with a stray "\ufeff".
    return file.getvalue().decode("utf-8-sig")
def read_docx(file):
    """Return the text of a .docx file with its paragraphs joined by spaces.

    Args:
        file: Whatever ``docx.Document`` accepts as its source (the caller in
            this file passes the uploaded file object).

    Returns:
        A single string made of every paragraph's text, separated by one space.
    """
    document = Document(file)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return " ".join(paragraph_texts)
def extract_text_from_pdf(file_path):
    """Return the concatenated text of every page of a PDF.

    Args:
        file_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        The text of all pages, in page order, joined without a separator.
    """
    doc = fitz.open(file_path)
    try:
        # Join once instead of growing a string page by page.
        return "".join(page.get_text() for page in doc)
    finally:
        # Release the document even when a page fails to extract.
        doc.close()
def read_pdf(file):
    """Copy an uploaded PDF to a temporary file and extract its text.

    Args:
        file: Object whose ``read()`` returns the PDF bytes.

    Returns:
        A ``(file_path, text)`` tuple: the path of the temporary copy and the
        text extracted from it. The temporary file is created with
        ``delete=False`` and is left on disk; removing it is up to the caller.

    Raises:
        Exception: Whatever writing the copy or extracting the text raises.
            In that case the temporary file is removed before re-raising,
            because the caller never receives its path.
    """
    temp_file = tempfile.NamedTemporaryFile(delete=False)
    file_path = temp_file.name
    try:
        # The context manager closes the handle (flushing the content to
        # disk) even if the write fails.
        with temp_file:
            temp_file.write(file.read())
        text = extract_text_from_pdf(file_path)
    except Exception:
        # Best-effort cleanup; the original error is what matters.
        try:
            os.remove(file_path)
        except OSError:
            pass
        raise
    return file_path, text
# Function for text summarization from pdf
def text_summarizer_from_pdf(pdf_path):
    """Summarize the text of a PDF with the facebook/bart-large-cnn model.

    Args:
        pdf_path: Path of the PDF file to summarize.

    Returns:
        The generated summary, wrapped to lines of at most 80 characters.

    Raises:
        ValueError: If no text could be extracted from the PDF, so there is
            nothing to summarize.
    """
    pdf_text = extract_text_from_pdf(pdf_path)
    # Check before loading the model: without input text the model would
    # still be forced to emit at least ``min_length`` tokens of made-up text.
    if not pdf_text.strip():
        raise ValueError("No extractable text found in the PDF.")

    # NOTE: the model and tokenizer are loaded on every call.
    model_name = "facebook/bart-large-cnn"
    model = BartForConditionalGeneration.from_pretrained(model_name)
    tokenizer = BartTokenizer.from_pretrained(model_name)

    # No "summarize: " task prefix: that is a T5 convention; for this BART
    # checkpoint it would only add stray words to the text being summarized.
    # The input is truncated to 1024 tokens.
    inputs = tokenizer.encode(pdf_text, return_tensors="pt", max_length=1024, truncation=True)
    summary_ids = model.generate(
        inputs,
        max_length=150,
        min_length=50,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return textwrap.fill(summary, width=80)
# Summarizer pipeline for txt and docx files
# NOTE(review): this statement runs each time the script is executed; if the
# deployed Streamlit version offers a resource cache, consider using it here.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

st.title("Text Summarizer")
st.subheader("π Upload a pdf, docx or text file to generate a short summary")

# Sidebar to upload file
uploaded_file = st.sidebar.file_uploader("Choose a file", type=["txt", "pdf", "docx"])

if uploaded_file:
    file_details = {"FileName:" : uploaded_file.name, "FileType:" : uploaded_file.type, "FileSize:" : uploaded_file.size}
    for key, value in file_details.items():
        st.sidebar.write(key, value)

    # Path of the temporary PDF copy created during this run, if any.
    temp_path = None

    # Check the file type and read the file
    if uploaded_file.type == "text/plain":
        text = read_txt(uploaded_file)
    elif uploaded_file.type == "application/pdf":
        temp_path, text = read_pdf(uploaded_file)
    elif uploaded_file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        text = read_docx(uploaded_file)
    else:
        st.error("File type not supported. Please upload a txt, pdf or docx file.")
        st.stop()

    try:
        # Generate summary
        if st.button('Generate Summary'):
            with st.spinner("Generating summary..."):
                try:
                    if uploaded_file.type == "application/pdf":
                        summary = text_summarizer_from_pdf(temp_path)
                        st.success(summary)
                    else:
                        # truncation=True keeps inputs longer than the model's
                        # limit from failing instead of being summarized.
                        summary = summarizer(
                            text,
                            max_length=1000,
                            min_length=30,
                            do_sample=False,
                            truncation=True,
                        )
                        st.success(summary[0]['summary_text'])
                except Exception as exc:
                    # UI boundary: show a friendly message, but keep the cause
                    # visible so failures can be diagnosed.
                    st.error("Failed to generate summary. Your file may have some problem. Please try again!")
                    st.caption(f"Details: {type(exc).__name__}: {exc}")
    finally:
        # read_pdf leaves its temporary copy on disk; remove it once this run
        # no longer needs it so the copies do not pile up.
        if temp_path is not None:
            try:
                os.remove(temp_path)
            except OSError:
                # Best-effort cleanup: a file that is already gone is fine.
                pass