Spaces:
Sleeping
Sleeping
import os | |
import streamlit as st | |
from transformers import pipeline | |
import torch | |
from PyPDF2 import PdfReader | |
# Disable tokenizers parallelism
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Setup for the model: use the first CUDA device when one is available,
# otherwise pass -1 so the pipeline runs on the CPU.
device = 0 if torch.cuda.is_available() else -1


@st.cache_resource
def _load_summarizer(device_index):
    """Build the summarization pipeline once and reuse it across reruns.

    Streamlit re-executes this script on every user interaction; without
    caching, the model would be re-instantiated each time a file is uploaded.

    Args:
        device_index: Device argument forwarded to ``pipeline`` (0 for the
            first GPU, -1 for CPU).

    Returns:
        The ``transformers`` summarization pipeline.
    """
    return pipeline(
        "summarization",
        model="sshleifer/distilbart-cnn-12-6",
        device=device_index,
    )


summarizer = _load_summarizer(device)
def split_text(text, max_chunk_size=512):
    """Lazily break ``text`` into chunks of at most ``max_chunk_size`` words.

    The text is split on whitespace, so runs of spaces/newlines collapse and
    each yielded chunk is its words re-joined with single spaces.

    Args:
        text: The string to split.
        max_chunk_size: Maximum number of whitespace-separated words per chunk.

    Yields:
        Consecutive chunks in their original order; nothing for empty text.
    """
    tokens = text.split()
    yield from (
        " ".join(tokens[offset:offset + max_chunk_size])
        for offset in range(0, len(tokens), max_chunk_size)
    )
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF concatenated in page order.

    Args:
        pdf_file: The PDF source, passed unchanged to ``PdfReader`` (the app
            supplies the Streamlit upload object).

    Returns:
        A single string with the pages' text joined without a separator.
        Pages that yield no text contribute an empty string.
    """
    reader = PdfReader(pdf_file)
    # ``or ""`` guards against a page whose extraction yields None/empty
    # (e.g. image-only pages), which would otherwise break concatenation.
    return "".join(page.extract_text() or "" for page in reader.pages)
def summarize_text(text, summarizer):
    """Summarize ``text`` chunk by chunk and join the partial summaries.

    The text is cut into word-based chunks with ``split_text``; each chunk is
    summarized independently with length bounds derived from its word count
    (max: 60% but at least 10, min: 20% but at least 5).

    Args:
        text: The text to summarize.
        summarizer: A callable with the ``transformers`` summarization
            pipeline interface; it must return a list whose first item is a
            dict containing ``"summary_text"``.

    Returns:
        The chunk summaries joined with single spaces, or ``""`` when the
        text contains no words.
    """
    summaries = []
    for chunk in split_text(text):
        input_length = len(chunk.split())
        max_summary_length = max(10, int(input_length * 0.6))
        min_summary_length = max(5, int(input_length * 0.2))
        result = summarizer(
            chunk,
            max_length=max_summary_length,
            min_length=min_summary_length,
            do_sample=False,
            # Chunks are sized in words, not model tokens, so a token-dense
            # chunk can exceed the model's maximum input length; clip it
            # instead of letting the model fail on the overflow.
            truncation=True,
        )
        summaries.append(result[0]["summary_text"])
    return " ".join(summaries)
def extract_and_summarize_page_by_page(pdf_file, summarizer):
    """Summarize each page of a PDF separately.

    Args:
        pdf_file: The PDF source, passed unchanged to ``PdfReader``.
        summarizer: Summarization callable forwarded to ``summarize_text``.

    Returns:
        A list with one entry per page, in page order: the page's summary, or
        a "Page N: No extractable text found." placeholder when the page has
        no usable text.
    """
    reader = PdfReader(pdf_file)
    summaries = []
    for page_number, page in enumerate(reader.pages, start=1):
        text = page.extract_text() or ""
        # Whitespace-only text would summarize to an empty string, so treat
        # it the same as a page with no text at all.
        if text.strip():
            summaries.append(summarize_text(text, summarizer))
        else:
            summaries.append(f"Page {page_number}: No extractable text found.")
    return summaries
# Streamlit interface: upload a PDF, then show one summary per page.
st.subheader("Generate PDF Summary")
pdf_file = st.file_uploader("Upload a PDF", type=["pdf"])
if pdf_file is not None:
    # First pass: extract everything only to decide whether the document
    # contains any usable text before running the (slow) summarizer.
    text = extract_text_from_pdf(pdf_file)
    if text.strip():
        # The first pass already read from this same upload object; rewind it
        # so the second reader does not depend on the leftover position.
        pdf_file.seek(0)
        summaries = extract_and_summarize_page_by_page(pdf_file, summarizer)
        st.subheader("Summary")
        for i, summary in enumerate(summaries, 1):
            st.write(f"### Page {i}\n{summary}\n")
    else:
        st.warning("No extractable text found in the PDF.")