"""Streamlit app that summarizes an uploaded PDF page by page.

Text is extracted with PyPDF2 and summarized with a Hugging Face
``transformers`` summarization pipeline.
"""

import os

import streamlit as st
import torch
from PyPDF2 import PdfReader
from transformers import pipeline

# Disable tokenizers parallelism
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Setup for the model: CUDA device 0 when available, otherwise -1 (CPU).
device = 0 if torch.cuda.is_available() else -1


def _cache_resource(func):
    """Wrap *func* with ``st.cache_resource`` when this Streamlit provides it.

    Falls back to the undecorated function on versions without
    ``st.cache_resource`` so the script still runs, just without caching.
    """
    decorator = getattr(st, "cache_resource", None)
    return decorator(func) if decorator is not None else func


@_cache_resource
def _load_summarizer():
    """Build the summarization pipeline on the selected device."""
    return pipeline(
        "summarization",
        model="sshleifer/distilbart-cnn-12-6",
        device=device,
    )


# Streamlit re-executes this script on every interaction; going through the
# cached loader avoids rebuilding the model on each rerun.
summarizer = _load_summarizer()


def split_text(text, max_chunk_size=512):
    """Yield consecutive chunks of *text* of at most *max_chunk_size* words.

    Words are whitespace-separated tokens; each chunk is re-joined with
    single spaces. Empty or whitespace-only text yields nothing.

    Raises:
        ValueError: If *max_chunk_size* is not positive (raised on first
            iteration, since this is a generator).
    """
    if max_chunk_size <= 0:
        # A negative step would silently yield nothing and drop all text.
        raise ValueError("max_chunk_size must be a positive integer")
    words = text.split()
    for start in range(0, len(words), max_chunk_size):
        yield " ".join(words[start:start + max_chunk_size])


def extract_text_from_pdf(pdf_file):
    """Return the concatenated extracted text of every page in *pdf_file*.

    *pdf_file* is passed straight to ``PdfReader``. Pages that produce no
    text contribute an empty string.
    """
    reader = PdfReader(pdf_file)
    # `or ""` guards against a page yielding None instead of a string.
    return "".join(page.extract_text() or "" for page in reader.pages)


def summarize_text(text, summarizer):
    """Summarize *text* chunk by chunk and join the partial summaries.

    Args:
        text: The text to summarize.
        summarizer: Callable invoked as
            ``summarizer(chunk, max_length=..., min_length=...,
            do_sample=False, truncation=True)`` and returning a sequence
            whose first item has a ``'summary_text'`` key.

    Returns:
        The chunk summaries joined with single spaces; an empty string
        when *text* contains no words.
    """
    summaries = []
    for chunk in split_text(text):
        input_length = len(chunk.split())
        # Length bounds scale with the chunk's word count, with small floors
        # so very short chunks still get valid limits.
        max_summary_length = max(10, int(input_length * 0.6))
        min_summary_length = max(5, int(input_length * 0.2))
        result = summarizer(
            chunk,
            max_length=max_summary_length,
            min_length=min_summary_length,
            do_sample=False,
            # Chunks are sized in words, which is only a proxy for tokens;
            # truncate rather than fail if a chunk exceeds the model's input.
            truncation=True,
        )
        summaries.append(result[0]['summary_text'])
    return " ".join(summaries)


def extract_and_summarize_page_by_page(pdf_file, summarizer):
    """Return one summary string per page of *pdf_file*.

    Pages with no text, or only whitespace, get a placeholder message
    instead of a summary.
    """
    reader = PdfReader(pdf_file)
    summaries = []
    for page_number, page in enumerate(reader.pages, start=1):
        text = page.extract_text()
        if text and text.strip():
            summaries.append(summarize_text(text, summarizer))
        else:
            summaries.append(f"Page {page_number}: No extractable text found.")
    return summaries


# Streamlit interface
st.subheader("Generate PDF Summary")
pdf_file = st.file_uploader("Upload a PDF", type=["pdf"])

if pdf_file is not None:
    # Cheap extraction pass first, so summarization is skipped entirely for
    # PDFs with no usable text.
    text = extract_text_from_pdf(pdf_file)
    if text.strip():
        summaries = extract_and_summarize_page_by_page(pdf_file, summarizer)
        st.subheader("Summary")
        for i, summary in enumerate(summaries, 1):
            st.write(f"### Page {i}\n{summary}\n")
    else:
        st.warning("No extractable text found in the PDF.")