ritchi1 committed on
Commit
15e827e
·
verified ·
1 Parent(s): f9464b7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +38 -19
app.py CHANGED
@@ -1,34 +1,53 @@
1
  import gradio as gr
2
  from transformers import pipeline
3
  import PyPDF2
 
4
 
5
  # Load the summarization pipeline
6
  summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
7
 
8
- def summarize_pdf(pdf_file):
 
 
 
9
  try:
10
- # Extract text from the uploaded PDF
11
  pdf_reader = PyPDF2.PdfReader(pdf_file)
12
- text = ""
13
-
14
- for page_number, page in enumerate(pdf_reader.pages):
15
- try:
16
- page_text = page.extract_text()
17
- if page_text:
18
- text += page_text + "\n"
19
- except Exception as e:
20
- return f"❌ Could not read page {page_number + 1}: {str(e)}"
21
-
22
- # Check if text was extracted
 
 
 
 
 
 
 
 
 
 
 
23
  if not text.strip():
24
  return "❌ Could not extract any text from the PDF. Please upload a readable document."
25
 
26
- # Summarize the extracted text
27
- # Limit the text input length for summarization to avoid overflow errors
28
- max_length = 1024 # Hugging Face models have input length limits
29
- text = text[:max_length]
30
- summary = summarizer(text, max_length=200, min_length=50, do_sample=False)
31
- return summary[0]['summary_text']
 
 
 
 
32
 
33
  except Exception as e:
34
  return f"❌ An error occurred: {str(e)}"
 
1
  import gradio as gr
2
  from transformers import pipeline
3
  import PyPDF2
4
+ import pdfplumber
5
 
6
# Load the summarization pipeline once at module import so every Gradio
# request reuses the same model instance.
# NOTE(review): presumably this downloads the model weights on first run — confirm.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
8
 
9
def extract_text_from_pdf(pdf_file):
    """Extract text from a PDF using PyPDF2 with a fallback to pdfplumber.

    Args:
        pdf_file: Path or file-like object referencing the uploaded PDF.

    Returns:
        str: Concatenated text of all pages, one newline between pages.
        May be empty if the document has no extractable text.
    """
    text = ""
    try:
        # First try with PyPDF2
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        for page in pdf_reader.pages:
            # extract_text() returns None for image-only pages; guarding
            # avoids "can only concatenate str" TypeError.
            page_text = page.extract_text()
            if page_text:
                text += page_text + "\n"
    except Exception as e:
        print(f"PyPDF2 failed: {e}")
        # Discard any partial PyPDF2 output so the fallback pass does not
        # duplicate pages that were already read successfully.
        text = ""
        # Fallback to pdfplumber
        with pdfplumber.open(pdf_file) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n"

    return text
26
+
27
def chunk_text(text, max_chunk_size=1024):
    """Yield successive pieces of *text*, each at most ``max_chunk_size`` words.

    Splitting on whitespace keeps every chunk small enough to feed to the
    summarization model one piece at a time.
    """
    tokens = text.split()
    start = 0
    while start < len(tokens):
        end = start + max_chunk_size
        yield " ".join(tokens[start:end])
        start = end
32
+
33
def summarize_pdf(pdf_file):
    """Extract text from a PDF, chunk it, and return a combined summary.

    Args:
        pdf_file: Path or file-like object for the uploaded PDF.

    Returns:
        str: Per-chunk summaries joined by blank lines, or a user-facing
        error message starting with "❌".
    """
    try:
        # Extract text from the PDF
        text = extract_text_from_pdf(pdf_file)
        if not text.strip():
            return "❌ Could not extract any text from the PDF. Please upload a readable document."

        # Chunk text for summarization
        summaries = []
        for chunk in chunk_text(text):
            # A fixed max_length=200 exceeds the length of short (e.g. final)
            # chunks, which makes the model warn and pad out degenerate
            # output — scale the generation bounds to the chunk size instead.
            n_words = len(chunk.split())
            max_len = min(200, max(20, n_words))
            min_len = min(50, max(5, max_len // 2))
            summary = summarizer(chunk, max_length=max_len, min_length=min_len, do_sample=False)
            summaries.append(summary[0]['summary_text'])

        # Combine all summaries into one
        full_summary = "\n\n".join(summaries)
        return full_summary

    except Exception as e:
        return f"❌ An error occurred: {str(e)}"