ritchi1 committed on
Commit
15e827e
·
verified ·
1 Parent(s): f9464b7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +38 -19
app.py CHANGED
@@ -1,34 +1,53 @@
1
  import gradio as gr
2
  from transformers import pipeline
3
  import PyPDF2
 
4
 
5
  # Load the summarization pipeline
6
  summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
7
 
8
- def summarize_pdf(pdf_file):
 
 
 
9
  try:
10
- # Extract text from the uploaded PDF
11
  pdf_reader = PyPDF2.PdfReader(pdf_file)
12
- text = ""
13
-
14
- for page_number, page in enumerate(pdf_reader.pages):
15
- try:
16
- page_text = page.extract_text()
17
- if page_text:
18
- text += page_text + "\n"
19
- except Exception as e:
20
- return f"❌ Could not read page {page_number + 1}: {str(e)}"
21
-
22
- # Check if text was extracted
 
 
 
 
 
 
 
 
 
 
 
23
  if not text.strip():
24
  return "❌ Could not extract any text from the PDF. Please upload a readable document."
25
 
26
- # Summarize the extracted text
27
- # Limit the text input length for summarization to avoid overflow errors
28
- max_length = 1024 # Hugging Face models have input length limits
29
- text = text[:max_length]
30
- summary = summarizer(text, max_length=200, min_length=50, do_sample=False)
31
- return summary[0]['summary_text']
 
 
 
 
32
 
33
  except Exception as e:
34
  return f"❌ An error occurred: {str(e)}"
 
1
  import gradio as gr
2
  from transformers import pipeline
3
  import PyPDF2
4
+ import pdfplumber
5
 
6
# Load the summarization pipeline once at module import so every Gradio
# request reuses the same model instance.
# NOTE(review): presumably this downloads the model weights on first run — confirm.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
8
 
9
def extract_text_from_pdf(pdf_file):
    """Extract text from a PDF using PyPDF2 with a fallback to pdfplumber.

    Args:
        pdf_file: Path or file-like object referencing the uploaded PDF.

    Returns:
        str: Concatenated text of all pages, one newline between pages.
        May be empty if the document has no extractable text.
    """
    text = ""
    try:
        # First try with PyPDF2
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        for page in pdf_reader.pages:
            # extract_text() returns None for image-only pages; guarding
            # avoids "can only concatenate str" TypeError.
            page_text = page.extract_text()
            if page_text:
                text += page_text + "\n"
    except Exception as e:
        print(f"PyPDF2 failed: {e}")
        # Discard any partial PyPDF2 output so the fallback pass does not
        # duplicate pages that were already read successfully.
        text = ""
        # Fallback to pdfplumber
        with pdfplumber.open(pdf_file) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n"

    return text
26
+
27
def chunk_text(text, max_chunk_size=1024):
    """Yield successive pieces of *text*, each at most ``max_chunk_size`` words.

    Splitting on whitespace keeps every chunk small enough to feed to the
    summarization model one piece at a time.
    """
    tokens = text.split()
    start = 0
    while start < len(tokens):
        end = start + max_chunk_size
        yield " ".join(tokens[start:end])
        start = end
32
+
33
def summarize_pdf(pdf_file):
    """Extract text from a PDF, chunk it, and return a combined summary.

    Args:
        pdf_file: Path or file-like object for the uploaded PDF.

    Returns:
        str: Per-chunk summaries joined by blank lines, or a user-facing
        error message starting with "❌".
    """
    try:
        # Extract text from the PDF
        text = extract_text_from_pdf(pdf_file)
        if not text.strip():
            return "❌ Could not extract any text from the PDF. Please upload a readable document."

        # Chunk text for summarization
        summaries = []
        for chunk in chunk_text(text):
            # A fixed max_length=200 exceeds the length of short (e.g. final)
            # chunks, which makes the model warn and pad out degenerate
            # output — scale the generation bounds to the chunk size instead.
            n_words = len(chunk.split())
            max_len = min(200, max(20, n_words))
            min_len = min(50, max(5, max_len // 2))
            summary = summarizer(chunk, max_length=max_len, min_length=min_len, do_sample=False)
            summaries.append(summary[0]['summary_text'])

        # Combine all summaries into one
        full_summary = "\n\n".join(summaries)
        return full_summary

    except Exception as e:
        return f"❌ An error occurred: {str(e)}"