shaima21 committed on
Commit
bcbe6f2
·
verified ·
1 Parent(s): 62fa92b

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +63 -0
app.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import streamlit as st
3
+ from transformers import pipeline
4
+ import torch
5
+ from PyPDF2 import PdfReader
6
+
7
# Disable tokenizers parallelism
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Setup for the model
# Device index passed to the pipeline: 0 when CUDA is available, otherwise -1.
device = 0 if torch.cuda.is_available() else -1


@st.cache_resource
def _load_summarizer():
    """Create the summarization pipeline once and reuse it across reruns.

    Streamlit re-executes this script on every user interaction; caching the
    pipeline as a resource avoids rebuilding the model on each rerun.

    Returns:
        The ``transformers`` summarization pipeline for
        ``sshleifer/distilbart-cnn-12-6`` placed on ``device``.
    """
    return pipeline(
        "summarization",
        model="sshleifer/distilbart-cnn-12-6",
        device=device,
    )


summarizer = _load_summarizer()
13
+
14
def split_text(text, max_chunk_size=512):
    """Yield consecutive chunks of ``text`` holding at most ``max_chunk_size`` words.

    Words are the whitespace-separated tokens of ``text``; each chunk is those
    words re-joined with single spaces, so original whitespace is normalized.

    Args:
        text: The string to split.
        max_chunk_size: Maximum number of words per chunk. Must be >= 1.

    Yields:
        One string per chunk, in document order. Nothing is yielded when
        ``text`` contains no words.

    Raises:
        ValueError: If ``max_chunk_size`` is smaller than 1. Because this is a
            generator, the error surfaces when iteration starts.
    """
    # A negative step would make the range empty and silently drop all text;
    # a zero step fails with an unrelated-looking range() error.
    if max_chunk_size < 1:
        raise ValueError(
            f"max_chunk_size must be a positive integer, got {max_chunk_size!r}"
        )

    words = text.split()
    for start in range(0, len(words), max_chunk_size):
        yield " ".join(words[start:start + max_chunk_size])
18
+
19
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF concatenated in page order.

    Args:
        pdf_file: A path or binary file-like object that ``PdfReader`` accepts.

    Returns:
        The page texts joined with no separator. Pages for which
        ``extract_text()`` returns nothing contribute an empty string.
    """
    reader = PdfReader(pdf_file)
    # `or ""` keeps the join safe if a page yields None instead of a string.
    return "".join(page.extract_text() or "" for page in reader.pages)
26
+
27
def summarize_text(text, summarizer):
    """Summarize ``text`` chunk by chunk and join the partial summaries.

    The text is split with ``split_text`` and every chunk is summarized
    independently; the results are joined with single spaces.

    Args:
        text: The text to summarize.
        summarizer: Callable invoked as
            ``summarizer(chunk, max_length=..., min_length=..., do_sample=False,
            truncation=True)`` and returning a list whose first item is a
            mapping with a ``"summary_text"`` key.

    Returns:
        The joined summaries, or an empty string when ``text`` has no words.
    """
    summaries = []
    for chunk in split_text(text):
        word_count = len(chunk.split())
        # Summary bounds scale with the chunk size (20%-60% of its word
        # count), with small floors so short chunks still get usable limits.
        max_summary_length = max(10, int(word_count * 0.6))
        min_summary_length = max(5, int(word_count * 0.2))
        result = summarizer(
            chunk,
            max_length=max_summary_length,
            min_length=min_summary_length,
            do_sample=False,
            # Chunks are sized in words, but the model's input limit is in
            # tokens; truncate rather than fail on an over-long chunk.
            truncation=True,
        )
        summaries.append(result[0]["summary_text"])
    return " ".join(summaries)
37
+
38
def extract_and_summarize_page_by_page(pdf_file, summarizer):
    """Summarize each page of a PDF separately.

    Args:
        pdf_file: A path or binary file-like object that ``PdfReader`` accepts.
        summarizer: Summarization callable forwarded to ``summarize_text``.

    Returns:
        A list with one entry per page, in page order: the page summary, or a
        "Page N: No extractable text found." notice for pages without text.
    """
    reader = PdfReader(pdf_file)
    summaries = []
    for page_number, page in enumerate(reader.pages, start=1):
        text = page.extract_text()
        # Whitespace-only pages contain no words to summarize, so report them
        # like empty pages instead of emitting a blank summary.
        if text and text.strip():
            summaries.append(summarize_text(text, summarizer))
        else:
            summaries.append(f"Page {page_number}: No extractable text found.")
    return summaries
50
+
51
# Streamlit interface: upload a PDF and show one summary per page.
st.subheader("Generate PDF Summary")
pdf_file = st.file_uploader("Upload a PDF", type=["pdf"])

if pdf_file:
    # First pass: check whether the document has any real text at all.
    text = extract_text_from_pdf(pdf_file)
    # strip() so a PDF that yields only whitespace is reported as empty.
    if text.strip():
        # Rewind so the second parse starts from a known position regardless
        # of where the first reader left the uploaded stream.
        pdf_file.seek(0)
        summaries = extract_and_summarize_page_by_page(pdf_file, summarizer)
        st.subheader("Summary")
        for i, summary in enumerate(summaries, 1):
            st.write(f"### Page {i}\n{summary}\n")
    else:
        st.warning("No extractable text found in the PDF.")