zamal commited on
Commit
c289504
·
verified ·
1 Parent(s): 9efba83

Upload 2 files

Browse files
Files changed (2) hide show
  1. application.py +38 -0
  2. main.py +39 -0
application.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import streamlit as st
from main import read_pdf, extract_key_phrases, score_sentences, summarize_text
import io
import base64  # NOTE(review): currently unused — kept to avoid changing the module's imports

# Streamlit front end for the PDF bullet-point summarizer.
# (The original title/subheader emojis were mojibake; re-encoded as proper UTF-8.)
st.title("🚀 PDF to Bullet Point Summarizer 🗟 🔍")

# File uploader for the PDF
uploaded_file = st.file_uploader("Upload your PDF document", type="pdf")

# Slider for users to select the summarization extent (% of sentences to keep)
summary_scale = st.slider("Select the extent of summarization (%)", min_value=1, max_value=100, value=20)

if uploaded_file is not None:
    with st.spinner('Processing...'):
        # Read the PDF content from the uploaded bytes
        text = read_pdf(io.BytesIO(uploaded_file.getvalue()))

        # Extract key phrases from the text
        key_phrases = extract_key_phrases(text)

        # Score sentences based on the key phrases
        sentence_scores = score_sentences(text, key_phrases)

        # Number of bullet points: proportional to the selected scale, at least 1
        total_sentences = len(sentence_scores)
        num_points = max(1, total_sentences * summary_scale // 100)

        # Generate the bullet-point summary
        summary = summarize_text(sentence_scores, num_points=num_points)

        # Display the summary as bullet points
        st.subheader("Here's the summary 💯: ")
        st.markdown(summary)
main.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import PyPDF2
2
+ import spacy
3
+ from collections import Counter
4
+ import heapq
5
+ import io
6
+
# Load the small English spaCy pipeline once at import time;
# shared by all helpers below (requires `en_core_web_sm` to be installed).
nlp = spacy.load("en_core_web_sm")
9
+
10
def read_pdf(file_stream):
    """Extract and concatenate the text of every page of a PDF.

    Args:
        file_stream: Binary file-like object positioned at the start of a PDF.

    Returns:
        All page texts joined by single spaces, stripped of leading/trailing
        whitespace; empty string for a PDF with no extractable text.
    """
    reader = PyPDF2.PdfReader(file_stream)
    pages = []
    for page in reader.pages:
        # extract_text() can yield None (older PyPDF2) or "" for image-only
        # pages; the original `text += page.extract_text() + ' '` would raise
        # TypeError on None. Guard with `or ''`.
        pages.append(page.extract_text() or '')
    return ' '.join(pages).strip()
16
+
17
def extract_key_phrases(text):
    """Collect candidate key phrases from *text*.

    Candidates are every noun chunk followed by every named entity found by
    the module-level spaCy pipeline; duplicates are kept.
    """
    doc = nlp(text)
    phrases = [chunk.text for chunk in doc.noun_chunks]
    phrases.extend(entity.text for entity in doc.ents)
    return phrases
22
+
23
def score_sentences(text, key_phrases):
    """Score each sentence of *text* by key-phrase occurrences.

    A sentence's score is the number of entries in *key_phrases* that appear
    in it as a substring (duplicate phrases count multiple times).

    Returns:
        Dict mapping spaCy sentence spans to integer scores; sentences that
        contain no key phrase are omitted entirely.
    """
    scores = {}
    for sentence in nlp(text).sents:
        sentence_text = sentence.text
        for phrase in key_phrases:
            if phrase in sentence_text:
                scores[sentence] = scores.get(sentence, 0) + 1
    return scores
34
+
35
+ def summarize_text(sentence_scores, num_points=5):
36
+ summary_sentences = heapq.nlargest(num_points, sentence_scores, key=sentence_scores.get)
37
+ # Format summary as bullet points
38
+ summary = '\n'.join([f"- {sent.text}" for sent in summary_sentences])
39
+ return summary