Arslan17121 committed on
Commit
3616299
·
verified ·
1 Parent(s): ee73d7a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +40 -14
app.py CHANGED
@@ -1,8 +1,10 @@
1
  import streamlit as st
2
  import io
3
  import PyPDF2
4
- from transformers import pipeline
5
  from gtts import gTTS
 
 
6
 
7
  # Function to extract text from a PDF
8
  def extract_text_from_pdf(pdf_file):
@@ -13,19 +15,33 @@ def extract_text_from_pdf(pdf_file):
13
  text += page.extract_text() or "" # Handle None for non-text pages
14
  return text
15
 
16
- # Function to generate discussion points
17
  def generate_discussion_points(text):
18
- summarizer = pipeline('summarization')
19
  summary = summarizer(text, max_length=600, min_length=300, do_sample=False)
20
- return summary[0]['summary_text']
21
 
22
  # Function to convert text to speech
23
  def text_to_speech(text):
24
- tts = gTTS(text=text, lang='en')
25
  tts.save("discussion_points.mp3")
26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  # Streamlit app
28
- st.title("PDF Analysis and Discussion Generator")
29
  uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
30
 
31
  if uploaded_file is not None:
@@ -36,11 +52,21 @@ if uploaded_file is not None:
36
 
37
  # Generate and display discussion points
38
  st.subheader("Generated Discussion Points")
39
- discussion_points = generate_discussion_points(text)
40
- st.write(discussion_points)
41
-
42
- # Convert discussion points to audio and play it
43
- text_to_speech(discussion_points)
44
- audio_file = open("discussion_points.mp3", "rb")
45
- audio_bytes = audio_file.read()
46
- st.audio(audio_bytes, format="audio/mp3")
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
  import io
3
  import PyPDF2
4
+ from transformers import Pix2StructForConditionalGeneration, Pix2StructProcessor, pipeline
5
  from gtts import gTTS
6
+ from PIL import Image
7
+ from pdf2image import convert_from_bytes
8
 
9
  # Function to extract text from a PDF
10
  def extract_text_from_pdf(pdf_file):
 
15
  text += page.extract_text() or "" # Handle None for non-text pages
16
  return text
17
 
18
+ # Function to generate discussion points (summarization)
19
  def generate_discussion_points(text):
20
+ summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
21
  summary = summarizer(text, max_length=600, min_length=300, do_sample=False)
22
+ return summary[0]["summary_text"]
23
 
24
  # Function to convert text to speech
25
  def text_to_speech(text):
26
+ tts = gTTS(text=text, lang="en")
27
  tts.save("discussion_points.mp3")
28
 
29
+ # Function for document question answering
30
+ def answer_questions(pdf_file, question):
31
+ images = convert_from_bytes(pdf_file.read())
32
+ processor = Pix2StructProcessor.from_pretrained("google/pix2struct-docvqa-large")
33
+ model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-docvqa-large")
34
+
35
+ answers = []
36
+ for img in images:
37
+ inputs = processor(images=img, text=question, return_tensors="pt")
38
+ outputs = model.generate(**inputs)
39
+ answer = processor.decode(outputs[0], skip_special_tokens=True)
40
+ answers.append(answer)
41
+ return answers
42
+
43
  # Streamlit app
44
+ st.title("PDF Analysis Tool: Text, Summarization, and Q&A")
45
  uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
46
 
47
  if uploaded_file is not None:
 
52
 
53
  # Generate and display discussion points
54
  st.subheader("Generated Discussion Points")
55
+ if st.button("Generate Discussion Points"):
56
+ discussion_points = generate_discussion_points(text)
57
+ st.write(discussion_points)
58
+ text_to_speech(discussion_points)
59
+
60
+ # Play the audio
61
+ audio_file = open("discussion_points.mp3", "rb")
62
+ audio_bytes = audio_file.read()
63
+ st.audio(audio_bytes, format="audio/mp3")
64
+
65
+ # Q&A Section
66
+ st.subheader("Document Question Answering")
67
+ question = st.text_input("Ask a question about the document:")
68
+ if question:
69
+ answers = answer_questions(uploaded_file, question)
70
+ st.write("Answers:")
71
+ for page_num, answer in enumerate(answers, 1):
72
+ st.write(f"Page {page_num}: {answer}")