louiecerv committed on
Commit
73268c4
·
1 Parent(s): 3114a1e

Updated the program to use temp dir

Browse files
Files changed (2) hide show
  1. README.md +2 -0
  2. app.py +61 -31
README.md CHANGED
@@ -11,3 +11,5 @@ short_description: Implement the Multimodal for PDFs
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
14
+
15
+ # replace with a full README later
app.py CHANGED
@@ -2,6 +2,8 @@ import os
2
  import base64
3
  import io
4
  from io import BytesIO
 
 
5
 
6
  import streamlit as st
7
  from PIL import Image
@@ -14,43 +16,60 @@ api_key = os.getenv("OPENAI_API_KEY")
14
 
15
  client = OpenAI(api_key=api_key)
16
 
 
17
  def extract_text_and_images_from_pdf(pdf_file):
18
  try:
19
  text_content = ""
20
  image_urls = []
21
 
22
- pdf_stream = BytesIO(pdf_file.read())
23
-
24
- # Extract text using PdfReader
25
- pdf_reader = PdfReader(pdf_stream)
26
- for page in pdf_reader.pages:
27
- text_content += page.extract_text()
28
-
29
- # Extract images using PyMuPDF
30
- doc = fitz.open(stream=pdf_stream)
31
- for page_index in range(len(doc)):
32
- page = doc.load_page(page_index)
33
- image_list = page.get_images()
34
- for img_index, img in enumerate(image_list):
35
- xref = img[0]
36
- base_image = doc.extract_image(xref)
37
- image_bytes = base_image["image"]
38
- image = Image.open(BytesIO(image_bytes))
39
- # Resize image (optional)
40
- image.thumbnail((512, 512)) # Adjust size as needed
41
-
42
- # Encode the image as base64 and create a data URL
43
- buffered = io.BytesIO()
44
- image.save(buffered, format="JPEG")
45
- img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
46
- data_url = f"data:image/jpeg;base64,{img_str}"
47
- image_urls.append(data_url)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
 
49
  return text_content, image_urls
50
  except Exception as e:
51
  st.error(f"An error occurred during PDF processing: {e}")
52
  return "", []
53
 
 
54
  def generate_ai_response(text_content, image_urls, text_prompt):
55
  try:
56
  # Construct the messages list with the prompt and base64-encoded image URLs
@@ -59,8 +78,11 @@ def generate_ai_response(text_content, image_urls, text_prompt):
59
  "role": "user",
60
  "content": [
61
  {"type": "text", "text": text_prompt},
62
- *[{"type": "image_url", "image_url": {"url": url}} for url in image_urls]
63
- ]
 
 
 
64
  }
65
  ]
66
 
@@ -76,6 +98,7 @@ def generate_ai_response(text_content, image_urls, text_prompt):
76
  st.error(f"An error occurred during AI response generation: {e}")
77
  return ""
78
 
 
79
  def main():
80
  st.title("Multimodal PDF Processing using GPT-4 Turbo Model")
81
 
@@ -92,7 +115,9 @@ def main():
92
 
93
  uploaded_pdf = st.file_uploader("Upload a PDF", type=["pdf"])
94
  if uploaded_pdf is not None:
95
- text_content, image_urls = extract_text_and_images_from_pdf(uploaded_pdf)
 
 
96
 
97
  st.subheader("Extracted Text")
98
  st.text(text_content)
@@ -102,13 +127,18 @@ def main():
102
  if image_urls:
103
  st.subheader("Extracted Images")
104
  for img_url in image_urls:
105
- st.image(img_url, caption="Extracted Image", use_container_width=True)
 
 
106
 
107
  if st.button("Generate Response"):
108
  with st.spinner("Processing..."):
109
- ai_response = generate_ai_response(text_content, image_urls, text_prompt)
 
 
110
  st.success("Response generated!")
111
  st.markdown(f"AI Response: {ai_response}")
112
 
 
113
  if __name__ == "__main__":
114
  main()
 
2
  import base64
3
  import io
4
  from io import BytesIO
5
+ import tempfile
6
+ import time
7
 
8
  import streamlit as st
9
  from PIL import Image
 
16
 
17
  client = OpenAI(api_key=api_key)
18
 
19
+
20
def extract_text_and_images_from_pdf(pdf_file):
    """Extract the text and the embedded images from an uploaded PDF.

    The upload is written to a temporary directory so that both PdfReader
    (text) and PyMuPDF (images) can open it from disk. Every embedded image
    is shrunk to fit within 512x512, saved as a JPEG in the same temporary
    directory and returned as a base64 ``data:`` URL.

    Args:
        pdf_file: Binary file-like object whose ``read()`` returns the PDF
            bytes.

    Returns:
        A ``(text_content, image_urls)`` tuple: the concatenated text of all
        pages and a list of ``data:image/jpeg;base64,...`` strings. If
        anything fails, the error is shown with ``st.error`` and
        ``("", [])`` is returned.
    """
    try:
        page_texts = []
        image_urls = []

        with tempfile.TemporaryDirectory() as temp_dir:
            pdf_path = os.path.join(temp_dir, "uploaded.pdf")
            with open(pdf_path, "wb") as f:
                f.write(pdf_file.read())

            # Extract text using PdfReader
            with open(pdf_path, "rb") as pdf_stream:
                pdf_reader = PdfReader(pdf_stream)
                for page in pdf_reader.pages:
                    # Guard against a page that yields no text object so
                    # the concatenation below cannot fail on None.
                    page_texts.append(page.extract_text() or "")

            # No delay is needed here: the ``with`` block above has already
            # closed the file handle before PyMuPDF opens the same path.

            # Extract images using PyMuPDF
            doc = fitz.open(pdf_path)
            try:
                for page_index in range(len(doc)):
                    page = doc.load_page(page_index)
                    for img_index, img in enumerate(page.get_images()):
                        xref = img[0]
                        base_image = doc.extract_image(xref)
                        image_bytes = base_image["image"]
                        image = Image.open(BytesIO(image_bytes))
                        # Resize image (optional)
                        image.thumbnail((512, 512))  # Adjust size as needed

                        # JPEG has no alpha channel or palette; saving an
                        # RGBA/P/LA image directly raises, so normalise
                        # everything except RGB and greyscale to RGB first.
                        if image.mode not in ("RGB", "L"):
                            image = image.convert("RGB")

                        # Save the image to the temporary directory
                        image_path = os.path.join(
                            temp_dir, f"image_{page_index}_{img_index}.jpg"
                        )
                        image.save(image_path, format="JPEG")

                        # Encode the image as base64 and create a data URL
                        with open(image_path, "rb") as f:
                            img_str = base64.b64encode(f.read()).decode("utf-8")
                        image_urls.append(f"data:image/jpeg;base64,{img_str}")
            finally:
                # Always release the document, even when an image fails, so
                # the temporary directory can be removed cleanly.
                doc.close()

        return "".join(page_texts), image_urls
    except Exception as e:
        st.error(f"An error occurred during PDF processing: {e}")
        return "", []
71
 
72
+
73
  def generate_ai_response(text_content, image_urls, text_prompt):
74
  try:
75
  # Construct the messages list with the prompt and base64-encoded image URLs
 
78
  "role": "user",
79
  "content": [
80
  {"type": "text", "text": text_prompt},
81
+ *[
82
+ {"type": "image_url", "image_url": {"url": url}}
83
+ for url in image_urls
84
+ ],
85
+ ],
86
  }
87
  ]
88
 
 
98
  st.error(f"An error occurred during AI response generation: {e}")
99
  return ""
100
 
101
+
102
  def main():
103
  st.title("Multimodal PDF Processing using GPT-4 Turbo Model")
104
 
 
115
 
116
  uploaded_pdf = st.file_uploader("Upload a PDF", type=["pdf"])
117
  if uploaded_pdf is not None:
118
+ text_content, image_urls = extract_text_and_images_from_pdf(
119
+ uploaded_pdf
120
+ )
121
 
122
  st.subheader("Extracted Text")
123
  st.text(text_content)
 
127
  if image_urls:
128
  st.subheader("Extracted Images")
129
  for img_url in image_urls:
130
+ st.image(
131
+ img_url, caption="Extracted Image", use_container_width=True
132
+ )
133
 
134
  if st.button("Generate Response"):
135
  with st.spinner("Processing..."):
136
+ ai_response = generate_ai_response(
137
+ text_content, image_urls, text_prompt
138
+ )
139
  st.success("Response generated!")
140
  st.markdown(f"AI Response: {ai_response}")
141
 
142
+
143
  if __name__ == "__main__":
144
  main()