louiecerv committed on
Commit
0e012ba
·
2 Parent(s): 49c45c3 ce590a9

Merge branch 'main' of https://huggingface.co/spaces/louiecerv/openai_pdf_multimodal

Browse files
Files changed (3) hide show
  1. README.md +2 -0
  2. app.py +38 -36
  3. temp_image_0_0.jpg +0 -0
README.md CHANGED
@@ -11,3 +11,5 @@ short_description: Implement the Multimodal for PDFs
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
14
+
15
+ # replace with a full README later
app.py CHANGED
@@ -3,7 +3,7 @@ import base64
3
  import io
4
  from io import BytesIO
5
  import tempfile
6
- import shutil
7
 
8
  import streamlit as st
9
  from PIL import Image
@@ -12,22 +12,27 @@ import fitz # PyMuPDF
12
  from openai import OpenAI
13
 
14
  # OpenAI API Key
15
- api_key = os.getenv("OPENAI_API_KEY")
16
-
17
- client = OpenAI(api_key=api_key)
 
 
 
18
 
19
  def extract_text_and_images_from_pdf(pdf_file_path):
20
  try:
21
  text_content = ""
22
  image_urls = []
23
 
 
 
24
  # Extract text using PdfReader
25
- pdf_reader = PdfReader(pdf_file_path)
26
  for page in pdf_reader.pages:
27
  text_content += page.extract_text()
28
 
29
  # Extract images using PyMuPDF
30
- doc = fitz.open(pdf_file_path)
31
  for page_index in range(len(doc)):
32
  page = doc.load_page(page_index)
33
  image_list = page.get_images()
@@ -39,32 +44,33 @@ def extract_text_and_images_from_pdf(pdf_file_path):
39
  # Resize image (optional)
40
  image.thumbnail((512, 512)) # Adjust size as needed
41
 
42
- # Encode the image as base64 and create a data URL
43
- buffered = io.BytesIO()
44
- image.save(buffered, format="JPEG")
45
- img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
46
- data_url = f"data:image/jpeg;base64,{img_str}"
47
- image_urls.append(data_url)
 
 
48
 
49
  return text_content, image_urls
50
  except Exception as e:
51
  st.error(f"An error occurred during PDF processing: {e}")
52
  return "", []
53
 
 
54
  def generate_ai_response(text_content, image_urls, text_prompt):
55
  try:
56
-
57
- if len(image_urls) > 0:
58
- # Construct the messages list with the prompt and base64-encoded image URLs
59
- messages = [
60
- {
61
- "role": "user",
62
- "content": [
63
- {"type": "text", "text": f"Perform this task {text_prompt} on the images:"},
64
- *[{"type": "image_url", "image_url": {"url": url}} for url in image_urls]
65
- ]
66
- }
67
- ]
68
 
69
  else:
70
  # Construct the prompt on the extracted text only
@@ -89,6 +95,7 @@ def generate_ai_response(text_content, image_urls, text_prompt):
89
  st.error(f"An error occurred during AI response generation: {e}")
90
  return ""
91
 
 
92
  def main():
93
  st.title("Multimodal PDF Processing using GPT-4 Turbo Model")
94
 
@@ -105,13 +112,7 @@ def main():
105
 
106
  uploaded_pdf = st.file_uploader("Upload a PDF", type=["pdf"])
107
  if uploaded_pdf is not None:
108
- # Save the uploaded PDF to a temporary directory
109
- temp_dir = tempfile.mkdtemp()
110
- pdf_file_path = os.path.join(temp_dir, uploaded_pdf.name)
111
- with open(pdf_file_path, "wb") as f:
112
- f.write(uploaded_pdf.getvalue())
113
-
114
- text_content, image_urls = extract_text_and_images_from_pdf(pdf_file_path)
115
 
116
  st.subheader("Extracted Text")
117
  st.text(text_content)
@@ -121,16 +122,17 @@ def main():
121
  if image_urls:
122
  st.subheader("Extracted Images")
123
  for img_url in image_urls:
124
- st.image(img_url, caption="Extracted Image", use_container_width=True)
 
 
125
 
126
  if st.button("Generate Response"):
127
  with st.spinner("Processing..."):
128
- ai_response = generate_ai_response(text_content, image_urls, text_prompt)
 
 
129
  st.success("Response generated!")
130
  st.markdown(f"AI Response: {ai_response}")
131
 
132
- # Clean up the temporary directory
133
- shutil.rmtree(temp_dir)
134
-
135
  if __name__ == "__main__":
136
  main()
 
3
  import io
4
  from io import BytesIO
5
  import tempfile
6
+ import time
7
 
8
  import streamlit as st
9
  from PIL import Image
 
12
  from openai import OpenAI
13
 
14
  # OpenAI API Key
15
+ try:
16
+ api_key = os.getenv("OPENAI_API_KEY")
17
+ client = OpenAI(api_key=api_key)
18
+ except Exception as e:
19
+ st.error(f"An error occurred during OpenAI client initialization: {e}")
20
+ st.stop()
21
 
22
  def extract_text_and_images_from_pdf(pdf_file_path):
23
  try:
24
  text_content = ""
25
  image_urls = []
26
 
27
+ pdf_stream = BytesIO(pdf_file.read())
28
+
29
  # Extract text using PdfReader
30
+ pdf_reader = PdfReader(pdf_stream)
31
  for page in pdf_reader.pages:
32
  text_content += page.extract_text()
33
 
34
  # Extract images using PyMuPDF
35
+ doc = fitz.open(stream=pdf_stream)
36
  for page_index in range(len(doc)):
37
  page = doc.load_page(page_index)
38
  image_list = page.get_images()
 
44
  # Resize image (optional)
45
  image.thumbnail((512, 512)) # Adjust size as needed
46
 
47
+ # Encode the image as base64 and create a data URL
48
+ with open(image_path, "rb") as f:
49
+ img_str = base64.b64encode(f.read()).decode("utf-8")
50
+ data_url = f"data:image/jpeg;base64,{img_str}"
51
+ image_urls.append(data_url)
52
+
53
+ # Close the fitz document
54
+ doc.close()
55
 
56
  return text_content, image_urls
57
  except Exception as e:
58
  st.error(f"An error occurred during PDF processing: {e}")
59
  return "", []
60
 
61
+
62
  def generate_ai_response(text_content, image_urls, text_prompt):
63
  try:
64
+ # Construct the messages list with the prompt and base64-encoded image URLs
65
+ messages = [
66
+ {
67
+ "role": "user",
68
+ "content": [
69
+ {"type": "text", "text": text_prompt},
70
+ *[{"type": "image_url", "image_url": {"url": url}} for url in image_urls]
71
+ ]
72
+ }
73
+ ]
 
 
74
 
75
  else:
76
  # Construct the prompt on the extracted text only
 
95
  st.error(f"An error occurred during AI response generation: {e}")
96
  return ""
97
 
98
+
99
  def main():
100
  st.title("Multimodal PDF Processing using GPT-4 Turbo Model")
101
 
 
112
 
113
  uploaded_pdf = st.file_uploader("Upload a PDF", type=["pdf"])
114
  if uploaded_pdf is not None:
115
+ text_content, image_urls = extract_text_and_images_from_pdf(uploaded_pdf)
 
 
 
 
 
 
116
 
117
  st.subheader("Extracted Text")
118
  st.text(text_content)
 
122
  if image_urls:
123
  st.subheader("Extracted Images")
124
  for img_url in image_urls:
125
+ st.image(
126
+ img_url, caption="Extracted Image", use_container_width=True
127
+ )
128
 
129
  if st.button("Generate Response"):
130
  with st.spinner("Processing..."):
131
+ ai_response = generate_ai_response(
132
+ text_content, image_urls, text_prompt
133
+ )
134
  st.success("Response generated!")
135
  st.markdown(f"AI Response: {ai_response}")
136
 
 
 
 
137
  if __name__ == "__main__":
138
  main()
temp_image_0_0.jpg DELETED
Binary file (41.1 kB)