louiecerv committed
Commit 3114a1e · 1 Parent(s): 4a47ee5

Fixed the PDF and image processing

Files changed (3)
  1. app.py +77 -76
  2. requirements.txt +4 -2
  3. temp_image_0_0.jpg +0 -0
app.py CHANGED
@@ -1,57 +1,80 @@
 import os
 import base64
-import requests
-import streamlit as st
-from reportlab.pdfgen import canvas
-from reportlab.lib.pagesizes import letter
+import io
 from io import BytesIO
-from PyPDF2 import PdfReader
+
+import streamlit as st
 from PIL import Image
+from PyPDF2 import PdfReader
 import fitz  # PyMuPDF
+from openai import OpenAI

 # OpenAI API Key
-api_key = os.getenv("OPENAI_API_KEY")  # Ensure this environment variable is set
+api_key = os.getenv("OPENAI_API_KEY")

-headers = {
-    "Content-Type": "application/json",
-    "Authorization": f"Bearer {api_key}"
-}
+client = OpenAI(api_key=api_key)

 def extract_text_and_images_from_pdf(pdf_file):
-    """
-    Extracts text and images from a PDF file.
-
-    Args:
-        pdf_file (UploadedFile): The uploaded PDF file.
-
-    Returns:
-        tuple: A tuple containing the extracted text and images.
-    """
-    text_content = ""
-    images = []
-
-    # Convert UploadedFile to BytesIO for compatibility
-    pdf_stream = BytesIO(pdf_file.read())
-
-    # Extract text using PdfReader
-    pdf_reader = PdfReader(pdf_stream)
-    for page in pdf_reader.pages:
-        text_content += page.extract_text()
-
-    # Extract images using PyMuPDF
-    doc = fitz.open(stream=pdf_stream)
-    for page_index in range(len(doc)):
-        page = doc.load_page(page_index)
-        image_list = page.get_images()
-        for img_index, img in enumerate(image_list):
-            xref = img[0]
-            base_image = doc.extract_image(xref)
-            image_bytes = base_image["image"]
-            # Convert image bytes to a PIL Image object
-            image = Image.open(BytesIO(image_bytes))
-            images.append(image)
-
-    return text_content, images
+    try:
+        text_content = ""
+        image_urls = []
+
+        pdf_stream = BytesIO(pdf_file.read())
+
+        # Extract text using PdfReader
+        pdf_reader = PdfReader(pdf_stream)
+        for page in pdf_reader.pages:
+            text_content += page.extract_text()
+
+        # Extract images using PyMuPDF
+        doc = fitz.open(stream=pdf_stream)
+        for page_index in range(len(doc)):
+            page = doc.load_page(page_index)
+            image_list = page.get_images()
+            for img_index, img in enumerate(image_list):
+                xref = img[0]
+                base_image = doc.extract_image(xref)
+                image_bytes = base_image["image"]
+                image = Image.open(BytesIO(image_bytes))
+                # Resize image (optional)
+                image.thumbnail((512, 512))  # Adjust size as needed
+
+                # Encode the image as base64 and create a data URL
+                buffered = io.BytesIO()
+                image.save(buffered, format="JPEG")
+                img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
+                data_url = f"data:image/jpeg;base64,{img_str}"
+                image_urls.append(data_url)
+
+        return text_content, image_urls
+    except Exception as e:
+        st.error(f"An error occurred during PDF processing: {e}")
+        return "", []
+
+def generate_ai_response(text_content, image_urls, text_prompt):
+    try:
+        # Construct the messages list with the prompt and base64-encoded image URLs
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": text_prompt},
+                    *[{"type": "image_url", "image_url": {"url": url}} for url in image_urls]
+                ]
+            }
+        ]
+
+        response = client.chat.completions.create(
+            model="gpt-4o-mini",
+            messages=messages,
+            max_tokens=2048,
+        )
+
+        content_string = response.choices[0].message.content
+        return content_string
+    except Exception as e:
+        st.error(f"An error occurred during AI response generation: {e}")
+        return ""

 def main():
     st.title("Multimodal PDF Processing using GPT-4 Turbo Model")
@@ -67,47 +90,25 @@ def main():

     st.write("Upload a PDF file for analysis.")

-    # File upload for PDF
     uploaded_pdf = st.file_uploader("Upload a PDF", type=["pdf"])
     if uploaded_pdf is not None:
-        text_content, images = extract_text_and_images_from_pdf(uploaded_pdf)
+        text_content, image_urls = extract_text_and_images_from_pdf(uploaded_pdf)

-        # Display extracted text
         st.subheader("Extracted Text")
         st.text(text_content)

-        # Display extracted images
-        if images:
+        text_prompt = st.text_area("Enter a text prompt for the AI model:", "")
+
+        if image_urls:
             st.subheader("Extracted Images")
-            for img in images:
-                st.image(img, caption="Extracted Image", use_container_width=True)
-
-        # Prepare the multimodal payload
-        payload = {
-            "model": "gpt-4-turbo",
-            "messages": [
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "text", "text": text_content}
-                        # Images can be added here if extracted
-                    ]
-                }
-            ],
-            "max_tokens": 2048,
-        }
+            for img_url in image_urls:
+                st.image(img_url, caption="Extracted Image", use_container_width=True)

         if st.button("Generate Response"):
             with st.spinner("Processing..."):
-                response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
-
-                if response.status_code != 200:
-                    st.error(f"Error: {response.status_code} - {response.text}")
-                else:
-                    content = response.json()
-                    content_string = content['choices'][0]['message']['content']
-                    st.success("Response generated!")
-                    st.markdown(f"AI Response: {content_string}")
+                ai_response = generate_ai_response(text_content, image_urls, text_prompt)
+                st.success("Response generated!")
+                st.markdown(f"AI Response: {ai_response}")

 if __name__ == "__main__":
-    main()
+    main()
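
For reference, the message shape the new generate_ai_response() relies on — one user turn carrying a text prompt plus base64 data URLs — can be exercised outside Streamlit with a minimal sketch like the one below. It mirrors the committed code, but the image path, prompt, and token limit are placeholders, and OPENAI_API_KEY is assumed to be set in the environment.

    # Minimal sketch (not part of the commit): encode one image as a data URL
    # and send it with a text prompt, the same shape generate_ai_response() builds.
    import base64
    import os
    from io import BytesIO

    from openai import OpenAI
    from PIL import Image

    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

    image = Image.open("example.jpg").convert("RGB")  # placeholder image path; RGB so JPEG encoding cannot fail
    image.thumbnail((512, 512))                       # match the resize used in app.py
    buffered = BytesIO()
    image.save(buffered, format="JPEG")
    data_url = "data:image/jpeg;base64," + base64.b64encode(buffered.getvalue()).decode("utf-8")

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image."},   # placeholder prompt
                {"type": "image_url", "image_url": {"url": data_url}},
            ],
        }],
        max_tokens=256,
    )
    print(response.choices[0].message.content)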
requirements.txt CHANGED
@@ -1,5 +1,7 @@
 streamlit
-requests
+PyPDF2
+PyMuPDF
 Pillow
+requests
 reportlab
-PyPDF2
+openai
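
The updated list covers every import in the new app.py (note that the PyMuPDF package is imported as fitz). A quick sanity-check sketch, assuming the packages installed cleanly:

    # Sanity-check sketch (not part of the commit): the new app.py imports
    # resolve only if these distributions are present.
    import fitz            # provided by PyMuPDF
    import PyPDF2
    import streamlit
    from openai import OpenAI
    from PIL import Image  # provided by Pillow

    print("All dependencies imported successfully.")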
temp_image_0_0.jpg ADDED