Muzammil6376 committed on
Commit fd644c0 · verified · 1 Parent(s): 7c301cc

Update app.py

Files changed (1)
  1. app.py +107 -298
app.py CHANGED
@@ -1,341 +1,198 @@
 import os
-import gradio as gr
 import tempfile
 from pathlib import Path
 import base64
-import fitz  # PyMuPDF - works on HF Spaces without additional dependencies
 from PIL import Image
 import io

-# Import vectorstore and embeddings from langchain community package
 from langchain_community.vectorstores import FAISS
-from langchain_community.embeddings import HuggingFaceEmbeddings
-# Text splitter to break large documents into manageable chunks
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-# HF Inference client for multimodal model
-from huggingface_hub import InferenceClient

 # ── Globals ───────────────────────────────────────────────────────────────────
-index = None              # FAISS index storing document embeddings
-retriever = None          # Retriever to fetch relevant chunks
-current_pdf_name = None   # Name of the currently loaded PDF
-extracted_content = None  # Combined text and image descriptions
-extracted_images = []     # Store image paths for multimodal queries

 # ── Single Multimodal Model ──────────────────────────────────────────────────
-# Using a single multimodal model that can handle both text and images
 multimodal_client = InferenceClient(model="microsoft/Phi-3.5-vision-instruct")
-
-# ── Multimodal Embeddings ────────────────────────────────────────────────────
-# Using CLIP-based embeddings that can handle both text and images
 embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/clip-ViT-B-32")

-# Create temporary directories for processing
 temp_dir = tempfile.mkdtemp()
 figures_dir = os.path.join(temp_dir, "figures")
 os.makedirs(figures_dir, exist_ok=True)

 def encode_image_to_base64(image_path):
-    """Convert image to base64 for API calls"""
     with open(image_path, "rb") as image_file:
         return base64.b64encode(image_file.read()).decode('utf-8')

 def extract_images_from_pdf_pymupdf(pdf_path):
-    """
-    Extract images from PDF using PyMuPDF (works on HF Spaces)
-    Args:
-        pdf_path: Path to the PDF file
-    Returns:
-        List of image paths and their descriptions
-    """
     extracted_images = []
     image_descriptions = []
-
     try:
-        # Open PDF with PyMuPDF
         pdf_document = fitz.open(pdf_path)
-
         for page_num in range(len(pdf_document)):
             page = pdf_document.load_page(page_num)
-            image_list = page.get_images()
-
-            for img_index, img in enumerate(image_list):
-                # Get image data
                 xref = img[0]
                 pix = fitz.Pixmap(pdf_document, xref)
-
-                # Convert to PIL Image
-                if pix.n - pix.alpha < 4:  # GRAY or RGB
                     img_data = pix.tobytes("png")
                     img_pil = Image.open(io.BytesIO(img_data))
-
-                    # Save image
                     image_filename = f"page_{page_num}_img_{img_index}.png"
                     image_path = os.path.join(figures_dir, image_filename)
                     img_pil.save(image_path)
-
-                    # Analyze image with multimodal model
-                    description = analyze_image_with_multimodal_model(image_path)
-
                     extracted_images.append(image_path)
-                    image_descriptions.append(description)
-
-                pix = None  # Free memory
-
         pdf_document.close()
         return extracted_images, image_descriptions
-
     except Exception as e:
         print(f"Error extracting images: {e}")
         return [], []

 def analyze_image_with_multimodal_model(image_path):
-    """
-    Analyze an extracted image using the multimodal model.
-    Args:
-        image_path: Path to the extracted image file
-    Returns:
-        Text description of the image content
-    """
     try:
-        # Encode image to base64
-        image_base64 = encode_image_to_base64(image_path)
-
-        # Simple text-based prompt for HF Inference API
-        prompt = f"""Analyze this image and provide a detailed description. Include any text, data, charts, diagrams, tables, or important visual elements you can see. Be specific and comprehensive.
-
-Image: [Image data provided]
-
-Description:"""
-
-        # Use multimodal model for image analysis
-        # Note: Simplified for HF Spaces compatibility
-        response = multimodal_client.text_generation(
-            prompt=prompt,
-            max_new_tokens=200,
-            temperature=0.3
         )
-
-        description = response.strip()
-        return f"[IMAGE CONTENT]: {description}"
-
     except Exception as e:
-        return f"[IMAGE CONTENT]: Could not analyze image - {str(e)}"

 def process_pdf_multimodal(pdf_file):
-    """
-    Process PDF using PyMuPDF (HF Spaces compatible).
-    """
     global current_pdf_name, index, retriever, extracted_content, extracted_images
-
     if pdf_file is None:
         return None, "❌ Please upload a PDF file.", gr.update(interactive=False)

     current_pdf_name = os.path.basename(pdf_file.name)
-
     try:
-        # Clear previous data
-        extracted_images.clear()
-        for file in os.listdir(figures_dir):
-            os.remove(os.path.join(figures_dir, file))
-
-        # Extract text using PyMuPDF
         pdf_document = fitz.open(pdf_file.name)
         text_elements = []
-
-        for page_num in range(len(pdf_document)):
-            page = pdf_document.load_page(page_num)
-            text = page.get_text()
-            if text.strip():
-                text_elements.append(f"[PAGE {page_num + 1}]\n{text.strip()}")
-
         pdf_document.close()
-
-        # Extract images using PyMuPDF
-        image_paths, image_descriptions = extract_images_from_pdf_pymupdf(pdf_file.name)
-        extracted_images.extend(image_paths)
-
-        # Combine all content
-        all_content = text_elements + image_descriptions
         extracted_content = "\n\n".join(all_content)
-
-        if not extracted_content.strip():
-            return current_pdf_name, "❌ No content could be extracted from the PDF.", gr.update(interactive=False)
-
-        # Split into chunks for embedding
-        text_splitter = RecursiveCharacterTextSplitter(
-            chunk_size=1000,
-            chunk_overlap=200,
-            add_start_index=True
         )
-        chunks = text_splitter.split_text(extracted_content)
-
-        # Create FAISS index with multimodal embeddings
         index = FAISS.from_texts(chunks, embeddings)
         retriever = index.as_retriever(search_kwargs={"k": 3})
-
-        # Status message
-        num_images = len(image_descriptions)
-        num_text_pages = len(text_elements)
-        status = f"✅ Processed '{current_pdf_name}' — {len(chunks)} chunks ({num_text_pages} pages, {num_images} images analyzed)"
-
         return current_pdf_name, status, gr.update(interactive=True)
-
     except Exception as e:
-        error_msg = f"❌ Error processing PDF: {str(e)}"
-        return current_pdf_name, error_msg, gr.update(interactive=False)

 def ask_multimodal_question(pdf_name, question):
-    """
-    Answer questions using the single multimodal model with retrieved context.
-    """
-    global retriever, extracted_images
-
-    if index is None or retriever is None:
         return "❌ Please upload and process a PDF first."
-
     if not question.strip():
         return "❌ Please enter a question."
-
-    try:
-        # Retrieve relevant chunks
-        docs = retriever.get_relevant_documents(question)
-        context = "\n\n".join(doc.page_content for doc in docs)
-
-        # Create prompt for text generation
-        prompt = f"""You are an AI assistant analyzing a document that contains both text and visual elements.
-
-RETRIEVED CONTEXT:
-{context}
-
-QUESTION: {question}

-Please provide a comprehensive answer based on the retrieved context above. The context includes both textual information and descriptions of images, charts, tables, and other visual elements from the document.
-
-If your answer references visual elements (charts, graphs, images, tables), mention that explicitly. Keep your response focused and informative.
-
-ANSWER:"""
-
-        # Generate response with multimodal model
-        response = multimodal_client.text_generation(
-            prompt=prompt,
-            max_new_tokens=300,
-            temperature=0.5
         )
-
-        return response.strip()
-
     except Exception as e:
-        return f"❌ Error generating answer: {str(e)}"

 def generate_multimodal_summary():
-    """
-    Generate summary using the multimodal model.
-    """
     if not extracted_content:
         return "❌ Please upload and process a PDF first."
-
     try:
-        # Use first 4000 characters for summary
-        content_preview = extracted_content[:4000]
-
         messages = [
-            {
-                "role": "user",
-                "content": [
-                    {
-                        "type": "text",
-                        "text": f"""Please provide a comprehensive summary of this document content. The content includes both textual information and descriptions of visual elements (images, charts, tables, diagrams).
-
-DOCUMENT CONTENT:
-{content_preview}
-
-Create a well-structured summary that captures:
-1. Main topics and key points from the text
-2. Important information from visual elements (charts, images, tables)
-3. Overall document purpose and conclusions
-
-SUMMARY:"""
-                    }
-                ]
-            }
         ]
-
-        response = multimodal_client.chat_completion(
-            messages=messages,
-            max_tokens=250,
-            temperature=0.3
         )
-
-        return response["choices"][0]["message"]["content"].strip()
-
     except Exception as e:
-        return f"❌ Error generating summary: {str(e)}"

 def extract_multimodal_keywords():
-    """
-    Extract keywords using the multimodal model.
-    """
     if not extracted_content:
         return "❌ Please upload and process a PDF first."
-
     try:
-        content_preview = extracted_content[:3000]
-
         messages = [
-            {
-                "role": "user",
-                "content": [
-                    {
-                        "type": "text",
-                        "text": f"""Analyze the following document content and extract 12-15 key terms, concepts, and important phrases. The content includes both text and descriptions of visual elements.
-
-DOCUMENT CONTENT:
-{content_preview}
-
-Extract key terms that represent:
-- Main topics and concepts
-- Important technical terms
-- Key findings or data points
-- Visual elements mentioned (chart types, image subjects)
-
-Format as a comma-separated list.
-
-KEY TERMS:"""
-                    }
-                ]
-            }
         ]
-
-        response = multimodal_client.chat_completion(
-            messages=messages,
-            max_tokens=120,
-            temperature=0.3
         )
-
-        return response["choices"][0]["message"]["content"].strip()
-
     except Exception as e:
-        return f"❌ Error extracting keywords: {str(e)}"

 def clear_multimodal_interface():
-    """
-    Reset all global state and clear UI.
-    """
     global index, retriever, current_pdf_name, extracted_content, extracted_images
-
-    # Clear figures directory
-    try:
-        for file in os.listdir(figures_dir):
-            os.remove(os.path.join(figures_dir, file))
-    except:
-        pass
-
-    # Reset globals
     index = retriever = None
     current_pdf_name = extracted_content = None
     extracted_images.clear()
-
     return None, "", gr.update(interactive=False)

 # ── Gradio UI ────────────────────────────────────────────────────────────────
@@ -345,37 +202,12 @@ with gr.Blocks(theme=theme, css="""
     .container { border-radius: 10px; padding: 15px; }
     .pdf-active { border-left: 3px solid #6366f1; padding-left: 10px; background-color: rgba(99,102,241,0.1); }
     .footer { text-align: center; margin-top: 30px; font-size: 0.8em; color: #666; }
-    .main-title {
-        text-align: center;
-        font-size: 64px;
-        font-weight: bold;
-        margin-bottom: 20px;
-    }
-    .multimodal-badge {
-        background: linear-gradient(45deg, #6366f1, #8b5cf6);
-        color: white;
-        padding: 5px 15px;
-        border-radius: 20px;
-        font-size: 14px;
-        display: inline-block;
-        margin: 10px auto;
-    }
-    .model-info {
-        background: #f8fafc;
-        border: 1px solid #e2e8f0;
-        border-radius: 8px;
-        padding: 10px;
-        margin: 10px 0;
-        font-size: 12px;
-        color: #64748b;
-    }
 """) as demo:
-
-    # Application title with multimodal badge
     gr.Markdown("<div class='main-title'>Unified MultiModal RAG</div>")
-    gr.Markdown("<div style='text-align: center;'><span class='multimodal-badge'>🧠 Single Model • Text + Vision</span></div>")
-
-    # Model information
     gr.Markdown("""
     <div class='model-info'>
    <strong>🤖 Powered by:</strong> Microsoft Phi-3.5-Vision + CLIP Embeddings + PyMuPDF (HF Spaces Compatible)
@@ -389,19 +221,12 @@ with gr.Blocks(theme=theme, css="""
             pdf_file = gr.File(file_types=[".pdf"], type="filepath", label="Upload PDF (with images/charts)")
             upload_button = gr.Button("🔄 Process with Multimodal AI", variant="primary")
             status_box = gr.Textbox(label="Processing Status", interactive=False)
-
         with gr.Column():
             gr.Markdown("## ❓ Ask Questions")
-            gr.Markdown("*Single AI model understands both text and visual content*")
-            question_input = gr.Textbox(
-                lines=3,
-                placeholder="Ask about text content, images, charts, tables, or any visual elements...",
-                interactive=False
-            )
             ask_button = gr.Button("🔍 Ask Multimodal AI", variant="primary")
             answer_output = gr.Textbox(label="AI Response", lines=8, interactive=False)

-    # Analysis tools
     with gr.Row():
         with gr.Column():
             summary_button = gr.Button("📋 Generate Summary", variant="secondary")
@@ -410,34 +235,18 @@ with gr.Blocks(theme=theme, css="""
             keywords_button = gr.Button("🏷️ Extract Keywords", variant="secondary")
             keywords_output = gr.Textbox(label="Key Terms", lines=4, interactive=False)

-    # Clear button
     clear_button = gr.Button("🗑️ Clear All", variant="secondary")
-
     gr.Markdown("""
     <div class='footer'>
-    <strong>Unified Multimodal Pipeline:</strong> One model handles text analysis, image understanding, and question answering<br>
-    Supports: Text • Images • Charts • Tables • Diagrams • Mixed Content Queries
     </div>
     """)

-    # Event bindings
-    upload_button.click(
-        process_pdf_multimodal,
-        [pdf_file],
-        [pdf_display, status_box, question_input]
-    )
-    ask_button.click(
-        ask_multimodal_question,
-        [pdf_display, question_input],
-        answer_output
-    )
     summary_button.click(generate_multimodal_summary, [], summary_output)
     keywords_button.click(extract_multimodal_keywords, [], keywords_output)
-    clear_button.click(
-        clear_multimodal_interface,
-        [],
-        [pdf_file, pdf_display, question_input]
-    )

 if __name__ == "__main__":
-    demo.launch(debug=True, share=True)
 
+# app.py
 import os
 import tempfile
 from pathlib import Path
 import base64
+import fitz  # PyMuPDF
 from PIL import Image
 import io

+import gradio as gr
+from huggingface_hub import InferenceClient
+
+# Import vectorstore and embeddings from updated packages
 from langchain_community.vectorstores import FAISS
+from langchain_huggingface import HuggingFaceEmbeddings
 from langchain.text_splitter import RecursiveCharacterTextSplitter

 # ── Globals ───────────────────────────────────────────────────────────────────
+index = None
+retriever = None
+current_pdf_name = None
+extracted_content = None
+extracted_images = []

 # ── Single Multimodal Model ──────────────────────────────────────────────────
 multimodal_client = InferenceClient(model="microsoft/Phi-3.5-vision-instruct")
 embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/clip-ViT-B-32")

+# Create temp dirs
 temp_dir = tempfile.mkdtemp()
 figures_dir = os.path.join(temp_dir, "figures")
 os.makedirs(figures_dir, exist_ok=True)

 def encode_image_to_base64(image_path):
     with open(image_path, "rb") as image_file:
         return base64.b64encode(image_file.read()).decode('utf-8')

 def extract_images_from_pdf_pymupdf(pdf_path):
     extracted_images = []
     image_descriptions = []
     try:
         pdf_document = fitz.open(pdf_path)
         for page_num in range(len(pdf_document)):
             page = pdf_document.load_page(page_num)
+            for img_index, img in enumerate(page.get_images()):
                 xref = img[0]
                 pix = fitz.Pixmap(pdf_document, xref)
+                if pix.n - pix.alpha < 4:
                     img_data = pix.tobytes("png")
                     img_pil = Image.open(io.BytesIO(img_data))
                     image_filename = f"page_{page_num}_img_{img_index}.png"
                     image_path = os.path.join(figures_dir, image_filename)
                     img_pil.save(image_path)
+                    desc = analyze_image_with_multimodal_model(image_path)
                     extracted_images.append(image_path)
+                    image_descriptions.append(desc)
+                pix = None
         pdf_document.close()
         return extracted_images, image_descriptions
     except Exception as e:
         print(f"Error extracting images: {e}")
         return [], []

 def analyze_image_with_multimodal_model(image_path):
     try:
+        b64 = encode_image_to_base64(image_path)
+        prompt = (
+            "Analyze this image and provide a detailed description. Include any text, data, "
+            "charts, diagrams, tables, or important visual elements you can see.\n"
+            "Image: [Image data provided]\nDescription:"
         )
+        resp = multimodal_client.text_generation(
+            prompt=prompt, max_new_tokens=200, temperature=0.3
+        )
+        return "[IMAGE CONTENT]: " + resp.strip()
     except Exception as e:
+        return f"[IMAGE CONTENT]: Could not analyze image - {e}"

 def process_pdf_multimodal(pdf_file):
     global current_pdf_name, index, retriever, extracted_content, extracted_images
     if pdf_file is None:
         return None, "❌ Please upload a PDF file.", gr.update(interactive=False)

     current_pdf_name = os.path.basename(pdf_file.name)
+    extracted_images.clear()
+    for f in os.listdir(figures_dir):
+        os.remove(os.path.join(figures_dir, f))
+
     try:
+        # Text extraction
         pdf_document = fitz.open(pdf_file.name)
         text_elements = []
+        for i in range(len(pdf_document)):
+            p = pdf_document.load_page(i)
+            t = p.get_text().strip()
+            if t:
+                text_elements.append(f"[PAGE {i+1}]\n{t}")
         pdf_document.close()
+
+        # Image extraction & analysis
+        imgs, img_descs = extract_images_from_pdf_pymupdf(pdf_file.name)
+        extracted_images.extend(imgs)
+
+        # Combine content and split
+        all_content = text_elements + img_descs
         extracted_content = "\n\n".join(all_content)
+        if not extracted_content:
+            return current_pdf_name, "❌ No content extracted.", gr.update(interactive=False)
+
+        splitter = RecursiveCharacterTextSplitter(
+            chunk_size=1000, chunk_overlap=200, add_start_index=True
         )
+        chunks = splitter.split_text(extracted_content)
+
         index = FAISS.from_texts(chunks, embeddings)
         retriever = index.as_retriever(search_kwargs={"k": 3})
+
+        status = (
+            f"✅ Processed '{current_pdf_name}' — "
+            f"{len(chunks)} chunks "
+            f"({len(text_elements)} pages, {len(img_descs)} images analyzed)"
+        )
         return current_pdf_name, status, gr.update(interactive=True)
+
     except Exception as e:
+        return current_pdf_name, f"❌ Error processing PDF: {e}", gr.update(interactive=False)

 def ask_multimodal_question(pdf_name, question):
+    global retriever
+    if not retriever:
         return "❌ Please upload and process a PDF first."
     if not question.strip():
         return "❌ Please enter a question."

+    try:
+        docs = retriever.invoke(question)
+        context = "\n\n".join(d.page_content for d in docs)
+        prompt = (
+            "You are an AI assistant analyzing a document that contains both text and visual elements.\n\n"
+            f"RETRIEVED CONTEXT:\n{context}\n\n"
+            f"QUESTION: {question}\n"
+            "Please provide a comprehensive answer based on the retrieved context above. "
+            "If you reference visual elements, mention them explicitly.\nANSWER:"
+        )
+        resp = multimodal_client.text_generation(
+            prompt=prompt, max_new_tokens=300, temperature=0.5
         )
+        return resp.strip()
     except Exception as e:
+        return f"❌ Error generating answer: {e}"

 def generate_multimodal_summary():
     if not extracted_content:
         return "❌ Please upload and process a PDF first."
     try:
+        preview = extracted_content[:4000]
         messages = [
+            {"role":"user","content":[{"type":"text","text":
+                "Please provide a comprehensive summary of this document content. The content includes both textual "
+                f"information and descriptions of visual elements.\n\nDOCUMENT CONTENT:\n{preview}\n\nSUMMARY:"
+            }]}
         ]
+        resp = multimodal_client.chat_completion(
+            messages=messages, max_tokens=250, temperature=0.3
         )
+        return resp["choices"][0]["message"]["content"].strip()
     except Exception as e:
+        return f"❌ Error generating summary: {e}"

 
 
 
171
  if not extracted_content:
172
  return "❌ Please upload and process a PDF first."
 
173
  try:
174
+ preview = extracted_content[:3000]
 
175
  messages = [
176
+ {"role":"user","content":[{"type":"text","text":
177
+ "Analyze the following document content and extract 12-15 key terms, concepts, and important phrases. "
178
+ f"DOCUMENT CONTENT:\n{preview}\n\nKEY TERMS:"
179
+ }]}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
  ]
181
+ resp = multimodal_client.chat_completion(
182
+ messages=messages, max_tokens=120, temperature=0.3
 
 
 
183
  )
184
+ return resp["choices"][0]["message"]["content"].strip()
 
 
185
  except Exception as e:
186
+ return f"❌ Error extracting keywords: {e}"
187
 
188
  def clear_multimodal_interface():
 
 
 
189
  global index, retriever, current_pdf_name, extracted_content, extracted_images
190
+ for f in os.listdir(figures_dir):
191
+ try: os.remove(os.path.join(figures_dir, f))
192
+ except: pass
 
 
 
 
 
 
193
  index = retriever = None
194
  current_pdf_name = extracted_content = None
195
  extracted_images.clear()
 
196
  return None, "", gr.update(interactive=False)
197
 
198
  # ── Gradio UI ────────────────────────────────────────────────────────────────
 
     .container { border-radius: 10px; padding: 15px; }
     .pdf-active { border-left: 3px solid #6366f1; padding-left: 10px; background-color: rgba(99,102,241,0.1); }
     .footer { text-align: center; margin-top: 30px; font-size: 0.8em; color: #666; }
+    .main-title { text-align: center; font-size: 64px; font-weight: bold; margin-bottom: 20px; }
+    .multimodal-badge { background: linear-gradient(45deg, #6366f1, #8b5cf6); color: white; padding: 5px 15px; border-radius: 20px; font-size: 14px; display: inline-block; margin: 10px auto; }
+    .model-info { background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 8px; padding: 10px; margin: 10px 0; font-size: 12px; color: #64748b; }
 """) as demo:
     gr.Markdown("<div class='main-title'>Unified MultiModal RAG</div>")
+    gr.Markdown("<div style='text-align:center;'><span class='multimodal-badge'>🧠 Single Model • Text + Vision</span></div>")
     gr.Markdown("""
     <div class='model-info'>
     <strong>🤖 Powered by:</strong> Microsoft Phi-3.5-Vision + CLIP Embeddings + PyMuPDF (HF Spaces Compatible)

             pdf_file = gr.File(file_types=[".pdf"], type="filepath", label="Upload PDF (with images/charts)")
             upload_button = gr.Button("🔄 Process with Multimodal AI", variant="primary")
             status_box = gr.Textbox(label="Processing Status", interactive=False)
         with gr.Column():
             gr.Markdown("## ❓ Ask Questions")
+            question_input = gr.Textbox(lines=3, placeholder="Ask about text or visual content...", interactive=False)
             ask_button = gr.Button("🔍 Ask Multimodal AI", variant="primary")
             answer_output = gr.Textbox(label="AI Response", lines=8, interactive=False)

     with gr.Row():
         with gr.Column():
             summary_button = gr.Button("📋 Generate Summary", variant="secondary")

             keywords_button = gr.Button("🏷️ Extract Keywords", variant="secondary")
             keywords_output = gr.Textbox(label="Key Terms", lines=4, interactive=False)

     clear_button = gr.Button("🗑️ Clear All", variant="secondary")
     gr.Markdown("""
     <div class='footer'>
+    <strong>Unified Multimodal Pipeline:</strong> One model handles text, images, charts, tables, diagrams, and mixed content queries
     </div>
     """)

+    upload_button.click(process_pdf_multimodal, [pdf_file], [pdf_display, status_box, question_input])
+    ask_button.click(ask_multimodal_question, [pdf_display, question_input], answer_output)
     summary_button.click(generate_multimodal_summary, [], summary_output)
     keywords_button.click(extract_multimodal_keywords, [], keywords_output)
+    clear_button.click(clear_multimodal_interface, [], [pdf_file, pdf_display, question_input])

 if __name__ == "__main__":
+    demo.launch(debug=True)
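
A minimal, self-contained sketch of the retrieval path this revision moves to: HuggingFaceEmbeddings imported from the langchain_huggingface package instead of langchain_community.embeddings, and retriever.invoke(...) in place of the deprecated retriever.get_relevant_documents(...). The dependency set (faiss-cpu, langchain, langchain-community, langchain-huggingface, sentence-transformers) and the sample chunks are assumptions for illustration, not pinned by this commit.

# sketch_retrieval.py: illustrative only; mirrors the calls app.py makes above
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings

# Same CLIP text-embedding model the app loads
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/clip-ViT-B-32")

# Hypothetical chunks standing in for split PDF content
chunks = [
    "[PAGE 1]\nRevenue grew 12% year over year.",
    "[IMAGE CONTENT]: Bar chart comparing quarterly revenue.",
]

index = FAISS.from_texts(chunks, embeddings)
retriever = index.as_retriever(search_kwargs={"k": 1})

# New-style retriever call (replaces retriever.get_relevant_documents(query))
docs = retriever.invoke("How did revenue change?")
print(docs[0].page_content)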