Muzammil6376 committed on
Commit a6c0d87 · verified · 1 Parent(s): e5e4142

Update app.py

Files changed (1)
  1. app.py +187 -406
app.py CHANGED
@@ -1,18 +1,16 @@
1
  import os
2
  import gradio as gr
3
- import base64
4
- from PIL import Image
5
- import io
6
- import requests
7
 
8
  # Import vectorstore and embeddings from langchain community package
9
  from langchain_community.vectorstores import FAISS
10
  from langchain_community.embeddings import HuggingFaceEmbeddings
11
  # Text splitter to break large documents into manageable chunks
12
  from langchain.text_splitter import RecursiveCharacterTextSplitter
13
- # HF Inference client for running chat completions
14
  from huggingface_hub import InferenceClient
15
- # Unstructured for advanced PDF processing with image/table extraction
16
  from unstructured.partition.pdf import partition_pdf
17
  from unstructured.partition.utils.constants import PartitionStrategy
18
 
@@ -20,287 +18,155 @@ from unstructured.partition.utils.constants import PartitionStrategy
20
  index = None # FAISS index storing document embeddings
21
  retriever = None # Retriever to fetch relevant chunks
22
  current_pdf_name = None # Name of the currently loaded PDF
23
- pdf_text = None # Full text of the uploaded PDF
24
- extracted_images = [] # List to store extracted images and their descriptions
25
 
26
- # Create directories for storing extracted figures
27
- FIGURES_DIR = "extracted_figures/"
28
- os.makedirs(FIGURES_DIR, exist_ok=True)
29
-
30
- # ── HF Inference clients for different models ─────────────────────────────────
31
- # Text generation model
32
  text_client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3")
33
 
34
- # Vision-Language Models (choose one based on your needs and HF availability)
35
- # Option 1: BLIP-2 for general image understanding
36
- vision_client = InferenceClient(model="Salesforce/blip2-opt-2.7b")
37
-
38
- # Option 2: Alternative vision models you can use:
39
- # vision_client = InferenceClient(model="microsoft/git-base-coco")
40
- # vision_client = InferenceClient(model="nlpconnect/vit-gpt2-image-captioning")
41
- # vision_client = InferenceClient(model="Salesforce/blip-image-captioning-large")
42
-
43
- # For more advanced multimodal tasks, you can use:
44
- # multimodal_client = InferenceClient(model="microsoft/DialoGPT-medium") # For conversational AI
45
- # multimodal_client = InferenceClient(model="facebook/opt-iml-max-30b") # For instruction following
46
-
47
- # ── Open Source Multimodal Embeddings ──────────────────────────────────────
48
- # Primary choices - all open source, no OpenAI dependency
49
- embedding_models = [
50
- "sentence-transformers/all-mpnet-base-v2", # Excellent general purpose
51
- "BAAI/bge-large-en-v1.5", # Best Chinese model, great English
52
- "intfloat/e5-large-v2", # Microsoft's open model
53
- "sentence-transformers/all-MiniLM-L12-v2", # Good balance speed/quality
54
- "BAAI/bge-base-en-v1.5" # Fallback option
55
- ]
56
-
57
- def initialize_embeddings():
58
- """Initialize embeddings with fallback options"""
59
- for model_name in embedding_models:
60
- try:
61
- embeddings = HuggingFaceEmbeddings(
62
- model_name=model_name,
63
- model_kwargs={'device': 'cpu', 'trust_remote_code': True},
64
- encode_kwargs={'normalize_embeddings': True, 'batch_size': 16}
65
- )
66
- print(f"βœ… Successfully loaded: {model_name}")
67
- return embeddings
68
- except Exception as e:
69
- print(f"⚠️ Failed to load {model_name}: {e}")
70
- continue
71
-
72
- # Ultimate fallback - should always work
73
- print("πŸ”„ Using basic sentence-transformers model")
74
- return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
75
 
76
- # Initialize embeddings
77
- embeddings = initialize_embeddings()
78
 
79
- def create_multimodal_embeddings(text_chunks, image_descriptions):
80
- """
81
- Create embeddings that combine text and visual information
82
- """
83
- try:
84
- all_chunks = []
85
-
86
- # Process text chunks
87
- for chunk in text_chunks:
88
- # Add context markers for better embedding
89
- enhanced_chunk = f"Document text: {chunk}"
90
- all_chunks.append(enhanced_chunk)
91
-
92
- # Process image descriptions with special formatting
93
- for img_desc in image_descriptions:
94
- # Mark visual content for better embedding alignment
95
- enhanced_desc = f"Visual content: {img_desc}"
96
- all_chunks.append(enhanced_desc)
97
-
98
- return all_chunks
99
-
100
- except Exception as e:
101
- print(f"Error creating multimodal embeddings: {e}")
102
- return text_chunks + image_descriptions
103
  """
104
- Enhanced image description using multiple vision models
105
  """
106
  try:
107
- # Load and process image
108
- with open(image_path, "rb") as f:
109
- image_bytes = f.read()
110
-
111
- # Method 1: Use BLIP-2 for detailed image captioning
112
- try:
113
- description = vision_client.image_to_text(image_bytes)
114
- base_description = description if isinstance(description, str) else description.get('generated_text', '')
115
- except Exception as e:
116
- print(f"BLIP-2 failed: {e}")
117
- base_description = "Image could not be processed with vision model"
118
-
119
- # Method 2: Enhance with text-based analysis using the text model
120
- enhancement_prompt = f"""
121
- Analyze this image description and provide a detailed analysis focusing on:
122
- 1. Any text, numbers, or data visible
123
- 2. Charts, graphs, or tables
124
- 3. Key visual elements and their significance
125
- 4. Context and meaning
126
-
127
- Description: {base_description}
128
-
129
- Provide a comprehensive analysis:
130
- """
131
-
132
- try:
133
- response = text_client.chat_completion(
134
- messages=[{"role": "user", "content": enhancement_prompt}],
135
- max_tokens=300,
136
- temperature=0.3
137
  )
138
- enhanced_description = response["choices"][0]["message"]["content"].strip()
139
- except Exception as e:
140
- print(f"Text enhancement failed: {e}")
141
- enhanced_description = base_description
142
-
143
- return f"Visual Element Analysis:\n{enhanced_description}"
144
-
145
  except Exception as e:
146
- print(f"Error processing image {image_path}: {str(e)}")
147
- return f"Visual element detected: {os.path.basename(image_path)} (processing failed)"
148
 
149
- def process_pdf_multimodal_advanced(pdf_file):
150
  """
151
- Advanced multimodal PDF processing with enhanced vision capabilities
152
  """
153
- global current_pdf_name, index, retriever, pdf_text, extracted_images
154
 
155
  if pdf_file is None:
156
  return None, "❌ Please upload a PDF file.", gr.update(interactive=False)
157
 
158
  current_pdf_name = os.path.basename(pdf_file.name)
159
- extracted_images = []
160
 
161
- # Clear existing figures directory
162
- for file in os.listdir(FIGURES_DIR):
163
- try:
164
- os.remove(os.path.join(FIGURES_DIR, file))
165
- except:
166
- pass
167
-
168
  try:
169
- # Process PDF with unstructured
170
  elements = partition_pdf(
171
  pdf_file.name,
172
  strategy=PartitionStrategy.HI_RES,
173
  extract_image_block_types=["Image", "Table"],
174
- extract_image_block_output_dir=FIGURES_DIR,
175
- extract_image_block_to_payload=False,
176
- # Additional parameters for better extraction
177
- infer_table_structure=True,
178
- chunking_strategy="by_title",
179
- max_characters=1000,
180
- combine_text_under_n_chars=100
181
  )
182
 
183
- # Process elements
184
  text_elements = []
185
- visual_descriptions = []
186
-
187
  for element in elements:
188
- if element.category in ["Image", "Table"]:
189
- # Handle image/table elements
190
- continue
191
- elif element.category == "Title":
192
- text_elements.append(f"TITLE: {element.text}")
193
- elif element.category == "Header":
194
- text_elements.append(f"HEADER: {element.text}")
195
- else:
196
- if hasattr(element, 'text') and element.text.strip():
197
- text_elements.append(element.text)
198
-
199
- pdf_text = "\n\n".join(text_elements)
200
-
201
- # Process extracted visual elements
202
- if os.path.exists(FIGURES_DIR):
203
- for filename in sorted(os.listdir(FIGURES_DIR)):
204
- if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tiff')):
205
- image_path = os.path.join(FIGURES_DIR, filename)
206
-
207
- # Get enhanced description
208
- description = extract_image_description_advanced(image_path)
209
- visual_descriptions.append(description)
210
-
211
- extracted_images.append({
212
- 'path': image_path,
213
- 'description': description,
214
- 'filename': filename,
215
- 'type': 'table' if 'table' in filename.lower() else 'image'
216
- })
217
-
218
- # Combine all content
219
- all_content = text_elements + visual_descriptions
220
-
221
- # Combine text and visual content with enhanced embedding strategy
222
- text_chunks = text_splitter.split_text(pdf_text) if pdf_text else []
223
-
224
- # Create multimodal embeddings
225
- all_chunks = create_multimodal_embeddings(text_chunks, visual_descriptions)
226
 
227
- # Create FAISS index with optimized settings for multimodal content
228
- if all_chunks:
229
- index = FAISS.from_texts(all_chunks, embeddings)
230
- retriever = index.as_retriever(
231
- search_type="mmr", # Maximum marginal relevance for diverse results
232
- search_kwargs={
233
- "k": 5, # Get more results for multimodal content
234
- "fetch_k": 10, # Broader initial search
235
- "lambda_mult": 0.6 # Balance between relevance and diversity
236
- }
237
- )
238
- else:
239
- raise Exception("No content extracted from PDF")
240
 
241
- status = f"βœ… Advanced processing complete for '{current_pdf_name}'\nπŸ“„ {len(text_elements)} text sections\nπŸ–ΌοΈ {len(extracted_images)} visual elements\nπŸ“¦ {len(all_chunks)} total searchable chunks"
 
 
242
 
243
  return current_pdf_name, status, gr.update(interactive=True)
244
 
245
  except Exception as e:
246
- error_msg = f"❌ Processing error: {str(e)}"
247
  return current_pdf_name, error_msg, gr.update(interactive=False)
248
 
249
- def ask_question_multimodal_advanced(pdf_name, question):
250
  """
251
- Advanced multimodal question answering with smart routing
252
  """
253
- global retriever, extracted_images
254
-
255
  if index is None or retriever is None:
256
  return "❌ Please upload and process a PDF first."
257
 
258
  if not question.strip():
259
  return "❌ Please enter a question."
260
-
261
  try:
262
- # Retrieve relevant chunks
263
  docs = retriever.get_relevant_documents(question)
264
- context = "\n\n".join([doc.page_content for doc in docs])
265
-
266
- # Enhanced visual query detection
267
- visual_keywords = [
268
- 'image', 'figure', 'chart', 'graph', 'table', 'diagram', 'picture',
269
- 'visual', 'show', 'display', 'plot', 'data', 'visualization',
270
- 'illustration', 'screenshot', 'photo', 'drawing'
271
- ]
272
-
273
- is_visual_query = any(keyword in question.lower() for keyword in visual_keywords)
274
-
275
- # Smart context enhancement
276
- if is_visual_query and extracted_images:
277
- # Prioritize visual content for visual queries
278
- visual_context = "\n\n".join([img['description'] for img in extracted_images])
279
- enhanced_context = f"{visual_context}\n\nAdditional Context:\n{context}"
280
- else:
281
- enhanced_context = context
282
-
283
- # Advanced prompting based on query type
284
- if is_visual_query:
285
- system_prompt = """You are an expert document analyst specializing in multimodal content analysis.
286
- You excel at interpreting charts, graphs, tables, images, and visual data alongside textual information.
287
- When answering questions about visual elements, be specific about what you observe and provide detailed insights."""
288
- else:
289
- system_prompt = """You are an expert document analyst. Provide accurate, comprehensive answers based on the document content.
290
- Use the context provided to give detailed and helpful responses."""
291
 
292
- prompt = f"""{system_prompt}
293
-
294
- Context: {enhanced_context}
295
-
296
- Question: {question}
297
-
298
- Provide a detailed, accurate answer based on the context above. If the question relates to visual elements, describe what you can understand from the visual descriptions provided."""
299
-
300
  response = text_client.chat_completion(
301
  messages=[{"role": "user", "content": prompt}],
302
- max_tokens=400,
303
- temperature=0.4
304
  )
305
 
306
  answer = response["choices"][0]["message"]["content"].strip()
@@ -309,72 +175,26 @@ Provide a detailed, accurate answer based on the context above. If the question
309
  except Exception as e:
310
  return f"❌ Error generating answer: {str(e)}"
311
 
312
- def analyze_document_structure():
313
  """
314
- New feature: Analyze the overall structure of the document
315
  """
316
- global pdf_text, extracted_images
317
-
318
- if not pdf_text and not extracted_images:
319
  return "❌ Please upload and process a PDF first."
320
 
321
  try:
322
- structure_prompt = f"""
323
- Analyze the structure and organization of this document. Provide insights about:
324
- 1. Document type and purpose
325
- 2. Main sections and topics
326
- 3. Visual elements present ({len(extracted_images)} images/tables/charts)
327
- 4. Key information hierarchy
328
- 5. Overall document quality and completeness
329
 
330
- Text content sample: {pdf_text[:1000]}
331
- Visual elements: {len(extracted_images)} items detected
332
-
333
- Provide a structural analysis:
334
- """
335
-
336
- response = text_client.chat_completion(
337
- messages=[{"role": "user", "content": structure_prompt}],
338
- max_tokens=300,
339
- temperature=0.3
340
  )
341
 
342
- return response["choices"][0]["message"]["content"].strip()
343
-
344
- except Exception as e:
345
- return f"❌ Error analyzing structure: {str(e)}"
346
-
347
- # [Previous functions remain the same: generate_summary_multimodal, extract_keywords_multimodal, show_extracted_images, clear_interface_multimodal]
348
-
349
- def generate_summary_multimodal():
350
- """Enhanced summary generation considering both text and visual content"""
351
- global pdf_text, extracted_images
352
-
353
- if not pdf_text and not extracted_images:
354
- return "❌ Please upload and process a PDF first."
355
-
356
- try:
357
- content_parts = []
358
-
359
- if pdf_text:
360
- content_parts.append(f"Text Content:\n{pdf_text[:2000]}")
361
-
362
- if extracted_images:
363
- visual_summary = "\n".join([img['description'][:200] for img in extracted_images[:3]])
364
- content_parts.append(f"Visual Content:\n{visual_summary}")
365
-
366
- combined_content = "\n\n".join(content_parts)
367
-
368
- prompt = f"""Provide a comprehensive summary of this document that includes both textual and visual elements.
369
- Focus on key findings, main topics, and insights from charts, tables, or images.
370
-
371
- Content: {combined_content}
372
-
373
- Summary:"""
374
-
375
  response = text_client.chat_completion(
376
  messages=[{"role": "user", "content": prompt}],
377
- max_tokens=250,
378
  temperature=0.5
379
  )
380
 
@@ -383,35 +203,25 @@ def generate_summary_multimodal():
383
  except Exception as e:
384
  return f"❌ Error generating summary: {str(e)}"
385
 
386
- def extract_keywords_multimodal():
387
- """Enhanced keyword extraction from both text and visual content"""
388
- global pdf_text, extracted_images
389
-
390
- if not pdf_text and not extracted_images:
391
  return "❌ Please upload and process a PDF first."
392
 
393
  try:
394
- content_parts = []
395
-
396
- if pdf_text:
397
- content_parts.append(f"Text: {pdf_text[:1500]}")
398
-
399
- if extracted_images:
400
- visual_content = "\n".join([img['description'][:150] for img in extracted_images])
401
- content_parts.append(f"Visual Content: {visual_content}")
402
 
403
- combined_content = "\n\n".join(content_parts)
404
-
405
- prompt = f"""Extract key terms, concepts, and topics from this document content.
406
- Include technical terms, important concepts, and themes from both text and visual elements.
407
-
408
- Content: {combined_content}
409
-
410
- Key terms and concepts:"""
411
 
412
  response = text_client.chat_completion(
413
  messages=[{"role": "user", "content": prompt}],
414
- max_tokens=120,
415
  temperature=0.5
416
  )
417
 
@@ -420,45 +230,26 @@ def extract_keywords_multimodal():
420
  except Exception as e:
421
  return f"❌ Error extracting keywords: {str(e)}"
422
 
423
- def show_extracted_images():
424
- """Display information about extracted images"""
425
- global extracted_images
426
-
427
- if not extracted_images:
428
- return "No visual elements extracted from the current document."
429
-
430
- info = f"πŸ“Š Extracted {len(extracted_images)} visual elements:\n\n"
431
- for i, img in enumerate(extracted_images, 1):
432
- element_type = "πŸ“Š Table" if img['type'] == 'table' else "πŸ–ΌοΈ Image"
433
- info += f"{i}. {element_type}: {img['filename']}\n"
434
- info += f" Description: {img['description'][:150]}...\n\n"
435
-
436
- if i >= 5: # Limit display to first 5
437
- remaining = len(extracted_images) - 5
438
- if remaining > 0:
439
- info += f"... and {remaining} more visual elements."
440
- break
441
 
442
- return info
443
-
444
- def clear_interface_multimodal():
445
- """Enhanced clear function for multimodal system"""
446
- global index, retriever, current_pdf_name, pdf_text, extracted_images
 
447
 
 
448
  index = retriever = None
449
- current_pdf_name = pdf_text = None
450
- extracted_images = []
451
-
452
- if os.path.exists(FIGURES_DIR):
453
- for file in os.listdir(FIGURES_DIR):
454
- try:
455
- os.remove(os.path.join(FIGURES_DIR, file))
456
- except:
457
- pass
458
 
459
- return None, "", gr.update(interactive=False), "", "", "", "", ""
460
 
461
- # Enhanced Gradio UI
462
  theme = gr.themes.Soft(primary_hue="indigo", secondary_hue="blue")
463
 
464
  with gr.Blocks(theme=theme, css="""
@@ -467,91 +258,81 @@ with gr.Blocks(theme=theme, css="""
467
  .footer { text-align: center; margin-top: 30px; font-size: 0.8em; color: #666; }
468
  .main-title {
469
  text-align: center;
470
- font-size: 56px;
471
  font-weight: bold;
472
  margin-bottom: 20px;
473
- background: linear-gradient(45deg, #6366f1, #8b5cf6, #ec4899);
474
- -webkit-background-clip: text;
475
- -webkit-text-fill-color: transparent;
476
  }
477
- .feature-badge {
478
- background: linear-gradient(45deg, #10b981, #3b82f6);
479
  color: white;
480
- padding: 4px 12px;
481
- border-radius: 15px;
482
- font-size: 11px;
483
- margin: 2px;
484
  display: inline-block;
 
485
  }
486
  """) as demo:
487
 
488
- gr.Markdown("<div class='main-title'>πŸ€– DocQueryAI Pro</div>")
489
- gr.Markdown("""
490
- <div style='text-align: center; margin-bottom: 25px;'>
491
- <span class='feature-badge'>πŸ” Advanced RAG</span>
492
- <span class='feature-badge'>πŸ–ΌοΈ Vision AI</span>
493
- <span class='feature-badge'>πŸ“Š Table Analysis</span>
494
- <span class='feature-badge'>πŸ“ˆ Chart Understanding</span>
495
- <span class='feature-badge'>🧠 Smart Retrieval</span>
496
- </div>
497
- """)
498
 
499
  with gr.Row():
500
  with gr.Column():
501
- gr.Markdown("## πŸ“„ Document Processing")
502
  pdf_display = gr.Textbox(label="Active Document", interactive=False, elem_classes="pdf-active")
503
- pdf_file = gr.File(file_types=[".pdf"], type="filepath", label="Upload PDF Document")
504
- upload_button = gr.Button("🚀 Process with AI Vision", variant="primary", size="lg")
505
- status_box = gr.Textbox(label="Processing Status", interactive=False, lines=3)
506
 
507
  with gr.Column():
508
- gr.Markdown("## πŸ’¬ Intelligent Q&A")
509
- gr.Markdown("*Ask about any content: text, images, charts, tables, or data visualizations*")
510
  question_input = gr.Textbox(
511
  lines=3,
512
- placeholder="Examples:\nβ€’ What does the chart show?\nβ€’ Summarize the table data\nβ€’ Explain the main findings",
513
- label="Your Question"
514
  )
515
- ask_button = gr.Button("🔍 Get AI Answer", variant="primary", size="lg")
516
- answer_output = gr.Textbox(label="AI Response", lines=8, interactive=False)
517
 
 
518
  with gr.Row():
519
  with gr.Column():
520
  summary_button = gr.Button("📋 Generate Summary", variant="secondary")
521
- summary_output = gr.Textbox(label="Document Summary", lines=5, interactive=False)
522
-
523
  with gr.Column():
524
  keywords_button = gr.Button("🏷️ Extract Keywords", variant="secondary")
525
- keywords_output = gr.Textbox(label="Key Concepts", lines=5, interactive=False)
526
-
527
- with gr.Row():
528
- with gr.Column():
529
- structure_button = gr.Button("🏗️ Analyze Structure", variant="secondary")
530
- structure_output = gr.Textbox(label="Document Structure Analysis", lines=5, interactive=False)
531
-
532
- with gr.Column():
533
- images_button = gr.Button("🖼️ Show Visual Elements", variant="secondary")
534
- images_output = gr.Textbox(label="Extracted Visual Elements", lines=5, interactive=False)
535
-
536
- with gr.Row():
537
- clear_button = gr.Button("🗑️ Clear All", variant="secondary", size="sm")
538
539
  gr.Markdown("""
540
  <div class='footer'>
541
- πŸš€ <strong>Powered by Advanced AI</strong><br>
542
- 🔧 HuggingFace Transformers • LangChain • FAISS • Unstructured<br>
543
- 🎯 Multimodal RAG: Text + Vision + Tables + Charts
544
  </div>
545
  """)
546
 
547
  # Event bindings
548
- upload_button.click(process_pdf_multimodal_advanced, [pdf_file], [pdf_display, status_box, question_input])
549
- ask_button.click(ask_question_multimodal_advanced, [pdf_display, question_input], answer_output)
550
- summary_button.click(generate_summary_multimodal, [], summary_output)
551
- keywords_button.click(extract_keywords_multimodal, [], keywords_output)
552
- structure_button.click(analyze_document_structure, [], structure_output)
553
- images_button.click(show_extracted_images, [], images_output)
554
- clear_button.click(clear_interface_multimodal, [], [pdf_file, pdf_display, question_input, answer_output, summary_output, keywords_output, structure_output, images_output])
555
 
556
  if __name__ == "__main__":
557
  demo.launch(debug=True, share=True)
 
1
  import os
2
  import gradio as gr
3
+ import tempfile
4
+ from pathlib import Path
5
 
6
  # Import vectorstore and embeddings from langchain community package
7
  from langchain_community.vectorstores import FAISS
8
  from langchain_community.embeddings import HuggingFaceEmbeddings
9
  # Text splitter to break large documents into manageable chunks
10
  from langchain.text_splitter import RecursiveCharacterTextSplitter
11
+ # HF Inference client for running multimodal models
12
  from huggingface_hub import InferenceClient
13
+ # Unstructured for PDF processing with image extraction
14
  from unstructured.partition.pdf import partition_pdf
15
  from unstructured.partition.utils.constants import PartitionStrategy
16
 
 
18
  index = None # FAISS index storing document embeddings
19
  retriever = None # Retriever to fetch relevant chunks
20
  current_pdf_name = None # Name of the currently loaded PDF
21
+ extracted_content = None # Combined text and image descriptions
 
22
 
23
+ # ── HF Inference clients ─────────────────────────────────────────────────────
24
+ # Text generation client (using a good open model)
25
  text_client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3")
26
+ # Vision client for image analysis
27
+ vision_client = InferenceClient(model="llava-hf/llava-1.5-7b-hf")
28
 
29
+ # ── Embeddings ───────────────────────────────────────────────────────────────
30
+ # Use BGE embeddings for vectorizing text chunks
31
+ embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")
32
 
33
+ # Create temporary directories for processing
34
+ temp_dir = tempfile.mkdtemp()
35
+ figures_dir = os.path.join(temp_dir, "figures")
36
+ os.makedirs(figures_dir, exist_ok=True)
37
 
38
+ def extract_image_description(image_path):
39
  """
40
+ Analyze an extracted image using vision model to get text description.
41
+ Args:
42
+ image_path: Path to the extracted image file
43
+ Returns:
44
+ Text description of the image content
45
  """
46
  try:
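+ # Assumption: the hosted vision model is queried through the generic image-to-text
+ # (captioning) task, which accepts only the raw image bytes and no instruction prompt.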
+ # Read image and send to vision model
+ with open(image_path, "rb") as img_file:
+ # Use vision client to analyze the image via the image-to-text (captioning) task
+ response = vision_client.image_to_text(
+ img_file.read()
  )
+ # Depending on the huggingface_hub version, the result is a plain string or an
+ # object exposing .generated_text
+ description = getattr(response, "generated_text", response)
+ return f"Image content: {description}"
55
  except Exception as e:
56
+ return f"Image content: [Could not analyze image - {str(e)}]"
 
57
 
58
+ def process_pdf_multimodal(pdf_file):
59
  """
60
+ 1. Extracts text and images from PDF using unstructured
61
+ 2. Analyzes extracted images with vision model
62
+ 3. Combines text and image descriptions
63
+ 4. Creates FAISS index for retrieval
64
+ Args:
65
+ pdf_file: Uploaded PDF file
66
+ Returns:
67
+ - PDF filename, status message, and UI updates
68
  """
69
+ global current_pdf_name, index, retriever, extracted_content
70
 
71
  if pdf_file is None:
72
  return None, "❌ Please upload a PDF file.", gr.update(interactive=False)
73
 
74
  current_pdf_name = os.path.basename(pdf_file.name)
75

76
  try:
77
+ # Clear previous figures
78
+ for file in os.listdir(figures_dir):
79
+ os.remove(os.path.join(figures_dir, file))
80
+
81
+ # Extract elements from PDF including images
82
  elements = partition_pdf(
83
  pdf_file.name,
84
  strategy=PartitionStrategy.HI_RES,
85
  extract_image_block_types=["Image", "Table"],
86
+ extract_image_block_output_dir=figures_dir,
87
+ extract_image_block_to_payload=False
88
  )
89
 
90
+ # Separate text elements
91
  text_elements = []
92
  for element in elements:
93
+ if element.category not in ["Image", "Table"]:
94
+ text_elements.append(element.text)
95
+
96
+ # Process extracted images
97
+ image_descriptions = []
98
+ if os.path.exists(figures_dir):
99
+ for image_file in os.listdir(figures_dir):
100
+ if image_file.lower().endswith(('.png', '.jpg', '.jpeg')):
101
+ image_path = os.path.join(figures_dir, image_file)
102
+ description = extract_image_description(image_path)
103
+ image_descriptions.append(description)
104
+
105
+ # Combine text and image descriptions
106
+ all_content = text_elements + image_descriptions
107
+ extracted_content = "\n\n".join(all_content)
108
+
109
+ # Split into chunks
110
+ text_splitter = RecursiveCharacterTextSplitter(
111
+ chunk_size=1000,
112
+ chunk_overlap=200,
113
+ add_start_index=True
114
+ )
115
+ chunks = text_splitter.split_text(extracted_content)
116
 
117
+ # Create FAISS index
118
+ index = FAISS.from_texts(chunks, embeddings)
119
+ retriever = index.as_retriever(search_kwargs={"k": 3})
120
 
121
+ # Status message
122
+ num_images = len(image_descriptions)
123
+ status = f"βœ… Processed '{current_pdf_name}' β€” {len(chunks)} text chunks, {num_images} images analyzed"
124
 
125
  return current_pdf_name, status, gr.update(interactive=True)
126
 
127
  except Exception as e:
128
+ error_msg = f"❌ Error processing PDF: {str(e)}"
129
  return current_pdf_name, error_msg, gr.update(interactive=False)
130
 
131
+ def ask_multimodal_question(pdf_name, question):
132
  """
133
+ Answer questions using both text and image content from the PDF.
134
+ Args:
135
+ pdf_name: Display name (unused)
136
+ question: User's question
137
+ Returns:
138
+ Generated answer combining text and visual information
139
  """
140
+ global retriever
141
+
142
  if index is None or retriever is None:
143
  return "❌ Please upload and process a PDF first."
144
 
145
  if not question.strip():
146
  return "❌ Please enter a question."
147
+
148
  try:
149
+ # Retrieve relevant chunks (text + image descriptions)
150
  docs = retriever.get_relevant_documents(question)
151
+ context = "\n\n".join(doc.page_content for doc in docs)
152
+
153
+ # Enhanced prompt for multimodal content
154
+ prompt = (
155
+ "You are an AI assistant analyzing a document that contains both text and images. "
156
+ "Use the following content (which includes text excerpts and descriptions of images/charts/tables) "
157
+ "to answer the question comprehensively.\n\n"
158
+ f"Document Content:\n{context}\n\n"
159
+ f"Question: {question}\n\n"
160
+ "Provide a detailed answer based on both the textual information and visual elements described above. "
161
+ "If the answer involves data from charts, tables, or images, mention that explicitly.\n"
162
+ "Answer:"
163
+ )
164
 
165
+ # Generate response
166
  response = text_client.chat_completion(
167
  messages=[{"role": "user", "content": prompt}],
168
+ max_tokens=256,
169
+ temperature=0.5
170
  )
171
 
172
  answer = response["choices"][0]["message"]["content"].strip()
 
175
  except Exception as e:
176
  return f"❌ Error generating answer: {str(e)}"
177
 
178
+ def generate_multimodal_summary():
179
  """
180
+ Generate a summary considering both text and visual elements.
181
  """
182
+ if not extracted_content:
183
  return "❌ Please upload and process a PDF first."
184
 
185
  try:
186
+ # Use first 3000 characters for summary
187
+ content_preview = extracted_content[:3000]
188
 
189
+ prompt = (
190
+ "Provide a comprehensive summary of this document that contains both text and visual elements "
191
+ "(images, charts, tables). Mention key textual information as well as important visual content.\n\n"
192
+ f"{content_preview}..."
 
 
 
 
 
 
193
  )
194

195
  response = text_client.chat_completion(
196
  messages=[{"role": "user", "content": prompt}],
197
+ max_tokens=200,
198
  temperature=0.5
199
  )
200
 
 
203
  except Exception as e:
204
  return f"❌ Error generating summary: {str(e)}"
205
 
206
+ def extract_multimodal_keywords():
207
+ """
208
+ Extract keywords from both text and visual content.
209
+ """
210
+ if not extracted_content:
211
  return "❌ Please upload and process a PDF first."
212
 
213
  try:
214
+ content_preview = extracted_content[:3000]
215
 
216
+ prompt = (
217
+ "Extract 10-15 key terms and concepts from this document that contains both text and visual elements. "
218
+ "Include important terms from both textual content and visual elements like charts, images, and tables.\n\n"
219
+ f"{content_preview}..."
220
+ )
221
 
222
  response = text_client.chat_completion(
223
  messages=[{"role": "user", "content": prompt}],
224
+ max_tokens=100,
225
  temperature=0.5
226
  )
227
 
 
230
  except Exception as e:
231
  return f"❌ Error extracting keywords: {str(e)}"
232
 
233
+ def clear_multimodal_interface():
234
+ """
235
+ Reset all global state and clear UI.
236
+ """
237
+ global index, retriever, current_pdf_name, extracted_content
238
 
239
+ # Clear figures directory
240
+ try:
241
+ for file in os.listdir(figures_dir):
242
+ os.remove(os.path.join(figures_dir, file))
243
+ except:
244
+ pass
245
 
246
+ # Reset globals
247
  index = retriever = None
248
+ current_pdf_name = extracted_content = None
249
 
250
+ return None, "", gr.update(interactive=False)
251
 
252
+ # ── Gradio UI ────────────────────────────────────────────────────────────────
253
  theme = gr.themes.Soft(primary_hue="indigo", secondary_hue="blue")
254
 
255
  with gr.Blocks(theme=theme, css="""
 
258
  .footer { text-align: center; margin-top: 30px; font-size: 0.8em; color: #666; }
259
  .main-title {
260
  text-align: center;
261
+ font-size: 64px;
262
  font-weight: bold;
263
  margin-bottom: 20px;
264
  }
265
+ .multimodal-badge {
266
+ background: linear-gradient(45deg, #6366f1, #8b5cf6);
267
  color: white;
268
+ padding: 5px 15px;
269
+ border-radius: 20px;
270
+ font-size: 14px;
 
271
  display: inline-block;
272
+ margin: 10px auto;
273
  }
274
  """) as demo:
275
 
276
+ # Application title with multimodal badge
277
+ gr.Markdown("<div class='main-title'>MultiModal DocQueryAI</div>")
278
+ gr.Markdown("<div style='text-align: center;'><span class='multimodal-badge'>πŸ–ΌοΈ Text + Images + Charts</span></div>")
 
 
 
 
 
 
 
279
 
280
  with gr.Row():
281
  with gr.Column():
282
+ gr.Markdown("## πŸ“„ Document Input")
283
  pdf_display = gr.Textbox(label="Active Document", interactive=False, elem_classes="pdf-active")
284
+ pdf_file = gr.File(file_types=[".pdf"], type="filepath", label="Upload PDF (with images/charts)")
285
+ upload_button = gr.Button("🔄 Process Document (Extract Text + Images)", variant="primary")
286
+ status_box = gr.Textbox(label="Processing Status", interactive=False)
287
 
288
  with gr.Column():
289
+ gr.Markdown("## ❓ Ask Questions")
290
+ gr.Markdown("*Ask about text content, images, charts, tables, or any visual elements in your PDF*")
291
  question_input = gr.Textbox(
292
  lines=3,
293
+ placeholder="Ask about text, images, charts, or any content in the PDF...",
294
+ interactive=False
295
  )
296
+ ask_button = gr.Button("🔍 Ask Question", variant="primary")
297
+ answer_output = gr.Textbox(label="Answer", lines=8, interactive=False)
298
 
299
+ # Analysis tools
300
  with gr.Row():
301
  with gr.Column():
302
  summary_button = gr.Button("📋 Generate Summary", variant="secondary")
303
+ summary_output = gr.Textbox(label="Document Summary", lines=4, interactive=False)
 
304
  with gr.Column():
305
  keywords_button = gr.Button("🏷️ Extract Keywords", variant="secondary")
306
+ keywords_output = gr.Textbox(label="Key Terms", lines=4, interactive=False)
307
 
308
+ # Clear button
309
+ clear_button = gr.Button("🗑️ Clear All", variant="secondary")
310
+
311
  gr.Markdown("""
312
  <div class='footer'>
313
+ Powered by LangChain + Unstructured + Vision AI + FAISS |
314
+ Supports: Text, Images, Charts, Tables, Diagrams
 
315
  </div>
316
  """)
317
 
318
  # Event bindings
319
+ upload_button.click(
320
+ process_pdf_multimodal,
321
+ [pdf_file],
322
+ [pdf_display, status_box, question_input]
323
+ )
324
+ ask_button.click(
325
+ ask_multimodal_question,
326
+ [pdf_display, question_input],
327
+ answer_output
328
+ )
329
+ summary_button.click(generate_multimodal_summary, [], summary_output)
330
+ keywords_button.click(extract_multimodal_keywords, [], keywords_output)
331
+ clear_button.click(
332
+ clear_multimodal_interface,
333
+ [],
334
+ [pdf_file, pdf_display, question_input]
335
+ )
336
 
337
  if __name__ == "__main__":
338
  demo.launch(debug=True, share=True)