Muzammil6376 committed on
Commit a6c0d87 · verified · 1 Parent(s): e5e4142

Update app.py

Files changed (1)
  1. app.py +187 -406
app.py CHANGED
@@ -1,18 +1,16 @@
1
  import os
2
  import gradio as gr
3
- import base64
4
- from PIL import Image
5
- import io
6
- import requests
7
 
8
  # Import vectorstore and embeddings from langchain community package
9
  from langchain_community.vectorstores import FAISS
10
  from langchain_community.embeddings import HuggingFaceEmbeddings
11
  # Text splitter to break large documents into manageable chunks
12
  from langchain.text_splitter import RecursiveCharacterTextSplitter
13
- # HF Inference client for running chat completions
14
  from huggingface_hub import InferenceClient
15
- # Unstructured for advanced PDF processing with image/table extraction
16
  from unstructured.partition.pdf import partition_pdf
17
  from unstructured.partition.utils.constants import PartitionStrategy
18
 
@@ -20,287 +18,155 @@ from unstructured.partition.utils.constants import PartitionStrategy
20
  index = None # FAISS index storing document embeddings
21
  retriever = None # Retriever to fetch relevant chunks
22
  current_pdf_name = None # Name of the currently loaded PDF
23
- pdf_text = None # Full text of the uploaded PDF
24
- extracted_images = [] # List to store extracted images and their descriptions
25
 
26
- # Create directories for storing extracted figures
27
- FIGURES_DIR = "extracted_figures/"
28
- os.makedirs(FIGURES_DIR, exist_ok=True)
29
-
30
- # ── HF Inference clients for different models ─────────────────────────────────
31
- # Text generation model
32
  text_client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3")
33
 
34
- # Vision-Language Models (choose one based on your needs and HF availability)
35
- # Option 1: BLIP-2 for general image understanding
36
- vision_client = InferenceClient(model="Salesforce/blip2-opt-2.7b")
37
-
38
- # Option 2: Alternative vision models you can use:
39
- # vision_client = InferenceClient(model="microsoft/git-base-coco")
40
- # vision_client = InferenceClient(model="nlpconnect/vit-gpt2-image-captioning")
41
- # vision_client = InferenceClient(model="Salesforce/blip-image-captioning-large")
42
-
43
- # For more advanced multimodal tasks, you can use:
44
- # multimodal_client = InferenceClient(model="microsoft/DialoGPT-medium") # For conversational AI
45
- # multimodal_client = InferenceClient(model="facebook/opt-iml-max-30b") # For instruction following
46
-
47
- # ── Open Source Multimodal Embeddings ──────────────────────────────────────
48
- # Primary choices - all open source, no OpenAI dependency
49
- embedding_models = [
50
- "sentence-transformers/all-mpnet-base-v2", # Excellent general purpose
51
- "BAAI/bge-large-en-v1.5", # Best Chinese model, great English
52
- "intfloat/e5-large-v2", # Microsoft's open model
53
- "sentence-transformers/all-MiniLM-L12-v2", # Good balance speed/quality
54
- "BAAI/bge-base-en-v1.5" # Fallback option
55
- ]
56
-
57
- def initialize_embeddings():
58
- """Initialize embeddings with fallback options"""
59
- for model_name in embedding_models:
60
- try:
61
- embeddings = HuggingFaceEmbeddings(
62
- model_name=model_name,
63
- model_kwargs={'device': 'cpu', 'trust_remote_code': True},
64
- encode_kwargs={'normalize_embeddings': True, 'batch_size': 16}
65
- )
66
- print(f"βœ… Successfully loaded: {model_name}")
67
- return embeddings
68
- except Exception as e:
69
- print(f"⚠️ Failed to load {model_name}: {e}")
70
- continue
71
-
72
- # Ultimate fallback - should always work
73
- print("πŸ”„ Using basic sentence-transformers model")
74
- return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
75
 
76
- # Initialize embeddings
77
- embeddings = initialize_embeddings()
78
 
79
- def create_multimodal_embeddings(text_chunks, image_descriptions):
80
- """
81
- Create embeddings that combine text and visual information
82
- """
83
- try:
84
- all_chunks = []
85
-
86
- # Process text chunks
87
- for chunk in text_chunks:
88
- # Add context markers for better embedding
89
- enhanced_chunk = f"Document text: {chunk}"
90
- all_chunks.append(enhanced_chunk)
91
-
92
- # Process image descriptions with special formatting
93
- for img_desc in image_descriptions:
94
- # Mark visual content for better embedding alignment
95
- enhanced_desc = f"Visual content: {img_desc}"
96
- all_chunks.append(enhanced_desc)
97
-
98
- return all_chunks
99
-
100
- except Exception as e:
101
- print(f"Error creating multimodal embeddings: {e}")
102
- return text_chunks + image_descriptions
103
  """
104
- Enhanced image description using multiple vision models
105
  """
106
  try:
107
- # Load and process image
108
- with open(image_path, "rb") as f:
109
- image_bytes = f.read()
110
-
111
- # Method 1: Use BLIP-2 for detailed image captioning
112
- try:
113
- description = vision_client.image_to_text(image_bytes)
114
- base_description = description if isinstance(description, str) else description.get('generated_text', '')
115
- except Exception as e:
116
- print(f"BLIP-2 failed: {e}")
117
- base_description = "Image could not be processed with vision model"
118
-
119
- # Method 2: Enhance with text-based analysis using the text model
120
- enhancement_prompt = f"""
121
- Analyze this image description and provide a detailed analysis focusing on:
122
- 1. Any text, numbers, or data visible
123
- 2. Charts, graphs, or tables
124
- 3. Key visual elements and their significance
125
- 4. Context and meaning
126
-
127
- Description: {base_description}
128
-
129
- Provide a comprehensive analysis:
130
- """
131
-
132
- try:
133
- response = text_client.chat_completion(
134
- messages=[{"role": "user", "content": enhancement_prompt}],
135
- max_tokens=300,
136
- temperature=0.3
137
  )
138
- enhanced_description = response["choices"][0]["message"]["content"].strip()
139
- except Exception as e:
140
- print(f"Text enhancement failed: {e}")
141
- enhanced_description = base_description
142
-
143
- return f"Visual Element Analysis:\n{enhanced_description}"
144
-
145
  except Exception as e:
146
- print(f"Error processing image {image_path}: {str(e)}")
147
- return f"Visual element detected: {os.path.basename(image_path)} (processing failed)"
148
 
149
- def process_pdf_multimodal_advanced(pdf_file):
150
  """
151
- Advanced multimodal PDF processing with enhanced vision capabilities
152
  """
153
- global current_pdf_name, index, retriever, pdf_text, extracted_images
154
 
155
  if pdf_file is None:
156
  return None, "❌ Please upload a PDF file.", gr.update(interactive=False)
157
 
158
  current_pdf_name = os.path.basename(pdf_file.name)
159
- extracted_images = []
160
 
161
- # Clear existing figures directory
162
- for file in os.listdir(FIGURES_DIR):
163
- try:
164
- os.remove(os.path.join(FIGURES_DIR, file))
165
- except:
166
- pass
167
-
168
  try:
169
- # Process PDF with unstructured
170
  elements = partition_pdf(
171
  pdf_file.name,
172
  strategy=PartitionStrategy.HI_RES,
173
  extract_image_block_types=["Image", "Table"],
174
- extract_image_block_output_dir=FIGURES_DIR,
175
- extract_image_block_to_payload=False,
176
- # Additional parameters for better extraction
177
- infer_table_structure=True,
178
- chunking_strategy="by_title",
179
- max_characters=1000,
180
- combine_text_under_n_chars=100
181
  )
182
 
183
- # Process elements
184
  text_elements = []
185
- visual_descriptions = []
186
-
187
  for element in elements:
188
- if element.category in ["Image", "Table"]:
189
- # Handle image/table elements
190
- continue
191
- elif element.category == "Title":
192
- text_elements.append(f"TITLE: {element.text}")
193
- elif element.category == "Header":
194
- text_elements.append(f"HEADER: {element.text}")
195
- else:
196
- if hasattr(element, 'text') and element.text.strip():
197
- text_elements.append(element.text)
198
-
199
- pdf_text = "\n\n".join(text_elements)
200
-
201
- # Process extracted visual elements
202
- if os.path.exists(FIGURES_DIR):
203
- for filename in sorted(os.listdir(FIGURES_DIR)):
204
- if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tiff')):
205
- image_path = os.path.join(FIGURES_DIR, filename)
206
-
207
- # Get enhanced description
208
- description = extract_image_description_advanced(image_path)
209
- visual_descriptions.append(description)
210
-
211
- extracted_images.append({
212
- 'path': image_path,
213
- 'description': description,
214
- 'filename': filename,
215
- 'type': 'table' if 'table' in filename.lower() else 'image'
216
- })
217
-
218
- # Combine all content
219
- all_content = text_elements + visual_descriptions
220
-
221
- # Combine text and visual content with enhanced embedding strategy
222
- text_chunks = text_splitter.split_text(pdf_text) if pdf_text else []
223
-
224
- # Create multimodal embeddings
225
- all_chunks = create_multimodal_embeddings(text_chunks, visual_descriptions)
226
 
227
- # Create FAISS index with optimized settings for multimodal content
228
- if all_chunks:
229
- index = FAISS.from_texts(all_chunks, embeddings)
230
- retriever = index.as_retriever(
231
- search_type="mmr", # Maximum marginal relevance for diverse results
232
- search_kwargs={
233
- "k": 5, # Get more results for multimodal content
234
- "fetch_k": 10, # Broader initial search
235
- "lambda_mult": 0.6 # Balance between relevance and diversity
236
- }
237
- )
238
- else:
239
- raise Exception("No content extracted from PDF")
240
 
241
- status = f"βœ… Advanced processing complete for '{current_pdf_name}'\nπŸ“„ {len(text_elements)} text sections\nπŸ–ΌοΈ {len(extracted_images)} visual elements\nπŸ“¦ {len(all_chunks)} total searchable chunks"
 
 
242
 
243
  return current_pdf_name, status, gr.update(interactive=True)
244
 
245
  except Exception as e:
246
- error_msg = f"❌ Processing error: {str(e)}"
247
  return current_pdf_name, error_msg, gr.update(interactive=False)
248
 
249
- def ask_question_multimodal_advanced(pdf_name, question):
250
  """
251
- Advanced multimodal question answering with smart routing
252
  """
253
- global retriever, extracted_images
254
-
255
  if index is None or retriever is None:
256
  return "❌ Please upload and process a PDF first."
257
 
258
  if not question.strip():
259
  return "❌ Please enter a question."
260
-
261
  try:
262
- # Retrieve relevant chunks
263
  docs = retriever.get_relevant_documents(question)
264
- context = "\n\n".join([doc.page_content for doc in docs])
265
-
266
- # Enhanced visual query detection
267
- visual_keywords = [
268
- 'image', 'figure', 'chart', 'graph', 'table', 'diagram', 'picture',
269
- 'visual', 'show', 'display', 'plot', 'data', 'visualization',
270
- 'illustration', 'screenshot', 'photo', 'drawing'
271
- ]
272
-
273
- is_visual_query = any(keyword in question.lower() for keyword in visual_keywords)
274
-
275
- # Smart context enhancement
276
- if is_visual_query and extracted_images:
277
- # Prioritize visual content for visual queries
278
- visual_context = "\n\n".join([img['description'] for img in extracted_images])
279
- enhanced_context = f"{visual_context}\n\nAdditional Context:\n{context}"
280
- else:
281
- enhanced_context = context
282
-
283
- # Advanced prompting based on query type
284
- if is_visual_query:
285
- system_prompt = """You are an expert document analyst specializing in multimodal content analysis.
286
- You excel at interpreting charts, graphs, tables, images, and visual data alongside textual information.
287
- When answering questions about visual elements, be specific about what you observe and provide detailed insights."""
288
- else:
289
- system_prompt = """You are an expert document analyst. Provide accurate, comprehensive answers based on the document content.
290
- Use the context provided to give detailed and helpful responses."""
291
 
292
- prompt = f"""{system_prompt}
293
-
294
- Context: {enhanced_context}
295
-
296
- Question: {question}
297
-
298
- Provide a detailed, accurate answer based on the context above. If the question relates to visual elements, describe what you can understand from the visual descriptions provided."""
299
-
300
  response = text_client.chat_completion(
301
  messages=[{"role": "user", "content": prompt}],
302
- max_tokens=400,
303
- temperature=0.4
304
  )
305
 
306
  answer = response["choices"][0]["message"]["content"].strip()
@@ -309,72 +175,26 @@ Provide a detailed, accurate answer based on the context above. If the question
309
  except Exception as e:
310
  return f"❌ Error generating answer: {str(e)}"
311
 
312
- def analyze_document_structure():
313
  """
314
- New feature: Analyze the overall structure of the document
315
  """
316
- global pdf_text, extracted_images
317
-
318
- if not pdf_text and not extracted_images:
319
  return "❌ Please upload and process a PDF first."
320
 
321
  try:
322
- structure_prompt = f"""
323
- Analyze the structure and organization of this document. Provide insights about:
324
- 1. Document type and purpose
325
- 2. Main sections and topics
326
- 3. Visual elements present ({len(extracted_images)} images/tables/charts)
327
- 4. Key information hierarchy
328
- 5. Overall document quality and completeness
329
 
330
- Text content sample: {pdf_text[:1000]}
331
- Visual elements: {len(extracted_images)} items detected
332
-
333
- Provide a structural analysis:
334
- """
335
-
336
- response = text_client.chat_completion(
337
- messages=[{"role": "user", "content": structure_prompt}],
338
- max_tokens=300,
339
- temperature=0.3
340
  )
341
 
342
- return response["choices"][0]["message"]["content"].strip()
343
-
344
- except Exception as e:
345
- return f"❌ Error analyzing structure: {str(e)}"
346
-
347
- # [Previous functions remain the same: generate_summary_multimodal, extract_keywords_multimodal, show_extracted_images, clear_interface_multimodal]
348
-
349
- def generate_summary_multimodal():
350
- """Enhanced summary generation considering both text and visual content"""
351
- global pdf_text, extracted_images
352
-
353
- if not pdf_text and not extracted_images:
354
- return "❌ Please upload and process a PDF first."
355
-
356
- try:
357
- content_parts = []
358
-
359
- if pdf_text:
360
- content_parts.append(f"Text Content:\n{pdf_text[:2000]}")
361
-
362
- if extracted_images:
363
- visual_summary = "\n".join([img['description'][:200] for img in extracted_images[:3]])
364
- content_parts.append(f"Visual Content:\n{visual_summary}")
365
-
366
- combined_content = "\n\n".join(content_parts)
367
-
368
- prompt = f"""Provide a comprehensive summary of this document that includes both textual and visual elements.
369
- Focus on key findings, main topics, and insights from charts, tables, or images.
370
-
371
- Content: {combined_content}
372
-
373
- Summary:"""
374
-
375
  response = text_client.chat_completion(
376
  messages=[{"role": "user", "content": prompt}],
377
- max_tokens=250,
378
  temperature=0.5
379
  )
380
 
@@ -383,35 +203,25 @@ def generate_summary_multimodal():
383
  except Exception as e:
384
  return f"❌ Error generating summary: {str(e)}"
385
 
386
- def extract_keywords_multimodal():
387
- """Enhanced keyword extraction from both text and visual content"""
388
- global pdf_text, extracted_images
389
-
390
- if not pdf_text and not extracted_images:
391
  return "❌ Please upload and process a PDF first."
392
 
393
  try:
394
- content_parts = []
395
-
396
- if pdf_text:
397
- content_parts.append(f"Text: {pdf_text[:1500]}")
398
-
399
- if extracted_images:
400
- visual_content = "\n".join([img['description'][:150] for img in extracted_images])
401
- content_parts.append(f"Visual Content: {visual_content}")
402
 
403
- combined_content = "\n\n".join(content_parts)
404
-
405
- prompt = f"""Extract key terms, concepts, and topics from this document content.
406
- Include technical terms, important concepts, and themes from both text and visual elements.
407
-
408
- Content: {combined_content}
409
-
410
- Key terms and concepts:"""
411
 
412
  response = text_client.chat_completion(
413
  messages=[{"role": "user", "content": prompt}],
414
- max_tokens=120,
415
  temperature=0.5
416
  )
417
 
@@ -420,45 +230,26 @@ def extract_keywords_multimodal():
420
  except Exception as e:
421
  return f"❌ Error extracting keywords: {str(e)}"
422
 
423
- def show_extracted_images():
424
- """Display information about extracted images"""
425
- global extracted_images
426
-
427
- if not extracted_images:
428
- return "No visual elements extracted from the current document."
429
-
430
- info = f"πŸ“Š Extracted {len(extracted_images)} visual elements:\n\n"
431
- for i, img in enumerate(extracted_images, 1):
432
- element_type = "πŸ“Š Table" if img['type'] == 'table' else "πŸ–ΌοΈ Image"
433
- info += f"{i}. {element_type}: {img['filename']}\n"
434
- info += f" Description: {img['description'][:150]}...\n\n"
435
-
436
- if i >= 5: # Limit display to first 5
437
- remaining = len(extracted_images) - 5
438
- if remaining > 0:
439
- info += f"... and {remaining} more visual elements."
440
- break
441
 
442
- return info
443
-
444
- def clear_interface_multimodal():
445
- """Enhanced clear function for multimodal system"""
446
- global index, retriever, current_pdf_name, pdf_text, extracted_images
 
447
 
 
448
  index = retriever = None
449
- current_pdf_name = pdf_text = None
450
- extracted_images = []
451
-
452
- if os.path.exists(FIGURES_DIR):
453
- for file in os.listdir(FIGURES_DIR):
454
- try:
455
- os.remove(os.path.join(FIGURES_DIR, file))
456
- except:
457
- pass
458
 
459
- return None, "", gr.update(interactive=False), "", "", "", "", ""
460
 
461
- # Enhanced Gradio UI
462
  theme = gr.themes.Soft(primary_hue="indigo", secondary_hue="blue")
463
 
464
  with gr.Blocks(theme=theme, css="""
@@ -467,91 +258,81 @@ with gr.Blocks(theme=theme, css="""
467
  .footer { text-align: center; margin-top: 30px; font-size: 0.8em; color: #666; }
468
  .main-title {
469
  text-align: center;
470
- font-size: 56px;
471
  font-weight: bold;
472
  margin-bottom: 20px;
473
- background: linear-gradient(45deg, #6366f1, #8b5cf6, #ec4899);
474
- -webkit-background-clip: text;
475
- -webkit-text-fill-color: transparent;
476
  }
477
- .feature-badge {
478
- background: linear-gradient(45deg, #10b981, #3b82f6);
479
  color: white;
480
- padding: 4px 12px;
481
- border-radius: 15px;
482
- font-size: 11px;
483
- margin: 2px;
484
  display: inline-block;
 
485
  }
486
  """) as demo:
487
 
488
- gr.Markdown("<div class='main-title'>πŸ€– DocQueryAI Pro</div>")
489
- gr.Markdown("""
490
- <div style='text-align: center; margin-bottom: 25px;'>
491
- <span class='feature-badge'>πŸ” Advanced RAG</span>
492
- <span class='feature-badge'>πŸ–ΌοΈ Vision AI</span>
493
- <span class='feature-badge'>πŸ“Š Table Analysis</span>
494
- <span class='feature-badge'>πŸ“ˆ Chart Understanding</span>
495
- <span class='feature-badge'>🧠 Smart Retrieval</span>
496
- </div>
497
- """)
498
 
499
  with gr.Row():
500
  with gr.Column():
501
- gr.Markdown("## πŸ“„ Document Processing")
502
  pdf_display = gr.Textbox(label="Active Document", interactive=False, elem_classes="pdf-active")
503
- pdf_file = gr.File(file_types=[".pdf"], type="filepath", label="Upload PDF Document")
504
- upload_button = gr.Button("🚀 Process with AI Vision", variant="primary", size="lg")
505
- status_box = gr.Textbox(label="Processing Status", interactive=False, lines=3)
506
 
507
  with gr.Column():
508
- gr.Markdown("## πŸ’¬ Intelligent Q&A")
509
- gr.Markdown("*Ask about any content: text, images, charts, tables, or data visualizations*")
510
  question_input = gr.Textbox(
511
  lines=3,
512
- placeholder="Examples:\nβ€’ What does the chart show?\nβ€’ Summarize the table data\nβ€’ Explain the main findings",
513
- label="Your Question"
514
  )
515
- ask_button = gr.Button("🔍 Get AI Answer", variant="primary", size="lg")
516
- answer_output = gr.Textbox(label="AI Response", lines=8, interactive=False)
517
 
 
518
  with gr.Row():
519
  with gr.Column():
520
  summary_button = gr.Button("📋 Generate Summary", variant="secondary")
521
- summary_output = gr.Textbox(label="Document Summary", lines=5, interactive=False)
522
-
523
  with gr.Column():
524
  keywords_button = gr.Button("🏷️ Extract Keywords", variant="secondary")
525
- keywords_output = gr.Textbox(label="Key Concepts", lines=5, interactive=False)
526
-
527
- with gr.Row():
528
- with gr.Column():
529
- structure_button = gr.Button("🏗️ Analyze Structure", variant="secondary")
530
- structure_output = gr.Textbox(label="Document Structure Analysis", lines=5, interactive=False)
531
-
532
- with gr.Column():
533
- images_button = gr.Button("🖼️ Show Visual Elements", variant="secondary")
534
- images_output = gr.Textbox(label="Extracted Visual Elements", lines=5, interactive=False)
535
-
536
- with gr.Row():
537
- clear_button = gr.Button("🗑️ Clear All", variant="secondary", size="sm")
538
539
  gr.Markdown("""
540
  <div class='footer'>
541
- πŸš€ <strong>Powered by Advanced AI</strong><br>
542
- 🔧 HuggingFace Transformers • LangChain • FAISS • Unstructured<br>
543
- 🎯 Multimodal RAG: Text + Vision + Tables + Charts
544
  </div>
545
  """)
546
 
547
  # Event bindings
548
- upload_button.click(process_pdf_multimodal_advanced, [pdf_file], [pdf_display, status_box, question_input])
549
- ask_button.click(ask_question_multimodal_advanced, [pdf_display, question_input], answer_output)
550
- summary_button.click(generate_summary_multimodal, [], summary_output)
551
- keywords_button.click(extract_keywords_multimodal, [], keywords_output)
552
- structure_button.click(analyze_document_structure, [], structure_output)
553
- images_button.click(show_extracted_images, [], images_output)
554
- clear_button.click(clear_interface_multimodal, [], [pdf_file, pdf_display, question_input, answer_output, summary_output, keywords_output, structure_output, images_output])
555
 
556
  if __name__ == "__main__":
557
  demo.launch(debug=True, share=True)
 
1
  import os
2
  import gradio as gr
3
+ import tempfile
4
+ from pathlib import Path
5
 
6
  # Import vectorstore and embeddings from langchain community package
7
  from langchain_community.vectorstores import FAISS
8
  from langchain_community.embeddings import HuggingFaceEmbeddings
9
  # Text splitter to break large documents into manageable chunks
10
  from langchain.text_splitter import RecursiveCharacterTextSplitter
11
+ # HF Inference client for running multimodal models
12
  from huggingface_hub import InferenceClient
13
+ # Unstructured for PDF processing with image extraction
14
  from unstructured.partition.pdf import partition_pdf
15
  from unstructured.partition.utils.constants import PartitionStrategy
16
 
 
18
  index = None # FAISS index storing document embeddings
19
  retriever = None # Retriever to fetch relevant chunks
20
  current_pdf_name = None # Name of the currently loaded PDF
21
+ extracted_content = None # Combined text and image descriptions
 
22
 
23
+ # ── HF Inference clients ─────────────────────────────────────────────────────
24
+ # Text generation client (using a good open model)
25
  text_client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3")
26
+ # Vision client for image analysis
27
+ vision_client = InferenceClient(model="llava-hf/llava-1.5-7b-hf")
28
 
29
+ # ── Embeddings ───────────────────────────────────────────────────────────────
30
+ # Use BGE embeddings for vectorizing text chunks
31
+ embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")
32
 
33
+ # Create temporary directories for processing
34
+ temp_dir = tempfile.mkdtemp()
35
+ figures_dir = os.path.join(temp_dir, "figures")
36
+ os.makedirs(figures_dir, exist_ok=True)
37
 
38
+ def extract_image_description(image_path):
39
  """
40
+ Analyze an extracted image using vision model to get text description.
41
+ Args:
42
+ image_path: Path to the extracted image file
43
+ Returns:
44
+ Text description of the image content
45
  """
46
  try:
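+ # Assumption: the hosted vision model is queried through the generic image-to-text
+ # (captioning) task, which accepts only the raw image bytes and no instruction prompt.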
+ # Read image and send to vision model
+ with open(image_path, "rb") as img_file:
+ # Use vision client to analyze the image via the image-to-text (captioning) task
+ response = vision_client.image_to_text(
+ img_file.read()
  )
+ # Depending on the huggingface_hub version, the result is a plain string or an
+ # object exposing .generated_text
+ description = getattr(response, "generated_text", response)
+ return f"Image content: {description}"
55
  except Exception as e:
56
+ return f"Image content: [Could not analyze image - {str(e)}]"
 
57
 
58
+ def process_pdf_multimodal(pdf_file):
59
  """
60
+ 1. Extracts text and images from PDF using unstructured
61
+ 2. Analyzes extracted images with vision model
62
+ 3. Combines text and image descriptions
63
+ 4. Creates FAISS index for retrieval
64
+ Args:
65
+ pdf_file: Uploaded PDF file
66
+ Returns:
67
+ - PDF filename, status message, and UI updates
68
  """
69
+ global current_pdf_name, index, retriever, extracted_content
70
 
71
  if pdf_file is None:
72
  return None, "❌ Please upload a PDF file.", gr.update(interactive=False)
73
 
74
  current_pdf_name = os.path.basename(pdf_file.name)
75

76
  try:
77
+ # Clear previous figures
78
+ for file in os.listdir(figures_dir):
79
+ os.remove(os.path.join(figures_dir, file))
80
+
81
+ # Extract elements from PDF including images
82
  elements = partition_pdf(
83
  pdf_file.name,
84
  strategy=PartitionStrategy.HI_RES,
85
  extract_image_block_types=["Image", "Table"],
86
+ extract_image_block_output_dir=figures_dir,
87
+ extract_image_block_to_payload=False
88
  )
89
 
90
+ # Separate text elements
91
  text_elements = []
92
  for element in elements:
93
+ if element.category not in ["Image", "Table"]:
94
+ text_elements.append(element.text)
95
+
96
+ # Process extracted images
97
+ image_descriptions = []
98
+ if os.path.exists(figures_dir):
99
+ for image_file in os.listdir(figures_dir):
100
+ if image_file.lower().endswith(('.png', '.jpg', '.jpeg')):
101
+ image_path = os.path.join(figures_dir, image_file)
102
+ description = extract_image_description(image_path)
103
+ image_descriptions.append(description)
104
+
105
+ # Combine text and image descriptions
106
+ all_content = text_elements + image_descriptions
107
+ extracted_content = "\n\n".join(all_content)
108
+
109
+ # Split into chunks
110
+ text_splitter = RecursiveCharacterTextSplitter(
111
+ chunk_size=1000,
112
+ chunk_overlap=200,
113
+ add_start_index=True
114
+ )
115
+ chunks = text_splitter.split_text(extracted_content)
116
 
117
+ # Create FAISS index
118
+ index = FAISS.from_texts(chunks, embeddings)
119
+ retriever = index.as_retriever(search_kwargs={"k": 3})
120
 
121
+ # Status message
122
+ num_images = len(image_descriptions)
123
+ status = f"βœ… Processed '{current_pdf_name}' β€” {len(chunks)} text chunks, {num_images} images analyzed"
124
 
125
  return current_pdf_name, status, gr.update(interactive=True)
126
 
127
  except Exception as e:
128
+ error_msg = f"❌ Error processing PDF: {str(e)}"
129
  return current_pdf_name, error_msg, gr.update(interactive=False)
130
 
131
+ def ask_multimodal_question(pdf_name, question):
132
  """
133
+ Answer questions using both text and image content from the PDF.
134
+ Args:
135
+ pdf_name: Display name (unused)
136
+ question: User's question
137
+ Returns:
138
+ Generated answer combining text and visual information
139
  """
140
+ global retriever
141
+
142
  if index is None or retriever is None:
143
  return "❌ Please upload and process a PDF first."
144
 
145
  if not question.strip():
146
  return "❌ Please enter a question."
147
+
148
  try:
149
+ # Retrieve relevant chunks (text + image descriptions)
150
  docs = retriever.get_relevant_documents(question)
151
+ context = "\n\n".join(doc.page_content for doc in docs)
152
+
153
+ # Enhanced prompt for multimodal content
154
+ prompt = (
155
+ "You are an AI assistant analyzing a document that contains both text and images. "
156
+ "Use the following content (which includes text excerpts and descriptions of images/charts/tables) "
157
+ "to answer the question comprehensively.\n\n"
158
+ f"Document Content:\n{context}\n\n"
159
+ f"Question: {question}\n\n"
160
+ "Provide a detailed answer based on both the textual information and visual elements described above. "
161
+ "If the answer involves data from charts, tables, or images, mention that explicitly.\n"
162
+ "Answer:"
163
+ )
164
 
165
+ # Generate response
166
  response = text_client.chat_completion(
167
  messages=[{"role": "user", "content": prompt}],
168
+ max_tokens=256,
169
+ temperature=0.5
170
  )
171
 
172
  answer = response["choices"][0]["message"]["content"].strip()
 
175
  except Exception as e:
176
  return f"❌ Error generating answer: {str(e)}"
177
 
178
+ def generate_multimodal_summary():
179
  """
180
+ Generate a summary considering both text and visual elements.
181
  """
182
+ if not extracted_content:
183
  return "❌ Please upload and process a PDF first."
184
 
185
  try:
186
+ # Use first 3000 characters for summary
187
+ content_preview = extracted_content[:3000]
188
 
189
+ prompt = (
190
+ "Provide a comprehensive summary of this document that contains both text and visual elements "
191
+ "(images, charts, tables). Mention key textual information as well as important visual content.\n\n"
192
+ f"{content_preview}..."
 
 
 
 
 
 
193
  )
194

195
  response = text_client.chat_completion(
196
  messages=[{"role": "user", "content": prompt}],
197
+ max_tokens=200,
198
  temperature=0.5
199
  )
200
 
 
203
  except Exception as e:
204
  return f"❌ Error generating summary: {str(e)}"
205
 
206
+ def extract_multimodal_keywords():
207
+ """
208
+ Extract keywords from both text and visual content.
209
+ """
210
+ if not extracted_content:
211
  return "❌ Please upload and process a PDF first."
212
 
213
  try:
214
+ content_preview = extracted_content[:3000]
215
 
216
+ prompt = (
217
+ "Extract 10-15 key terms and concepts from this document that contains both text and visual elements. "
218
+ "Include important terms from both textual content and visual elements like charts, images, and tables.\n\n"
219
+ f"{content_preview}..."
220
+ )
221
 
222
  response = text_client.chat_completion(
223
  messages=[{"role": "user", "content": prompt}],
224
+ max_tokens=100,
225
  temperature=0.5
226
  )
227
 
 
230
  except Exception as e:
231
  return f"❌ Error extracting keywords: {str(e)}"
232
 
233
+ def clear_multimodal_interface():
234
+ """
235
+ Reset all global state and clear UI.
236
+ """
237
+ global index, retriever, current_pdf_name, extracted_content
238
 
239
+ # Clear figures directory
240
+ try:
241
+ for file in os.listdir(figures_dir):
242
+ os.remove(os.path.join(figures_dir, file))
243
+ except:
244
+ pass
245
 
246
+ # Reset globals
247
  index = retriever = None
248
+ current_pdf_name = extracted_content = None
249
 
250
+ return None, "", gr.update(interactive=False)
251
 
252
+ # ── Gradio UI ────────────────────────────────────────────────────────────────
253
  theme = gr.themes.Soft(primary_hue="indigo", secondary_hue="blue")
254
 
255
  with gr.Blocks(theme=theme, css="""
 
258
  .footer { text-align: center; margin-top: 30px; font-size: 0.8em; color: #666; }
259
  .main-title {
260
  text-align: center;
261
+ font-size: 64px;
262
  font-weight: bold;
263
  margin-bottom: 20px;
264
  }
265
+ .multimodal-badge {
266
+ background: linear-gradient(45deg, #6366f1, #8b5cf6);
267
  color: white;
268
+ padding: 5px 15px;
269
+ border-radius: 20px;
270
+ font-size: 14px;
 
271
  display: inline-block;
272
+ margin: 10px auto;
273
  }
274
  """) as demo:
275
 
276
+ # Application title with multimodal badge
277
+ gr.Markdown("<div class='main-title'>MultiModal DocQueryAI</div>")
278
+ gr.Markdown("<div style='text-align: center;'><span class='multimodal-badge'>πŸ–ΌοΈ Text + Images + Charts</span></div>")
 
 
 
 
 
 
 
279
 
280
  with gr.Row():
281
  with gr.Column():
282
+ gr.Markdown("## πŸ“„ Document Input")
283
  pdf_display = gr.Textbox(label="Active Document", interactive=False, elem_classes="pdf-active")
284
+ pdf_file = gr.File(file_types=[".pdf"], type="filepath", label="Upload PDF (with images/charts)")
285
+ upload_button = gr.Button("🔄 Process Document (Extract Text + Images)", variant="primary")
286
+ status_box = gr.Textbox(label="Processing Status", interactive=False)
287
 
288
  with gr.Column():
289
+ gr.Markdown("## ❓ Ask Questions")
290
+ gr.Markdown("*Ask about text content, images, charts, tables, or any visual elements in your PDF*")
291
  question_input = gr.Textbox(
292
  lines=3,
293
+ placeholder="Ask about text, images, charts, or any content in the PDF...",
294
+ interactive=False
295
  )
296
+ ask_button = gr.Button("🔍 Ask Question", variant="primary")
297
+ answer_output = gr.Textbox(label="Answer", lines=8, interactive=False)
298
 
299
+ # Analysis tools
300
  with gr.Row():
301
  with gr.Column():
302
  summary_button = gr.Button("📋 Generate Summary", variant="secondary")
303
+ summary_output = gr.Textbox(label="Document Summary", lines=4, interactive=False)
 
304
  with gr.Column():
305
  keywords_button = gr.Button("🏷️ Extract Keywords", variant="secondary")
306
+ keywords_output = gr.Textbox(label="Key Terms", lines=4, interactive=False)
307
 
308
+ # Clear button
309
+ clear_button = gr.Button("🗑️ Clear All", variant="secondary")
310
+
311
  gr.Markdown("""
312
  <div class='footer'>
313
+ Powered by LangChain + Unstructured + Vision AI + FAISS |
314
+ Supports: Text, Images, Charts, Tables, Diagrams
 
315
  </div>
316
  """)
317
 
318
  # Event bindings
319
+ upload_button.click(
320
+ process_pdf_multimodal,
321
+ [pdf_file],
322
+ [pdf_display, status_box, question_input]
323
+ )
324
+ ask_button.click(
325
+ ask_multimodal_question,
326
+ [pdf_display, question_input],
327
+ answer_output
328
+ )
329
+ summary_button.click(generate_multimodal_summary, [], summary_output)
330
+ keywords_button.click(extract_multimodal_keywords, [], keywords_output)
331
+ clear_button.click(
332
+ clear_multimodal_interface,
333
+ [],
334
+ [pdf_file, pdf_display, question_input]
335
+ )
336
 
337
  if __name__ == "__main__":
338
  demo.launch(debug=True, share=True)