Update app.py

app.py CHANGED
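This commit compacts app.py: imports are regrouped, HuggingFaceEmbeddings moves to the langchain_huggingface package, question answering switches from the deprecated retriever.get_relevant_documents(...) to retriever.invoke(...), prompts are inlined as string literals, the per-property CSS rules are collapsed to one line per class, and the Gradio event wiring is condensed to single-line .click(...) calls.

Old version (lines removed in this commit are marked "-"; lines whose removed content could not be recovered are shown as a bare "-"):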
@@ -1,341 +1,198 @@
import os
- import gradio as gr
import tempfile
from pathlib import Path
import base64
- import fitz  # PyMuPDF
from PIL import Image
import io

-
from langchain_community.vectorstores import FAISS
- from langchain_community.embeddings import HuggingFaceEmbeddings
- # Text splitter to break large documents into manageable chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter
- # HF Inference client for multimodal model
- from huggingface_hub import InferenceClient

# ── Globals ───────────────────────────────────────────────────────────────────
- index = None
- retriever = None
- current_pdf_name = None
- extracted_content = None
- extracted_images = []

# ── Single Multimodal Model ──────────────────────────────────────────────────
- # Using a single multimodal model that can handle both text and images
multimodal_client = InferenceClient(model="microsoft/Phi-3.5-vision-instruct")
-
- # ── Multimodal Embeddings ────────────────────────────────────────────────────
- # Using CLIP-based embeddings that can handle both text and images
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/clip-ViT-B-32")

- # Create temp dirs
temp_dir = tempfile.mkdtemp()
figures_dir = os.path.join(temp_dir, "figures")
os.makedirs(figures_dir, exist_ok=True)

def encode_image_to_base64(image_path):
-     """Convert image to base64 for API calls"""
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')

def extract_images_from_pdf_pymupdf(pdf_path):
-     """
-     Extract images from PDF using PyMuPDF (works on HF Spaces)
-     Args:
-         pdf_path: Path to the PDF file
-     Returns:
-         List of image paths and their descriptions
-     """
    extracted_images = []
    image_descriptions = []
-
    try:
-         # Open PDF with PyMuPDF
        pdf_document = fitz.open(pdf_path)
-
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
-
-             image_list = page.get_images()
-             for img_index, img in enumerate(image_list):
-                 # Get image data
                xref = img[0]
                pix = fitz.Pixmap(pdf_document, xref)
-
-                 # Convert to PIL Image
-                 if pix.n - pix.alpha < 4:  # GRAY or RGB
                    img_data = pix.tobytes("png")
                    img_pil = Image.open(io.BytesIO(img_data))
-
-                     # Save image
                    image_filename = f"page_{page_num}_img_{img_index}.png"
                    image_path = os.path.join(figures_dir, image_filename)
                    img_pil.save(image_path)
-
-                     # Analyze image with multimodal model
-                     description = analyze_image_with_multimodal_model(image_path)
-
                    extracted_images.append(image_path)
-                     image_descriptions.append(description)
-
-                 pix = None  # Free memory
-
        pdf_document.close()
        return extracted_images, image_descriptions
-
    except Exception as e:
        print(f"Error extracting images: {e}")
        return [], []

def analyze_image_with_multimodal_model(image_path):
-     """
-     Analyze an extracted image using the multimodal model.
-     Args:
-         image_path: Path to the extracted image file
-     Returns:
-         Text description of the image content
-     """
    try:
-         image_base64 = encode_image_to_base64(image_path)
-
-         prompt = f"""Analyze this image and provide a detailed description.
- Include any text, data, charts, diagrams, tables, or important visual elements you can see.
-
- Image: [Image data provided]
-
- Description:"""
-
-         # Use multimodal model for image analysis
-         # Note: Simplified for HF Spaces compatibility
-         response = multimodal_client.text_generation(
-             prompt=prompt,
-             max_new_tokens=200,
-             temperature=0.3
        )
-
-         return f"[IMAGE CONTENT]: {response.strip()}"
-
    except Exception as e:
-         return f"[IMAGE CONTENT]: Could not analyze image - {e}"

def process_pdf_multimodal(pdf_file):
-     """
-     Process PDF using PyMuPDF (HF Spaces compatible).
-     """
    global current_pdf_name, index, retriever, extracted_content, extracted_images
-
    if pdf_file is None:
        return None, "❌ Please upload a PDF file.", gr.update(interactive=False)

    current_pdf_name = os.path.basename(pdf_file.name)
-
    try:
-         # Clear previous figures
-         extracted_images.clear()
-         for file in os.listdir(figures_dir):
-             os.remove(os.path.join(figures_dir, file))
-
-         # Extract text using PyMuPDF
        pdf_document = fitz.open(pdf_file.name)
        text_elements = []
-
-         for page_num in range(len(pdf_document)):
-             page = pdf_document.load_page(page_num)
-             text = page.get_text()
-             if text.strip():
-                 text_elements.append(f"[PAGE {page_num + 1}]\n{text.strip()}")
-
        pdf_document.close()
-
-         # Extract and analyze images
-         image_paths, image_descriptions = extract_images_from_pdf_pymupdf(pdf_file.name)
-         extracted_images.extend(image_paths)
-
-         # Combine all content
-         all_content = text_elements + image_descriptions
        extracted_content = "\n\n".join(all_content)
-
-
-
-
-
-         text_splitter = RecursiveCharacterTextSplitter(
-             chunk_size=1000,
-             chunk_overlap=200,
-             add_start_index=True
        )
-         chunks = text_splitter.split_text(extracted_content)
-
-         # Create FAISS index with multimodal embeddings
        index = FAISS.from_texts(chunks, embeddings)
        retriever = index.as_retriever(search_kwargs={"k": 3})
-
-
-
-
-
-
        return current_pdf_name, status, gr.update(interactive=True)
-
    except Exception as e:
-         error_msg = f"❌ Error processing PDF: {e}"
-         return current_pdf_name, error_msg, gr.update(interactive=False)

def ask_multimodal_question(pdf_name, question):
-     """
-     Answer questions about the document using the multimodal model.
-     """
-     global retriever, extracted_images
-
-     if index is None or retriever is None:
        return "❌ Please upload and process a PDF first."
-
    if not question.strip():
        return "❌ Please enter a question."
-
-     try:
-         # Retrieve relevant chunks
-         docs = retriever.get_relevant_documents(question)
-         context = "\n\n".join(doc.page_content for doc in docs)
-
-         # Create prompt for text generation
-         prompt = f"""You are an AI assistant analyzing a document that contains both text and visual elements.
-
- RETRIEVED CONTEXT:
- {context}
-
- QUESTION: {question}

- Please provide a comprehensive answer based on the retrieved context above.
- If you reference visual elements, mention them explicitly.
-
- ANSWER:"""
-
-         response = multimodal_client.text_generation(
-             prompt=prompt,
-             max_new_tokens=300,
-             temperature=0.5
        )
-
-         return response.strip()
-
    except Exception as e:
-         return f"❌ Error generating answer: {e}"

def generate_multimodal_summary():
-     """
-     Generate summary using the multimodal model.
-     """
    if not extracted_content:
        return "❌ Please upload and process a PDF first."
-
    try:
-
-         content_preview = extracted_content[:4000]
-
        messages = [
-             {
-                 "role": "user",
-                 "content": [
-                     {
-                         "type": "text",
-                         "text": f"""Please provide a comprehensive summary of this document content. The content includes both textual information and descriptions of visual elements (images, charts, tables, diagrams).
-
- DOCUMENT CONTENT:
- {content_preview}
-
- Create a well-structured summary that captures:
- 1. Main topics and key points from the text
- 2. Important information from visual elements (charts, images, tables)
- 3. Overall document purpose and conclusions
-
- SUMMARY:"""
-                     }
-                 ]
-             }
        ]
-
-         response = multimodal_client.chat_completion(
-             messages=messages,
-             max_tokens=250,
-             temperature=0.3
        )
-
-         return response["choices"][0]["message"]["content"].strip()
-
    except Exception as e:
-         return f"❌ Error generating summary: {e}"

def extract_multimodal_keywords():
-     """
-     Extract keywords using the multimodal model.
-     """
    if not extracted_content:
        return "❌ Please upload and process a PDF first."
-
    try:
-         content_preview = extracted_content[:3000]
-
        messages = [
-             {
-                 "role": "user",
-                 "content": [
-                     {
-                         "type": "text",
-                         "text": f"""Analyze the following document content and extract 12-15 key terms, concepts, and important phrases. The content includes both text and descriptions of visual elements.
-
- DOCUMENT CONTENT:
- {content_preview}
-
- Extract key terms that represent:
- - Main topics and concepts
- - Important technical terms
- - Key findings or data points
- - Visual elements mentioned (chart types, image subjects)
-
- Format as a comma-separated list.
-
- KEY TERMS:"""
-                     }
-                 ]
-             }
        ]
-
-         response = multimodal_client.chat_completion(
-             messages=messages,
-             max_tokens=120,
-             temperature=0.3
        )
-
-         return response["choices"][0]["message"]["content"].strip()
-
    except Exception as e:
-         return f"❌ Error extracting keywords: {e}"

def clear_multimodal_interface():
-     """
-     Reset all global state and clear UI.
-     """
    global index, retriever, current_pdf_name, extracted_content, extracted_images
-
-     try:
-         for file in os.listdir(figures_dir):
-             os.remove(os.path.join(figures_dir, file))
-     except:
-         pass
-
-     # Reset globals
    index = retriever = None
    current_pdf_name = extracted_content = None
    extracted_images.clear()
-
    return None, "", gr.update(interactive=False)

# ── Gradio UI ────────────────────────────────────────────────────────────────
@@ -345,37 +202,12 @@ with gr.Blocks(theme=theme, css="""
.container { border-radius: 10px; padding: 15px; }
.pdf-active { border-left: 3px solid #6366f1; padding-left: 10px; background-color: rgba(99,102,241,0.1); }
.footer { text-align: center; margin-top: 30px; font-size: 0.8em; color: #666; }
- .main-title {
-     text-align: center;
-     font-size: 64px;
-     font-weight: bold;
-     margin-bottom: 20px;
- }
- .multimodal-badge {
-     background: linear-gradient(45deg, #6366f1, #8b5cf6);
-     color: white;
-     padding: 5px 15px;
-     border-radius: 20px;
-     font-size: 14px;
-     display: inline-block;
-     margin: 10px auto;
- }
- .model-info {
-     background: #f8fafc;
-     border: 1px solid #e2e8f0;
-     border-radius: 8px;
-     padding: 10px;
-     margin: 10px 0;
-     font-size: 12px;
-     color: #64748b;
- }
""") as demo:
-
- # Application title with multimodal badge
gr.Markdown("<div class='main-title'>Unified MultiModal RAG</div>")
- gr.Markdown("<div style='text-align:center;'><span class='multimodal-badge'>🧠 Single Model • Text + Vision</span></div>")
-
- # Model information
gr.Markdown("""
<div class='model-info'>
<strong>🤖 Powered by:</strong> Microsoft Phi-3.5-Vision + CLIP Embeddings + PyMuPDF (HF Spaces Compatible)
@@ -389,19 +221,12 @@ with gr.Blocks(theme=theme, css="""
pdf_file = gr.File(file_types=[".pdf"], type="filepath", label="Upload PDF (with images/charts)")
upload_button = gr.Button("🚀 Process with Multimodal AI", variant="primary")
status_box = gr.Textbox(label="Processing Status", interactive=False)
-
with gr.Column():
    gr.Markdown("## ❓ Ask Questions")
-     gr.
-     question_input = gr.Textbox(
-         lines=3,
-         placeholder="Ask about text content, images, charts, tables, or any visual elements...",
-         interactive=False
-     )
    ask_button = gr.Button("🔍 Ask Multimodal AI", variant="primary")
    answer_output = gr.Textbox(label="AI Response", lines=8, interactive=False)

- # Analysis tools
with gr.Row():
    with gr.Column():
        summary_button = gr.Button("📋 Generate Summary", variant="secondary")
@@ -410,34 +235,18 @@ with gr.Blocks(theme=theme, css="""
keywords_button = gr.Button("🏷️ Extract Keywords", variant="secondary")
keywords_output = gr.Textbox(label="Key Terms", lines=4, interactive=False)

- # Clear button
clear_button = gr.Button("🗑️ Clear All", variant="secondary")
-
gr.Markdown("""
<div class='footer'>
- <strong>Unified Multimodal Pipeline:</strong> One model handles text and images<br>
- Supports: Text • Images • Charts • Tables • Diagrams • Mixed Content Queries
</div>
""")

-
- upload_button.click(
-     process_pdf_multimodal,
-     [pdf_file],
-     [pdf_display, status_box, question_input]
- )
- ask_button.click(
-     ask_multimodal_question,
-     [pdf_display, question_input],
-     answer_output
- )
summary_button.click(generate_multimodal_summary, [], summary_output)
keywords_button.click(extract_multimodal_keywords, [], keywords_output)
- clear_button.click(
-     clear_multimodal_interface,
-     [],
-     [pdf_file, pdf_display, question_input]
- )

if __name__ == "__main__":
-     demo.launch(debug=True)
New version (lines added in this commit are marked "+"):

+ # app.py
import os
import tempfile
from pathlib import Path
import base64
+ import fitz  # PyMuPDF
from PIL import Image
import io

+ import gradio as gr
+ from huggingface_hub import InferenceClient
+
+ # Import vectorstore and embeddings from updated packages
from langchain_community.vectorstores import FAISS
+ from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

# ── Globals ───────────────────────────────────────────────────────────────────
+ index = None
+ retriever = None
+ current_pdf_name = None
+ extracted_content = None
+ extracted_images = []

# ── Single Multimodal Model ──────────────────────────────────────────────────
multimodal_client = InferenceClient(model="microsoft/Phi-3.5-vision-instruct")
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/clip-ViT-B-32")

+ # Create temp dirs
temp_dir = tempfile.mkdtemp()
figures_dir = os.path.join(temp_dir, "figures")
os.makedirs(figures_dir, exist_ok=True)
def encode_image_to_base64(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')

def extract_images_from_pdf_pymupdf(pdf_path):
    extracted_images = []
    image_descriptions = []
    try:
        pdf_document = fitz.open(pdf_path)
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
+             for img_index, img in enumerate(page.get_images()):
                xref = img[0]
                pix = fitz.Pixmap(pdf_document, xref)
+                 if pix.n - pix.alpha < 4:
                    img_data = pix.tobytes("png")
                    img_pil = Image.open(io.BytesIO(img_data))
                    image_filename = f"page_{page_num}_img_{img_index}.png"
                    image_path = os.path.join(figures_dir, image_filename)
                    img_pil.save(image_path)
+                     desc = analyze_image_with_multimodal_model(image_path)
                    extracted_images.append(image_path)
+                     image_descriptions.append(desc)
+                 pix = None
        pdf_document.close()
        return extracted_images, image_descriptions
    except Exception as e:
        print(f"Error extracting images: {e}")
        return [], []

def analyze_image_with_multimodal_model(image_path):
    try:
+         b64 = encode_image_to_base64(image_path)
+         prompt = (
+             "Analyze this image and provide a detailed description. Include any text, data, "
+             "charts, diagrams, tables, or important visual elements you can see.\n"
+             "Image: [Image data provided]\nDescription:"
        )
+         resp = multimodal_client.text_generation(
+             prompt=prompt, max_new_tokens=200, temperature=0.3
+         )
+         return "[IMAGE CONTENT]: " + resp.strip()
    except Exception as e:
+         return f"[IMAGE CONTENT]: Could not analyze image - {e}"

def process_pdf_multimodal(pdf_file):
    global current_pdf_name, index, retriever, extracted_content, extracted_images
    if pdf_file is None:
        return None, "❌ Please upload a PDF file.", gr.update(interactive=False)

    current_pdf_name = os.path.basename(pdf_file.name)
+     extracted_images.clear()
+     for f in os.listdir(figures_dir):
+         os.remove(os.path.join(figures_dir, f))
+
    try:
+         # Text extraction
        pdf_document = fitz.open(pdf_file.name)
        text_elements = []
+         for i in range(len(pdf_document)):
+             p = pdf_document.load_page(i)
+             t = p.get_text().strip()
+             if t:
+                 text_elements.append(f"[PAGE {i+1}]\n{t}")
        pdf_document.close()
+
+         # Image extraction & analysis
+         imgs, img_descs = extract_images_from_pdf_pymupdf(pdf_file.name)
+         extracted_images.extend(imgs)
+
+         # Combine content and split
+         all_content = text_elements + img_descs
        extracted_content = "\n\n".join(all_content)
+         if not extracted_content:
+             return current_pdf_name, "❌ No content extracted.", gr.update(interactive=False)
+
+         splitter = RecursiveCharacterTextSplitter(
+             chunk_size=1000, chunk_overlap=200, add_start_index=True
        )
+         chunks = splitter.split_text(extracted_content)
+
        index = FAISS.from_texts(chunks, embeddings)
        retriever = index.as_retriever(search_kwargs={"k": 3})
+
+         status = (
+             f"✅ Processed '{current_pdf_name}' → "
+             f"{len(chunks)} chunks "
+             f"({len(text_elements)} pages, {len(img_descs)} images analyzed)"
+         )
        return current_pdf_name, status, gr.update(interactive=True)
+
    except Exception as e:
+         return current_pdf_name, f"❌ Error processing PDF: {e}", gr.update(interactive=False)

def ask_multimodal_question(pdf_name, question):
+     global retriever
+     if not retriever:
        return "❌ Please upload and process a PDF first."
    if not question.strip():
        return "❌ Please enter a question."

+     try:
+         docs = retriever.invoke(question)
+         context = "\n\n".join(d.page_content for d in docs)
+         prompt = (
+             "You are an AI assistant analyzing a document that contains both text and visual elements.\n\n"
+             f"RETRIEVED CONTEXT:\n{context}\n\n"
+             f"QUESTION: {question}\n"
+             "Please provide a comprehensive answer based on the retrieved context above. "
+             "If you reference visual elements, mention them explicitly.\nANSWER:"
+         )
+         resp = multimodal_client.text_generation(
+             prompt=prompt, max_new_tokens=300, temperature=0.5
        )
+         return resp.strip()
    except Exception as e:
+         return f"❌ Error generating answer: {e}"

def generate_multimodal_summary():
    if not extracted_content:
        return "❌ Please upload and process a PDF first."
    try:
+         preview = extracted_content[:4000]
        messages = [
+             {"role": "user", "content": [{"type": "text", "text":
+                 "Please provide a comprehensive summary of this document content. The content includes both textual "
+                 f"information and descriptions of visual elements.\n\nDOCUMENT CONTENT:\n{preview}\n\nSUMMARY:"
+             }]}
        ]
+         resp = multimodal_client.chat_completion(
+             messages=messages, max_tokens=250, temperature=0.3
        )
+         return resp["choices"][0]["message"]["content"].strip()
    except Exception as e:
+         return f"❌ Error generating summary: {e}"

def extract_multimodal_keywords():
    if not extracted_content:
        return "❌ Please upload and process a PDF first."
    try:
+         preview = extracted_content[:3000]
        messages = [
+             {"role": "user", "content": [{"type": "text", "text":
+                 "Analyze the following document content and extract 12-15 key terms, concepts, and important phrases. "
+                 f"DOCUMENT CONTENT:\n{preview}\n\nKEY TERMS:"
+             }]}
        ]
+         resp = multimodal_client.chat_completion(
+             messages=messages, max_tokens=120, temperature=0.3
        )
+         return resp["choices"][0]["message"]["content"].strip()
    except Exception as e:
+         return f"❌ Error extracting keywords: {e}"

def clear_multimodal_interface():
    global index, retriever, current_pdf_name, extracted_content, extracted_images
+     for f in os.listdir(figures_dir):
+         try: os.remove(os.path.join(figures_dir, f))
+         except: pass
    index = retriever = None
    current_pdf_name = extracted_content = None
    extracted_images.clear()
    return None, "", gr.update(interactive=False)

# ── Gradio UI ────────────────────────────────────────────────────────────────
@@ -345,37 +202,12 @@ with gr.Blocks(theme=theme, css="""
.container { border-radius: 10px; padding: 15px; }
.pdf-active { border-left: 3px solid #6366f1; padding-left: 10px; background-color: rgba(99,102,241,0.1); }
.footer { text-align: center; margin-top: 30px; font-size: 0.8em; color: #666; }
+ .main-title { text-align: center; font-size: 64px; font-weight: bold; margin-bottom: 20px; }
+ .multimodal-badge { background: linear-gradient(45deg, #6366f1, #8b5cf6); color: white; padding: 5px 15px; border-radius: 20px; font-size: 14px; display: inline-block; margin: 10px auto; }
+ .model-info { background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 8px; padding: 10px; margin: 10px 0; font-size: 12px; color: #64748b; }
""") as demo:
gr.Markdown("<div class='main-title'>Unified MultiModal RAG</div>")
+ gr.Markdown("<div style='text-align:center;'><span class='multimodal-badge'>🧠 Single Model • Text + Vision</span></div>")
gr.Markdown("""
<div class='model-info'>
<strong>🤖 Powered by:</strong> Microsoft Phi-3.5-Vision + CLIP Embeddings + PyMuPDF (HF Spaces Compatible)
@@ -389,19 +221,12 @@ with gr.Blocks(theme=theme, css="""
pdf_file = gr.File(file_types=[".pdf"], type="filepath", label="Upload PDF (with images/charts)")
upload_button = gr.Button("🚀 Process with Multimodal AI", variant="primary")
status_box = gr.Textbox(label="Processing Status", interactive=False)
with gr.Column():
    gr.Markdown("## ❓ Ask Questions")
+     question_input = gr.Textbox(lines=3, placeholder="Ask about text or visual content...", interactive=False)
    ask_button = gr.Button("🔍 Ask Multimodal AI", variant="primary")
    answer_output = gr.Textbox(label="AI Response", lines=8, interactive=False)

with gr.Row():
    with gr.Column():
        summary_button = gr.Button("📋 Generate Summary", variant="secondary")
@@ -410,34 +235,18 @@ with gr.Blocks(theme=theme, css="""
keywords_button = gr.Button("🏷️ Extract Keywords", variant="secondary")
keywords_output = gr.Textbox(label="Key Terms", lines=4, interactive=False)

clear_button = gr.Button("🗑️ Clear All", variant="secondary")
gr.Markdown("""
<div class='footer'>
+ <strong>Unified Multimodal Pipeline:</strong> One model handles text, images, charts, tables, diagrams, and mixed content queries
</div>
""")

+ upload_button.click(process_pdf_multimodal, [pdf_file], [pdf_display, status_box, question_input])
+ ask_button.click(ask_multimodal_question, [pdf_display, question_input], answer_output)
summary_button.click(generate_multimodal_summary, [], summary_output)
keywords_button.click(extract_multimodal_keywords, [], keywords_output)
+ clear_button.click(clear_multimodal_interface, [], [pdf_file, pdf_display, question_input])

if __name__ == "__main__":
+     demo.launch(debug=True)
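Two minimal sketches of the APIs this commit moves to. Both are illustrative, not part of the Space: the sample chunk texts, the query, and the shortened prompt below are placeholders.

# Sketch 1: the retrieval path after this commit, assuming the
# langchain-huggingface, langchain-community, faiss-cpu, and
# sentence-transformers packages are installed.
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/clip-ViT-B-32")
index = FAISS.from_texts(
    ["[PAGE 1]\nRevenue grew 12% year over year.",          # placeholder page text
     "[IMAGE CONTENT]: a bar chart comparing quarterly revenue"],  # placeholder image description
    embeddings,
)
retriever = index.as_retriever(search_kwargs={"k": 3})

# retriever.invoke(...) is the Runnable-style call that replaces the
# deprecated retriever.get_relevant_documents(...) used before this commit.
docs = retriever.invoke("What does the chart show?")
print([d.page_content for d in docs])

# Sketch 2: the chat call shape used by the summary and keyword helpers.
# InferenceClient.chat_completion takes OpenAI-style messages; its response
# object also supports dict-style indexing, which is what the app's
# resp["choices"][0]["message"]["content"] access relies on.
# (Serverless Inference API; an HF token may be required.)
from huggingface_hub import InferenceClient

client = InferenceClient(model="microsoft/Phi-3.5-vision-instruct")
resp = client.chat_completion(
    messages=[{"role": "user", "content": [{"type": "text", "text": "Summarize: ..."}]}],
    max_tokens=250,
    temperature=0.3,
)
print(resp["choices"][0]["message"]["content"].strip())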