Update app.py

app.py CHANGED
Before (removed lines are prefixed with "-"; unchanged context lines have no prefix; "…" marks text that the page rendering truncated):

@@ -1,59 +1,30 @@
-import gradio as gr
-import base64
 import os
 import re
-
 from PIL import Image
 from huggingface_hub import InferenceClient
 from mistralai import Mistral
-from feifeilib.feifeichat import feifeichat  # Assuming this utility is still relevant or replace with SmartDocAnalyzer logic as needed.

-# Initialize
 client = InferenceClient(api_key=os.getenv('HF_TOKEN'))
 client.headers["x-use-cache"] = "0"
-
 api_key = os.getenv("MISTRAL_API_KEY")
 Mistralclient = Mistral(api_key=api_key)

-#
-…
-additional_inputs=[
-    gr.Checkbox(label="Enable Analyzer Mode", value=True),
-    gr.Dropdown(
-        [
-            "meta-llama/Llama-3.3-70B-Instruct",
-            "CohereForAI/c4ai-command-r-plus-08-2024",
-            "Qwen/Qwen2.5-72B-Instruct",
-            "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF",
-            "NousResearch/Hermes-3-Llama-3.1-8B",
-            "mistralai/Mistral-Nemo-Instruct-2411",
-            "microsoft/phi-4"
-        ],
-        value="mistralai/Mistral-Nemo-Instruct-2411",
-        show_label=False,
-        container=False
-    ),
-    gr.Radio(
-        ["pixtral", "Vision"],
-        value="pixtral",
-        show_label=False,
-        container=False
-    )
-],
-title="SmartDocAnalyzer",
-description="An advanced document analysis tool powered by AI."
-)
-
-SmartDocAnalyzer.launch()

 def encode_image(image_path):
-    """
-    Encode the image at the given path to a base64 JPEG.
-    Resizes image height to 512 pixels while maintaining aspect ratio.
-    """
     try:
         image = Image.open(image_path).convert("RGB")
         base_height = 512
@@ -63,68 +34,109 @@ def encode_image(image_path):
         buffered = BytesIO()
         image.save(buffered, format="JPEG")
         return base64.b64encode(buffered.getvalue()).decode("utf-8")
-    except FileNotFoundError:
-        print(f"Error: The file {image_path} was not found.")
     except Exception as e:
-        print(f"…
-

-def …
     """
-
-    Enhancements for SmartDocAnalyzer context can be added here.
     """
-…
-        "hailing from a family with a long history in photography... [truncated for brevity]"
-    )
-    message_text = message_text.replace("画", "").replace("draw", "")
-    message_text = f"提示词是'{message_text}',根据提示词帮我生成一张高质量照片的一句话英文回复"
-    system_prompt = {"role": "system", "content": feifei_photo}
-    user_input_part = {"role": "user", "content": str(message_text)}
-    return [system_prompt, user_input_part]
-
-    # Default prompt construction for FeiFei character
-    if feifei_select:
-        feifei = (
-            "[Character Name]: Aifeifei (AI Feifei) [Gender]: Female [Age]: 19 years old ... "
-            "[Identity]: User's virtual girlfriend"
-        )
-        system_prompt = {"role": "system", "content": feifei}
-        user_input_part = {"role": "user", "content": str(message_text)}
-
-        pattern = re.compile(r"gradio")
-        if history:
-            history = [item for item in history if not pattern.search(str(item["content"]))]
-            input_prompt = [system_prompt] + history + [user_input_part]
-        else:
-            input_prompt = [system_prompt, user_input_part]
-    else:
-        input_prompt = [{"role": "user", "content": str(message_text)}]

-

-def …
     """
-…
     """
-…
     return

-
     if image_mod == "Vision":
-        messages = [{
-            "role": "user",
-            "content": [
-                {"type": "text", "text": message_text},
-                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
-            ]
-        }]
         stream = client.chat.completions.create(
             model="meta-llama/Llama-3.2-11B-Vision-Instruct",
             messages=messages,
@@ -133,69 +145,68 @@ def feifeiimgprompt(message_files, message_text, image_mod):
         )
         temp = ""
         for chunk in stream:
-            if chunk.choices[0].delta.content is not None:
                 temp += chunk.choices[0].delta.content
                 yield temp
-    # Pixtral mode using Mistral model
     else:
         model = "pixtral-large-2411"
-        messages = [{
-            "role": "user",
-            "content": [
-                {"type": "text", "text": message_text},
-                {"type": "image_url", "image_url": f"data:image/jpeg;base64,{base64_image}"}
-            ]
-        }]
         partial_message = ""
         for chunk in Mistralclient.chat.stream(model=model, messages=messages):
-            if chunk.data.choices[0].delta.content is not None:
                 partial_message += chunk.data.choices[0].delta.content
                 yield partial_message

-def …
     """
-
-    """
-    if additional_dropdown == "mistralai/Mistral-Nemo-Instruct-2411":
-        model = "mistral-large-2411"
-        stream_response = Mistralclient.chat.stream(model=model, messages=input_prompt)
-        partial_message = ""
-        for chunk in stream_response:
-            if chunk.data.choices[0].delta.content is not None:
-                partial_message += chunk.data.choices[0].delta.content
-                yield partial_message
-    else:
-        stream = client.chat.completions.create(
-            model=additional_dropdown,
-            messages=input_prompt,
-            temperature=0.5,
-            max_tokens=1024,
-            top_p=0.7,
-            stream=True
-        )
-        temp = ""
-        for chunk in stream:
-            if chunk.choices[0].delta.content is not None:
-                temp += chunk.choices[0].delta.content
-                yield temp
-
-def feifeichat(message, history, feifei_select, additional_dropdown, image_mod):
-    """
-    Main chat function that decides between image-based and text-based handling.
-    This function can be further enhanced for SmartDocAnalyzer-specific logic.
     """
     message_text = message.get("text", "")
     message_files = message.get("files", [])

     if message_files:
-        # …
     else:
-        # Process text
-…
After (added lines are prefixed with "+"; unchanged context lines have no prefix; "…" marks unchanged lines that the diff view collapsed):

 import os
 import re
+import base64
+import gradio as gr
+import pdfplumber  # For PDF document parsing
+import fitz  # PyMuPDF for advanced PDF handling (alternative to pdfplumber)
+import pytesseract  # OCR for extracting text from images
 from PIL import Image
+from io import BytesIO
+from transformers import pipeline  # For semantic analysis tasks
 from huggingface_hub import InferenceClient
 from mistralai import Mistral

+# Initialize inference clients for different models
 client = InferenceClient(api_key=os.getenv('HF_TOKEN'))
 client.headers["x-use-cache"] = "0"
 api_key = os.getenv("MISTRAL_API_KEY")
 Mistralclient = Mistral(api_key=api_key)

+# Initialize semantic analysis pipelines using transformers (for local tasks)
+# Example: summarization, sentiment-analysis, named-entity-recognition, etc.
+summarizer = pipeline("summarization")
+sentiment_analyzer = pipeline("sentiment-analysis")
+ner_tagger = pipeline("ner")

 def encode_image(image_path):
+    """Resizes and encodes an image to base64."""
     try:
         image = Image.open(image_path).convert("RGB")
         base_height = 512
 … (lines 31-33, the aspect-ratio resize, unchanged)
         buffered = BytesIO()
         image.save(buffered, format="JPEG")
         return base64.b64encode(buffered.getvalue()).decode("utf-8")
     except Exception as e:
+        print(f"Image encoding error: {e}")
+        return None
+
+def extract_text_from_document(file_path):
+    """Extracts text from a PDF or image document."""
+    text = ""
+    # Try PDF parsing with pdfplumber
+    if file_path.lower().endswith(".pdf"):
+        try:
+            with pdfplumber.open(file_path) as pdf:
+                for page in pdf.pages:
+                    text += page.extract_text() + "\n"
+            return text.strip()
+        except Exception as e:
+            print(f"PDF parsing error: {e}")
+
+    # If not PDF or parsing fails, attempt OCR on the first page of an image-based PDF or an image file.
+    try:
+        # Open the file as an image for OCR
+        image = Image.open(file_path)
+        text = pytesseract.image_to_string(image)
+    except Exception as e:
+        print(f"OCR error: {e}")
+    return text.strip()
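Note: two caveats in extract_text_from_document as committed. First, pdfplumber's page.extract_text() returns None for pages without a text layer, so `text += page.extract_text() + "\n"` raises a TypeError on scanned PDFs; writing `page.extract_text() or ""` avoids that. Second, Pillow cannot read PDFs, so the OCR fallback only works for genuine image files; a scanned PDF would need rasterizing first, for example with the already-imported fitz. A hedged sketch of that fallback (first page only; the helper name is illustrative, not from the commit):

def ocr_pdf_first_page(file_path):
    # Rasterize page 1 with PyMuPDF, then OCR it with pytesseract.
    doc = fitz.open(file_path)
    pix = doc[0].get_pixmap()  # defaults to RGB without alpha
    img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
    return pytesseract.image_to_string(img)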

+def perform_semantic_analysis(text, analysis_type):
+    """Applies semantic analysis tasks to the provided text."""
+    if analysis_type == "Summarization":
+        return summarizer(text, max_length=150, min_length=40, do_sample=False)[0]['summary_text']
+    elif analysis_type == "Sentiment Analysis":
+        return sentiment_analyzer(text)[0]
+    elif analysis_type == "Named Entity Recognition":
+        return ner_tagger(text)
+    # Add more analysis types as needed
+    return text
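Note: the three bare pipeline(...) calls above download whichever default checkpoints the installed transformers version ships, at module import time, which slows cold starts and makes results version-dependent. Pinning models is one option; the checkpoints below are common choices, named here as assumptions rather than taken from the commit. For long documents the summarizer will also exceed the model's input limit unless truncation is enabled (truncation=True).

summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
ner_tagger = pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple")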

+def process_text_input(message_text, history, model_choice, analysis_type):
     """
+    Process text-based inputs using selected model and apply semantic analysis if requested.
     """
+    # Optionally perform semantic analysis before sending to the model
+    if analysis_type and analysis_type != "None":
+        analysis_result = perform_semantic_analysis(message_text, analysis_type)
+        # Incorporate analysis_result into prompt or display separately
+        message_text += f"\n\n[Analysis Result]: {analysis_result}"
+
+    # Construct a prompt for model inference
+    input_prompt = [{"role": "user", "content": message_text}]
+
+    if model_choice == "mistralai/Mistral-Nemo-Instruct-2411":
+        model = "mistral-large-2411"
+        stream_response = Mistralclient.chat.stream(model=model, messages=input_prompt)
+        for chunk in stream_response:
+            if chunk.data.choices[0].delta.content:
+                yield chunk.data.choices[0].delta.content
+    else:
+        stream = client.chat.completions.create(
+            model=model_choice,
+            messages=input_prompt,
+            temperature=0.5,
+            max_tokens=1024,
+            top_p=0.7,
+            stream=True
+        )
+        temp = ""
+        for chunk in stream:
+            if chunk.choices[0].delta.content:
+                temp += chunk.choices[0].delta.content
+                yield temp
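Note: the two branches stream differently. The Hugging Face branch accumulates deltas and yields the growing string, while the Mistral branch yields each raw delta on its own; gr.ChatInterface replaces the displayed message with every yielded value, so the Mistral branch would render only the latest fragment rather than growing text (the history argument is also accepted but never used). An accumulating variant of the Mistral loop, matching the pattern used elsewhere in this file:

        partial_message = ""
        for chunk in stream_response:
            if chunk.data.choices[0].delta.content:
                partial_message += chunk.data.choices[0].delta.content
                yield partial_message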

+def process_image_input(image_file, message_text, image_mod, model_choice, analysis_type):
     """
+    Process image-based inputs using selected model and mode.
+    Applies OCR if needed and semantic analysis.
     """
+    # Save uploaded image temporarily to extract text if necessary
+    temp_image_path = "temp_upload.jpg"
+    image_file.save(temp_image_path)
+
+    # Extract text from document/image using OCR if needed
+    extracted_text = extract_text_from_document(temp_image_path)
+    if extracted_text:
+        message_text += f"\n\n[Extracted Text]: {extracted_text}"
+        # Optionally perform semantic analysis on the extracted text
+        if analysis_type and analysis_type != "None":
+            analysis_result = perform_semantic_analysis(extracted_text, analysis_type)
+            message_text += f"\n\n[Analysis Result]: {analysis_result}"
+
+    base64_image = encode_image(temp_image_path)
+    if not base64_image:
+        yield "Failed to process image."
         return

+    messages = [{
+        "role": "user",
+        "content": [
+            {"type": "text", "text": message_text},
+            {"type": "image_url", "image_url": f"data:image/jpeg;base64,{base64_image}"}
+        ]
+    }]
+
     if image_mod == "Vision":
         stream = client.chat.completions.create(
             model="meta-llama/Llama-3.2-11B-Vision-Instruct",
             messages=messages,
 … (lines 143-144 unchanged)
         )
         temp = ""
         for chunk in stream:
+            if chunk.choices[0].delta.content:
                 temp += chunk.choices[0].delta.content
                 yield temp
     else:
         model = "pixtral-large-2411"
         partial_message = ""
         for chunk in Mistralclient.chat.stream(model=model, messages=messages):
+            if chunk.data.choices[0].delta.content:
                 partial_message += chunk.data.choices[0].delta.content
                 yield partial_message
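Note: image_file comes straight from message["files"], and with gr.ChatInterface(type="messages", multimodal=True) Gradio typically delivers uploads as file path strings, which have no .save() method; the code as written assumes a PIL-like object. A path-tolerant version of the temp-file step, offered as a sketch under that assumption (helper name illustrative):

def save_upload_as_jpeg(upload, dst="temp_upload.jpg"):
    # Accept either a file path from Gradio or an object with .save().
    if isinstance(upload, str):
        Image.open(upload).convert("RGB").save(dst, format="JPEG")
    else:
        upload.save(dst)
    return dst

Also worth noting: the single messages payload uses Mistral's flat "image_url": "<data-URI>" shape for both branches, while the previous Vision-branch code wrapped it as {"url": ...}, the shape the Hugging Face chat-completions endpoint expects, so the Vision branch may need its own payload.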

+def multimodal_response(message, history, analyzer_mode, model_choice, image_mod, analysis_type):
     """
+    Main response function that handles text and image inputs, applies parsing, OCR, and semantic analysis.
     """
     message_text = message.get("text", "")
     message_files = message.get("files", [])

     if message_files:
+        # If an image/document is uploaded, process it
+        image_file = message_files[0]
+        yield from process_image_input(image_file, message_text, image_mod, model_choice, analysis_type)
     else:
+        # Process plain text inputs
+        yield from process_text_input(message_text, history, model_choice, analysis_type)
+
+# Set up the Gradio interface with additional user customization options
+MultiModalAnalyzer = gr.ChatInterface(
+    fn=multimodal_response,
+    type="messages",
+    multimodal=True,
+    additional_inputs=[
+        gr.Checkbox(label="Enable Analyzer Mode", value=True),
+        gr.Dropdown(
+            choices=[
+                "meta-llama/Llama-3.3-70B-Instruct",
+                "CohereForAI/c4ai-command-r-plus-08-2024",
+                "Qwen/Qwen2.5-72B-Instruct",
+                "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF",
+                "NousResearch/Hermes-3-Llama-3.1-8B",
+                "mistralai/Mistral-Nemo-Instruct-2411",
+                "microsoft/phi-4"
+            ],
+            value="mistralai/Mistral-Nemo-Instruct-2411",
+            show_label=False,
+            container=False
+        ),
+        gr.Radio(
+            choices=["pixtral", "Vision"],
+            value="pixtral",
+            show_label=False,
+            container=False
+        ),
+        gr.Dropdown(
+            choices=["None", "Summarization", "Sentiment Analysis", "Named Entity Recognition"],
+            value="None",
+            label="Select Analysis Type",
+            container=False
+        )
+    ],
+    title="MultiModal Analyzer",
+    description="Upload documents or images, select a model and analysis type to interact with your content."
+)
+
+MultiModalAnalyzer.launch()
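Note: the order of additional_inputs must match the extra parameters of multimodal_response: the checkbox maps to analyzer_mode (accepted but currently unused), the first dropdown to model_choice, the radio to image_mod, and the last dropdown to analysis_type. Running the Space also requires HF_TOKEN and MISTRAL_API_KEY to be set in the environment, since both clients read their keys via os.getenv.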