# Hugging Face Space: mgbam/SmartDocAnalyzer (status: Sleeping)
# File size: 8,180 bytes

import base64
import os
import re
import tempfile
from io import BytesIO

import fitz  # PyMuPDF for advanced PDF handling (alternative to pdfplumber)
import gradio as gr
import pdfplumber  # For PDF document parsing
import pytesseract  # OCR for extracting text from images
from huggingface_hub import InferenceClient
from mistralai import Mistral
from PIL import Image
from transformers import pipeline  # For semantic analysis tasks

# --- Remote inference clients ---------------------------------------------
# Hugging Face Inference client, authenticated with the HF_TOKEN env variable.
client = InferenceClient(api_key=os.environ.get('HF_TOKEN'))
# NOTE(review): presumably asks the HF endpoint to bypass its response cache —
# confirm against the Inference API documentation.
client.headers.update({"x-use-cache": "0"})

# Mistral client, authenticated with the MISTRAL_API_KEY env variable.
api_key = os.environ.get("MISTRAL_API_KEY")
Mistralclient = Mistral(api_key=api_key)

# --- Local transformers pipelines -------------------------------------------
# One pipeline per analysis type offered in the UI; each is created once at
# import time with the library's default model for the task.
summarizer = pipeline(task="summarization")
sentiment_analyzer = pipeline(task="sentiment-analysis")
ner_tagger = pipeline(task="ner")

def encode_image(image_path, base_height=512):
    """Resize an image to a fixed height and return it as base64-encoded JPEG.

    The image is converted to RGB, scaled to ``base_height`` pixels high while
    keeping its aspect ratio, re-encoded as JPEG and base64-encoded.

    Args:
        image_path: Path (or file object) accepted by ``PIL.Image.open``.
        base_height: Target height in pixels. Defaults to 512.

    Returns:
        The base64 string (UTF-8 decoded), or ``None`` if the image could not
        be opened, resized or encoded. Errors are printed, never raised.
    """
    try:
        # The context manager releases the underlying file handle as soon as
        # the pixel data has been copied into the converted image.
        with Image.open(image_path) as source:
            image = source.convert("RGB")

        width, height = image.size
        h_percent = base_height / float(height)
        # Clamp to 1 px: a very tall, narrow image would otherwise scale to a
        # zero width, which resize() rejects.
        w_size = max(1, int(width * h_percent))
        image = image.resize((w_size, base_height), Image.LANCZOS)

        buffered = BytesIO()
        image.save(buffered, format="JPEG")
        return base64.b64encode(buffered.getvalue()).decode("utf-8")
    except Exception as e:
        # Best-effort helper: callers check for None instead of catching.
        print(f"Image encoding error: {e}")
        return None

def extract_text_from_document(file_path):
    """Extract text from a PDF (text layer) or from an image (OCR).

    Files whose name ends in ``.pdf`` are parsed with pdfplumber. Any other
    file, or a PDF whose parsing raised, is opened as an image and passed to
    Tesseract OCR.

    Args:
        file_path: Path of the document or image to read.

    Returns:
        The extracted text with surrounding whitespace stripped, or ``""``
        when nothing could be extracted. Errors are printed, never raised.
    """
    if str(file_path).lower().endswith(".pdf"):
        try:
            with pdfplumber.open(file_path) as pdf:
                # extract_text() may yield None for a page without a text
                # layer; treat that as an empty page so one such page does not
                # abort extraction for the whole document.
                page_texts = [page.extract_text() or "" for page in pdf.pages]
            return "\n".join(page_texts).strip()
        except Exception as e:
            print(f"PDF parsing error: {e}")

    # Not a PDF, or PDF parsing failed: try to read the file as an image.
    # NOTE(review): PIL is not expected to open PDF files, so for a PDF this
    # fallback most likely just reports an OCR error — confirm.
    try:
        with Image.open(file_path) as image:
            return pytesseract.image_to_string(image).strip()
    except Exception as e:
        print(f"OCR error: {e}")
        return ""

def perform_semantic_analysis(text, analysis_type):
    """Run the requested local analysis pipeline on ``text``.

    Args:
        text: The text to analyse.
        analysis_type: One of ``"Summarization"``, ``"Sentiment Analysis"`` or
            ``"Named Entity Recognition"``. Any other value is a no-op.

    Returns:
        * Summarization: the summary string.
        * Sentiment Analysis: the first result of the sentiment pipeline.
        * Named Entity Recognition: the NER pipeline output.
        * Otherwise, or when ``text`` is empty/blank: ``text`` unchanged.
    """
    # Nothing to analyse: skip the model call instead of feeding it an empty
    # sequence.
    if not text or not text.strip():
        return text

    if analysis_type == "Summarization":
        # truncation=True keeps document-length input within the model's
        # maximum sequence length instead of failing inside the model.
        summary = summarizer(
            text,
            max_length=150,
            min_length=40,
            do_sample=False,
            truncation=True,
        )
        return summary[0]['summary_text']
    if analysis_type == "Sentiment Analysis":
        return sentiment_analyzer(text, truncation=True)[0]
    if analysis_type == "Named Entity Recognition":
        # NOTE(review): no truncation is applied here; very long inputs may
        # still exceed the NER model's limit — confirm.
        return ner_tagger(text)
    # Add more analysis types as needed
    return text

def process_text_input(message_text, history, model_choice, analysis_type):
    """Stream a chat-model reply for a text-only message.

    If an analysis type other than ``"None"`` is selected, the result of
    ``perform_semantic_analysis`` is appended to the prompt first.

    Args:
        message_text: The user's message.
        history: Chat history supplied by the caller (not used here).
        model_choice: Model identifier. The value
            ``"mistralai/Mistral-Nemo-Instruct-2411"`` is routed to the Mistral
            client; every other value goes to the Hugging Face client.
        analysis_type: Analysis to run before inference, or ``"None"``.

    Yields:
        The reply accumulated so far, growing with every streamed chunk, so
        each yielded value can replace the previously displayed one.
    """
    message_text = message_text or ""

    # Optionally perform semantic analysis before sending to the model
    if analysis_type and analysis_type != "None":
        analysis_result = perform_semantic_analysis(message_text, analysis_type)
        message_text += f"\n\n[Analysis Result]: {analysis_result}"

    input_prompt = [{"role": "user", "content": message_text}]

    if model_choice == "mistralai/Mistral-Nemo-Instruct-2411":
        stream = Mistralclient.chat.stream(
            model="mistral-large-2411",
            messages=input_prompt,
        )
        # Skip chunks that carry no choices rather than indexing into them.
        deltas = (
            chunk.data.choices[0].delta.content
            for chunk in stream
            if chunk.data.choices
        )
    else:
        stream = client.chat.completions.create(
            model=model_choice,
            messages=input_prompt,
            temperature=0.5,
            max_tokens=1024,
            top_p=0.7,
            stream=True,
        )
        deltas = (
            chunk.choices[0].delta.content
            for chunk in stream
            if chunk.choices
        )

    # Both backends yield the running text. Previously the Mistral branch
    # yielded bare deltas, so each fragment replaced the one before it.
    partial_message = ""
    for delta in deltas:
        if delta:
            partial_message += delta
            yield partial_message

def _resolve_upload_path(upload):
    """Return ``(path, is_temporary)`` for an upload, or ``(None, False)`` if unusable."""
    if isinstance(upload, dict):
        # NOTE(review): presumably some Gradio versions describe uploads as
        # dicts carrying a "path" key — confirm against the installed version.
        upload = upload.get("path")

    if isinstance(upload, (str, os.PathLike)):
        path = os.fspath(upload)
        return (path, False) if path else (None, False)

    if hasattr(upload, "save"):
        # In-memory image object: persist it to a unique temp file so parallel
        # requests cannot overwrite each other's upload. PNG is used because it
        # also accepts images with an alpha channel.
        fd, temp_path = tempfile.mkstemp(suffix=".png")
        os.close(fd)
        try:
            upload.save(temp_path)
        except Exception as e:
            print(f"Upload save error: {e}")
            os.remove(temp_path)
            return None, False
        return temp_path, True

    # File-like objects that expose the on-disk location via ``.name``.
    name = getattr(upload, "name", None)
    if isinstance(name, str) and name:
        return name, False
    return None, False


def process_image_input(image_file, message_text, image_mod, model_choice, analysis_type):
    """Stream a vision-model reply for a message with an uploaded image.

    Text is extracted from the upload with ``extract_text_from_document`` and
    appended to the prompt, optionally followed by a semantic analysis of that
    text. The image is then sent, base64-encoded, to the selected vision model.

    Args:
        image_file: The upload: a file path, a dict with a ``"path"`` key, an
            object with a ``save(path)`` method, or a file-like object with a
            ``name`` attribute.
        message_text: The user's message accompanying the upload.
        image_mod: ``"Vision"`` selects the Hugging Face Llama vision model;
            any other value selects Mistral's Pixtral model.
        model_choice: Text model selection (not used for image requests).
        analysis_type: Analysis to run on the extracted text, or ``"None"``.

    Yields:
        The reply accumulated so far, or the single message
        ``"Failed to process image."`` when the upload cannot be read.
    """
    image_path, is_temporary = _resolve_upload_path(image_file)
    if image_path is None:
        yield "Failed to process image."
        return

    try:
        extracted_text = extract_text_from_document(image_path)
        base64_image = encode_image(image_path)
    finally:
        # The file is only needed for OCR and encoding; remove our own temp
        # copy right away, but never delete a path supplied by the caller.
        if is_temporary:
            try:
                os.remove(image_path)
            except OSError as e:
                print(f"Temp file cleanup error: {e}")

    if not base64_image:
        yield "Failed to process image."
        return

    message_text = message_text or ""
    if extracted_text:
        message_text += f"\n\n[Extracted Text]: {extracted_text}"
        # Optionally perform semantic analysis on the extracted text
        if analysis_type and analysis_type != "None":
            analysis_result = perform_semantic_analysis(extracted_text, analysis_type)
            message_text += f"\n\n[Analysis Result]: {analysis_result}"

    data_url = f"data:image/jpeg;base64,{base64_image}"
    text_part = {"type": "text", "text": message_text}

    if image_mod == "Vision":
        # The Hugging Face chat-completion API documents image_url as an
        # object with a "url" field rather than a bare string.
        messages = [{
            "role": "user",
            "content": [
                text_part,
                {"type": "image_url", "image_url": {"url": data_url}},
            ],
        }]
        stream = client.chat.completions.create(
            model="meta-llama/Llama-3.2-11B-Vision-Instruct",
            messages=messages,
            max_tokens=500,
            stream=True,
        )
        # Skip chunks that carry no choices rather than indexing into them.
        deltas = (
            chunk.choices[0].delta.content
            for chunk in stream
            if chunk.choices
        )
    else:
        messages = [{
            "role": "user",
            "content": [
                text_part,
                {"type": "image_url", "image_url": data_url},
            ],
        }]
        stream = Mistralclient.chat.stream(model="pixtral-large-2411", messages=messages)
        deltas = (
            chunk.data.choices[0].delta.content
            for chunk in stream
            if chunk.data.choices
        )

    partial_message = ""
    for delta in deltas:
        if delta:
            partial_message += delta
            yield partial_message

def multimodal_response(message, history, analyzer_mode, model_choice, image_mod, analysis_type):
    """
    Chat entry point: route a multimodal message to the text or image handler.

    ``message`` is read as a mapping with optional "text" and "files" entries.
    When files are attached, only the first one is processed and the reply is
    streamed from process_image_input; otherwise the reply is streamed from
    process_text_input. ``analyzer_mode`` is accepted but not used here.
    """
    attachments = message.get("files", [])
    prompt = message.get("text", "")

    # No attachment: plain text conversation.
    if not attachments:
        yield from process_text_input(prompt, history, model_choice, analysis_type)
        return

    # Attachment present: hand the first uploaded file to the image pipeline.
    first_upload = attachments[0]
    yield from process_image_input(first_upload, prompt, image_mod, model_choice, analysis_type)

# --- Gradio UI --------------------------------------------------------------
# Choices offered by the additional inputs below. The order of the controls
# must match the extra parameters of multimodal_response:
# (analyzer_mode, model_choice, image_mod, analysis_type).
_TEXT_MODEL_CHOICES = [
    "meta-llama/Llama-3.3-70B-Instruct",
    "CohereForAI/c4ai-command-r-plus-08-2024",
    "Qwen/Qwen2.5-72B-Instruct",
    "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF",
    "NousResearch/Hermes-3-Llama-3.1-8B",
    "mistralai/Mistral-Nemo-Instruct-2411",
    "microsoft/phi-4",
]
_IMAGE_MODEL_CHOICES = ["pixtral", "Vision"]
_ANALYSIS_CHOICES = ["None", "Summarization", "Sentiment Analysis", "Named Entity Recognition"]

# NOTE(review): this checkbox value reaches multimodal_response as
# analyzer_mode, which that function does not currently use.
_analyzer_toggle = gr.Checkbox(label="Enable Analyzer Mode", value=True)
_text_model_dropdown = gr.Dropdown(
    choices=_TEXT_MODEL_CHOICES,
    value="mistralai/Mistral-Nemo-Instruct-2411",
    show_label=False,
    container=False,
)
_image_model_radio = gr.Radio(
    choices=_IMAGE_MODEL_CHOICES,
    value="pixtral",
    show_label=False,
    container=False,
)
_analysis_dropdown = gr.Dropdown(
    choices=_ANALYSIS_CHOICES,
    value="None",
    label="Select Analysis Type",
    container=False,
)

# Multimodal chat interface wired to the response generator defined above.
MultiModalAnalyzer = gr.ChatInterface(
    fn=multimodal_response,
    type="messages",
    multimodal=True,
    additional_inputs=[
        _analyzer_toggle,
        _text_model_dropdown,
        _image_model_radio,
        _analysis_dropdown,
    ],
    title="MultiModal Analyzer",
    description="Upload documents or images, select a model and analysis type to interact with your content.",
)

# Start the web server when the module is executed.
MultiModalAnalyzer.launch()