import os
import re
import base64
import gradio as gr
import pdfplumber # For text-based PDF parsing
import fitz # PyMuPDF, used here to render image-based PDF pages for OCR
import pytesseract # OCR for extracting text from images
from PIL import Image
from io import BytesIO
from transformers import pipeline # For semantic analysis tasks
from huggingface_hub import InferenceClient
from mistralai import Mistral
# Initialize inference clients for different models
client = InferenceClient(api_key=os.getenv('HF_TOKEN'))
client.headers["x-use-cache"] = "0"
api_key = os.getenv("MISTRAL_API_KEY")
Mistralclient = Mistral(api_key=api_key)
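
# NOTE: both clients accept a missing key here and only fail later at request
# time. Failing fast at startup is one option (a suggestion, not part of the
# original app):
#   if not os.getenv("HF_TOKEN") or not api_key:
#       raise RuntimeError("HF_TOKEN and MISTRAL_API_KEY must be set.")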
# Initialize semantic analysis pipelines using transformers (for local tasks)
# Example: summarization, sentiment-analysis, named-entity-recognition, etc.
summarizer = pipeline("summarization")
sentiment_analyzer = pipeline("sentiment-analysis")
ner_tagger = pipeline("ner")
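
# NOTE: with no model argument, each pipeline downloads its default checkpoint
# at startup. Pinning explicit checkpoints keeps results reproducible; one
# possible configuration (these model IDs are suggestions, not requirements):
#   summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
#   sentiment_analyzer = pipeline("sentiment-analysis",
#                                 model="distilbert-base-uncased-finetuned-sst-2-english")
#   ner_tagger = pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple")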

def encode_image(image_path):
    """Resizes and encodes an image to base64."""
    try:
        image = Image.open(image_path).convert("RGB")
        base_height = 512
        h_percent = base_height / float(image.size[1])
        w_size = int(float(image.size[0]) * h_percent)
        image = image.resize((w_size, base_height), Image.LANCZOS)
        buffered = BytesIO()
        image.save(buffered, format="JPEG")
        return base64.b64encode(buffered.getvalue()).decode("utf-8")
    except Exception as e:
        print(f"Image encoding error: {e}")
        return None

def extract_text_from_document(file_path):
    """Extracts text from a PDF (text layer first, then OCR fallback) or an image file."""
    text = ""
    if file_path.lower().endswith(".pdf"):
        # Try the embedded text layer first with pdfplumber.
        try:
            with pdfplumber.open(file_path) as pdf:
                for page in pdf.pages:
                    # extract_text() returns None for pages without a text layer.
                    text += (page.extract_text() or "") + "\n"
            if text.strip():
                return text.strip()
        except Exception as e:
            print(f"PDF parsing error: {e}")
        # Image-based PDF: render the first page with PyMuPDF and OCR it.
        # (PIL cannot open PDFs directly.)
        try:
            with fitz.open(file_path) as doc:
                pix = doc[0].get_pixmap()
                image = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
                return pytesseract.image_to_string(image).strip()
        except Exception as e:
            print(f"PDF OCR error: {e}")
            return ""
    # Plain image files go straight to OCR.
    try:
        image = Image.open(file_path)
        text = pytesseract.image_to_string(image)
    except Exception as e:
        print(f"OCR error: {e}")
    return text.strip()

def perform_semantic_analysis(text, analysis_type):
    """Applies the selected semantic analysis task to the provided text."""
    if analysis_type == "Summarization":
        return summarizer(text, max_length=150, min_length=40, do_sample=False)[0]['summary_text']
    elif analysis_type == "Sentiment Analysis":
        return sentiment_analyzer(text)[0]
    elif analysis_type == "Named Entity Recognition":
        return ner_tagger(text)
    # Add more analysis types as needed.
    return text
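
# NOTE: the summarizer has a fixed input window (roughly 1024 tokens for the
# default checkpoint), so long documents should be chunked before
# summarization. A minimal sketch, assuming fixed-size character chunks are
# acceptable:
#   def summarize_long(text, chunk_chars=3000):
#       chunks = [text[i:i + chunk_chars] for i in range(0, len(text), chunk_chars)]
#       partials = [summarizer(c, max_length=150, min_length=40, do_sample=False)[0]['summary_text']
#                   for c in chunks]
#       return " ".join(partials)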

def process_text_input(message_text, history, model_choice, analysis_type):
    """
    Processes text-based input with the selected model, applying semantic
    analysis first if requested.
    """
    # Optionally run semantic analysis before sending the text to the model.
    if analysis_type and analysis_type != "None":
        analysis_result = perform_semantic_analysis(message_text, analysis_type)
        # Incorporate the analysis result into the prompt.
        message_text += f"\n\n[Analysis Result]: {analysis_result}"
    # Construct a prompt for model inference.
    input_prompt = [{"role": "user", "content": message_text}]
    if model_choice == "mistralai/Mistral-Nemo-Instruct-2411":
        model = "mistral-large-2411"
        partial_message = ""
        for chunk in Mistralclient.chat.stream(model=model, messages=input_prompt):
            if chunk.data.choices[0].delta.content:
                # Accumulate deltas: Gradio replaces the displayed message on each yield.
                partial_message += chunk.data.choices[0].delta.content
                yield partial_message
    else:
        stream = client.chat.completions.create(
            model=model_choice,
            messages=input_prompt,
            temperature=0.5,
            max_tokens=1024,
            top_p=0.7,
            stream=True
        )
        partial_message = ""
        for chunk in stream:
            if chunk.choices[0].delta.content:
                partial_message += chunk.choices[0].delta.content
                yield partial_message

def process_image_input(image_file, message_text, image_mod, model_choice, analysis_type):
    """
    Processes an uploaded image or document with the selected model and mode.
    Applies OCR and semantic analysis where appropriate.
    """
    # Gradio's multimodal ChatInterface passes uploaded files as paths;
    # accept either a path string or a PIL image.
    if isinstance(image_file, str):
        temp_image_path = image_file
    else:
        temp_image_path = "temp_upload.jpg"
        image_file.save(temp_image_path)
    # Extract text from the document/image via OCR if possible.
    extracted_text = extract_text_from_document(temp_image_path)
    if extracted_text:
        message_text += f"\n\n[Extracted Text]: {extracted_text}"
        # Optionally run semantic analysis on the extracted text.
        if analysis_type and analysis_type != "None":
            analysis_result = perform_semantic_analysis(extracted_text, analysis_type)
            message_text += f"\n\n[Analysis Result]: {analysis_result}"
    base64_image = encode_image(temp_image_path)
    if not base64_image:
        yield "Failed to process image."
        return
    messages = [{
        "role": "user",
        "content": [
            {"type": "text", "text": message_text},
            # OpenAI-style image payload; the Mistral SDK also accepts this shape.
            {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
        ]
    }]
    if image_mod == "Vision":
        stream = client.chat.completions.create(
            model="meta-llama/Llama-3.2-11B-Vision-Instruct",
            messages=messages,
            max_tokens=500,
            stream=True
        )
        partial_message = ""
        for chunk in stream:
            if chunk.choices[0].delta.content:
                partial_message += chunk.choices[0].delta.content
                yield partial_message
    else:
        model = "pixtral-large-2411"
        partial_message = ""
        for chunk in Mistralclient.chat.stream(model=model, messages=messages):
            if chunk.data.choices[0].delta.content:
                partial_message += chunk.data.choices[0].delta.content
                yield partial_message

def multimodal_response(message, history, analyzer_mode, model_choice, image_mod, analysis_type):
    """
    Main response function: routes text and image inputs through parsing, OCR,
    and semantic analysis. (analyzer_mode mirrors the "Enable Analyzer Mode"
    checkbox and is currently unused.)
    """
    message_text = message.get("text", "")
    message_files = message.get("files", [])
    if message_files:
        # An image or document was uploaded; process the first file.
        image_file = message_files[0]
        yield from process_image_input(image_file, message_text, image_mod, model_choice, analysis_type)
    else:
        # Plain text input.
        yield from process_text_input(message_text, history, model_choice, analysis_type)

# Set up the Gradio interface with additional user customization options.
MultiModalAnalyzer = gr.ChatInterface(
    fn=multimodal_response,
    type="messages",
    multimodal=True,
    additional_inputs=[
        gr.Checkbox(label="Enable Analyzer Mode", value=True),
        gr.Dropdown(
            choices=[
                "meta-llama/Llama-3.3-70B-Instruct",
                "CohereForAI/c4ai-command-r-plus-08-2024",
                "Qwen/Qwen2.5-72B-Instruct",
                "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF",
                "NousResearch/Hermes-3-Llama-3.1-8B",
                "mistralai/Mistral-Nemo-Instruct-2411",
                "microsoft/phi-4"
            ],
            value="mistralai/Mistral-Nemo-Instruct-2411",
            show_label=False,
            container=False
        ),
        gr.Radio(
            choices=["pixtral", "Vision"],
            value="pixtral",
            show_label=False,
            container=False
        ),
        gr.Dropdown(
            choices=["None", "Summarization", "Sentiment Analysis", "Named Entity Recognition"],
            value="None",
            label="Select Analysis Type",
            container=False
        )
    ],
    title="MultiModal Analyzer",
    description="Upload documents or images, select a model and analysis type to interact with your content."
)
MultiModalAnalyzer.launch()
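# For local testing outside a Hugging Face Space, launch(share=True) creates a
# temporary public URL; this is a standard Gradio option, not specific to this app.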