import os
import base64
from io import BytesIO

import gradio as gr
import pdfplumber  # For PDF document parsing
import pytesseract  # OCR for extracting text from images
from PIL import Image
from transformers import pipeline  # For local semantic analysis tasks
from huggingface_hub import InferenceClient
from mistralai import Mistral

# Initialize inference clients for the different model providers
client = InferenceClient(api_key=os.getenv("HF_TOKEN"))
client.headers["x-use-cache"] = "0"
api_key = os.getenv("MISTRAL_API_KEY")
Mistralclient = Mistral(api_key=api_key)

# Initialize semantic analysis pipelines using transformers (run locally):
# summarization, sentiment analysis, and named-entity recognition.
summarizer = pipeline("summarization")
sentiment_analyzer = pipeline("sentiment-analysis")
ner_tagger = pipeline("ner")


def encode_image(image_path):
    """Resizes an image to a fixed height and encodes it as base64 JPEG."""
    try:
        image = Image.open(image_path).convert("RGB")
        base_height = 512
        h_percent = base_height / float(image.size[1])
        w_size = int(float(image.size[0]) * h_percent)
        image = image.resize((w_size, base_height), Image.LANCZOS)
        buffered = BytesIO()
        image.save(buffered, format="JPEG")
        return base64.b64encode(buffered.getvalue()).decode("utf-8")
    except Exception as e:
        print(f"Image encoding error: {e}")
        return None


def extract_text_from_document(file_path):
    """Extracts text from a PDF (via pdfplumber) or from an image (via OCR)."""
    text = ""
    # Try PDF parsing with pdfplumber first
    if file_path.lower().endswith(".pdf"):
        try:
            with pdfplumber.open(file_path) as pdf:
                for page in pdf.pages:
                    page_text = page.extract_text()
                    if page_text:  # extract_text() returns None for image-only pages
                        text += page_text + "\n"
            return text.strip()
        except Exception as e:
            print(f"PDF parsing error: {e}")
    # If the file is not a PDF (or parsing failed), fall back to OCR on the image.
    try:
        image = Image.open(file_path)
        text = pytesseract.image_to_string(image)
    except Exception as e:
        print(f"OCR error: {e}")
    return text.strip()


def perform_semantic_analysis(text, analysis_type):
    """Applies the selected semantic analysis task to the provided text."""
    if analysis_type == "Summarization":
        return summarizer(text, max_length=150, min_length=40, do_sample=False)[0]["summary_text"]
    elif analysis_type == "Sentiment Analysis":
        return sentiment_analyzer(text)[0]
    elif analysis_type == "Named Entity Recognition":
        return ner_tagger(text)
    # Add more analysis types as needed
    return text


def process_text_input(message_text, history, model_choice, analysis_type):
    """
    Process text input with the selected model, applying semantic analysis
    first if requested.
""" # Optionally perform semantic analysis before sending to the model if analysis_type and analysis_type != "None": analysis_result = perform_semantic_analysis(message_text, analysis_type) # Incorporate analysis_result into prompt or display separately message_text += f"\n\n[Analysis Result]: {analysis_result}" # Construct a prompt for model inference input_prompt = [{"role": "user", "content": message_text}] if model_choice == "mistralai/Mistral-Nemo-Instruct-2411": model = "mistral-large-2411" stream_response = Mistralclient.chat.stream(model=model, messages=input_prompt) for chunk in stream_response: if chunk.data.choices[0].delta.content: yield chunk.data.choices[0].delta.content else: stream = client.chat.completions.create( model=model_choice, messages=input_prompt, temperature=0.5, max_tokens=1024, top_p=0.7, stream=True ) temp = "" for chunk in stream: if chunk.choices[0].delta.content: temp += chunk.choices[0].delta.content yield temp def process_image_input(image_file, message_text, image_mod, model_choice, analysis_type): """ Process image-based inputs using selected model and mode. Applies OCR if needed and semantic analysis. """ # Save uploaded image temporarily to extract text if necessary temp_image_path = "temp_upload.jpg" image_file.save(temp_image_path) # Extract text from document/image using OCR if needed extracted_text = extract_text_from_document(temp_image_path) if extracted_text: message_text += f"\n\n[Extracted Text]: {extracted_text}" # Optionally perform semantic analysis on the extracted text if analysis_type and analysis_type != "None": analysis_result = perform_semantic_analysis(extracted_text, analysis_type) message_text += f"\n\n[Analysis Result]: {analysis_result}" base64_image = encode_image(temp_image_path) if not base64_image: yield "Failed to process image." return messages = [{ "role": "user", "content": [ {"type": "text", "text": message_text}, {"type": "image_url", "image_url": f"data:image/jpeg;base64,{base64_image}"} ] }] if image_mod == "Vision": stream = client.chat.completions.create( model="meta-llama/Llama-3.2-11B-Vision-Instruct", messages=messages, max_tokens=500, stream=True ) temp = "" for chunk in stream: if chunk.choices[0].delta.content: temp += chunk.choices[0].delta.content yield temp else: model = "pixtral-large-2411" partial_message = "" for chunk in Mistralclient.chat.stream(model=model, messages=messages): if chunk.data.choices[0].delta.content: partial_message += chunk.data.choices[0].delta.content yield partial_message def multimodal_response(message, history, analyzer_mode, model_choice, image_mod, analysis_type): """ Main response function that handles text and image inputs, applies parsing, OCR, and semantic analysis. 
""" message_text = message.get("text", "") message_files = message.get("files", []) if message_files: # If an image/document is uploaded, process it image_file = message_files[0] yield from process_image_input(image_file, message_text, image_mod, model_choice, analysis_type) else: # Process plain text inputs yield from process_text_input(message_text, history, model_choice, analysis_type) # Set up the Gradio interface with additional user customization options MultiModalAnalyzer = gr.ChatInterface( fn=multimodal_response, type="messages", multimodal=True, additional_inputs=[ gr.Checkbox(label="Enable Analyzer Mode", value=True), gr.Dropdown( choices=[ "meta-llama/Llama-3.3-70B-Instruct", "CohereForAI/c4ai-command-r-plus-08-2024", "Qwen/Qwen2.5-72B-Instruct", "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF", "NousResearch/Hermes-3-Llama-3.1-8B", "mistralai/Mistral-Nemo-Instruct-2411", "microsoft/phi-4" ], value="mistralai/Mistral-Nemo-Instruct-2411", show_label=False, container=False ), gr.Radio( choices=["pixtral", "Vision"], value="pixtral", show_label=False, container=False ), gr.Dropdown( choices=["None", "Summarization", "Sentiment Analysis", "Named Entity Recognition"], value="None", label="Select Analysis Type", container=False ) ], title="MultiModal Analyzer", description="Upload documents or images, select a model and analysis type to interact with your content." ) MultiModalAnalyzer.launch()