import os
import base64
from io import BytesIO
from functools import lru_cache

import gradio as gr
import pdfplumber  # For PDF document parsing
import pytesseract  # OCR for extracting text from images
from PIL import Image
from huggingface_hub import InferenceClient
from mistralai import Mistral

# Initialize clients that don't require heavy model loading
client = InferenceClient(api_key=os.getenv("HF_TOKEN"))
client.headers["x-use-cache"] = "0"

api_key = os.getenv("MISTRAL_API_KEY")
Mistralclient = Mistral(api_key=api_key)


### Lazy Loading and Caching for Transformers Pipelines ###

@lru_cache(maxsize=1)
def get_summarizer():
    from transformers import pipeline
    # Use a smaller distilled model for faster loading
    return pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")


@lru_cache(maxsize=1)
def get_sentiment_analyzer():
    from transformers import pipeline
    return pipeline("sentiment-analysis")


@lru_cache(maxsize=1)
def get_ner_tagger():
    from transformers import pipeline
    return pipeline("ner")


### Helper Functions ###

def encode_image(image_path):
    """Resizes an image to 512px height and encodes it to base64."""
    try:
        image = Image.open(image_path).convert("RGB")
        base_height = 512
        h_percent = base_height / float(image.size[1])
        w_size = int(float(image.size[0]) * h_percent)
        image = image.resize((w_size, base_height), Image.LANCZOS)
        buffered = BytesIO()
        image.save(buffered, format="JPEG")
        return base64.b64encode(buffered.getvalue()).decode("utf-8")
    except Exception as e:
        print(f"Image encoding error: {e}")
        return None


def extract_text_from_document(file_path):
    """Extracts text from a PDF or image document using pdfplumber and OCR."""
    text = ""
    if file_path.lower().endswith(".pdf"):
        try:
            with pdfplumber.open(file_path) as pdf:
                for page in pdf.pages:
                    page_text = page.extract_text()
                    if page_text:
                        text += page_text + "\n"
            if text.strip():
                return text.strip()
        except Exception as e:
            print(f"PDF parsing error: {e}")
    # Fall back to OCR for non-PDF files, or when PDF parsing yields no text
    try:
        image = Image.open(file_path)
        text = pytesseract.image_to_string(image)
    except Exception as e:
        print(f"OCR error: {e}")
    return text.strip()


def perform_semantic_analysis(text, analysis_type):
    """Applies the selected semantic analysis task to the text using cached pipelines."""
    if analysis_type == "Summarization":
        summarizer = get_summarizer()
        return summarizer(text, max_length=150, min_length=40, do_sample=False)[0]["summary_text"]
    elif analysis_type == "Sentiment Analysis":
        sentiment_analyzer = get_sentiment_analyzer()
        return sentiment_analyzer(text)[0]
    elif analysis_type == "Named Entity Recognition":
        ner_tagger = get_ner_tagger()
        return ner_tagger(text)
    return text


def process_text_input(message_text, history, model_choice, analysis_type):
    """Processes text-only inputs with the selected model and optional semantic analysis."""
    if analysis_type and analysis_type != "None":
        analysis_result = perform_semantic_analysis(message_text, analysis_type)
        message_text += f"\n\n[Analysis Result]: {analysis_result}"

    input_prompt = [{"role": "user", "content": message_text}]

    if model_choice == "mistralai/Mistral-Nemo-Instruct-2411":
        # Route this choice to the Mistral API instead of the HF endpoint
        model = "mistral-large-2411"
        stream_response = Mistralclient.chat.stream(model=model, messages=input_prompt)
        partial_message = ""
        for chunk in stream_response:
            if chunk.data.choices[0].delta.content:
                # Accumulate deltas so Gradio always renders the full message so far
                partial_message += chunk.data.choices[0].delta.content
                yield partial_message
    else:
        stream = client.chat.completions.create(
            model=model_choice,
            messages=input_prompt,
            temperature=0.5,
            max_tokens=1024,
            top_p=0.7,
            stream=True
        )
        temp = ""
        for chunk in stream:
            if chunk.choices[0].delta.content:
                temp += chunk.choices[0].delta.content
                yield temp
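
### Example (illustrative only, not part of the app flow) ###
# A minimal usage sketch of the helpers above, assuming the transformers models
# can be downloaded and an interactive session; the sample input and the shown
# output are hypothetical.
#
#   >>> perform_semantic_analysis("I love this product!", "Sentiment Analysis")
#   {'label': 'POSITIVE', 'score': ...}
#
#   >>> for partial in process_text_input("Hi", [], "microsoft/phi-4", "None"):
#   ...     print(partial)  # each yield is the cumulative response so far
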
def process_image_input(image_file, message_text, image_mod, model_choice, analysis_type):
    """Processes image inputs: OCR extraction, optional semantic analysis, then a vision model."""
    # Gradio provides uploaded files as paths; normalize to a temporary JPEG copy
    temp_image_path = "temp_upload.jpg"
    Image.open(image_file).convert("RGB").save(temp_image_path)

    extracted_text = extract_text_from_document(temp_image_path)
    if extracted_text:
        message_text += f"\n\n[Extracted Text]: {extracted_text}"
        if analysis_type and analysis_type != "None":
            analysis_result = perform_semantic_analysis(extracted_text, analysis_type)
            message_text += f"\n\n[Analysis Result]: {analysis_result}"

    base64_image = encode_image(temp_image_path)
    if not base64_image:
        yield "Failed to process image."
        return

    image_url = f"data:image/jpeg;base64,{base64_image}"

    if image_mod == "Vision":
        # The OpenAI-compatible HF endpoint expects the nested image_url object
        messages = [{
            "role": "user",
            "content": [
                {"type": "text", "text": message_text},
                {"type": "image_url", "image_url": {"url": image_url}}
            ]
        }]
        stream = client.chat.completions.create(
            model="meta-llama/Llama-3.2-11B-Vision-Instruct",
            messages=messages,
            max_tokens=500,
            stream=True
        )
        temp = ""
        for chunk in stream:
            if chunk.choices[0].delta.content:
                temp += chunk.choices[0].delta.content
                yield temp
    else:
        # The Mistral API accepts the image data URL directly as a string
        messages = [{
            "role": "user",
            "content": [
                {"type": "text", "text": message_text},
                {"type": "image_url", "image_url": image_url}
            ]
        }]
        model = "pixtral-large-2411"
        partial_message = ""
        for chunk in Mistralclient.chat.stream(model=model, messages=messages):
            if chunk.data.choices[0].delta.content:
                partial_message += chunk.data.choices[0].delta.content
                yield partial_message


def multimodal_response(message, history, analyzer_mode, model_choice, image_mod, analysis_type):
    """Main response function routing text and image inputs to the right handler."""
    # analyzer_mode (the "Enable Analyzer Mode" checkbox) is currently unused
    message_text = message.get("text", "")
    message_files = message.get("files", [])

    if message_files:
        image_file = message_files[0]
        yield from process_image_input(image_file, message_text, image_mod, model_choice, analysis_type)
    else:
        yield from process_text_input(message_text, history, model_choice, analysis_type)


# Set up the Gradio interface with user customization options
MultiModalAnalyzer = gr.ChatInterface(
    fn=multimodal_response,
    type="messages",
    multimodal=True,
    additional_inputs=[
        gr.Checkbox(label="Enable Analyzer Mode", value=True),
        gr.Dropdown(
            choices=[
                "meta-llama/Llama-3.3-70B-Instruct",
                "CohereForAI/c4ai-command-r-plus-08-2024",
                "Qwen/Qwen2.5-72B-Instruct",
                "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF",
                "NousResearch/Hermes-3-Llama-3.1-8B",
                "mistralai/Mistral-Nemo-Instruct-2411",
                "microsoft/phi-4"
            ],
            value="mistralai/Mistral-Nemo-Instruct-2411",
            show_label=False,
            container=False
        ),
        gr.Radio(
            choices=["pixtral", "Vision"],
            value="pixtral",
            show_label=False,
            container=False
        ),
        gr.Dropdown(
            choices=["None", "Summarization", "Sentiment Analysis", "Named Entity Recognition"],
            value="None",
            label="Select Analysis Type",
            container=False
        )
    ],
    title="MultiModal Analyzer",
    description="Upload documents or images, then select a model and analysis type to interact with your content."
)

MultiModalAnalyzer.launch()
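
# Deployment note (a sketch, not required by the app): Gradio's request queue
# helps with concurrent streaming users, and share=True exposes a temporary
# public URL. The values below are illustrative:
#
#   MultiModalAnalyzer.queue(max_size=20).launch(share=True)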