import base64
import io
import mimetypes
import os

import cv2
import docx
import numpy as np
import PyPDF2
import pytesseract
from PIL import Image

# Extensions read verbatim as UTF-8 text.
_TEXT_EXTENSIONS = (".txt", ".md", ".csv")
# Extensions routed through OCR.
_IMAGE_EXTENSIONS = (
    ".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".gif", ".webp",
)


def process_image_for_model(image):
    """Encode an image as a PNG ``data:`` URL for model input.

    Args:
        image: A PIL image, a numpy array (converted with
            ``Image.fromarray``), or ``None``.

    Returns:
        A ``data:image/png;base64,...`` string, or ``None`` when *image*
        is ``None``.
    """
    if image is None:
        return None

    # Arrays are wrapped in a PIL image so they can be saved as PNG.
    if isinstance(image, np.ndarray):
        image = Image.fromarray(image)

    buffer = io.BytesIO()
    image.save(buffer, format='PNG')
    encoded = base64.b64encode(buffer.getvalue()).decode()
    return f"data:image/png;base64,{encoded}"


def _load_grayscale(image_path):
    """Load *image_path* as a single-channel grayscale array, or ``None``.

    OpenCV is tried first. When ``cv2.imread`` returns ``None`` (it does so
    for files it cannot decode), Pillow is used as a fallback so that
    formats accepted by ``extract_text_from_file`` but not by the installed
    OpenCV build can still be read.
    """
    image = cv2.imread(image_path)
    if image is not None:
        # imread's default flag yields BGR, so convert straight to gray.
        return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    try:
        with Image.open(image_path) as pil_image:
            # np.array (not asarray) gives a writable copy that outlives
            # the closed file handle.
            return np.array(pil_image.convert("L"))
    except (OSError, ValueError):
        return None


def extract_text_from_image(image_path):
    """Extract text from an image file using Tesseract OCR.

    The image is loaded as grayscale, binarised with Otsu's threshold and
    passed to Tesseract with ``--psm 6``.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The recognised text with surrounding whitespace stripped,
        ``"No text found in image"`` when OCR yields nothing, or a string
        starting with ``"Error"`` describing the failure. Exceptions
        derived from ``Exception`` are caught and reported through the
        return value.
    """
    try:
        # Fail early with an actionable message when Tesseract is missing.
        try:
            pytesseract.get_tesseract_version()
        except Exception:
            return "Error: Tesseract OCR is not installed. Please install Tesseract to extract text from images. See install_tesseract.md for instructions."

        gray = _load_grayscale(image_path)
        if gray is None:
            return "Error: Could not read image file"

        _, binary = cv2.threshold(
            gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
        )
        text = pytesseract.image_to_string(binary, config='--psm 6').strip()
        return text or "No text found in image"
    except Exception as e:
        return f"Error extracting text from image: {e}"


def extract_text_from_file(file_path):
    """Extract text from a file, dispatching on its extension.

    PDF files are read with PyPDF2, ``.txt``/``.md``/``.csv`` files are
    read as UTF-8, ``.docx`` files are read paragraph by paragraph, and
    image files are passed to ``extract_text_from_image``.

    Args:
        file_path: Path of the file to read. A falsy value yields ``""``.

    Returns:
        The extracted text, ``""`` for a falsy path or an unrecognised
        extension, or ``"Error extracting text: ..."`` when reading fails.
    """
    if not file_path:
        return ""

    ext = os.path.splitext(file_path)[1].lower()
    try:
        if ext == ".pdf":
            with open(file_path, "rb") as f:
                reader = PyPDF2.PdfReader(f)
                # extract_text() may return None; treat that as empty.
                return "\n".join(
                    page.extract_text() or "" for page in reader.pages
                )
        if ext in _TEXT_EXTENSIONS:
            with open(file_path, "r", encoding="utf-8") as f:
                return f.read()
        if ext == ".docx":
            document = docx.Document(file_path)
            return "\n".join(para.text for para in document.paragraphs)
        if ext in _IMAGE_EXTENSIONS:
            return extract_text_from_image(file_path)
        return ""
    except Exception as e:
        return f"Error extracting text: {e}"


def create_multimodal_message(text, image=None):
    """Create a user message with text and an optional image.

    Args:
        text: The message text.
        image: Optional image accepted by ``process_image_for_model``.

    Returns:
        A dict with ``"role": "user"``. Without an image, ``"content"`` is
        *text* itself; with an image it is a list holding a ``"text"`` part
        and an ``"image_url"`` part carrying the encoded image.
    """
    if image is None:
        return {"role": "user", "content": text}

    content = [
        {"type": "text", "text": text},
        {
            "type": "image_url",
            "image_url": {"url": process_image_for_model(image)},
        },
    ]
    return {"role": "user", "content": content}