|
import os
|
|
import mimetypes
|
|
import PyPDF2
|
|
import docx
|
|
import cv2
|
|
import numpy as np
|
|
from PIL import Image
|
|
import pytesseract
|
|
|
|
def process_image_for_model(image):
    """Encode an image as a PNG ``data:`` URL for model input.

    Args:
        image: ``None``, a NumPy array (wrapped with ``Image.fromarray``),
            or an object exposing a PIL-style ``save(fp, format=...)``
            method.

    Returns:
        ``None`` when ``image`` is ``None``; otherwise a string of the form
        ``data:image/png;base64,<payload>``.
    """
    if image is None:
        return None

    # Function-scope imports: only needed once there is an image to encode.
    import io
    import base64

    # Arrays are wrapped in a PIL image; anything else is saved as given.
    pil_image = Image.fromarray(image) if isinstance(image, np.ndarray) else image

    png_stream = io.BytesIO()
    pil_image.save(png_stream, format='PNG')
    payload = base64.b64encode(png_stream.getvalue()).decode()
    return "data:image/png;base64," + payload
|
|
|
|
def extract_text_from_image(image_path):
    """Extract text from an image file using Tesseract OCR.

    The image is reduced to grayscale, binarized with Otsu's threshold and
    handed to Tesseract with ``--psm 6``.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The recognized text with surrounding whitespace stripped;
        ``"No text found in image"`` when OCR yields only whitespace; or a
        string starting with ``"Error"`` describing the failure. Exceptions
        raised during processing are caught and reported in the return
        value instead of propagating.
    """
    try:
        # Probe for Tesseract first so a missing install produces an
        # actionable message rather than a generic OCR failure.
        try:
            pytesseract.get_tesseract_version()
        except Exception:
            return "Error: Tesseract OCR is not installed. Please install Tesseract to extract text from images. See install_tesseract.md for instructions."

        image = cv2.imread(image_path)
        if image is not None:
            # imread yields BGR data; convert straight to grayscale instead
            # of taking a detour through RGB (same result, one pass less).
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        else:
            # imread signals failure by returning None. Some OpenCV builds
            # cannot decode every format callers send here (e.g. GIF), so
            # give Pillow a chance before reporting the file as unreadable.
            try:
                with Image.open(image_path) as pil_image:
                    gray = np.array(pil_image.convert("L"))
            except (OSError, ValueError):
                return "Error: Could not read image file"

        _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        text = pytesseract.image_to_string(binary, config='--psm 6').strip()
        return text or "No text found in image"

    except Exception as e:
        return f"Error extracting text from image: {e}"
|
|
|
|
def extract_text_from_file(file_path):
    """Return the text content of a file, chosen by its extension.

    Supported extensions (matched case-insensitively):

    * ``.pdf`` - text of every page, joined with newlines.
    * ``.txt`` / ``.md`` / ``.csv`` - file contents decoded as UTF-8; a
      leading byte-order mark, if present, is dropped.
    * ``.docx`` - paragraph texts, joined with newlines.
    * common image extensions - delegated to ``extract_text_from_image``.

    Args:
        file_path: Path of the file to read. A falsy value (``None``,
            ``""``) yields an empty string.

    Returns:
        The extracted text; ``""`` for a falsy path or an unsupported
        extension; or ``"Error extracting text: <details>"`` when reading
        or parsing raises an exception.
    """
    if not file_path:
        return ""

    # splitext keeps the leading dot; lowercase once so every comparison
    # below is case-insensitive.
    ext = os.path.splitext(file_path)[1].lower()
    try:
        if ext == ".pdf":
            with open(file_path, "rb") as f:
                reader = PyPDF2.PdfReader(f)
                # Join while the file is still open; pages without
                # extractable text contribute an empty string.
                return "\n".join(page.extract_text() or "" for page in reader.pages)

        if ext in (".txt", ".md", ".csv"):
            # utf-8-sig reads plain UTF-8 and also strips a leading BOM,
            # which would otherwise surface as "\ufeff" in the result.
            with open(file_path, "r", encoding="utf-8-sig") as f:
                return f.read()

        if ext == ".docx":
            doc = docx.Document(file_path)
            return "\n".join(para.text for para in doc.paragraphs)

        if ext in (".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".gif", ".webp"):
            return extract_text_from_image(file_path)

        return ""
    except Exception as e:
        return f"Error extracting text: {e}"
|
|
|
|
def create_multimodal_message(text, image=None):
    """Build a user-role chat message from text and an optional image.

    Args:
        text: The message text.
        image: Optional image; when given it is passed to
            ``process_image_for_model`` to obtain the URL that is embedded.

    Returns:
        A dict with ``"role": "user"``. Without an image, ``"content"`` is
        the text itself. With an image, ``"content"`` is a two-item list: a
        ``"text"`` part followed by an ``"image_url"`` part.
    """
    message = {"role": "user", "content": text}
    if image is not None:
        text_part = {"type": "text", "text": text}
        image_part = {
            "type": "image_url",
            "image_url": {"url": process_image_for_model(image)},
        }
        message["content"] = [text_part, image_part]
    return message