# builder / file_processing.py
# mgbam's picture
# Upload 6 files
# 51d8a3f verified
import os
import mimetypes
import PyPDF2
import docx
import cv2
import numpy as np
from PIL import Image
import pytesseract
def process_image_for_model(image):
    """Encode an image as a base64 PNG data URL for model input.

    Args:
        image: A PIL image, a numpy array (e.g. as delivered by Gradio),
            or ``None``.

    Returns:
        A ``data:image/png;base64,...`` string, or ``None`` when *image*
        is ``None``.
    """
    if image is None:
        return None

    # Imported locally because the module's top-level imports do not
    # provide these two stdlib modules.
    import base64
    import io

    # Numpy arrays must become PIL images before they can be encoded.
    if isinstance(image, np.ndarray):
        image = Image.fromarray(image)

    # The PNG encoder cannot write these colour modes (CMYK is common for
    # JPEG uploads); convert them to RGB instead of failing in save().
    if getattr(image, "mode", None) in ("CMYK", "YCbCr", "HSV"):
        image = image.convert("RGB")

    buffer = io.BytesIO()
    image.save(buffer, format='PNG')
    encoded = base64.b64encode(buffer.getvalue()).decode()
    return f"data:image/png;base64,{encoded}"
def _load_grayscale_image(image_path):
    """Return the image at *image_path* as a grayscale array, or None if unreadable."""
    image = cv2.imread(image_path)
    if image is not None:
        # imread yields BGR data, so one conversion straight to gray suffices.
        return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # cv2.imread returns None for files it cannot decode (GIF on many
    # builds, for example); fall back to PIL before giving up.
    try:
        with Image.open(image_path) as pil_image:
            return np.array(pil_image.convert("L"))
    except (OSError, ValueError):
        return None


def extract_text_from_image(image_path):
    """Extract text from an image file using Tesseract OCR.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The recognised text with surrounding whitespace removed,
        "No text found in image" when OCR yields nothing, or a string
        starting with "Error" when Tesseract is unavailable, the image
        cannot be read, or processing fails.
    """
    try:
        # Check if tesseract is available before doing any image work.
        try:
            pytesseract.get_tesseract_version()
        except Exception:
            return "Error: Tesseract OCR is not installed. Please install Tesseract to extract text from images. See install_tesseract.md for instructions."

        gray = _load_grayscale_image(image_path)
        if gray is None:
            return "Error: Could not read image file"

        # Otsu's method picks the binarisation threshold automatically.
        _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        text = pytesseract.image_to_string(binary, config='--psm 6').strip()
        return text if text else "No text found in image"
    except Exception as e:
        return f"Error extracting text from image: {e}"
def extract_text_from_file(file_path):
    """Extract plain text from a file, choosing the reader by file extension.

    Handles PDF, plain text (.txt/.md/.csv), .docx and common image
    formats (the latter via OCR in ``extract_text_from_image``).

    Args:
        file_path: Path of the file to read; falsy values are accepted.

    Returns:
        The extracted text, an empty string when *file_path* is falsy or
        the extension is not handled, or a string starting with
        "Error extracting text:" when reading fails.
    """
    if not file_path:
        return ""

    ext = os.path.splitext(file_path)[1].lower()
    try:
        if ext == ".pdf":
            with open(file_path, "rb") as f:
                reader = PyPDF2.PdfReader(f)
                # Join while the file is still open: pages are read from it.
                return "\n".join(page.extract_text() or "" for page in reader.pages)
        if ext in (".txt", ".md", ".csv"):
            # utf-8-sig reads plain UTF-8 unchanged but drops a leading
            # byte-order mark, which would otherwise appear as "\ufeff".
            with open(file_path, "r", encoding="utf-8-sig") as f:
                return f.read()
        if ext == ".docx":
            doc = docx.Document(file_path)
            return "\n".join(para.text for para in doc.paragraphs)
        if ext in (".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".gif", ".webp"):
            return extract_text_from_image(file_path)
        return ""
    except Exception as e:
        return f"Error extracting text: {e}"
def create_multimodal_message(text, image=None):
    """Build a user-role chat message, attaching the image when one is given.

    Without an image the message content is the plain text. With an image
    the content is a two-part list: a text part followed by an image_url
    part whose URL comes from ``process_image_for_model``.
    """
    message = {"role": "user", "content": text}
    if image is not None:
        text_part = {"type": "text", "text": text}
        image_part = {
            "type": "image_url",
            "image_url": {"url": process_image_for_model(image)},
        }
        message["content"] = [text_part, image_part]
    return message