import os

import cv2
import gradio as gr
import numpy as np
import torch
from PIL import Image, ImageEnhance
from transformers import AutoModel, AutoTokenizer
from ultralytics import YOLO

from backPrompt import main as main_b
from frontPrompt import main as main_f

# Point the Hugging Face cache at a local folder before any downloads happen.
os.environ["TRANSFORMERS_CACHE"] = "./.cache"
cache_folder = "./.cache"

# YOLO weights for detecting the front and back of the card.
model_path = "best.pt"
modelY = YOLO(model_path)

# Hugging Face model id for the vision-language model used for field extraction.
path = "..."  # TODO: set the checkpoint id this app uses

# Load the Hugging Face model and tokenizer globally (downloaded only once).
model = AutoModel.from_pretrained(
    path,
    cache_dir=cache_folder,
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
    low_cpu_mem_usage=True,
    use_flash_attn=False,  # flash attention needs a CUDA GPU; this app runs the model on CPU
    trust_remote_code=True,
).eval().to("cpu")

tokenizer = AutoTokenizer.from_pretrained(
    path,
    cache_dir=cache_folder,
    trust_remote_code=True,
    use_fast=False,
)


def preprocessing(image):
    """Apply three enhancement filters (sharpness, contrast, reduced brightness), then resize."""
    image = Image.fromarray(np.array(image))
    image = ImageEnhance.Sharpness(image).enhance(2.0)   # increase sharpness
    image = ImageEnhance.Contrast(image).enhance(1.5)    # increase contrast
    image = ImageEnhance.Brightness(image).enhance(0.8)  # reduce brightness

    # Resize to a fixed width, preserving the aspect ratio.
    width = 800
    aspect_ratio = image.height / image.width
    height = int(width * aspect_ratio)
    return image.resize((width, height))


def imageRotation(image):
    """Rotate portrait images to landscape; landscape images pass through unchanged."""
    if image.height > image.width:
        return image.rotate(90, expand=True)
    return image


def detect_document(image):
    """Detect the front and back of the document with YOLO and annotate the boxes."""
    image = np.array(image)
    results = modelY(image, conf=0.85)

    detected_classes = set()
    labels = []
    bounding_boxes = []
    for result in results:
        for box in result.boxes:
            x1, y1, x2, y2 = map(int, box.xyxy[0])
            conf = float(box.conf[0])
            cls = int(box.cls[0])
            class_name = modelY.names[cls]

            detected_classes.add(class_name)
            label = f"{class_name} {conf:.2f}"
            labels.append(label)
            # Store the bounding box together with its class and confidence.
            bounding_boxes.append((x1, y1, x2, y2, class_name, conf))

            cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), 2)
            cv2.putText(image, label, (x1, y1 - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

    possible_classes = {"front", "back"}
    missing_classes = possible_classes - detected_classes
    if missing_classes:
        labels.append(f"Missing: {', '.join(missing_classes)}")

    return Image.fromarray(image), labels, bounding_boxes


def crop_image(image, bounding_boxes):
    """Crop each detected bounding box from the image, keyed by class name."""
    image = np.array(image)
    cropped_images = {}
    for (x1, y1, x2, y2, class_name, conf) in bounding_boxes:
        cropped_images[class_name] = Image.fromarray(image[y1:y2, x1:x2])
    return cropped_images


def vision_ai_api(image, doc_type):
    """Route the cropped image to the front- or back-side extraction prompt."""
    if doc_type == "front":
        return main_f(image, model, tokenizer)
    if doc_type == "back":
        return main_b(image, model, tokenizer)
    return None


def predict(image):
    """Pipeline: preprocess -> rotate -> detect -> crop -> vision AI API."""
    processed_image = preprocessing(image)
    rotated_image = imageRotation(processed_image)
    detected_image, labels, bounding_boxes = detect_document(rotated_image)
    cropped_images = crop_image(rotated_image, bounding_boxes)

    # Call the vision AI separately for the front and back, if detected.
    front_result, back_result = None, None
    if "front" in cropped_images:
        front_result = vision_ai_api(cropped_images["front"], "front")
    if "back" in cropped_images:
        back_result = vision_ai_api(cropped_images["back"], "back")

    api_results = {"front": front_result, "back": back_result}
    single_image = cropped_images.get("front") or cropped_images.get("back") or detected_image
    return single_image, labels, api_results
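# A minimal smoke test of the pipeline without the Gradio UI (assumption:
# "sample_license.jpg" is a hypothetical local test image). Useful when
# debugging the detection and extraction steps in isolation:
#
#   annotated, labels, results = predict(Image.open("sample_license.jpg"))
#   print(labels)
#   print(results)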
# The three outputs map to predict's return values: the cropped/annotated
# image, the detection labels, and the extracted fields as JSON.
iface = gr.Interface(
    fn=predict,
    inputs="image",
    outputs=["image", "text", "json"],
    title="License Field Detection (Front & Back Card)",
)
iface.launch()
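# When hosting in a container, you can bind to all interfaces with Gradio's
# standard launch options, e.g. iface.launch(server_name="0.0.0.0", server_port=7860).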