import os

import gradio as gr
import torch
import cv2
import numpy as np
from PIL import Image, ImageEnhance
from ultralytics import YOLO
from transformers import AutoModel, AutoTokenizer

from backPrompt import main as main_b
from frontPrompt import main as main_f

# YOLO model trained to locate the front and back faces of the license card
model_path = "best.pt"
modelY = YOLO(model_path)

# Cache Hugging Face downloads locally (set before any from_pretrained call)
os.environ["TRANSFORMERS_CACHE"] = "./.cache"
cache_folder = "./.cache"

path = "OpenGVLab/InternVL2_5-2B"

# Load the Hugging Face model and tokenizer globally (downloaded only once)
model = AutoModel.from_pretrained(
    path,
    cache_dir=cache_folder,
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
    # load_in_8bit=True,
    low_cpu_mem_usage=True,
    use_flash_attn=torch.cuda.is_available(),  # flash attention requires a GPU
    trust_remote_code=True,
).eval().cpu()

tokenizer = AutoTokenizer.from_pretrained(
    path,
    cache_dir=cache_folder,
    trust_remote_code=True,
    use_fast=False,
)


def preprocessing(image, image_size=800):
    """Apply enhancement filters and pad each side of the image up to
    `image_size` while keeping the full content."""
    # Convert input to a PIL Image (if it isn't already)
    image = Image.fromarray(np.array(image))

    # Apply enhancement filters
    image = ImageEnhance.Sharpness(image).enhance(2.0)   # increase sharpness
    image = ImageEnhance.Contrast(image).enhance(1.5)    # increase contrast
    image = ImageEnhance.Brightness(image).enhance(0.8)  # reduce brightness

    orig_width, orig_height = image.size

    # Padding needed to bring each side up to at least `image_size`
    pad_x = max(image_size - orig_width, 0)
    pad_y = max(image_size - orig_height, 0)

    # Create a blank white canvas and paste the original image in the center
    padded_image = Image.new("RGB", (orig_width + pad_x, orig_height + pad_y), (255, 255, 255))
    padded_image.paste(image, (pad_x // 2, pad_y // 2))
    return padded_image


def imageRotation(image):
    """Rotate portrait images into landscape orientation."""
    if image.height > image.width:
        return image.rotate(90, expand=True)
    return image


def detect_document(image):
    """Detect the front and back of the document using YOLO."""
    image = np.array(image)
    results = modelY(image, conf=0.85)

    detected_classes = set()
    labels = []
    bounding_boxes = []

    for result in results:
        for box in result.boxes:
            x1, y1, x2, y2 = map(int, box.xyxy[0])
            conf = float(box.conf[0])
            cls = int(box.cls[0])
            class_name = modelY.names[cls]

            detected_classes.add(class_name)
            label = f"{class_name} {conf:.2f}"
            labels.append(label)
            # Store bounding box with class and confidence
            bounding_boxes.append((x1, y1, x2, y2, class_name, conf))

            # Draw the box and label on the image for display
            cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), 2)
            cv2.putText(image, label, (x1, y1 - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

    possible_classes = {"front", "back"}
    missing_classes = possible_classes - detected_classes
    if missing_classes:
        labels.append(f"Missing: {', '.join(missing_classes)}")

    return Image.fromarray(image), labels, bounding_boxes


def crop_image(image, bounding_boxes):
    """Crop detected bounding boxes from the image."""
    cropped_images = {}
    image = np.array(image)
    for (x1, y1, x2, y2, class_name, conf) in bounding_boxes:
        cropped = image[y1:y2, x1:x2]
        cropped_images[class_name] = Image.fromarray(cropped)
    return cropped_images


def vision_ai_api(image, doc_type):
    """Run the extraction prompt matching the detected card side."""
    if doc_type == "front":
        return main_f(image, model, tokenizer)
    if doc_type == "back":
        return main_b(image, model, tokenizer)
    return None


def predict(image):
    """Pipeline: Preprocess -> Rotate -> Detect -> Crop -> Vision AI API."""
    processed_image = preprocessing(image)
    rotated_image = imageRotation(processed_image)
    detected_image, labels, bounding_boxes = detect_document(rotated_image)
    cropped_images = crop_image(rotated_image, bounding_boxes)

    # Call the vision model separately for front and back if detected
    front_result, back_result = None, None
    if "front" in cropped_images:
        front_result = vision_ai_api(cropped_images["front"], "front")
    if "back" in cropped_images:
        back_result = vision_ai_api(cropped_images["back"], "back")

    api_results = {"front": front_result, "back": back_result}
    single_image = cropped_images.get("front") or cropped_images.get("back") or detected_image
    return single_image, labels, api_results


iface = gr.Interface(
    fn=predict,
    inputs="image",
    outputs=["image", "text", "json"],
    title="License Field Detection (Front & Back Card)",
)

iface.launch()
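
# A minimal headless test of the pipeline, left commented out so it does not
# interfere with the Gradio server above. Assumption: "sample_license.jpg" is a
# hypothetical local image containing a license card; run this in a separate
# session (or before iface.launch()) to exercise predict() without the UI:
#
#   from PIL import Image
#   img = Image.open("sample_license.jpg")
#   annotated, labels, results = predict(img)
#   print(labels)
#   print(results)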