import gradio as gr
import torch
import cv2
import os
import numpy as np
from PIL import Image, ImageEnhance
from ultralytics import YOLO
from decord import VideoReader, cpu
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
from backPrompt import main as main_b
from frontPrompt import main as main_f
import sentencepiece as spm

# YOLO document detector (front/back card classes)
model_path = "best.pt"
modelY = YOLO(model_path)

os.environ["TRANSFORMERS_CACHE"] = "./.cache"
cache_folder = "./.cache"
path = "OpenGVLab/InternVL2_5-2B"

# Load the Hugging Face model and tokenizer globally (downloaded only once)
model = AutoModel.from_pretrained(
    path,
    cache_dir=cache_folder,
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
    # load_in_8bit=True,
    low_cpu_mem_usage=True,
    use_flash_attn=True,
    trust_remote_code=True
).eval().cpu()

tokenizer = AutoTokenizer.from_pretrained(
    path,
    cache_dir=cache_folder,
    trust_remote_code=True,
    use_fast=False
)


def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    """Pick the tiling ratio whose aspect ratio is closest to the input image's."""
    best_ratio_diff = float('inf')
    best_ratio = (1, 1)
    area = width * height
    for ratio in target_ratios:
        target_aspect_ratio = ratio[0] / ratio[1]
        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
        if ratio_diff < best_ratio_diff:
            best_ratio_diff = ratio_diff
            best_ratio = ratio
        elif ratio_diff == best_ratio_diff:
            # On a tie, prefer the larger tiling when the image area is big enough
            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                best_ratio = ratio
    return best_ratio


def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    # Ensure the image is a PIL Image
    if not isinstance(image, Image.Image):
        image = Image.fromarray(image)

    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height  # Existing image aspect ratio

    # Enumerate all (columns, rows) tilings whose tile count lies in [min_num, max_num]
    target_ratios = set(
        (i, j)
        for n in range(min_num, max_num + 1)
        for i in range(1, n + 1)
        for j in range(1, n + 1)
        if i * j <= max_num and i * j >= min_num
    )
    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

    # Find the closest aspect ratio to the target
    target_aspect_ratio = find_closest_aspect_ratio(
        aspect_ratio, target_ratios, orig_width, orig_height, image_size
    )

    # Calculate the target width and height
    target_width = image_size * target_aspect_ratio[0]
    target_height = image_size * target_aspect_ratio[1]
    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]

    # Resize the image and split it into image_size x image_size tiles
    resized_img = image.resize((target_width, target_height))
    processed_images = []
    for i in range(blocks):
        # Calculate the crop box for each block
        box = (
            (i % (target_width // image_size)) * image_size,
            (i // (target_width // image_size)) * image_size,
            ((i % (target_width // image_size)) + 1) * image_size,
            ((i // (target_width // image_size)) + 1) * image_size
        )
        # Split the image
        split_img = resized_img.crop(box)
        processed_images.append(split_img)

    assert len(processed_images) == blocks

    if use_thumbnail and len(processed_images) != 1:
        thumbnail_img = image.resize((image_size, image_size))
        processed_images.append(thumbnail_img)

    # Only the first tile is returned; the downstream pipeline expects a single PIL image
    return processed_images[0]


def imageRotation(image):
    """Rotate portrait images to landscape; landscape images are returned unchanged."""
    if image.height > image.width:
        return image.rotate(90, expand=True)
    return image


def detect_document(image):
    """Detects front and back of the document using YOLO."""
    image = np.array(image)
    results = modelY(image, conf=0.85)

    detected_classes = set()
    labels = []
    bounding_boxes = []

    for result in results:
        for box in result.boxes:
            x1, y1, x2, y2 = map(int, box.xyxy[0])
            conf = box.conf[0]
            cls = int(box.cls[0])
            class_name = modelY.names[cls]
            detected_classes.add(class_name)
            label = f"{class_name} {conf:.2f}"
            labels.append(label)
            bounding_boxes.append((x1, y1, x2, y2, class_name, conf))  # Store bounding box with class and confidence

            cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), 2)
            cv2.putText(image, label, (x1, y1 - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

    possible_classes = {"front", "back"}
    missing_classes = possible_classes - detected_classes
    if missing_classes:
        labels.append(f"Missing: {', '.join(missing_classes)}")

    return Image.fromarray(image), labels, bounding_boxes


def crop_image(image, bounding_boxes):
    """Crops detected bounding boxes from the image."""
    cropped_images = {}
    image = np.array(image)

    for (x1, y1, x2, y2, class_name, conf) in bounding_boxes:
        cropped = image[y1:y2, x1:x2]
        cropped_images[class_name] = Image.fromarray(cropped)

    return cropped_images


def vision_ai_api(image, doc_type):
    """Runs the InternVL prompt pipeline for the given side of the card."""
    if doc_type == "front":
        results = main_f(image, model, tokenizer)
    elif doc_type == "back":
        results = main_b(image, model, tokenizer)
    else:
        results = None
    return results


def predict(image):
    """Pipeline: Preprocess -> Detect -> Crop -> Vision AI API."""
    processed_image = dynamic_preprocess(image)
    rotated_image = imageRotation(processed_image)  # Rotate portrait uploads to landscape
    detected_image, labels, bounding_boxes = detect_document(rotated_image)
    cropped_images = crop_image(rotated_image, bounding_boxes)

    # Call Vision AI separately for front and back if detected
    front_result, back_result = None, None
    if "front" in cropped_images:
        front_result = vision_ai_api(cropped_images["front"], "front")
    if "back" in cropped_images:
        back_result = vision_ai_api(cropped_images["back"], "back")

    api_results = {
        "front": front_result,
        "back": back_result
    }

    single_image = cropped_images.get("front") or cropped_images.get("back") or detected_image
    return single_image, labels, api_results


iface = gr.Interface(
    fn=predict,
    inputs="image",
    outputs=["image", "text", "json"],
    title="License Field Detection (Front & Back Card)"
)

iface.launch()
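
# --- Optional local sanity check (a minimal sketch, not part of the Gradio app) ---
# The block below only illustrates how `predict` could be exercised directly on a
# local file; "sample_license.jpg" is a hypothetical path, not a file shipped with
# this project. It is left commented out so the script still just launches the app.
#
# from PIL import Image
# test_img = Image.open("sample_license.jpg")
# annotated, labels, api_results = predict(test_img)
# print(labels)
# print(api_results)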