import time

import gradio as gr
import numpy as np
import torch
import torchvision.transforms as transforms
from facenet_pytorch import MTCNN
from PIL import Image
from torch import nn
from transformers import ViTModel


# Thin wrapper that exposes the ViT pooler output as the face embedding
class ViT(nn.Module):
    def __init__(self, base_model):
        super(ViT, self).__init__()
        self.base_model = base_model

    def forward(self, x):
        x = self.base_model(x).pooler_output
        return x


# Check if CUDA is available and select the device accordingly
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the pretrained ViT backbone and the fine-tuned verification weights
base_model = ViTModel.from_pretrained("WinKawaks/vit-small-patch16-224")
model = ViT(base_model)
model.load_state_dict(torch.load('faceViT6.pth', map_location=device))

# Set the model to evaluation mode and move it to the device
model.eval()
model.to(device)

# Initialize MTCNN for face detection
mtcnn = MTCNN(keep_all=True, min_face_size=20, thresholds=[0.6, 0.7, 0.7], device=device)

# Preprocessing applied to each aligned face crop
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor()
])


def cosine_similarity(embedding1, embedding2):
    similarity = torch.nn.functional.cosine_similarity(
        embedding1.flatten().unsqueeze(0), embedding2.flatten().unsqueeze(0))
    return similarity.item()


def align_face(frame):
    # Convert the frame to a PIL image if it's a numpy array
    if isinstance(frame, np.ndarray):
        frame = Image.fromarray(frame)
    boxes, _ = mtcnn.detect(frame)
    if boxes is not None and len(boxes) > 0:
        faces = mtcnn(frame)
        if faces is not None and len(faces) > 0:
            # Keep the first detected face and convert the tensor back to a PIL image
            face = faces[0]
            face = transforms.ToPILImage()(face)
            return face
    return None


def process_images(image1, image2):
    start_time = time.time()

    frame1 = np.array(image1)
    frame2 = np.array(image2)

    face1 = align_face(frame1)
    face2 = align_face(frame2)

    if face1 is None or face2 is None:
        return None, "Face not detected in one or both images."

    face1 = transform(face1).unsqueeze(0).to(device)
    face2 = transform(face2).unsqueeze(0).to(device)

    # Embed both faces and compare them with cosine similarity
    with torch.no_grad():
        embedding1 = model(face1)
        embedding2 = model(face2)

    similarity = cosine_similarity(embedding1, embedding2)

    inference_time = time.time() - start_time
    result = f"Similarity: {similarity:.2f}\nInference time: {inference_time:.2f} seconds"
    return (frame1, frame2), result


# Create the Gradio interface
iface = gr.Interface(
    fn=process_images,
    inputs=[gr.Image(type="pil"), gr.Image(type="pil")],
    outputs=[gr.Gallery(), gr.Textbox()],
    title="Face Verification with MTCNN and ViT",
    description="Upload two images and the model will verify if the faces in both images are of the same person."
)

# Launch the interface
iface.launch(share=True, debug=True)
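# Minimal usage sketch, kept as a comment so it does not interfere with the
# blocking iface.launch() call above. The file names "person_a.jpg" and
# "person_b.jpg" and the 0.7 decision threshold are illustrative assumptions,
# not part of the original script; a real threshold should be tuned on
# labeled same/different image pairs.
#
#     img1 = Image.open("person_a.jpg")
#     img2 = Image.open("person_b.jpg")
#     _, result = process_images(img1, img2)
#     print(result)  # e.g. "Similarity: 0.83\nInference time: 0.45 seconds"
#
#     # Turn the cosine similarity score into a same/different decision:
#     score = float(result.split()[1])
#     print("Same person" if score >= 0.7 else "Different people")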