import time

import gradio as gr
import numpy as np
import torch
import torchvision.transforms as transforms
from facenet_pytorch import MTCNN
from PIL import Image
from torch import nn
from transformers import ViTModel


# ViT wrapper that exposes the backbone's pooled embedding as the model output
class ViT(nn.Module):
    def __init__(self, base_model):
        super().__init__()
        self.base_model = base_model

    def forward(self, x):
        return self.base_model(x).pooler_output


# Select the device first so the checkpoint can be mapped onto it directly
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the ViT backbone and the fine-tuned face-verification weights
base_model = ViTModel.from_pretrained("WinKawaks/vit-small-patch16-224")
model = ViT(base_model)
model.load_state_dict(torch.load('faceViT4.pth', map_location=device))
model.to(device)

# Set the model to evaluation mode
model.eval()

# Initialize MTCNN for face detection. post_process=False keeps the face
# crops as raw float tensors in [0, 255] rather than standardized values,
# so they can be converted back to PIL images without distortion.
mtcnn = MTCNN(keep_all=True, post_process=False, device=device)

# Preprocessing applied to each aligned face before it enters the ViT
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor()
])


def compare_faces(embedding1, embedding2, threshold=1.0):
    """Return the L2 distance between two embeddings and whether it falls below the match threshold."""
    dist = np.linalg.norm(embedding1.cpu().numpy() - embedding2.cpu().numpy())
    return dist, dist < threshold


def align_face(frame):
    """Detect faces with MTCNN and return the first one as a PIL image, or None."""
    # MTCNN accepts PIL images; convert if we were handed a numpy array
    if isinstance(frame, np.ndarray):
        frame = Image.fromarray(frame)
    faces = mtcnn(frame)  # (n_faces, 3, 160, 160) float crops in [0, 255], or None
    if faces is None or len(faces) == 0:
        return None
    # Scale to [0, 1] so ToPILImage interprets the float tensor correctly
    face = (faces[0] / 255.0).clamp(0, 1)
    return transforms.ToPILImage()(face)


def l2_normalize(tensor):
    """L2-normalize each row so embedding distances fall in [0, 2]."""
    norm = torch.norm(tensor, p=2, dim=1, keepdim=True)
    return tensor / norm


def process_images(image1, image2):
    start_time = time.time()

    face1 = align_face(np.array(image1))
    face2 = align_face(np.array(image2))

    if face1 is None or face2 is None:
        return None, "Face not detected in one or both images."

    face1 = transform(face1).unsqueeze(0).to(device)
    face2 = transform(face2).unsqueeze(0).to(device)

    with torch.no_grad():
        embedding1 = l2_normalize(model(face1))
        embedding2 = l2_normalize(model(face2))

    distance, is_match = compare_faces(embedding1, embedding2)
    inference_time = time.time() - start_time

    result = (f"Distance: {distance:.2f}\n"
              f"Match: {bool(is_match)}\n"
              f"Inference time: {inference_time:.2f} seconds")
    return [image1, image2], result


# Create the Gradio interface
iface = gr.Interface(
    fn=process_images,
    inputs=[gr.Image(type="pil"), gr.Image(type="pil")],
    outputs=[gr.Gallery(), gr.Textbox()],
    title="Face Verification with MTCNN and ViT",
    description="Upload two images and the model will verify if the faces in both images are of the same person."
)

# Launch the interface
iface.launch(share=True, debug=True)
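
# Optional smoke test: a minimal sketch for exercising the verification
# pipeline without the Gradio UI. The image paths below are hypothetical
# placeholders, not files shipped with this script. To use it, comment out
# iface.launch() above and run this block instead:
#
#   img1 = Image.open("person_a.jpg")  # hypothetical sample path
#   img2 = Image.open("person_b.jpg")  # hypothetical sample path
#   gallery, report = process_images(img1, img2)
#   print(report)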