import torch
import torch.nn as nn
import numpy as np
from PIL import Image, ImageDraw
from torchvision import transforms
from transformers import ViTImageProcessor, ViTModel
from facenet_pytorch import MTCNN
import gradio as gr
import time


# Define the ViT-based face-embedding model (ViT backbone + projection head)
class ViT(nn.Module):
    def __init__(self, base_model):
        super(ViT, self).__init__()
        self.base_model = base_model
        self.dropout = nn.Dropout(p=0.2)
        self.fc = nn.Linear(base_model.config.hidden_size, 512)
        self.dropout2 = nn.Dropout(p=0.2)
        self.l2_norm = nn.functional.normalize

    def forward(self, x):
        x = self.base_model(x).pooler_output
        x = self.dropout(x)
        x = self.fc(x)
        x = self.dropout2(x)
        x = self.l2_norm(x, p=2, dim=1)  # Apply L2 normalization
        return x


# Load the pre-trained ViT backbone and its image processor
model_name = "google/vit-base-patch16-224"
processor = ViTImageProcessor.from_pretrained(model_name)
base_model = ViTModel.from_pretrained(model_name)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = ViT(base_model).to(device)

# Load the fine-tuned weights; map_location keeps this working on CPU-only machines
model_path = r'best_vit11.pth'
model.load_state_dict(torch.load(model_path, map_location=device))
model.eval()  # Disable dropout so embeddings are deterministic at inference time

# Initialize MTCNN for face detection
mtcnn = MTCNN(keep_all=True, post_process=False, min_face_size=12, device=device)


def align_face(frame):
    # Convert the frame to a PIL image if it's a numpy array
    if isinstance(frame, np.ndarray):
        frame = Image.fromarray(frame)

    boxes, _ = mtcnn.detect(frame)
    if boxes is not None and len(boxes) > 0:
        faces = mtcnn(frame)
        if faces is not None and len(faces) > 0:
            face = faces[0]
            # Convert the face tensor to a PIL image
            face = transforms.ToPILImage()(face)
            return face, boxes[0]
    return None, None


def draw_bounding_box(image, box):
    draw = ImageDraw.Draw(image)
    draw.rectangle(box.tolist(), outline="red", width=3)
    return image


def euclidean_distance(embedding1, embedding2):
    return np.linalg.norm(embedding1 - embedding2)


def process_images(image1, image2):
    start_time = time.time()

    frame1 = np.array(image1)
    frame2 = np.array(image2)

    face1, box1 = align_face(frame1)
    face2, box2 = align_face(frame2)

    if face1 is None or face2 is None:
        return None, "Face not detected in one or both images."
    # Use the processor to preprocess the aligned faces
    face1 = processor(images=face1, return_tensors="pt").pixel_values.to(device)
    face2 = processor(images=face2, return_tensors="pt").pixel_values.to(device)

    with torch.no_grad():
        embedding1 = model(face1).cpu().numpy()
        embedding2 = model(face2).cpu().numpy()

    # Flatten the embeddings so they are 1D vectors
    embedding1 = embedding1.flatten()
    embedding2 = embedding2.flatten()

    euclidean_dist = euclidean_distance(embedding1, embedding2)
    is_match = euclidean_dist < 0.8

    # Map the distance to a confidence score clamped to [0, 1]
    confidence = max(0.0, 1.0 - euclidean_dist / 1.0)
    print(f'confidence={confidence}')

    end_time = time.time()
    inference_time = end_time - start_time

    # Draw bounding boxes on the original images
    image1_with_box = draw_bounding_box(image1, box1)
    image2_with_box = draw_bounding_box(image2, box2)

    result = f"Euclidean Distance: {euclidean_dist:.2f}\n"
    result += f"Match: {is_match}\n"
    result += f"Inference time: {inference_time:.2f} seconds"

    return [image1_with_box, image2_with_box], result


# Create the Gradio interface
iface = gr.Interface(
    fn=process_images,
    inputs=[gr.Image(type="pil"), gr.Image(type="pil")],
    outputs=[gr.Gallery(), gr.Textbox()],
    title="Face Verification with Vision Transformer",
    description="Upload two images and the model will verify if the faces in both images are of the same person.",
)

# Launch the interface
iface.launch(share=True, debug=True)
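
# --- Optional standalone check (a minimal sketch, not part of the Gradio app) ---
# Illustrates calling process_images() directly on two local files instead of
# going through the web UI. The file names "person_a.jpg" and "person_b.jpg" are
# hypothetical placeholders; point them at real images and set
# RUN_STANDALONE_CHECK to True (and skip iface.launch above, which blocks) to try it.
RUN_STANDALONE_CHECK = False
if RUN_STANDALONE_CHECK:
    img_a = Image.open("person_a.jpg").convert("RGB")  # hypothetical test image
    img_b = Image.open("person_b.jpg").convert("RGB")  # hypothetical test image
    images_with_boxes, summary = process_images(img_a, img_b)
    print(summary)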