# Spaces: Sleeping  (hosting-page status text captured with the source; not part of the program)
import torch | |
from torch import nn | |
import torchvision.transforms as transforms | |
import cv2 | |
import numpy as np | |
import gradio as gr | |
from PIL import Image | |
from facenet_pytorch import MTCNN | |
from transformers import ViTImageProcessor, ViTModel | |
import time | |
# Embedding wrapper around a ViT backbone
class ViT(nn.Module):
    """Wrap a backbone and expose its ``pooler_output`` as the embedding.

    ``base_model`` is invoked with the input batch and must return an object
    that has a ``pooler_output`` attribute.
    """

    def __init__(self, base_model):
        super().__init__()
        self.base_model = base_model

    def forward(self, x):
        """Run the backbone on ``x`` and return its ``pooler_output``."""
        backbone_output = self.base_model(x)
        return backbone_output.pooler_output
# Load the model and processor
# NOTE(review): `processor` is built from a different checkpoint name than the
# backbone below and is not referenced by any code visible in this file --
# confirm whether it is still needed.
model_name = "google/vit-base-patch16-224"
processor = ViTImageProcessor.from_pretrained(model_name)
base_model = ViTModel.from_pretrained("WinKawaks/vit-small-patch16-224")
model = ViT(base_model)

# Pick the device before loading so the checkpoint can be mapped onto it.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# map_location remaps tensors that were saved from a GPU, so loading the
# fine-tuned weights also works on a CPU-only host instead of raising.
model.load_state_dict(torch.load('faceViT6.pth', map_location=device))

# Set the model to evaluation mode and move it to the selected device
model.eval()
model.to(device)

# Initialize MTCNN for face detection (keep_all=True: return every detected face)
mtcnn = MTCNN(keep_all=True, min_face_size=20, thresholds=[0.6, 0.7, 0.7], device=device)

# Resize each face crop to 224x224 and convert it to a tensor for the model
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor()
])
def cosine_similarity(embedding1, embedding2):
    """Return the cosine similarity of two embeddings as a Python float.

    Each tensor is flattened into a single vector and given a leading axis of
    size one before the two are compared.
    """
    row_a = embedding1.flatten().unsqueeze(0)
    row_b = embedding2.flatten().unsqueeze(0)
    score = torch.nn.functional.cosine_similarity(row_a, row_b)
    return score.item()
def align_face(frame):
    """Detect faces in ``frame`` and return the first one as a PIL image.

    Args:
        frame: A PIL image or a numpy array; arrays are converted with
            ``Image.fromarray`` before detection.

    Returns:
        The first face crop returned by ``mtcnn`` converted to a PIL image,
        or ``None`` when ``mtcnn`` returns no faces.
    """
    # Convert the frame to a PIL image if it's a numpy array
    if isinstance(frame, np.ndarray):
        frame = Image.fromarray(frame)

    # Calling the MTCNN module already performs detection before cropping, so
    # a separate mtcnn.detect() pass beforehand would only repeat that work.
    # The call is expected to yield None when nothing is found (facenet-pytorch
    # behavior -- confirm against the installed version).
    faces = mtcnn(frame)
    if faces is None or len(faces) == 0:
        return None

    # NOTE(review): if mtcnn is left at its default post-processing, the crop
    # is a standardized float tensor rather than a [0, 1] image, and ToPILImage
    # scales floats by 255 -- confirm this round trip matches how the
    # fine-tuned weights were trained before changing it.
    return transforms.ToPILImage()(faces[0])
def process_images(image1, image2):
    """Compare the faces found in two images and report their similarity.

    Args:
        image1: First input image (the interface passes a PIL image), or
            ``None`` when nothing was uploaded.
        image2: Second input image, same conventions as ``image1``.

    Returns:
        A ``(gallery, message)`` pair. ``gallery`` is a tuple holding both
        inputs as numpy arrays, or ``None`` when the comparison could not be
        made. ``message`` carries the similarity score and elapsed time, or
        an explanation of why no comparison was made.
    """
    # A submission with an empty image slot arrives as None; bail out early
    # instead of failing inside the image conversion further down.
    if image1 is None or image2 is None:
        return None, "Please upload both images before submitting."

    # perf_counter is monotonic, so the measured interval cannot be skewed by
    # wall-clock adjustments.
    start_time = time.perf_counter()

    frame1 = np.array(image1)
    frame2 = np.array(image2)

    face1 = align_face(frame1)
    face2 = align_face(frame2)
    if face1 is None or face2 is None:
        return None, "Face not detected in one or both images."

    # Resize/tensorize each crop and add a leading batch axis for the model.
    face1 = transform(face1).unsqueeze(0).to(device)
    face2 = transform(face2).unsqueeze(0).to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        embedding1 = model(face1)
        embedding2 = model(face2)

    similarity = cosine_similarity(embedding1, embedding2)
    inference_time = time.perf_counter() - start_time

    result = f"Similarity: {similarity:.2f}\nInference time: {inference_time:.2f} seconds"
    return (frame1, frame2), result
# Build the web UI: two image inputs in, a gallery of both images plus a text
# result out, all driven by process_images.
iface = gr.Interface(
    title="Face Verification with MTCNN and ViT",
    description="Upload two images and the model will verify if the faces in both images are of the same person.",
    fn=process_images,
    inputs=[gr.Image(type="pil") for _ in range(2)],
    outputs=[gr.Gallery(), gr.Textbox()],
)

# Start serving the interface
iface.launch(debug=True, share=True)