# Spaces: Sleeping
# import torch | |
# import torchvision.transforms as transforms | |
# import numpy as np | |
# import gradio as gr | |
# from PIL import Image, ImageDraw | |
# from facenet_pytorch import MTCNN, InceptionResnetV1 | |
# import time | |
# # Initialize MTCNN for face detection with smaller face size detection | |
# mtcnn = MTCNN(keep_all=True, device='cuda' if torch.cuda.is_available() else 'cpu', min_face_size=20) | |
# # Load the pre-trained FaceNet model | |
# facenet = InceptionResnetV1(pretrained='vggface2').eval().to('cuda' if torch.cuda.is_available() else 'cpu') | |
# model_path = r'faceNet_update_transformation.pth' | |
# model_state_dict = torch.load(model_path) | |
# facenet.load_state_dict(model_state_dict) | |
# facenet.eval() # Set the model to evaluation mode | |
# # Define the transformation with normalization | |
# val_test_transform = transforms.Compose([ | |
# transforms.Resize((160, 160)), # FaceNet expects 160x160 input | |
# transforms.ToTensor(), | |
# transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) | |
# ]) | |
# def compare_faces(embedding1, embedding2, threshold=0.2): # Adjusted threshold | |
# dist = np.linalg.norm(embedding1 - embedding2) | |
# return dist, dist < threshold | |
# def align_face(frame): | |
# # Convert the frame to a PIL image if it's a numpy array | |
# if isinstance(frame, np.ndarray): | |
# frame = Image.fromarray(frame) | |
# boxes, _ = mtcnn.detect(frame) | |
# if boxes is not None and len(boxes) > 0: | |
# faces = mtcnn(frame) | |
# if faces is not None and len(faces) > 0: | |
# face = faces[0] | |
# # Convert the face tensor to PIL Image | |
# face = transforms.ToPILImage()(face) | |
# return face, boxes[0] | |
# return None, None | |
# def draw_bounding_box(image, box): | |
# draw = ImageDraw.Draw(image) | |
# draw.rectangle(box.tolist(), outline="red", width=3) | |
# return image | |
# def l2_normalize(tensor): | |
# norm = np.linalg.norm(tensor, ord=2, axis=1, keepdims=True) | |
# return tensor / norm | |
# def process_images(image1, image2): | |
# start_time = time.time() | |
# frame1 = np.array(image1) | |
# frame2 = np.array(image2) | |
# face1, box1 = align_face(frame1) | |
# face2, box2 = align_face(frame2) | |
# if face1 is None or face2 is None: | |
# return None, "Face not detected in one or both images." | |
# face1 = val_test_transform(face1).unsqueeze(0).to('cuda' if torch.cuda.is_available() else 'cpu') | |
# face2 = val_test_transform(face2).unsqueeze(0).to('cuda' if torch.cuda.is_available() else 'cpu') | |
# with torch.no_grad(): | |
# embedding1 = facenet(face1).cpu().numpy() | |
# embedding2 = facenet(face2).cpu().numpy() | |
# embedding1 = l2_normalize(embedding1) | |
# embedding2 = l2_normalize(embedding2) | |
# distance, is_match = compare_faces(embedding1, embedding2, threshold=0.2) | |
# # Calculate confidence | |
# confidence = max(0.0, 1.0 - distance / 1.0) # Ensure confidence is between 0 and 1 | |
# print(f'confidence={confidence}') | |
# end_time = time.time() | |
# inference_time = end_time - start_time | |
# # Draw bounding boxes on the original images | |
# image1_with_box = draw_bounding_box(image1, box1) | |
# image2_with_box = draw_bounding_box(image2, box2) | |
# result = f"Distance: {distance:.2f}\nMatch: {is_match}\nInference time: {inference_time:.2f} seconds" | |
# return [image1_with_box, image2_with_box], result | |
# # Create the Gradio interface | |
# iface = gr.Interface( | |
# fn=process_images, | |
# inputs=[gr.Image(type="pil"), gr.Image(type="pil")], | |
# outputs=[gr.Gallery(), gr.Textbox()], | |
# title="Face Verification with FaceNet", | |
# description="Upload two images and the model will verify if the faces in both images are of the same person." | |
# ) | |
# # Launch the interface | |
# iface.launch(share=True, debug=True) | |
import torch | |
import torch.nn as nn | |
import numpy as np | |
from PIL import Image, ImageDraw | |
from torchvision import transforms | |
from transformers import ViTImageProcessor, ViTModel | |
from facenet_pytorch import MTCNN | |
import gradio as gr | |
import time | |
# ViT-based embedding model: a ViT backbone followed by a 512-d projection head
class ViT(nn.Module):
    """Embedding model built on top of a ViT backbone.

    The backbone's pooled output is passed through dropout, projected to a
    512-dimensional vector, passed through a second dropout and finally
    L2-normalized along dimension 1.
    """

    def __init__(self, base_model):
        """Attach the dropout and projection layers to ``base_model``.

        Args:
            base_model: Backbone module. It must expose
                ``config.hidden_size`` and return an object with a
                ``pooler_output`` attribute when called.
        """
        super().__init__()
        # Sub-module names are kept as-is so checkpoint keys keep matching.
        self.base_model = base_model
        self.dropout = nn.Dropout(p=0.2)
        self.fc = nn.Linear(base_model.config.hidden_size, 512)
        self.dropout2 = nn.Dropout(p=0.2)
        self.l2_norm = nn.functional.normalize

    def forward(self, x):
        """Return the L2-normalized 512-d embedding for input batch ``x``."""
        pooled = self.base_model(x).pooler_output
        projected = self.fc(self.dropout(pooled))
        # Normalize each row to unit L2 norm.
        return self.l2_norm(self.dropout2(projected), p=2, dim=1)
# Load the pre-trained ViT backbone and its image processor, then restore the
# weights stored at model_path into the ViT embedding model.
model_name = "google/vit-base-patch16-224"
processor = ViTImageProcessor.from_pretrained(model_name)
base_model = ViTModel.from_pretrained(model_name)
model = ViT(base_model)
model_path = r'best_vit11.pth'
# Resolve the target device once instead of repeating the expression.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# map_location lets a checkpoint that was saved on a GPU load on a CPU-only
# host; without it torch.load tries to restore tensors to their saved device.
model.load_state_dict(torch.load(model_path, map_location=device))
model.eval().to(device)
# Initialize MTCNN for face detection (keep every detected face, accept
# faces down to 20 px).
mtcnn = MTCNN(keep_all=True, min_face_size=20, device=device)
def align_face(frame):
    """Detect faces in ``frame`` and return the first face crop with its box.

    Args:
        frame: A PIL image, or a numpy array (converted with
            ``Image.fromarray`` before detection).

    Returns:
        A ``(face, box)`` tuple where ``face`` is a PIL image built from the
        first tensor returned by ``mtcnn(frame)`` and ``box`` is the first
        entry returned by ``mtcnn.detect(frame)``. ``(None, None)`` is
        returned when either call yields no faces.
    """
    if isinstance(frame, np.ndarray):
        frame = Image.fromarray(frame)

    boxes, _ = mtcnn.detect(frame)
    if boxes is None or len(boxes) == 0:
        return None, None

    crops = mtcnn(frame)
    if crops is None or len(crops) == 0:
        return None, None

    # NOTE(review): with facenet_pytorch's default post-processing the crop
    # is presumably a standardized float tensor, which ToPILImage would not
    # map back to ordinary pixel values -- confirm against the training
    # pipeline before changing this conversion.
    first_face = transforms.ToPILImage()(crops[0])
    return first_face, boxes[0]
def draw_bounding_box(image, box):
    """Outline ``box`` in red on ``image`` and return that same image object.

    Args:
        image: Image to draw on; it is passed to ``ImageDraw.Draw`` and
            returned afterwards.
        box: Object providing ``tolist()``; the resulting list is handed to
            ``rectangle`` unchanged.

    Returns:
        The ``image`` argument.
    """
    corners = box.tolist()
    ImageDraw.Draw(image).rectangle(corners, outline="red", width=3)
    return image
def euclidean_distance(embedding1, embedding2):
    """Return the norm of the element-wise difference of two embeddings."""
    difference = embedding1 - embedding2
    return np.linalg.norm(difference)
def cosine_similarity(embedding1, embedding2):
    """Return the cosine similarity of two embedding vectors.

    Args:
        embedding1: First vector (numpy array).
        embedding2: Second vector (numpy array).

    Returns:
        ``dot(embedding1, embedding2)`` divided by the product of the two
        norms, or ``0.0`` when either vector has zero norm.
    """
    denominator = np.linalg.norm(embedding1) * np.linalg.norm(embedding2)
    # A zero-norm vector has no direction; report "no similarity" instead of
    # dividing by zero and returning NaN.
    if denominator == 0:
        return 0.0
    return np.dot(embedding1, embedding2) / denominator
def process_images(image1, image2):
    """Compare the first detected face in each of two images.

    Each image is run through ``align_face``; the face crops are preprocessed
    with ``processor``, embedded with ``model`` and compared by Euclidean
    distance. A distance below 0.2 is reported as a match.

    Args:
        image1: First input image, or ``None`` when nothing was uploaded.
        image2: Second input image, or ``None`` when nothing was uploaded.

    Returns:
        A ``(images, message)`` tuple. On success ``images`` is a list with
        both inputs after their face boxes have been drawn on them and
        ``message`` reports distance, match flag and elapsed time. When an
        input is missing or no face is detected, ``images`` is ``None`` and
        ``message`` explains why.
    """
    # A missing upload arrives as None; report it instead of failing inside
    # the numpy / PIL conversion below.
    if image1 is None or image2 is None:
        return None, "Please upload both images."

    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments.
    start_time = time.perf_counter()

    face1, box1 = align_face(np.array(image1))
    face2, box2 = align_face(np.array(image2))
    if face1 is None or face2 is None:
        return None, "Face not detected in one or both images."

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Use processor to preprocess the images
    face1 = processor(images=face1, return_tensors="pt").pixel_values.to(device)
    face2 = processor(images=face2, return_tensors="pt").pixel_values.to(device)

    with torch.no_grad():
        embedding1 = model(face1).cpu().numpy()
        embedding2 = model(face2).cpu().numpy()

    # Flatten the embeddings so the distance is computed on 1-D vectors
    embedding1 = embedding1.flatten()
    embedding2 = embedding2.flatten()

    euclidean_dist = euclidean_distance(embedding1, embedding2)
    is_match = euclidean_dist < 0.2

    # Confidence is only logged; it is clamped at 0 for distances above 1.
    confidence = max(0.0, 1.0 - euclidean_dist / 1.0)
    print(f'confidence={confidence}')

    inference_time = time.perf_counter() - start_time

    # Draw bounding boxes on the original images
    image1_with_box = draw_bounding_box(image1, box1)
    image2_with_box = draw_bounding_box(image2, box2)

    result = f"Euclidean Distance: {euclidean_dist:.2f}\n"
    result += f"Match: {is_match}\n"
    result += f"Inference time: {inference_time:.2f} seconds"
    return [image1_with_box, image2_with_box], result
# Create the Gradio interface: two PIL image inputs, with a gallery and a
# text box as outputs.
iface = gr.Interface(
    fn=process_images,
    inputs=[gr.Image(type="pil") for _ in range(2)],
    outputs=[gr.Gallery(), gr.Textbox()],
    title="Face Verification with Vision Transformer",
    description="Upload two images and the model will verify if the faces in both images are of the same person.",
)

# Launch the interface
# NOTE(review): share=True requests a public share link; presumably not
# needed when the app is already hosted -- confirm for the deployment target.
iface.launch(share=True, debug=True)