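# Face verification demo (Gradio). The commented-out block below is the
# earlier FaceNet-based implementation, kept for reference; the active
# implementation that follows replaces it with a fine-tuned Vision
# Transformer (ViT) embedding model.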
# import torch
# import torchvision.transforms as transforms
# import numpy as np
# import gradio as gr
# from PIL import Image, ImageDraw
# from facenet_pytorch import MTCNN, InceptionResnetV1
# import time

# # Initialize MTCNN for face detection with smaller face size detection
# mtcnn = MTCNN(keep_all=True, device='cuda' if torch.cuda.is_available() else 'cpu', min_face_size=20)

# # Load the pre-trained FaceNet model
# facenet = InceptionResnetV1(pretrained='vggface2').eval().to('cuda' if torch.cuda.is_available() else 'cpu')
# model_path = r'faceNet_update_transformation.pth'
# model_state_dict = torch.load(model_path, map_location='cuda' if torch.cuda.is_available() else 'cpu')  # map_location lets a GPU-saved checkpoint load on CPU
# facenet.load_state_dict(model_state_dict)
# facenet.eval()  # Set the model to evaluation mode

# # Define the transformation with normalization
# val_test_transform = transforms.Compose([
#     transforms.Resize((160, 160)),  # FaceNet expects 160x160 input
#     transforms.ToTensor(),
#     transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
# ])

# def compare_faces(embedding1, embedding2, threshold=0.2):  # Adjusted threshold
#     dist = np.linalg.norm(embedding1 - embedding2)
#     return dist, dist < threshold

# def align_face(frame):
#     # Convert the frame to a PIL image if it's a numpy array
#     if isinstance(frame, np.ndarray):
#         frame = Image.fromarray(frame)
#     boxes, _ = mtcnn.detect(frame)  # detect outside the branch so boxes is always defined
#     if boxes is not None and len(boxes) > 0:
#         faces = mtcnn(frame)
#         if faces is not None and len(faces) > 0:
#             face = faces[0]
#             # Convert the face tensor to PIL Image
#             face = transforms.ToPILImage()(face)
#             return face, boxes[0]
#     return None, None

# def draw_bounding_box(image, box):
#     draw = ImageDraw.Draw(image)
#     draw.rectangle(box.tolist(), outline="red", width=3)
#     return image

# def l2_normalize(tensor):
#     norm = np.linalg.norm(tensor, ord=2, axis=1, keepdims=True)
#     return tensor / norm

# def process_images(image1, image2):
#     start_time = time.time()
    
#     frame1 = np.array(image1)
#     frame2 = np.array(image2)
    
#     face1, box1 = align_face(frame1)
#     face2, box2 = align_face(frame2)
    
#     if face1 is None or face2 is None:
#         return None, "Face not detected in one or both images."
    
#     face1 = val_test_transform(face1).unsqueeze(0).to('cuda' if torch.cuda.is_available() else 'cpu')
#     face2 = val_test_transform(face2).unsqueeze(0).to('cuda' if torch.cuda.is_available() else 'cpu')
    
#     with torch.no_grad():
#         embedding1 = facenet(face1).cpu().numpy()
#         embedding2 = facenet(face2).cpu().numpy()
    
#     embedding1 = l2_normalize(embedding1)
#     embedding2 = l2_normalize(embedding2)
    
#     distance, is_match = compare_faces(embedding1, embedding2, threshold=0.2)
    
#     # Calculate confidence
#     confidence = max(0.0, 1.0 - distance / 1.0)  # Ensure confidence is between 0 and 1
#     print(f'confidence={confidence}')
#     end_time = time.time()
#     inference_time = end_time - start_time
    
#     # Draw bounding boxes on the original images
#     image1_with_box = draw_bounding_box(image1, box1)
#     image2_with_box = draw_bounding_box(image2, box2)
    
#     result = f"Distance: {distance:.2f}\nMatch: {is_match}\nInference time: {inference_time:.2f} seconds"
    
#     return [image1_with_box, image2_with_box], result

# # Create the Gradio interface
# iface = gr.Interface(
#     fn=process_images,
#     inputs=[gr.Image(type="pil"), gr.Image(type="pil")],
#     outputs=[gr.Gallery(), gr.Textbox()],
#     title="Face Verification with FaceNet",
#     description="Upload two images and the model will verify if the faces in both images are of the same person."
# )

# # Launch the interface
# iface.launch(share=True, debug=True)

import torch
import torch.nn as nn
import numpy as np
from PIL import Image, ImageDraw
from torchvision import transforms
from transformers import ViTImageProcessor, ViTModel
from facenet_pytorch import MTCNN
import gradio as gr
import time

# Define the Vision Transformer (ViT) architecture
class ViT(nn.Module):
    def __init__(self, base_model):
        super(ViT, self).__init__()
        self.base_model = base_model
        self.dropout = nn.Dropout(p=0.2)
        self.fc = nn.Linear(base_model.config.hidden_size, 512)
        self.dropout2 = nn.Dropout(p=0.2)
        self.l2_norm = nn.functional.normalize

    def forward(self, x):
        x = self.base_model(x).pooler_output
        x = self.dropout(x)
        x = self.fc(x)
        x = self.dropout2(x)
        x = self.l2_norm(x, p=2, dim=1)  # Apply L2 normalization
        return x
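
# Shape sanity check (illustrative sketch only, commented out so the
# script's runtime behavior is unchanged; it uses `device` and `model`,
# which are constructed below): the base ViT maps a batch of 224x224 RGB
# crops to a 768-dim pooler output, which the head projects to a
# unit-length 512-dim embedding.
#
#   x = torch.randn(1, 3, 224, 224).to(device)  # stand-in for one preprocessed face crop
#   emb = model(x)
#   print(emb.shape)                             # torch.Size([1, 512])
#   print(torch.linalg.norm(emb, dim=1))         # ~1.0 after L2 normalization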

# Load the pre-trained ViT model and processor
device = 'cuda' if torch.cuda.is_available() else 'cpu'  # pick the compute device once and reuse it
model_name = "google/vit-base-patch16-224"
processor = ViTImageProcessor.from_pretrained(model_name)
base_model = ViTModel.from_pretrained(model_name)
model = ViT(base_model)
model_path = r'best_vit11.pth'
model.load_state_dict(torch.load(model_path, map_location=device))  # map_location lets a GPU-saved checkpoint load on CPU
model.eval().to(device)  # evaluation mode: disables dropout

# Initialize MTCNN for face detection
mtcnn = MTCNN(keep_all=True, min_face_size=20, device=device)

def align_face(frame):
    # Convert the frame to a PIL image if it's a numpy array
    if isinstance(frame, np.ndarray):
        frame = Image.fromarray(frame)
    boxes, _ = mtcnn.detect(frame)
    if boxes is not None and len(boxes) > 0:
        faces = mtcnn(frame)
        if faces is not None and len(faces) > 0:
            face = faces[0]
            # Convert the face tensor to PIL Image
            face = transforms.ToPILImage()(face)
            return face, boxes[0]
    return None, None
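
# Example usage (sketch; 'photo.jpg' is a hypothetical path):
#
#   face, box = align_face(np.array(Image.open('photo.jpg')))
#   if face is not None:  # (None, None) is returned when no face is detected
#       face.show()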

def draw_bounding_box(image, box):
    draw = ImageDraw.Draw(image)
    draw.rectangle(box.tolist(), outline="red", width=3)
    return image

def euclidean_distance(embedding1, embedding2):
    return np.linalg.norm(embedding1 - embedding2)

def cosine_similarity(embedding1, embedding2):
    return np.dot(embedding1, embedding2) / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2))
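
# For the unit-length embeddings this model produces, the two metrics are
# tied by dist**2 == 2 - 2 * cos_sim, so the Euclidean threshold of 0.2
# used below corresponds to a cosine similarity of about 0.98.
#
#   d = euclidean_distance(e1, e2)  # e1, e2: L2-normalized 1D embeddings
#   assert np.isclose(d ** 2, 2 - 2 * cosine_similarity(e1, e2))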

def process_images(image1, image2):
    start_time = time.time()
    
    frame1 = np.array(image1)
    frame2 = np.array(image2)
    
    face1, box1 = align_face(frame1)
    face2, box2 = align_face(frame2)
    
    if face1 is None or face2 is None:
        return None, "Face not detected in one or both images."
    
    # Use processor to preprocess the images
    face1 = processor(images=face1, return_tensors="pt").pixel_values.to(device)
    face2 = processor(images=face2, return_tensors="pt").pixel_values.to(device)
    
    with torch.no_grad():
        embedding1 = model(face1).cpu().numpy()
        embedding2 = model(face2).cpu().numpy()
    
    # Flatten the embeddings if necessary (ensuring they are 1D)
    embedding1 = embedding1.flatten()
    embedding2 = embedding2.flatten()
    
    euclidean_dist = euclidean_distance(embedding1, embedding2)
    cosine_sim = cosine_similarity(embedding1, embedding2)
    is_match = euclidean_dist < 0.2
    
    # Calculate a rough confidence score, clamped to the [0, 1] range
    confidence = max(0.0, 1.0 - euclidean_dist)
    print(f'confidence={confidence}')  # logged for debugging; not shown in the UI
    end_time = time.time()
    inference_time = end_time - start_time
    
    # Draw bounding boxes on the original images
    image1_with_box = draw_bounding_box(image1, box1)
    image2_with_box = draw_bounding_box(image2, box2)
    
    result = f"Euclidean Distance: {euclidean_dist:.2f}\n"
    # result += f"Cosine Similarity: {cosine_sim:.2f}\n"
    result += f"Match: {is_match}\n"
    result += f"Inference time: {inference_time:.2f} seconds"
    
    return [image1_with_box, image2_with_box], result
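
# Quick end-to-end check (sketch; 'a.jpg' and 'b.jpg' are hypothetical paths):
#
#   images, summary = process_images(Image.open('a.jpg'), Image.open('b.jpg'))
#   print(summary)  # Euclidean distance, match verdict, and inference time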

# Create the Gradio interface
iface = gr.Interface(
    fn=process_images,
    inputs=[gr.Image(type="pil"), gr.Image(type="pil")],
    outputs=[gr.Gallery(), gr.Textbox()],
    title="Face Verification with Vision Transformer",
    description="Upload two images and the model will verify if the faces in both images are of the same person."
)

# Launch the interface
iface.launch(share=True, debug=True)