import torch
from torch import nn
import torchvision.transforms as transforms
import numpy as np
import gradio as gr
from PIL import Image
from facenet_pytorch import MTCNN
from transformers import ViTModel
import time

# Wrap the Hugging Face ViT backbone so the forward pass returns the
# pooled [CLS] representation, which serves as the face embedding
class ViT(nn.Module):
    def __init__(self, base_model):
        super(ViT, self).__init__()
        self.base_model = base_model

    def forward(self, x):
        # pooler_output: (batch, hidden_size) embedding of the [CLS] token
        return self.base_model(x).pooler_output

# Select the device before loading weights so the checkpoint can be
# mapped onto CPU when no GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the ViT backbone and the fine-tuned face-verification weights
base_model = ViTModel.from_pretrained("WinKawaks/vit-small-patch16-224")
model = ViT(base_model)
model.load_state_dict(torch.load('faceViT6.pth', map_location=device))

# Move the model to the device and set it to evaluation mode
model.to(device)
model.eval()

# Initialize MTCNN for face detection (keep_all=True returns every detected
# face; the three thresholds are the P-Net/R-Net/O-Net stage confidences)
mtcnn = MTCNN(keep_all=True, min_face_size=20, thresholds=[0.6, 0.7, 0.7], device=device)

# Define the transformation
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor()
])
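
# Note (assumption): the Compose above feeds raw [0, 1] tensors to the model.
# If faceViT6.pth was instead trained with the standard ViT preprocessing
# (per-channel normalization with mean 0.5 and std 0.5), a Normalize step
# would be required. A sketch of that variant, pending confirmation from the
# training recipe:
#
# transform = transforms.Compose([
#     transforms.Resize((224, 224)),
#     transforms.ToTensor(),
#     transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
# ])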

def cosine_similarity(embedding1, embedding2):
    # Flatten each (1, hidden_size) embedding and compare along dim 1
    similarity = torch.nn.functional.cosine_similarity(
        embedding1.flatten().unsqueeze(0), embedding2.flatten().unsqueeze(0)
    )
    return similarity.item()
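
# A minimal sketch of turning the similarity score into a same/different
# decision. The 0.7 threshold is an illustrative assumption, not a value
# calibrated for faceViT6.pth; in practice it should be tuned on a labeled
# validation set.
def is_same_person(embedding1, embedding2, threshold=0.7):
    return cosine_similarity(embedding1, embedding2) >= threshold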

def align_face(frame):
    # Convert the frame to a PIL image if it's a numpy array
    if isinstance(frame, np.ndarray):
        frame = Image.fromarray(frame)
    boxes, _ = mtcnn.detect(frame)
    if boxes is not None and len(boxes) > 0:
        # mtcnn(frame) returns cropped 160x160 face tensors, standardized
        # by facenet_pytorch's default post-processing
        faces = mtcnn(frame)
        if faces is not None and len(faces) > 0:
            # Use the first detected face and convert it back to a PIL image
            face = faces[0]
            face = transforms.ToPILImage()(face)
            return face
    return None
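
# Possible refinement (an assumption, not part of the original pipeline):
# with keep_all=True, faces[0] is simply the first detection, not necessarily
# the main subject. Inside align_face one could instead pick the face with
# the largest bounding box:
#
# areas = [(b[2] - b[0]) * (b[3] - b[1]) for b in boxes]
# face = faces[int(np.argmax(areas))]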

def process_images(image1, image2):
    start_time = time.time()
    
    frame1 = np.array(image1)
    frame2 = np.array(image2)
    
    face1 = align_face(frame1)
    face2 = align_face(frame2)
    
    if face1 is None or face2 is None:
        return None, "Face not detected in one or both images."
    
    face1 = transform(face1).unsqueeze(0).to(device)
    face2 = transform(face2).unsqueeze(0).to(device)
    
    with torch.no_grad():
        embedding1 = model(face1)
        embedding2 = model(face2)
    
    similarity = cosine_similarity(embedding1, embedding2)
    
    end_time = time.time()
    inference_time = end_time - start_time
    
    result = f"Similarity: {similarity:.2f}\nInference time: {inference_time:.2f} seconds"
    
    return [frame1, frame2], result

# Create the Gradio interface
iface = gr.Interface(
    fn=process_images,
    inputs=[gr.Image(type="pil"), gr.Image(type="pil")],
    outputs=[gr.Gallery(), gr.Textbox()],
    title="Face Verification with MTCNN and ViT",
    description="Upload two images and the model will verify whether the faces in both images belong to the same person."
)

# Launch the interface
iface.launch(share=True, debug=True)
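
# To run locally (assumes the dependencies below are installed and the
# fine-tuned checkpoint faceViT6.pth sits next to this script):
#   pip install torch torchvision facenet-pytorch transformers gradio
#   python app.py   # "app.py" is an assumed filename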