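# Face verification demo (Gradio). The commented-out block below is the
# earlier FaceNet-based implementation, kept for reference; the active
# implementation that follows replaces it with a fine-tuned Vision
# Transformer (ViT) embedding model.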
# import torch
# import torchvision.transforms as transforms
# import numpy as np
# import gradio as gr
# from PIL import Image, ImageDraw
# from facenet_pytorch import MTCNN, InceptionResnetV1
# import time

# # Initialize MTCNN for face detection with smaller face size detection
# mtcnn = MTCNN(keep_all=True, device='cuda' if torch.cuda.is_available() else 'cpu', min_face_size=20)

# # Load the pre-trained FaceNet model
# facenet = InceptionResnetV1(pretrained='vggface2').eval().to('cuda' if torch.cuda.is_available() else 'cpu')
# model_path = r'faceNet_update_transformation.pth'
# model_state_dict = torch.load(model_path, map_location='cuda' if torch.cuda.is_available() else 'cpu')  # map_location lets a GPU-saved checkpoint load on CPU
# facenet.load_state_dict(model_state_dict)
# facenet.eval()  # Set the model to evaluation mode

# # Define the transformation with normalization
# val_test_transform = transforms.Compose([
#     transforms.Resize((160, 160)),  # FaceNet expects 160x160 input
#     transforms.ToTensor(),
#     transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
# ])

# def compare_faces(embedding1, embedding2, threshold=0.2):  # Adjusted threshold
#     dist = np.linalg.norm(embedding1 - embedding2)
#     return dist, dist < threshold

# def align_face(frame):
#     # Convert the frame to a PIL image if it's a numpy array
#     if isinstance(frame, np.ndarray):
#         frame = Image.fromarray(frame)
#     boxes, _ = mtcnn.detect(frame)  # detect outside the branch so boxes is always defined
#     if boxes is not None and len(boxes) > 0:
#         faces = mtcnn(frame)
#         if faces is not None and len(faces) > 0:
#             face = faces[0]
#             # Convert the face tensor to PIL Image
#             face = transforms.ToPILImage()(face)
#             return face, boxes[0]
#     return None, None

# def draw_bounding_box(image, box):
#     draw = ImageDraw.Draw(image)
#     draw.rectangle(box.tolist(), outline="red", width=3)
#     return image

# def l2_normalize(tensor):
#     norm = np.linalg.norm(tensor, ord=2, axis=1, keepdims=True)
#     return tensor / norm

# def process_images(image1, image2):
#     start_time = time.time()
    
#     frame1 = np.array(image1)
#     frame2 = np.array(image2)
    
#     face1, box1 = align_face(frame1)
#     face2, box2 = align_face(frame2)
    
#     if face1 is None or face2 is None:
#         return None, "Face not detected in one or both images."
    
#     face1 = val_test_transform(face1).unsqueeze(0).to('cuda' if torch.cuda.is_available() else 'cpu')
#     face2 = val_test_transform(face2).unsqueeze(0).to('cuda' if torch.cuda.is_available() else 'cpu')
    
#     with torch.no_grad():
#         embedding1 = facenet(face1).cpu().numpy()
#         embedding2 = facenet(face2).cpu().numpy()
    
#     embedding1 = l2_normalize(embedding1)
#     embedding2 = l2_normalize(embedding2)
    
#     distance, is_match = compare_faces(embedding1, embedding2, threshold=0.2)
    
#     # Calculate confidence
#     confidence = max(0.0, 1.0 - distance / 1.0)  # Ensure confidence is between 0 and 1
#     print(f'confidence={confidence}')
#     end_time = time.time()
#     inference_time = end_time - start_time
    
#     # Draw bounding boxes on the original images
#     image1_with_box = draw_bounding_box(image1, box1)
#     image2_with_box = draw_bounding_box(image2, box2)
    
#     result = f"Distance: {distance:.2f}\nMatch: {is_match}\nInference time: {inference_time:.2f} seconds"
    
#     return [image1_with_box, image2_with_box], result

# # Create the Gradio interface
# iface = gr.Interface(
#     fn=process_images,
#     inputs=[gr.Image(type="pil"), gr.Image(type="pil")],
#     outputs=[gr.Gallery(), gr.Textbox()],
#     title="Face Verification with FaceNet",
#     description="Upload two images and the model will verify if the faces in both images are of the same person."
# )

# # Launch the interface
# iface.launch(share=True, debug=True)

import torch
import torch.nn as nn
import numpy as np
from PIL import Image, ImageDraw
from torchvision import transforms
from transformers import ViTImageProcessor, ViTModel
from facenet_pytorch import MTCNN
import gradio as gr
import time

# Define the Vision Transformer (ViT) architecture
class ViT(nn.Module):
    def __init__(self, base_model):
        super(ViT, self).__init__()
        self.base_model = base_model
        self.dropout = nn.Dropout(p=0.2)
        self.fc = nn.Linear(base_model.config.hidden_size, 512)
        self.dropout2 = nn.Dropout(p=0.2)
        self.l2_norm = nn.functional.normalize

    def forward(self, x):
        x = self.base_model(x).pooler_output
        x = self.dropout(x)
        x = self.fc(x)
        x = self.dropout2(x)
        x = self.l2_norm(x, p=2, dim=1)  # Apply L2 normalization
        return x
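
# Shape sanity check (illustrative sketch only, commented out so the
# script's runtime behavior is unchanged; it uses `device` and `model`,
# which are constructed below): the base ViT maps a batch of 224x224 RGB
# crops to a 768-dim pooler output, which the head projects to a
# unit-length 512-dim embedding.
#
#   x = torch.randn(1, 3, 224, 224).to(device)  # stand-in for one preprocessed face crop
#   emb = model(x)
#   print(emb.shape)                             # torch.Size([1, 512])
#   print(torch.linalg.norm(emb, dim=1))         # ~1.0 after L2 normalization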

# Load the pre-trained ViT model and processor
device = 'cuda' if torch.cuda.is_available() else 'cpu'  # pick the compute device once and reuse it
model_name = "google/vit-base-patch16-224"
processor = ViTImageProcessor.from_pretrained(model_name)
base_model = ViTModel.from_pretrained(model_name)
model = ViT(base_model)
model_path = r'best_vit11.pth'
model.load_state_dict(torch.load(model_path, map_location=device))  # map_location lets a GPU-saved checkpoint load on CPU
model.eval().to(device)  # evaluation mode: disables dropout

# Initialize MTCNN for face detection
mtcnn = MTCNN(keep_all=True, min_face_size=20, device=device)

def align_face(frame):
    # Convert the frame to a PIL image if it's a numpy array
    if isinstance(frame, np.ndarray):
        frame = Image.fromarray(frame)
    boxes, _ = mtcnn.detect(frame)
    if boxes is not None and len(boxes) > 0:
        faces = mtcnn(frame)
        if faces is not None and len(faces) > 0:
            face = faces[0]
            # Convert the face tensor to PIL Image
            face = transforms.ToPILImage()(face)
            return face, boxes[0]
    return None, None
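
# Example usage (sketch; 'photo.jpg' is a hypothetical path):
#
#   face, box = align_face(np.array(Image.open('photo.jpg')))
#   if face is not None:  # (None, None) is returned when no face is detected
#       face.show()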

def draw_bounding_box(image, box):
    draw = ImageDraw.Draw(image)
    draw.rectangle(box.tolist(), outline="red", width=3)
    return image

def euclidean_distance(embedding1, embedding2):
    return np.linalg.norm(embedding1 - embedding2)

def cosine_similarity(embedding1, embedding2):
    return np.dot(embedding1, embedding2) / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2))
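
# For the unit-length embeddings this model produces, the two metrics are
# tied by dist**2 == 2 - 2 * cos_sim, so the Euclidean threshold of 0.2
# used below corresponds to a cosine similarity of about 0.98.
#
#   d = euclidean_distance(e1, e2)  # e1, e2: L2-normalized 1D embeddings
#   assert np.isclose(d ** 2, 2 - 2 * cosine_similarity(e1, e2))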

def process_images(image1, image2):
    start_time = time.time()
    
    frame1 = np.array(image1)
    frame2 = np.array(image2)
    
    face1, box1 = align_face(frame1)
    face2, box2 = align_face(frame2)
    
    if face1 is None or face2 is None:
        return None, "Face not detected in one or both images."
    
    # Use processor to preprocess the images
    face1 = processor(images=face1, return_tensors="pt").pixel_values.to(device)
    face2 = processor(images=face2, return_tensors="pt").pixel_values.to(device)
    
    with torch.no_grad():
        embedding1 = model(face1).cpu().numpy()
        embedding2 = model(face2).cpu().numpy()
    
    # Flatten the embeddings if necessary (ensuring they are 1D)
    embedding1 = embedding1.flatten()
    embedding2 = embedding2.flatten()
    
    euclidean_dist = euclidean_distance(embedding1, embedding2)
    cosine_sim = cosine_similarity(embedding1, embedding2)
    is_match = euclidean_dist < 0.2
    
    # Calculate a rough confidence score, clamped to the [0, 1] range
    confidence = max(0.0, 1.0 - euclidean_dist)
    print(f'confidence={confidence}')  # logged for debugging; not shown in the UI
    end_time = time.time()
    inference_time = end_time - start_time
    
    # Draw bounding boxes on the original images
    image1_with_box = draw_bounding_box(image1, box1)
    image2_with_box = draw_bounding_box(image2, box2)
    
    result = f"Euclidean Distance: {euclidean_dist:.2f}\n"
    # result += f"Cosine Similarity: {cosine_sim:.2f}\n"
    result += f"Match: {is_match}\n"
    result += f"Inference time: {inference_time:.2f} seconds"
    
    return [image1_with_box, image2_with_box], result
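
# Quick end-to-end check (sketch; 'a.jpg' and 'b.jpg' are hypothetical paths):
#
#   images, summary = process_images(Image.open('a.jpg'), Image.open('b.jpg'))
#   print(summary)  # Euclidean distance, match verdict, and inference time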

# Create the Gradio interface
iface = gr.Interface(
    fn=process_images,
    inputs=[gr.Image(type="pil"), gr.Image(type="pil")],
    outputs=[gr.Gallery(), gr.Textbox()],
    title="Face Verification with Vision Transformer",
    description="Upload two images and the model will verify if the faces in both images are of the same person."
)

# Launch the interface
iface.launch(share=True, debug=True)