import time

import gradio as gr
import numpy as np
import torch
import torchvision.transforms as transforms
from facenet_pytorch import MTCNN
from PIL import Image
from torch import nn
from transformers import ViTModel


# Thin wrapper that exposes the ViT pooler output as the face embedding
class ViT(nn.Module):
    def __init__(self, base_model):
        super(ViT, self).__init__()
        self.base_model = base_model

    def forward(self, x):
        x = self.base_model(x).pooler_output
        return x


# Check if CUDA is available and select the device accordingly
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the pretrained ViT backbone and the fine-tuned verification weights
base_model = ViTModel.from_pretrained("WinKawaks/vit-small-patch16-224")
model = ViT(base_model)
model.load_state_dict(torch.load('faceViT6.pth', map_location=device))

# Set the model to evaluation mode and move it to the device
model.eval()
model.to(device)

# Initialize MTCNN for face detection
mtcnn = MTCNN(keep_all=True, min_face_size=20, thresholds=[0.6, 0.7, 0.7], device=device)

# Preprocessing applied to each aligned face crop
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor()
])


def cosine_similarity(embedding1, embedding2):
    similarity = torch.nn.functional.cosine_similarity(
        embedding1.flatten().unsqueeze(0), embedding2.flatten().unsqueeze(0))
    return similarity.item()


def align_face(frame):
    # Convert the frame to a PIL image if it's a numpy array
    if isinstance(frame, np.ndarray):
        frame = Image.fromarray(frame)
    boxes, _ = mtcnn.detect(frame)
    if boxes is not None and len(boxes) > 0:
        faces = mtcnn(frame)
        if faces is not None and len(faces) > 0:
            # Keep the first detected face and convert the tensor back to a PIL image
            face = faces[0]
            face = transforms.ToPILImage()(face)
            return face
    return None


def process_images(image1, image2):
    start_time = time.time()

    frame1 = np.array(image1)
    frame2 = np.array(image2)

    face1 = align_face(frame1)
    face2 = align_face(frame2)

    if face1 is None or face2 is None:
        return None, "Face not detected in one or both images."

    face1 = transform(face1).unsqueeze(0).to(device)
    face2 = transform(face2).unsqueeze(0).to(device)

    # Embed both faces and compare them with cosine similarity
    with torch.no_grad():
        embedding1 = model(face1)
        embedding2 = model(face2)

    similarity = cosine_similarity(embedding1, embedding2)

    inference_time = time.time() - start_time
    result = f"Similarity: {similarity:.2f}\nInference time: {inference_time:.2f} seconds"
    return (frame1, frame2), result


# Create the Gradio interface
iface = gr.Interface(
    fn=process_images,
    inputs=[gr.Image(type="pil"), gr.Image(type="pil")],
    outputs=[gr.Gallery(), gr.Textbox()],
    title="Face Verification with MTCNN and ViT",
    description="Upload two images and the model will verify if the faces in both images are of the same person."
)

# Launch the interface
iface.launch(share=True, debug=True)
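# Minimal usage sketch, kept as a comment so it does not interfere with the
# blocking iface.launch() call above. The file names "person_a.jpg" and
# "person_b.jpg" and the 0.7 decision threshold are illustrative assumptions,
# not part of the original script; a real threshold should be tuned on
# labeled same/different image pairs.
#
#     img1 = Image.open("person_a.jpg")
#     img2 = Image.open("person_b.jpg")
#     _, result = process_images(img1, img2)
#     print(result)  # e.g. "Similarity: 0.83\nInference time: 0.45 seconds"
#
#     # Turn the cosine similarity score into a same/different decision:
#     score = float(result.split()[1])
#     print("Same person" if score >= 0.7 else "Different people")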