import gradio as gr
import fitz  # PyMuPDF for extracting text from PDFs
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load the NASA-specific bi-encoder model and tokenizer.
# These run once, when the module is imported, and the resulting objects are
# shared by every call to generate_embedding_with_dim().
# The same identifier is passed to both from_pretrained() calls so the
# tokenizer and the model always come from the same checkpoint.
bi_encoder_model_name = "nasa-impact/nasa-smd-ibm-st-v2"
bi_tokenizer = AutoTokenizer.from_pretrained(bi_encoder_model_name)
bi_model = AutoModel.from_pretrained(bi_encoder_model_name)

# Function to extract text from a PDF
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_file: The value handed to ``fitz.open`` as its first argument;
            in this app it is whatever Gradio supplies for a ``gr.File``
            input.

    Returns:
        One string made of each page's ``get_text()`` output in page order,
        with no separator inserted between pages. The string is empty when
        the document yields no pages.
    """
    with fitz.open(pdf_file) as doc:
        # join() assembles the result in a single pass instead of re-copying
        # the accumulated string once per page.
        return "".join(page.get_text() for page in doc)

# Function to generate embeddings and return dimensions
def generate_embedding_with_dim(text):
    """Embed ``text`` with the bi-encoder and describe the embedding size.

    The tokenizer is called with ``truncation=True, max_length=512``, so the
    input is cut to at most 512 tokens and any text beyond that limit does
    not contribute to the embedding.

    Args:
        text: The text to embed.

    Returns:
        A ``(embedding, label)`` tuple. ``embedding`` is the model's
        ``last_hidden_state`` averaged over ``dim=1``, squeezed, and converted
        to a NumPy array. ``label`` is the string
        ``"Embedding Dimensions: <n>"`` where ``<n>`` is the size of the
        array's first axis.
    """
    encoded = bi_tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )

    # Inference only, so skip gradient tracking.
    with torch.no_grad():
        model_output = bi_model(**encoded)

    # Mean pooling: average the hidden states over dim=1, then drop the
    # remaining size-1 axes before handing the result to NumPy.
    hidden_states = model_output.last_hidden_state
    pooled = hidden_states.mean(dim=1)
    embedding = pooled.squeeze().numpy()

    return embedding, f"Embedding Dimensions: {embedding.shape[0]}"

# Function to handle the full workflow: extract text, generate embeddings, and compute similarity
def compare_pdfs(pdf1, pdf2):
    """Compare two uploaded PDFs by the cosine similarity of their embeddings.

    Workflow: extract the text of each PDF, embed each text with
    ``generate_embedding_with_dim``, then score the two embeddings with
    ``compute_cosine_similarity``.

    Args:
        pdf1: The first uploaded file (the "Human SCDD" input).
        pdf2: The second uploaded file (the "AI SCDD" input).

    Returns:
        A 3-tuple of strings: the formatted similarity score (four decimal
        places), the embedding-dimension label for ``pdf1``, and the
        embedding-dimension label for ``pdf2``.

    Raises:
        gr.Error: If either file is missing, or if no text could be
            extracted from it.
    """
    # A missing upload or a PDF without any extractable text cannot yield a
    # meaningful similarity score, so stop early with a message the UI can
    # display instead of scoring an empty document.
    texts = []
    for label, pdf in (("Human SCDD", pdf1), ("AI SCDD", pdf2)):
        if pdf is None:
            raise gr.Error(f"Please upload the {label} PDF.")
        text = extract_text_from_pdf(pdf)
        if not text.strip():
            raise gr.Error(f"No extractable text was found in the {label} PDF.")
        texts.append(text)
    text1, text2 = texts

    # Generate embeddings and get their dimension labels
    embedding1, dim1 = generate_embedding_with_dim(text1)
    embedding2, dim2 = generate_embedding_with_dim(text2)

    # Compute cosine similarity between the two embeddings
    similarity_score = compute_cosine_similarity(embedding1, embedding2)

    # Return similarity score + embedding dimensions
    return f"The cosine similarity between the two PDFs is: {similarity_score:.4f}", dim1, dim2

# Function to compute the cosine similarity between two embeddings
def compute_cosine_similarity(embedding1, embedding2):
    """Return the cosine similarity between two embeddings as a scalar.

    Args:
        embedding1: First embedding; must support ``reshape``.
        embedding2: Second embedding; must support ``reshape``.

    Returns:
        The single entry of the matrix produced by ``cosine_similarity`` for
        the two embeddings.
    """
    # cosine_similarity is given one row per embedding, so flatten each
    # input into a single row first.
    row_a, row_b = (vec.reshape(1, -1) for vec in (embedding1, embedding2))
    similarity_matrix = cosine_similarity(row_a, row_b)
    return similarity_matrix[0][0]

# Gradio UI wiring: two PDF uploads go in; three text boxes come out (the
# similarity score, then the embedding-dimension label for each PDF).
inputs = [
    gr.File(label=caption)
    for caption in ("Upload Human SCDD", "Upload AI SCDD")
]
outputs = [
    gr.Textbox(label=caption)
    for caption in (
        "Cosine Similarity Score",
        "Embedding Dimensions (PDF 1)",
        "Embedding Dimensions (PDF 2)",
    )
]

# Build the interface around compare_pdfs and start serving it immediately.
gr.Interface(
    fn=compare_pdfs,
    inputs=inputs,
    outputs=outputs,
    title="AI-Human SCDD Similarity Checker with NASA Bi-Encoder",
).launch()