"""Gradio app that compares two PDFs using a NASA bi-encoder embedding model.

The text of each uploaded PDF is extracted, embedded by mean-pooling the
model's token representations, and the two embeddings are compared with
cosine similarity.
"""

import os

import fitz  # PyMuPDF for extracting text from PDFs
import gradio as gr
import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModel, AutoTokenizer

# Load the NASA-specific bi-encoder model and tokenizer once, at import time,
# so every request reuses the same weights.
bi_encoder_model_name = "nasa-impact/nasa-smd-ibm-st-v2"
bi_tokenizer = AutoTokenizer.from_pretrained(bi_encoder_model_name)
bi_model = AutoModel.from_pretrained(bi_encoder_model_name)


def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_file: Path to the PDF (``str`` or ``os.PathLike``), or an upload
            object that exposes the path on disk as ``.name``.

    Returns:
        The extracted text; an empty string if no page has extractable text.

    Raises:
        ValueError: If ``pdf_file`` is ``None``.
    """
    if pdf_file is None:
        # fitz.open(None) would create a brand-new empty document and hide
        # the missing upload, so fail explicitly instead.
        raise ValueError("No PDF file was provided.")

    if isinstance(pdf_file, (str, os.PathLike)):
        path = pdf_file
    else:
        # NOTE(review): some Gradio versions pass a temp-file wrapper rather
        # than a path string; its ``.name`` is the on-disk path.
        path = getattr(pdf_file, "name", pdf_file)

    with fitz.open(path) as doc:
        return "".join(page.get_text() for page in doc)


def generate_embedding_with_dim(text):
    """Embed ``text`` with the bi-encoder and describe the embedding size.

    The text is truncated to at most 512 tokens, so only the beginning of a
    long document contributes to the embedding.

    Args:
        text: The text to embed.

    Returns:
        A tuple ``(embedding, description)`` where ``embedding`` is a NumPy
        array and ``description`` is the string
        ``"Embedding Dimensions: <n>"``.
    """
    # Tokenize the text and create input tensors
    encoded = bi_tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = bi_model(**encoded)

    # Mean pooling over the token axis. Only the leading batch axis is
    # dropped (rather than a blanket squeeze) so the result stays a vector.
    embedding = outputs.last_hidden_state.mean(dim=1).squeeze(0).numpy()

    # The hidden size is the last axis of the pooled output.
    embedding_dim = embedding.shape[-1]
    return embedding, f"Embedding Dimensions: {embedding_dim}"


def compute_cosine_similarity(embedding1, embedding2):
    """Return the cosine similarity between two embedding vectors.

    Args:
        embedding1: First embedding as a NumPy array.
        embedding2: Second embedding as a NumPy array.

    Returns:
        The cosine similarity as a float.
    """
    # scikit-learn expects 2-D inputs of shape (n_samples, n_features).
    first = embedding1.reshape(1, -1)
    second = embedding2.reshape(1, -1)
    return float(cosine_similarity(first, second)[0][0])


def compare_pdfs(pdf1, pdf2):
    """Run the full workflow: extract text, embed, and compute similarity.

    Args:
        pdf1: First uploaded PDF (as delivered by the ``gr.File`` input).
        pdf2: Second uploaded PDF (as delivered by the ``gr.File`` input).

    Returns:
        A tuple of three strings: the similarity message and the embedding
        dimension descriptions for the first and second PDF. When an upload
        is missing or a PDF has no extractable text, the first string is an
        explanatory message and the other two are empty.
    """
    if pdf1 is None or pdf2 is None:
        return "Please upload both PDF files before comparing.", "", ""

    # Extract text from both PDFs
    text1 = extract_text_from_pdf(pdf1)
    text2 = extract_text_from_pdf(pdf2)

    # A PDF without a text layer (e.g. a scan) yields no text; a similarity
    # computed from empty input would be meaningless.
    if not text1.strip() or not text2.strip():
        return (
            "No extractable text was found in at least one of the PDFs.",
            "",
            "",
        )

    # Generate embeddings and get their dimensions
    embedding1, dim1 = generate_embedding_with_dim(text1)
    embedding2, dim2 = generate_embedding_with_dim(text2)

    # Compute cosine similarity between the two embeddings
    similarity_score = compute_cosine_similarity(embedding1, embedding2)

    # Return similarity score + embedding dimensions
    return (
        f"The cosine similarity between the two PDFs is: {similarity_score:.4f}",
        dim1,
        dim2,
    )


# Gradio interface: accept two PDFs, show similarity + embedding dimensions
inputs = [
    gr.File(label="Upload Human SCDD"),
    gr.File(label="Upload AI SCDD"),
]
outputs = [
    gr.Textbox(label="Cosine Similarity Score"),
    gr.Textbox(label="Embedding Dimensions (PDF 1)"),
    gr.Textbox(label="Embedding Dimensions (PDF 2)"),
]

# Set up the Gradio interface and start the app.
demo = gr.Interface(
    fn=compare_pdfs,
    inputs=inputs,
    outputs=outputs,
    title="AI-Human SCDD Similarity Checker with NASA Bi-Encoder",
)
demo.launch()