nickprock's picture
Update app.py
23ef606 verified
raw
history blame
5.44 kB
import gradio as gr
from sentence_transformers import SentenceTransformer
import pandas as pd
from datasets import load_dataset
from annoy import AnnoyIndex
import os
try:
# Load the dataset (Italian subset, test split)
dataset = load_dataset("PhilipMay/stsb_multi_mt", name="it", split="test")
df = pd.DataFrame(dataset)
# Extract sentences (sentence1 and sentence2)
sentences1 = df["sentence1"].tolist()
sentences2 = df["sentence2"].tolist()
# Sentence-transformers models to test
model_names = [
"nickprock/multi-sentence-BERTino",
"nickprock/sentence-bert-base-italian-uncased",
"nickprock/sentence-bert-base-italian-xxl-uncased",
"nickprock/Italian-ModernBERT-base-embed-mmarco-mnrl",
]
models = {name: SentenceTransformer(name) for name in model_names}
annoy_indexes1 = {} # Store Annoy indexes for sentence1
annoy_indexes2 = {} # Store Annoy indexes for sentence2
def find_similar_sentence_annoy(sentence, model_name, sentence_list, annoy_index):
"""Finds the most similar sentence using Annoy."""
model = models[model_name]
sentence_embedding = model.encode(sentence)
nearest_neighbors = annoy_index[model_name].get_nns_by_vector(sentence_embedding, 1)
best_sentence_index = nearest_neighbors[0]
return sentence_list[best_sentence_index]
def calculate_similarity(sentence1, sentence2, model):
"""Calculates the cosine similarity between two sentences using a given model."""
embedding1 = model.encode(sentence1, convert_to_tensor=True)
embedding2 = model.encode(sentence2, convert_to_tensor=True)
similarity = util.cos_sim(embedding1, embedding2).item()
return similarity
def compare_models_annoy(sentence, model1_name, model2_name, model3_name, model4_name):
"""Compares the results of different models using Annoy."""
sentence1_results = {}
sentence2_results = {}
similarity_results = {}
sentence1_results[model1_name] = find_similar_sentence_annoy(sentence, model1_name, sentences1, annoy_indexes1)
sentence1_results[model2_name] = find_similar_sentence_annoy(sentence, model2_name, sentences1, annoy_indexes1)
sentence1_results[model3_name] = find_similar_sentence_annoy(sentence, model3_name, sentences1, annoy_indexes1)
sentence1_results[model4_name] = find_similar_sentence_annoy(sentence, model4_name, sentences1, annoy_indexes1)
sentence2_results[model1_name] = find_similar_sentence_annoy(sentence, model1_name, sentences2, annoy_indexes2)
sentence2_results[model2_name] = find_similar_sentence_annoy(sentence, model2_name, sentences2, annoy_indexes2)
sentence2_results[model3_name] = find_similar_sentence_annoy(sentence, model3_name, sentences2, annoy_indexes2)
sentence2_results[model4_name] = find_similar_sentence_annoy(sentence, model4_name, sentences2, annoy_indexes2)
# Calculate similarity between the retrieved sentences
for model_name in model_names:
similarity_results[model_name] = calculate_similarity(
sentence1_results[model_name], sentence2_results[model_name], models[model_name]
)
return sentence1_results, sentence2_results, similarity_results
def format_results(sentence1_results, sentence2_results, similarity_results):
"""Formats the results for display in Gradio."""
output_text = ""
for model_name in model_names:
output_text += f"**{model_name}**\n"
output_text += f"Most Similar Sentence from sentence1: {sentence1_results[model_name]}\n"
output_text += f"Most Similar Sentence from sentence2: {sentence2_results[model_name]}\n"
output_text += f"Similarity between retrieved sentences: {similarity_results[model_name]:.4f}\n\n"
return output_text
def gradio_interface(sentence, model1_name, model2_name, model3_name, model4_name):
"""Gradio interface function."""
sentence1_results, sentence2_results, similarity_results = compare_models_annoy(
sentence, model1_name, model2_name, model3_name, model4_name
)
return format_results(sentence1_results, sentence2_results, similarity_results)
iface = gr.Interface(
fn=gradio_interface,
inputs=[
gr.Textbox(lines=2, placeholder="Enter your sentence here..."),
gr.Dropdown(model_names, value=model_names[0], label="Model 1"),
gr.Dropdown(model_names, value=model_names[1], label="Model 2"),
gr.Dropdown(model_names, value=model_names[2], label="Model 3"),
gr.Dropdown(model_names, value=model_names[3], label="Model 4"),
],
outputs=gr.Markdown(),
title="Sentence Transformer Model Comparison (Annoy)",
description="Enter a sentence and compare the most similar sentences generated by different sentence-transformer models (using Annoy for faster search) from both sentence1 and sentence2.",
)
iface.launch()
except Exception as e:
print(f"Error loading dataset: {e}")
iface = gr.Interface(
fn=lambda: "Dataset loading failed. Check console for details.",
inputs=[],
outputs=gr.Textbox(),
title="Dataset Loading Error",
description="There was an error loading the dataset.",
)
iface.launch()