import gradio as gr
from sentence_transformers import SentenceTransformer, util
import pandas as pd
from datasets import load_dataset
from annoy import AnnoyIndex
import os

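# Gradio demo: compare Italian sentence-transformer models on the stsb_multi_mt test split,
# using Annoy indexes for fast approximate nearest-neighbour search. Setup is wrapped in
# try/except so that a loading failure falls back to a minimal error interface.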
try:
    dataset = load_dataset("PhilipMay/stsb_multi_mt", name="it", split="test")
    df = pd.DataFrame(dataset)

    sentences1 = df["sentence1"].tolist()
    sentences2 = df["sentence2"].tolist()

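    # Italian sentence-embedding models to compare; all are loaded from the Hugging Face Hub.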
    model_names = [
        "nickprock/multi-sentence-BERTino",
        "nickprock/sentence-bert-base-italian-uncased",
        "nickprock/static-similarity-mmarco3m-mrl-BERTino-v1.5",
        "nickprock/Italian-ModernBERT-base-embed-mmarco-mnrl",
    ]

    models = {name: SentenceTransformer(name) for name in model_names}
    annoy_indexes1 = {}  # one Annoy index per model, built over sentences1
    annoy_indexes2 = {}  # one Annoy index per model, built over sentences2

    def build_annoy_index(model_name, sentences):
        """Builds an Annoy index over the given sentences for a given model."""
        model = models[model_name]
        embeddings = model.encode(sentences)
        embedding_dim = embeddings.shape[1]
        # Annoy's "angular" metric ranks neighbours by the cosine similarity of the vectors.
        annoy_index = AnnoyIndex(embedding_dim, "angular")
        for i, embedding in enumerate(embeddings):
            annoy_index.add_item(i, embedding)
        # 10 trees is a modest accuracy/speed trade-off, sufficient for this small test split.
        annoy_index.build(10)
        return annoy_index

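    # Pre-build one index per model for each sentence column so that every user query is
    # answered with a cheap approximate nearest-neighbour lookup.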
    for model_name in model_names:
        annoy_indexes1[model_name] = build_annoy_index(model_name, sentences1)
        annoy_indexes2[model_name] = build_annoy_index(model_name, sentences2)

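    # Illustrative sketch (not executed): Annoy can also return the angular distances
    # together with the neighbour ids, e.g. for the first model and an arbitrary query
    # ("una frase di prova" is just a placeholder, not taken from the dataset):
    #   query_vec = models[model_names[0]].encode("una frase di prova")
    #   ids, dists = annoy_indexes1[model_names[0]].get_nns_by_vector(
    #       query_vec, 3, include_distances=True
    #   )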
    def find_similar_sentence_annoy(sentence, model_name, sentence_list, annoy_indexes):
        """Finds the most similar sentence in sentence_list using the model's Annoy index."""
        model = models[model_name]
        sentence_embedding = model.encode(sentence)
        nearest_neighbors = annoy_indexes[model_name].get_nns_by_vector(sentence_embedding, 1)
        best_sentence_index = nearest_neighbors[0]
        return sentence_list[best_sentence_index]

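    # util.cos_sim accepts single embeddings and returns a 1x1 tensor; .item() below
    # converts that tensor to a plain Python float.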
    def calculate_cosine_similarity(sentence1, sentence2, model):
        """Calculates the cosine similarity between two sentences."""
        embedding1 = model.encode(sentence1)
        embedding2 = model.encode(sentence2)
        return util.cos_sim(embedding1, embedding2).item()

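    # For each selected model: retrieve the nearest sentence from each column of the
    # dataset, then score the two retrieved sentences against each other with that model.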
    def compare_models_annoy(sentence, model1_name, model2_name, model3_name, model4_name):
        """Compares the results of the selected models using Annoy."""
        selected_models = [model1_name, model2_name, model3_name, model4_name]
        sentence1_results = {}
        sentence2_results = {}
        similarities = {}

        for model_name in selected_models:
            sentence1_results[model_name] = find_similar_sentence_annoy(
                sentence, model_name, sentences1, annoy_indexes1
            )
            sentence2_results[model_name] = find_similar_sentence_annoy(
                sentence, model_name, sentences2, annoy_indexes2
            )
            similarities[model_name] = calculate_cosine_similarity(
                sentence1_results[model_name], sentence2_results[model_name], models[model_name]
            )

        return sentence1_results, sentence2_results, similarities

    def format_results(sentence1_results, sentence2_results, similarities):
        """Formats the results for display in Gradio."""
        output_text = ""
        # Iterate over the models that were actually selected (not the full model_names list,
        # which would raise a KeyError if the same model is chosen in two dropdowns), and use
        # double newlines so the Markdown output renders each item on its own line.
        for model_name in sentence1_results:
            output_text += f"**{model_name}**\n\n"
            output_text += f"Most Similar Sentence from sentence1: {sentence1_results[model_name]}\n\n"
            output_text += f"Most Similar Sentence from sentence2: {sentence2_results[model_name]}\n\n"
            output_text += f"Cosine Similarity: {similarities[model_name]:.4f}\n\n"
        return output_text

    def gradio_interface(sentence, model1_name, model2_name, model3_name, model4_name):
        """Gradio interface function."""
        sentence1_results, sentence2_results, similarities = compare_models_annoy(
            sentence, model1_name, model2_name, model3_name, model4_name
        )
        return format_results(sentence1_results, sentence2_results, similarities)

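    # Gradio UI: a free-text query box plus four dropdowns, pre-filled with the four models.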
    iface = gr.Interface(
        fn=gradio_interface,
        inputs=[
            gr.Textbox(lines=2, placeholder="Enter your sentence here..."),
            gr.Dropdown(model_names, value=model_names[0], label="Model 1"),
            gr.Dropdown(model_names, value=model_names[1], label="Model 2"),
            gr.Dropdown(model_names, value=model_names[2], label="Model 3"),
            gr.Dropdown(model_names, value=model_names[3], label="Model 4"),
        ],
        outputs=gr.Markdown(),
        title="Sentence Transformer Model Comparison (Annoy)",
        description=(
            "Enter a sentence and compare the most similar sentences retrieved by different "
            "sentence-transformer models (using Annoy for faster search) from both sentence1 "
            "and sentence2. The cosine similarity between the two retrieved sentences is also "
            "computed. Uses Italian sentence-transformers and the test split of the "
            "stsb_multi_mt dataset."
        ),
    )

    iface.launch()

except Exception as e:
    # The try block covers dataset loading, model downloads, and index building, so report
    # a generic setup failure rather than a dataset-only error.
    print(f"Error during setup: {e}")
    iface = gr.Interface(
        fn=lambda: "Setup failed. Check the console for details.",
        inputs=[],
        outputs=gr.Textbox(),
        title="Setup Error",
        description="There was an error while loading the dataset or the models.",
    )
    iface.launch()