|
import gradio as gr |
|
from sentence_transformers import SentenceTransformer |
|
import pandas as pd |
|
from datasets import load_dataset |
|
from annoy import AnnoyIndex |
|
import os |
|
|
|
try: |
|
|
|
# Load the "it" configuration (test split) of PhilipMay/stsb_multi_mt and
# expose it as a DataFrame so the two sentence columns can be pulled out.
dataset = load_dataset("PhilipMay/stsb_multi_mt", name="it", split="test")
df = pd.DataFrame(dataset)

# The two corpora that user queries are matched against.
sentences1 = df["sentence1"].tolist()
sentences2 = df["sentence2"].tolist()

# Sentence-transformer checkpoints offered for comparison in the UI.
model_names = [
    "nickprock/multi-sentence-BERTino",
    "nickprock/sentence-bert-base-italian-uncased",
    "nickprock/sentence-bert-base-italian-xxl-uncased",
    "nickprock/Italian-ModernBERT-base-embed-mmarco-mnrl",
]

# Instantiate every model once at startup; requests reuse these objects.
models = {name: SentenceTransformer(name) for name in model_names}


def _build_annoy_index(model, sentence_list, n_trees=10):
    """Return an Annoy index over the embeddings of ``sentence_list``.

    Item ids are the positions of the sentences in ``sentence_list``, so an
    id returned by a nearest-neighbour query can be used directly as a list
    index.

    Args:
        model: Object whose ``encode`` maps a list of sentences to a 2-D
            array of embeddings (one row per sentence).
        sentence_list: Sentences to index.
        n_trees: Number of trees passed to ``AnnoyIndex.build``.
    """
    embeddings = model.encode(sentence_list)
    # Angular distance ranks neighbours the same way cosine similarity does.
    index = AnnoyIndex(embeddings.shape[1], "angular")
    for item_id, vector in enumerate(embeddings):
        index.add_item(item_id, vector)
    index.build(n_trees)
    return index


# One index per model and per corpus. These dictionaries were previously left
# empty, which made every ``annoy_index[model_name]`` lookup raise KeyError.
annoy_indexes1 = {
    name: _build_annoy_index(models[name], sentences1) for name in model_names
}
annoy_indexes2 = {
    name: _build_annoy_index(models[name], sentences2) for name in model_names
}
|
|
|
def find_similar_sentence_annoy(sentence, model_name, sentence_list, annoy_index):
    """Return the entry of ``sentence_list`` closest to ``sentence``.

    The query is embedded with ``models[model_name]`` and the single nearest
    item is requested from ``annoy_index[model_name]``. The returned item id
    is used as a position in ``sentence_list``.

    Args:
        sentence: Query text.
        model_name: Key into both ``models`` and ``annoy_index``.
        sentence_list: Sentences addressed by the index's item ids.
        annoy_index: Mapping from model name to an index object that
            provides ``get_nns_by_vector``.
    """
    query_vector = models[model_name].encode(sentence)
    # Ask for exactly one neighbour and take its item id.
    closest_id = annoy_index[model_name].get_nns_by_vector(query_vector, 1)[0]
    return sentence_list[closest_id]
|
|
|
def calculate_similarity(sentence1, sentence2, model):
    """Return the cosine similarity between two sentences as a float.

    Args:
        sentence1: First sentence.
        sentence2: Second sentence.
        model: Object whose ``encode(text, convert_to_tensor=True)`` returns
            the sentence embedding as a tensor.

    Returns:
        The scalar produced by ``util.cos_sim`` for the two embeddings.
    """
    # ``util`` is not imported at module level in this file, so the original
    # call raised NameError; import it here from the package already in use.
    from sentence_transformers import util

    embedding1 = model.encode(sentence1, convert_to_tensor=True)
    embedding2 = model.encode(sentence2, convert_to_tensor=True)
    return util.cos_sim(embedding1, embedding2).item()
|
|
|
def compare_models_annoy(sentence, model1_name, model2_name, model3_name, model4_name):
    """Run the nearest-sentence lookup for each selected model.

    For every distinct selected model, the closest sentence is retrieved from
    ``sentences1`` and from ``sentences2``, and the cosine similarity between
    those two retrieved sentences is computed with the same model.

    Args:
        sentence: Query text.
        model1_name, model2_name, model3_name, model4_name: Model names
            chosen in the UI. Duplicates are processed once; empty or
            ``None`` selections are skipped.

    Returns:
        A tuple ``(sentence1_results, sentence2_results, similarity_results)``
        of dictionaries keyed by model name, in selection order.
    """
    # dict.fromkeys de-duplicates while preserving selection order.
    selected = [
        name
        for name in dict.fromkeys(
            (model1_name, model2_name, model3_name, model4_name)
        )
        if name
    ]

    sentence1_results = {
        name: find_similar_sentence_annoy(sentence, name, sentences1, annoy_indexes1)
        for name in selected
    }
    sentence2_results = {
        name: find_similar_sentence_annoy(sentence, name, sentences2, annoy_indexes2)
        for name in selected
    }

    # Iterate the selected models, not the global ``model_names``: the result
    # dictionaries only contain selected models, so looping over the full
    # list raised KeyError whenever the selection was not all four models.
    similarity_results = {
        name: calculate_similarity(
            sentence1_results[name], sentence2_results[name], models[name]
        )
        for name in selected
    }

    return sentence1_results, sentence2_results, similarity_results
|
|
|
def format_results(sentence1_results, sentence2_results, similarity_results):
    """Render the per-model comparison results as Markdown text.

    Args:
        sentence1_results: Mapping of model name to the sentence retrieved
            from the first corpus.
        sentence2_results: Mapping of model name to the sentence retrieved
            from the second corpus.
        similarity_results: Mapping of model name to the similarity score
            between the two retrieved sentences.

    Returns:
        One section per model in ``sentence1_results`` (in that mapping's
        order), or an empty string when there are no results.
    """
    # Iterate the models actually present in the results rather than a
    # module-level list, so a partial selection does not raise KeyError.
    sections = [
        f"**{model_name}**\n"
        f"Most Similar Sentence from sentence1: {sentence1_results[model_name]}\n"
        f"Most Similar Sentence from sentence2: {sentence2_results[model_name]}\n"
        f"Similarity between retrieved sentences: {similarity_results[model_name]:.4f}\n\n"
        for model_name in sentence1_results
    ]
    return "".join(sections)
|
|
|
def gradio_interface(sentence, model1_name, model2_name, model3_name, model4_name):
    """Gradio callback: compare the chosen models and return display text.

    Passes the query and the four selected model names to
    ``compare_models_annoy`` and hands its three result dictionaries to
    ``format_results``.
    """
    comparison = compare_models_annoy(
        sentence, model1_name, model2_name, model3_name, model4_name
    )
    # ``comparison`` is the (sentence1, sentence2, similarity) results triple.
    return format_results(*comparison)
|
|
|
# Main UI: a free-text query plus four model selectors, each defaulting to a
# different entry of ``model_names``; the callback's text is shown as Markdown.
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Textbox(lines=2, placeholder="Enter your sentence here..."),
        *[
            gr.Dropdown(
                model_names,
                value=model_names[position],
                label=f"Model {position + 1}",
            )
            for position in range(4)
        ],
    ],
    outputs=gr.Markdown(),
    title="Sentence Transformer Model Comparison (Annoy)",
    description="Enter a sentence and compare the most similar sentences generated by different sentence-transformer models (using Annoy for faster search) from both sentence1 and sentence2.",
)

# Start serving the interface.
iface.launch()
|
|
|
except Exception as e: |
|
# Fallback path: report the failure on the console, then serve a minimal
# input-less interface that tells the user to check the console.
# NOTE(review): the enclosing try also covers model loading and the UI launch,
# so this handler is not limited to dataset-loading failures.
print(f"Error loading dataset: {e}")

iface = gr.Interface(
    title="Dataset Loading Error",
    description="There was an error loading the dataset.",
    fn=lambda: "Dataset loading failed. Check console for details.",
    inputs=[],
    outputs=gr.Textbox(),
)

iface.launch()