import gradio as gr
from sentence_transformers import SentenceTransformer, util
import pandas as pd
from datasets import load_dataset
from annoy import AnnoyIndex
import os
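
# Everything below runs at startup: load the dataset, load the four models,
# build the Annoy indexes, and launch the Gradio UI. It is wrapped in a single
# try/except so that any failure falls back to a minimal error interface.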
try:
    # Load the dataset (Italian subset, test split)
    dataset = load_dataset("PhilipMay/stsb_multi_mt", name="it", split="test")
    df = pd.DataFrame(dataset)

    # Extract sentences (sentence1 and sentence2)
    sentences1 = df["sentence1"].tolist()
    sentences2 = df["sentence2"].tolist()

    # Sentence-transformers models to test
    model_names = [
        "nickprock/multi-sentence-BERTino",
        "nickprock/sentence-bert-base-italian-uncased",
        "nickprock/static-similarity-mmarco3m-mrl-BERTino-v1.5",
        "nickprock/Italian-ModernBERT-base-embed-mmarco-mnrl",
    ]
    models = {name: SentenceTransformer(name) for name in model_names}

    annoy_indexes1 = {}  # Store Annoy indexes for sentence1
    annoy_indexes2 = {}  # Store Annoy indexes for sentence2
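
    # One Annoy index is built per (model, sentence column) pair, so at query
    # time only the input sentence needs to be encoded before the lookup.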
    def build_annoy_index(model_name, sentences):
        """Builds an Annoy index for a given model and sentences."""
        model = models[model_name]
        embeddings = model.encode(sentences)
        embedding_dim = embeddings.shape[1]
        annoy_index = AnnoyIndex(embedding_dim, "angular")  # Use angular distance for cosine similarity
        for i, embedding in enumerate(embeddings):
            annoy_index.add_item(i, embedding)
        annoy_index.build(10)  # Build with 10 trees
        return annoy_index
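
    # Note: Annoy's "angular" metric is the Euclidean distance between
    # L2-normalized vectors, i.e. sqrt(2 * (1 - cosine similarity)), so the
    # nearest neighbours it returns are also the most cosine-similar items.
    # More trees give higher recall at the cost of a larger index; 10 is a
    # modest default for a dataset of this size.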
    # Build Annoy indexes for each model
    for model_name in model_names:
        annoy_indexes1[model_name] = build_annoy_index(model_name, sentences1)
        annoy_indexes2[model_name] = build_annoy_index(model_name, sentences2)
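
    # Index construction happens once at startup; its cost grows with the
    # number of models and sentences, but queries afterwards are fast.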
    def find_similar_sentence_annoy(sentence, model_name, sentence_list, annoy_indexes):
        """Finds the most similar sentence in sentence_list using the model's Annoy index."""
        model = models[model_name]
        sentence_embedding = model.encode(sentence)
        nearest_neighbors = annoy_indexes[model_name].get_nns_by_vector(sentence_embedding, 1)
        best_sentence_index = nearest_neighbors[0]
        return sentence_list[best_sentence_index]
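
    # get_nns_by_vector returns Annoy item ids, which were added in list order,
    # so they can be used directly as positions into sentence_list.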
    def calculate_cosine_similarity(sentence1, sentence2, model):
        """Calculates the cosine similarity between two sentences."""
        embedding1 = model.encode(sentence1)
        embedding2 = model.encode(sentence2)
        return util.cos_sim(embedding1, embedding2).item()
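
    # util.cos_sim returns a 1x1 tensor for a single pair of embeddings;
    # .item() extracts it as a plain Python float.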
    def compare_models_annoy(sentence, model1_name, model2_name, model3_name, model4_name):
        """Compares the results of the selected models using Annoy."""
        sentence1_results = {}
        sentence2_results = {}
        similarities = {}
        # Deduplicate while preserving order, in case the same model is selected twice
        selected_models = list(dict.fromkeys([model1_name, model2_name, model3_name, model4_name]))
        for model_name in selected_models:
            sentence1_results[model_name] = find_similar_sentence_annoy(
                sentence, model_name, sentences1, annoy_indexes1
            )
            sentence2_results[model_name] = find_similar_sentence_annoy(
                sentence, model_name, sentences2, annoy_indexes2
            )
            # Cosine similarity between the two retrieved sentences
            similarities[model_name] = calculate_cosine_similarity(
                sentence1_results[model_name], sentence2_results[model_name], models[model_name]
            )
        return sentence1_results, sentence2_results, similarities
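
    # Each returned dict is keyed by the selected models' names, so the
    # formatting step below can iterate over exactly those models.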
    def format_results(sentence1_results, sentence2_results, similarities):
        """Formats the results as Markdown for display in Gradio."""
        output_text = ""
        # Iterate over the models that were actually selected (the dict keys),
        # and use blank lines so Markdown renders real line breaks.
        for model_name in sentence1_results:
            output_text += f"**{model_name}**\n\n"
            output_text += f"Most similar sentence from sentence1: {sentence1_results[model_name]}\n\n"
            output_text += f"Most similar sentence from sentence2: {sentence2_results[model_name]}\n\n"
            output_text += f"Cosine similarity: {similarities[model_name]:.4f}\n\n"
        return output_text
    def gradio_interface(sentence, model1_name, model2_name, model3_name, model4_name):
        """Gradio interface function."""
        sentence1_results, sentence2_results, similarities = compare_models_annoy(
            sentence, model1_name, model2_name, model3_name, model4_name
        )
        return format_results(sentence1_results, sentence2_results, similarities)
    iface = gr.Interface(
        fn=gradio_interface,
        inputs=[
            gr.Textbox(lines=2, placeholder="Enter your sentence here..."),
            gr.Dropdown(model_names, value=model_names[0], label="Model 1"),
            gr.Dropdown(model_names, value=model_names[1], label="Model 2"),
            gr.Dropdown(model_names, value=model_names[2], label="Model 3"),
            gr.Dropdown(model_names, value=model_names[3], label="Model 4"),
        ],
        outputs=gr.Markdown(),
        title="Sentence Transformer Model Comparison (Annoy)",
        description=(
            "Enter a sentence and compare the most similar sentences found by different "
            "sentence-transformer models (using Annoy for faster search), drawn from both "
            "sentence1 and sentence2. The cosine similarity between the retrieved sentences "
            "is also computed. Uses Italian sentence-transformers and the test split of the "
            "stsb_multi_mt dataset."
        ),
    )
    iface.launch()
except Exception as e:
    print(f"Error during setup (dataset, models, or indexes): {e}")
    iface = gr.Interface(
        fn=lambda: "Setup failed. Check the console for details.",
        inputs=[],
        outputs=gr.Textbox(),
        title="Setup Error",
        description="There was an error loading the dataset, models, or Annoy indexes.",
    )
    iface.launch()