File size: 6,508 Bytes
cdf4f6a
6f0eb8d
cdf4f6a
 
 
950925d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8a6c00d
23ef606
950925d
 
 
 
 
 
6f0eb8d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
950925d
 
 
 
 
 
 
 
6f0eb8d
 
 
 
 
23ef606
950925d
 
 
 
6f0eb8d
950925d
6f0eb8d
 
 
 
 
 
 
 
 
 
 
 
950925d
6f0eb8d
 
 
 
 
 
 
 
 
 
 
 
950925d
6f0eb8d
23ef606
6f0eb8d
23ef606
 
 
6f0eb8d
950925d
6f0eb8d
950925d
 
 
 
6f0eb8d
 
 
 
 
 
 
950925d
 
 
 
6f0eb8d
23ef606
 
6f0eb8d
950925d
 
 
 
 
 
 
 
 
 
 
 
6f0eb8d
 
 
 
 
 
950925d
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import gradio as gr
from sentence_transformers import SentenceTransformer, util
import pandas as pd
from datasets import load_dataset
from annoy import AnnoyIndex
import os

try:
    # Everything below (data download, model download, index build, UI launch)
    # runs inside this try; the except at the bottom is the startup boundary.
    # Load the dataset (Italian subset, test split) — network I/O on first run.
    dataset = load_dataset("PhilipMay/stsb_multi_mt", name="it", split="test")
    df = pd.DataFrame(dataset)

    # Extract sentences (sentence1 and sentence2) as plain Python lists;
    # these are the two corpora the Annoy indexes are built over.
    sentences1 = df["sentence1"].tolist()
    sentences2 = df["sentence2"].tolist()

    # Sentence-transformers models to test
    model_names = [
        "nickprock/multi-sentence-BERTino",
        "nickprock/sentence-bert-base-italian-uncased",
        "nickprock/static-similarity-mmarco3m-mrl-BERTino-v1.5",
        "nickprock/Italian-ModernBERT-base-embed-mmarco-mnrl",
    ]

    # Eagerly download/load all four models (slow; network I/O on first run).
    models = {name: SentenceTransformer(name) for name in model_names}
    annoy_indexes1 = {}  # Store Annoy indexes for sentence1
    annoy_indexes2 = {}  # Store Annoy indexes for sentence2

    def build_annoy_index(model_name, sentences):
        """Encode *sentences* with the named model and build an Annoy index.

        Args:
            model_name: Key into the module-level ``models`` dict.
            sentences: List of strings to index.

        Returns:
            An ``AnnoyIndex`` (angular metric, 10 trees) whose item ids are
            the positions of the sentences in *sentences*.
        """
        encoder = models[model_name]
        vectors = encoder.encode(sentences)
        # Angular distance in Annoy is a proxy for cosine similarity.
        index = AnnoyIndex(vectors.shape[1], "angular")
        for item_id, vector in enumerate(vectors):
            index.add_item(item_id, vector)
        index.build(10)  # more trees = better recall, slower build
        return index

    # Build Annoy indexes for each model
    # (one index per model per corpus: 4 models x 2 corpora = 8 indexes;
    # this encodes every sentence with every model, so startup is slow).
    for model_name in model_names:
        annoy_indexes1[model_name] = build_annoy_index(model_name, sentences1)
        annoy_indexes2[model_name] = build_annoy_index(model_name, sentences2)

    def find_similar_sentence_annoy(sentence, model_name, sentence_list, annoy_index):
        """Return the sentence from *sentence_list* most similar to *sentence*.

        Args:
            sentence: Query string to embed.
            model_name: Key into the module-level ``models`` dict.
            sentence_list: Corpus the index was built from; the result is
                looked up here by Annoy item id.
            annoy_index: Either a single ``AnnoyIndex`` or — as the existing
                callers pass — a dict mapping model name -> ``AnnoyIndex``.

        Returns:
            The top-1 nearest-neighbour sentence under angular distance.
        """
        model = models[model_name]
        sentence_embedding = model.encode(sentence)
        # The original only worked when handed the whole per-model dict despite
        # the parameter's singular name; accept both forms, backward-compatible.
        index = annoy_index[model_name] if isinstance(annoy_index, dict) else annoy_index
        nearest_neighbors = index.get_nns_by_vector(sentence_embedding, 1)
        best_sentence_index = nearest_neighbors[0]
        return sentence_list[best_sentence_index]

    def calculate_cosine_similarity(sentence1, sentence2, model):
        """Return the cosine similarity between two sentences' embeddings.

        Args:
            sentence1: First sentence (string).
            sentence2: Second sentence (string).
            model: SentenceTransformer used to encode both sentences.

        Returns:
            The similarity as a plain Python float.
        """
        vec_a = model.encode(sentence1)
        vec_b = model.encode(sentence2)
        similarity = util.cos_sim(vec_a, vec_b)
        return similarity.item()

    def compare_models_annoy(sentence, model1_name, model2_name, model3_name, model4_name):
        """Compare the nearest-neighbour retrievals of the selected models.

        For each selected model, find the most similar sentence in both
        corpora and the cosine similarity between the two retrieved sentences.

        Args:
            sentence: Query string.
            model1_name..model4_name: Model names chosen in the UI (may repeat).

        Returns:
            Tuple of three dicts keyed by model name:
            (best match from sentences1, best match from sentences2,
             cosine similarity between those two matches).
        """
        selected = (model1_name, model2_name, model3_name, model4_name)
        sentence1_results = {}
        sentence2_results = {}
        similarities = {}

        # Collapse the original's eight copy-pasted calls into one loop.
        # Duplicate selections collapse onto a single dict key, exactly as the
        # original per-name assignments did.
        for name in selected:
            sentence1_results[name] = find_similar_sentence_annoy(
                sentence, name, sentences1, annoy_indexes1
            )
            sentence2_results[name] = find_similar_sentence_annoy(
                sentence, name, sentences2, annoy_indexes2
            )

        # BUG FIX: the original iterated the global ``model_names`` here, which
        # raised KeyError whenever the four dropdowns did not cover every
        # model (e.g. the same model chosen twice). Iterate only the models
        # actually selected.
        for name in sentence1_results:
            similarities[name] = calculate_cosine_similarity(
                sentence1_results[name], sentence2_results[name], models[name]
            )

        return sentence1_results, sentence2_results, similarities

    def format_results(sentence1_results, sentence2_results, similarities):
        """Render the comparison dicts as a Markdown string for Gradio.

        Args:
            sentence1_results: Model name -> best match from sentences1.
            sentence2_results: Model name -> best match from sentences2.
            similarities: Model name -> cosine similarity of the two matches.

        Returns:
            Markdown text with one section per model present in the results.
        """
        output_text = ""
        # BUG FIX: iterate the models present in the results rather than the
        # global ``model_names``; the original raised KeyError when the
        # dropdown selection did not include every model.
        for model_name in sentence1_results:
            output_text += f"**{model_name}**\n"
            output_text += (
                f"Most Similar Sentence from sentence1: {sentence1_results[model_name]}\n"
            )
            output_text += (
                f"Most Similar Sentence from sentence2: {sentence2_results[model_name]}\n"
            )
            output_text += f"Cosine Similarity: {similarities[model_name]:.4f}\n\n"
        return output_text

    def gradio_interface(sentence, model1_name, model2_name, model3_name, model4_name):
        """Glue between the Gradio widgets and the comparison logic.

        Runs the model comparison for the query sentence and returns the
        Markdown report shown in the output pane.
        """
        results = compare_models_annoy(
            sentence, model1_name, model2_name, model3_name, model4_name
        )
        return format_results(*results)

    # Wire the comparison function into a simple Gradio form: one free-text
    # query plus four model dropdowns, preselected to the four models above.
    # Output is Markdown produced by format_results.
    iface = gr.Interface(
        fn=gradio_interface,
        inputs=[
            gr.Textbox(lines=2, placeholder="Enter your sentence here..."),
            gr.Dropdown(model_names, value=model_names[0], label="Model 1"),
            gr.Dropdown(model_names, value=model_names[1], label="Model 2"),
            gr.Dropdown(model_names, value=model_names[2], label="Model 3"),
            gr.Dropdown(model_names, value=model_names[3], label="Model 4"),
        ],
        outputs=gr.Markdown(),
        title="Sentence Transformer Model Comparison (Annoy)",
        # Description is intentionally in Italian (the app targets Italian users).
        description=(
            "Inserisce una frase e confronta le frasi più simili generate da diversi modelli "
            "sentence-transformer (utilizzando Annoy per una ricerca più veloce) sia dalla frase1 "
            "che dalla frase2. Calcola anche la similarità del coseno tra le frasi. "
            "Utilizza sentence-transformers per l'italiano e lo split test del dataset stsb_multi_mt."
        ),
    )

    # Blocks until the server is stopped.
    iface.launch()

except Exception as e:
    print(f"Error loading dataset: {e}")
    iface = gr.Interface(
        fn=lambda: "Dataset loading failed. Check console for details.",
        inputs=[],
        outputs=gr.Textbox(),
        title="Dataset Loading Error",
        description="There was an error loading the dataset.",
    )
    iface.launch()