File size: 5,294 Bytes
cdf4f6a
 
 
 
 
950925d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import gradio as gr
from sentence_transformers import SentenceTransformer
import pandas as pd
from datasets import load_dataset
from annoy import AnnoyIndex
import os

try:
    # Everything below (data download, model download, index build, UI launch)
    # is wrapped in one try so any startup failure falls through to the
    # fallback error interface at the bottom of the file.

    # Load the dataset (Italian subset, test split)
    dataset = load_dataset("PhilipMay/stsb_multi_mt", name="it", split="test")
    df = pd.DataFrame(dataset)

    # Extract sentences (sentence1 and sentence2)
    # STS-B rows are sentence pairs; both columns are searched independently.
    sentences1 = df["sentence1"].tolist()
    sentences2 = df["sentence2"].tolist()

    # Sentence-transformers models to test
    model_names = [
        "nickprock/multi-sentence-BERTino",
        "nickprock/sentence-bert-base-italian-uncased",
        "nickprock/sentence-bert-base-italian-xxl-uncased",
        "nickprock/mmarco-bert-base-italian-uncased",
    ]

    # Eagerly load all four models at startup; keyed by model name.
    models = {name: SentenceTransformer(name) for name in model_names}
    annoy_indexes1 = {}  # Store Annoy indexes for sentence1
    annoy_indexes2 = {}  # Store Annoy indexes for sentence2

    def build_annoy_index(model_name, sentences):
        """Encode *sentences* with the named model and index them in Annoy.

        Args:
            model_name: Key into the module-level ``models`` dict.
            sentences: List of strings to embed and index.

        Returns:
            A built ``AnnoyIndex`` using angular distance (a proxy for
            cosine similarity) with item ids matching list positions.
        """
        vectors = models[model_name].encode(sentences)
        index = AnnoyIndex(vectors.shape[1], "angular")
        for item_id, vector in enumerate(vectors):
            index.add_item(item_id, vector)
        # 10 trees: a modest accuracy/speed trade-off for this dataset size.
        index.build(10)
        return index

    # Build Annoy indexes for each model
    # (one index per model per sentence column; done once at startup).
    for model_name in model_names:
        annoy_indexes1[model_name] = build_annoy_index(model_name, sentences1)
        annoy_indexes2[model_name] = build_annoy_index(model_name, sentences2)

    def find_similar_sentence_annoy(sentence, model_name, sentence_list, annoy_index):
        """Return the sentence from *sentence_list* nearest to *sentence*.

        Args:
            sentence: Query string to embed.
            model_name: Key into the module-level ``models`` dict.
            sentence_list: Candidate sentences, aligned with index item ids.
            annoy_index: NOTE(review): despite the name, this receives a
                *dict* of indexes keyed by model name (callers pass
                ``annoy_indexes1``/``annoy_indexes2``), not a single index.

        Returns:
            The single nearest-neighbour sentence.
        """
        query_vector = models[model_name].encode(sentence)
        nearest = annoy_index[model_name].get_nns_by_vector(query_vector, 1)
        return sentence_list[nearest[0]]

    def compare_models_annoy(sentence, model1_name, model2_name, model3_name, model4_name):
        """Compares the results of different models using Annoy.

        Runs the nearest-neighbour search for the query *sentence* with each
        selected model against both sentence columns of the dataset.

        Args:
            sentence: Query string entered by the user.
            model1_name..model4_name: Model names chosen in the dropdowns
                (duplicates collapse to a single dict entry, as before).

        Returns:
            A pair of dicts mapping model name -> most similar sentence,
            one for the ``sentence1`` column and one for ``sentence2``.
        """
        # The original body repeated the same call eight times; a
        # comprehension over the selected names is equivalent and DRY.
        selected_models = (model1_name, model2_name, model3_name, model4_name)
        sentence1_results = {
            name: find_similar_sentence_annoy(sentence, name, sentences1, annoy_indexes1)
            for name in selected_models
        }
        sentence2_results = {
            name: find_similar_sentence_annoy(sentence, name, sentences2, annoy_indexes2)
            for name in selected_models
        }
        return sentence1_results, sentence2_results

    def format_results(sentence1_results, sentence2_results):
        """Formats the results for display in Gradio.

        Args:
            sentence1_results: Dict of model name -> best match from sentence1.
            sentence2_results: Dict of model name -> best match from sentence2
                (expected to share keys with *sentence1_results*).

        Returns:
            A Markdown string with one section per selected model.
        """
        output_text = ""
        # Bug fix: iterate the models actually present in the results instead
        # of the global ``model_names`` list — selecting the same model in two
        # dropdowns previously caused a KeyError here.
        for model_name in sentence1_results:
            output_text += f"**{model_name}**\n"
            output_text += f"Most Similar Sentence from sentence1: {sentence1_results[model_name]}\n"
            output_text += f"Most Similar Sentence from sentence2: {sentence2_results[model_name]}\n\n"
        return output_text

    def gradio_interface(sentence, model1_name, model2_name, model3_name, model4_name):
        """Gradio entry point: run the comparison and render it as Markdown."""
        results_pair = compare_models_annoy(
            sentence, model1_name, model2_name, model3_name, model4_name
        )
        return format_results(*results_pair)

    # Build the UI: one free-text query box plus four model dropdowns,
    # pre-selecting a different model in each so the default run compares all four.
    iface = gr.Interface(
        fn=gradio_interface,
        inputs=[
            gr.Textbox(lines=2, placeholder="Enter your sentence here..."),
            gr.Dropdown(model_names, value=model_names[0], label="Model 1"),
            gr.Dropdown(model_names, value=model_names[1], label="Model 2"),
            gr.Dropdown(model_names, value=model_names[2], label="Model 3"),
            gr.Dropdown(model_names, value=model_names[3], label="Model 4"),
        ],
        outputs=gr.Markdown(),  # results are formatted as Markdown by format_results
        title="Sentence Transformer Model Comparison (Annoy)",
        description="Enter a sentence and compare the most similar sentences generated by different sentence-transformer models (using Annoy for faster search) from both sentence1 and sentence2.",
    )

    # Blocks until the server is stopped.
    iface.launch()

except Exception as e:
    # NOTE(review): this catches any failure in the whole try block (dataset
    # download, model loading, index build, or launch), not only dataset
    # loading — the printed message understates the possible causes.
    print(f"Error loading dataset: {e}")
    # Launch a minimal placeholder UI so the app still serves something
    # instead of crashing silently.
    iface = gr.Interface(
        fn=lambda: "Dataset loading failed. Check console for details.",
        inputs=[],
        outputs=gr.Textbox(),
        title="Dataset Loading Error",
        description="There was an error loading the dataset.",
    )
    iface.launch()