nickprock committed on
Commit
950925d
·
verified ·
1 Parent(s): 2d20319

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +104 -73
app.py CHANGED
@@ -3,76 +3,107 @@ from sentence_transformers import SentenceTransformer
3
  import pandas as pd
4
  from datasets import load_dataset
5
  from annoy import AnnoyIndex
6
- import numpy as np
7
-
8
- # Load the dataset
9
- dataset = load_dataset("DeepMount00/CulturaViva-ITA")
10
- df = pd.DataFrame(dataset["train"])
11
-
12
- # Extract questions and answers
13
- questions = df["question"].tolist()
14
- answers = df["answer"].tolist()
15
-
16
- # Sentence-transformers models to test
17
- model_names = [
18
- "nickprock/multi-sentence-BERTino",
19
- "nickprock/sentence-bert-base-italian-uncased",
20
- "nickprock/sentence-bert-base-italian-xxl-uncased",
21
- "nickprock/mmarco-bert-base-italian-uncased",
22
- ]
23
-
24
- models = {name: SentenceTransformer(name) for name in model_names}
25
- annoy_indexes = {} # Store Annoy indexes for each model
26
-
27
- def build_annoy_index(model_name):
28
- """Builds an Annoy index for a given model."""
29
- model = models[model_name]
30
- embeddings = model.encode(answers)
31
- embedding_dim = embeddings.shape[1]
32
- annoy_index = AnnoyIndex(embedding_dim, "angular") # Use angular distance for cosine similarity
33
- for i, embedding in enumerate(embeddings):
34
- annoy_index.add_item(i, embedding)
35
- annoy_index.build(10) # Build with 10 trees
36
- return annoy_index
37
-
38
- # Build Annoy indexes for each model
39
- for model_name in model_names:
40
- annoy_indexes[model_name] = build_annoy_index(model_name)
41
-
42
- def find_similar_answer_annoy(question, model_name):
43
- """Finds the most similar answer using Annoy."""
44
- model = models[model_name]
45
- annoy_index = annoy_indexes[model_name]
46
- question_embedding = model.encode(question)
47
- nearest_neighbors = annoy_index.get_nns_by_vector(question_embedding, 1) # Get the nearest neighbor
48
- best_answer_index = nearest_neighbors[0]
49
- return answers[best_answer_index]
50
-
51
- def compare_models_annoy(question, model1_name, model2_name, model3_name, model4_name):
52
- """Compares the results of different models using Annoy."""
53
- answer1 = find_similar_answer_annoy(question, model1_name)
54
- answer2 = find_similar_answer_annoy(question, model2_name)
55
- answer3 = find_similar_answer_annoy(question, model3_name)
56
- answer4 = find_similar_answer_annoy(question, model4_name)
57
- return answer1, answer2, answer3, answer4
58
-
59
- iface = gr.Interface(
60
- fn=compare_models_annoy,
61
- inputs=[
62
- gr.Textbox(lines=2, placeholder="Enter your question here..."),
63
- gr.Dropdown(model_names, value=model_names[0], label="Model 1"),
64
- gr.Dropdown(model_names, value=model_names[1], label="Model 2"),
65
- gr.Dropdown(model_names, value=model_names[2], label="Model 3"),
66
- gr.Dropdown(model_names, value=model_names[3], label="Model 4"),
67
- ],
68
- outputs=[
69
- gr.Textbox(label=model_names[0]),
70
- gr.Textbox(label=model_names[1]),
71
- gr.Textbox(label=model_names[2]),
72
- gr.Textbox(label=model_names[3]),
73
- ],
74
- title="Sentence Transformer Model Comparison (Annoy)",
75
- description="Enter a question and compare the answers generated by different sentence-transformer models (using Annoy for faster search).",
76
- )
77
-
78
- iface.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  import pandas as pd
4
  from datasets import load_dataset
5
  from annoy import AnnoyIndex
6
+ import os
7
+
8
+ try:
9
# Italian configuration of the STS benchmark dataset, test split only.
dataset = load_dataset("PhilipMay/stsb_multi_mt", name="it", split="test")
df = pd.DataFrame(dataset)

# Pull both sentence columns out as plain Python lists; list positions are
# later used as item ids in the Annoy indexes.
sentences1, sentences2 = (
    df[column].tolist() for column in ("sentence1", "sentence2")
)

# Sentence-transformers checkpoints compared by the app.
model_names = [
    "nickprock/multi-sentence-BERTino",
    "nickprock/sentence-bert-base-italian-uncased",
    "nickprock/sentence-bert-base-italian-xxl-uncased",
    "nickprock/mmarco-bert-base-italian-uncased",
]

# Every model is instantiated once, up front, and looked up by name afterwards.
models = dict(zip(model_names, map(SentenceTransformer, model_names)))

# Per-model Annoy indexes, one mapping per sentence column.
annoy_indexes1 = {}  # model name -> index over sentences1
annoy_indexes2 = {}  # model name -> index over sentences2
28
+
29
def build_annoy_index(model_name, sentences, n_trees=10, metric="angular"):
    """Build an Annoy index over the embeddings of ``sentences``.

    Args:
        model_name: Key into the module-level ``models`` dict selecting the
            sentence-transformer used to encode the sentences.
        sentences: Sentences to encode. Item ``i`` of the index is the
            embedding of ``sentences[i]``.
        n_trees: Number of trees handed to ``AnnoyIndex.build``. Defaults to
            10, the value that used to be hard-coded.
        metric: Distance metric handed to ``AnnoyIndex``. Defaults to
            ``"angular"``, the value that used to be hard-coded.

    Returns:
        The built ``AnnoyIndex``.

    Raises:
        KeyError: If ``model_name`` is not a key of ``models``.
    """
    model = models[model_name]
    embeddings = model.encode(sentences)
    # The second axis of the embedding matrix is the vector size the index
    # has to be created with.
    embedding_dim = embeddings.shape[1]
    annoy_index = AnnoyIndex(embedding_dim, metric)
    for item_id, embedding in enumerate(embeddings):
        annoy_index.add_item(item_id, embedding)
    annoy_index.build(n_trees)
    return annoy_index
39
+
40
# Populate both index mappings: for every model, one index over the
# sentence1 column and one over the sentence2 column.
for model_name in model_names:
    annoy_indexes1[model_name] = build_annoy_index(model_name, sentences1)
    annoy_indexes2[model_name] = build_annoy_index(model_name, sentences2)
44
+
45
def find_similar_sentence_annoy(sentence, model_name, sentence_list, annoy_index):
    """Return the entry of ``sentence_list`` nearest to ``sentence``.

    Args:
        sentence: Query text, encoded with the model named ``model_name``.
        model_name: Key used both for the module-level ``models`` dict and
            for ``annoy_index``.
        sentence_list: Sentences the index was built from; the neighbour id
            returned by Annoy is used as a position in this list.
        annoy_index: Despite the singular name, this is subscripted with
            ``model_name``, so it must be a mapping of model name to index.

    Returns:
        The sentence at the position of the single nearest neighbour.
    """
    query_vector = models[model_name].encode(sentence)
    index_for_model = annoy_index[model_name]
    # Ask for exactly one neighbour and take it.
    closest_ids = index_for_model.get_nns_by_vector(query_vector, 1)
    return sentence_list[closest_ids[0]]
52
+
53
def compare_models_annoy(sentence, model1_name, model2_name, model3_name, model4_name):
    """Look up the nearest sentence for each selected model in both columns.

    Args:
        sentence: Query text.
        model1_name, model2_name, model3_name, model4_name: Names of the
            models to query, in display order.

    Returns:
        A pair ``(sentence1_results, sentence2_results)``. Each is a dict
        mapping a selected model name to its best match from ``sentences1``
        or ``sentences2`` respectively. If a name is passed more than once it
        appears only once in each dict.
    """
    selected_models = (model1_name, model2_name, model3_name, model4_name)

    # All sentence1 lookups run first, then all sentence2 lookups.
    sentence1_results = {
        name: find_similar_sentence_annoy(sentence, name, sentences1, annoy_indexes1)
        for name in selected_models
    }
    sentence2_results = {
        name: find_similar_sentence_annoy(sentence, name, sentences2, annoy_indexes2)
        for name in selected_models
    }
    return sentence1_results, sentence2_results
69
+
70
def format_results(sentence1_results, sentence2_results):
    """Render the per-model matches as one text block for the Gradio output.

    Args:
        sentence1_results: Mapping of model name to its best match taken
            from the sentence1 column.
        sentence2_results: Mapping with the same keys holding the best match
            taken from the sentence2 column.

    Returns:
        A string with one section per model, in the iteration order of
        ``sentence1_results``. Empty mappings give an empty string.

    Raises:
        KeyError: If a model in ``sentence1_results`` is missing from
            ``sentence2_results``.
    """
    # Iterate the models that actually produced results instead of the global
    # model list. When the same model is selected in several dropdowns the
    # result dicts hold fewer keys than that list, and indexing them with
    # every known model raised KeyError.
    sections = [
        f"**{model_name}**\n"
        f"Most Similar Sentence from sentence1: {best_from_sentence1}\n"
        f"Most Similar Sentence from sentence2: {sentence2_results[model_name]}\n\n"
        for model_name, best_from_sentence1 in sentence1_results.items()
    ]
    return "".join(sections)
78
+
79
def gradio_interface(sentence, model1_name, model2_name, model3_name, model4_name):
    """Callback wired into the Gradio interface.

    Runs the comparison for the four selected models and returns the
    formatted text produced by ``format_results``.
    """
    per_column_results = compare_models_annoy(
        sentence, model1_name, model2_name, model3_name, model4_name
    )
    return format_results(*per_column_results)
83
+
84
# Main UI: one free-text box plus four model dropdowns, each pre-selected to
# a different entry of model_names; the result is rendered as Markdown.
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Textbox(lines=2, placeholder="Enter your sentence here..."),
        *(
            gr.Dropdown(model_names, value=default_model, label=f"Model {slot}")
            for slot, default_model in enumerate(model_names[:4], start=1)
        ),
    ],
    outputs=gr.Markdown(),
    title="Sentence Transformer Model Comparison (Annoy)",
    description="Enter a sentence and compare the most similar sentences generated by different sentence-transformer models (using Annoy for faster search) from both sentence1 and sentence2.",
)

iface.launch()
99
+
100
+ except Exception as e:
101
# NOTE(review): the enclosing try also covers model loading, index building
# and launching the main interface, so the failure reported here is not
# necessarily a dataset problem - confirm whether the wording should change.
print(f"Error loading dataset: {e}")

# Fallback UI: no inputs, just a fixed message pointing at the console.
iface = gr.Interface(
    fn=lambda: "Dataset loading failed. Check console for details.",
    inputs=[],
    outputs=gr.Textbox(),
    title="Dataset Loading Error",
    description="There was an error loading the dataset.",
)
iface.launch()