nickprock committed on
Commit
6f0eb8d
·
verified ·
1 Parent(s): 23ef606

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +66 -26
app.py CHANGED
@@ -1,5 +1,5 @@
1
  import gradio as gr
2
- from sentence_transformers import SentenceTransformer
3
  import pandas as pd
4
  from datasets import load_dataset
5
  from annoy import AnnoyIndex
@@ -26,6 +26,22 @@ try:
26
  annoy_indexes1 = {} # Store Annoy indexes for sentence1
27
  annoy_indexes2 = {} # Store Annoy indexes for sentence2
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  def find_similar_sentence_annoy(sentence, model_name, sentence_list, annoy_index):
30
  """Finds the most similar sentence using Annoy."""
31
  model = models[model_name]
@@ -34,53 +50,72 @@ try:
34
  best_sentence_index = nearest_neighbors[0]
35
  return sentence_list[best_sentence_index]
36
 
37
- def calculate_similarity(sentence1, sentence2, model):
38
- """Calculates the cosine similarity between two sentences using a given model."""
39
- embedding1 = model.encode(sentence1, convert_to_tensor=True)
40
- embedding2 = model.encode(sentence2, convert_to_tensor=True)
41
- similarity = util.cos_sim(embedding1, embedding2).item()
42
- return similarity
43
 
44
  def compare_models_annoy(sentence, model1_name, model2_name, model3_name, model4_name):
45
  """Compares the results of different models using Annoy."""
46
  sentence1_results = {}
47
  sentence2_results = {}
48
- similarity_results = {}
49
 
50
- sentence1_results[model1_name] = find_similar_sentence_annoy(sentence, model1_name, sentences1, annoy_indexes1)
51
- sentence1_results[model2_name] = find_similar_sentence_annoy(sentence, model2_name, sentences1, annoy_indexes1)
52
- sentence1_results[model3_name] = find_similar_sentence_annoy(sentence, model3_name, sentences1, annoy_indexes1)
53
- sentence1_results[model4_name] = find_similar_sentence_annoy(sentence, model4_name, sentences1, annoy_indexes1)
 
 
 
 
 
 
 
 
54
 
55
- sentence2_results[model1_name] = find_similar_sentence_annoy(sentence, model1_name, sentences2, annoy_indexes2)
56
- sentence2_results[model2_name] = find_similar_sentence_annoy(sentence, model2_name, sentences2, annoy_indexes2)
57
- sentence2_results[model3_name] = find_similar_sentence_annoy(sentence, model3_name, sentences2, annoy_indexes2)
58
- sentence2_results[model4_name] = find_similar_sentence_annoy(sentence, model4_name, sentences2, annoy_indexes2)
 
 
 
 
 
 
 
 
59
 
60
- # Calculate similarity between the retrieved sentences
61
  for model_name in model_names:
62
- similarity_results[model_name] = calculate_similarity(
63
  sentence1_results[model_name], sentence2_results[model_name], models[model_name]
64
  )
65
 
66
- return sentence1_results, sentence2_results, similarity_results
67
 
68
- def format_results(sentence1_results, sentence2_results, similarity_results):
69
  """Formats the results for display in Gradio."""
70
  output_text = ""
71
  for model_name in model_names:
72
  output_text += f"**{model_name}**\n"
73
- output_text += f"Most Similar Sentence from sentence1: {sentence1_results[model_name]}\n"
74
- output_text += f"Most Similar Sentence from sentence2: {sentence2_results[model_name]}\n"
75
- output_text += f"Similarity between retrieved sentences: {similarity_results[model_name]:.4f}\n\n"
 
 
 
 
76
  return output_text
77
 
78
  def gradio_interface(sentence, model1_name, model2_name, model3_name, model4_name):
79
  """Gradio interface function."""
80
- sentence1_results, sentence2_results, similarity_results = compare_models_annoy(
81
  sentence, model1_name, model2_name, model3_name, model4_name
82
  )
83
- return format_results(sentence1_results, sentence2_results, similarity_results)
84
 
85
  iface = gr.Interface(
86
  fn=gradio_interface,
@@ -93,7 +128,12 @@ try:
93
  ],
94
  outputs=gr.Markdown(),
95
  title="Sentence Transformer Model Comparison (Annoy)",
96
- description="Enter a sentence and compare the most similar sentences generated by different sentence-transformer models (using Annoy for faster search) from both sentence1 and sentence2.",
 
 
 
 
 
97
  )
98
 
99
  iface.launch()
 
1
  import gradio as gr
2
+ from sentence_transformers import SentenceTransformer, util
3
  import pandas as pd
4
  from datasets import load_dataset
5
  from annoy import AnnoyIndex
 
26
  annoy_indexes1 = {} # Store Annoy indexes for sentence1
27
  annoy_indexes2 = {} # Store Annoy indexes for sentence2
28
 
29
def build_annoy_index(model_name, sentences, n_trees=10, metric="angular"):
    """Build an Annoy index over the embeddings of ``sentences``.

    Args:
        model_name: Key into the module-level ``models`` mapping selecting
            the encoder to use.
        sentences: Sentences to embed; item ``i`` of the index corresponds
            to ``sentences[i]``.
        n_trees: Number of trees passed to ``AnnoyIndex.build``. Defaults
            to 10, the value previously hard-coded here.
        metric: Annoy distance metric. Defaults to ``"angular"``, which is
            used here as the stand-in for cosine similarity.

    Returns:
        The built ``AnnoyIndex``.
    """
    model = models[model_name]
    # assumes encode() returns a 2-D array (n_sentences, dim) -- the
    # embedding size is read from its second axis.
    embeddings = model.encode(sentences)
    embedding_dim = embeddings.shape[1]
    annoy_index = AnnoyIndex(embedding_dim, metric)
    for item_id, embedding in enumerate(embeddings):
        # Item ids mirror positions in `sentences` so a neighbour id can be
        # used directly as a list index by callers.
        annoy_index.add_item(item_id, embedding)
    annoy_index.build(n_trees)
    return annoy_index
39
+
40
# Pre-compute one Annoy index per model for each of the two sentence lists.
for model_name in model_names:
    for index_store, corpus in (
        (annoy_indexes1, sentences1),
        (annoy_indexes2, sentences2),
    ):
        index_store[model_name] = build_annoy_index(model_name, corpus)
44
+
45
  def find_similar_sentence_annoy(sentence, model_name, sentence_list, annoy_index):
46
  """Finds the most similar sentence using Annoy."""
47
  model = models[model_name]
 
50
  best_sentence_index = nearest_neighbors[0]
51
  return sentence_list[best_sentence_index]
52
 
53
def calculate_cosine_similarity(sentence1, sentence2, model):
    """Return the cosine similarity of two sentences as a plain number.

    Each sentence is encoded separately with ``model.encode`` and the two
    embeddings are compared with ``util.cos_sim``; ``.item()`` unwraps the
    single-element result.
    """
    first_vec, second_vec = (
        model.encode(text) for text in (sentence1, sentence2)
    )
    score = util.cos_sim(first_vec, second_vec)
    return score.item()
 
58
 
59
  def compare_models_annoy(sentence, model1_name, model2_name, model3_name, model4_name):
60
  """Compares the results of different models using Annoy."""
61
  sentence1_results = {}
62
  sentence2_results = {}
63
+ similarities = {}
64
 
65
+ sentence1_results[model1_name] = find_similar_sentence_annoy(
66
+ sentence, model1_name, sentences1, annoy_indexes1
67
+ )
68
+ sentence1_results[model2_name] = find_similar_sentence_annoy(
69
+ sentence, model2_name, sentences1, annoy_indexes1
70
+ )
71
+ sentence1_results[model3_name] = find_similar_sentence_annoy(
72
+ sentence, model3_name, sentences1, annoy_indexes1
73
+ )
74
+ sentence1_results[model4_name] = find_similar_sentence_annoy(
75
+ sentence, model4_name, sentences1, annoy_indexes1
76
+ )
77
 
78
+ sentence2_results[model1_name] = find_similar_sentence_annoy(
79
+ sentence, model1_name, sentences2, annoy_indexes2
80
+ )
81
+ sentence2_results[model2_name] = find_similar_sentence_annoy(
82
+ sentence, model2_name, sentences2, annoy_indexes2
83
+ )
84
+ sentence2_results[model3_name] = find_similar_sentence_annoy(
85
+ sentence, model3_name, sentences2, annoy_indexes2
86
+ )
87
+ sentence2_results[model4_name] = find_similar_sentence_annoy(
88
+ sentence, model4_name, sentences2, annoy_indexes2
89
+ )
90
 
91
+ # Calculate cosine similarities
92
  for model_name in model_names:
93
+ similarities[model_name] = calculate_cosine_similarity(
94
  sentence1_results[model_name], sentence2_results[model_name], models[model_name]
95
  )
96
 
97
+ return sentence1_results, sentence2_results, similarities
98
 
99
def format_results(sentence1_results, sentence2_results, similarities):
    """Format per-model retrieval results as text for the Gradio output.

    Args:
        sentence1_results: Mapping of model name to the sentence retrieved
            from the first sentence list.
        sentence2_results: Mapping of model name to the sentence retrieved
            from the second sentence list.
        similarities: Mapping of model name to the cosine similarity of the
            two retrieved sentences.

    Returns:
        One section per model present in ``sentence1_results`` (in that
        dict's order), or an empty string when there are no results.

    Raises:
        KeyError: If a model in ``sentence1_results`` is missing from
            ``sentence2_results`` or ``similarities``.
    """
    sections = []
    # Iterate the results that were passed in rather than a global model
    # list, so a partial selection is rendered instead of raising KeyError.
    for model_name, retrieved1 in sentence1_results.items():
        sections.append(
            f"**{model_name}**\n"
            f"Most Similar Sentence from sentence1: {retrieved1}\n"
            f"Most Similar Sentence from sentence2: {sentence2_results[model_name]}\n"
            f"Cosine Similarity: {similarities[model_name]:.4f}\n\n"
        )
    return "".join(sections)
112
 
113
  def gradio_interface(sentence, model1_name, model2_name, model3_name, model4_name):
114
  """Gradio interface function."""
115
+ sentence1_results, sentence2_results, similarities = compare_models_annoy(
116
  sentence, model1_name, model2_name, model3_name, model4_name
117
  )
118
+ return format_results(sentence1_results, sentence2_results, similarities)
119
 
120
  iface = gr.Interface(
121
  fn=gradio_interface,
 
128
  ],
129
  outputs=gr.Markdown(),
130
  title="Sentence Transformer Model Comparison (Annoy)",
131
+ description=(
132
+ "Inserisce una frase e confronta le frasi più simili generate da diversi modelli "
133
+ "sentence-transformer (utilizzando Annoy per una ricerca più veloce) sia dalla frase1 "
134
+ "che dalla frase2. Calcola anche la similarità del coseno tra le frasi. "
135
+ "Utilizza sentence-transformers per l'italiano e lo split test del dataset stsb_multi_mt."
136
+ ),
137
  )
138
 
139
  iface.launch()