Spaces:

nickprock
/

try_sentence_transformers_it

Sleeping

App Files Community

nickprock committed on Mar 18

Commit

6f0eb8d

verified ·

1 Parent(s): 23ef606

Update app.py

Browse files

Files changed (1) hide show

app.py +66 -26

app.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import gradio as gr
-from sentence_transformers import SentenceTransformer
 import pandas as pd
 from datasets import load_dataset
 from annoy import AnnoyIndex
@@ -26,6 +26,22 @@ try:
     annoy_indexes1 = {}  # Store Annoy indexes for sentence1
     annoy_indexes2 = {}  # Store Annoy indexes for sentence2
     def find_similar_sentence_annoy(sentence, model_name, sentence_list, annoy_index):
         """Finds the most similar sentence using Annoy."""
         model = models[model_name]
@@ -34,53 +50,72 @@ try:
         best_sentence_index = nearest_neighbors[0]
         return sentence_list[best_sentence_index]
-    def calculate_similarity(sentence1, sentence2, model):
-        """Calculates the cosine similarity between two sentences using a given model."""
-        embedding1 = model.encode(sentence1, convert_to_tensor=True)
-        embedding2 = model.encode(sentence2, convert_to_tensor=True)
-        similarity = util.cos_sim(embedding1, embedding2).item()
-        return similarity
     def compare_models_annoy(sentence, model1_name, model2_name, model3_name, model4_name):
         """Compares the results of different models using Annoy."""
         sentence1_results = {}
         sentence2_results = {}
-        similarity_results = {}
-        sentence1_results[model1_name] = find_similar_sentence_annoy(sentence, model1_name, sentences1, annoy_indexes1)
-        sentence1_results[model2_name] = find_similar_sentence_annoy(sentence, model2_name, sentences1, annoy_indexes1)
-        sentence1_results[model3_name] = find_similar_sentence_annoy(sentence, model3_name, sentences1, annoy_indexes1)
-        sentence1_results[model4_name] = find_similar_sentence_annoy(sentence, model4_name, sentences1, annoy_indexes1)
-        sentence2_results[model1_name] = find_similar_sentence_annoy(sentence, model1_name, sentences2, annoy_indexes2)
-        sentence2_results[model2_name] = find_similar_sentence_annoy(sentence, model2_name, sentences2, annoy_indexes2)
-        sentence2_results[model3_name] = find_similar_sentence_annoy(sentence, model3_name, sentences2, annoy_indexes2)
-        sentence2_results[model4_name] = find_similar_sentence_annoy(sentence, model4_name, sentences2, annoy_indexes2)
-        # Calculate similarity between the retrieved sentences
         for model_name in model_names:
-            similarity_results[model_name] = calculate_similarity(
                 sentence1_results[model_name], sentence2_results[model_name], models[model_name]
             )
-        return sentence1_results, sentence2_results, similarity_results
-    def format_results(sentence1_results, sentence2_results, similarity_results):
         """Formats the results for display in Gradio."""
         output_text = ""
         for model_name in model_names:
             output_text += f"**{model_name}**\n"
-            output_text += f"Most Similar Sentence from sentence1: {sentence1_results[model_name]}\n"
-            output_text += f"Most Similar Sentence from sentence2: {sentence2_results[model_name]}\n"
-            output_text += f"Similarity between retrieved sentences: {similarity_results[model_name]:.4f}\n\n"
         return output_text
     def gradio_interface(sentence, model1_name, model2_name, model3_name, model4_name):
         """Gradio interface function."""
-        sentence1_results, sentence2_results, similarity_results = compare_models_annoy(
             sentence, model1_name, model2_name, model3_name, model4_name
         )
-        return format_results(sentence1_results, sentence2_results, similarity_results)
     iface = gr.Interface(
         fn=gradio_interface,
@@ -93,7 +128,12 @@ try:
         ],
         outputs=gr.Markdown(),
         title="Sentence Transformer Model Comparison (Annoy)",
-        description="Enter a sentence and compare the most similar sentences generated by different sentence-transformer models (using Annoy for faster search) from both sentence1 and sentence2.",
     )
     iface.launch()

 import gradio as gr
+from sentence_transformers import SentenceTransformer, util
 import pandas as pd
 from datasets import load_dataset
 from annoy import AnnoyIndex
     annoy_indexes1 = {}  # Store Annoy indexes for sentence1
     annoy_indexes2 = {}  # Store Annoy indexes for sentence2
+    def build_annoy_index(model_name, sentences):
+        """Builds an Annoy index for a given model and sentences."""
+        model = models[model_name]
+        embeddings = model.encode(sentences)
+        embedding_dim = embeddings.shape[1]
+        annoy_index = AnnoyIndex(embedding_dim, "angular")  # Use angular distance for cosine similarity
+        for i, embedding in enumerate(embeddings):
+            annoy_index.add_item(i, embedding)
+        annoy_index.build(10)  # Build with 10 trees
+        return annoy_index
+    # Build Annoy indexes for each model
+    for model_name in model_names:
+        annoy_indexes1[model_name] = build_annoy_index(model_name, sentences1)
+        annoy_indexes2[model_name] = build_annoy_index(model_name, sentences2)
     def find_similar_sentence_annoy(sentence, model_name, sentence_list, annoy_index):
         """Finds the most similar sentence using Annoy."""
         model = models[model_name]
         best_sentence_index = nearest_neighbors[0]
         return sentence_list[best_sentence_index]
+    def calculate_cosine_similarity(sentence1, sentence2, model):
+        """Calculates the cosine similarity between two sentences."""
+        embedding1 = model.encode(sentence1)
+        embedding2 = model.encode(sentence2)
+        return util.cos_sim(embedding1, embedding2).item()
     def compare_models_annoy(sentence, model1_name, model2_name, model3_name, model4_name):
         """Compares the results of different models using Annoy."""
         sentence1_results = {}
         sentence2_results = {}
+        similarities = {}
+        sentence1_results[model1_name] = find_similar_sentence_annoy(
+            sentence, model1_name, sentences1, annoy_indexes1
+        )
+        sentence1_results[model2_name] = find_similar_sentence_annoy(
+            sentence, model2_name, sentences1, annoy_indexes1
+        )
+        sentence1_results[model3_name] = find_similar_sentence_annoy(
+            sentence, model3_name, sentences1, annoy_indexes1
+        )
+        sentence1_results[model4_name] = find_similar_sentence_annoy(
+            sentence, model4_name, sentences1, annoy_indexes1
+        )
+        sentence2_results[model1_name] = find_similar_sentence_annoy(
+            sentence, model1_name, sentences2, annoy_indexes2
+        )
+        sentence2_results[model2_name] = find_similar_sentence_annoy(
+            sentence, model2_name, sentences2, annoy_indexes2
+        )
+        sentence2_results[model3_name] = find_similar_sentence_annoy(
+            sentence, model3_name, sentences2, annoy_indexes2
+        )
+        sentence2_results[model4_name] = find_similar_sentence_annoy(
+            sentence, model4_name, sentences2, annoy_indexes2
+        )
+        # Calculate cosine similarities
         for model_name in model_names:
+            similarities[model_name] = calculate_cosine_similarity(
                 sentence1_results[model_name], sentence2_results[model_name], models[model_name]
             )
+        return sentence1_results, sentence2_results, similarities
+    def format_results(sentence1_results, sentence2_results, similarities):
         """Formats the results for display in Gradio."""
         output_text = ""
         for model_name in model_names:
             output_text += f"**{model_name}**\n"
+            output_text += (
+                f"Most Similar Sentence from sentence1: {sentence1_results[model_name]}\n"
+            )
+            output_text += (
+                f"Most Similar Sentence from sentence2: {sentence2_results[model_name]}\n"
+            )
+            output_text += f"Cosine Similarity: {similarities[model_name]:.4f}\n\n"
         return output_text
     def gradio_interface(sentence, model1_name, model2_name, model3_name, model4_name):
         """Gradio interface function."""
+        sentence1_results, sentence2_results, similarities = compare_models_annoy(
             sentence, model1_name, model2_name, model3_name, model4_name
         )
+        return format_results(sentence1_results, sentence2_results, similarities)
     iface = gr.Interface(
         fn=gradio_interface,
         ],
         outputs=gr.Markdown(),
         title="Sentence Transformer Model Comparison (Annoy)",
+        description=(
+            "Inserisce una frase e confronta le frasi più simili generate da diversi modelli "
+            "sentence-transformer (utilizzando Annoy per una ricerca più veloce) sia dalla frase1 "
+            "che dalla frase2. Calcola anche la similarità del coseno tra le frasi. "
+            "Utilizza sentence-transformers per l'italiano e lo split test del dataset stsb_multi_mt."
+        ),
     )
     iface.launch()