Spaces:

nickprock
/

try_sentence_transformers_it

Running

App Files Files Community

nickprock committed on Mar 18

Commit

23ef606

verified ·

1 Parent(s): 950925d

Update app.py

Browse files

Files changed (1) hide show

app.py +23 -22

app.py CHANGED Viewed

@@ -19,29 +19,13 @@ try:
         "nickprock/multi-sentence-BERTino",
         "nickprock/sentence-bert-base-italian-uncased",
         "nickprock/sentence-bert-base-italian-xxl-uncased",
-        "nickprock/mmarco-bert-base-italian-uncased",
     ]
     models = {name: SentenceTransformer(name) for name in model_names}
     annoy_indexes1 = {}  # Store Annoy indexes for sentence1
     annoy_indexes2 = {}  # Store Annoy indexes for sentence2
-    def build_annoy_index(model_name, sentences):
-        """Builds an Annoy index for a given model and sentences."""
-        model = models[model_name]
-        embeddings = model.encode(sentences)
-        embedding_dim = embeddings.shape[1]
-        annoy_index = AnnoyIndex(embedding_dim, "angular")  # Use angular distance for cosine similarity
-        for i, embedding in enumerate(embeddings):
-            annoy_index.add_item(i, embedding)
-        annoy_index.build(10)  # Build with 10 trees
-        return annoy_index
-    # Build Annoy indexes for each model
-    for model_name in model_names:
-        annoy_indexes1[model_name] = build_annoy_index(model_name, sentences1)
-        annoy_indexes2[model_name] = build_annoy_index(model_name, sentences2)
     def find_similar_sentence_annoy(sentence, model_name, sentence_list, annoy_index):
         """Finds the most similar sentence using Annoy."""
         model = models[model_name]
@@ -50,10 +34,18 @@ try:
         best_sentence_index = nearest_neighbors[0]
         return sentence_list[best_sentence_index]
     def compare_models_annoy(sentence, model1_name, model2_name, model3_name, model4_name):
         """Compares the results of different models using Annoy."""
         sentence1_results = {}
         sentence2_results = {}
         sentence1_results[model1_name] = find_similar_sentence_annoy(sentence, model1_name, sentences1, annoy_indexes1)
         sentence1_results[model2_name] = find_similar_sentence_annoy(sentence, model2_name, sentences1, annoy_indexes1)
@@ -65,21 +57,30 @@ try:
         sentence2_results[model3_name] = find_similar_sentence_annoy(sentence, model3_name, sentences2, annoy_indexes2)
         sentence2_results[model4_name] = find_similar_sentence_annoy(sentence, model4_name, sentences2, annoy_indexes2)
-        return sentence1_results, sentence2_results
-    def format_results(sentence1_results, sentence2_results):
         """Formats the results for display in Gradio."""
         output_text = ""
         for model_name in model_names:
             output_text += f"**{model_name}**\n"
             output_text += f"Most Similar Sentence from sentence1: {sentence1_results[model_name]}\n"
-            output_text += f"Most Similar Sentence from sentence2: {sentence2_results[model_name]}\n\n"
         return output_text
     def gradio_interface(sentence, model1_name, model2_name, model3_name, model4_name):
         """Gradio interface function."""
-        sentence1_results, sentence2_results = compare_models_annoy(sentence, model1_name, model2_name, model3_name, model4_name)
-        return format_results(sentence1_results, sentence2_results)
     iface = gr.Interface(
         fn=gradio_interface,

         "nickprock/multi-sentence-BERTino",
         "nickprock/sentence-bert-base-italian-uncased",
         "nickprock/sentence-bert-base-italian-xxl-uncased",
+        "nickprock/Italian-ModernBERT-base-embed-mmarco-mnrl",
     ]
     models = {name: SentenceTransformer(name) for name in model_names}
     annoy_indexes1 = {}  # Store Annoy indexes for sentence1
     annoy_indexes2 = {}  # Store Annoy indexes for sentence2
     def find_similar_sentence_annoy(sentence, model_name, sentence_list, annoy_index):
         """Finds the most similar sentence using Annoy."""
         model = models[model_name]
         best_sentence_index = nearest_neighbors[0]
         return sentence_list[best_sentence_index]
+    def calculate_similarity(sentence1, sentence2, model):
+        """Calculates the cosine similarity between two sentences using a given model."""
+        embedding1 = model.encode(sentence1, convert_to_tensor=True)
+        embedding2 = model.encode(sentence2, convert_to_tensor=True)
+        similarity = util.cos_sim(embedding1, embedding2).item()
+        return similarity
     def compare_models_annoy(sentence, model1_name, model2_name, model3_name, model4_name):
         """Compares the results of different models using Annoy."""
         sentence1_results = {}
         sentence2_results = {}
+        similarity_results = {}
         sentence1_results[model1_name] = find_similar_sentence_annoy(sentence, model1_name, sentences1, annoy_indexes1)
         sentence1_results[model2_name] = find_similar_sentence_annoy(sentence, model2_name, sentences1, annoy_indexes1)
         sentence2_results[model3_name] = find_similar_sentence_annoy(sentence, model3_name, sentences2, annoy_indexes2)
         sentence2_results[model4_name] = find_similar_sentence_annoy(sentence, model4_name, sentences2, annoy_indexes2)
+        # Calculate similarity between the retrieved sentences
+        for model_name in model_names:
+            similarity_results[model_name] = calculate_similarity(
+                sentence1_results[model_name], sentence2_results[model_name], models[model_name]
+            )
+        return sentence1_results, sentence2_results, similarity_results
+    def format_results(sentence1_results, sentence2_results, similarity_results):
         """Formats the results for display in Gradio."""
         output_text = ""
         for model_name in model_names:
             output_text += f"**{model_name}**\n"
             output_text += f"Most Similar Sentence from sentence1: {sentence1_results[model_name]}\n"
+            output_text += f"Most Similar Sentence from sentence2: {sentence2_results[model_name]}\n"
+            output_text += f"Similarity between retrieved sentences: {similarity_results[model_name]:.4f}\n\n"
         return output_text
     def gradio_interface(sentence, model1_name, model2_name, model3_name, model4_name):
         """Gradio interface function."""
+        sentence1_results, sentence2_results, similarity_results = compare_models_annoy(
+            sentence, model1_name, model2_name, model3_name, model4_name
+        )
+        return format_results(sentence1_results, sentence2_results, similarity_results)
     iface = gr.Interface(
         fn=gradio_interface,