Spaces:

DexterSptizu
/

wordllam-text-similarity

Sleeping

App Files Files Community

DexterSptizu committed on Nov 29, 2024

Commit

46033c1

verified ·

1 Parent(s): b1ec3a2

Update app.py

Browse files

Files changed (1) hide show

app.py +18 -7

app.py CHANGED Viewed

@@ -1,14 +1,15 @@
 import gradio as gr
 from wordllama import WordLlama
-from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 import numpy as np
 # Load the default WordLlama model
 wl = WordLlama.load()
-# Initialize TF-IDF vectorizer
 tfidf_vectorizer = TfidfVectorizer()
 def calculate_similarities(sentence1, sentence2):
     # WordLlama similarity
@@ -18,7 +19,11 @@ def calculate_similarities(sentence1, sentence2):
     tfidf_matrix = tfidf_vectorizer.fit_transform([sentence1, sentence2])
     tfidf_score = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
-    return float(wordllama_score), float(tfidf_score)
 # Carefully selected examples to compare both methods
 examples = [
@@ -62,7 +67,7 @@ examples = [
 with gr.Blocks(theme=gr.themes.Soft()) as iface:
     gr.Markdown("# Text Similarity Comparison")
     gr.Markdown("""
-    Compare sentences using both WordLlama and TF-IDF similarity metrics.
     Examples are categorized to demonstrate strengths of each method.
     """)
@@ -95,15 +100,21 @@ with gr.Blocks(theme=gr.themes.Soft()) as iface:
             info="Term frequency-based similarity score (0-1)",
             value=0.0
         )
     gr.Markdown("""
     ### Understanding the Scores
     - **WordLlama Similarity**: Better at understanding semantic meaning and context
     - **TF-IDF Similarity**: Better at exact word matching and frequency-based comparison
     ### Example Categories
-    1. High Similarity: Both methods should show high scores
-    2. Medium Similarity: Both methods should show moderate scores
     3. Semantic Similarity: WordLlama typically performs better
     4. Word Order Cases: Shows how each method handles word order
     5. Synonym Cases: Tests semantic understanding
@@ -112,7 +123,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as iface:
     button.click(
         calculate_similarities,
         inputs=[sentence1, sentence2],
-        outputs=[wordllama_output, tfidf_output]
     )
     gr.Examples(

 import gradio as gr
 from wordllama import WordLlama
+from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 import numpy as np
 # Load the default WordLlama model
 wl = WordLlama.load()
+# Initialize vectorizers
 tfidf_vectorizer = TfidfVectorizer()
+hashing_vectorizer = HashingVectorizer(n_features=2**4)
 def calculate_similarities(sentence1, sentence2):
     # WordLlama similarity
     tfidf_matrix = tfidf_vectorizer.fit_transform([sentence1, sentence2])
     tfidf_score = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
+    # Hashing Vectorizer similarity
+    hash_matrix = hashing_vectorizer.transform([sentence1, sentence2])
+    hash_score = cosine_similarity(hash_matrix[0:1], hash_matrix[1:2])[0][0]
+    return float(wordllama_score), float(tfidf_score), float(hash_score)
 # Carefully selected examples to compare both methods
 examples = [
 with gr.Blocks(theme=gr.themes.Soft()) as iface:
     gr.Markdown("# Text Similarity Comparison")
     gr.Markdown("""
+    Compare sentences using WordLlama, TF-IDF, and Hashing Vectorizer similarity metrics.
     Examples are categorized to demonstrate strengths of each method.
     """)
             info="Term frequency-based similarity score (0-1)",
             value=0.0
         )
+        hash_output = gr.Number(
+            label="Hashing Vectorizer Similarity",
+            info="Hash-based similarity score (0-1)",
+            value=0.0
+        )
     gr.Markdown("""
     ### Understanding the Scores
     - **WordLlama Similarity**: Better at understanding semantic meaning and context
     - **TF-IDF Similarity**: Better at exact word matching and frequency-based comparison
+    - **Hashing Vectorizer Similarity**: Memory-efficient alternative to TF-IDF, good for large-scale text processing
     ### Example Categories
+    1. High Similarity: All methods should show high scores
+    2. Medium Similarity: All methods should show moderate scores
     3. Semantic Similarity: WordLlama typically performs better
     4. Word Order Cases: Shows how each method handles word order
     5. Synonym Cases: Tests semantic understanding
     button.click(
         calculate_similarities,
         inputs=[sentence1, sentence2],
+        outputs=[wordllama_output, tfidf_output, hash_output]
     )
     gr.Examples(