Update app.py
Browse files
app.py
CHANGED
@@ -1,14 +1,15 @@
|
|
1 |
import gradio as gr
|
2 |
from wordllama import WordLlama
|
3 |
-
from sklearn.feature_extraction.text import TfidfVectorizer
|
4 |
from sklearn.metrics.pairwise import cosine_similarity
|
5 |
import numpy as np
|
6 |
|
7 |
# Load the default WordLlama model
|
8 |
wl = WordLlama.load()
|
9 |
|
10 |
-
# Initialize
|
11 |
tfidf_vectorizer = TfidfVectorizer()
|
|
|
12 |
|
13 |
def calculate_similarities(sentence1, sentence2):
|
14 |
# WordLlama similarity
|
@@ -18,7 +19,11 @@ def calculate_similarities(sentence1, sentence2):
|
|
18 |
tfidf_matrix = tfidf_vectorizer.fit_transform([sentence1, sentence2])
|
19 |
tfidf_score = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
|
20 |
|
21 |
-
|
|
|
|
|
|
|
|
|
22 |
|
23 |
# Carefully selected examples to compare both methods
|
24 |
examples = [
|
@@ -62,7 +67,7 @@ examples = [
|
|
62 |
with gr.Blocks(theme=gr.themes.Soft()) as iface:
|
63 |
gr.Markdown("# Text Similarity Comparison")
|
64 |
gr.Markdown("""
|
65 |
-
Compare sentences using
|
66 |
Examples are categorized to demonstrate strengths of each method.
|
67 |
""")
|
68 |
|
@@ -95,15 +100,21 @@ with gr.Blocks(theme=gr.themes.Soft()) as iface:
|
|
95 |
info="Term frequency-based similarity score (0-1)",
|
96 |
value=0.0
|
97 |
)
|
|
|
|
|
|
|
|
|
|
|
98 |
|
99 |
gr.Markdown("""
|
100 |
### Understanding the Scores
|
101 |
- **WordLlama Similarity**: Better at understanding semantic meaning and context
|
102 |
- **TF-IDF Similarity**: Better at exact word matching and frequency-based comparison
|
|
|
103 |
|
104 |
### Example Categories
|
105 |
-
1. High Similarity:
|
106 |
-
2. Medium Similarity:
|
107 |
3. Semantic Similarity: WordLlama typically performs better
|
108 |
4. Word Order Cases: Shows how each method handles word order
|
109 |
5. Synonym Cases: Tests semantic understanding
|
@@ -112,7 +123,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as iface:
|
|
112 |
button.click(
|
113 |
calculate_similarities,
|
114 |
inputs=[sentence1, sentence2],
|
115 |
-
outputs=[wordllama_output, tfidf_output]
|
116 |
)
|
117 |
|
118 |
gr.Examples(
|
|
|
1 |
import gradio as gr
|
2 |
from wordllama import WordLlama
|
3 |
+
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer
|
4 |
from sklearn.metrics.pairwise import cosine_similarity
|
5 |
import numpy as np
|
6 |
|
7 |
# Load the default WordLlama model
|
8 |
wl = WordLlama.load()
|
9 |
|
10 |
+
# Initialize vectorizers
|
11 |
tfidf_vectorizer = TfidfVectorizer()
|
12 |
+
# alternate_sign=False keeps every hashed feature non-negative, so the cosine
# similarity stays within the 0-1 range the UI advertises; a larger feature
# space avoids the heavy hash collisions that 16 buckets would cause.
hashing_vectorizer = HashingVectorizer(n_features=2**16, alternate_sign=False)
|
13 |
|
14 |
def calculate_similarities(sentence1, sentence2):
|
15 |
# WordLlama similarity
|
|
|
19 |
tfidf_matrix = tfidf_vectorizer.fit_transform([sentence1, sentence2])
|
20 |
tfidf_score = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
|
21 |
|
22 |
+
# Hashing Vectorizer similarity
|
23 |
+
hash_matrix = hashing_vectorizer.transform([sentence1, sentence2])
|
24 |
+
hash_score = cosine_similarity(hash_matrix[0:1], hash_matrix[1:2])[0][0]
|
25 |
+
|
26 |
+
return float(wordllama_score), float(tfidf_score), float(hash_score)
|
27 |
|
28 |
# Carefully selected examples to compare all three methods
|
29 |
examples = [
|
|
|
67 |
with gr.Blocks(theme=gr.themes.Soft()) as iface:
|
68 |
gr.Markdown("# Text Similarity Comparison")
|
69 |
gr.Markdown("""
|
70 |
+
Compare sentences using WordLlama, TF-IDF, and Hashing Vectorizer similarity metrics.
|
71 |
Examples are categorized to demonstrate strengths of each method.
|
72 |
""")
|
73 |
|
|
|
100 |
info="Term frequency-based similarity score (0-1)",
|
101 |
value=0.0
|
102 |
)
|
103 |
+
hash_output = gr.Number(
|
104 |
+
label="Hashing Vectorizer Similarity",
|
105 |
+
info="Hash-based similarity score (0-1)",
|
106 |
+
value=0.0
|
107 |
+
)
|
108 |
|
109 |
gr.Markdown("""
|
110 |
### Understanding the Scores
|
111 |
- **WordLlama Similarity**: Better at understanding semantic meaning and context
|
112 |
- **TF-IDF Similarity**: Better at exact word matching and frequency-based comparison
|
113 |
+
- **Hashing Vectorizer Similarity**: Memory-efficient alternative to TF-IDF, good for large-scale text processing
|
114 |
|
115 |
### Example Categories
|
116 |
+
1. High Similarity: All methods should show high scores
|
117 |
+
2. Medium Similarity: All methods should show moderate scores
|
118 |
3. Semantic Similarity: WordLlama typically performs better
|
119 |
4. Word Order Cases: Shows how each method handles word order
|
120 |
5. Synonym Cases: Tests semantic understanding
|
|
|
123 |
button.click(
|
124 |
calculate_similarities,
|
125 |
inputs=[sentence1, sentence2],
|
126 |
+
outputs=[wordllama_output, tfidf_output, hash_output]
|
127 |
)
|
128 |
|
129 |
gr.Examples(
|