DexterSptizu committed on
Commit
46033c1
·
verified ·
1 Parent(s): b1ec3a2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -7
app.py CHANGED
@@ -1,14 +1,15 @@
1
  import gradio as gr
2
  from wordllama import WordLlama
3
- from sklearn.feature_extraction.text import TfidfVectorizer
4
  from sklearn.metrics.pairwise import cosine_similarity
5
  import numpy as np
6
 
7
  # Load the default WordLlama model
8
  wl = WordLlama.load()
9
 
10
- # Initialize TF-IDF vectorizer
11
  tfidf_vectorizer = TfidfVectorizer()
 
12
 
13
  def calculate_similarities(sentence1, sentence2):
14
  # WordLlama similarity
@@ -18,7 +19,11 @@ def calculate_similarities(sentence1, sentence2):
18
  tfidf_matrix = tfidf_vectorizer.fit_transform([sentence1, sentence2])
19
  tfidf_score = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
20
 
21
- return float(wordllama_score), float(tfidf_score)
 
 
 
 
22
 
23
  # Carefully selected examples to compare both methods
24
  examples = [
@@ -62,7 +67,7 @@ examples = [
62
  with gr.Blocks(theme=gr.themes.Soft()) as iface:
63
  gr.Markdown("# Text Similarity Comparison")
64
  gr.Markdown("""
65
- Compare sentences using both WordLlama and TF-IDF similarity metrics.
66
  Examples are categorized to demonstrate strengths of each method.
67
  """)
68
 
@@ -95,15 +100,21 @@ with gr.Blocks(theme=gr.themes.Soft()) as iface:
95
  info="Term frequency-based similarity score (0-1)",
96
  value=0.0
97
  )
 
 
 
 
 
98
 
99
  gr.Markdown("""
100
  ### Understanding the Scores
101
  - **WordLlama Similarity**: Better at understanding semantic meaning and context
102
  - **TF-IDF Similarity**: Better at exact word matching and frequency-based comparison
 
103
 
104
  ### Example Categories
105
- 1. High Similarity: Both methods should show high scores
106
- 2. Medium Similarity: Both methods should show moderate scores
107
  3. Semantic Similarity: WordLlama typically performs better
108
  4. Word Order Cases: Shows how each method handles word order
109
  5. Synonym Cases: Tests semantic understanding
@@ -112,7 +123,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as iface:
112
  button.click(
113
  calculate_similarities,
114
  inputs=[sentence1, sentence2],
115
- outputs=[wordllama_output, tfidf_output]
116
  )
117
 
118
  gr.Examples(
 
1
  import gradio as gr
2
  from wordllama import WordLlama
3
+ from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer
4
  from sklearn.metrics.pairwise import cosine_similarity
5
  import numpy as np
6
 
7
  # Load the default WordLlama model
8
  wl = WordLlama.load()
9
 
10
+ # Initialize vectorizers
11
  tfidf_vectorizer = TfidfVectorizer()
12
+ hashing_vectorizer = HashingVectorizer(n_features=2**4)
13
 
14
  def calculate_similarities(sentence1, sentence2):
15
  # WordLlama similarity
 
19
  tfidf_matrix = tfidf_vectorizer.fit_transform([sentence1, sentence2])
20
  tfidf_score = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
21
 
22
+ # Hashing Vectorizer similarity
23
+ hash_matrix = hashing_vectorizer.transform([sentence1, sentence2])
24
+ hash_score = cosine_similarity(hash_matrix[0:1], hash_matrix[1:2])[0][0]
25
+
26
+ return float(wordllama_score), float(tfidf_score), float(hash_score)
27
 
28
  # Carefully selected examples to compare both methods
29
  examples = [
 
67
  with gr.Blocks(theme=gr.themes.Soft()) as iface:
68
  gr.Markdown("# Text Similarity Comparison")
69
  gr.Markdown("""
70
+ Compare sentences using WordLlama, TF-IDF, and Hashing Vectorizer similarity metrics.
71
  Examples are categorized to demonstrate strengths of each method.
72
  """)
73
 
 
100
  info="Term frequency-based similarity score (0-1)",
101
  value=0.0
102
  )
103
+ hash_output = gr.Number(
104
+ label="Hashing Vectorizer Similarity",
105
+ info="Hash-based similarity score (0-1)",
106
+ value=0.0
107
+ )
108
 
109
  gr.Markdown("""
110
  ### Understanding the Scores
111
  - **WordLlama Similarity**: Better at understanding semantic meaning and context
112
  - **TF-IDF Similarity**: Better at exact word matching and frequency-based comparison
113
+ - **Hashing Vectorizer Similarity**: Memory-efficient alternative to TF-IDF, good for large-scale text processing
114
 
115
  ### Example Categories
116
+ 1. High Similarity: All methods should show high scores
117
+ 2. Medium Similarity: All methods should show moderate scores
118
  3. Semantic Similarity: WordLlama typically performs better
119
  4. Word Order Cases: Shows how each method handles word order
120
  5. Synonym Cases: Tests semantic understanding
 
123
  button.click(
124
  calculate_similarities,
125
  inputs=[sentence1, sentence2],
126
+ outputs=[wordllama_output, tfidf_output, hash_output]
127
  )
128
 
129
  gr.Examples(