File size: 4,527 Bytes
1107e2d
 
83b72c1
 
 
1107e2d
 
 
 
83b72c1
 
1107e2d
83b72c1
 
 
 
 
 
 
 
cfe8e38
83b72c1
b1ec3a2
1107e2d
b1ec3a2
 
 
 
6773f13
b1ec3a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1107e2d
 
83b72c1
6773f13
b1ec3a2
6773f13
 
b1ec3a2
6773f13
83b72c1
 
6773f13
 
 
 
 
 
 
 
 
 
 
 
 
 
83b72c1
6773f13
9732442
83b72c1
6773f13
 
 
 
 
 
 
 
 
 
 
 
 
b1ec3a2
 
6773f13
b1ec3a2
 
 
 
 
 
6773f13
9732442
83b72c1
 
 
 
 
9732442
83b72c1
 
6773f13
 
83b72c1
9732442
6773f13
83b72c1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import gradio as gr
from wordllama import WordLlama
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load the default WordLlama model once, at import time, so that every
# similarity request reuses the same model instance.
wl = WordLlama.load()

# Module-level TF-IDF vectorizer created with scikit-learn's default settings.
tfidf_vectorizer = TfidfVectorizer()

def calculate_similarities(sentence1, sentence2):
    """Score the similarity of two sentences with WordLlama and TF-IDF.

    Args:
        sentence1: First sentence to compare.
        sentence2: Second sentence to compare.

    Returns:
        A ``(wordllama_score, tfidf_score)`` tuple of Python floats.
        Both scores are ``0.0`` when either sentence is missing or blank.
        The TF-IDF score alone is ``0.0`` when neither sentence produces
        a token for the vectorizer.
    """
    # A blank or missing sentence leaves nothing to compare; report "no
    # similarity" instead of letting the backends fail on empty input.
    if not (sentence1 or "").strip() or not (sentence2 or "").strip():
        return 0.0, 0.0

    # WordLlama similarity
    wordllama_score = wl.similarity(sentence1, sentence2)

    # TF-IDF similarity. A fresh vectorizer is fitted per call: refitting a
    # shared module-level instance would mutate state that overlapping
    # requests could observe half-updated.
    vectorizer = TfidfVectorizer()
    try:
        tfidf_matrix = vectorizer.fit_transform([sentence1, sentence2])
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when no token
        # survives tokenization, e.g. punctuation-only input or sentences
        # made solely of single-character words.
        tfidf_score = 0.0
    else:
        # Row 0 is sentence1, row 1 is sentence2; the result is a 1x1 matrix.
        tfidf_score = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]

    return float(wordllama_score), float(tfidf_score)

# Hand-picked sentence pairs for comparing the two methods. Each entry is a
# [first sentence, second sentence] pair; the list is handed to gr.Examples
# below so a click fills both textboxes. The group headings describe the
# outcome the pairs are intended to illustrate, not measured results.
examples = [
    # Near-paraphrases with heavy word overlap: expected to score high with
    # both methods
    ["The cat is sleeping on the couch", "The cat is resting on the sofa"],
    ["I love eating pizza", "I enjoy eating pizza"],
    ["The weather is sunny today", "It is a sunny day today"],
    
    # Related meaning with partial word overlap: expected to score moderately
    # with both methods
    ["She is reading a book", "She is holding a novel"],
    ["The car is red", "The automobile is crimson"],
    ["The children are playing in the park", "Kids are having fun at the playground"],
    
    # Same meaning expressed with different content words: cases where
    # WordLlama should perform better
    ["The food was great", "The meal was excellent"],
    ["The student is studying hard", "The pupil is working diligently"],
    ["This movie is fantastic", "This film is amazing"],
    
    # Sentences that differ by a single word: cases where TF-IDF should
    # perform better
    ["The red car is parked", "The red car is moving"],
    ["The book is on the table", "The book is under the table"],
    ["She went to the store", "She went to the mall"],
    
    # Semantic similarity cases: paraphrases that share few content words
    ["The laptop is expensive", "The computer costs a lot"],
    ["The dog is barking", "The canine is making noise"],
    ["The house is large", "The home is spacious"],
    
    # Word order importance cases: both sentences contain exactly the same
    # words, only the roles are swapped
    ["The cat chased the mouse", "The mouse chased the cat"],
    ["John gave Mary a book", "Mary gave John a book"],
    ["The teacher helped the student", "The student helped the teacher"],
    
    # Synonym cases: identical sentence frame with the noun and adjective
    # replaced by synonyms
    ["The car is fast", "The vehicle is quick"],
    ["The building is tall", "The structure is high"],
    ["The food is delicious", "The cuisine is tasty"]
]

# Define the Gradio interface. Components are created in display order:
# title and intro text, two sentence inputs side by side, the trigger button,
# the two score outputs, an explanatory panel, and the clickable examples.
with gr.Blocks(theme=gr.themes.Soft()) as iface:
    # Page heading and a short description of the demo.
    gr.Markdown("# Text Similarity Comparison")
    gr.Markdown("""
    Compare sentences using both WordLlama and TF-IDF similarity metrics.
    Examples are categorized to demonstrate strengths of each method.
    """)
    
    # Input row: one column per sentence so the textboxes sit side by side.
    with gr.Row():
        with gr.Column():
            sentence1 = gr.Textbox(
                lines=2,
                placeholder="Enter first sentence...",
                label="First Sentence",
                info="Type or select from examples below"
            )
        with gr.Column():
            sentence2 = gr.Textbox(
                lines=2,
                placeholder="Enter second sentence...",
                label="Second Sentence",
                info="Type or select from examples below"
            )
    
    # Scores are computed only when this button is clicked (wired up below).
    button = gr.Button("Calculate Similarities", variant="primary")
    
    # Output row: one numeric field per similarity method, both starting at 0.0.
    with gr.Row():
        wordllama_output = gr.Number(
            label="WordLlama Similarity",
            info="Contextual similarity score (0-1)",
            value=0.0
        )
        tfidf_output = gr.Number(
            label="TF-IDF Similarity",
            info="Term frequency-based similarity score (0-1)",
            value=0.0
        )
    
    # Static help text explaining the two scores and the example categories.
    gr.Markdown("""
    ### Understanding the Scores
    - **WordLlama Similarity**: Better at understanding semantic meaning and context
    - **TF-IDF Similarity**: Better at exact word matching and frequency-based comparison
    
    ### Example Categories
    1. High Similarity: Both methods should show high scores
    2. Medium Similarity: Both methods should show moderate scores
    3. Semantic Similarity: WordLlama typically performs better
    4. Word Order Cases: Shows how each method handles word order
    5. Synonym Cases: Tests semantic understanding
    """)
    
    # Wire the button to the scoring function. The two textbox values are
    # passed as its arguments, and the returned pair fills the two number
    # fields in the order listed in `outputs`.
    button.click(
        calculate_similarities,
        inputs=[sentence1, sentence2],
        outputs=[wordllama_output, tfidf_output]
    )
    
    # Clickable example pairs; each pair targets the two sentence textboxes.
    # No fn/outputs are given here, so this call only names the inputs.
    gr.Examples(
        examples=examples,
        inputs=[sentence1, sentence2],
        label="Click on any example to load it"
    )

# Launch the interface. This runs at module level, so the server starts as
# soon as the file is executed or imported. share=True asks Gradio to also
# create a public share link for the app.
iface.launch(share=True)