File size: 4,527 Bytes
1107e2d 83b72c1 1107e2d 83b72c1 1107e2d 83b72c1 cfe8e38 83b72c1 b1ec3a2 1107e2d b1ec3a2 6773f13 b1ec3a2 1107e2d 83b72c1 6773f13 b1ec3a2 6773f13 b1ec3a2 6773f13 83b72c1 6773f13 83b72c1 6773f13 9732442 83b72c1 6773f13 b1ec3a2 6773f13 b1ec3a2 6773f13 9732442 83b72c1 9732442 83b72c1 6773f13 83b72c1 9732442 6773f13 83b72c1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 |
import gradio as gr
from wordllama import WordLlama
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
# Module-level TF-IDF vectorizer built with scikit-learn's default settings.
tfidf_vectorizer = TfidfVectorizer()
# WordLlama model obtained from the library's argument-less loader.
wl = WordLlama.load()
def calculate_similarities(sentence1, sentence2):
    """Score a sentence pair with WordLlama and with TF-IDF cosine similarity.

    Args:
        sentence1: First sentence to compare.
        sentence2: Second sentence to compare.

    Returns:
        A ``(wordllama_score, tfidf_score)`` tuple of plain Python floats.
        Both scores are ``0.0`` when either sentence is ``None``, empty or
        whitespace-only. The TF-IDF score alone is ``0.0`` when the
        vectorizer cannot extract any term from the pair.
    """
    # Blank input has nothing to compare, and fitting the vectorizer on it
    # raises ValueError (empty vocabulary), so answer before touching either
    # model.
    if not (sentence1 and sentence1.strip() and sentence2 and sentence2.strip()):
        return 0.0, 0.0

    # WordLlama similarity
    wordllama_score = wl.similarity(sentence1, sentence2)

    # TF-IDF similarity: the vectorizer is re-fitted on just these two
    # sentences, so the vocabulary and IDF weights are specific to the pair.
    try:
        tfidf_matrix = tfidf_vectorizer.fit_transform([sentence1, sentence2])
    except ValueError:
        # No token survived tokenization (with scikit-learn's default token
        # pattern this happens e.g. for input made only of one-character
        # words or punctuation), so there is no term overlap to measure.
        tfidf_score = 0.0
    else:
        # Row 0 vs. row 1 of the 2-row matrix; the result is a 1x1 array.
        tfidf_score = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]

    return float(wordllama_score), float(tfidf_score)
# Hand-picked sentence pairs for the UI's example picker, grouped by the
# behaviour each group is meant to illustrate (three pairs per group).
examples = [
    # Group 1: both methods expected to score high
    ["The cat is sleeping on the couch", "The cat is resting on the sofa"],
    ["I love eating pizza", "I enjoy eating pizza"],
    ["The weather is sunny today", "It is a sunny day today"],

    # Group 2: both methods expected to score in the middle
    ["She is reading a book", "She is holding a novel"],
    ["The car is red", "The automobile is crimson"],
    ["The children are playing in the park", "Kids are having fun at the playground"],

    # Group 3: pairs expected to favour WordLlama
    ["The food was great", "The meal was excellent"],
    ["The student is studying hard", "The pupil is working diligently"],
    ["This movie is fantastic", "This film is amazing"],

    # Group 4: pairs expected to favour TF-IDF
    ["The red car is parked", "The red car is moving"],
    ["The book is on the table", "The book is under the table"],
    ["She went to the store", "She went to the mall"],

    # Group 5: same meaning, different wording
    ["The laptop is expensive", "The computer costs a lot"],
    ["The dog is barking", "The canine is making noise"],
    ["The house is large", "The home is spacious"],

    # Group 6: same words, different order (roles swapped)
    ["The cat chased the mouse", "The mouse chased the cat"],
    ["John gave Mary a book", "Mary gave John a book"],
    ["The teacher helped the student", "The student helped the teacher"],

    # Group 7: synonym substitutions
    ["The car is fast", "The vehicle is quick"],
    ["The building is tall", "The structure is high"],
    ["The food is delicious", "The cuisine is tasty"],
]
# Define the Gradio interface: two sentence inputs, a trigger button, two
# score read-outs, explanatory notes and a list of clickable example pairs.
with gr.Blocks(theme=gr.themes.Soft()) as iface:
    gr.Markdown("# Text Similarity Comparison")
    gr.Markdown("""
    Compare sentences using both WordLlama and TF-IDF similarity metrics.
    Examples are categorized to demonstrate strengths of each method.
    """)

    # Sentence inputs, side by side.
    with gr.Row():
        with gr.Column():
            sentence1 = gr.Textbox(
                lines=2,
                placeholder="Enter first sentence...",
                label="First Sentence",
                info="Type or select from examples below",
            )
        with gr.Column():
            sentence2 = gr.Textbox(
                lines=2,
                placeholder="Enter second sentence...",
                label="Second Sentence",
                info="Type or select from examples below",
            )

    button = gr.Button("Calculate Similarities", variant="primary")

    # Score read-outs, one per method.
    with gr.Row():
        wordllama_output = gr.Number(
            label="WordLlama Similarity",
            info="Contextual similarity score (0-1)",
            value=0.0,
        )
        tfidf_output = gr.Number(
            label="TF-IDF Similarity",
            info="Term frequency-based similarity score (0-1)",
            value=0.0,
        )

    # The category list mirrors the groups in `examples`, in the same order,
    # so the numbering matches what users see in the example picker.
    gr.Markdown("""
    ### Understanding the Scores

    - **WordLlama Similarity**: Better at understanding semantic meaning and context
    - **TF-IDF Similarity**: Better at exact word matching and frequency-based comparison

    ### Example Categories

    1. High Similarity: Both methods should show high scores
    2. Medium Similarity: Both methods should show moderate scores
    3. WordLlama-Favoured Cases: WordLlama should perform better
    4. TF-IDF-Favoured Cases: TF-IDF should perform better
    5. Semantic Similarity: WordLlama typically performs better
    6. Word Order Cases: Shows how each method handles word order
    7. Synonym Cases: Tests semantic understanding
    """)

    # Clicking the button sends both sentences to calculate_similarities and
    # routes its two return values to the two Number components.
    button.click(
        calculate_similarities,
        inputs=[sentence1, sentence2],
        outputs=[wordllama_output, tfidf_output],
    )

    # Selecting an example fills both textboxes.
    gr.Examples(
        examples=examples,
        inputs=[sentence1, sentence2],
        label="Click on any example to load it",
    )

# Launch the interface only when this file is executed as a script, so that
# importing the module (e.g. to reuse calculate_similarities or iface) does
# not start a server as a side effect.
if __name__ == "__main__":
    iface.launch(share=True)