Spaces:

DexterSptizu
/

wordllam-text-similarity

Running

File size: 4,527 Bytes

import gradio as gr
from wordllama import WordLlama
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load the default WordLlama model once at import time so that every call to
# calculate_similarities() reuses the same instance.
wl = WordLlama.load()

# Shared TF-IDF vectorizer built with scikit-learn's default settings.
# calculate_similarities() refits it on each sentence pair via fit_transform,
# so the vocabulary always reflects only the two sentences being compared.
tfidf_vectorizer = TfidfVectorizer()

def calculate_similarities(sentence1, sentence2):
    """Score two sentences with WordLlama and with TF-IDF cosine similarity.

    Args:
        sentence1: First sentence. ``None`` is treated as empty text.
        sentence2: Second sentence. ``None`` is treated as empty text.

    Returns:
        A ``(wordllama_score, tfidf_score)`` tuple of plain Python floats.
        Both scores are ``0.0`` when either sentence is blank; the TF-IDF
        score is ``0.0`` when neither sentence contains a usable token.
    """
    sentence1 = sentence1 or ""
    sentence2 = sentence2 or ""

    # With nothing to compare there is no meaningful similarity; report zero
    # instead of passing blank text to the models.
    if not sentence1.strip() or not sentence2.strip():
        return 0.0, 0.0

    # WordLlama similarity
    wordllama_score = wl.similarity(sentence1, sentence2)

    # TF-IDF similarity. The vectorizer is refit on just this pair of
    # sentences. fit_transform raises ValueError ("empty vocabulary") when
    # neither sentence yields a token, e.g. punctuation only or
    # single-character words under the default token pattern; treat that as
    # no lexical overlap rather than failing the request.
    try:
        tfidf_matrix = tfidf_vectorizer.fit_transform([sentence1, sentence2])
    except ValueError:
        tfidf_score = 0.0
    else:
        tfidf_score = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]

    # Convert NumPy scalars to built-in floats before returning.
    return float(wordllama_score), float(tfidf_score)

# Example sentence pairs, grouped by the behaviour each group is meant to
# illustrate. Insertion order of the mapping defines the order in the UI.
_EXAMPLES_BY_CATEGORY = {
    "High similarity in both methods": [
        ["The cat is sleeping on the couch", "The cat is resting on the sofa"],
        ["I love eating pizza", "I enjoy eating pizza"],
        ["The weather is sunny today", "It is a sunny day today"],
    ],
    "Medium similarity in both methods": [
        ["She is reading a book", "She is holding a novel"],
        ["The car is red", "The automobile is crimson"],
        ["The children are playing in the park", "Kids are having fun at the playground"],
    ],
    "Cases where WordLlama should perform better": [
        ["The food was great", "The meal was excellent"],
        ["The student is studying hard", "The pupil is working diligently"],
        ["This movie is fantastic", "This film is amazing"],
    ],
    "Cases where TF-IDF should perform better": [
        ["The red car is parked", "The red car is moving"],
        ["The book is on the table", "The book is under the table"],
        ["She went to the store", "She went to the mall"],
    ],
    "Semantic similarity cases": [
        ["The laptop is expensive", "The computer costs a lot"],
        ["The dog is barking", "The canine is making noise"],
        ["The house is large", "The home is spacious"],
    ],
    "Word order importance cases": [
        ["The cat chased the mouse", "The mouse chased the cat"],
        ["John gave Mary a book", "Mary gave John a book"],
        ["The teacher helped the student", "The student helped the teacher"],
    ],
    "Synonym cases": [
        ["The car is fast", "The vehicle is quick"],
        ["The building is tall", "The structure is high"],
        ["The food is delicious", "The cuisine is tasty"],
    ],
}

# Flat list of [first_sentence, second_sentence] pairs consumed by gr.Examples.
examples = [
    pair
    for category_pairs in _EXAMPLES_BY_CATEGORY.values()
    for pair in category_pairs
]

# Build the Gradio UI: two sentence inputs side by side, a button that runs
# calculate_similarities, and one numeric output per similarity method.
with gr.Blocks(theme=gr.themes.Soft()) as iface:
    gr.Markdown("# Text Similarity Comparison")
    gr.Markdown("""
    Compare sentences using both WordLlama and TF-IDF similarity metrics.
    Examples are categorized to demonstrate strengths of each method.
    """)

    # Input row: one column per sentence.
    with gr.Row():
        with gr.Column():
            sentence1 = gr.Textbox(
                lines=2,
                placeholder="Enter first sentence...",
                label="First Sentence",
                info="Type or select from examples below"
            )
        with gr.Column():
            sentence2 = gr.Textbox(
                lines=2,
                placeholder="Enter second sentence...",
                label="Second Sentence",
                info="Type or select from examples below"
            )

    button = gr.Button("Calculate Similarities", variant="primary")

    # Output row: both scores start at 0.0 until the first calculation.
    with gr.Row():
        wordllama_output = gr.Number(
            label="WordLlama Similarity",
            info="Contextual similarity score (0-1)",
            value=0.0
        )
        tfidf_output = gr.Number(
            label="TF-IDF Similarity",
            info="Term frequency-based similarity score (0-1)",
            value=0.0
        )

    # The category list below mirrors, in order, the groups used in the
    # `examples` list so the help text matches what users can click on.
    gr.Markdown("""
    ### Understanding the Scores
    - **WordLlama Similarity**: Better at understanding semantic meaning and context
    - **TF-IDF Similarity**: Better at exact word matching and frequency-based comparison

    ### Example Categories
    1. High Similarity: Both methods should show high scores
    2. Medium Similarity: Both methods should show moderate scores
    3. WordLlama Advantage: Cases where WordLlama should perform better
    4. TF-IDF Advantage: Cases where TF-IDF should perform better
    5. Semantic Similarity: WordLlama typically performs better
    6. Word Order Cases: Shows how each method handles word order
    7. Synonym Cases: Tests semantic understanding
    """)

    # Clicking the button feeds both textboxes to calculate_similarities and
    # writes its two return values to the two Number outputs, in order.
    button.click(
        calculate_similarities,
        inputs=[sentence1, sentence2],
        outputs=[wordllama_output, tfidf_output]
    )

    # Clicking an example row fills the two textboxes; no function is passed
    # to gr.Examples here, so scores are computed only on button click.
    gr.Examples(
        examples=examples,
        inputs=[sentence1, sentence2],
        label="Click on any example to load it"
    )

# Launch the interface. This runs at import time (there is no __main__ guard).
# share=True additionally asks Gradio for a temporary public share link.
# NOTE(review): hosted environments such as Hugging Face Spaces may not honour
# share=True — confirm whether the flag is still needed for this deployment.
iface.launch(share=True)