Spaces:

DexterSptizu
/

sentence-transformer-visualization

Sleeping

App Files Files Community

DexterSptizu committed on Nov 4, 2024

Commit

9fba660

verified ·

1 Parent(s): ac1c59d

Update app.py

Browse files

Files changed (1) hide show

app.py +127 -201

app.py CHANGED Viewed

@@ -1,228 +1,154 @@
 import streamlit as st
 import numpy as np
-from sentence_transformers import SentenceTransformer, util
 import plotly.graph_objects as go
 # Page configuration
-st.set_page_config(
-    page_title="Sentence Embeddings Explorer",
-    page_icon="🤗",
-    layout="wide"
-)
-# Custom CSS
-st.markdown("""
-    <style>
-    .stTabs [data-baseweb="tab-list"] {
-        gap: 24px;
-    }
-    .stTabs [data-baseweb="tab"] {
-        height: 50px;
-        padding-left: 20px;
-        padding-right: 20px;
-    }
-    .big-font {
-        font-size:20px !important;
-        font-weight: bold;
-    }
-    .medium-font {
-        font-size:16px !important;
-    }
-    .highlight {
-        padding: 10px;
-        border-radius: 5px;
-        margin: 10px 0;
-    }
-    </style>
-    """, unsafe_allow_html=True)
-# Initialize model
 @st.cache_resource
-def load_model():
-    return SentenceTransformer('all-MiniLM-L6-v2')
-model = load_model()
-def get_embedding_and_similarity(text1, text2):
-    embedding1 = model.encode(text1, convert_to_tensor=True)
-    embedding2 = model.encode(text2, convert_to_tensor=True)
-    similarity = util.pytorch_cos_sim(embedding1, embedding2).item()
-    return similarity, embedding1.cpu().numpy(), embedding2.cpu().numpy()
-def create_radar_chart(embedding1, embedding2, num_dimensions=10):
-    # Select first few dimensions for visualization
-    categories = [f"Dim {i+1}" for i in range(num_dimensions)]
-    fig = go.Figure()
-    fig.add_trace(go.Scatterpolar(
-        r=embedding1[:num_dimensions],
-        theta=categories,
-        fill='toself',
-        name='Text 1'
-    ))
-    fig.add_trace(go.Scatterpolar(
-        r=embedding2[:num_dimensions],
-        theta=categories,
-        fill='toself',
-        name='Text 2'
     ))
     fig.update_layout(
-        polar=dict(radialaxis=dict(visible=True, range=[-1, 1])),
-        showlegend=True
     )
     return fig
-# Title and Introduction
-st.title("🤗 Interactive Sentence Embeddings Explorer")
-st.markdown("""
-<p class="medium-font">
-Explore the fascinating world of sentence embeddings! This interactive tool helps you understand
-how AI models capture the meaning of text and measure similarity between sentences.
-</p>
-""", unsafe_allow_html=True)
-# Create tabs
-tab1, tab2, tab3 = st.tabs(["🔍 Compare Texts", "📚 Learn by Examples", "ℹ️ How It Works"])
-with tab1:
-    st.markdown("### Compare Any Two Texts")
-    col1, col2 = st.columns(2)
-    with col1:
-        text1 = st.text_area("First Text",
-                            height=150,
-                            value="I love programming in Python",
-                            help="Enter your first text here")
-    with col2:
-        text2 = st.text_area("Second Text",
-                            height=150,
-                            value="Python is my favorite programming language",
-                            help="Enter your second text here")
-    if st.button("Calculate Similarity", type="primary"):
-        similarity, emb1, emb2 = get_embedding_and_similarity(text1, text2)
-        # Create three columns for results
-        col1, col2, col3 = st.columns([2,1,2])
-        with col2:
-            st.markdown("### Similarity Score")
-            st.markdown(f"<h1 style='text-align: center;'>{similarity:.2f}</h1>",
-                       unsafe_allow_html=True)
-        # Progress bar and interpretation
-        st.progress(similarity)
-        if similarity > 0.8:
-            st.success("🎯 These texts are very similar!")
-        elif similarity > 0.5:
-            st.info("🤔 These texts are somewhat similar")
-        else:
-            st.warning("📊 These texts are quite different")
-        # Visualization
-        st.markdown("### Embedding Visualization")
-        st.plotly_chart(create_radar_chart(emb1, emb2), use_container_width=True)
-with tab2:
-    st.markdown("### Learn Through Examples")
-    examples = {
-        "Similar Meaning, Different Words": {
-            "text1": "The cat is sleeping on the couch",
-            "text2": "A feline is resting on the sofa",
-            "explanation": "These sentences use different words but convey the same meaning."
-        },
-        "Similar Words, Different Context": {
-            "text1": "The bank is by the river",
-            "text2": "I need to go to the bank for money",
-            "explanation": "These sentences use 'bank' in different contexts."
-        },
-        "Technical Similarity": {
-            "text1": "Python is a programming language",
-            "text2": "Java is used for coding software",
-            "explanation": "These sentences are related to programming but discuss different languages."
-        },
-        "Opposite Meanings": {
-            "text1": "The stock market is going up",
-            "text2": "The stock market is going down",
-            "explanation": "These sentences use similar words but have opposite meanings."
-        }
-    }
-    selected_example = st.selectbox("Choose an example to explore",
-                                  list(examples.keys()))
-    if st.button("Analyze Example", type="primary"):
-        example = examples[selected_example]
-        similarity, emb1, emb2 = get_embedding_and_similarity(
-            example["text1"],
-            example["text2"]
-        )
-        col1, col2 = st.columns(2)
-        with col1:
-            st.markdown("**Text 1:**")
-            st.markdown(f"<div class='highlight' style='background-color: #f0f2f6'>{example['text1']}</div>",
-                       unsafe_allow_html=True)
-        with col2:
-            st.markdown("**Text 2:**")
-            st.markdown(f"<div class='highlight' style='background-color: #f0f2f6'>{example['text2']}</div>",
-                       unsafe_allow_html=True)
-        st.markdown("**Explanation:**")
-        st.info(example["explanation"])
-        st.markdown("**Similarity Score:**")
-        st.progress(similarity)
-        st.write(f"Cosine Similarity: {similarity:.4f}")
-        st.plotly_chart(create_radar_chart(emb1, emb2), use_container_width=True)
-with tab3:
-    st.markdown("### Understanding Sentence Embeddings")
-    col1, col2 = st.columns(2)
     with col1:
-        st.markdown("""
-        #### What are Sentence Embeddings?
-        Sentence embeddings are numerical representations of text that capture semantic meaning.
-        Each sentence is converted into a vector of numbers, where similar meanings result in
-        similar vectors.
-        #### How is Similarity Calculated?
-        The similarity between two sentences is measured using cosine similarity between their
-        embedding vectors. The score ranges from -1 to 1:
-        - 1.0 = Identical meaning
-        - >0.8 = Very similar
-        - >0.5 = Somewhat similar
-        - <0.5 = Different meanings
-        """)
     with col2:
-        st.markdown("""
-        #### Current Model Details
-        This demo uses the `all-MiniLM-L6-v2` model:
-        - Embedding Size: 384 dimensions
-        - Optimized for semantic similarity
-        - Fast and efficient
-        - Good balance of performance and speed
-        #### Use Cases
-        - Semantic search
-        - Document similarity
-        - Text clustering
-        - Information retrieval
-        """)
-    with st.expander("🔬 Technical Details"):
-        st.markdown("""
-        The model processes text through these steps:
-        1. Tokenization: Breaks text into tokens
-        2. Encoding: Converts tokens to embeddings
-        3. Pooling: Combines token embeddings into sentence embedding
-        4. Similarity: Computes cosine similarity between embeddings
-        """)

 import streamlit as st
 import numpy as np
+from sentence_transformers import SentenceTransformer
+import plotly.express as px
 import plotly.graph_objects as go
+from sklearn.manifold import TSNE
+import torch
+from transformers import AutoTokenizer, AutoModel
+import pandas as pd
 # Page configuration
+st.set_page_config(layout="wide", page_title="Word & Sentence Embeddings Explorer")
 @st.cache_resource
+def load_models():
+    """Build the sentence encoder and the BERT tokenizer/model pair.
+
+    Decorated with ``st.cache_resource`` so Streamlit can reuse the returned
+    objects instead of rebuilding them on every call.
+
+    Returns:
+        tuple: ``(sent_model, word_tokenizer, word_model)`` -- the
+        ``all-MiniLM-L6-v2`` sentence transformer followed by the
+        ``bert-base-uncased`` tokenizer and model.
+    """
+    return (
+        SentenceTransformer('all-MiniLM-L6-v2'),
+        AutoTokenizer.from_pretrained('bert-base-uncased'),
+        AutoModel.from_pretrained('bert-base-uncased'),
+    )
+
+
+# Module-level handles shared by the helper functions and main().
+sent_model, word_tokenizer, word_model = load_models()
+def get_word_embeddings(text):
+    """Run ``text`` through the BERT model and return its per-token vectors.
+
+    Args:
+        text: Input string handed to ``word_tokenizer``.
+
+    Returns:
+        tuple: ``(words, word_embeddings)``. ``words`` holds the token strings
+        of the first encoded sequence exactly as the tokenizer produced them
+        (nothing is filtered here); ``word_embeddings`` is the model's
+        ``last_hidden_state`` with its leading dimension squeezed away.
+    """
+    encoded = word_tokenizer(text, return_tensors='pt', padding=True, truncation=True)
+    # Inference only, so gradient tracking is switched off for the forward pass.
+    with torch.no_grad():
+        hidden_states = word_model(**encoded).last_hidden_state
+    first_sequence_ids = encoded['input_ids'][0]
+    return (
+        word_tokenizer.convert_ids_to_tokens(first_sequence_ids),
+        hidden_states.squeeze(0),
+    )
+def create_heatmap(embeddings, words):
+    """Build a heatmap figure with one row per word and one column per dimension.
+
+    Args:
+        embeddings: 2D array whose ``shape[1]`` gives the number of embedding
+            dimensions; used directly as the heatmap's ``z`` values.
+        words: Row labels shown on the y axis.
+
+    Returns:
+        The configured ``go.Figure``.
+    """
+    # Column labels are 1-based: "Dim 1", "Dim 2", ...
+    dim_labels = [f'Dim {col}' for col in range(1, embeddings.shape[1] + 1)]
+    heatmap = go.Heatmap(z=embeddings, x=dim_labels, y=words, colorscale='Viridis')
+    fig = go.Figure(data=heatmap)
     fig.update_layout(
+        height=400,
+        title='Word Embeddings Heatmap',
+        xaxis_title='Embedding Dimensions',
+        yaxis_title='Words',
     )
     return fig
+def create_word_scatter(embeddings, words):
+    """Project word embeddings to 2D with t-SNE and plot them as labelled points.
+
+    Args:
+        embeddings: 2D array with one row per word.
+        words: Text labels, one per row of ``embeddings``.
+
+    Returns:
+        A Plotly Express scatter figure with each point labelled by its word.
+    """
+    n_samples = len(embeddings)
+    if n_samples < 2:
+        # t-SNE cannot be fitted on fewer than two points; pin whatever is
+        # there to the origin so the caller still gets a valid figure.
+        embeddings_2d = np.zeros((n_samples, 2))
+    else:
+        # scikit-learn requires perplexity < n_samples. The default of 30
+        # raises ValueError for short sentences, so clamp it; inputs with
+        # more than 30 rows keep the default value.
+        tsne = TSNE(
+            n_components=2,
+            random_state=42,
+            perplexity=min(30, n_samples - 1),
+        )
+        embeddings_2d = tsne.fit_transform(embeddings)
+    # Create scatter plot
+    fig = px.scatter(
+        x=embeddings_2d[:, 0],
+        y=embeddings_2d[:, 1],
+        text=words,
+        title='Word Embeddings in 2D Space'
+    )
+    fig.update_traces(textposition='top center')
+    fig.update_layout(height=400)
+    return fig
+def main():
+    """Render the Streamlit page for exploring word and sentence embeddings.
+
+    Lays out the title, an "about" expander, the text input and three
+    visualization toggles. For non-blank input it then draws the selected
+    word-level charts, the sentence-embedding bar chart, and the cosine
+    similarity against a second, user-supplied sentence. Blank or
+    whitespace-only input renders the controls only.
+    """
+    # The module-level import only pulls in ``SentenceTransformer``, so
+    # ``util`` must be imported here for ``util.pytorch_cos_sim`` to resolve.
+    from sentence_transformers import util
+
+    st.title("🔤 Interactive Word & Sentence Embeddings Explorer")
+    with st.expander("ℹ️ About this app", expanded=True):
+        st.markdown("""
+        This app helps you understand how words and sentences are represented in vector space:
+        - **Word-level Analysis**: See how individual words are embedded
+        - **Sentence-level Analysis**: Compare different sentences
+        - **Interactive Visualizations**: Explore embeddings through various charts
+        """)
+    col1, col2 = st.columns([2, 1])
     with col1:
+        text_input = st.text_area(
+            "Enter your text",
+            value="The quick brown fox jumps over the lazy dog",
+            height=100,
+            help="Enter any text to see its word and sentence embeddings"
+        )
     with col2:
+        st.markdown("### Visualization Options")
+        show_heatmap = st.checkbox("Show Heatmap", value=True)
+        show_scatter = st.checkbox("Show Word Scatter", value=True)
+        show_sentence = st.checkbox("Show Sentence Analysis", value=True)
+    # Whitespace-only text would yield nothing but special tokens, leaving
+    # the charts below with empty arrays, so treat it like empty input.
+    if not text_input or not text_input.strip():
+        return
+    # Get word-level embeddings
+    words, word_embeddings = get_word_embeddings(text_input)
+    word_embeddings = word_embeddings.numpy()
+    # Remove special tokens
+    mask = ~np.isin(words, ['[CLS]', '[SEP]', '[PAD]'])
+    words = [w for w, keep in zip(words, mask) if keep]
+    word_embeddings = word_embeddings[mask]
+    # Create visualizations (skipped when no real tokens survive filtering)
+    if words:
+        if show_heatmap:
+            st.plotly_chart(create_heatmap(word_embeddings, words), use_container_width=True)
+        if show_scatter:
+            st.plotly_chart(create_word_scatter(word_embeddings, words), use_container_width=True)
+    if show_sentence:
+        st.markdown("### Sentence-Level Analysis")
+        # Get sentence embedding (reused for the similarity score below)
+        sentence_embedding = sent_model.encode(text_input)
+        # Create sentence embedding visualization
+        fig = go.Figure(data=go.Bar(
+            x=list(range(len(sentence_embedding))),
+            y=sentence_embedding,
+            name='Sentence Embedding'
+        ))
+        fig.update_layout(
+            title='Sentence Embedding Vector',
+            xaxis_title='Dimension',
+            yaxis_title='Value',
+            height=300
+        )
+        st.plotly_chart(fig, use_container_width=True)
+        # Add similarity comparison
+        st.markdown("### Compare with Another Sentence")
+        compare_text = st.text_area("Enter another sentence for comparison",
+                                  value="A quick brown dog jumps over the lazy fox",
+                                  height=100)
+        if compare_text:
+            similarity = util.pytorch_cos_sim(
+                sentence_embedding,
+                sent_model.encode(compare_text)
+            ).item()
+            st.metric(
+                label="Semantic Similarity",
+                value=f"{similarity:.2f}",
+                help="1.0 = identical meaning, 0.0 = completely different"
+            )
+if __name__ == "__main__":
+    main()