Spaces:

DexterSptizu
/

sentence-transformer-visualization

Sleeping

App Files Files Community

DexterSptizu committed on Nov 4, 2024

Commit

67dc5f6

verified ·

1 Parent(s): cfcdcd6

Update app.py

Browse files

Files changed (1) hide show

app.py +141 -137

app.py CHANGED Viewed

@@ -1,180 +1,184 @@
 import streamlit as st
 import numpy as np
-from sentence_transformers import SentenceTransformer
-import plotly.express as px
 import plotly.graph_objects as go
-from sklearn.manifold import TSNE
-import torch
-from transformers import AutoTokenizer, AutoModel
 import pandas as pd
-from sentence_transformers import SentenceTransformer, util  # Added util import
 # Page configuration
-st.set_page_config(layout="wide", page_title="Word & Sentence Embeddings Explorer")
 @st.cache_resource
-def load_models():
-    sent_model = SentenceTransformer('all-MiniLM-L6-v2')
-    word_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
-    word_model = AutoModel.from_pretrained('bert-base-uncased')
-    return sent_model, word_tokenizer, word_model
-sent_model, word_tokenizer, word_model = load_models()
-def get_word_embeddings(text):
-    # Tokenize and get word embeddings
-    tokens = word_tokenizer(text, return_tensors='pt', padding=True, truncation=True)
-    with torch.no_grad():
-        outputs = word_model(**tokens)
-    word_embeddings = outputs.last_hidden_state.squeeze(0)
-    # Get original words from tokens
-    words = word_tokenizer.convert_ids_to_tokens(tokens['input_ids'][0])
-    return words, word_embeddings
-def create_heatmap(embeddings, words):
-    # Create heatmap of word embeddings
-    fig = go.Figure(data=go.Heatmap(
-        z=embeddings,
-        x=[f'Dim {i+1}' for i in range(embeddings.shape[1])],
-        y=words,
-        colorscale='Viridis'
     ))
     fig.update_layout(
-        title='Word Embeddings Heatmap',
-        xaxis_title='Embedding Dimensions',
-        yaxis_title='Words',
         height=400
     )
     return fig
-def create_word_scatter(embeddings, words):
-    # Calculate appropriate perplexity value
-    n_samples = len(embeddings)
-    # Perplexity should be between 5 and 50, and less than n_samples
-    perplexity = min(30, n_samples - 1)  # Default is 30, but ensure it's less than n_samples
-    # Reduce dimensions for visualization using t-SNE
-    tsne = TSNE(
-        n_components=2,
-        perplexity=perplexity,
-        random_state=42,
-        init='random',
-        learning_rate='auto'
-    )
-    # Perform t-SNE dimensionality reduction
-    embeddings_2d = tsne.fit_transform(embeddings)
-    # Create scatter plot
-    fig = px.scatter(
-        x=embeddings_2d[:, 0],
-        y=embeddings_2d[:, 1],
-        text=words,
-        title=f'Word Embeddings in 2D Space (perplexity={perplexity})'
-    )
-    # Update layout for better visualization
-    fig.update_traces(
-        textposition='top center',
-        mode='markers+text'
-    )
     fig.update_layout(
-        height=400,
-        showlegend=False,
-        xaxis_title="t-SNE dimension 1",
-        yaxis_title="t-SNE dimension 2"
     )
     return fig
 def main():
-    st.title("🔤 Interactive Word & Sentence Embeddings Explorer")
-    with st.expander("ℹ️ About this app", expanded=True):
         st.markdown("""
-        This app helps you understand how words and sentences are represented in vector space:
-        - **Word-level Analysis**: See how individual words are embedded
-        - **Sentence-level Analysis**: Compare different sentences
-        - **Interactive Visualizations**: Explore embeddings through various charts
         """)
-    col1, col2 = st.columns([2, 1])
     with col1:
-        text_input = st.text_area(
-            "Enter your text",
-            value="The quick brown fox jumps over the lazy dog",
-            height=100,
-            help="Enter any text to see its word and sentence embeddings"
-        )
     with col2:
-        st.markdown("### Visualization Options")
-        show_heatmap = st.checkbox("Show Heatmap", value=True)
-        show_scatter = st.checkbox("Show Word Scatter", value=True)
-        show_sentence = st.checkbox("Show Sentence Analysis", value=True)
-    if text_input:
-        # Get word-level embeddings
-        words, word_embeddings = get_word_embeddings(text_input)
-        word_embeddings = word_embeddings.numpy()
-        # Remove special tokens
-        mask = ~np.isin(words, ['[CLS]', '[SEP]', '[PAD]'])
-        words = [w for i, w in enumerate(words) if mask[i]]
-        word_embeddings = word_embeddings[mask]
-        # Create visualizations
-        if show_heatmap:
-            st.plotly_chart(create_heatmap(word_embeddings, words), use_container_width=True)
-        if show_scatter:
-            st.plotly_chart(create_word_scatter(word_embeddings, words), use_container_width=True)
-        if show_sentence:
-            st.markdown("### Sentence-Level Analysis")
-            # Get sentence embedding
-            sentence_embedding = sent_model.encode(text_input)
-            # Create sentence embedding visualization
-            fig = go.Figure(data=go.Bar(
-                x=list(range(len(sentence_embedding))),
-                y=sentence_embedding,
-                name='Sentence Embedding'
-            ))
-            fig.update_layout(
-                title='Sentence Embedding Vector',
-                xaxis_title='Dimension',
-                yaxis_title='Value',
-                height=300
-            )
-            st.plotly_chart(fig, use_container_width=True)
-            # Add similarity comparison
-            st.markdown("### Compare with Another Sentence")
-            compare_text = st.text_area("Enter another sentence for comparison",
-                                      value="A quick brown dog jumps over the lazy fox",
-                                      height=100)
-            if compare_text:
-                # Calculate similarity using the imported util
-                similarity = util.pytorch_cos_sim(
-                    sent_model.encode(text_input, convert_to_tensor=True),
-                    sent_model.encode(compare_text, convert_to_tensor=True)
-                ).item()
-                st.metric(
-                    label="Semantic Similarity",
-                    value=f"{similarity:.2f}",
-                    help="1.0 = identical meaning, 0.0 = completely different"
-                )
 if __name__ == "__main__":
     main()

 import streamlit as st
 import numpy as np
+from sentence_transformers import SentenceTransformer, util
 import plotly.graph_objects as go
+import plotly.express as px
+from typing import List, Tuple
 import pandas as pd
 # Page configuration
+# Use the wide layout and set the page title for the app.
+st.set_page_config(layout="wide", page_title="🎯 Sentence Transformer Explorer")
+# Load model
 @st.cache_resource
+def load_model():
+    """Instantiate the 'all-MiniLM-L6-v2' SentenceTransformer.
+
+    The function is wrapped in st.cache_resource (see the decorator above).
+
+    Returns:
+        The constructed SentenceTransformer instance.
+    """
+    encoder = SentenceTransformer('all-MiniLM-L6-v2')
+    return encoder
+# Module-level model instance; the embedding helpers in this file read it
+# as a global instead of taking it as a parameter.
+model = load_model()
+def get_embedding_and_similarity(sentences: List[str]) -> Tuple[np.ndarray, np.ndarray]:
+    """Encode the sentences and compute their pairwise cosine similarities.
+
+    Args:
+        sentences: Texts passed as one batch to the module-level `model`.
+
+    Returns:
+        A pair (embeddings, similarity_matrix): the output of model.encode
+        and the result of util.cos_sim on those embeddings against
+        themselves, converted with .numpy().
+    """
+    vectors = model.encode(sentences)
+    pairwise = util.cos_sim(vectors, vectors)
+    return vectors, pairwise.numpy()
+def create_word_importance_visualization(sentence: str, embedding: np.ndarray):
+    """Build a bar chart with one bar per whitespace-separated word.
+
+    Each word is encoded on its own with the module-level `model`; the bar
+    height is the mean absolute value of that word's embedding. This is a
+    simple magnitude heuristic, not an attribution of the sentence
+    embedding to its words.
+
+    Args:
+        sentence: Text that is split into words with str.split().
+        embedding: Sentence embedding. Kept for interface compatibility;
+            the computation does not read it.
+
+    Returns:
+        A plotly Figure with a single Bar trace (no bars when `sentence`
+        contains no words).
+    """
+    words = sentence.split()
+    if words:
+        word_embeddings = model.encode(words)
+        # Calculate each word's average contribution
+        word_importance = np.mean(np.abs(word_embeddings), axis=1)
+    else:
+        # Empty or whitespace-only sentence: skip the model call so the
+        # axis=1 reduction is never applied to an empty encode result.
+        word_importance = np.zeros(0)
+    # Plot against positions instead of the words themselves: repeated words
+    # would otherwise fall on the same x category and their bars would overlap.
+    positions = list(range(len(words)))
+    fig = go.Figure()
+    # Add word bars
+    fig.add_trace(go.Bar(
+        x=positions,
+        y=word_importance,
+        marker_color='rgb(158,202,225)',
+        text=np.round(word_importance, 3),
+        textposition='auto',
+        hovertext=words,
     ))
     fig.update_layout(
+        title="Word Importance in Embedding",
+        xaxis_title="Words",
+        yaxis_title="Average Contribution",
         height=400
     )
+    # Show the words as tick labels at their positions.
+    fig.update_xaxes(tickmode='array', tickvals=positions, ticktext=words)
     return fig
+def create_similarity_heatmap(sentences: List[str], similarity_matrix: np.ndarray):
+    """Draw the sentence-by-sentence similarity matrix as an annotated heatmap.
+
+    Args:
+        sentences: Labels used for both the x and the y axis.
+        similarity_matrix: Values plotted as the heatmap cells; each cell is
+            annotated with its value rounded to three decimals.
+
+    Returns:
+        A plotly Figure containing one Heatmap trace.
+    """
+    cell_labels = np.round(similarity_matrix, 3)
+    heatmap = go.Heatmap(
+        z=similarity_matrix,
+        x=sentences,
+        y=sentences,
+        colorscale='RdBu',
+        text=cell_labels,
+        texttemplate='%{text}',
+        textfont=dict(size=10),
+        hoverongaps=False,
+    )
+    fig = go.Figure(data=heatmap)
+    fig.update_layout(title="Sentence Similarity Matrix", height=400)
     return fig
 def main():
+    """Render the Streamlit page.
+
+    Shows the introduction, lets the user pick an example template and edit
+    between two and five sentences, and on "Analyze Sentences" displays three
+    tabs: per-word bar charts, the similarity heatmap with a textual
+    interpretation of every sentence pair, and a table of embedding
+    statistics.
+    """
+    st.title("🎯 Interactive Sentence Transformer Explorer")
+    with st.expander("ℹ️ How it works", expanded=True):
         st.markdown("""
+        This interactive tool helps you understand how Sentence Transformers work:
+        1. **Sentence Embedding**: Convert sentences into numerical vectors
+        2. **Word Importance**: See how each word contributes to the final embedding
+        3. **Similarity Analysis**: Compare how similar sentences are to each other
+        4. **Interactive Examples**: Try different sentences and see the results
         """)
+    # Interactive sentence input
+    st.subheader("🔤 Enter Your Sentences")
+    col1, col2 = st.columns(2)
     with col1:
+        # Example templates
+        example_templates = {
+            "Similar Meanings": [
+                "I love programming in Python",
+                "Coding with Python is my favorite",
+                "I enjoy developing software using Python"
+            ],
+            "Different Topics": [
+                "The cat sleeps on the mat",
+                "Python is a programming language",
+                "The weather is beautiful today"
+            ],
+            "Semantic Relations": [
+                "Paris is the capital of France",
+                "Berlin is the capital of Germany",
+                "London is the capital of England"
+            ]
+        }
+        selected_template = st.selectbox("Choose an example template:",
+                                       list(example_templates.keys()))
     with col2:
+        # st.button is True only on the rerun caused by the click, so the
+        # chosen example is kept in session state; otherwise it would be
+        # lost on the next rerun (e.g. when "Analyze Sentences" is pressed).
+        if st.button("Load Example"):
+            st.session_state["example_sentences"] = list(example_templates[selected_template])
+    # Defaults shown in the inputs: the last loaded example, or a fixed
+    # starter set. Kept under its own name so the list of entered sentences
+    # built below does not overwrite it.
+    default_sentences = st.session_state.get("example_sentences", [
+        "I love programming in Python",
+        "Coding with Python is my favorite",
+        "The weather is beautiful today",
+    ])
+    # Dynamic sentence input
+    num_sentences = st.slider("Number of sentences:", 2, 5, 3)
+    sentences = []
+    for i in range(num_sentences):
+        default_text = default_sentences[i] if i < len(default_sentences) else ""
+        sentences.append(st.text_input(f"Sentence {i+1}", value=default_text))
+    if st.button("Analyze Sentences", type="primary"):
+        # Whitespace-only input counts as missing.
+        if all(s.strip() for s in sentences):
+            embeddings, similarity_matrix = get_embedding_and_similarity(sentences)
+            st.subheader("📊 Analysis Results")
+            # Create tabs for different visualizations
+            tab1, tab2, tab3 = st.tabs(["Word Importance", "Sentence Similarity", "Embedding Space"])
+            with tab1:
+                st.markdown("### 🔍 Word-Level Analysis")
+                for i, sentence in enumerate(sentences):
+                    st.markdown(f"**Sentence {i+1}:** {sentence}")
+                    fig = create_word_importance_visualization(sentence, embeddings[i])
+                    st.plotly_chart(fig, use_container_width=True)
+            with tab2:
+                st.markdown("### 🤝 Sentence Similarity Analysis")
+                fig = create_similarity_heatmap(sentences, similarity_matrix)
+                st.plotly_chart(fig, use_container_width=True)
+                # Add similarity interpretation
+                st.markdown("#### 💡 Interpretation")
+                # Visit each unordered pair (i, j) with i < j exactly once.
+                for i in range(len(sentences)):
+                    for j in range(i+1, len(sentences)):
+                        similarity = similarity_matrix[i][j]
+                        interpretation = (
+                            "Very similar" if similarity > 0.8
+                            else "Moderately similar" if similarity > 0.5
+                            else "Different"
+                        )
+                        # Label the pair with both 1-based indices (i+1, j+1).
+                        st.write(f"Sentences {i+1} & {j+1}: {interpretation} ({similarity:.3f})")
+            with tab3:
+                st.markdown("### 🎯 Interactive Embedding Analysis")
+                # Create embedding statistics
+                embedding_stats = pd.DataFrame({
+                    'Sentence': sentences,
+                    'Embedding_Length': [np.linalg.norm(emb) for emb in embeddings],
+                    'Mean_Value': [np.mean(emb) for emb in embeddings],
+                    'Std_Dev': [np.std(emb) for emb in embeddings]
+                })
+                st.dataframe(embedding_stats)
+                st.markdown("""
+                #### 📝 Understanding Embeddings
+                - **Embedding Length**: Represents the magnitude of the vector
+                - **Mean Value**: Average of all dimensions
+                - **Standard Deviation**: Spread of values across dimensions
+                """)
+        else:
+            st.warning("Please enter all sentences before analyzing.")
+# Entry point: build the page when this file is run as the main module.
 if __name__ == "__main__":
     main()