DexterSptizu committed on
Commit
9fba660
·
verified ·
1 Parent(s): ac1c59d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +127 -201
app.py CHANGED
@@ -1,228 +1,154 @@
1
  import streamlit as st
2
  import numpy as np
3
- from sentence_transformers import SentenceTransformer, util
 
4
  import plotly.graph_objects as go
 
 
 
 
5
 
6
  # Page configuration
7
# Wide layout with a custom browser-tab title and icon.
st.set_page_config(page_title="Sentence Embeddings Explorer", page_icon="🤗", layout="wide")

# Stylesheet injected into the page: spaces out the tab bar and defines the
# helper classes (big-font, medium-font, highlight) used by later markup.
_CUSTOM_CSS = """
<style>
.stTabs [data-baseweb="tab-list"] {
gap: 24px;
}
.stTabs [data-baseweb="tab"] {
height: 50px;
padding-left: 20px;
padding-right: 20px;
}
.big-font {
font-size:20px !important;
font-weight: bold;
}
.medium-font {
font-size:16px !important;
}
.highlight {
padding: 10px;
border-radius: 5px;
margin: 10px 0;
}
</style>
"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
38
-
39
# Model initialisation
@st.cache_resource
def load_model():
    """Build the `all-MiniLM-L6-v2` sentence-transformer used by the app.

    The function is wrapped in ``st.cache_resource``.

    Returns:
        The constructed ``SentenceTransformer`` instance.
    """
    encoder = SentenceTransformer('all-MiniLM-L6-v2')
    return encoder
43
-
44
# Module-level encoder shared by the helper functions below.
model = load_model()
 
45
 
46
def get_embedding_and_similarity(text1, text2):
    """Embed two texts and score how close they are.

    Args:
        text1: First text to encode.
        text2: Second text to encode.

    Returns:
        A tuple ``(similarity, embedding1, embedding2)``: the cosine
        similarity as a Python float, followed by both embeddings converted
        to NumPy arrays.
    """
    first_vec, second_vec = (
        model.encode(sentence, convert_to_tensor=True) for sentence in (text1, text2)
    )
    score = util.pytorch_cos_sim(first_vec, second_vec).item()
    # Move to CPU before converting so the arrays are usable by plotting code.
    return score, first_vec.cpu().numpy(), second_vec.cpu().numpy()
51
 
52
def create_radar_chart(embedding1, embedding2, num_dimensions=10):
    """Overlay the leading dimensions of two embeddings on a polar chart.

    Args:
        embedding1: Sliceable vector for the first text.
        embedding2: Sliceable vector for the second text.
        num_dimensions: How many leading dimensions to plot (default 10).

    Returns:
        A Plotly figure with one filled ``Scatterpolar`` trace per text.
    """
    axis_labels = ["Dim " + str(idx) for idx in range(1, num_dimensions + 1)]

    fig = go.Figure()
    for trace_name, vector in (("Text 1", embedding1), ("Text 2", embedding2)):
        fig.add_trace(
            go.Scatterpolar(
                r=vector[:num_dimensions],
                theta=axis_labels,
                fill="toself",
                name=trace_name,
            )
        )

    # Fixed radial range so charts from different text pairs share a scale.
    radial_axis = dict(visible=True, range=[-1, 1])
    fig.update_layout(polar=dict(radialaxis=radial_axis), showlegend=True)
    return fig
75
 
76
# Title and introduction
st.title("🤗 Interactive Sentence Embeddings Explorer")
st.markdown("""
<p class="medium-font">
Explore the fascinating world of sentence embeddings! This interactive tool helps you understand
how AI models capture the meaning of text and measure similarity between sentences.
</p>
""", unsafe_allow_html=True)


def _progress_fraction(score):
    """Clamp a cosine similarity into the [0.0, 1.0] range a progress bar accepts.

    Cosine similarity can be negative for unrelated texts and can land a hair
    above 1.0 for identical texts because of floating-point rounding; passing
    such a value straight to ``st.progress`` raises an error.
    """
    return min(max(float(score), 0.0), 1.0)


# Three top-level tabs: free comparison, canned examples, and explanation.
tab1, tab2, tab3 = st.tabs(["🔍 Compare Texts", "📚 Learn by Examples", "ℹ️ How It Works"])

with tab1:
    st.markdown("### Compare Any Two Texts")
    col1, col2 = st.columns(2)

    with col1:
        text1 = st.text_area(
            "First Text",
            height=150,
            value="I love programming in Python",
            help="Enter your first text here",
        )

    with col2:
        text2 = st.text_area(
            "Second Text",
            height=150,
            value="Python is my favorite programming language",
            help="Enter your second text here",
        )

    if st.button("Calculate Similarity", type="primary"):
        similarity, emb1, emb2 = get_embedding_and_similarity(text1, text2)

        # Three columns so the score sits centred in the narrow middle one.
        col1, col2, col3 = st.columns([2, 1, 2])

        with col2:
            st.markdown("### Similarity Score")
            st.markdown(
                f"<h1 style='text-align: center;'>{similarity:.2f}</h1>",
                unsafe_allow_html=True,
            )

        # Progress bar (clamped to the accepted range) and interpretation
        st.progress(_progress_fraction(similarity))

        if similarity > 0.8:
            st.success("🎯 These texts are very similar!")
        elif similarity > 0.5:
            st.info("🤔 These texts are somewhat similar")
        else:
            st.warning("📊 These texts are quite different")

        # Visualization
        st.markdown("### Embedding Visualization")
        st.plotly_chart(create_radar_chart(emb1, emb2), use_container_width=True)

with tab2:
    st.markdown("### Learn Through Examples")

    # Curated sentence pairs, each with a short note on what it illustrates.
    examples = {
        "Similar Meaning, Different Words": {
            "text1": "The cat is sleeping on the couch",
            "text2": "A feline is resting on the sofa",
            "explanation": "These sentences use different words but convey the same meaning."
        },
        "Similar Words, Different Context": {
            "text1": "The bank is by the river",
            "text2": "I need to go to the bank for money",
            "explanation": "These sentences use 'bank' in different contexts."
        },
        "Technical Similarity": {
            "text1": "Python is a programming language",
            "text2": "Java is used for coding software",
            "explanation": "These sentences are related to programming but discuss different languages."
        },
        "Opposite Meanings": {
            "text1": "The stock market is going up",
            "text2": "The stock market is going down",
            "explanation": "These sentences use similar words but have opposite meanings."
        }
    }

    selected_example = st.selectbox("Choose an example to explore",
                                    list(examples.keys()))

    if st.button("Analyze Example", type="primary"):
        example = examples[selected_example]
        similarity, emb1, emb2 = get_embedding_and_similarity(
            example["text1"],
            example["text2"]
        )

        col1, col2 = st.columns(2)
        with col1:
            st.markdown("**Text 1:**")
            st.markdown(f"<div class='highlight' style='background-color: #f0f2f6'>{example['text1']}</div>",
                        unsafe_allow_html=True)
        with col2:
            st.markdown("**Text 2:**")
            st.markdown(f"<div class='highlight' style='background-color: #f0f2f6'>{example['text2']}</div>",
                        unsafe_allow_html=True)

        st.markdown("**Explanation:**")
        st.info(example["explanation"])

        st.markdown("**Similarity Score:**")
        # Clamped for the same reason as in the first tab.
        st.progress(_progress_fraction(similarity))
        st.write(f"Cosine Similarity: {similarity:.4f}")

        st.plotly_chart(create_radar_chart(emb1, emb2), use_container_width=True)

with tab3:
    st.markdown("### Understanding Sentence Embeddings")

    col1, col2 = st.columns(2)

    with col1:
        st.markdown("""
#### What are Sentence Embeddings?
Sentence embeddings are numerical representations of text that capture semantic meaning.
Each sentence is converted into a vector of numbers, where similar meanings result in
similar vectors.

#### How is Similarity Calculated?
The similarity between two sentences is measured using cosine similarity between their
embedding vectors. The score ranges from -1 to 1:
- 1.0 = Identical meaning
- >0.8 = Very similar
- >0.5 = Somewhat similar
- <0.5 = Different meanings
""")

    with col2:
        st.markdown("""
#### Current Model Details
This demo uses the `all-MiniLM-L6-v2` model:
- Embedding Size: 384 dimensions
- Optimized for semantic similarity
- Fast and efficient
- Good balance of performance and speed

#### Use Cases
- Semantic search
- Document similarity
- Text clustering
- Information retrieval
""")

    with st.expander("🔬 Technical Details"):
        st.markdown("""
The model processes text through these steps:
1. Tokenization: Breaks text into tokens
2. Encoding: Converts tokens to embeddings
3. Pooling: Combines token embeddings into sentence embedding
4. Similarity: Computes cosine similarity between embeddings
""")
 
1
  import streamlit as st
2
  import numpy as np
3
+ from sentence_transformers import SentenceTransformer
4
+ import plotly.express as px
5
  import plotly.graph_objects as go
6
+ from sklearn.manifold import TSNE
7
+ import torch
8
+ from transformers import AutoTokenizer, AutoModel
9
+ import pandas as pd
10
 
11
  # Page configuration
12
# Use the full browser width and set the browser-tab title.
st.set_page_config(
    page_title="Word & Sentence Embeddings Explorer",
    layout="wide",
)
 
 
 
 
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
@st.cache_resource
def load_models():
    """Load the sentence encoder and the BERT tokenizer/model pair.

    The function is wrapped in ``st.cache_resource``.

    Returns:
        A tuple ``(sent_model, word_tokenizer, word_model)``: the
        ``all-MiniLM-L6-v2`` sentence transformer, then the
        ``bert-base-uncased`` tokenizer and model.
    """
    bert_checkpoint = 'bert-base-uncased'
    return (
        SentenceTransformer('all-MiniLM-L6-v2'),
        AutoTokenizer.from_pretrained(bert_checkpoint),
        AutoModel.from_pretrained(bert_checkpoint),
    )
20
 
21
# Module-level handles used by the embedding helpers and by main().
sent_model, word_tokenizer, word_model = load_models()
 
 
 
 
22
 
23
def get_word_embeddings(text):
    """Run ``text`` through the BERT model and return per-token vectors.

    Args:
        text: Input passed to the module-level ``word_tokenizer``.

    Returns:
        A tuple ``(words, word_embeddings)``. ``words`` holds the token
        strings of the first encoded sequence exactly as the tokenizer
        produced them (the caller is responsible for dropping special
        tokens). ``word_embeddings`` is the model's last hidden state with
        the leading axis squeezed out.
    """
    encoded = word_tokenizer(text, return_tensors='pt', padding=True, truncation=True)

    # Inference only, so skip gradient tracking.
    with torch.no_grad():
        hidden_states = word_model(**encoded).last_hidden_state

    token_ids = encoded['input_ids'][0]
    words = word_tokenizer.convert_ids_to_tokens(token_ids)
    return words, hidden_states.squeeze(0)
34
+
35
def create_heatmap(embeddings, words):
    """Build a heatmap with one row per token and one column per dimension.

    Args:
        embeddings: 2-D array-like; ``embeddings.shape[1]`` gives the number
            of columns to label.
        words: Row labels, one per embedding row.

    Returns:
        A Plotly figure containing a single Viridis-coloured heatmap.
    """
    dimension_labels = ['Dim ' + str(col) for col in range(1, embeddings.shape[1] + 1)]
    heatmap = go.Heatmap(
        z=embeddings,
        x=dimension_labels,
        y=words,
        colorscale='Viridis',
    )

    fig = go.Figure(data=heatmap)
    fig.update_layout(
        title='Word Embeddings Heatmap',
        xaxis_title='Embedding Dimensions',
        yaxis_title='Words',
        height=400,
    )
    return fig
51
 
52
def create_word_scatter(embeddings, words):
    """Project token embeddings to 2-D and plot them as a labelled scatter.

    Args:
        embeddings: 2-D array-like with one row per token.
        words: Text labels, one per embedding row.

    Returns:
        A Plotly Express scatter figure with each point labelled by its token.
    """
    n_samples = len(embeddings)

    if n_samples < 3:
        # Too few points for a meaningful t-SNE fit (and zero rows would make
        # it fail outright), so lay the tokens out on a line instead.
        embeddings_2d = np.column_stack(
            (np.arange(n_samples, dtype=float), np.zeros(n_samples))
        )
    else:
        # scikit-learn rejects a perplexity that is not smaller than the
        # number of samples; the default of 30 is larger than most short
        # sentences, so cap it. Inputs with more than 30 tokens keep the
        # default value and therefore the previous behaviour.
        perplexity = min(30.0, n_samples - 1)
        tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
        embeddings_2d = tsne.fit_transform(embeddings)

    # Create scatter plot
    fig = px.scatter(
        x=embeddings_2d[:, 0],
        y=embeddings_2d[:, 1],
        text=words,
        title='Word Embeddings in 2D Space'
    )

    # Put labels above the markers so they do not cover the points.
    fig.update_traces(textposition='top center')
    fig.update_layout(height=400)
    return fig
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
 
69
def main():
    """Render the page: text input, word-level charts, and sentence analysis.

    Reads the module-level ``sent_model`` and calls ``get_word_embeddings``,
    ``create_heatmap`` and ``create_word_scatter``. Returns nothing; all
    output goes through Streamlit calls.
    """
    # The file-level import only brings in SentenceTransformer, so `util`
    # would otherwise be undefined when the similarity is computed below.
    from sentence_transformers import util

    st.title("🔤 Interactive Word & Sentence Embeddings Explorer")

    with st.expander("ℹ️ About this app", expanded=True):
        st.markdown("""
This app helps you understand how words and sentences are represented in vector space:
- **Word-level Analysis**: See how individual words are embedded
- **Sentence-level Analysis**: Compare different sentences
- **Interactive Visualizations**: Explore embeddings through various charts
""")

    col1, col2 = st.columns([2, 1])

    with col1:
        text_input = st.text_area(
            "Enter your text",
            value="The quick brown fox jumps over the lazy dog",
            height=100,
            help="Enter any text to see its word and sentence embeddings"
        )

    with col2:
        st.markdown("### Visualization Options")
        show_heatmap = st.checkbox("Show Heatmap", value=True)
        show_scatter = st.checkbox("Show Word Scatter", value=True)
        show_sentence = st.checkbox("Show Sentence Analysis", value=True)

    # Nothing to analyse until the user has typed something.
    if not text_input:
        return

    # Word-level embeddings
    words, word_embeddings = get_word_embeddings(text_input)
    word_embeddings = word_embeddings.numpy()

    # Drop the tokenizer's special tokens so only real tokens are plotted.
    special_tokens = {'[CLS]', '[SEP]', '[PAD]'}
    keep = np.array([token not in special_tokens for token in words], dtype=bool)
    words = [token for token, kept in zip(words, keep) if kept]
    word_embeddings = word_embeddings[keep]

    if words:
        if show_heatmap:
            st.plotly_chart(create_heatmap(word_embeddings, words), use_container_width=True)

        if show_scatter:
            st.plotly_chart(create_word_scatter(word_embeddings, words), use_container_width=True)
    elif show_heatmap or show_scatter:
        # E.g. whitespace-only input: only special tokens were produced.
        st.info("No word tokens to visualize for this input.")

    if not show_sentence:
        return

    st.markdown("### Sentence-Level Analysis")

    # Sentence embedding, shown as one bar per dimension
    sentence_embedding = sent_model.encode(text_input)

    fig = go.Figure(data=go.Bar(
        x=list(range(len(sentence_embedding))),
        y=sentence_embedding,
        name='Sentence Embedding'
    ))

    fig.update_layout(
        title='Sentence Embedding Vector',
        xaxis_title='Dimension',
        yaxis_title='Value',
        height=300
    )

    st.plotly_chart(fig, use_container_width=True)

    # Similarity comparison against a second sentence
    st.markdown("### Compare with Another Sentence")
    compare_text = st.text_area("Enter another sentence for comparison",
                                value="A quick brown dog jumps over the lazy fox",
                                height=100)

    if compare_text:
        # Reuse the embedding computed above instead of encoding the same
        # text a second time.
        similarity = util.cos_sim(
            sentence_embedding,
            sent_model.encode(compare_text)
        ).item()

        st.metric(
            label="Semantic Similarity",
            value=f"{similarity:.2f}",
            help="1.0 = identical meaning, 0.0 = completely different"
        )
152
 
153
# Script entry point: build the page only when executed directly.
if __name__ == "__main__":
    main()