Update app.py
Browse files
app.py
CHANGED
@@ -1,228 +1,154 @@
|
|
1 |
import streamlit as st
|
2 |
import numpy as np
|
3 |
-
from sentence_transformers import SentenceTransformer
|
|
|
4 |
import plotly.graph_objects as go
|
|
|
|
|
|
|
|
|
5 |
|
6 |
# Page configuration
|
7 |
-
st.set_page_config(
|
8 |
-
page_title="Sentence Embeddings Explorer",
|
9 |
-
page_icon="🤗",
|
10 |
-
layout="wide"
|
11 |
-
)
|
12 |
|
13 |
-
# Custom CSS
|
14 |
-
st.markdown("""
|
15 |
-
<style>
|
16 |
-
.stTabs [data-baseweb="tab-list"] {
|
17 |
-
gap: 24px;
|
18 |
-
}
|
19 |
-
.stTabs [data-baseweb="tab"] {
|
20 |
-
height: 50px;
|
21 |
-
padding-left: 20px;
|
22 |
-
padding-right: 20px;
|
23 |
-
}
|
24 |
-
.big-font {
|
25 |
-
font-size:20px !important;
|
26 |
-
font-weight: bold;
|
27 |
-
}
|
28 |
-
.medium-font {
|
29 |
-
font-size:16px !important;
|
30 |
-
}
|
31 |
-
.highlight {
|
32 |
-
padding: 10px;
|
33 |
-
border-radius: 5px;
|
34 |
-
margin: 10px 0;
|
35 |
-
}
|
36 |
-
</style>
|
37 |
-
""", unsafe_allow_html=True)
|
38 |
-
|
39 |
-
# Initialize model
|
40 |
@st.cache_resource
|
41 |
-
def
|
42 |
-
|
43 |
-
|
44 |
-
|
|
|
45 |
|
46 |
-
|
47 |
-
embedding1 = model.encode(text1, convert_to_tensor=True)
|
48 |
-
embedding2 = model.encode(text2, convert_to_tensor=True)
|
49 |
-
similarity = util.pytorch_cos_sim(embedding1, embedding2).item()
|
50 |
-
return similarity, embedding1.cpu().numpy(), embedding2.cpu().numpy()
|
51 |
|
52 |
-
def
|
53 |
-
#
|
54 |
-
|
|
|
|
|
|
|
55 |
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
fig.
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
))
|
69 |
|
70 |
fig.update_layout(
|
71 |
-
|
72 |
-
|
|
|
|
|
73 |
)
|
74 |
return fig
|
75 |
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
Explore the fascinating world of sentence embeddings! This interactive tool helps you understand
|
81 |
-
how AI models capture the meaning of text and measure similarity between sentences.
|
82 |
-
</p>
|
83 |
-
""", unsafe_allow_html=True)
|
84 |
-
|
85 |
-
# Create tabs
|
86 |
-
tab1, tab2, tab3 = st.tabs(["🔍 Compare Texts", "📚 Learn by Examples", "ℹ️ How It Works"])
|
87 |
-
|
88 |
-
with tab1:
|
89 |
-
st.markdown("### Compare Any Two Texts")
|
90 |
-
col1, col2 = st.columns(2)
|
91 |
-
|
92 |
-
with col1:
|
93 |
-
text1 = st.text_area("First Text",
|
94 |
-
height=150,
|
95 |
-
value="I love programming in Python",
|
96 |
-
help="Enter your first text here")
|
97 |
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
similarity, emb1, emb2 = get_embedding_and_similarity(text1, text2)
|
106 |
-
|
107 |
-
# Create three columns for results
|
108 |
-
col1, col2, col3 = st.columns([2,1,2])
|
109 |
-
|
110 |
-
with col2:
|
111 |
-
st.markdown("### Similarity Score")
|
112 |
-
st.markdown(f"<h1 style='text-align: center;'>{similarity:.2f}</h1>",
|
113 |
-
unsafe_allow_html=True)
|
114 |
-
|
115 |
-
# Progress bar and interpretation
|
116 |
-
st.progress(similarity)
|
117 |
-
|
118 |
-
if similarity > 0.8:
|
119 |
-
st.success("🎯 These texts are very similar!")
|
120 |
-
elif similarity > 0.5:
|
121 |
-
st.info("🤔 These texts are somewhat similar")
|
122 |
-
else:
|
123 |
-
st.warning("📊 These texts are quite different")
|
124 |
-
|
125 |
-
# Visualization
|
126 |
-
st.markdown("### Embedding Visualization")
|
127 |
-
st.plotly_chart(create_radar_chart(emb1, emb2), use_container_width=True)
|
128 |
-
|
129 |
-
with tab2:
|
130 |
-
st.markdown("### Learn Through Examples")
|
131 |
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
"text2": "A feline is resting on the sofa",
|
136 |
-
"explanation": "These sentences use different words but convey the same meaning."
|
137 |
-
},
|
138 |
-
"Similar Words, Different Context": {
|
139 |
-
"text1": "The bank is by the river",
|
140 |
-
"text2": "I need to go to the bank for money",
|
141 |
-
"explanation": "These sentences use 'bank' in different contexts."
|
142 |
-
},
|
143 |
-
"Technical Similarity": {
|
144 |
-
"text1": "Python is a programming language",
|
145 |
-
"text2": "Java is used for coding software",
|
146 |
-
"explanation": "These sentences are related to programming but discuss different languages."
|
147 |
-
},
|
148 |
-
"Opposite Meanings": {
|
149 |
-
"text1": "The stock market is going up",
|
150 |
-
"text2": "The stock market is going down",
|
151 |
-
"explanation": "These sentences use similar words but have opposite meanings."
|
152 |
-
}
|
153 |
-
}
|
154 |
|
155 |
-
|
156 |
-
|
157 |
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
col1, col2 = st.columns(2)
|
166 |
-
with col1:
|
167 |
-
st.markdown("**Text 1:**")
|
168 |
-
st.markdown(f"<div class='highlight' style='background-color: #f0f2f6'>{example['text1']}</div>",
|
169 |
-
unsafe_allow_html=True)
|
170 |
-
with col2:
|
171 |
-
st.markdown("**Text 2:**")
|
172 |
-
st.markdown(f"<div class='highlight' style='background-color: #f0f2f6'>{example['text2']}</div>",
|
173 |
-
unsafe_allow_html=True)
|
174 |
-
|
175 |
-
st.markdown("**Explanation:**")
|
176 |
-
st.info(example["explanation"])
|
177 |
-
|
178 |
-
st.markdown("**Similarity Score:**")
|
179 |
-
st.progress(similarity)
|
180 |
-
st.write(f"Cosine Similarity: {similarity:.4f}")
|
181 |
-
|
182 |
-
st.plotly_chart(create_radar_chart(emb1, emb2), use_container_width=True)
|
183 |
-
|
184 |
-
with tab3:
|
185 |
-
st.markdown("### Understanding Sentence Embeddings")
|
186 |
|
187 |
-
col1, col2 = st.columns(2)
|
188 |
|
189 |
with col1:
|
190 |
-
st.
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
|
|
195 |
|
196 |
-
#### How is Similarity Calculated?
|
197 |
-
The similarity between two sentences is measured using cosine similarity between their
|
198 |
-
embedding vectors. The score ranges from -1 to 1:
|
199 |
-
- 1.0 = Identical meaning
|
200 |
-
- >0.8 = Very similar
|
201 |
-
- >0.5 = Somewhat similar
|
202 |
-
- <0.5 = Different meanings
|
203 |
-
""")
|
204 |
-
|
205 |
with col2:
|
206 |
-
st.markdown(""
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
|
|
|
|
|
213 |
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
220 |
|
221 |
-
|
222 |
-
|
223 |
-
The model processes text through these steps:
|
224 |
-
1. Tokenization: Breaks text into tokens
|
225 |
-
2. Encoding: Converts tokens to embeddings
|
226 |
-
3. Pooling: Combines token embeddings into sentence embedding
|
227 |
-
4. Similarity: Computes cosine similarity between embeddings
|
228 |
-
""")
|
|
|
1 |
import streamlit as st
|
2 |
import numpy as np
|
3 |
+
from sentence_transformers import SentenceTransformer
|
4 |
+
import plotly.express as px
|
5 |
import plotly.graph_objects as go
|
6 |
+
from sklearn.manifold import TSNE
|
7 |
+
import torch
|
8 |
+
from transformers import AutoTokenizer, AutoModel
|
9 |
+
import pandas as pd
|
10 |
|
11 |
# Page configuration
|
12 |
+
st.set_page_config(layout="wide", page_title="Word & Sentence Embeddings Explorer")
|
|
|
|
|
|
|
|
|
13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
@st.cache_resource
def load_models():
    """Instantiate the sentence encoder and the BERT tokenizer/model pair.

    The function is wrapped in ``st.cache_resource`` so the returned objects
    can be shared across Streamlit reruns instead of being rebuilt each time.

    Returns:
        tuple: ``(sent_model, word_tokenizer, word_model)`` in that order.
    """
    bert_checkpoint = 'bert-base-uncased'
    return (
        SentenceTransformer('all-MiniLM-L6-v2'),
        AutoTokenizer.from_pretrained(bert_checkpoint),
        AutoModel.from_pretrained(bert_checkpoint),
    )
|
20 |
|
21 |
+
# Module-level handles used by get_word_embeddings() and main() below.
sent_model, word_tokenizer, word_model = load_models()
|
|
|
|
|
|
|
|
|
22 |
|
23 |
+
def get_word_embeddings(text):
    """Run ``text`` through the BERT word model and return per-token vectors.

    Args:
        text: The input text handed to ``word_tokenizer``.

    Returns:
        tuple: ``(words, word_embeddings)``. ``words`` holds the token strings
        of the first encoded sequence, including any special tokens the
        tokenizer adds. ``word_embeddings`` is the model's last hidden state
        with the leading axis squeezed out.
    """
    encoded = word_tokenizer(text, return_tensors='pt', padding=True, truncation=True)

    # Inference only, so gradient tracking is switched off for the forward pass.
    with torch.no_grad():
        hidden_states = word_model(**encoded).last_hidden_state

    token_strings = word_tokenizer.convert_ids_to_tokens(encoded['input_ids'][0])
    return token_strings, hidden_states.squeeze(0)
|
34 |
+
|
35 |
+
def create_heatmap(embeddings, words):
    """Build a Plotly heatmap of embedding values, one row per word.

    Args:
        embeddings: 2-D array-like exposing ``.shape``; its second axis
            determines how many dimension columns are labelled.
        words: Row labels, one per entry along the first axis of ``embeddings``.

    Returns:
        The configured ``go.Figure``.
    """
    dimension_labels = [f'Dim {idx}' for idx in range(1, embeddings.shape[1] + 1)]
    heatmap_trace = go.Heatmap(
        z=embeddings,
        x=dimension_labels,
        y=words,
        colorscale='Viridis',
    )

    fig = go.Figure(data=heatmap_trace)
    fig.update_layout(
        title='Word Embeddings Heatmap',
        xaxis_title='Embedding Dimensions',
        yaxis_title='Words',
        height=400,
    )
    return fig
|
51 |
|
52 |
+
def create_word_scatter(embeddings, words):
    """Project word embeddings to 2-D with t-SNE and plot them as labelled points.

    scikit-learn's ``TSNE`` requires ``perplexity < n_samples``. Its default
    perplexity of 30 therefore raises ``ValueError`` for any text with 30 or
    fewer tokens, which includes the app's default sentence. The perplexity
    is clamped to the number of points so short inputs work; inputs with more
    than 30 points keep the default and behave exactly as before.

    Args:
        embeddings: 2-D array of shape ``(n_words, n_dims)``.
        words: Labels for the points, one per row of ``embeddings``.

    Returns:
        A Plotly figure with one text-labelled marker per word.
    """
    n_samples = len(embeddings)

    if n_samples < 2:
        # t-SNE needs at least two points; place the lone point (or none)
        # at the origin instead of failing.
        embeddings_2d = np.zeros((n_samples, 2))
    else:
        # Keep the library default (30) whenever it is valid, otherwise use
        # the largest perplexity allowed for this sample count.
        perplexity = min(30.0, float(n_samples - 1))
        tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
        embeddings_2d = tsne.fit_transform(embeddings)

    # Create scatter plot
    fig = px.scatter(
        x=embeddings_2d[:, 0],
        y=embeddings_2d[:, 1],
        text=words,
        title='Word Embeddings in 2D Space',
    )

    fig.update_traces(textposition='top center')
    fig.update_layout(height=400)
    return fig
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
68 |
|
69 |
+
def main():
    """Render the Streamlit page.

    Lays out the text input and visualization toggles, then (for non-blank
    input) shows the word-level heatmap and scatter plot, the sentence
    embedding bar chart, and a cosine-similarity comparison against a second
    sentence.
    """
    # The module-level import only brings in SentenceTransformer, so `util`
    # would be undefined at the similarity step; import it here.
    from sentence_transformers import util

    st.title("🔤 Interactive Word & Sentence Embeddings Explorer")

    with st.expander("ℹ️ About this app", expanded=True):
        st.markdown("""
        This app helps you understand how words and sentences are represented in vector space:
        - **Word-level Analysis**: See how individual words are embedded
        - **Sentence-level Analysis**: Compare different sentences
        - **Interactive Visualizations**: Explore embeddings through various charts
        """)

    col1, col2 = st.columns([2, 1])

    with col1:
        text_input = st.text_area(
            "Enter your text",
            value="The quick brown fox jumps over the lazy dog",
            height=100,
            help="Enter any text to see its word and sentence embeddings"
        )

    with col2:
        st.markdown("### Visualization Options")
        show_heatmap = st.checkbox("Show Heatmap", value=True)
        show_scatter = st.checkbox("Show Word Scatter", value=True)
        show_sentence = st.checkbox("Show Sentence Analysis", value=True)

    # Whitespace-only input tokenizes to special tokens only, leaving nothing
    # to plot, so treat it the same as empty input.
    if not text_input.strip():
        return

    # Get word-level embeddings
    words, word_embeddings = get_word_embeddings(text_input)
    word_embeddings = word_embeddings.numpy()

    # Remove special tokens
    special_tokens = {'[CLS]', '[SEP]', '[PAD]'}
    keep = np.array([word not in special_tokens for word in words], dtype=bool)
    words = [word for word, kept in zip(words, keep) if kept]
    word_embeddings = word_embeddings[keep]

    # Create visualizations
    if show_heatmap:
        st.plotly_chart(create_heatmap(word_embeddings, words), use_container_width=True)

    if show_scatter:
        st.plotly_chart(create_word_scatter(word_embeddings, words), use_container_width=True)

    if not show_sentence:
        return

    st.markdown("### Sentence-Level Analysis")

    # Get sentence embedding
    sentence_embedding = sent_model.encode(text_input)

    # Create sentence embedding visualization
    fig = go.Figure(data=go.Bar(
        x=list(range(len(sentence_embedding))),
        y=sentence_embedding,
        name='Sentence Embedding'
    ))

    fig.update_layout(
        title='Sentence Embedding Vector',
        xaxis_title='Dimension',
        yaxis_title='Value',
        height=300
    )

    st.plotly_chart(fig, use_container_width=True)

    # Add similarity comparison
    st.markdown("### Compare with Another Sentence")
    compare_text = st.text_area("Enter another sentence for comparison",
                                value="A quick brown dog jumps over the lazy fox",
                                height=100)

    if compare_text.strip():
        # Reuse the embedding computed above rather than encoding the input
        # sentence a second time.
        similarity = util.cos_sim(
            sentence_embedding,
            sent_model.encode(compare_text)
        ).item()

        st.metric(
            label="Semantic Similarity",
            value=f"{similarity:.2f}",
            help="1.0 = identical meaning, 0.0 = completely different"
        )
|
152 |
|
153 |
+
if __name__ == "__main__":
|
154 |
+
main()
|
|
|
|
|
|
|
|
|
|
|
|