Update app.py
Browse files
app.py
CHANGED
@@ -1,180 +1,184 @@
|
|
1 |
import streamlit as st
|
2 |
import numpy as np
|
3 |
-
from sentence_transformers import SentenceTransformer
|
4 |
-
import plotly.express as px
|
5 |
import plotly.graph_objects as go
|
6 |
-
|
7 |
-
import
|
8 |
-
from transformers import AutoTokenizer, AutoModel
|
9 |
import pandas as pd
|
10 |
-
from sentence_transformers import SentenceTransformer, util # Added util import
|
11 |
-
|
12 |
|
13 |
# Page configuration
|
14 |
-
st.set_page_config(layout="wide", page_title="
|
15 |
|
|
|
16 |
@st.cache_resource
|
17 |
-
def
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
return sent_model, word_tokenizer, word_model
|
22 |
|
23 |
-
|
|
|
|
|
|
|
24 |
|
25 |
-
def
|
26 |
-
#
|
27 |
-
|
28 |
-
|
29 |
-
outputs = word_model(**tokens)
|
30 |
-
word_embeddings = outputs.last_hidden_state.squeeze(0)
|
31 |
|
32 |
-
#
|
33 |
-
|
34 |
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
#
|
39 |
-
fig
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
|
|
44 |
))
|
45 |
|
46 |
fig.update_layout(
|
47 |
-
title=
|
48 |
-
xaxis_title=
|
49 |
-
yaxis_title=
|
50 |
height=400
|
51 |
)
|
52 |
return fig
|
53 |
|
54 |
-
def
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
init='random',
|
66 |
-
learning_rate='auto'
|
67 |
-
)
|
68 |
-
|
69 |
-
# Perform t-SNE dimensionality reduction
|
70 |
-
embeddings_2d = tsne.fit_transform(embeddings)
|
71 |
-
|
72 |
-
# Create scatter plot
|
73 |
-
fig = px.scatter(
|
74 |
-
x=embeddings_2d[:, 0],
|
75 |
-
y=embeddings_2d[:, 1],
|
76 |
-
text=words,
|
77 |
-
title=f'Word Embeddings in 2D Space (perplexity={perplexity})'
|
78 |
-
)
|
79 |
|
80 |
-
# Update layout for better visualization
|
81 |
-
fig.update_traces(
|
82 |
-
textposition='top center',
|
83 |
-
mode='markers+text'
|
84 |
-
)
|
85 |
fig.update_layout(
|
86 |
-
|
87 |
-
|
88 |
-
xaxis_title="t-SNE dimension 1",
|
89 |
-
yaxis_title="t-SNE dimension 2"
|
90 |
)
|
91 |
-
|
92 |
return fig
|
93 |
|
94 |
def main():
|
95 |
-
st.title("
|
96 |
|
97 |
-
with st.expander("βΉοΈ
|
98 |
st.markdown("""
|
99 |
-
This
|
100 |
-
|
101 |
-
|
102 |
-
|
|
|
|
|
103 |
""")
|
104 |
|
105 |
-
|
|
|
|
|
|
|
106 |
|
107 |
with col1:
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
114 |
|
|
|
|
|
|
|
115 |
with col2:
|
116 |
-
st.
|
117 |
-
|
118 |
-
|
119 |
-
|
|
|
|
|
120 |
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
st.plotly_chart(create_heatmap(word_embeddings, words), use_container_width=True)
|
134 |
-
|
135 |
-
if show_scatter:
|
136 |
-
st.plotly_chart(create_word_scatter(word_embeddings, words), use_container_width=True)
|
137 |
-
|
138 |
-
if show_sentence:
|
139 |
-
st.markdown("### Sentence-Level Analysis")
|
140 |
-
|
141 |
-
# Get sentence embedding
|
142 |
-
sentence_embedding = sent_model.encode(text_input)
|
143 |
|
144 |
-
|
145 |
-
fig = go.Figure(data=go.Bar(
|
146 |
-
x=list(range(len(sentence_embedding))),
|
147 |
-
y=sentence_embedding,
|
148 |
-
name='Sentence Embedding'
|
149 |
-
))
|
150 |
|
151 |
-
|
152 |
-
|
153 |
-
xaxis_title='Dimension',
|
154 |
-
yaxis_title='Value',
|
155 |
-
height=300
|
156 |
-
)
|
157 |
|
158 |
-
|
|
|
|
|
|
|
|
|
|
|
159 |
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
165 |
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
172 |
|
173 |
-
st.
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
|
|
|
|
|
|
178 |
|
179 |
if __name__ == "__main__":
|
180 |
main()
|
|
|
1 |
import streamlit as st
|
2 |
import numpy as np
|
3 |
+
from sentence_transformers import SentenceTransformer, util
|
|
|
4 |
import plotly.graph_objects as go
|
5 |
+
import plotly.express as px
|
6 |
+
from typing import List, Tuple
|
|
|
7 |
import pandas as pd
|
|
|
|
|
8 |
|
9 |
# Page configuration
st.set_page_config(layout="wide", page_title="π― Sentence Transformer Explorer")

# Load model
@st.cache_resource
def load_model():
    """Build the shared sentence encoder once; Streamlit caches the resource."""
    model_name = 'all-MiniLM-L6-v2'
    return SentenceTransformer(model_name)

model = load_model()
|
|
|
18 |
|
19 |
+
def get_embedding_and_similarity(sentences: List[str]) -> Tuple[np.ndarray, np.ndarray]:
    """Encode *sentences* and return (embeddings, pairwise cosine-similarity matrix)."""
    vectors = model.encode(sentences)
    # cos_sim returns a torch tensor; convert to a plain numpy matrix.
    sim_matrix = util.cos_sim(vectors, vectors).numpy()
    return vectors, sim_matrix
|
23 |
|
24 |
+
def create_word_importance_visualization(sentence: str, embedding: np.ndarray):
    """Bar chart of each word's mean absolute embedding magnitude.

    NOTE(review): the full-sentence *embedding* argument is accepted but not
    used here — "importance" is derived from encoding each word separately.
    """
    # Calculate word-level contribution to the embedding
    words = sentence.split()
    word_vectors = model.encode(words)

    # Per-word score: average of |dimension values| across the vector.
    importance = np.abs(word_vectors).mean(axis=1)

    # One bar per word, annotated with its rounded score.
    bar = go.Bar(
        x=words,
        y=importance,
        marker_color='rgb(158,202,225)',
        text=np.round(importance, 3),
        textposition='auto',
    )
    fig = go.Figure(data=[bar])

    fig.update_layout(
        title="Word Importance in Embedding",
        xaxis_title="Words",
        yaxis_title="Average Contribution",
        height=400
    )
    return fig
|
51 |
|
52 |
+
def create_similarity_heatmap(sentences: List[str], similarity_matrix: np.ndarray):
    """Annotated heatmap of the pairwise sentence-similarity matrix."""
    heatmap = go.Heatmap(
        z=similarity_matrix,
        x=sentences,
        y=sentences,
        colorscale='RdBu',
        # Show the rounded similarity value inside each cell.
        text=np.round(similarity_matrix, 3),
        texttemplate='%{text}',
        textfont={"size": 10},
        hoverongaps=False,
    )
    fig = go.Figure(data=heatmap)
    fig.update_layout(title="Sentence Similarity Matrix", height=400)
    return fig
|
69 |
|
70 |
def main():
    """Streamlit entry point: collect sentences, run analysis, render result tabs.

    Fixes:
    - Loaded example sentences are now used as the *defaults* for the text
      inputs. Previously `sentences` was assigned from the template and then
      immediately clobbered by `sentences = []`, so "Load Example" had no effect.
    - Pairwise interpretation lines now label the second sentence with its real
      index (`j+1`) instead of always `i+2`.
    """
    st.title("π― Interactive Sentence Transformer Explorer")

    with st.expander("βΉοΈ How it works", expanded=True):
        st.markdown("""
        This interactive tool helps you understand how Sentence Transformers work:

        1. **Sentence Embedding**: Convert sentences into numerical vectors
        2. **Word Importance**: See how each word contributes to the final embedding
        3. **Similarity Analysis**: Compare how similar sentences are to each other
        4. **Interactive Examples**: Try different sentences and see the results
        """)

    # Interactive sentence input
    st.subheader("π€ Enter Your Sentences")

    col1, col2 = st.columns(2)

    with col1:
        # Example templates
        example_templates = {
            "Similar Meanings": [
                "I love programming in Python",
                "Coding with Python is my favorite",
                "I enjoy developing software using Python"
            ],
            "Different Topics": [
                "The cat sleeps on the mat",
                "Python is a programming language",
                "The weather is beautiful today"
            ],
            "Semantic Relations": [
                "Paris is the capital of France",
                "Berlin is the capital of Germany",
                "London is the capital of England"
            ]
        }

        selected_template = st.selectbox("Choose an example template:",
                                         list(example_templates.keys()))

    with col2:
        # Defaults for the text inputs: the chosen template when the user
        # clicks "Load Example", otherwise a generic starter set.
        if st.button("Load Example"):
            default_sentences = example_templates[selected_template]
        else:
            default_sentences = ["I love programming in Python",
                                 "Coding with Python is my favorite",
                                 "The weather is beautiful today"]

    # Dynamic sentence input, prefilled from the defaults chosen above.
    num_sentences = st.slider("Number of sentences:", 2, 5, 3)
    sentences = []

    for i in range(num_sentences):
        sentence = st.text_input(f"Sentence {i+1}",
                                 value=default_sentences[i] if i < len(default_sentences) else "")
        sentences.append(sentence)

    if st.button("Analyze Sentences", type="primary"):
        if all(sentences):
            embeddings, similarity_matrix = get_embedding_and_similarity(sentences)

            st.subheader("π Analysis Results")

            # Create tabs for different visualizations
            tab1, tab2, tab3 = st.tabs(["Word Importance", "Sentence Similarity", "Embedding Space"])

            with tab1:
                st.markdown("### π Word-Level Analysis")
                for i, sentence in enumerate(sentences):
                    st.markdown(f"**Sentence {i+1}:** {sentence}")
                    fig = create_word_importance_visualization(sentence, embeddings[i])
                    st.plotly_chart(fig, use_container_width=True)

            with tab2:
                st.markdown("### π€ Sentence Similarity Analysis")
                fig = create_similarity_heatmap(sentences, similarity_matrix)
                st.plotly_chart(fig, use_container_width=True)

                # Add similarity interpretation
                st.markdown("#### π‘ Interpretation")
                for i in range(len(sentences)):
                    for j in range(i+1, len(sentences)):
                        similarity = similarity_matrix[i][j]
                        interpretation = (
                            "Very similar" if similarity > 0.8
                            else "Moderately similar" if similarity > 0.5
                            else "Different"
                        )
                        # Label both members of the pair correctly (was i+2).
                        st.write(f"Sentences {i+1} & {j+1}: {interpretation} ({similarity:.3f})")

            with tab3:
                st.markdown("### π― Interactive Embedding Analysis")

                # Create embedding statistics
                embedding_stats = pd.DataFrame({
                    'Sentence': sentences,
                    'Embedding_Length': [np.linalg.norm(emb) for emb in embeddings],
                    'Mean_Value': [np.mean(emb) for emb in embeddings],
                    'Std_Dev': [np.std(emb) for emb in embeddings]
                })

                st.dataframe(embedding_stats)

                st.markdown("""
                #### π Understanding Embeddings
                - **Embedding Length**: Represents the magnitude of the vector
                - **Mean Value**: Average of all dimensions
                - **Standard Deviation**: Spread of values across dimensions
                """)
        else:
            st.warning("Please enter all sentences before analyzing.")
|
182 |
|
183 |
# Script entry point — intended to be launched via `streamlit run app.py`.
if __name__ == "__main__":
    main()
|