DexterSptizu
committed on
Commit
•
1262b42
1
Parent(s):
2df7566
Update app.py
Browse files
app.py
CHANGED
@@ -1,8 +1,42 @@
|
|
1 |
import streamlit as st
|
2 |
import numpy as np
|
3 |
from sentence_transformers import SentenceTransformer, util
|
|
|
4 |
|
5 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
@st.cache_resource
|
7 |
def load_model():
|
8 |
return SentenceTransformer('all-MiniLM-L6-v2')
|
@@ -10,102 +44,185 @@ def load_model():
|
|
10 |
model = load_model()
|
11 |
|
12 |
def get_embedding_and_similarity(text1, text2):
|
13 |
-
# Get embeddings
|
14 |
embedding1 = model.encode(text1, convert_to_tensor=True)
|
15 |
embedding2 = model.encode(text2, convert_to_tensor=True)
|
16 |
-
|
17 |
-
# Calculate cosine similarity
|
18 |
similarity = util.pytorch_cos_sim(embedding1, embedding2).item()
|
19 |
-
return similarity
|
20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
st.title("π€ Interactive Sentence Embeddings Explorer")
|
22 |
st.markdown("""
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
st.header("Compare Two Texts")
|
29 |
|
30 |
-
|
31 |
-
|
32 |
-
st.markdown("**First Text**")
|
33 |
-
text1 = st.text_area("Enter first text", height=100,
|
34 |
-
value="I love programming in Python")
|
35 |
|
36 |
-
with
|
37 |
-
st.markdown("
|
38 |
-
|
39 |
-
value="Python is my favorite programming language")
|
40 |
-
|
41 |
-
if st.button("Calculate Similarity"):
|
42 |
-
similarity = get_embedding_and_similarity(text1, text2)
|
43 |
|
44 |
-
|
45 |
-
|
46 |
-
|
|
|
|
|
47 |
|
48 |
-
|
49 |
-
st.
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
st.warning("These texts are quite different")
|
54 |
|
55 |
-
|
56 |
-
|
57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
58 |
|
59 |
-
|
60 |
-
"
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
"
|
74 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
75 |
}
|
76 |
-
}
|
77 |
|
78 |
-
selected_example = st.selectbox("Choose an example",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
79 |
|
80 |
-
|
81 |
-
|
82 |
-
similarity = get_embedding_and_similarity(example["text1"], example["text2"])
|
83 |
|
84 |
-
st.
|
85 |
-
st.write("Text 1:", example["text1"])
|
86 |
-
st.write("Text 2:", example["text2"])
|
87 |
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
""")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
103 |
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
|
|
1 |
import streamlit as st
|
2 |
import numpy as np
|
3 |
from sentence_transformers import SentenceTransformer, util
|
4 |
+
import plotly.graph_objects as go
|
5 |
|
6 |
+
# Page configuration: browser-tab title, icon and a wide layout.
# NOTE(review): the icon text looks like a mis-decoded emoji in this view;
# confirm the intended character against the original file.
_PAGE_SETTINGS = {
    "page_title": "Sentence Embeddings Explorer",
    "page_icon": "π€",
    "layout": "wide",
}
st.set_page_config(**_PAGE_SETTINGS)

# Custom CSS: spacing for the tab strip plus the helper classes
# (.big-font, .medium-font, .highlight) used by HTML snippets in this script.
# NOTE(review): leading whitespace inside the literal is not visible in the
# diff view this was recovered from; confirm against the original file.
_CUSTOM_CSS = """
<style>
.stTabs [data-baseweb="tab-list"] {
gap: 24px;
}
.stTabs [data-baseweb="tab"] {
height: 50px;
padding-left: 20px;
padding-right: 20px;
}
.big-font {
font-size:20px !important;
font-weight: bold;
}
.medium-font {
font-size:16px !important;
}
.highlight {
padding: 10px;
border-radius: 5px;
margin: 10px 0;
}
</style>
"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
|
38 |
+
|
39 |
+
# Initialize model
@st.cache_resource
def load_model():
    """Construct the sentence-embedding model used by this app.

    Decorated with ``st.cache_resource``, Streamlit's cache for shared
    resources such as ML models.

    Returns:
        A ``SentenceTransformer`` built from the 'all-MiniLM-L6-v2' checkpoint.
    """
    checkpoint = 'all-MiniLM-L6-v2'
    return SentenceTransformer(checkpoint)


# Module-level model handle read by get_embedding_and_similarity().
model = load_model()
|
45 |
|
46 |
def get_embedding_and_similarity(text1, text2):
    """Encode two texts and score how close their embeddings are.

    Args:
        text1: First text, passed to ``model.encode``.
        text2: Second text, passed to ``model.encode``.

    Returns:
        A 3-tuple ``(similarity, embedding1, embedding2)``: the cosine
        similarity extracted with ``.item()``, followed by each embedding
        moved to the CPU and converted to a NumPy array.
    """
    # Encode in argument order (text1 first), keeping tensors for the
    # similarity computation.
    vec_a, vec_b = (
        model.encode(text, convert_to_tensor=True) for text in (text1, text2)
    )
    score = util.pytorch_cos_sim(vec_a, vec_b).item()
    return score, vec_a.cpu().numpy(), vec_b.cpu().numpy()
|
51 |
|
52 |
+
def create_radar_chart(embedding1, embedding2, num_dimensions=10):
    """Draw the leading dimensions of two embeddings on one radar chart.

    Args:
        embedding1: Embedding of the first text; only its first
            ``num_dimensions`` entries are plotted.
        embedding2: Embedding of the second text; sliced the same way.
        num_dimensions: Number of leading dimensions to show (default 10).

    Returns:
        A plotly ``Figure`` holding one filled ``Scatterpolar`` trace per
        text ("Text 1", "Text 2"), with the legend shown and the radial
        axis fixed to ``[-1, 1]``.
    """
    # Only the first few dimensions are shown; labels are 1-based.
    axis_labels = [f"Dim {idx}" for idx in range(1, num_dimensions + 1)]

    chart = go.Figure()
    for trace_name, vector in (("Text 1", embedding1), ("Text 2", embedding2)):
        chart.add_trace(
            go.Scatterpolar(
                r=vector[:num_dimensions],
                theta=axis_labels,
                fill="toself",
                name=trace_name,
            )
        )

    radial_axis = dict(visible=True, range=[-1, 1])
    chart.update_layout(polar=dict(radialaxis=radial_axis), showlegend=True)
    return chart
|
75 |
+
|
76 |
+
# Title and Introduction
# NOTE(review): the leading characters of the title and of the tab labels
# look like mis-decoded emoji in this view; confirm against the original file.
st.title("π€ Interactive Sentence Embeddings Explorer")

# Intro paragraph, styled through the .medium-font class.
_INTRO_HTML = """
<p class="medium-font">
Explore the fascinating world of sentence embeddings! This interactive tool helps you understand
how AI models capture the meaning of text and measure similarity between sentences.
</p>
"""
st.markdown(_INTRO_HTML, unsafe_allow_html=True)

# Create tabs: one container per label, in display order.
_TAB_LABELS = ["π Compare Texts", "π Learn by Examples", "βΉοΈ How It Works"]
tab1, tab2, tab3 = st.tabs(_TAB_LABELS)
|
|
|
|
|
|
|
87 |
|
88 |
+
# "Compare Texts" tab: two free-text inputs, a similarity score and a radar
# chart of the leading embedding dimensions.
# NOTE(review): block nesting was reconstructed from a diff view that had lost
# its indentation; confirm the layout against the original file.
with tab1:
    st.markdown("### Compare Any Two Texts")
    col1, col2 = st.columns(2)

    with col1:
        text1 = st.text_area("First Text",
                             height=150,
                             value="I love programming in Python",
                             help="Enter your first text here")

    with col2:
        text2 = st.text_area("Second Text",
                             height=150,
                             value="Python is my favorite programming language",
                             help="Enter your second text here")

    if st.button("Calculate Similarity", type="primary"):
        similarity, emb1, emb2 = get_embedding_and_similarity(text1, text2)

        # Three result columns; only the narrow middle one is used, which
        # centres the score. Distinct names avoid shadowing the input columns.
        _, score_col, _ = st.columns([2, 1, 2])

        with score_col:
            st.markdown("### Similarity Score")
            st.markdown(f"<h1 style='text-align: center;'>{similarity:.2f}</h1>",
                        unsafe_allow_html=True)

            # Progress bar and interpretation.
            # Cosine similarity lies in [-1, 1] and can land marginally above
            # 1.0 through float rounding (e.g. identical texts), whereas
            # st.progress only accepts floats in [0.0, 1.0]. Clamp the bar
            # value; the displayed score and the thresholds use the raw value.
            st.progress(min(max(similarity, 0.0), 1.0))

            # NOTE(review): the leading characters of these messages look like
            # mis-decoded emoji in this view; confirm against the original.
            if similarity > 0.8:
                st.success("π― These texts are very similar!")
            elif similarity > 0.5:
                st.info("π€ These texts are somewhat similar")
            else:
                st.warning("π These texts are quite different")

        # Visualization
        st.markdown("### Embedding Visualization")
        st.plotly_chart(create_radar_chart(emb1, emb2), use_container_width=True)
|
128 |
|
129 |
+
# "Learn by Examples" tab: pick a canned sentence pair and inspect its
# similarity score, explanation and radar chart.
# NOTE(review): block nesting was reconstructed from a diff view that had lost
# its indentation; confirm the layout against the original file.
with tab2:
    st.markdown("### Learn Through Examples")

    # Each entry: the two sentences to compare plus a short explanation.
    examples = {
        "Similar Meaning, Different Words": {
            "text1": "The cat is sleeping on the couch",
            "text2": "A feline is resting on the sofa",
            "explanation": "These sentences use different words but convey the same meaning."
        },
        "Similar Words, Different Context": {
            "text1": "The bank is by the river",
            "text2": "I need to go to the bank for money",
            "explanation": "These sentences use 'bank' in different contexts."
        },
        "Technical Similarity": {
            "text1": "Python is a programming language",
            "text2": "Java is used for coding software",
            "explanation": "These sentences are related to programming but discuss different languages."
        },
        "Opposite Meanings": {
            "text1": "The stock market is going up",
            "text2": "The stock market is going down",
            "explanation": "These sentences use similar words but have opposite meanings."
        },
    }

    selected_example = st.selectbox("Choose an example to explore",
                                    list(examples))

    if st.button("Analyze Example", type="primary"):
        example = examples[selected_example]
        similarity, emb1, emb2 = get_embedding_and_similarity(
            example["text1"],
            example["text2"],
        )

        # Show the two sentences side by side, one per column.
        side_by_side = zip(
            st.columns(2),
            ("**Text 1:**", "**Text 2:**"),
            ("text1", "text2"),
        )
        for column, heading, key in side_by_side:
            with column:
                st.markdown(heading)
                st.markdown(f"<div class='highlight' style='background-color: #f0f2f6'>{example[key]}</div>",
                            unsafe_allow_html=True)

        st.markdown("**Explanation:**")
        st.info(example["explanation"])

        st.markdown("**Similarity Score:**")
        # st.progress only accepts floats in [0.0, 1.0], while cosine
        # similarity spans [-1, 1] and can round slightly above 1.0. Clamp the
        # bar so a newly added example cannot crash the page; the exact score
        # is still printed below.
        st.progress(min(max(similarity, 0.0), 1.0))
        st.write(f"Cosine Similarity: {similarity:.4f}")

        st.plotly_chart(create_radar_chart(emb1, emb2), use_container_width=True)
|
183 |
|
184 |
+
# "How It Works" tab: static explanatory text in two columns plus a
# collapsible list of processing steps.
# NOTE(review): block nesting and the leading whitespace inside the markdown
# literals were reconstructed from a diff view that had lost its indentation;
# confirm against the original file.
with tab3:
    st.markdown("### Understanding Sentence Embeddings")

    # Left column: what embeddings are and how the score is read.
    concepts_md = """
#### What are Sentence Embeddings?
Sentence embeddings are numerical representations of text that capture semantic meaning.
Each sentence is converted into a vector of numbers, where similar meanings result in
similar vectors.

#### How is Similarity Calculated?
The similarity between two sentences is measured using cosine similarity between their
embedding vectors. The score ranges from -1 to 1:
- 1.0 = Identical meaning
- >0.8 = Very similar
- >0.5 = Somewhat similar
- <0.5 = Different meanings
"""

    # Right column: model facts and typical applications.
    model_md = """
#### Current Model Details
This demo uses the `all-MiniLM-L6-v2` model:
- Embedding Size: 384 dimensions
- Optimized for semantic similarity
- Fast and efficient
- Good balance of performance and speed

#### Use Cases
- Semantic search
- Document similarity
- Text clustering
- Information retrieval
"""

    # Expander body: the processing steps, in order.
    pipeline_md = """
The model processes text through these steps:
1. Tokenization: Breaks text into tokens
2. Encoding: Converts tokens to embeddings
3. Pooling: Combines token embeddings into sentence embedding
4. Similarity: Computes cosine similarity between embeddings
"""

    for column, body in zip(st.columns(2), (concepts_md, model_md)):
        with column:
            st.markdown(body)

    # NOTE(review): the leading characters of the expander label look like a
    # mis-decoded emoji in this view; confirm against the original file.
    with st.expander("π¬ Technical Details"):
        st.markdown(pipeline_md)
|