Update app.py
Browse files
app.py
CHANGED
@@ -1,112 +1,205 @@
|
|
1 |
import io
|
2 |
import textwrap
|
|
|
3 |
|
4 |
import numpy as np
|
5 |
import pandas as pd
|
6 |
import streamlit as st
|
7 |
-
from sklearn.manifold import TSNE
|
|
|
|
|
|
|
8 |
import plotly.express as px
|
|
|
9 |
|
10 |
-
#
|
11 |
-
|
12 |
-
|
13 |
-
[0, 0, 0], [0, 0, 1],
|
14 |
-
[0, 1, 0], [0, 1, 1],
|
15 |
-
[1, 0, 0], [1, 0, 1],
|
16 |
-
[1, 1, 0], [1, 1, 1]
|
17 |
-
]),
|
18 |
-
"Square pyramid (3-D, 5 vertices)": np.array([
|
19 |
-
[-1, -1, 0],
|
20 |
-
[ 1, -1, 0],
|
21 |
-
[ 1, 1, 0],
|
22 |
-
[-1, 1, 0],
|
23 |
-
[ 0, 0, 1]
|
24 |
-
])
|
25 |
-
}
|
26 |
-
|
27 |
|
28 |
-
def
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
cleaned = textwrap.dedent(text.strip())
|
34 |
-
rows = [row for row in cleaned.splitlines() if row.strip()]
|
35 |
-
data = [list(map(float, row.replace(",", " ").split())) for row in rows]
|
36 |
-
return np.array(data, dtype=float)
|
37 |
-
|
38 |
-
|
39 |
-
def run_tsne(data: np.ndarray, perplexity: float, seed: int) -> np.ndarray:
    """Embed ``data`` into two dimensions with t-SNE.

    Args:
        data: Points to embed, one row per point.
        perplexity: Requested t-SNE perplexity. It is lowered when needed so
            that it stays below the number of points.
        seed: Value passed to ``TSNE`` as ``random_state``.

    Returns:
        The array produced by ``TSNE.fit_transform``.
    """
    data = np.asarray(data)
    n_samples = data.shape[0]
    n_features = data.shape[1] if data.ndim > 1 else 1

    # scikit-learn rejects perplexity >= n_samples, which the default slider
    # value hits on small inputs; cap it just below the point count.
    effective_perplexity = min(float(perplexity), float(max(n_samples - 1, 1)))

    # PCA initialisation needs at least two samples and two features to
    # produce two components; otherwise fall back to random initialisation.
    init = "pca" if min(n_samples, n_features) >= 2 else "random"

    tsne = TSNE(
        n_components=2,
        perplexity=effective_perplexity,
        random_state=seed,
        init=init,
    )
    return tsne.fit_transform(data)
|
47 |
-
# ---------------------------------------------------------------------------
|
48 |
|
|
|
|
|
|
|
49 |
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
)
|
|
|
|
|
|
|
|
|
58 |
|
59 |
-
# ---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
60 |
with st.sidebar:
|
61 |
-
st.header("1οΈβ£
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
file = st.file_uploader("Upload coordinates file (*.csv / *.txt)")
|
73 |
-
if file:
|
74 |
-
text = io.StringIO(file.getvalue().decode("utf-8")).read()
|
75 |
-
data_raw = parse_text_points(text)
|
76 |
else:
|
77 |
st.stop()
|
78 |
-
|
79 |
-
else: # Paste text
|
80 |
placeholder = "e.g.\n0,0,0\n0,0,1\n0,1,0\n..."
|
81 |
-
|
82 |
-
if not
|
83 |
st.stop()
|
84 |
-
data_raw = parse_text_points(
|
85 |
|
86 |
-
st.
|
87 |
-
st.
|
88 |
-
perplexity = st.slider("Perplexity", 5.0, 50.0, 30.0, 1.0)
|
89 |
seed = st.number_input("Random seed", value=42, step=1)
|
90 |
-
run_button = st.button("Run t-SNE π")
|
91 |
|
92 |
-
#
|
93 |
-
if
|
94 |
-
|
95 |
-
|
96 |
-
st.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
97 |
|
98 |
-
|
99 |
-
|
100 |
-
st.stop()
|
101 |
|
102 |
-
|
103 |
-
|
|
|
|
|
|
|
|
|
104 |
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
109 |
st.plotly_chart(fig, use_container_width=True)
|
110 |
|
111 |
-
|
112 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import io
|
2 |
import textwrap
|
3 |
+
import itertools
|
4 |
|
5 |
import numpy as np
|
6 |
import pandas as pd
|
7 |
import streamlit as st
|
8 |
+
from sklearn.manifold import TSNE, trustworthiness
|
9 |
+
from sklearn.decomposition import PCA
|
10 |
+
from sklearn.cluster import KMeans, DBSCAN
|
11 |
+
import umap.umap_ as umap
|
12 |
import plotly.express as px
|
13 |
+
from sklearn.datasets import make_swiss_roll
|
14 |
|
15 |
+
# --- Example shapes (some generated on demand) --------------------------------
|
16 |
+
def generate_hypercube(n=4):
    """Return the vertices of the unit hypercube in ``n`` dimensions.

    Every combination of 0/1 coordinates is listed once, in the order
    produced by ``itertools.product``, as a float array with one vertex
    per row.
    """
    corners = itertools.product((0, 1), repeat=n)
    return np.array(list(corners), dtype=float)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
|
19 |
+
def generate_simplex(n=3):
    """Return the ``n + 1`` vertices of an n-simplex in ``n`` dimensions.

    The first ``n`` rows are the standard basis vectors and the final row
    is the origin.
    """
    vertices = np.zeros((n + 1, n), dtype=float)
    # Rows 0..n-1 hold the basis vectors; the last row stays at the origin.
    vertices[:n] = np.eye(n, dtype=float)
    return vertices
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
|
25 |
+
def generate_swiss_roll(n_samples=500, noise=0.05, random_state=0):
    """Sample points from scikit-learn's Swiss-roll generator.

    Args:
        n_samples: Number of points to draw.
        noise: Noise level forwarded to ``make_swiss_roll``.
        random_state: Seed forwarded to ``make_swiss_roll``. It defaults to a
            fixed value so the example data stays the same across Streamlit
            reruns; pass ``None`` to draw a fresh sample on every call.

    Returns:
        The point coordinates returned by ``make_swiss_roll`` (the second
        return value, the position along the roll, is discarded).
    """
    points, _ = make_swiss_roll(
        n_samples=n_samples,
        noise=noise,
        random_state=random_state,
    )
    return points
|
28 |
|
29 |
+
# Built-in example inputs offered in the sidebar. A value is either a ready
# float array (one point per row) or a zero-argument callable that builds the
# array when the example is selected.
EXAMPLE_SHAPES = {
    # The 3-D cube is the 3-dimensional case of the hypercube generator,
    # which lists the same eight vertices in the same order.
    "Cube (3-D, 8 pts)": generate_hypercube(3),
    "Square pyramid (3-D, 5 pts)": np.array(
        [
            [-1, -1, 0],
            [1, -1, 0],
            [1, 1, 0],
            [-1, 1, 0],
            [0, 0, 1],
        ],
        dtype=float,
    ),
    "4-D hypercube (16 pts)": generate_hypercube(4),
    "3-simplex (4 pts in 3-D)": generate_simplex(3),
    # Stored as a callable so the points are generated on demand.
    "Swiss roll (500 pts, 3-D)": generate_swiss_roll,
}
|
41 |
|
42 |
+
# --- Parsing & embedding -----------------------------------------------------
|
43 |
+
def parse_text_points(text: str) -> np.ndarray:
    """Parse delimited numeric text into a float array, one point per line.

    Values on a line may be separated by commas and/or whitespace. Blank
    lines are ignored.

    Args:
        text: Raw text, e.g. pasted by the user or decoded from an upload.

    Returns:
        A float array with one row per non-blank line. Empty input yields an
        empty 1-D array.

    Raises:
        ValueError: If a token is not a number, or if the lines do not all
            contain the same number of values. The message names the
            offending line.
    """
    # Files saved by spreadsheet tools often start with a UTF-8 byte-order
    # mark, which survives ``decode("utf-8")`` and would break ``float()``.
    lines = text.lstrip("\ufeff").splitlines()

    data = []
    width = None
    for line_no, raw in enumerate(lines, start=1):
        stripped = raw.strip()
        if not stripped:
            continue
        try:
            values = [float(tok) for tok in stripped.replace(",", " ").split()]
        except ValueError as err:
            raise ValueError(
                f"Line {line_no}: could not parse {stripped!r} as numbers"
            ) from err
        if width is None:
            width = len(values)
        elif len(values) != width:
            raise ValueError(
                f"Line {line_no}: expected {width} values but found {len(values)}"
            )
        data.append(values)

    return np.array(data, dtype=float)
|
49 |
+
|
50 |
+
def run_tsne(data, perp, seed):
    """Embed ``data`` into two dimensions with t-SNE.

    Args:
        data: Points to embed, one row per point.
        perp: Requested perplexity. It is lowered when needed so that it
            stays below the number of points.
        seed: Value passed to ``TSNE`` as ``random_state``.

    Returns:
        A ``(embedding, kl_divergence)`` tuple: the array from
        ``fit_transform`` and the fitted model's ``kl_divergence_``.
    """
    data = np.asarray(data)
    n_samples = data.shape[0]
    n_features = data.shape[1] if data.ndim > 1 else 1

    # scikit-learn rejects perplexity >= n_samples, which the default slider
    # value hits on the small example shapes; cap it just below the count.
    effective_perp = min(float(perp), float(max(n_samples - 1, 1)))

    # PCA initialisation needs at least two samples and two features to
    # produce two components; otherwise fall back to random initialisation.
    init = "pca" if min(n_samples, n_features) >= 2 else "random"

    ts = TSNE(
        n_components=2,
        perplexity=effective_perp,
        random_state=seed,
        init=init,
    )
    emb = ts.fit_transform(data)
    return emb, ts.kl_divergence_
|
54 |
+
|
55 |
+
def run_pca(data):
    """Project ``data`` onto its first two principal components.

    Returns:
        A ``(embedding, None)`` tuple. The second slot mirrors the tuple
        shape returned by ``run_tsne`` and is always ``None`` here.
    """
    projection = PCA(n_components=2).fit_transform(data)
    return projection, None
|
58 |
+
|
59 |
+
def run_umap(data, n_neighbors, min_dist, seed):
    """Embed ``data`` into two dimensions with UMAP.

    Args:
        data: Points to embed, one row per point.
        n_neighbors: Forwarded to ``umap.UMAP`` as ``n_neighbors``.
        min_dist: Forwarded to ``umap.UMAP`` as ``min_dist``.
        seed: Forwarded to ``umap.UMAP`` as ``random_state``.

    Returns:
        A ``(embedding, None)`` tuple. The second slot mirrors the tuple
        shape returned by ``run_tsne`` and is always ``None`` here.
    """
    reducer = umap.UMAP(
        n_components=2,
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        random_state=seed,
    )
    embedding = reducer.fit_transform(data)
    return embedding, None
|
63 |
+
|
64 |
+
# --- Streamlit App -----------------------------------------------------------
|
65 |
+
# Page chrome: wide layout, title, and a short description of the workflow.
st.set_page_config(layout="wide")
st.title("π Dimensionality Reduction Explorer")
st.write(
    "\n"
    "Upload or paste your n-D points, pick an algorithm (t-SNE/PCA/UMAP),\n"
    "optionally cluster, and see the 2-D embedding.\n"
)
|
71 |
+
|
72 |
+
# --- Sidebar -----------------------------------------------------------------
|
73 |
# Sidebar controls: choose the input data, the reduction method and its
# parameters, and optional clustering. The names assigned here (data_raw,
# algo, seed, perp / neighbors / min_dist, do_cluster, cluster_algo,
# n_clusters / eps, run) are read by the main section below.
with st.sidebar:
    st.header("1️⃣ Data Input")
    mode = st.radio("Source", ["Example shape", "Upload CSV/TXT", "Paste text"])
    if mode == "Example shape":
        key = st.selectbox("Choose example", list(EXAMPLE_SHAPES))
        src = EXAMPLE_SHAPES[key]
        # Some examples are stored as generator callables, others as arrays.
        data_raw = src() if callable(src) else src
    elif mode == "Upload CSV/TXT":
        up = st.file_uploader("Upload file", type=["csv", "txt"])
        if not up:
            # Nothing uploaded yet: halt this script run until there is input.
            st.stop()
        try:
            txt = up.getvalue().decode("utf-8")
            data_raw = parse_text_points(txt)
        except ValueError as err:
            # UnicodeDecodeError is a ValueError subclass, so this covers
            # both undecodable files and unparsable numbers.
            st.error(f"Could not read the uploaded file: {err}")
            st.stop()
    else:
        placeholder = "e.g.\n0,0,0\n0,0,1\n0,1,0\n..."
        txt = st.text_area("Paste coordinates", height=200, placeholder=placeholder)
        if not txt.strip():
            st.stop()
        try:
            data_raw = parse_text_points(txt)
        except ValueError as err:
            st.error(f"Could not parse the pasted coordinates: {err}")
            st.stop()

    st.header("2️⃣ Algorithm & Params")
    algo = st.selectbox("Method", ["t-SNE", "PCA", "UMAP"])
    seed = st.number_input("Random seed", value=42, step=1)

    # Method-specific parameters; PCA has none.
    if algo == "t-SNE":
        perp = st.slider("Perplexity", 5.0, 50.0, 30.0, 1.0)
    elif algo == "UMAP":
        neighbors = st.slider("n_neighbors", 5, 200, 15, 5)
        min_dist = st.slider("min_dist", 0.0, 0.99, 0.1, 0.01)

    st.header("3️⃣ Clustering (optional)")
    do_cluster = st.checkbox("Cluster embedding")
    if do_cluster:
        cluster_algo = st.selectbox("Algorithm", ["KMeans", "DBSCAN"])
        if cluster_algo == "KMeans":
            n_clusters = st.slider("n_clusters", 2, 10, 3, 1)
        else:
            eps = st.slider("DBSCAN eps", 0.1, 5.0, 0.5, 0.1)

    st.markdown("---")
    # NOTE(review): the trailing glyph in this label looks like a mis-encoded
    # emoji; restore it from the original file.
    run = st.button("Run & Visualize π")
|
|
|
117 |
|
118 |
+
# --- Main --------------------------------------------------------------------
|
119 |
+
# Main panel: runs only on the script pass triggered by the sidebar button.
# Embeds the selected data, optionally clusters the embedding, then shows the
# plot, quality metrics, a CSV download and the raw input.
if run:
    pts = data_raw
    if pts.ndim != 2 or pts.shape[0] < 2:
        st.error("Need at least two points in an (n_pts × n_dims) array.")
        st.stop()
    n_pts = pts.shape[0]

    # Run the chosen reducer; every helper returns (embedding, kl_or_None).
    if algo == "t-SNE":
        emb, kl = run_tsne(pts, perp, seed)
    elif algo == "PCA":
        emb, kl = run_pca(pts)
    else:
        emb, kl = run_umap(pts, neighbors, min_dist, seed)

    # trustworthiness() requires n_neighbors < n_samples / 2, so a fixed k=5
    # fails on the small example shapes. Use the largest valid k up to 5 and
    # skip the metric when no valid k exists.
    tw_k = min(5, (n_pts - 1) // 2)
    tw = trustworthiness(pts, emb, n_neighbors=tw_k) if tw_k >= 1 else None

    # Optional clustering of the 2-D embedding.
    df = pd.DataFrame(emb, columns=["x", "y"])
    if do_cluster:
        if cluster_algo == "KMeans":
            # KMeans cannot form more clusters than there are points.
            k_clusters = min(n_clusters, n_pts)
            labels = KMeans(n_clusters=k_clusters, random_state=seed).fit_predict(emb)
        else:
            labels = DBSCAN(eps=eps).fit_predict(emb)
        # String labels make plotly treat clusters as discrete categories.
        df["cluster"] = labels.astype(str)
        fig = px.scatter(
            df, x="x", y="y", color="cluster",
            title=f"{algo} embedding with {cluster_algo}", width=700, height=500,
        )
    else:
        fig = px.scatter(
            df, x="x", y="y",
            title=f"{algo} embedding", width=700, height=500,
        )

    fig.update_traces(marker=dict(size=8))
    fig.update_layout(margin=dict(l=20, r=20, t=40, b=20))

    # Display
    st.subheader("2-D Embedding")
    st.plotly_chart(fig, use_container_width=True)

    if tw is not None:
        st.markdown(f"**Trustworthiness (k={tw_k}):** {tw:.3f}")
    else:
        st.markdown("**Trustworthiness:** not available for fewer than 3 points")
    if kl is not None:
        st.markdown(f"**t-SNE KL divergence:** {kl:.3f}")

    # Download CSV
    csv = df.to_csv(index=False).encode("utf-8")
    st.download_button(
        "Download embedding as CSV",
        data=csv,
        file_name="embedding.csv",
        mime="text/csv",
    )

    # Raw data expander
    with st.expander("Show original data"):
        st.write(pts)

    # t-SNE math explainer. Streamlit markdown renders LaTeX only inside
    # $...$ (inline) and $$...$$ (display, on their own lines).
    if algo == "t-SNE":
        # NOTE(review): the leading glyph in this label looks like a
        # mis-encoded emoji; restore it from the original file.
        with st.expander("π§ How t-SNE works"):
            st.markdown(r"""
1. **High-D similarities**
   Convert pairwise distances $d_{ij}$ into conditional probabilities

   $$
   p_{j|i} = \frac{\exp\!\bigl(-\|x_i - x_j\|^2 / 2\sigma_i^2\bigr)}
   {\sum_{k\neq i}\exp\!\bigl(-\|x_i - x_k\|^2 / 2\sigma_i^2\bigr)}
   $$

   then symmetrize to $p_{ij}=(p_{j|i}+p_{i|j})/2n$.

2. **Low-D affinities**
   In 2-D we use a Student-t kernel:

   $$
   q_{ij} = \frac{\bigl(1 + \|y_i - y_j\|^2\bigr)^{-1}}
   {\sum_{k\neq l}\bigl(1 + \|y_k - y_l\|^2\bigr)^{-1}}
   $$

3. **Minimize KL divergence**
   Find $\{y_i\}$ to minimize

   $$
   KL(P\|Q)
   = \sum_{i\neq j} p_{ij}\,\log\frac{p_{ij}}{q_{ij}}
   $$

   via gradient descent—preserving local structure while pushing dissimilar points apart.

**Key parameter — perplexity**
Controls each $\sigma_i$ by solving
$\mathrm{Perp}(p_{i\cdot})=2^{-\sum_j p_{j|i}\log_2 p_{j|i}}$,
intuitively setting an “effective # neighbors” (5–50 typical).
""")
|