euler314 committed on
Commit
4a851e9
·
verified ·
1 Parent(s): b6da946

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +180 -87
app.py CHANGED
@@ -1,112 +1,205 @@
1
  import io
2
  import textwrap
 
3
 
4
  import numpy as np
5
  import pandas as pd
6
  import streamlit as st
7
- from sklearn.manifold import TSNE
 
 
 
8
  import plotly.express as px
 
9
 
10
- # -------------- Helper functions -------------------------------------------
11
- EXAMPLE_SHAPES = {
12
- "Cube (3-D, 8 vertices)": np.array([
13
- [0, 0, 0], [0, 0, 1],
14
- [0, 1, 0], [0, 1, 1],
15
- [1, 0, 0], [1, 0, 1],
16
- [1, 1, 0], [1, 1, 1]
17
- ]),
18
- "Square pyramid (3-D, 5 vertices)": np.array([
19
- [-1, -1, 0],
20
- [ 1, -1, 0],
21
- [ 1, 1, 0],
22
- [-1, 1, 0],
23
- [ 0, 0, 1]
24
- ])
25
- }
26
-
27
 
28
- def parse_text_points(text: str) -> np.ndarray:
29
- """
30
- Parse a multiline string of comma- or whitespace-separated numbers
31
- into an (n_points, n_dims) array.
32
- """
33
- cleaned = textwrap.dedent(text.strip())
34
- rows = [row for row in cleaned.splitlines() if row.strip()]
35
- data = [list(map(float, row.replace(",", " ").split())) for row in rows]
36
- return np.array(data, dtype=float)
37
-
38
-
39
- def run_tsne(data: np.ndarray, perplexity: float, seed: int) -> np.ndarray:
40
- tsne = TSNE(
41
- n_components=2,
42
- perplexity=perplexity,
43
- random_state=seed,
44
- init="pca"
45
- )
46
- return tsne.fit_transform(data)
47
- # ---------------------------------------------------------------------------
48
 
 
 
 
49
 
50
- st.title("πŸŒ€ t-SNE Explorer for n-D Point Clouds")
51
- st.markdown(
52
- """
53
- Upload or paste your points, choose parameters, and see how
54
- **t-SNE** flattens them into 2-D.
55
- *Example shapes* are provided for quick experimentation.
56
- """
57
- )
 
 
 
 
58
 
59
- # --- Sidebar controls -------------------------------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  with st.sidebar:
61
- st.header("1️⃣ Choose data source")
62
- source = st.radio(
63
- "Data input method",
64
- ["Example shape", "Upload CSV/TXT", "Paste raw text"]
65
- )
66
-
67
- if source == "Example shape":
68
- shape_key = st.selectbox("Pick a shape", list(EXAMPLE_SHAPES.keys()))
69
- data_raw = EXAMPLE_SHAPES[shape_key]
70
-
71
- elif source == "Upload CSV/TXT":
72
- file = st.file_uploader("Upload coordinates file (*.csv / *.txt)")
73
- if file:
74
- text = io.StringIO(file.getvalue().decode("utf-8")).read()
75
- data_raw = parse_text_points(text)
76
  else:
77
  st.stop()
78
-
79
- else: # Paste text
80
  placeholder = "e.g.\n0,0,0\n0,0,1\n0,1,0\n..."
81
- text = st.text_area("Paste coordinates (one point per line)", height=200, placeholder=placeholder)
82
- if not text.strip():
83
  st.stop()
84
- data_raw = parse_text_points(text)
85
 
86
- st.divider()
87
- st.header("2️⃣ t-SNE parameters")
88
- perplexity = st.slider("Perplexity", 5.0, 50.0, 30.0, 1.0)
89
  seed = st.number_input("Random seed", value=42, step=1)
90
- run_button = st.button("Run t-SNE πŸš€")
91
 
92
- # --- Main area --------------------------------------------------------------
93
- if run_button:
94
- if data_raw.ndim != 2 or data_raw.shape[0] < 2:
95
- st.error("Need at least two points; check your input.")
96
- st.stop()
 
 
 
 
 
 
 
 
 
 
 
97
 
98
- if perplexity >= data_raw.shape[0]:
99
- st.error("Perplexity must be less than the number of points.")
100
- st.stop()
101
 
102
- embedding = run_tsne(data_raw, perplexity, seed)
103
- df_plot = pd.DataFrame(embedding, columns=["x", "y"])
 
 
 
 
104
 
105
- st.subheader("2-D embedding")
106
- fig = px.scatter(df_plot, x="x", y="y", width=700, height=500)
107
- fig.update_traces(marker=dict(size=10))
108
- fig.update_layout(margin=dict(l=20, r=20, t=30, b=20))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
  st.plotly_chart(fig, use_container_width=True)
110
 
111
- with st.expander("Show raw data"):
112
- st.write(pd.DataFrame(data_raw))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import io
2
  import textwrap
3
+ import itertools
4
 
5
  import numpy as np
6
  import pandas as pd
7
  import streamlit as st
8
+ from sklearn.manifold import TSNE, trustworthiness
9
+ from sklearn.decomposition import PCA
10
+ from sklearn.cluster import KMeans, DBSCAN
11
+ import umap.umap_ as umap
12
  import plotly.express as px
13
+ from sklearn.datasets import make_swiss_roll
14
 
15
+ # --- Example shapes (some generated on demand) --------------------------------
16
def generate_hypercube(n: int = 4) -> np.ndarray:
    """Return the vertices of the unit hypercube in ``n`` dimensions.

    Each vertex is one combination of 0/1 coordinates, listed in the
    order produced by ``itertools.product`` (last coordinate varies
    fastest).

    Args:
        n: Number of dimensions of the hypercube.

    Returns:
        A float array of shape ``(2**n, n)``.
    """
    vertices = list(itertools.product([0, 1], repeat=n))
    return np.array(vertices, dtype=float)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
def generate_simplex(n: int = 3) -> np.ndarray:
    """Return the ``n + 1`` vertices of an n-simplex embedded in n-D.

    The vertices are the ``n`` standard basis vectors followed by the
    origin (the origin is always the last row).

    Args:
        n: Dimension of the simplex and of the ambient space.

    Returns:
        A float array of shape ``(n + 1, n)``.
    """
    basis_vectors = np.eye(n, dtype=float)
    origin = np.zeros((1, n), dtype=float)
    return np.vstack([basis_vectors, origin])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
def generate_swiss_roll(n_samples=500, noise=0.05, random_state=0):
    """Sample a 3-D "Swiss roll" point cloud with scikit-learn.

    Args:
        n_samples: Number of points to draw.
        noise: Standard deviation of the Gaussian noise added by
            ``make_swiss_roll``.
        random_state: Seed forwarded to ``make_swiss_roll``. It defaults to
            a fixed value so that repeated zero-argument calls return the
            same cloud; pass ``None`` for a fresh random sample each call.

    Returns:
        The sampled points as returned by ``make_swiss_roll`` (the second
        return value, the position along the roll, is discarded).
    """
    points, _ = make_swiss_roll(
        n_samples=n_samples,
        noise=noise,
        random_state=random_state,
    )
    return points
28
 
29
# Built-in example point clouds, keyed by the label shown to the user.
# A value is either a ready-made float array of points or a zero-argument
# callable that produces one on demand, so consumers must call it first
# when it is callable.
EXAMPLE_SHAPES = {
    "Cube (3-D, 8 pts)": np.array(
        [
            [0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 1, 1],
            [1, 0, 0], [1, 0, 1], [1, 1, 0], [1, 1, 1],
        ],
        dtype=float,
    ),
    "Square pyramid (3-D, 5 pts)": np.array(
        [[-1, -1, 0], [1, -1, 0], [1, 1, 0], [-1, 1, 0], [0, 0, 1]],
        dtype=float,
    ),
    "4-D hypercube (16 pts)": generate_hypercube(4),
    "3-simplex (4 pts in 3-D)": generate_simplex(3),
    # Stored as a callable: the roll is only sampled when it is selected.
    "Swiss roll (500 pts, 3-D)": generate_swiss_roll,
}
41
 
42
+ # --- Parsing & embedding -----------------------------------------------------
43
def parse_text_points(text: str) -> np.ndarray:
    """Parse a block of text into an array of points, one point per line.

    Coordinates on a line may be separated by commas, whitespace, or a mix
    of both. Lines that contain no values (blank, or separators only) are
    skipped.

    Args:
        text: Raw text, e.g. pasted by the user or read from a CSV/TXT file.

    Returns:
        A float array of shape ``(n_points, n_dims)``. Text without any
        values yields an empty 1-D array.

    Raises:
        ValueError: If a token is not a number, or if a line does not have
            the same number of coordinates as the first data line. The
            message names the offending (1-based) line.
    """
    points = []
    n_dims = None
    for line_no, line in enumerate(text.splitlines(), start=1):
        tokens = line.replace(",", " ").split()
        if not tokens:
            continue
        try:
            coords = [float(token) for token in tokens]
        except ValueError as exc:
            raise ValueError(
                f"Line {line_no}: could not parse {line.strip()!r} as numbers."
            ) from exc
        if n_dims is None:
            n_dims = len(coords)
        elif len(coords) != n_dims:
            # Ragged rows cannot form a rectangular array; fail with a
            # message the user can act on instead of an opaque NumPy error.
            raise ValueError(
                f"Line {line_no}: expected {n_dims} coordinates "
                f"but found {len(coords)}."
            )
        points.append(coords)
    return np.array(points, dtype=float)
49
+
50
def run_tsne(data, perp, seed):
    """Embed ``data`` into 2-D with t-SNE (PCA initialisation).

    Args:
        data: Array-like of points, one row per point.
        perp: Requested perplexity. scikit-learn requires the perplexity to
            be smaller than the number of points, so the value is capped at
            ``n_points - 1`` for small inputs instead of letting the fit fail.
        seed: Value forwarded as ``random_state`` for reproducibility.

    Returns:
        A ``(embedding, kl_divergence)`` tuple: the 2-D coordinates and the
        final KL divergence reported by the fitted model.
    """
    n_points = np.asarray(data).shape[0]
    if n_points > 1:
        perp = min(perp, n_points - 1)
    model = TSNE(n_components=2, perplexity=perp, random_state=seed, init="pca")
    embedding = model.fit_transform(data)
    return embedding, model.kl_divergence_
54
+
55
def run_pca(data):
    """Project ``data`` onto its first two principal components.

    Args:
        data: Array-like of points, one row per point.

    Returns:
        A ``(embedding, None)`` tuple. The second element is a placeholder
        so the return shape matches ``run_tsne``, which reports a KL
        divergence there.
    """
    model = PCA(n_components=2)
    embedding = model.fit_transform(data)
    return embedding, None
58
+
59
def run_umap(data, n_neighbors, min_dist, seed):
    """Embed ``data`` into 2-D with UMAP.

    Args:
        data: Array-like of points, one row per point.
        n_neighbors: Forwarded to ``umap.UMAP(n_neighbors=...)``.
        min_dist: Forwarded to ``umap.UMAP(min_dist=...)``.
        seed: Value forwarded as ``random_state`` for reproducibility.

    Returns:
        A ``(embedding, None)`` tuple. The second element is a placeholder
        so the return shape matches ``run_tsne``, which reports a KL
        divergence there.
    """
    reducer = umap.UMAP(
        n_components=2,
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        random_state=seed,
    )
    embedding = reducer.fit_transform(data)
    return embedding, None
63
+
64
# --- Streamlit App -----------------------------------------------------------
st.set_page_config(layout="wide")
st.title("🌀 Dimensionality Reduction Explorer")
st.write("""
Upload or paste your n-D points, pick an algorithm (t-SNE/PCA/UMAP),
optionally cluster, and see the 2-D embedding.
""")

# Sidebar ─────────────────────────────────────────────────────────────────────
# Collects the input points (`data_raw`), the reducer choice and its
# parameters, the optional clustering settings, and the run button state.
with st.sidebar:
    st.header("1️⃣ Data Input")
    mode = st.radio("Source", ["Example shape", "Upload CSV/TXT", "Paste text"])
    if mode == "Example shape":
        key = st.selectbox("Choose example", list(EXAMPLE_SHAPES.keys()))
        src = EXAMPLE_SHAPES[key]
        # Some examples are stored as generator callables rather than arrays.
        data_raw = src() if callable(src) else src
    elif mode == "Upload CSV/TXT":
        up = st.file_uploader("Upload file", type=["csv", "txt"])
        if not up:
            # Nothing uploaded yet: halt this run until the user provides a file.
            st.stop()
        try:
            # "utf-8-sig" also accepts plain UTF-8 and drops a leading BOM,
            # which would otherwise make the first number unparseable.
            txt = up.getvalue().decode("utf-8-sig")
            data_raw = parse_text_points(txt)
        except ValueError as exc:  # UnicodeDecodeError is a ValueError too
            st.error(f"Could not read the uploaded file: {exc}")
            st.stop()
    else:
        placeholder = "e.g.\n0,0,0\n0,0,1\n0,1,0\n..."
        txt = st.text_area("Paste coordinates", height=200, placeholder=placeholder)
        if not txt.strip():
            st.stop()
        try:
            data_raw = parse_text_points(txt)
        except ValueError as exc:
            st.error(f"Could not parse the pasted coordinates: {exc}")
            st.stop()

    st.header("2️⃣ Algorithm & Params")
    algo = st.selectbox("Method", ["t-SNE", "PCA", "UMAP"])
    seed = st.number_input("Random seed", value=42, step=1)

    # Method-specific parameters; PCA has none.
    if algo == "t-SNE":
        perp = st.slider("Perplexity", 5.0, 50.0, 30.0, 1.0)
    elif algo == "UMAP":
        neighbors = st.slider("n_neighbors", 5, 200, 15, 5)
        min_dist = st.slider("min_dist", 0.0, 0.99, 0.1, 0.01)

    st.header("3️⃣ Clustering (optional)")
    do_cluster = st.checkbox("Cluster embedding")
    if do_cluster:
        cluster_algo = st.selectbox("Algorithm", ["KMeans", "DBSCAN"])
        if cluster_algo == "KMeans":
            n_clusters = st.slider("n_clusters", 2, 10, 3, 1)
        else:
            eps = st.slider("DBSCAN eps", 0.1, 5.0, 0.5, 0.1)

    st.markdown("---")
    run = st.button("Run & Visualize 🚀")
 
117
 
118
# Main ────────────────────────────────────────────────────────────────────────
# Runs the selected reducer on the sidebar's input, optionally clusters the
# 2-D embedding, then renders the plot, quality metrics and a CSV download.
if run:
    pts = data_raw
    if pts.ndim != 2 or pts.shape[0] < 2:
        st.error("Need at least two points in an (n_pts × n_dims) array.")
        st.stop()
    n_pts = pts.shape[0]

    # Run the chosen reducer. Invalid parameter/data combinations surface as
    # ValueError from the libraries; report them instead of a raw traceback.
    failure = None
    try:
        if algo == "t-SNE":
            # scikit-learn requires perplexity < number of points, and the
            # slider's minimum (5) already exceeds that for tiny examples.
            max_perp = float(n_pts - 1)
            if perp > max_perp:
                st.warning(
                    f"Perplexity {perp:g} is too large for {n_pts} points; "
                    f"using {max_perp:g} instead."
                )
                perp = max_perp
            emb, kl = run_tsne(pts, perp, seed)
        elif algo == "PCA":
            emb, kl = run_pca(pts)
        else:
            emb, kl = run_umap(pts, neighbors, min_dist, seed)
    except ValueError as exc:
        failure = exc
    if failure is not None:
        st.error(f"{algo} could not embed this data: {failure}")
        st.stop()

    # Trustworthiness needs n_neighbors < n_pts / 2, so shrink k for small
    # inputs and skip the metric when no valid k exists (two points).
    tw_k = min(5, (n_pts - 1) // 2)
    tw = trustworthiness(pts, emb, n_neighbors=tw_k) if tw_k >= 1 else None

    # Optional clustering, performed on the 2-D embedding.
    df = pd.DataFrame(emb, columns=["x", "y"])
    if do_cluster:
        if cluster_algo == "KMeans":
            # KMeans cannot form more clusters than there are points.
            k_clusters = min(n_clusters, n_pts)
            labels = KMeans(n_clusters=k_clusters, random_state=seed).fit_predict(emb)
        else:
            labels = DBSCAN(eps=eps).fit_predict(emb)
        # String labels make Plotly use a discrete colour per cluster.
        df["cluster"] = labels.astype(str)
        fig = px.scatter(
            df, x="x", y="y", color="cluster",
            title=f"{algo} embedding with {cluster_algo}", width=700, height=500,
        )
    else:
        fig = px.scatter(
            df, x="x", y="y",
            title=f"{algo} embedding", width=700, height=500,
        )

    fig.update_traces(marker=dict(size=8))
    fig.update_layout(margin=dict(l=20, r=20, t=40, b=20))

    # Display
    st.subheader("2-D Embedding")
    st.plotly_chart(fig, use_container_width=True)

    if tw is not None:
        st.markdown(f"**Trustworthiness (k={tw_k}):** {tw:.3f}")
    if kl is not None:
        st.markdown(f"**t-SNE KL divergence:** {kl:.3f}")

    # Download the embedding (and cluster labels, if any) as CSV.
    csv = df.to_csv(index=False).encode("utf-8")
    st.download_button(
        "Download embedding as CSV",
        data=csv,
        file_name="embedding.csv",
        mime="text/csv",
    )

    # Raw data expander
    with st.expander("Show original data"):
        st.write(pts)

    # t-SNE math explainer. Streamlit renders LaTeX only inside `$...$`
    # (inline) or `$$...$$` (display) delimiters.
    if algo == "t-SNE":
        with st.expander("🧠 How t-SNE works"):
            st.markdown(textwrap.dedent(r"""
                1. **High-D similarities**
                Convert pairwise distances $d_{ij}$ into conditional probabilities
                $$
                p_{j|i} = \frac{\exp\!\bigl(-\|x_i - x_j\|^2 / 2\sigma_i^2\bigr)}
                {\sum_{k\neq i}\exp\!\bigl(-\|x_i - x_k\|^2 / 2\sigma_i^2\bigr)}
                $$
                then symmetrize to $p_{ij}=\frac{p_{j|i}+p_{i|j}}{2n}$.

                2. **Low-D affinities**
                In 2-D we use a Student-t kernel:
                $$
                q_{ij} = \frac{\bigl(1 + \|y_i - y_j\|^2\bigr)^{-1}}
                {\sum_{k\neq l}\bigl(1 + \|y_k - y_l\|^2\bigr)^{-1}}
                $$

                3. **Minimize KL divergence**
                Find $\{y_i\}$ to minimize
                $$
                KL(P\|Q)
                = \sum_{i\neq j} p_{ij}\,\log\frac{p_{ij}}{q_{ij}}
                $$
                via gradient descent—preserving local structure while pushing dissimilar points apart.

                **Key parameter – perplexity**
                Controls each $\sigma_i$ by solving
                $\mathrm{Perp}(p_{i\cdot})=2^{-\sum_j p_{j|i}\log_2 p_{j|i}}$,
                intuitively setting an “effective # neighbors” (5–50 typical).
                """))