Spaces:

euler314
/

tsne

Sleeping

App Files Community

euler314 committed on May 21

Commit

4a851e9

verified ·

1 Parent(s): b6da946

Update app.py

Browse files

Files changed (1) hide show

app.py +180 -87

app.py CHANGED Viewed

@@ -1,112 +1,205 @@
 import io
 import textwrap
 import numpy as np
 import pandas as pd
 import streamlit as st
-from sklearn.manifold import TSNE
 import plotly.express as px
-# --------------  Helper functions -------------------------------------------
-EXAMPLE_SHAPES = {
-    "Cube (3-D, 8 vertices)": np.array([
-        [0, 0, 0], [0, 0, 1],
-        [0, 1, 0], [0, 1, 1],
-        [1, 0, 0], [1, 0, 1],
-        [1, 1, 0], [1, 1, 1]
-    ]),
-    "Square pyramid (3-D, 5 vertices)": np.array([
-        [-1, -1,  0],
-        [ 1, -1,  0],
-        [ 1,  1,  0],
-        [-1,  1,  0],
-        [ 0,  0,  1]
-    ])
-}
-def parse_text_points(text: str) -> np.ndarray:
-    """
-    Parse a multiline string of comma- or whitespace-separated numbers
-    into an (n_points, n_dims) array.
-    """
-    cleaned = textwrap.dedent(text.strip())
-    rows = [row for row in cleaned.splitlines() if row.strip()]
-    data = [list(map(float, row.replace(",", " ").split())) for row in rows]
-    return np.array(data, dtype=float)
-def run_tsne(data: np.ndarray, perplexity: float, seed: int) -> np.ndarray:
-    tsne = TSNE(
-        n_components=2,
-        perplexity=perplexity,
-        random_state=seed,
-        init="pca"
-    )
-    return tsne.fit_transform(data)
-# ---------------------------------------------------------------------------
-st.title("🌀 t-SNE Explorer for n-D Point Clouds")
-st.markdown(
-    """
-    Upload or paste your points, choose parameters, and see how
-    **t-SNE** flattens them into 2-D.
-    *Example shapes* are provided for quick experimentation.
-    """
-)
-# --- Sidebar controls -------------------------------------------------------
 with st.sidebar:
-    st.header("1️⃣  Choose data source")
-    source = st.radio(
-        "Data input method",
-        ["Example shape", "Upload CSV/TXT", "Paste raw text"]
-    )
-    if source == "Example shape":
-        shape_key = st.selectbox("Pick a shape", list(EXAMPLE_SHAPES.keys()))
-        data_raw = EXAMPLE_SHAPES[shape_key]
-    elif source == "Upload CSV/TXT":
-        file = st.file_uploader("Upload coordinates file (*.csv / *.txt)")
-        if file:
-            text = io.StringIO(file.getvalue().decode("utf-8")).read()
-            data_raw = parse_text_points(text)
         else:
             st.stop()
-    else:  # Paste text
         placeholder = "e.g.\n0,0,0\n0,0,1\n0,1,0\n..."
-        text = st.text_area("Paste coordinates (one point per line)", height=200, placeholder=placeholder)
-        if not text.strip():
             st.stop()
-        data_raw = parse_text_points(text)
-    st.divider()
-    st.header("2️⃣  t-SNE parameters")
-    perplexity = st.slider("Perplexity", 5.0, 50.0, 30.0, 1.0)
     seed = st.number_input("Random seed", value=42, step=1)
-    run_button = st.button("Run t-SNE 🚀")
-# --- Main area --------------------------------------------------------------
-if run_button:
-    if data_raw.ndim != 2 or data_raw.shape[0] < 2:
-        st.error("Need at least two points; check your input.")
-        st.stop()
-    if perplexity >= data_raw.shape[0]:
-        st.error("Perplexity must be less than the number of points.")
-        st.stop()
-    embedding = run_tsne(data_raw, perplexity, seed)
-    df_plot = pd.DataFrame(embedding, columns=["x", "y"])
-    st.subheader("2-D embedding")
-    fig = px.scatter(df_plot, x="x", y="y", width=700, height=500)
-    fig.update_traces(marker=dict(size=10))
-    fig.update_layout(margin=dict(l=20, r=20, t=30, b=20))
     st.plotly_chart(fig, use_container_width=True)
-    with st.expander("Show raw data"):
-        st.write(pd.DataFrame(data_raw))

 import io
 import textwrap
+import itertools
 import numpy as np
 import pandas as pd
 import streamlit as st
+from sklearn.manifold import TSNE, trustworthiness
+from sklearn.decomposition import PCA
+from sklearn.cluster import KMeans, DBSCAN
+import umap.umap_ as umap
 import plotly.express as px
+from sklearn.datasets import make_swiss_roll
+# --- Example shapes (some generated on demand) --------------------------------
+def generate_hypercube(n=4):
+    return np.array(list(itertools.product([0, 1], repeat=n)), dtype=float)
+def generate_simplex(n=3):
+    # n-simplex in n-D: standard basis vectors + origin
+    eye = np.eye(n, dtype=float)
+    origin = np.zeros((1, n), dtype=float)
+    return np.vstack([eye, origin])
+def generate_swiss_roll(n_samples=500, noise=0.05):
+    X, _ = make_swiss_roll(n_samples=n_samples, noise=noise)
+    return X
+EXAMPLE_SHAPES = {
+    "Cube (3-D, 8 pts)": np.array([
+        [0,0,0],[0,0,1],[0,1,0],[0,1,1],
+        [1,0,0],[1,0,1],[1,1,0],[1,1,1]
+    ], dtype=float),
+    "Square pyramid (3-D, 5 pts)": np.array([
+        [-1,-1,0],[1,-1,0],[1,1,0],[-1,1,0],[0,0,1]
+    ], dtype=float),
+    "4-D hypercube (16 pts)": generate_hypercube(4),
+    "3-simplex (4 pts in 3-D)": generate_simplex(3),
+    "Swiss roll (500 pts, 3-D)": generate_swiss_roll,
+}
+# --- Parsing & embedding -----------------------------------------------------
+def parse_text_points(text: str) -> np.ndarray:
+    txt = textwrap.dedent(text.strip())
+    rows = [r for r in txt.splitlines() if r.strip()]
+    data = [list(map(float, r.replace(",", " ").split())) for r in rows]
+    arr = np.array(data, dtype=float)
+    return arr
+def run_tsne(data, perp, seed):
+    ts = TSNE(n_components=2, perplexity=perp, random_state=seed, init="pca")
+    emb = ts.fit_transform(data)
+    return emb, ts.kl_divergence_
+def run_pca(data):
+    pca = PCA(n_components=2)
+    return pca.fit_transform(data), None
+def run_umap(data, n_neighbors, min_dist, seed):
+    um = umap.UMAP(n_components=2, n_neighbors=n_neighbors,
+                  min_dist=min_dist, random_state=seed)
+    return um.fit_transform(data), None
+# --- Streamlit App -----------------------------------------------------------
+st.set_page_config(layout="wide")
+st.title("🌀 Dimensionality Reduction Explorer")
+st.write("""
+Upload or paste your n-D points, pick an algorithm (t-SNE/PCA/UMAP),
+optionally cluster, and see the 2-D embedding.
+""")
+# Sidebar ─────────────────────────────────────────────────────────────────────
 with st.sidebar:
+    st.header("1️⃣ Data Input")
+    mode = st.radio("Source", ["Example shape","Upload CSV/TXT","Paste text"])
+    if mode == "Example shape":
+        key = st.selectbox("Choose example", list(EXAMPLE_SHAPES.keys()))
+        src = EXAMPLE_SHAPES[key]
+        data_raw = src() if callable(src) else src
+    elif mode == "Upload CSV/TXT":
+        up = st.file_uploader("Upload file", type=["csv","txt"])
+        if up:
+            txt = io.StringIO(up.getvalue().decode("utf-8")).read()
+            data_raw = parse_text_points(txt)
         else:
             st.stop()
+    else:
         placeholder = "e.g.\n0,0,0\n0,0,1\n0,1,0\n..."
+        txt = st.text_area("Paste coordinates", height=200, placeholder=placeholder)
+        if not txt.strip():
             st.stop()
+        data_raw = parse_text_points(txt)
+    st.header("2️⃣ Algorithm & Params")
+    algo = st.selectbox("Method", ["t-SNE","PCA","UMAP"])
     seed = st.number_input("Random seed", value=42, step=1)
+    # method-specific
+    if algo == "t-SNE":
+        perp = st.slider("Perplexity", 5.0, 50.0, 30.0, 1.0)
+    elif algo == "UMAP":
+        neighbors = st.slider("n_neighbors", 5, 200, 15, 5)
+        min_dist = st.slider("min_dist", 0.0, 0.99, 0.1, 0.01)
+    # PCA has no extra params
+    st.header("3️⃣ Clustering (optional)")
+    do_cluster = st.checkbox("Cluster embedding")
+    if do_cluster:
+        cluster_algo = st.selectbox("Algorithm", ["KMeans","DBSCAN"])
+        if cluster_algo == "KMeans":
+            n_clusters = st.slider("n_clusters", 2, 10, 3, 1)
+        else:
+            eps = st.slider("DBSCAN eps", 0.1, 5.0, 0.5, 0.1)
+    st.markdown("---")
+    run = st.button("Run & Visualize 🚀")
+# Main ────────────────────────────────────────────────────────────────────────
+if run:
+    pts = data_raw
+    if pts.ndim != 2 or pts.shape[0] < 2:
+        st.error("Need at least two points in an (n_pts × n_dims) array.")
+        st.stop()
+    # run chosen reducer
+    if algo == "t-SNE":
+        emb, kl = run_tsne(pts, perp, seed)
+    elif algo == "PCA":
+        emb, kl = run_pca(pts)
+    else:
+        emb, kl = run_umap(pts, neighbors, min_dist, seed)
+    # compute trustworthiness
+    tw = trustworthiness(pts, emb, n_neighbors=5)
+    # clustering
+    df = pd.DataFrame(emb, columns=["x","y"])
+    if do_cluster:
+        if cluster_algo == "KMeans":
+            labels = KMeans(n_clusters=n_clusters, random_state=seed).fit_predict(emb)
+        else:
+            labels = DBSCAN(eps=eps).fit_predict(emb)
+        df["cluster"] = labels.astype(str)
+        fig = px.scatter(df, x="x", y="y", color="cluster",
+                         title=f"{algo} embedding with {cluster_algo}", width=700, height=500)
+    else:
+        fig = px.scatter(df, x="x", y="y",
+                         title=f"{algo} embedding", width=700, height=500)
+    fig.update_traces(marker=dict(size=8))
+    fig.update_layout(margin=dict(l=20, r=20, t=40, b=20))
+    # display
+    st.subheader("2-D Embedding")
     st.plotly_chart(fig, use_container_width=True)
+    st.markdown(f"**Trustworthiness (k=5):** {tw:.3f}")
+    if kl is not None:
+        st.markdown(f"**t-SNE KL divergence:** {kl:.3f}")
+    # download CSV
+    csv = df.to_csv(index=False).encode("utf-8")
+    st.download_button(
+        "Download embedding as CSV",
+        data=csv,
+        file_name="embedding.csv",
+        mime="text/csv"
+    )
+    # raw data expander
+    with st.expander("Show original data"):
+        st.write(pts)
+    # t-SNE math explainer
+    if algo == "t-SNE":
+        with st.expander("🧠 How t-SNE works"):
+            st.markdown(r"""
+1. **High-D similarities**
+   Convert pairwise distances \(d_{ij}\) into conditional probabilities
+   \[
+     p_{j|i} = \frac{\exp\!\bigl(-\|x_i - x_j\|^2 / 2\sigma_i^2\bigr)}
+                     {\sum_{k\neq i}\exp\!\bigl(-\|x_i - x_k\|^2 / 2\sigma_i^2\bigr)}
+   \]
+   then symmetrize to \(p_{ij}=(p_{j|i}+p_{i|j})/2n\).
+2. **Low-D affinities**
+   In 2-D we use a Student-t kernel:
+   \[
+     q_{ij} = \frac{\bigl(1 + \|y_i - y_j\|^2\bigr)^{-1}}
+                  {\sum_{k\neq l}\bigl(1 + \|y_k - y_l\|^2\bigr)^{-1}}
+   \]
+3. **Minimize KL divergence**
+   Find \(\{y_i\}\) to minimize
+   \[
+     KL(P\|Q)
+     = \sum_{i\neq j} p_{ij}\,\log\frac{p_{ij}}{q_{ij}}
+   \]
+   via gradient descent—preserving local structure while pushing dissimilar points apart.
+**Key parameter – perplexity**
+Controls each \(\sigma_i\) by solving
+\(\mathrm{Perp}(p_{i\cdot})=2^{-\sum_j p_{j|i}\log_2 p_{j|i}}\),
+intuitively setting an “effective # neighbors” (5–50 typical).
+            """)