Spaces:

euler314
/

tsne

Sleeping

App Files Files Community

euler314 committed on Jun 13

Commit

599c56e

verified ·

1 Parent(s): f953fb1

Update app.py

Browse files

Files changed (1) hide show

app.py +109 -96

app.py CHANGED Viewed

@@ -1,6 +1,8 @@
 import io
 import textwrap
 import itertools
 import numpy as np
 import pandas as pd
@@ -10,9 +12,10 @@ from sklearn.decomposition import PCA
 from sklearn.cluster import KMeans, DBSCAN
 import umap.umap_ as umap
 import plotly.express as px
 from sklearn.datasets import make_swiss_roll
-# --- Example shapes (some generated on demand) --------------------------------
 def generate_hypercube(n=4):
     return np.array(list(itertools.product([0, 1], repeat=n)), dtype=float)
@@ -26,19 +29,19 @@ def generate_swiss_roll(n_samples=500, noise=0.05):
     return X
 EXAMPLE_SHAPES = {
-    "Cube (3-D, 8 pts)": np.array([
         [0,0,0],[0,0,1],[0,1,0],[0,1,1],
         [1,0,0],[1,0,1],[1,1,0],[1,1,1]
     ], dtype=float),
-    "Square pyramid (3-D, 5 pts)": np.array([
         [-1,-1,0],[1,-1,0],[1,1,0],[-1,1,0],[0,0,1]
     ], dtype=float),
-    "4-D hypercube (16 pts)": generate_hypercube(4),
-    "3-simplex (4 pts in 3-D)": generate_simplex(3),
-    "Swiss roll (500 pts, 3-D)": generate_swiss_roll,
 }
-# --- Parsing & embedding -----------------------------------------------------
 def parse_text_points(text: str) -> np.ndarray:
     txt = textwrap.dedent(text.strip())
     rows = [r for r in txt.splitlines() if r.strip()]
@@ -59,41 +62,54 @@ def run_umap(data, n_neighbors, min_dist, seed):
                   min_dist=min_dist, random_state=seed)
     return um.fit_transform(data), None
-# --- Streamlit App -----------------------------------------------------------
 st.set_page_config(layout="wide")
 st.title("🌀 Dimensionality Reduction Explorer")
 st.write("""
-Upload or paste your n-D points, pick an algorithm (t-SNE/PCA/UMAP),
-optionally cluster, and see the 2-D embedding.
 """)
-# Sidebar ─────────────────────────────────────────────────────────────────────
 with st.sidebar:
     st.header("1️⃣ Data Input")
-    mode = st.radio("Source", ["Example shape","Upload CSV/TXT","Paste text"])
     if mode == "Example shape":
         key = st.selectbox("Choose example", list(EXAMPLE_SHAPES.keys()))
         src = EXAMPLE_SHAPES[key]
         data_raw = src() if callable(src) else src
     elif mode == "Upload CSV/TXT":
-        up = st.file_uploader("Upload file", type=["csv","txt"])
-        if up:
-            txt = io.StringIO(up.getvalue().decode("utf-8")).read()
-            data_raw = parse_text_points(txt)
-        else:
             st.stop()
-    else:
         placeholder = "e.g.\n0,0,0\n0,0,1\n0,1,0\n..."
         txt = st.text_area("Paste coordinates", height=200, placeholder=placeholder)
         if not txt.strip():
             st.stop()
         data_raw = parse_text_points(txt)
     st.header("2️⃣ Algorithm & Params")
-    algo = st.selectbox("Method", ["t-SNE","PCA","UMAP"])
     seed = st.number_input("Random seed", value=42, step=1)
-    if algo == "t-SNE":
         perp = st.slider("Perplexity", 1.0, 50.0, 30.0, 1.0)
     elif algo == "UMAP":
         neighbors = st.slider("n_neighbors", 5, 200, 15, 5)
@@ -102,7 +118,7 @@ with st.sidebar:
     st.header("3️⃣ Clustering (optional)")
     do_cluster = st.checkbox("Cluster embedding")
     if do_cluster:
-        cluster_algo = st.selectbox("Algorithm", ["KMeans","DBSCAN"])
         if cluster_algo == "KMeans":
             n_clusters = st.slider("n_clusters", 2, 10, 3, 1)
         else:
@@ -111,98 +127,95 @@ with st.sidebar:
     st.markdown("---")
     run = st.button("Run & Visualize 🚀")
-# Main ────────────────────────────────────────────────────────────────────────
-if run:
-    pts = data_raw
     if pts.ndim != 2 or pts.shape[0] < 2:
-        st.error("Need at least two points in an (n_pts × n_dims) array.")
-        st.stop()
-    # run chosen reducer
-    if algo == "t-SNE":
         emb, kl = run_tsne(pts, perp, seed)
     elif algo == "PCA":
         emb, kl = run_pca(pts)
     else:
         emb, kl = run_umap(pts, neighbors, min_dist, seed)
-    # dynamic trustworthiness
     n_samples = pts.shape[0]
     k_max = (n_samples - 1) // 2
-    if k_max >= 1:
-        tw = trustworthiness(pts, emb, n_neighbors=k_max)
-    else:
-        tw = None
-    # clustering & plotting
-    df = pd.DataFrame(emb, columns=["x","y"])
     if do_cluster:
         if cluster_algo == "KMeans":
             labels = KMeans(n_clusters=n_clusters, random_state=seed).fit_predict(emb)
         else:
             labels = DBSCAN(eps=eps).fit_predict(emb)
         df["cluster"] = labels.astype(str)
-        fig = px.scatter(df, x="x", y="y", color="cluster",
-                         title=f"{algo} embedding with {cluster_algo}", width=700, height=500)
-    else:
-        fig = px.scatter(df, x="x", y="y",
-                         title=f"{algo} embedding", width=700, height=500)
-    fig.update_traces(marker=dict(size=8))
-    fig.update_layout(margin=dict(l=20, r=20, t=40, b=20))
-    # display
-    st.subheader("2-D Embedding")
-    st.plotly_chart(fig, use_container_width=True)
-    if tw is not None:
-        st.markdown(f"**Trustworthiness (k={k_max}):** {tw:.3f}")
-    else:
-        st.markdown("**Trustworthiness:** Not enough samples to compute (need ≥3 points).")
-    if kl is not None:
-        st.markdown(f"**t-SNE KL divergence:** {kl:.3f}")
-    # download CSV
-    csv = df.to_csv(index=False).encode("utf-8")
-    st.download_button(
-        "Download embedding as CSV",
-        data=csv,
-        file_name="embedding.csv",
-        mime="text/csv"
-    )
-    with st.expander("Show original data"):
-        st.write(pts)
-    if algo == "t-SNE":
-        with st.expander("🧠 How t-SNE works"):
-            st.markdown(r"""
-1. **High-D similarities**
-   Convert pairwise distances \(d_{ij}\) into conditional probabilities
-   \[
-     p_{j|i} = \frac{\exp\!\bigl(-\|x_i - x_j\|^2 / 2\sigma_i^2\bigr)}
-                     {\sum_{k\neq i}\exp\!\bigl(-\|x_i - x_k\|^2 / 2\sigma_i^2\bigr)}
-   \]
-   then symmetrize to \(p_{ij}=(p_{j|i}+p_{i|j})/2n\).
-2. **Low-D affinities**
-   In 2-D we use a Student-t kernel:
-   \[
-     q_{ij} = \frac{\bigl(1 + \|y_i - y_j\|^2\bigr)^{-1}}
-                  {\sum_{k\neq l}\bigl(1 + \|y_k - y_l\|^2\bigr)^{-1}}
-   \]
-3. **Minimize KL divergence**
-   Find \(\{y_i\}\) to minimize
-   \[
-     KL(P\|Q)
-     = \sum_{i\neq j} p_{ij}\,\log\frac{p_{ij}}{q_{ij}}
-   \]
-   via gradient descent—preserving local structure while pushing dissimilar points apart.
-**Key parameter – perplexity**
-Controls each \(\sigma_i\) by solving
-\(\mathrm{Perp}(p_{i\cdot})=2^{-\sum_j p_{j|i}\log_2 p_{j|i}}\),
-intuitively setting an “effective # neighbors” (5–50 typical).
-            """)

 import io
 import textwrap
 import itertools
+import zipfile
+from typing import List, Tuple
 import numpy as np
 import pandas as pd
 from sklearn.cluster import KMeans, DBSCAN
 import umap.umap_ as umap
 import plotly.express as px
+from scipy.spatial.distance import cdist
 from sklearn.datasets import make_swiss_roll
+# ── Example shapes (some generated on demand) ────────────────────────────────
 def generate_hypercube(n=4):
     """Return the 2**n corner points of the unit n-cube as a float array.

     Rows follow ``itertools.product`` order, i.e. the last coordinate
     varies fastest; the result has one row per corner and ``n`` columns.
     """
     corners = itertools.product((0, 1), repeat=n)
     return np.array([list(corner) for corner in corners], dtype=float)
     return X
+# Maps the label shown in the "Choose example" selectbox to its point set.
+# A value is either a ready-made (n_pts, n_dims) float array or a zero-argument
+# callable that builds one when selected (the sidebar calls it if callable).
 EXAMPLE_SHAPES = {
+    "Cube (3‑D, 8 pts)": np.array([
         [0,0,0],[0,0,1],[0,1,0],[0,1,1],
         [1,0,0],[1,0,1],[1,1,0],[1,1,1]
     ], dtype=float),
+    "Square pyramid (3‑D, 5 pts)": np.array([
         [-1,-1,0],[1,-1,0],[1,1,0],[-1,1,0],[0,0,1]
     ], dtype=float),
+    "4‑D hypercube (16 pts)": generate_hypercube(4),
+    "3‑simplex (4 pts in 3‑D)": generate_simplex(3),
+    # Passed uncalled so the swiss roll is only generated on demand.
+    "Swiss roll (500 pts, 3‑D)": generate_swiss_roll,
 }
+# ── Helpers ──────────────────────────────────────────────────────────────────
 def parse_text_points(text: str) -> np.ndarray:
     txt = textwrap.dedent(text.strip())
     rows = [r for r in txt.splitlines() if r.strip()]
                   min_dist=min_dist, random_state=seed)
     return um.fit_transform(data), None
+# ── Streamlit UI ─────────────────────────────────────────────────────────────
 st.set_page_config(layout="wide")
 st.title("🌀 Dimensionality Reduction Explorer")
 st.write("""
+Upload **one or many** CSV/TXT files *or* use the other sources, pick an algorithm,
+(optionally cluster), and explore the 2‑D embedding.  Each result is downloadable
+with a full pair‑wise distance table.
 """)
+# Sidebar ────────────────────────────────────────────────────────────────────
 with st.sidebar:
     st.header("1️⃣ Data Input")
+    mode = st.radio("Source", ["Example shape", "Upload CSV/TXT", "Paste text"])
+    datasets: List[Tuple[str, np.ndarray]] = []
     if mode == "Example shape":
         key = st.selectbox("Choose example", list(EXAMPLE_SHAPES.keys()))
         src = EXAMPLE_SHAPES[key]
         data_raw = src() if callable(src) else src
+        datasets.append((key.replace(" ", "_"), data_raw))
     elif mode == "Upload CSV/TXT":
+        uploads = st.file_uploader(
+            "Upload one **or many** files",
+            type=["csv", "txt"],
+            accept_multiple_files=True
+        )
+        if not uploads:
             st.stop()
+        for up in uploads:
+            txt = io.StringIO(up.getvalue().decode("utf-8")).read()
+            pts = parse_text_points(txt)
+            datasets.append((up.name.rsplit(".", 1)[0], pts))
+    else:  # Paste text
         placeholder = "e.g.\n0,0,0\n0,0,1\n0,1,0\n..."
         txt = st.text_area("Paste coordinates", height=200, placeholder=placeholder)
         if not txt.strip():
             st.stop()
         data_raw = parse_text_points(txt)
+        datasets.append(("pasted_points", data_raw))
     st.header("2️⃣ Algorithm & Params")
+    algo = st.selectbox("Method", ["t‑SNE", "PCA", "UMAP"])
     seed = st.number_input("Random seed", value=42, step=1)
+    if algo == "t‑SNE":
         perp = st.slider("Perplexity", 1.0, 50.0, 30.0, 1.0)
     elif algo == "UMAP":
         neighbors = st.slider("n_neighbors", 5, 200, 15, 5)
     st.header("3️⃣ Clustering (optional)")
     do_cluster = st.checkbox("Cluster embedding")
     if do_cluster:
+        cluster_algo = st.selectbox("Algorithm", ["KMeans", "DBSCAN"])
         if cluster_algo == "KMeans":
             n_clusters = st.slider("n_clusters", 2, 10, 3, 1)
         else:
     st.markdown("---")
     run = st.button("Run & Visualize 🚀")
+# ── Main processing ─────────────────────────────────────────────────────────
+def process_dataset(name: str, pts: np.ndarray):
+    """Embed one dataset in 2-D and build its downloadable result table.
+
+    The reducer, clustering choice and their parameters are read from the
+    sidebar selections held in module scope (``algo``, ``seed``,
+    ``do_cluster`` and the per-algorithm settings).
+
+    Args:
+        name: Dataset label, used in the messages shown to the user.
+        pts: Input points; must be a 2-D array with at least two rows.
+
+    Returns:
+        ``(out_df, stats)``. ``out_df`` has the embedding columns ``x`` and
+        ``y``, a ``cluster`` column when clustering is enabled, and one
+        ``dist_<i>`` column per point holding the Euclidean distances
+        between embedded points. ``stats`` is a dict with the keys ``kl``,
+        ``tw`` and ``k_max``. ``(None, None)`` is returned, after an error
+        has been displayed, when ``pts`` is not a usable 2-D array.
+    """
+    if pts.ndim != 2 or pts.shape[0] < 2:
+        st.error(f"Dataset **{name}** needs at least two points in an (n_pts × n_dims) array.")
+        return None, None
+
+    # Dimensionality reduction
+    if algo == "t‑SNE":
+        emb, kl = run_tsne(pts, perp, seed)
+    elif algo == "PCA":
+        emb, kl = run_pca(pts)
+    else:
+        emb, kl = run_umap(pts, neighbors, min_dist, seed)
+
+    # Trustworthiness, evaluated at k = (n_samples - 1) // 2 and skipped
+    # when that leaves no neighbours to compare.
+    n_samples = pts.shape[0]
+    k_max = (n_samples - 1) // 2
+    tw = trustworthiness(pts, emb, n_neighbors=k_max) if k_max >= 1 else None
+
+    # DataFrame for embedding
+    df = pd.DataFrame(emb, columns=["x", "y"])
+
+    # Clustering
+    if do_cluster:
+        if cluster_algo == "KMeans":
+            # KMeans rejects n_clusters > n_samples. The input guard above
+            # admits datasets as small as two points while the slider goes
+            # up to 10, so cap the request instead of aborting the batch.
+            k = min(n_clusters, n_samples)
+            if k < n_clusters:
+                st.warning(
+                    f"Dataset **{name}** has only {n_samples} points; "
+                    f"using {k} clusters instead of {n_clusters}."
+                )
+            labels = KMeans(n_clusters=k, random_state=seed).fit_predict(emb)
+        else:
+            labels = DBSCAN(eps=eps).fit_predict(emb)
+        df["cluster"] = labels.astype(str)
+
+    # Pair-wise distances in the embedding, one column per point
+    dist_matrix = cdist(emb, emb, metric="euclidean")
+    dist_df = pd.DataFrame(dist_matrix,
+                           columns=[f"dist_{i}" for i in range(n_samples)])
+    out_df = pd.concat([df, dist_df], axis=1)
+    return out_df, {"kl": kl, "tw": tw, "k_max": k_max}
+# Runs only on the script pass triggered by the "Run & Visualize" button.
+# NOTE(review): the download buttons below are created inside this branch, so
+# they presumably disappear on the rerun that follows a click — confirm and
+# consider keeping results in st.session_state.
+if run:
+    # (file stem, CSV bytes) for every dataset that was processed successfully
+    results: List[Tuple[str, bytes]] = []
+    used_stems = set()
+    for idx, (name, pts) in enumerate(datasets):
+        st.subheader(f"📂 Dataset: {name}")
+        out_df, stats = process_dataset(name, pts)
+        if out_df is None:
+            continue
+
+        # Scatter plot
+        color_arg = "cluster" if ("cluster" in out_df.columns) else None
+        fig = px.scatter(out_df, x="x", y="y", color=color_arg,
+                         title=f"{algo} embedding ({name})",
+                         width=700, height=500)
+        fig.update_traces(marker=dict(size=8))
+        fig.update_layout(margin=dict(l=20, r=20, t=40, b=20))
+        st.plotly_chart(fig, use_container_width=True)
+
+        # Stats
+        if stats["tw"] is not None:
+            st.markdown(f"**Trustworthiness (k={stats['k_max']}):** {stats['tw']:.3f}")
+        else:
+            st.markdown("**Trustworthiness:** Not enough samples to compute (need ≥ 3 points).")
+        if stats["kl"] is not None:
+            st.markdown(f"**t‑SNE KL divergence:** {stats['kl']:.3f}")
+
+        # Distance matrix preview
+        with st.expander("🔍 Show pair‑wise distance matrix"):
+            st.dataframe(out_df.filter(like="dist_"))
+
+        # Canonical ASCII codec name (the original spelled it with U+2011).
+        csv_bytes = out_df.to_csv(index=False).encode("utf-8")
+
+        # Uploads such as "a.csv" and "a.txt" share the stem "a"; give each
+        # result its own stem so file names and ZIP entries never collide.
+        stem = name if name not in used_stems else f"{name}_{idx}"
+        used_stems.add(stem)
+
+        # Download CSV for this dataset. The explicit key keeps widgets with
+        # identical labels (same dataset name) distinct for Streamlit.
+        st.download_button(
+            f"Download embedding + distances ({name})",
+            data=csv_bytes,
+            file_name=f"{stem}_embedding_with_distances.csv",
+            mime="text/csv",
+            key=f"download_csv_{idx}",
+        )
+
+        # Keep for ZIP if batch
+        results.append((stem, csv_bytes))
+
+    # One-click ZIP if multiple datasets
+    if len(results) >= 2:
+        zip_buf = io.BytesIO()
+        with zipfile.ZipFile(zip_buf, "w", zipfile.ZIP_DEFLATED) as zf:
+            for nm, csv_b in results:
+                zf.writestr(f"{nm}_embedding_with_distances.csv", csv_b)
+        st.download_button(
+            "📦 Download **all** results as ZIP",
+            data=zip_buf.getvalue(),
+            file_name="all_embeddings_with_distances.zip",
+            mime="application/zip",
+            key="download_zip_all",
+        )