Update app.py
app.py (CHANGED)
```diff
@@ -1,21 +1,21 @@
 import io
-import textwrap
 import itertools
+import textwrap
 import zipfile
 from typing import List, Tuple
 
 import numpy as np
 import pandas as pd
-import streamlit as st
-from sklearn.manifold import TSNE, trustworthiness
-from sklearn.decomposition import PCA
-from sklearn.cluster import KMeans, DBSCAN
-import umap.umap_ as umap
 import plotly.express as px
+import streamlit as st
 from scipy.spatial.distance import cdist
+from sklearn.cluster import DBSCAN, KMeans
+from sklearn.decomposition import PCA
 from sklearn.datasets import make_swiss_roll
+from sklearn.manifold import TSNE, trustworthiness
+import umap.umap_ as umap
 
-# ── Example shapes
+# ── Example shapes ───────────────────────────────────────────────────────────
 def generate_hypercube(n=4):
     return np.array(list(itertools.product([0, 1], repeat=n)), dtype=float)
 
```
```diff
@@ -29,16 +29,16 @@ def generate_swiss_roll(n_samples=500, noise=0.05):
     return X
 
 EXAMPLE_SHAPES = {
-    "Cube (3‑D, 8 pts)": np.array([
+    "Cube (3-D, 8 pts)": np.array([
         [0,0,0],[0,0,1],[0,1,0],[0,1,1],
         [1,0,0],[1,0,1],[1,1,0],[1,1,1]
     ], dtype=float),
-    "Square pyramid (3‑D, 5 pts)": np.array([
+    "Square pyramid (3-D, 5 pts)": np.array([
         [-1,-1,0],[1,-1,0],[1,1,0],[-1,1,0],[0,0,1]
     ], dtype=float),
-    "4‑D hypercube (16 pts)": generate_hypercube(4),
-    "3‑simplex (4 pts in 3‑D)": generate_simplex(3),
-    "Swiss roll (500 pts, 3‑D)": generate_swiss_roll,
+    "4-D hypercube (16 pts)": generate_hypercube(4),
+    "3-simplex (4 pts in 3-D)": generate_simplex(3),
+    "Swiss roll (500 pts, 3-D)": generate_swiss_roll,
 }
 
 # ── Helpers ──────────────────────────────────────────────────────────────────
```
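Note: `generate_simplex`, `generate_swiss_roll`, and the `run_tsne`/`run_pca` helpers called later are defined in parts of app.py that these hunks do not show. A minimal sketch of what they plausibly look like, inferred from the `generate_swiss_roll` hunk header, the `make_swiss_roll` import, and the `(embedding, kl)` return convention; every body here is an assumption, not the file's actual code:

```python
import numpy as np
from sklearn.datasets import make_swiss_roll
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

def generate_simplex(n=3):
    # hypothetical: n+1 standard-basis vertices form a regular n-simplex in (n+1)-D
    return np.eye(n + 1, dtype=float)

def generate_swiss_roll(n_samples=500, noise=0.05):
    # signature matches the hunk header above; make_swiss_roll returns (X, t)
    X, _ = make_swiss_roll(n_samples=n_samples, noise=noise)
    return X

def run_tsne(data, perplexity, seed):
    ts = TSNE(n_components=2, perplexity=perplexity, random_state=seed)
    emb = ts.fit_transform(data)
    return emb, ts.kl_divergence_  # the KL value reported in the UI below

def run_pca(data):
    return PCA(n_components=2).fit_transform(data), None  # no quality metric for PCA
```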
```diff
@@ -62,17 +62,24 @@ def run_umap(data, n_neighbors, min_dist, seed):
                    min_dist=min_dist, random_state=seed)
     return um.fit_transform(data), None
 
+def distinct_count(dist_row: np.ndarray, tol: float = 1e-3) -> int:
+    """Count unique non-zero distances in a row after rounding to 3 decimals."""
+    nz = dist_row[dist_row > tol]
+    rounded = (nz * 1000).round().astype(int)  # rounding to 3 d.p.
+    return len(np.unique(rounded))
+
 # ── Streamlit UI ─────────────────────────────────────────────────────────────
 st.set_page_config(layout="wide")
 st.title("Dimensionality Reduction Explorer")
 
 st.write("""
-Upload **one or many** CSV/TXT files *or* use
-(optionally cluster), and explore the 2‑D embedding.
-
+Upload **one or many** CSV/TXT files *or* use an example shape, pick an algorithm,
+(optionally cluster), and explore the 2-D embedding.
+Every output CSV now contains the embedding, the original point coordinates,
+all pair-wise distances, **and** the number of distinct distances per point.
 """)
 
-# Sidebar
+# ── Sidebar ──────────────────────────────────────────────────────────────────
 with st.sidebar:
     st.header("1️⃣ Data Input")
     mode = st.radio("Source", ["Example shape", "Upload CSV/TXT", "Paste text"])
```
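`distinct_count` consumes one row of the pair-wise distance matrix at a time. A quick sanity check of its rounding behaviour (the numbers are made up for illustration):

```python
import numpy as np

row = np.array([0.0, 1.0, 1.0001, 2.5])  # hypothetical distances from one point
# the 0.0 self-distance is dropped by the tol filter, and 1.0 vs 1.0001
# collide once rounded to 3 decimals, leaving two distinct values
assert distinct_count(row) == 2
```

Rounding is hard-coded to 3 decimal places; `tol` only filters out the zero self-distance.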
```diff
@@ -87,9 +94,7 @@ with st.sidebar:
 
     elif mode == "Upload CSV/TXT":
         uploads = st.file_uploader(
-            "Upload
-            type=["csv", "txt"],
-            accept_multiple_files=True
+            "Upload file(s)", type=["csv", "txt"], accept_multiple_files=True
         )
         if not uploads:
             st.stop()
```
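The loop that turns `uploads` into entries of `datasets` falls between this hunk and the next, so it is not shown. A sketch of its likely shape, assuming each file holds one point per row; the function name `read_uploads` and the bare-numeric `pd.read_csv` call are guesses, not the file's code:

```python
import pandas as pd

def read_uploads(uploads, datasets):
    # hypothetical: parse each Streamlit UploadedFile as bare numeric rows
    for up in uploads:
        df_in = pd.read_csv(up, header=None)
        name = up.name.rsplit(".", 1)[0]  # strip the extension for the label
        datasets.append((name, df_in.to_numpy(dtype=float)))
```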
```diff
@@ -102,14 +107,13 @@ with st.sidebar:
         txt = st.text_area("Paste coordinates", height=200, placeholder=placeholder)
         if not txt.strip():
             st.stop()
-
-        datasets.append(("pasted_points", data_raw))
+        datasets.append(("pasted_points", parse_text_points(txt)))
 
     st.header("2️⃣ Algorithm & Params")
-    algo = st.selectbox("Method", ["t‑SNE", "PCA", "UMAP"])
+    algo = st.selectbox("Method", ["t-SNE", "PCA", "UMAP"])
     seed = st.number_input("Random seed", value=42, step=1)
 
-    if algo == "t‑SNE":
+    if algo == "t-SNE":
         perp = st.slider("Perplexity", 1.0, 50.0, 30.0, 1.0)
     elif algo == "UMAP":
         neighbors = st.slider("n_neighbors", 5, 200, 15, 5)
```
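`parse_text_points` is referenced here but defined outside the hunks shown. A minimal sketch under the assumption that pasted input is one point per line, with comma- or whitespace-separated coordinates:

```python
import numpy as np

def parse_text_points(txt: str) -> np.ndarray:
    # hypothetical parser: tolerate commas or spaces, skip blank lines
    rows = [
        [float(tok) for tok in line.replace(",", " ").split()]
        for line in txt.strip().splitlines()
        if line.strip()
    ]
    return np.array(rows, dtype=float)
```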
```diff
@@ -128,28 +132,30 @@ with st.sidebar:
     run = st.button("Run & Visualize")
 
 # ── Main processing ─────────────────────────────────────────────────────────
-def process_dataset(name: str, pts: np.ndarray):
+def process_dataset(name: str, pts: np.ndarray) -> Tuple[pd.DataFrame, dict]:
     if pts.ndim != 2 or pts.shape[0] < 2:
-        st.error(f"Dataset **{name}** needs at least two points
+        st.error(f"Dataset **{name}** needs at least two points (rows).")
         return None, None
 
-    #
-    if algo == "t‑SNE":
+    # 1. Reduce dimensionality
+    if algo == "t-SNE":
         emb, kl = run_tsne(pts, perp, seed)
     elif algo == "PCA":
         emb, kl = run_pca(pts)
     else:
         emb, kl = run_umap(pts, neighbors, min_dist, seed)
 
-    # Trustworthiness
+    # 2. Trustworthiness
     n_samples = pts.shape[0]
     k_max = (n_samples - 1) // 2
     tw = trustworthiness(pts, emb, n_neighbors=k_max) if k_max >= 1 else None
 
-    # DataFrame
-
+    # 3. Build DataFrame in requested column order
+    df_emb = pd.DataFrame(emb, columns=["x", "y"])
+    df_pts = pd.DataFrame(pts, columns=[f"p{i}" for i in range(pts.shape[1])])
+    df = pd.concat([df_emb, df_pts], axis=1)
 
-    # Clustering
+    # 4. Clustering (optional)
     if do_cluster:
         if cluster_algo == "KMeans":
             labels = KMeans(n_clusters=n_clusters, random_state=seed).fit_predict(emb)
```
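The `k_max = (n_samples - 1) // 2` bound exists because recent scikit-learn releases have `trustworthiness` reject `n_neighbors >= n_samples / 2`; `(n_samples - 1) // 2` is the largest integer below that threshold:

```python
# sanity check of the k_max bound used above
for n in (2, 3, 8, 9):
    k_max = (n - 1) // 2
    print(n, k_max, k_max < n / 2)
# 2 0 True   (k_max == 0, so trustworthiness is skipped entirely)
# 3 1 True
# 8 3 True
# 9 4 True
```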
```diff
@@ -157,65 +163,62 @@ def process_dataset(name: str, pts: np.ndarray):
             labels = DBSCAN(eps=eps).fit_predict(emb)
         df["cluster"] = labels.astype(str)
 
-    # Pair
-    dist_df = pd.DataFrame(
-    out_df = pd.concat([df, dist_df], axis=1)
-
-results: List[Tuple[str, pd.DataFrame]] = []
-
-for name, pts in datasets:
-    st.subheader(f"Dataset: {name}")
-    out_df, stats = process_dataset(name, pts)
-    if out_df is None:
-        continue
-
-    # Scatter plot
-    color_arg = "cluster" if ("cluster" in out_df.columns) else None
-    fig = px.scatter(out_df, x="x", y="y", color=color_arg,
-                     title=f"{algo} embedding ({name})",
-                     width=700, height=500)
-    fig.update_traces(marker=dict(size=8))
-    fig.update_layout(margin=dict(l=20, r=20, t=40, b=20))
-    st.plotly_chart(fig, use_container_width=True)
-
-    # Stats
-    if stats["tw"] is not None:
-        st.markdown(f"**Trustworthiness (k={stats['k_max']}):** {stats['tw']:.3f}")
-    else:
-        st.markdown("**Trustworthiness:** Not enough samples to compute (need ≥ 3 points).")
-    if stats["kl"] is not None:
-        st.markdown(f"**t‑SNE KL divergence:** {stats['kl']:.3f}")
-
-    # Distance matrix preview
-    with st.expander("Show pair‑wise distance matrix"):
-        st.dataframe(out_df.filter(like="dist_"))
-
-    # Download CSV for this dataset
-    csv_bytes = out_df.to_csv(index=False).encode("utf‑8")
-    st.download_button(
-        f"Download embedding + distances ({name})",
-        data=csv_bytes,
-        file_name=f"{name}_embedding_with_distances.csv",
-        mime="text/csv"
-    )
+    # 5. Pair-wise distances (in embedding space)
+    dists = cdist(emb, emb, metric="euclidean")
+    dist_df = pd.DataFrame(dists, columns=[f"dist_{i}" for i in range(n_samples)])
+    df = pd.concat([df, dist_df], axis=1)
+
+    # 6. Distinct-distance count per point
+    df["distinct_count"] = [distinct_count(row) for row in dists]
+
+    return df, {"kl": kl, "tw": tw, "k_max": k_max}
+
+if run:
+    zip_buffer = io.BytesIO()
+    with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zf:
+        for name, pts in datasets:
+            st.subheader(f"Dataset: {name}")
+            out_df, stats = process_dataset(name, pts)
+            if out_df is None:
+                continue
+
+            # Plot
+            color_col = "cluster" if "cluster" in out_df.columns else None
+            fig = px.scatter(out_df, x="x", y="y", color=color_col,
+                             title=f"{algo} embedding ({name})",
+                             width=700, height=500)
+            fig.update_traces(marker=dict(size=8))
+            fig.update_layout(margin=dict(l=20, r=20, t=40, b=20))
+            st.plotly_chart(fig, use_container_width=True)
+
+            # Stats
+            if stats["tw"] is not None:
+                st.markdown(f"**Trustworthiness (k={stats['k_max']}):** {stats['tw']:.3f}")
+            else:
+                st.markdown("**Trustworthiness:** Not enough samples to compute.")
+            if stats["kl"] is not None:
+                st.markdown(f"**t-SNE KL divergence:** {stats['kl']:.3f}")
+
+            # Data preview
+            with st.expander("Preview first 10 rows"):
+                st.dataframe(out_df.head(10))
+
+            # Individual CSV download
+            csv_bytes = out_df.to_csv(index=False).encode("utf-8")
+            st.download_button(
+                f"Download CSV ({name})",
+                data=csv_bytes,
+                file_name=f"{name}_embedding_with_distances.csv",
+                mime="text/csv"
+            )
+
+            # Add to ZIP
+            zf.writestr(f"{name}_embedding_with_distances.csv", csv_bytes)
+
+    # ZIP download (always available once run)
+    st.download_button(
+        "📦 Download ALL results as ZIP",
+        data=zip_buffer.getvalue(),
+        file_name="all_embeddings_with_distances.zip",
+        mime="application/zip"
+    )
```
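One detail of the ZIP flow worth noting: the final `st.download_button` sits outside the `with zipfile.ZipFile(...)` block. Closing the `ZipFile` is what writes the archive's central directory, so `zip_buffer.getvalue()` only yields a valid ZIP once the block has exited. A stripped-down illustration of the same pattern:

```python
import io
import zipfile

buf = io.BytesIO()
with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zf:
    zf.writestr("example.csv", "x,y\n0,1\n")
data = buf.getvalue()  # complete archive bytes, safe to hand to st.download_button
```

Incidentally, the old code's `encode("utf‑8")` used a Unicode non-breaking hyphen in the codec name, which would raise `LookupError: unknown encoding` at runtime; the switch to plain ASCII `"utf-8"` in this commit fixes that.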