import io
import itertools
import textwrap
import zipfile
from typing import List, Tuple

import numpy as np
import pandas as pd
import plotly.express as px
import streamlit as st
from scipy.spatial.distance import cdist
from sklearn.cluster import DBSCAN, KMeans
from sklearn.datasets import make_swiss_roll
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE, trustworthiness
import umap.umap_ as umap
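

# ---------------------------------------------------------------------------
# Example shape generators
# ---------------------------------------------------------------------------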
def generate_hypercube(n=4):
    """Return the 2**n vertices of the unit n-cube."""
    return np.array(list(itertools.product([0, 1], repeat=n)), dtype=float)


def generate_simplex(n=3):
    """Return the n+1 vertices of the standard n-simplex in R^n."""
    eye = np.eye(n, dtype=float)
    origin = np.zeros((1, n), dtype=float)
    return np.vstack([eye, origin])


def generate_swiss_roll(n_samples=500, noise=0.05):
    """Sample a noisy Swiss-roll manifold via scikit-learn."""
    X, _ = make_swiss_roll(n_samples=n_samples, noise=noise)
    return X
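

# Static shapes are stored as ready arrays; the Swiss roll is stored as a
# callable so a fresh random sample is drawn on every run (see the
# "Example shape" branch in the sidebar).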
EXAMPLE_SHAPES = {
    "Cube (3-D, 8 pts)": np.array([
        [0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 1, 1],
        [1, 0, 0], [1, 0, 1], [1, 1, 0], [1, 1, 1],
    ], dtype=float),
    "Square pyramid (3-D, 5 pts)": np.array([
        [-1, -1, 0], [1, -1, 0], [1, 1, 0], [-1, 1, 0], [0, 0, 1],
    ], dtype=float),
    "4-D hypercube (16 pts)": generate_hypercube(4),
    "3-simplex (4 pts in 3-D)": generate_simplex(3),
    "Swiss roll (500 pts, 3-D)": generate_swiss_roll,
}


def parse_text_points(text: str) -> np.ndarray:
    """Parse one point per line; coordinates may be comma- or space-separated."""
    txt = textwrap.dedent(text.strip())
    rows = [r for r in txt.splitlines() if r.strip()]
    data = [list(map(float, r.replace(",", " ").split())) for r in rows]
    return np.array(data, dtype=float)


def run_tsne(data, perp, seed):
    # t-SNE requires perplexity < n_samples, so clamp it here; otherwise the
    # default slider value (30) raises a ValueError on the tiny example shapes.
    perp = min(perp, data.shape[0] - 1)
    ts = TSNE(n_components=2, perplexity=perp, random_state=seed, init="pca")
    emb = ts.fit_transform(data)
    return emb, ts.kl_divergence_


def run_pca(data):
    # PCA has no KL divergence; return None to keep a uniform signature.
    pca = PCA(n_components=2)
    return pca.fit_transform(data), None


def run_umap(data, n_neighbors, min_dist, seed):
    um = umap.UMAP(n_components=2, n_neighbors=n_neighbors,
                   min_dist=min_dist, random_state=seed)
    return um.fit_transform(data), None


def distinct_count(dist_row: np.ndarray, tol: float = 1e-3) -> int:
    """Count unique non-zero distances in a row after rounding to 3 decimals."""
    nz = dist_row[dist_row > tol]
    rounded = (nz * 1000).round().astype(int)
    return len(np.unique(rounded))
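

# ---------------------------------------------------------------------------
# Streamlit UI
# ---------------------------------------------------------------------------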
st.set_page_config(layout="wide")
st.title("Dimensionality Reduction Explorer")

st.write("""
Upload **one or many** CSV/TXT files *or* use an example shape, pick an
algorithm, optionally cluster, and explore the 2-D embedding.
Every output CSV contains the embedding, the original point coordinates,
all pairwise distances, **and** the number of distinct distances per point.
""")


with st.sidebar:
    st.header("1️⃣ Data Input")
    mode = st.radio("Source", ["Example shape", "Upload CSV/TXT", "Paste text"])

    datasets: List[Tuple[str, np.ndarray]] = []

    if mode == "Example shape":
        key = st.selectbox("Choose example", list(EXAMPLE_SHAPES.keys()))
        src = EXAMPLE_SHAPES[key]
        # The Swiss-roll entry is a callable; every other entry is an array.
        data_raw = src() if callable(src) else src
        datasets.append((key.replace(" ", "_"), data_raw))

    elif mode == "Upload CSV/TXT":
        uploads = st.file_uploader(
            "Upload file(s)", type=["csv", "txt"], accept_multiple_files=True
        )
        if not uploads:
            st.stop()  # halt this script run until files are provided
        for up in uploads:
            txt = io.StringIO(up.getvalue().decode("utf-8")).read()
            pts = parse_text_points(txt)
            datasets.append((up.name.rsplit(".", 1)[0], pts))

    else:
        placeholder = "e.g.\n0,0,0\n0,0,1\n0,1,0\n..."
        txt = st.text_area("Paste coordinates", height=200, placeholder=placeholder)
        if not txt.strip():
            st.stop()
        datasets.append(("pasted_points", parse_text_points(txt)))

    st.header("2️⃣ Algorithm & Params")
    algo = st.selectbox("Method", ["t-SNE", "PCA", "UMAP"])
    seed = st.number_input("Random seed", value=42, step=1)

    if algo == "t-SNE":
        perp = st.slider("Perplexity", 1.0, 50.0, 30.0, 1.0)
    elif algo == "UMAP":
        neighbors = st.slider("n_neighbors", 5, 200, 15, 5)
        min_dist = st.slider("min_dist", 0.0, 0.99, 0.1, 0.01)

    st.header("3️⃣ Clustering (optional)")
    do_cluster = st.checkbox("Cluster embedding")
    if do_cluster:
        cluster_algo = st.selectbox("Algorithm", ["KMeans", "DBSCAN"])
        if cluster_algo == "KMeans":
            n_clusters = st.slider("n_clusters", 2, 10, 3, 1)
        else:
            eps = st.slider("DBSCAN eps", 0.1, 5.0, 0.5, 0.1)

    st.markdown("---")
    run = st.button("Run & Visualize")
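

# ---------------------------------------------------------------------------
# Per-dataset pipeline: embed, score, cluster, and attach pairwise distances.
# ---------------------------------------------------------------------------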
def process_dataset(name: str, pts: np.ndarray) -> Tuple[pd.DataFrame, dict]:
    if pts.ndim != 2 or pts.shape[0] < 2:
        st.error(f"Dataset **{name}** needs at least two points (rows).")
        return None, None

    # Embed into 2-D with the selected algorithm.
    if algo == "t-SNE":
        emb, kl = run_tsne(pts, perp, seed)
    elif algo == "PCA":
        emb, kl = run_pca(pts)
    else:
        emb, kl = run_umap(pts, neighbors, min_dist, seed)

    # scikit-learn's trustworthiness requires n_neighbors < n_samples / 2,
    # so use the largest admissible neighbourhood size.
    n_samples = pts.shape[0]
    k_max = (n_samples - 1) // 2
    tw = trustworthiness(pts, emb, n_neighbors=k_max) if k_max >= 1 else None

    df_emb = pd.DataFrame(emb, columns=["x", "y"])
    df_pts = pd.DataFrame(pts, columns=[f"p{i}" for i in range(pts.shape[1])])
    df = pd.concat([df_emb, df_pts], axis=1)

    # Optional clustering runs on the 2-D embedding, not on the raw points.
    if do_cluster:
        if cluster_algo == "KMeans":
            labels = KMeans(n_clusters=n_clusters, random_state=seed).fit_predict(emb)
        else:
            labels = DBSCAN(eps=eps).fit_predict(emb)
        df["cluster"] = labels.astype(str)  # string labels give discrete plot colors

    # Full pairwise distance matrix in the embedding, one column per point.
    dists = cdist(emb, emb, metric="euclidean")
    dist_df = pd.DataFrame(dists, columns=[f"dist_{i}" for i in range(n_samples)])
    df = pd.concat([df, dist_df], axis=1)

    df["distinct_count"] = [distinct_count(row) for row in dists]

    return df, {"kl": kl, "tw": tw, "k_max": k_max}
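

# ---------------------------------------------------------------------------
# Main run loop: one embedding per dataset, individual CSVs plus a ZIP bundle.
# ---------------------------------------------------------------------------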
if run:
    zip_buffer = io.BytesIO()
    with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zf:
        for name, pts in datasets:
            st.subheader(f"Dataset: {name}")
            out_df, stats = process_dataset(name, pts)
            if out_df is None:
                continue

            color_col = "cluster" if "cluster" in out_df.columns else None
            fig = px.scatter(out_df, x="x", y="y", color=color_col,
                             title=f"{algo} embedding ({name})",
                             width=700, height=500)
            fig.update_traces(marker=dict(size=8))
            fig.update_layout(margin=dict(l=20, r=20, t=40, b=20))
            st.plotly_chart(fig, use_container_width=True)

            if stats["tw"] is not None:
                st.markdown(f"**Trustworthiness (k={stats['k_max']}):** {stats['tw']:.3f}")
            else:
                st.markdown("**Trustworthiness:** Not enough samples to compute.")
            if stats["kl"] is not None:
                st.markdown(f"**t-SNE KL divergence:** {stats['kl']:.3f}")

            with st.expander("Preview first 10 rows"):
                st.dataframe(out_df.head(10))

            csv_bytes = out_df.to_csv(index=False).encode("utf-8")
            st.download_button(
                f"Download CSV ({name})",
                data=csv_bytes,
                file_name=f"{name}_embedding_with_distances.csv",
                mime="text/csv",
            )

            zf.writestr(f"{name}_embedding_with_distances.csv", csv_bytes)

    # Read the buffer only after the ZipFile context has closed; otherwise
    # the archive lacks its central directory and cannot be opened.
    st.download_button(
        "📦 Download ALL results as ZIP",
        data=zip_buffer.getvalue(),
        file_name="all_embeddings_with_distances.zip",
        mime="application/zip",
    )