|
import io |
|
import textwrap |
|
import itertools |
|
import zipfile |
|
from typing import List, Tuple |
|
|
|
import numpy as np |
|
import pandas as pd |
|
import streamlit as st |
|
from sklearn.manifold import TSNE, trustworthiness |
|
from sklearn.decomposition import PCA |
|
from sklearn.cluster import KMeans, DBSCAN |
|
import umap.umap_ as umap |
|
import plotly.express as px |
|
from scipy.spatial.distance import cdist |
|
from sklearn.datasets import make_swiss_roll |
|
|
|
|
|
def generate_hypercube(n=4):
    """Return the corner points of the unit hypercube in ``n`` dimensions.

    Every combination of 0/1 coordinates is produced in lexicographic
    order, giving a float array of shape ``(2**n, n)``.
    """
    corners = [vertex for vertex in itertools.product((0, 1), repeat=n)]
    return np.array(corners, dtype=float)
|
|
|
def generate_simplex(n=3):
    """Return ``n + 1`` simplex vertices in ``n`` dimensions.

    The first ``n`` rows are the unit vectors along each axis and the
    final row is the origin, so the result has shape ``(n + 1, n)``.
    """
    vertices = np.zeros((n + 1, n), dtype=float)
    axis = np.arange(n)
    # Row i gets a 1 in column i; the last row stays at the origin.
    vertices[axis, axis] = 1.0
    return vertices
|
|
|
def generate_swiss_roll(n_samples=500, noise=0.05, random_state=0):
    """Sample points from a 3-D Swiss roll.

    Args:
        n_samples: Number of points to draw.
        noise: Standard deviation of the Gaussian noise added to the roll.
        random_state: Seed forwarded to ``make_swiss_roll``. It defaults to
            a fixed value because the app script is re-executed on every
            interaction; an unseeded roll would hand the embedding a
            different point cloud each time. Pass ``None`` for a fresh
            random draw.

    Returns:
        The sampled coordinates; the second value returned by
        ``make_swiss_roll`` is discarded.
    """
    X, _ = make_swiss_roll(
        n_samples=n_samples, noise=noise, random_state=random_state
    )
    return X
|
|
|
# Built-in demo datasets offered in the "Example shape" picker.
# Values are either ready-made (n_pts, n_dims) float arrays or a zero-argument
# callable that builds the array on demand (used for the larger Swiss roll).
EXAMPLE_SHAPES = {
    # Same eight corners, in the same order, as the hand-written literal.
    "Cube (3-D, 8 pts)": generate_hypercube(3),
    "Square pyramid (3-D, 5 pts)": np.array([
        [-1, -1, 0], [1, -1, 0], [1, 1, 0], [-1, 1, 0], [0, 0, 1],
    ], dtype=float),
    "4-D hypercube (16 pts)": generate_hypercube(4),
    "3-simplex (4 pts in 3-D)": generate_simplex(3),
    "Swiss roll (500 pts, 3-D)": generate_swiss_roll,
}
|
|
|
|
|
def parse_text_points(text: str) -> np.ndarray:
    """Parse whitespace- or comma-separated coordinates into a float array.

    Each non-blank line is one point; values on a line may be separated by
    commas, spaces, tabs, or any mix of them. Blank lines are ignored.

    Args:
        text: Raw text, e.g. the contents of an uploaded CSV/TXT file.

    Returns:
        A float array of shape ``(n_points, n_dims)``. Input without any
        data lines yields an empty 1-D array.

    Raises:
        ValueError: If a line contains a non-numeric token, or if the lines
            do not all have the same number of values. The message names
            the offending 1-based line number.
    """
    # A byte-order mark survives a plain UTF-8 decode and is not treated as
    # whitespace, so it would otherwise make float() fail on the first value.
    text = text.lstrip("\ufeff")

    rows = []
    expected_width = None
    for line_no, line in enumerate(text.splitlines(), start=1):
        stripped = line.strip()
        if not stripped:
            continue
        tokens = stripped.replace(",", " ").split()
        try:
            values = [float(tok) for tok in tokens]
        except ValueError as err:
            raise ValueError(
                f"line {line_no}: cannot parse {stripped!r} as numbers"
            ) from err
        if expected_width is None:
            expected_width = len(values)
        elif len(values) != expected_width:
            # Report ragged input here instead of letting numpy fail with
            # an opaque "inhomogeneous shape" error.
            raise ValueError(
                f"line {line_no}: expected {expected_width} values "
                f"but found {len(values)}"
            )
        rows.append(values)

    return np.array(rows, dtype=float)
|
|
|
def run_tsne(data, perp, seed):
    """Project ``data`` to 2-D with t-SNE.

    Args:
        data: Array-like of shape ``(n_samples, n_features)``.
        perp: Requested perplexity. Lowered to ``n_samples - 1`` when it is
            not smaller than the number of samples.
        seed: Value passed as ``random_state``.

    Returns:
        ``(embedding, kl_divergence)`` where ``embedding`` is the fitted
        2-D projection and ``kl_divergence`` is the estimator's
        ``kl_divergence_`` attribute after fitting.
    """
    n_samples = np.shape(data)[0]
    # scikit-learn refuses perplexity >= n_samples, which the default of 30
    # trips on every small example shape; cap it just below the sample count.
    upper = max(n_samples - 1, 1)
    effective_perp = perp if perp < upper else upper

    ts = TSNE(
        n_components=2,
        perplexity=effective_perp,
        random_state=seed,
        init="pca",
    )
    emb = ts.fit_transform(data)
    return emb, ts.kl_divergence_
|
|
|
def run_pca(data):
    """Project ``data`` onto its first two principal components.

    Returns ``(embedding, None)``; the second slot mirrors the shape of the
    other ``run_*`` helpers, and PCA has no quality metric to put there.
    """
    projection = PCA(n_components=2).fit_transform(data)
    return projection, None
|
|
|
def run_umap(data, n_neighbors, min_dist, seed):
    """Project ``data`` to 2-D with UMAP.

    Returns ``(embedding, None)``; the second slot mirrors the shape of the
    other ``run_*`` helpers, and no quality metric is reported for UMAP.
    """
    settings = {
        "n_components": 2,
        "n_neighbors": n_neighbors,
        "min_dist": min_dist,
        "random_state": seed,
    }
    reducer = umap.UMAP(**settings)
    embedding = reducer.fit_transform(data)
    return embedding, None
|
|
|
|
|
# Page chrome: wide layout, the app title, and a short usage blurb.
st.set_page_config(layout="wide")
st.title("Dimensionality Reduction Explorer")

st.write("""
Upload **one or many** CSV/TXT files *or* use the other sources, pick an algorithm,
(optionally cluster), and explore the 2-D embedding. Each result is downloadable
with a full pair-wise distance table.
""")
|
|
|
|
|
# Sidebar: collects the input datasets and every algorithm / clustering
# setting. The names assigned here (datasets, algo, seed, perp, neighbors,
# min_dist, do_cluster, cluster_algo, n_clusters, eps, run) are module-level
# and are read by the processing code further down.
with st.sidebar:
    st.header("1️⃣ Data Input")
    mode = st.radio("Source", ["Example shape", "Upload CSV/TXT", "Paste text"])

    # (name, points) pairs; the name later becomes part of download file names.
    datasets: List[Tuple[str, np.ndarray]] = []

    if mode == "Example shape":
        key = st.selectbox("Choose example", list(EXAMPLE_SHAPES.keys()))
        src = EXAMPLE_SHAPES[key]
        # Some examples are stored as zero-argument builders, not arrays.
        data_raw = src() if callable(src) else src
        datasets.append((key.replace(" ", "_"), data_raw))

    elif mode == "Upload CSV/TXT":
        uploads = st.file_uploader(
            "Upload one **or many** files",
            type=["csv", "txt"],
            accept_multiple_files=True,
        )
        if not uploads:
            st.stop()
        for up in uploads:
            try:
                # "utf-8-sig" reads plain UTF-8 too, but drops a leading
                # byte-order mark that would break number parsing.
                txt = up.getvalue().decode("utf-8-sig")
                pts = parse_text_points(txt)
            except ValueError as err:
                # Covers UnicodeDecodeError (a ValueError subclass) as well
                # as parse failures; skip this file and keep the others.
                st.error(f"Could not read **{up.name}**: {err}")
                continue
            datasets.append((up.name.rsplit(".", 1)[0], pts))

    else:
        placeholder = "e.g.\n0,0,0\n0,0,1\n0,1,0\n..."
        txt = st.text_area("Paste coordinates", height=200, placeholder=placeholder)
        if not txt.strip():
            st.stop()
        try:
            data_raw = parse_text_points(txt)
        except ValueError as err:
            st.error(f"Could not parse the pasted text: {err}")
            st.stop()
        datasets.append(("pasted_points", data_raw))

    st.header("2️⃣ Algorithm & Params")
    # NOTE: the option strings are compared verbatim by the processing code,
    # so the first entry is intentionally left byte-for-byte as it was.
    algo = st.selectbox("Method", ["tβSNE", "PCA", "UMAP"])
    # The seed is forwarded as random_state; negative values are rejected
    # downstream, so disallow them at the widget.
    seed = st.number_input("Random seed", min_value=0, value=42, step=1)

    if algo == "tβSNE":
        perp = st.slider("Perplexity", 1.0, 50.0, 30.0, 1.0)
    elif algo == "UMAP":
        neighbors = st.slider("n_neighbors", 5, 200, 15, 5)
        min_dist = st.slider("min_dist", 0.0, 0.99, 0.1, 0.01)

    st.header("3️⃣ Clustering (optional)")
    do_cluster = st.checkbox("Cluster embedding")
    if do_cluster:
        cluster_algo = st.selectbox("Algorithm", ["KMeans", "DBSCAN"])
        if cluster_algo == "KMeans":
            n_clusters = st.slider("n_clusters", 2, 10, 3, 1)
        else:
            eps = st.slider("DBSCAN eps", 0.1, 5.0, 0.5, 0.1)

    st.markdown("---")
    run = st.button("Run & Visualize")
|
|
|
|
|
def process_dataset(name: str, pts: np.ndarray):
    """Embed one dataset in 2-D and build its downloadable result table.

    The algorithm and clustering choices are read from the module-level
    sidebar variables (``algo``, ``seed``, ``perp`` or ``neighbors`` /
    ``min_dist``, ``do_cluster``, ``cluster_algo``, ``n_clusters`` or
    ``eps``).

    Args:
        name: Label used in user-facing messages.
        pts: Point cloud, expected as an ``(n_pts, n_dims)`` array.

    Returns:
        ``(out_df, stats)`` on success. ``out_df`` holds the ``x`` / ``y``
        embedding columns, an optional ``cluster`` column, and one
        ``dist_<i>`` column per point with pairwise Euclidean distances
        measured in the embedding. ``stats`` is a dict with keys ``kl``,
        ``tw`` and ``k_max``. Returns ``(None, None)`` after showing an
        error when the input cannot be processed.
    """
    if pts.ndim != 2 or pts.shape[0] < 2:
        st.error(f"Dataset **{name}** needs at least two points in an (n_pts × n_dims) array.")
        return None, None

    # PCA, and t-SNE with its PCA initialisation, cannot produce two
    # components from fewer than two input columns.
    if algo != "UMAP" and pts.shape[1] < 2:
        st.error(f"Dataset **{name}** needs at least two coordinate columns for this method.")
        return None, None

    n_samples = pts.shape[0]

    # NOTE: this literal must stay byte-identical to the option offered by
    # the "Method" select box, otherwise t-SNE would fall through to UMAP.
    if algo == "tβSNE":
        emb, kl = run_tsne(pts, perp, seed)
    elif algo == "PCA":
        emb, kl = run_pca(pts)
    else:
        emb, kl = run_umap(pts, neighbors, min_dist, seed)

    # Largest neighbourhood size that stays below n_samples / 2.
    k_max = (n_samples - 1) // 2
    tw = trustworthiness(pts, emb, n_neighbors=k_max) if k_max >= 1 else None

    df = pd.DataFrame(emb, columns=["x", "y"])

    if do_cluster:
        if cluster_algo == "KMeans":
            # KMeans cannot form more clusters than there are points, which
            # the slider range (up to 10) exceeds for the small examples.
            k = min(n_clusters, n_samples)
            if k < n_clusters:
                st.warning(
                    f"Dataset **{name}** has only {n_samples} points; "
                    f"using {k} clusters instead of {n_clusters}."
                )
            labels = KMeans(n_clusters=k, random_state=seed).fit_predict(emb)
        else:
            labels = DBSCAN(eps=eps).fit_predict(emb)
        # Stored as strings so the plot treats labels as categories.
        df["cluster"] = labels.astype(str)

    dist_matrix = cdist(emb, emb, metric="euclidean")
    dist_df = pd.DataFrame(
        dist_matrix,
        columns=[f"dist_{i}" for i in range(n_samples)],
    )
    out_df = pd.concat([df, dist_df], axis=1)

    return out_df, {"kl": kl, "tw": tw, "k_max": k_max}
|
|
|
# Main panel: runs once per click on the sidebar button, rendering a plot,
# quality metrics, the distance table and download links for each dataset.
if run:
    # (file stem, CSV payload) pairs collected for the combined ZIP download.
    results: List[Tuple[str, bytes]] = []
    used_stems = set()

    for idx, (name, pts) in enumerate(datasets):
        st.subheader(f"Dataset: {name}")
        out_df, stats = process_dataset(name, pts)
        if out_df is None:
            continue

        # Colour by cluster only when clustering actually added the column.
        color_arg = "cluster" if ("cluster" in out_df.columns) else None
        fig = px.scatter(
            out_df,
            x="x",
            y="y",
            color=color_arg,
            title=f"{algo} embedding ({name})",
            width=700,
            height=500,
        )
        fig.update_traces(marker=dict(size=8))
        fig.update_layout(margin=dict(l=20, r=20, t=40, b=20))
        st.plotly_chart(fig, use_container_width=True)

        if stats["tw"] is not None:
            st.markdown(f"**Trustworthiness (k={stats['k_max']}):** {stats['tw']:.3f}")
        else:
            st.markdown("**Trustworthiness:** Not enough samples to compute (need ≥ 3 points).")
        if stats["kl"] is not None:
            st.markdown(f"**t-SNE KL divergence:** {stats['kl']:.3f}")

        with st.expander("Show pair-wise distance matrix"):
            st.dataframe(out_df.filter(like="dist_"))

        csv_bytes = out_df.to_csv(index=False).encode("utf-8")

        # Two uploads can share a base name (e.g. data.csv and data.txt);
        # keep file names and ZIP members distinct in that case.
        file_stem = name if name not in used_stems else f"{name}_{idx}"
        used_stems.add(file_stem)

        st.download_button(
            f"Download embedding + distances ({name})",
            data=csv_bytes,
            file_name=f"{file_stem}_embedding_with_distances.csv",
            mime="text/csv",
            # Explicit key so same-named datasets do not collide as widgets.
            key=f"download_csv_{idx}",
        )

        results.append((file_stem, csv_bytes))

    # Offer a single archive only when there is more than one result.
    if len(results) >= 2:
        zip_buf = io.BytesIO()
        with zipfile.ZipFile(zip_buf, "w", zipfile.ZIP_DEFLATED) as zf:
            for nm, csv_b in results:
                zf.writestr(f"{nm}_embedding_with_distances.csv", csv_b)
        st.download_button(
            "📦 Download **all** results as ZIP",
            data=zip_buf.getvalue(),
            file_name="all_embeddings_with_distances.zip",
            mime="application/zip",
        )
|
|