computerscience-person's picture
Visualizations of clustering models.
2a796f1
import marimo
# Version string recorded in the file header (presumably the marimo release
# that generated this notebook file).
__generated_with = "0.11.17"
# Notebook application object; every `@app.cell` function below is attached to it.
app = marimo.App(width="medium")
@app.cell
def _(mo):
    # Notebook title, emitted as a level-1 Markdown heading.
    # The call is kept as the last expression before the bare `return`.
    mo.md(r"# Analyzing Colorectal Cancer Dataset")
    return
@app.cell
def _():
    # Libraries handed to the other cells through the return tuple:
    # marimo (as `mo`) and polars (as `pl`).
    import marimo as mo
    import polars as pl

    return (mo, pl)
@app.cell
def _(pl):
    # Read the raw CSV into a polars DataFrame. The path is relative, so it is
    # resolved against the process's current working directory.
    dataset = pl.read_csv("./dataset/colorectal_cancer_dataset.csv")
    return (dataset,)
@app.cell(hide_code=True)
def _(dataset, pl):
    from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

    # Columns whose values are replaced by integer codes. The leading
    # underscore keeps this helper out of the names the cell exports.
    _ordinal_columns = ['Early_Detection', 'Cancer_Stage', 'Survival_5_years']

    ord_encoder = OrdinalEncoder()
    # NOTE(review): no explicit `categories` are given, so the code assigned to
    # each value follows the encoder's default ordering -- confirm that this is
    # the intended order for 'Cancer_Stage'.
    ord_encoded = ord_encoder.fit_transform(dataset.select(_ordinal_columns))
    encoded_features = ord_encoder.get_feature_names_out(_ordinal_columns)

    # Every encoded column is stored as Int8.
    encoded_schema = dict.fromkeys(encoded_features, pl.Int8)

    # Build a frame holding only the encoded columns, then overwrite the
    # same-named columns of the original dataset with it.
    dataset_encoded_parts = pl.DataFrame(ord_encoded, schema=encoded_schema)
    dataset_encoded = dataset.with_columns(dataset_encoded_parts)

    return (
        OneHotEncoder,
        OrdinalEncoder,
        dataset_encoded,
        dataset_encoded_parts,
        encoded_features,
        encoded_schema,
        ord_encoded,
        ord_encoder,
    )
@app.cell
def _(dataset_encoded, mo):
    # Fit three classifiers that predict 'Survival_5_years' from tumor size,
    # early detection and cancer stage, then render accuracy, precision,
    # confusion matrix and classification report for each as Markdown.
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import BernoulliNB
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score, precision_score, classification_report, confusion_matrix

    X = dataset_encoded.select(['Tumor_Size_mm', 'Early_Detection', 'Cancer_Stage'])
    y = dataset_encoded.select(['Survival_5_years'])
    # 80/20 split with a fixed seed so the split itself is reproducible.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

    # `y_train` is a one-column frame; the estimators expect a 1-D target and
    # warn about a column vector otherwise. Convert once here. The leading
    # underscore keeps this helper out of the names the cell exports, so
    # `y`, `y_train` and `y_test` are still exported unchanged.
    _y_train_1d = y_train.to_series()

    logreg = LogisticRegression()
    y_pred_logreg = logreg.fit(X_train, _y_train_1d).predict(X_test)
    bnb = BernoulliNB()
    y_pred_bnb = bnb.fit(X_train, _y_train_1d).predict(X_test)
    # NOTE(review): the tree is created without a random_state, unlike the
    # split above -- its results may differ slightly between runs.
    dectree = DecisionTreeClassifier()
    y_pred_dectree = dectree.fit(X_train, _y_train_1d).predict(X_test)

    # The Markdown report is the cell's output, so it must remain the last
    # expression before `return`.
    mo.md(f"""
## Logistic Regression
Accuracy score: {accuracy_score(y_test, y_pred_logreg)}
Precision score: {precision_score(y_test, y_pred_logreg)}
Confusion matrix:
```
{confusion_matrix(y_test, y_pred_logreg)}
```
Classification report:
```
{classification_report(y_test, y_pred_logreg)}
```
## Bernoulli Naive Bayes
Accuracy score: {accuracy_score(y_test, y_pred_bnb)}
Precision score: {precision_score(y_test, y_pred_bnb)}
Confusion matrix:
```
{confusion_matrix(y_test, y_pred_bnb)}
```
Classification report:
```
{classification_report(y_test, y_pred_bnb)}
```
## Decision Tree Classifier
Accuracy score: {accuracy_score(y_test, y_pred_dectree)}
Precision score: {precision_score(y_test, y_pred_dectree)}
Confusion matrix:
```
{confusion_matrix(y_test, y_pred_dectree)}
```
Classification report:
```
{classification_report(y_test, y_pred_dectree)}
```
{mo.callout("Classifiers don't work well with this dataset, let's try something else.", kind='info')}
""")
    return (
        BernoulliNB,
        DecisionTreeClassifier,
        LogisticRegression,
        X,
        X_test,
        X_train,
        accuracy_score,
        bnb,
        classification_report,
        confusion_matrix,
        dectree,
        logreg,
        precision_score,
        train_test_split,
        y,
        y_pred_bnb,
        y_pred_dectree,
        y_pred_logreg,
        y_test,
        y_train,
    )
@app.cell
def _(OrdinalEncoder, dataset, mo, pl):
    # Everything lives in an inner function so none of the intermediate names
    # are exported from this cell; only the rendered output is produced.
    def _():
        """Cluster a sample of the dataset and return the report plus charts.

        Encodes 'Genetic_Mutation', samples 3000 rows, clusters on
        'Tumor_Size_mm' and 'Genetic_Mutation' with K-Means and spectral
        clustering (3 clusters each), and returns a vertical stack holding a
        Markdown metrics report and two side-by-side Altair charts.
        """
        from sklearn.cluster import KMeans, SpectralClustering
        from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, homogeneity_score, completeness_score, v_measure_score, silhouette_score, davies_bouldin_score, calinski_harabasz_score
        import altair as alt

        # Replace 'Genetic_Mutation' with integer codes stored as Int8.
        genmut_encoder = OrdinalEncoder()
        genmut_encoded = genmut_encoder.fit_transform(dataset.select('Genetic_Mutation'))
        genmut_features = genmut_encoder.get_feature_names_out(['Genetic_Mutation'])
        encoded_schema = {name: pl.Int8 for name in genmut_features}
        dataset_encoded_parts = pl.DataFrame(genmut_encoded, encoded_schema)
        dataset_encoded = dataset.with_columns(dataset_encoded_parts)

        # Work on a fixed-seed sample: the full dataset is too big to run locally.
        dataset_encoded = dataset_encoded.sample(3000, seed=11)
        X = dataset_encoded.select(['Tumor_Size_mm', 'Genetic_Mutation'])
        # 'Cancer_Stage' is only the reference labelling for the external metrics.
        y = dataset_encoded.select(['Cancer_Stage']).to_series()

        kmeans = KMeans(n_clusters=3, random_state=11)
        spec = SpectralClustering(n_clusters=3, random_state=11)
        labels_kmeans = kmeans.fit_predict(X)
        labels_spec = spec.fit_predict(X)

        # Attach the cluster labels as string columns for use as the chart colour field.
        df_kmeans = X.with_columns(pl.lit(labels_kmeans, dtype=pl.String).alias('kmeans_cluster'))
        df_spec = X.with_columns(pl.lit(labels_spec, dtype=pl.String).alias('spectral_cluster'))

        return mo.vstack([
            mo.md(f"""
## K-Means Clustering
### External Metrics (Based on Cancer Stage Labels)
Adjusted Rand Index (ARI): {adjusted_rand_score(y, labels_kmeans)}
Normalized Mutual Information (NMI): {normalized_mutual_info_score(y, labels_kmeans)}
Homogeneity: {homogeneity_score(y, labels_kmeans)}
Completeness: {completeness_score(y, labels_kmeans)}
V-measure: {v_measure_score(y, labels_kmeans)}
### Internal Metrics
Silhouette Score: {silhouette_score(X, labels_kmeans)}
Davies-Bouldin Index: {davies_bouldin_score(X, labels_kmeans)}
Calinski-Harabasz Index: {calinski_harabasz_score(X, labels_kmeans)}
## Spectral Clustering
### External Metrics (Based on Cancer Stage Labels)
Adjusted Rand Index (ARI): {adjusted_rand_score(y, labels_spec)}
Normalized Mutual Information (NMI): {normalized_mutual_info_score(y, labels_spec)}
Homogeneity: {homogeneity_score(y, labels_spec)}
Completeness: {completeness_score(y, labels_spec)}
V-measure: {v_measure_score(y, labels_spec)}
### Internal Metrics
Silhouette Score: {silhouette_score(X, labels_spec)}
Davies-Bouldin Index: {davies_bouldin_score(X, labels_spec)}
Calinski-Harabasz Index: {calinski_harabasz_score(X, labels_spec)}
{mo.callout("Unsupervised clustering techniques do perform reasonably well, but does not correlate to other labels.", kind='info')}
"""),
            mo.hstack([
                alt.Chart(df_kmeans, autosize='pad').mark_rect().encode(
                    alt.X('Genetic_Mutation:N'),
                    y='Tumor_Size_mm',
                    color='kmeans_cluster'
                ).properties(
                    width=325
                ).interactive(),
                alt.Chart(df_spec, autosize='pad').mark_rect().encode(
                    alt.X('Genetic_Mutation:N'),
                    y='Tumor_Size_mm',
                    color='spectral_cluster'
                ).properties(
                    width=325
                ).interactive(),
            ])
        ])

    # The call's result is the cell's output, so it must remain the last
    # expression before `return`.
    _()
    return
# Run the notebook app when this file is executed directly as a script.
if __name__ == "__main__":
    app.run()