import marimo

# Version string recorded in the notebook file header.
__generated_with = "0.11.17"

# Notebook application object; the cells below attach to it via `@app.cell`.
app = marimo.App(width="medium")
@app.cell
def _(mo):
    # Notebook title, written as a level-1 markdown heading.
    mo.md(r"""# Analyzing Colorectal Cancer Dataset""")
    return
@app.cell
def _():
    # Import the shared modules once and hand them back so that other cells
    # can receive them as the `mo` and `pl` parameters.
    import marimo as mo
    import polars as pl

    return (mo, pl)
@app.cell
def _(pl):
    # Read the raw CSV into a polars DataFrame; the path is relative, so it is
    # resolved against the process's current working directory.
    dataset = pl.read_csv('./dataset/colorectal_cancer_dataset.csv')

    return (dataset,)
@app.cell(hide_code=True)
def _(dataset, pl):
    from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

    # Categorical columns that are converted to ordinal codes. The leading
    # underscore keeps this helper name out of the cell's returned names.
    _ordinal_columns = ['Early_Detection', 'Cancer_Stage', 'Survival_5_years']

    ord_encoder = OrdinalEncoder()
    ord_encoded = ord_encoder.fit_transform(dataset.select(_ordinal_columns))
    encoded_features = ord_encoder.get_feature_names_out(_ordinal_columns)

    # Every encoded feature is stored as an Int8 column.
    encoded_schema = dict.fromkeys(encoded_features, pl.Int8)

    # Build a frame holding only the encoded columns, then merge it into a
    # copy of the dataset via `with_columns`.
    dataset_encoded_parts = pl.DataFrame(ord_encoded, encoded_schema)
    dataset_encoded = dataset.with_columns(dataset_encoded_parts)

    return (
        OneHotEncoder,
        OrdinalEncoder,
        dataset_encoded,
        dataset_encoded_parts,
        encoded_features,
        encoded_schema,
        ord_encoded,
        ord_encoder,
    )
@app.cell
def _(dataset_encoded, mo):
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import BernoulliNB
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score, precision_score, classification_report, confusion_matrix

    # Features and target are taken from the ordinal-encoded dataset; 20% of
    # the rows are held out for evaluation with a fixed seed.
    X = dataset_encoded.select(['Tumor_Size_mm', 'Early_Detection', 'Cancer_Stage'])
    y = dataset_encoded.select(['Survival_5_years'])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

    # `y_train` is a one-column frame. Fitting on a column-vector target makes
    # scikit-learn emit a DataConversionWarning per model, so the estimators
    # are fitted on the underlying 1-D series instead. The exported `y`,
    # `y_train` and `y_test` keep their original frame type.
    _y_train_1d = y_train.to_series()

    logreg = LogisticRegression()
    y_pred_logreg = logreg.fit(X_train, _y_train_1d).predict(X_test)

    bnb = BernoulliNB()
    y_pred_bnb = bnb.fit(X_train, _y_train_1d).predict(X_test)

    # Seeded with the same value as the split so the reported tree metrics
    # are reproducible between runs.
    dectree = DecisionTreeClassifier(random_state=101)
    y_pred_dectree = dectree.fit(X_train, _y_train_1d).predict(X_test)


    mo.md(f"""
    ## Logistic Regression

    Accuracy score: {accuracy_score(y_test, y_pred_logreg)}

    Precision score: {precision_score(y_test, y_pred_logreg)}

    Confusion matrix:
    ```
    {confusion_matrix(y_test, y_pred_logreg)}
    ```

    Classification report:
    ```
    {classification_report(y_test, y_pred_logreg)}
    ```

    ## Bernoulli Naive Bayes

    Accuracy score: {accuracy_score(y_test, y_pred_bnb)}

    Precision score: {precision_score(y_test, y_pred_bnb)}

    Confusion matrix:
    ```
    {confusion_matrix(y_test, y_pred_bnb)}
    ```

    Classification report:
    ```
    {classification_report(y_test, y_pred_bnb)}
    ```

    ## Decision Tree Classifier

    Accuracy score: {accuracy_score(y_test, y_pred_dectree)}

    Precision score: {precision_score(y_test, y_pred_dectree)}

    Confusion matrix:
    ```
    {confusion_matrix(y_test, y_pred_dectree)}
    ```

    Classification report:
    ```
    {classification_report(y_test, y_pred_dectree)}
    ```

    {mo.callout("Classifiers don't work well with this dataset, let's try something else.", kind='info')}
    """)
    return (
        BernoulliNB,
        DecisionTreeClassifier,
        LogisticRegression,
        X,
        X_test,
        X_train,
        accuracy_score,
        bnb,
        classification_report,
        confusion_matrix,
        dectree,
        logreg,
        precision_score,
        train_test_split,
        y,
        y_pred_bnb,
        y_pred_dectree,
        y_pred_logreg,
        y_test,
        y_train,
    )
@app.cell
def _(OrdinalEncoder, dataset, mo, pl):
    def _():
        # All work happens inside this local function, so none of the names
        # below are returned from the cell (the outer function returns nothing).
        # The unused `DBSCAN` and `SVC` imports were dropped.
        from sklearn.cluster import KMeans, SpectralClustering
        from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, homogeneity_score, completeness_score, v_measure_score, silhouette_score, davies_bouldin_score, calinski_harabasz_score
        import altair as alt

        # Ordinal-encode the `Genetic_Mutation` column and merge the encoded
        # column back into a copy of the dataset.
        genmut_encoder = OrdinalEncoder()
        genmut_encoded = genmut_encoder.fit_transform(dataset.select('Genetic_Mutation'))
        genmut_features = genmut_encoder.get_feature_names_out(['Genetic_Mutation'])
        encoded_schema = {name: pl.Int8 for name in genmut_features}
        dataset_encoded_parts = pl.DataFrame(genmut_encoded, encoded_schema)
        dataset_encoded = dataset.with_columns(dataset_encoded_parts)

        # Cluster a fixed-seed sample of 3000 rows rather than the full data.
        dataset_encoded = dataset_encoded.sample(3000, seed=11)

        X = dataset_encoded.select(['Tumor_Size_mm', 'Genetic_Mutation'])
        # `Cancer_Stage` is the reference labelling for the external metrics.
        y = dataset_encoded.select(['Cancer_Stage']).to_series()

        kmeans = KMeans(n_clusters=3, random_state=11)
        spec = SpectralClustering(n_clusters=3, random_state=11)

        labels_kmeans = kmeans.fit_predict(X)
        labels_spec = spec.fit_predict(X)


        # Attach the cluster labels as string columns for use as chart colours.
        df_kmeans = X.with_columns(pl.lit(labels_kmeans, dtype=pl.String).alias('kmeans_cluster'))
        df_spec = X.with_columns(pl.lit(labels_spec, dtype=pl.String).alias('spectral_cluster'))

        return mo.vstack([
            mo.md(f"""
            ## K-Means Clustering

            ### External Metrics (Based on Cancer Stage Labels)

            Adjusted Rand Index (ARI): {adjusted_rand_score(y, labels_kmeans)}

            Normalized Mutual Information (NMI): {normalized_mutual_info_score(y, labels_kmeans)}

            Homogeneity: {homogeneity_score(y, labels_kmeans)}

            Completeness: {completeness_score(y, labels_kmeans)}

            V-measure: {v_measure_score(y, labels_kmeans)}

            ### Internal Metrics

            Silhouette Score: {silhouette_score(X, labels_kmeans)}

            Davies-Bouldin Index: {davies_bouldin_score(X, labels_kmeans)}

            Calinski-Harabasz Index: {calinski_harabasz_score(X, labels_kmeans)}


            ## Spectral Clustering

            ### External Metrics (Based on Cancer Stage Labels)

            Adjusted Rand Index (ARI): {adjusted_rand_score(y, labels_spec)}

            Normalized Mutual Information (NMI): {normalized_mutual_info_score(y, labels_spec)}

            Homogeneity: {homogeneity_score(y, labels_spec)}

            Completeness: {completeness_score(y, labels_spec)}

            V-measure: {v_measure_score(y, labels_spec)}

            ### Internal Metrics

            Silhouette Score: {silhouette_score(X, labels_spec)}

            Davies-Bouldin Index: {davies_bouldin_score(X, labels_spec)}

            Calinski-Harabasz Index: {calinski_harabasz_score(X, labels_spec)}

            {mo.callout("Unsupervised clustering techniques do perform reasonably well, but does not correlate to other labels.", 'info')}
            """),

            mo.hstack([
                alt.Chart(df_kmeans, autosize='pad').mark_rect().encode(
                    alt.X('Genetic_Mutation:N'),
                    y='Tumor_Size_mm',
                    color='kmeans_cluster'
                ).properties(
                    width=325
                ).interactive(),

                alt.Chart(df_spec, autosize='pad').mark_rect().encode(
                    alt.X('Genetic_Mutation:N'),
                    y='Tumor_Size_mm',
                    color='spectral_cluster'
                ).properties(
                    width=325
                ).interactive(),
            ])
        ])


    # Evaluate the helper; the cell itself returns no names.
    _()
    return
# Run the notebook app when this file is executed directly as a script.
if __name__ == "__main__":
    app.run()