|
import marimo |
|
|
|
# Version string recorded by marimo when it wrote this notebook file.
__generated_with = "0.11.4"

# Application object that every @app.cell function in this file registers with.
app = marimo.App(width="medium")
|
|
|
|
|
@app.cell(hide_code=True)
def _(mo):
    # Notebook title, rendered as a level-1 markdown heading.
    mo.md(r"# Diabetes Dataset Analysis")
    return
|
|
|
|
|
@app.cell(hide_code=True)
def _():
    # Shared imports: returning the modules makes them available to the cells
    # that declare `mo` and/or `pl` as parameters.
    import polars as pl
    import marimo as mo

    return mo, pl
|
|
|
|
|
@app.cell(hide_code=True)
def _(mo):
    # Collapsible "Notes" panel documenting how the indicator columns of
    # interest are coded. The markdown is assembled from a list of lines so
    # its rendering does not depend on how this cell body is indented.
    _notes_md = "\n".join(
        [
            "## Dataset Column Notes",
            "",
            "> Only highlighted columns of interest",
            "",
            "* Diabetes_binary: [ 0 (No diabetes) | 1 (Pre/diabetes) ]",
            "* HighBP: [ 0 (No High BP) | 1 (High BP) ]",
            # HighChol describes cholesterol, not blood pressure.
            "* HighChol: [ 0 (No High Chol) | 1 (High Chol) ]",
            "* Stroke: [ 0 (Never) | 1 (Had) ]",
            "* HeartDiseaseorAttack: [ 0 (No) | 1 (Yes) ]",
            "* Smoker: [ 0 (<100 cigs lifetime) | 1 (>100 cigs lifetime) ]",
            "* HvyAlcohol: [ 0 (<14 🍺/week for men, <7 🍺/week for women) | 1 (otherwise) ]",
        ]
    )
    # Last expression of the cell: this is what the notebook displays.
    mo.accordion({"Notes": _notes_md})
    return
|
|
|
|
|
@app.cell(hide_code=True)
def _(pl):
    # Load the full CSV, then narrow it to the target column plus the four
    # prior-condition indicator columns.
    dataset_raw = pl.read_csv(
        "dataset/diabetes_binary_health_indicators_BRFSS2015.csv"
    )
    dataset_prior_conditions = dataset_raw.select(
        "Diabetes_binary",
        "HighBP",
        "HighChol",
        "Stroke",
        "HeartDiseaseorAttack",
    )
    # Preview of the first rows; as the last expression it is the cell output.
    dataset_prior_conditions.head()
    return dataset_prior_conditions, dataset_raw
|
|
|
|
|
@app.cell
def _(mo):
    # Section heading for the classifier part of the notebook.
    mo.md("## Naive Bayes' Classifier")
    return
|
|
|
|
|
@app.cell
def _(dataset_prior_conditions, mo, pl):
    # Train a Bernoulli Naive Bayes classifier on the prior-condition columns
    # and display accuracy, confusion matrix and classification report for a
    # held-out 25% test split.
    from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
    from sklearn.model_selection import train_test_split
    from sklearn.naive_bayes import BernoulliNB

    # Features are every column except the target. The target is kept as a
    # one-column DataFrame because a later cell concatenates the test split
    # horizontally with the features.
    X_priors_NB = dataset_prior_conditions.select(pl.exclude("Diabetes_binary"))
    y_priors_NB = dataset_prior_conditions.select("Diabetes_binary")
    X_train_priors, X_test_priors, y_train_priors, y_test_priors = train_test_split(
        X_priors_NB, y_priors_NB, random_state=33, test_size=0.25
    )

    bnb = BernoulliNB()
    # scikit-learn expects a 1-D target; passing the (n, 1) frame directly
    # triggers a DataConversionWarning, so flatten it for the fit only.
    bnb.fit(X_train_priors, y_train_priors.to_numpy().ravel())
    y_pred_priors = bnb.predict(X_test_priors)

    # Build the report from flush-left lines: the interpolated matrix/report
    # are multi-line strings with their own leading whitespace, which would
    # otherwise interfere with common-indent stripping of the markdown text.
    _report_md = "\n".join(
        [
            f"Accuracy : {accuracy_score(y_test_priors, y_pred_priors)}",
            "",
            "Confusion Matrix:",
            "",
            "```",
            f"{confusion_matrix(y_test_priors, y_pred_priors)}",
            "```",
            "",
            "Classification Report:",
            "",
            "```",
            f"{classification_report(y_test_priors, y_pred_priors)}",
            "```",
        ]
    )
    # Last expression of the cell: this is what the notebook displays.
    mo.md(_report_md)
    return (
        BernoulliNB,
        X_priors_NB,
        X_test_priors,
        X_train_priors,
        accuracy_score,
        bnb,
        classification_report,
        confusion_matrix,
        train_test_split,
        y_pred_priors,
        y_priors_NB,
        y_test_priors,
        y_train_priors,
    )
|
|
|
|
|
@app.cell
def _(X_test_priors, pl, y_pred_priors, y_test_priors):
    # Combine the test features, true labels and predictions, then derive two
    # compact encodings of that table.
    import altair as alt

    alt.data_transformers.enable("vegafusion")

    # Cell-local column groups (underscore-prefixed so they stay private).
    _conditions = ("HighBP", "HighChol", "Stroke", "HeartDiseaseorAttack")
    _outcomes = ("Diabetes_binary", "Predicted Diabetes_binary")

    # Test features | true label | predicted label, side by side.
    _predicted = pl.DataFrame({"Predicted Diabetes_binary": y_pred_priors})
    dataset_result_priors = pl.concat(
        [X_test_priors, y_test_priors, _predicted], how="horizontal"
    )

    # Encoding 1: weight HighBP/HighChol/Stroke by 8/4/2 (HeartDiseaseorAttack
    # keeps weight 1) and add the four columns row-wise. Assuming the flags are
    # 0/1, this gives one code per combination of conditions.
    _weighted_conditions = dataset_result_priors.with_columns(
        pl.col("HighBP") * 8,
        pl.col("HighChol") * 4,
        pl.col("Stroke") * 2,
    )
    dataset_result_priors1 = _weighted_conditions.select(
        pl.sum_horizontal(pl.col(*_conditions)),
        pl.col(*_outcomes),
    )

    # Encoding 2: keep the condition columns as-is and fold the outcome pair
    # into a single column as 2 * actual + predicted.
    _weighted_outcome = dataset_result_priors.with_columns(
        pl.col("Diabetes_binary") * 2
    )
    dataset_result_priors2 = _weighted_outcome.select(
        pl.col(*_conditions),
        pl.sum_horizontal(pl.col(*_outcomes)),
    )

    # Preview; as the last expression it is the cell output.
    dataset_result_priors2.head(10)
    return (
        alt,
        dataset_result_priors,
        dataset_result_priors1,
        dataset_result_priors2,
    )
|
|
|
|
|
@app.cell
def _():
    # Empty cell: no code, no output, nothing exported.
    return
|
|
|
|
|
if __name__ == "__main__":
    # Execute the notebook's cells when the file is run directly as a script.
    app.run()
|
|