File size: 4,023 Bytes
9d94c83 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 |
import marimo
# Version string stored alongside the notebook (presumably the marimo release
# that last wrote this file — confirm against marimo's file-format docs).
__generated_with = "0.11.4"
# Notebook application object; every cell below registers itself on it through
# the @app.cell decorator, and the __main__ guard at the bottom runs it.
app = marimo.App(width="medium")
@app.cell(hide_code=True)
def _(mo):
    # Notebook title, rendered as a level-1 Markdown heading.
    mo.md(r"# Diabetes Dataset Analysis")
    return
@app.cell(hide_code=True)
def _():
    # Shared imports. Returning them makes `mo` and `pl` available to the
    # cells that list them as parameters.
    import marimo as mo
    import polars as pl

    return (mo, pl)
@app.cell(hide_code=True)
def _(mo):
    # Collapsible legend explaining how the binary indicator columns are coded.
    # The Markdown text is kept flush-left so no line can be mistaken for an
    # indented code block, and block elements (heading, quote, list) are
    # separated by blank lines so they are not merged into one paragraph.
    mo.accordion(
        {
            "Notes": """
## Dataset Column Notes

> Only highlighted columns of interest

* Diabetes_binary: [ 0 (No diabetes) | 1 (Pre/diabetes) ]
* HighBP: [ 0 (No High BP) | 1 (High BP) ]
* HighChol: [ 0 (No High Chol) | 1 (High Chol) ]
* Stroke: [ 0 (Never) | 1 (Had) ]
* HeartDiseaseorAttack: [ 0 (No) | 1 (Yes) ]
* Smoker: [ 0 (<100 cigs lifetime) | 1 (>100 cigs lifetime) ]
* HvyAlcohol: [ 0 (<14 🍺/week for men, <7 🍺/week for women) | 1 (otherwise) ]
"""
        }
    )
    return
@app.cell(hide_code=True)
def _(pl):
    # Load the BRFSS 2015 health-indicator CSV, then keep only the diabetes
    # label and the four prior-condition columns used by the classifier cell.
    dataset_raw = pl.read_csv("dataset/diabetes_binary_health_indicators_BRFSS2015.csv")
    dataset_prior_conditions = dataset_raw.select(
        "Diabetes_binary",
        "HighBP",
        "HighChol",
        "Stroke",
        "HeartDiseaseorAttack",
    )
    # Preview of the first rows of the reduced frame.
    dataset_prior_conditions.head()
    return dataset_prior_conditions, dataset_raw
@app.cell
def _(mo):
    # Section heading for the classifier part of the notebook.
    mo.md("## Naive Bayes' Classifier")
    return
@app.cell
def _(dataset_prior_conditions, mo, pl):
    # Train a Bernoulli Naive Bayes model that predicts Diabetes_binary from
    # the remaining prior-condition columns, then report accuracy, the
    # confusion matrix and the per-class classification report on a 25 %
    # hold-out split (fixed random_state for repeatable results).
    from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
    from sklearn.model_selection import train_test_split
    from sklearn.naive_bayes import BernoulliNB

    X_priors_NB = dataset_prior_conditions.select(pl.exclude("Diabetes_binary"))
    # Kept as a one-column frame (not a Series) because the following cell
    # concatenates y_test_priors horizontally with other frames.
    y_priors_NB = dataset_prior_conditions.select("Diabetes_binary")

    X_train_priors, X_test_priors, y_train_priors, y_test_priors = train_test_split(
        X_priors_NB, y_priors_NB, random_state=33, test_size=0.25
    )

    bnb = BernoulliNB()
    # scikit-learn expects a 1-D target; passing the (n, 1) frame directly
    # triggers a DataConversionWarning, so flatten it only at the fit call.
    bnb.fit(X_train_priors, y_train_priors.to_numpy().ravel())
    y_pred_priors = bnb.predict(X_test_priors)

    # Blank lines keep each label as its own paragraph instead of letting
    # Markdown merge adjacent lines.
    mo.md(f"""
Accuracy : {accuracy_score(y_test_priors, y_pred_priors)}

Confusion Matrix:

```
{confusion_matrix(y_test_priors, y_pred_priors)}
```

Classification Report:

```
{classification_report(y_test_priors, y_pred_priors)}
```
""")
    return (
        BernoulliNB,
        X_priors_NB,
        X_test_priors,
        X_train_priors,
        accuracy_score,
        bnb,
        classification_report,
        confusion_matrix,
        train_test_split,
        y_pred_priors,
        y_priors_NB,
        y_test_priors,
        y_train_priors,
    )
@app.cell
def _(X_test_priors, pl, y_pred_priors, y_test_priors):
    import altair as alt

    alt.data_transformers.enable("vegafusion")

    # Test features, true label and predicted label side by side, one row per
    # test sample.
    dataset_result_priors = pl.concat(
        [
            X_test_priors,
            y_test_priors,
            pl.DataFrame({"Predicted Diabetes_binary": y_pred_priors}),
        ],
        how="horizontal",
    )

    # Collapse the four condition flags into a single number per row by
    # weighting them 8 / 4 / 2 / 1 and summing across the row.
    # NOTE(review): the summed column appears to keep the name of its first
    # input ("HighBP") — confirm before relying on the column name.
    dataset_result_priors1 = dataset_result_priors.select(
        pl.col("HighBP") * 8,
        pl.col("HighChol") * 4,
        pl.col("Stroke") * 2,
        pl.exclude(["HighBP", "HighChol", "Stroke"]),
    ).select(
        pl.sum_horizontal(pl.col("HighBP", "HighChol", "Stroke", "HeartDiseaseorAttack")),
        pl.col("Diabetes_binary", "Predicted Diabetes_binary"),
    )

    # Same idea for the outcome: true label weighted 2 plus predicted label,
    # giving one number per (actual, predicted) combination.
    dataset_result_priors2 = dataset_result_priors.select(
        pl.exclude(["Diabetes_binary", "Predicted Diabetes_binary"]),
        pl.col("Diabetes_binary") * 2,
        pl.col("Predicted Diabetes_binary"),
    ).select(
        pl.col("HighBP", "HighChol", "Stroke", "HeartDiseaseorAttack"),
        pl.sum_horizontal(pl.col("Diabetes_binary", "Predicted Diabetes_binary")),
    )

    # Preview of the outcome-encoded frame.
    dataset_result_priors2.head(10)
    return (
        alt,
        dataset_result_priors,
        dataset_result_priors1,
        dataset_result_priors2,
    )
@app.cell
def _():
    # Empty cell: takes no inputs, defines no names and has no output.
    return
# Run the notebook app only when this file is executed directly as a script.
if __name__ == "__main__":
    app.run()
|