File size: 4,023 Bytes
9d94c83
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import marimo

# Version string stored in the file header (presumably the marimo release that
# generated this notebook — confirm before editing it by hand).
__generated_with = "0.11.4"
# Application object that every `@app.cell` function below is registered on.
app = marimo.App(width="medium")


@app.cell(hide_code=True)
def _(mo):
    # Notebook title, written as a level-1 Markdown heading.
    mo.md(r"""# Diabetes Dataset Analysis""")
    return


@app.cell(hide_code=True)
def _():
    # Shared imports: the names returned here are the `mo` and `pl`
    # parameters that the other cells declare.
    import marimo as mo
    import polars as pl
    return mo, pl


@app.cell(hide_code=True)
def _(mo):
    # Collapsible legend for the 0/1-encoded columns this notebook refers to.
    # The text is shown to the reader, so each entry must describe its own
    # column (HighChol previously repeated the HighBP labels) and every
    # bracketed value list must be closed.
    mo.accordion(
        {"Notes": """
    ## Dataset Column Notes

    > Only highlighted columns of interest 

    * Diabetes_binary: [ 0 (No diabetes) | 1 (Pre/diabetes) ]
    * HighBP: [ 0 (No High BP) | 1 (High BP) ]
    * HighChol: [ 0 (No High Chol) | 1 (High Chol) ]
    * Stroke: [ 0 (Never) | 1 (Had) ]
    * HeartDiseaseorAttack: [ 0 (No) | 1 (Yes) ]
    * Smoker: [ 0 (<100 cigs lifetime) | 1 (>100 cigs lifetime) ]
    * HvyAlcohol: [ 0 (<14 🍺/week for men, <7 🍺/week for women) | 1 (otherwise) ]
        """}
    )
    return


@app.cell(hide_code=True)
def _(pl):
    # Load the full CSV (the path is relative to the working directory) and
    # keep only the target column plus the four prior-condition flags.
    # Underscore-prefixed names are helpers that are not returned by the cell.
    _csv_path = "dataset/diabetes_binary_health_indicators_BRFSS2015.csv"
    _columns_of_interest = [
        "Diabetes_binary",
        "HighBP",
        "HighChol",
        "Stroke",
        "HeartDiseaseorAttack",
    ]

    dataset_raw = pl.read_csv(_csv_path)
    dataset_prior_conditions = dataset_raw.select(_columns_of_interest)

    # Final expression: a preview of the first rows of the reduced frame.
    dataset_prior_conditions.head()
    return dataset_prior_conditions, dataset_raw


@app.cell
def _(mo):
    # Section heading (level-2 Markdown) introducing the classifier cells.
    mo.md("""## Naive Bayes' Classifier""")
    return


@app.cell
def _(dataset_prior_conditions, mo, pl):
    # Fit a Bernoulli naive Bayes model on the prior-condition columns and
    # report accuracy, the confusion matrix and the classification report
    # for a held-out 25% test split.
    from sklearn.naive_bayes import BernoulliNB
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

    # Features are every column except the target. The target stays a
    # single-column DataFrame because the result-building cell concatenates
    # `y_test_priors` horizontally with the other frames.
    X_priors_NB = dataset_prior_conditions.select(pl.exclude("Diabetes_binary"))
    y_priors_NB = dataset_prior_conditions.select("Diabetes_binary")

    # Fixed random_state keeps the split reproducible between runs.
    X_train_priors, X_test_priors, y_train_priors, y_test_priors = train_test_split(
        X_priors_NB, y_priors_NB, random_state=33, test_size=0.25
    )

    bnb = BernoulliNB()
    # scikit-learn expects a 1-D target; passing the (n, 1) frame directly
    # makes `fit` emit a DataConversionWarning, so flatten it for fitting only.
    bnb.fit(X_train_priors, y_train_priors.to_numpy().ravel())
    y_pred_priors = bnb.predict(X_test_priors)
    mo.md(f"""
        Accuracy             : {accuracy_score(y_test_priors, y_pred_priors)}

        Confusion Matrix:

    ```
        {confusion_matrix(y_test_priors, y_pred_priors)}
    ```

        Classification Report: 

    ```
        {classification_report(y_test_priors, y_pred_priors)}
    ```
        """)
    return (
        BernoulliNB,
        X_priors_NB,
        X_test_priors,
        X_train_priors,
        accuracy_score,
        bnb,
        classification_report,
        confusion_matrix,
        train_test_split,
        y_pred_priors,
        y_priors_NB,
        y_test_priors,
        y_train_priors,
    )


@app.cell
def _(X_test_priors, pl, y_pred_priors, y_test_priors):
    import altair as alt
    alt.data_transformers.enable("vegafusion")

    # Underscore-prefixed names are helpers that are not returned by the cell.
    _prior_flags = ["HighBP", "HighChol", "Stroke", "HeartDiseaseorAttack"]
    _actual, _predicted = "Diabetes_binary", "Predicted Diabetes_binary"

    # Test features, true labels and model predictions side by side.
    dataset_result_priors = pl.concat(
        [
            X_test_priors,
            y_test_priors,
            pl.DataFrame({_predicted: y_pred_priors}),
        ],
        how="horizontal",
    )

    # View 1: collapse the four prior-condition columns into one number by
    # weighting them 8/4/2/1 and summing across the row. Assuming each flag is
    # 0 or 1, every combination maps to a distinct code in 0..15.
    # NOTE(review): the summed column is unaliased — presumably it keeps the
    # first input's name ("HighBP"); confirm before relying on it downstream.
    dataset_result_priors1 = dataset_result_priors.select(
        pl.col("HighBP") * 8,
        pl.col("HighChol") * 4,
        pl.col("Stroke") * 2,
        pl.exclude(["HighBP", "HighChol", "Stroke"]),
    ).select(
        pl.sum_horizontal(pl.col(*_prior_flags)),
        pl.col(_actual, _predicted),
    )

    # View 2: keep the prior-condition columns as they are and collapse the
    # outcome pair into 2 * actual + predicted. Assuming 0/1 labels this gives
    # 0 = both negative, 1 = predicted only, 2 = actual only, 3 = both positive.
    dataset_result_priors2 = dataset_result_priors.select(
        pl.exclude([_actual, _predicted]),
        pl.col(_actual) * 2,
        pl.col(_predicted),
    ).select(
        pl.col(*_prior_flags),
        pl.sum_horizontal(pl.col(_actual, _predicted)),
    )

    # Final expression: preview of the second view.
    dataset_result_priors2.head(10)
    return (
        alt,
        dataset_result_priors,
        dataset_result_priors1,
        dataset_result_priors2,
    )


@app.cell
def _():
    # Empty cell: runs no code and defines no names.
    return


# Run the notebook's cells when this file is executed directly as a script.
if __name__ == "__main__":
    app.run()