import streamlit as st
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from huggingface_hub import login
from datasets import load_dataset
import io
from contextlib import redirect_stdout
import os

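# Hugging Face Hub dataset that provides the diabetes patient records used throughout the app.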
dataset_name = "louiecerv/diabetes_dataset" |
|
|
|
|
|
hf_token = os.getenv("HF_TOKEN") |
|
|
|
if not hf_token: |
|
st.error("HF_TOKEN environment variable is not set. Please set it before running the app.") |
|
st.stop() |
|
|
|
|
|
login(token=hf_token) |
|
|
|
|
|
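# Load the dataset from the Hub, reporting the common failure modes (wrong name, bad token) in the UI.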
try:
    with st.spinner("Loading dataset..."):
        dataset = load_dataset(dataset_name)
    st.success("Dataset loaded successfully.")
except ValueError:
    st.error("Dataset not found or incorrect dataset name. Please check the dataset identifier.")
    st.stop()
except PermissionError:
    st.error("Authentication failed. Check if your Hugging Face token is correct.")
    st.stop()
except Exception as e:
    st.error(f"Unexpected error: {e}")
    st.stop()

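# Convert the training split to a pandas DataFrame for exploration and modeling.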
data = dataset["train"].to_pandas() |
|
|
|
|
|
st.title("Diabetes Prediction App") |
|
|
|
with st.expander("About This App"): |
|
st.markdown(""" |
|
## Dataset Description |
|
|
|
This app uses a dataset containing medical and lifestyle information about patients, |
|
along with their diabetes status (positive or negative). The goal is to predict |
|
whether a patient has diabetes based on their provided features. |
|
|
|
The dataset includes the following features: |
|
|
|
| Column | Description | Type | |
|
|-----------------|-------------------------------------------|---------| |
|
| gender | The gender of the patient | Object | |
|
| age | The age of the patient | Float | |
|
| hypertension | Whether the patient has hypertension (1 for yes, 0 for no) | Integer | |
|
| heart_disease | Whether the patient has heart disease (1 for yes, 0 for no) | Integer | |
|
| smoking_history | The smoking history of the patient | Object | |
|
| bmi | The body mass index of the patient | Float | |
|
| HbA1c_level | The HbA1c level of the patient | Float | |
|
| blood_glucose_level | The blood glucose level of the patient | Integer | |
|
| diabetes | Whether the patient has diabetes (1 for yes, 0 for no) | Integer | |
|
|
|
## Preprocessing Tasks |
|
|
|
The following preprocessing steps were performed on the data: |
|
|
|
* **Handle Missing Values:** Missing values were checked and imputed using appropriate methods. |
|
* **Encode Categorical Features:** Categorical features (gender, smoking_history) were converted |
|
into numerical representations using one-hot encoding. |
|
* **Scale Numerical Features:** Numerical features (age, bmi, HbA1c_level, blood_glucose_level) |
|
were scaled to a standard range. |
|
* **Split Data:** The dataset was divided into training and testing sets. |
|
* **Handle Class Imbalance (if present):** Techniques like oversampling or undersampling were used if needed. |
|
|
|
## ML Model Recommendation |
|
|
|
This app utilizes a machine learning model for binary classification. Suitable models for this type of prediction include: |
|
|
|
* Logistic Regression |
|
* Support Vector Machines (SVM) |
|
* Decision Trees |
|
* Random Forest |
|
* Gradient Boosting Machines (GBM) |
|
|
|
Created by Louie F. Cervantes, M.Eng. (Information Engineering) |
|
""") |
|
|
|
|
|
st.subheader("Dataset") |
|
st.write(data) |
|
|
|
|
|
st.subheader("Dataset Statistics") |
|
st.write(data.describe()) |
|
|
|
|
|
st.subheader("Data Visualizations") |
|
|
|
|
|
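# Exploratory plots: the age distribution and the gender counts.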
st.write("Histogram of Age") |
|
fig, ax = plt.subplots() |
|
ax.hist(data['age'], bins=10) |
|
ax.set_xlabel('Age') |
|
ax.set_ylabel('Frequency') |
|
st.pyplot(fig) |
|
|
|
|
|
st.write("Bar Chart of Gender") |
|
fig, ax = plt.subplots() |
|
ax.bar(data['gender'].value_counts().index, data['gender'].value_counts().values) |
|
ax.set_xlabel('Gender') |
|
ax.set_ylabel('Count') |
|
st.pyplot(fig) |
|
|
|
|
|
st.subheader("Data Preprocessing") |
|
|
|
|
|
st.write("Null Values:") |
|
st.write(data.isnull().sum()) |
|
|
|
|
|
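# Impute any missing BMI values with the column mean.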
imputer = SimpleImputer(strategy='mean')
data['bmi'] = imputer.fit_transform(data[['bmi']])

st.write("Data Types:")

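# Capture the output of DataFrame.info() so it can be parsed into a table for display.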
buffer = io.StringIO()

with redirect_stdout(buffer):
    data.info()

info_string = buffer.getvalue()

lines = info_string.splitlines()

# Each column row of info() looks like: " 0   gender   <count> non-null  object"
columns = []
cname = []
counts = []
nulls = []
dtypes = []
for line in lines[5:-2]:
    col_info = line.split()
    columns.append(col_info[0])   # column index
    cname.append(col_info[1])     # column name
    counts.append(col_info[2])    # non-null count
    nulls.append(col_info[3])     # literal "non-null" marker
    dtypes.append(col_info[4])    # dtype

info_df = pd.DataFrame({'#': columns,
                        'Column': cname,
                        'Count': counts,
                        'Non-Null': nulls,
                        'Dtype': dtypes})

st.dataframe(info_df)

numeric_features = data.select_dtypes(include=['int64', 'float64']).columns
categorical_features = data.select_dtypes(include=['object']).columns
st.write("Numeric Features:", numeric_features)
st.write("Categorical Features:", categorical_features)

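# One-hot encode the categorical columns (gender, smoking_history); naming the encoded columns
# keeps the training features aligned with the encoded user input at prediction time.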
encoder = OneHotEncoder(handle_unknown='ignore')
encoded_data = encoder.fit_transform(data[categorical_features])
encoded_df = pd.DataFrame(encoded_data.toarray(),
                          columns=encoder.get_feature_names_out(categorical_features),
                          index=data.index)
data = data.drop(categorical_features, axis=1)
data = pd.concat([data, encoded_df], axis=1)

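# Separate the features and target, then hold out 20% of the rows for testing.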
X = data.drop('diabetes', axis=1)
y = data['diabetes']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)

feature_names = X_train.columns

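# Standardize all features; the fitted scaler is reused later to transform the user's input.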
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

X_train = pd.DataFrame(X_train, columns=feature_names)
X_test = pd.DataFrame(X_test, columns=feature_names)

if 'models_trained' not in st.session_state:
    st.session_state['models_trained'] = False

st.subheader("Machine Learning Models")

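# Candidate classifiers, stored in session state so they are created and trained only once per session.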
if 'models' not in st.session_state:
    st.session_state['models'] = {
        "Logistic Regression": LogisticRegression(),
        "Naive Bayes": GaussianNB(),
        "SVM": SVC(),
        "Decision Tree": DecisionTreeClassifier(),
        "Random Forest": RandomForestClassifier(),
        "Gradient Boosting": GradientBoostingClassifier(),
        "MLP Neural Network": MLPClassifier()
    }

model_tabs = st.tabs(list(st.session_state['models'].keys()))

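# Train each model once, reporting its confusion matrix and classification report in its own tab.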
if not st.session_state['models_trained']:
    st.write(f"Training models on {len(X_train):,} rows...")
    with st.spinner("Training Models..."):
        for i, (model_name, model) in enumerate(st.session_state['models'].items()):
            with model_tabs[i]:
                st.write(model_name)
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)
                st.write("Confusion Matrix:")
                st.write(confusion_matrix(y_test, y_pred))
                cr = classification_report(y_test, y_pred, output_dict=True)

                cr_df = pd.DataFrame(cr).transpose()
                st.write(f"Classification Report - {model_name}")
                st.write(cr_df)
    st.session_state['models_trained'] = True

st.subheader("Diabetes Prediction") |
|
|
|
|
|
selected_model_name = st.selectbox("Select Trained Model", list(st.session_state['models'].keys())) |
|
selected_model = st.session_state['models'][selected_model_name] |
|
|
|
|
|
gender = st.selectbox("Gender", ["Female", "Male", "Other"]) |
|
age = st.number_input("Age", min_value=0, max_value=120, value=30) |
|
hypertension = st.selectbox("Hypertension", ['0', '1']) |
|
heart_disease = st.selectbox("Heart Disease", ['0', '1']) |
|
smoking_history = st.selectbox("Smoking History", ['never', 'No Info', 'current', 'former', 'ever', 'not current']) |
|
bmi = st.number_input("BMI", min_value=0.0, value=25.0) |
|
hba1c_level = st.number_input("HbA1c Level", min_value=0.0, value=6.0) |
|
blood_glucose_level = st.number_input("Blood Glucose Level", min_value=0, value=100) |
|
|
|
if st.button("Predict Diabetes"): |
|
with st.spinner("Prrocessing inputs..."): |
|
|
|
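        # Assemble the user's input into a single-row DataFrame with the original column names.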
        input_data = pd.DataFrame({
            'gender': [gender],
            'age': [age],
            'hypertension': [int(hypertension)],
            'heart_disease': [int(heart_disease)],
            'smoking_history': [smoking_history],
            'bmi': [bmi],
            'HbA1c_level': [hba1c_level],
            'blood_glucose_level': [blood_glucose_level]
        })

        # One-hot encode the categorical inputs with the encoder fitted on the training data.
        encoded_input = encoder.transform(input_data[['gender', 'smoking_history']])
        encoded_input_df = pd.DataFrame(encoded_input.toarray(), columns=encoder.get_feature_names_out())

        input_data = input_data.drop(['gender', 'smoking_history'], axis=1)
        input_data = pd.concat([input_data, encoded_input_df], axis=1)

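        # Align the input columns with the training feature matrix; any one-hot column the input lacks is filled with 0.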
        missing_cols = set(X_train.columns) - set(input_data.columns)
        for col in missing_cols:
            input_data[col] = 0

        input_data = input_data.reindex(columns=X_train.columns, fill_value=0)

        input_data.columns = input_data.columns.astype(str)

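        # Scale the input with the scaler fitted on the training data, then predict with the selected model.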
        input_data_scaled = scaler.transform(input_data)
        input_data_scaled = pd.DataFrame(input_data_scaled, columns=input_data.columns)

        prediction = selected_model.predict(input_data_scaled)

        st.write("Prediction:")
        if prediction[0] == 0:
            st.info("The model predicts that you do not have diabetes.")
        else:
            st.warning("The model predicts that you have diabetes.")