import streamlit as st
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from huggingface_hub import login
from datasets import load_dataset
import io
from contextlib import redirect_stdout
import os

# Streamlit UI
dataset_name = "louiecerv/diabetes_dataset"

# Retrieve Hugging Face token from environment variable
hf_token = os.getenv("HF_TOKEN")
if not hf_token:
    st.error("HF_TOKEN environment variable is not set. Please set it before running the app.")
    st.stop()

# Login to Hugging Face Hub
login(token=hf_token)

# Load dataset
try:
    with st.spinner("Loading dataset..."):
        dataset = load_dataset(dataset_name)
    st.success("Dataset loaded successfully.")
except ValueError:
    st.error("Dataset not found or incorrect dataset name. Please check the dataset identifier.")
    st.stop()
except PermissionError:
    st.error("Authentication failed. Check if your Hugging Face token is correct.")
    st.stop()
except Exception as e:
    st.error(f"Unexpected error: {e}")
    st.stop()

data = dataset["train"].to_pandas()

# Set the title of the Streamlit app
st.title("Diabetes Prediction App")

with st.expander("About This App"):
    st.markdown("""
## Dataset Description

This app uses a dataset containing medical and lifestyle information about patients,
along with their diabetes status (positive or negative). The goal is to predict whether
a patient has diabetes based on their provided features.

The dataset includes the following features:

| Column              | Description                                                  | Type    |
|---------------------|--------------------------------------------------------------|---------|
| gender              | The gender of the patient                                    | Object  |
| age                 | The age of the patient                                       | Float   |
| hypertension        | Whether the patient has hypertension (1 for yes, 0 for no)   | Integer |
| heart_disease       | Whether the patient has heart disease (1 for yes, 0 for no)  | Integer |
| smoking_history     | The smoking history of the patient                           | Object  |
| bmi                 | The body mass index of the patient                           | Float   |
| HbA1c_level         | The HbA1c level of the patient                               | Float   |
| blood_glucose_level | The blood glucose level of the patient                       | Integer |
| diabetes            | Whether the patient has diabetes (1 for yes, 0 for no)       | Integer |

## Preprocessing Tasks

The following preprocessing steps were performed on the data:

* **Handle Missing Values:** Missing values were checked and imputed using appropriate methods.
* **Encode Categorical Features:** Categorical features (gender, smoking_history) were converted into numerical representations using one-hot encoding.
* **Scale Numerical Features:** Numerical features (age, bmi, HbA1c_level, blood_glucose_level) were scaled to a standard range.
* **Split Data:** The dataset was divided into training and testing sets.
* **Handle Class Imbalance (if present):** Techniques like oversampling or undersampling were used if needed.

## ML Model Recommendation

This app utilizes a machine learning model for binary classification.
Suitable models for this type of prediction include:

* Logistic Regression
* Support Vector Machines (SVM)
* Decision Trees
* Random Forest
* Gradient Boosting Machines (GBM)

Created by Louie F. Cervantes, M.Eng. (Information Engineering)
""")

# Display the dataset in a dataframe
st.subheader("Dataset")
st.write(data)

# Show the statistics of the dataset
st.subheader("Dataset Statistics")
st.write(data.describe())

# Visualizations of the data
st.subheader("Data Visualizations")

# Histogram of age
st.write("Histogram of Age")
fig, ax = plt.subplots()
ax.hist(data['age'], bins=10)
ax.set_xlabel('Age')
ax.set_ylabel('Frequency')
st.pyplot(fig)

# Bar chart of gender
st.write("Bar Chart of Gender")
fig, ax = plt.subplots()
ax.bar(data['gender'].value_counts().index, data['gender'].value_counts().values)
ax.set_xlabel('Gender')
ax.set_ylabel('Count')
st.pyplot(fig)

# Preprocessing
st.subheader("Data Preprocessing")

# Check for null values
st.write("Null Values:")
st.write(data.isnull().sum())

# Handle null values by imputing the column mean
imputer = SimpleImputer(strategy='mean')
data['bmi'] = imputer.fit_transform(data[['bmi']])

# Check for consistency of data types
st.write("Data Types:")

# df.info() prints to stdout, so capture its output in a string buffer
buffer = io.StringIO()
with redirect_stdout(buffer):
    data.info()
info_string = buffer.getvalue()

# Split the captured output into lines and parse the per-column entries
lines = info_string.splitlines()
columns = []
cname = []
counts = []
nulls = []
dtypes = []
for line in lines[5:-2]:  # Skip the header and footer lines of df.info()
    col_info = line.split()
    columns.append(col_info[0])
    cname.append(col_info[1])
    counts.append(col_info[2])
    nulls.append(col_info[3])
    dtypes.append(col_info[4])

# Create a DataFrame summarizing the column information
info_df = pd.DataFrame({'Column': columns, 'Name': cname, 'Count': counts,
                        'Null': nulls, 'Data Type': dtypes})

# Display the DataFrame in Streamlit
st.dataframe(info_df)

# Identify numeric and categorical data
numeric_features = data.select_dtypes(include=['int64', 'float64']).columns
categorical_features = data.select_dtypes(include=['object']).columns
st.write("Numeric Features:", numeric_features)
st.write("Categorical Features:", categorical_features)

# One-hot encoding for categorical data; keep the encoder's feature names so the
# training columns match the columns produced when encoding user input later
encoder = OneHotEncoder(handle_unknown='ignore')
encoded_data = encoder.fit_transform(data[categorical_features])
encoded_df = pd.DataFrame(encoded_data.toarray(), columns=encoder.get_feature_names_out())
data = data.drop(categorical_features, axis=1)
data = pd.concat([data, encoded_df], axis=1)

# Split data into training and testing sets
X = data.drop('diabetes', axis=1)
y = data['diabetes']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Redefine numeric_features after one-hot encoding and after dropping the target column
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns

# Convert all column names to strings
X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)
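# Check class balance. The "About This App" notes list handling class imbalance as a
# preprocessing step if it is present; this sketch only reports the distribution of the
# target so any imbalance is visible. If it turns out to be severe, one standard option
# is to pass class_weight='balanced' to classifiers such as LogisticRegression or
# RandomForestClassifier, or to resample the training split before fitting.
st.write("Class Distribution (diabetes):")
st.write(y_train.value_counts(normalize=True))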
# Scale numeric features (StandardScaler returns a NumPy array, so save the
# column names first and rebuild the DataFrames after scaling)
scaler = StandardScaler()
feature_names = X_train.columns
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=feature_names)
X_test = pd.DataFrame(scaler.transform(X_test), columns=feature_names)

# Initialize session state for model training flag
if 'models_trained' not in st.session_state:
    st.session_state['models_trained'] = False

# ML Models
st.subheader("Machine Learning Models")

# Initialize session state for models
if 'models' not in st.session_state:
    st.session_state['models'] = {
        "Logistic Regression": LogisticRegression(),
        "Naive Bayes": GaussianNB(),
        "SVM": SVC(),
        "Decision Tree": DecisionTreeClassifier(),
        "Random Forest": RandomForestClassifier(),
        "Gradient Boosting": GradientBoostingClassifier(),
        "MLP Neural Network": MLPClassifier()
    }

# Create tabs for different models
model_tabs = st.tabs(list(st.session_state['models'].keys()))

# Train the models and store them in session state
if not st.session_state['models_trained']:
    st.write("Training Models with 100,000 data rows...")
    with st.spinner("Training Models..."):
        for i, (model_name, model) in enumerate(st.session_state['models'].items()):
            with model_tabs[i]:
                st.write(model_name)
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)

                st.write("Confusion Matrix:")
                st.write(confusion_matrix(y_test, y_pred))

                # Display the classification report as a dataframe
                cr = classification_report(y_test, y_pred, output_dict=True)
                cr_df = pd.DataFrame(cr).transpose()
                st.write(f"Classification Report - {model_name}")
                st.write(cr_df)

    st.session_state['models_trained'] = True

# Diabetes Prediction
st.subheader("Diabetes Prediction")

# Select the trained model to use
selected_model_name = st.selectbox("Select Trained Model", list(st.session_state['models'].keys()))
selected_model = st.session_state['models'][selected_model_name]

# Input Fields
gender = st.selectbox("Gender", ["Female", "Male", "Other"])
age = st.number_input("Age", min_value=0, max_value=120, value=30)
hypertension = st.selectbox("Hypertension", ['0', '1'])
heart_disease = st.selectbox("Heart Disease", ['0', '1'])
smoking_history = st.selectbox("Smoking History", ['never', 'No Info', 'current', 'former', 'ever', 'not current'])
bmi = st.number_input("BMI", min_value=0.0, value=25.0)
hba1c_level = st.number_input("HbA1c Level", min_value=0.0, value=6.0)
blood_glucose_level = st.number_input("Blood Glucose Level", min_value=0, value=100)

if st.button("Predict Diabetes"):
    with st.spinner("Processing inputs..."):
        # Create a DataFrame for the user input
        input_data = pd.DataFrame({
            'gender': [gender],
            'age': [age],
            'hypertension': [int(hypertension)],  # Convert categorical numeric inputs to int
            'heart_disease': [int(heart_disease)],
            'smoking_history': [smoking_history],
            'bmi': [bmi],
            'HbA1c_level': [hba1c_level],
            'blood_glucose_level': [blood_glucose_level]
        })

        # Apply the same one-hot encoding that was fitted on the training data
        encoded_input = encoder.transform(input_data[['gender', 'smoking_history']])
        encoded_input_df = pd.DataFrame(encoded_input.toarray(), columns=encoder.get_feature_names_out())

        # Drop the original categorical columns and concatenate the encoded features
        input_data = input_data.drop(['gender', 'smoking_history'], axis=1)
        input_data = pd.concat([input_data, encoded_input_df], axis=1)

        # Ensure that the input data has the same columns as the training data
        missing_cols = set(X_train.columns) - set(input_data.columns)
        for col in missing_cols:
            input_data[col] = 0  # Add missing columns with zero values

        # Reorder columns to match the training data
        input_data = input_data.reindex(columns=X_train.columns, fill_value=0)

        # Convert all column names to strings
        input_data.columns = input_data.columns.astype(str)
        # Scale the user input and convert back to a DataFrame after transformation
        input_data_scaled = scaler.transform(input_data)
        input_data_scaled = pd.DataFrame(input_data_scaled, columns=input_data.columns)

        # Make prediction using the selected model
        prediction = selected_model.predict(input_data_scaled)

        # Display the prediction
        st.write("Prediction:")
        if prediction[0] == 0:
            st.info("The model predicts that you do not have diabetes.")
        else:
            st.warning("The model predicts that you have diabetes.")