# Spencer525's picture
# Update app.py
# e0263ce verified
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.inspection import permutation_importance
from sklearn.feature_selection import mutual_info_classif
import io
import base64
# Function to create a download link
def get_download_link(data, filename, text):
    """Build an HTML anchor that downloads ``data`` via a base64 data URI.

    Args:
        data: Payload to embed in the link. Bytes-like objects are encoded
            as-is; a ``str`` is first encoded as UTF-8.
        filename: File name placed in the anchor's ``download`` attribute.
        text: Inner content of the anchor. It is inserted verbatim, so the
            caller is responsible for escaping it if it is untrusted.

    Returns:
        The complete ``<a ...>...</a>`` element as a string.
    """
    import html  # local import keeps this block self-contained

    # b64encode only accepts bytes-like input; accept text for convenience.
    if isinstance(data, str):
        data = data.encode("utf-8")
    b64 = base64.b64encode(data).decode()
    # Escape the attribute value so quotes or '&' in the name cannot break
    # out of the download="..." attribute.
    safe_filename = html.escape(str(filename), quote=True)
    href = f'<a href="data:file/csv;base64,{b64}" download="{safe_filename}">{text}</a>'
    return href
# Function to plot correlation matrix
def plot_correlation_matrix(data):
    """Draw an annotated correlation heatmap of ``data`` in the Streamlit page.

    Only numeric and boolean columns take part in the correlation; other
    columns (e.g. strings) are left out so ``corr()`` does not fail on them.

    Args:
        data: DataFrame whose column-to-column correlations are plotted.

    Returns:
        None. The figure is rendered with ``st.pyplot`` and then closed.
    """
    numeric_data = data.select_dtypes(include=["number", "bool"])
    if numeric_data.shape[1] == 0:
        # An empty correlation matrix cannot be drawn as a heatmap.
        st.warning("No numeric columns available for the correlation matrix.")
        return

    # Use an explicit figure instead of pyplot's implicit global one, so the
    # plot handed to Streamlit is exactly the one built here.
    fig, ax = plt.subplots(figsize=(12, 10))
    try:
        sns.heatmap(numeric_data.corr(), annot=True, cmap='coolwarm', linewidths=0.5, ax=ax)
        ax.set_title('Correlation Matrix')
        fig.tight_layout()
        st.pyplot(fig)
    finally:
        # Release the figure; otherwise each call keeps one more figure alive.
        plt.close(fig)
# Function to calculate feature importance
def calculate_feature_importance(X, y):
    """Score every feature of ``X`` against target ``y`` with several methods.

    The target is label-encoded, the data is split 80/20 with a fixed seed,
    and features are standardised using statistics from the training split.

    Args:
        X: Feature table; passed to ``StandardScaler``, so its columns must
            be numeric.
        y: Target labels (any values ``LabelEncoder`` accepts).

    Returns:
        dict mapping a method name to an array of per-feature scores, in the
        column order of ``X``. Keys: "Decision Tree", "Random Forest",
        "XGBoost", "Permutation", "Mutual Information".
    """
    # Encode non-sequential class labels to sequential integers (0..n-1)
    le = LabelEncoder()
    y_encoded = le.fit_transform(y)

    # Split the dataset
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_encoded, test_size=0.2, random_state=42
    )

    # Fit the scaler on the training split only, then apply it to both splits
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    methods = {
        "Decision Tree": DecisionTreeClassifier(random_state=42),
        "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
        "XGBoost": XGBClassifier(random_state=42),
    }

    # Model-native importances
    importance_dict = {}
    for name, model in methods.items():
        model.fit(X_train_scaled, y_train)
        importance_dict[name] = model.feature_importances_

    # Permutation Importance, measured on the held-out split. The forest
    # fitted above has the same hyper-parameters, seed and training data a
    # fresh one would have, so it is reused instead of being trained again.
    rf = methods["Random Forest"]
    perm_importance = permutation_importance(
        rf, X_test_scaled, y_test, n_repeats=10, random_state=42
    )
    importance_dict["Permutation"] = perm_importance.importances_mean

    # Mutual Information; seeded so repeated runs give the same scores,
    # like every other estimator in this function.
    mi_scores = mutual_info_classif(X_train_scaled, y_train, random_state=42)
    importance_dict["Mutual Information"] = mi_scores

    return importance_dict
# Streamlit app: page title, CSV upload, target selection and analysis output
st.title('Heart Disease Feature Analysis')

# File upload (this line defines `uploaded_file`)
uploaded_file = st.file_uploader("Choose a CSV file", type="csv")

if uploaded_file is None:
    # Nothing uploaded yet: show the prompt only.
    st.write("Please upload a CSV file to begin the analysis.")
else:
    data = pd.read_csv(uploaded_file)
    st.write("Data Preview:")
    st.write(data.head())

    # Let the user pick which column is the target variable
    target_col = st.selectbox("Select the target variable", data.columns)

    analyze_clicked = st.button('Analyze')
    if analyze_clicked:
        # Separate the chosen target from the remaining feature columns
        y = data[target_col]
        X = data.drop(columns=target_col)

        # Show the raw target values (useful when debugging label encoding)
        st.write("Original Target Values:", y.unique())

        # Correlation Matrix over the full uploaded table
        st.subheader('Correlation Matrix')
        plot_correlation_matrix(data)

        # Feature Importance: one column per method, one row per feature
        st.subheader('Feature Importance')
        importance_dict = calculate_feature_importance(X, y)
        importance_df = pd.DataFrame(importance_dict, index=X.columns)
        st.write(importance_df)