# Spaces: Sleeping
import base64
import html
import io

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import streamlit as st
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import mutual_info_classif
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
# Function to create a download link
def get_download_link(data, filename, text):
    """Build an HTML anchor that downloads ``data`` through a base64 data URI.

    Args:
        data: Payload to embed. Bytes-like objects are encoded as-is; a
            ``str`` is UTF-8 encoded first.
        filename: Name suggested to the browser via the ``download``
            attribute. It is HTML-escaped before being embedded.
        text: Link label. It is inserted verbatim, so inline HTML in the
            label is passed through unchanged.

    Returns:
        The ``<a ...>...</a>`` markup as a string.
    """
    # b64encode only accepts bytes-like input; CSV text produced by
    # DataFrame.to_csv() is a str, so encode it instead of raising TypeError.
    if isinstance(data, str):
        data = data.encode("utf-8")
    b64 = base64.b64encode(data).decode()
    # The filename sits inside a double-quoted attribute: an unescaped quote
    # or angle bracket would terminate the attribute and corrupt the markup.
    safe_name = html.escape(str(filename), quote=True)
    return f'<a href="data:file/csv;base64,{b64}" download="{safe_name}">{text}</a>'
# Function to plot correlation matrix
def plot_correlation_matrix(data):
    """Render a correlation heatmap of ``data`` in the Streamlit page.

    Only numeric and boolean columns take part; other columns are left out
    because a correlation cannot be computed for them.

    Args:
        data: pandas DataFrame whose columns are to be correlated.
    """
    numeric = data.select_dtypes(include=["number", "bool"])
    if numeric.shape[1] == 0:
        # An empty correlation table cannot be drawn as a heatmap.
        st.warning("No numeric columns available for a correlation matrix.")
        return

    # Use an explicit Figure rather than pyplot's implicit global figure so
    # that exactly this figure is rendered and then released.
    fig, ax = plt.subplots(figsize=(12, 10))
    try:
        sns.heatmap(numeric.corr(), annot=True, cmap='coolwarm', linewidths=0.5, ax=ax)
        ax.set_title('Correlation Matrix')
        fig.tight_layout()
        st.pyplot(fig)
    finally:
        # Streamlit re-executes the script on every interaction; without an
        # explicit close, one more open figure would pile up per run.
        plt.close(fig)
# Function to calculate feature importance
def calculate_feature_importance(X, y):
    """Score each feature column of ``X`` against ``y`` with five methods.

    The data is split 80/20 with a fixed seed and the features are
    standardised using statistics from the training part only. Three tree
    models supply their built-in importances; permutation importance is
    measured on the held-out part; mutual information is estimated on the
    training part.

    Args:
        X: Feature table (numeric columns), one row per sample.
        y: Target labels, one per row of ``X``.

    Returns:
        dict mapping the method name ("Decision Tree", "Random Forest",
        "XGBoost", "Permutation", "Mutual Information") to an array with one
        score per column of ``X``, in column order.
    """
    # Encode arbitrary class labels as the integers 0..n_classes-1.
    y_encoded = LabelEncoder().fit_transform(y)

    # Split the dataset
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_encoded, test_size=0.2, random_state=42
    )

    # Fit the scaler on the training rows only, then apply it to both parts.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    methods = {
        "Decision Tree": DecisionTreeClassifier(random_state=42),
        "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
        "XGBoost": XGBClassifier(random_state=42),
    }

    importance_dict = {}
    for name, model in methods.items():
        model.fit(X_train_scaled, y_train)
        importance_dict[name] = model.feature_importances_

    # Permutation Importance
    # Reuse the forest fitted in the loop: a second forest with the same
    # settings, seed and training data would be an identical model, so
    # refitting only costs time.
    perm_importance = permutation_importance(
        methods["Random Forest"], X_test_scaled, y_test, n_repeats=10, random_state=42
    )
    importance_dict["Permutation"] = perm_importance.importances_mean

    # Mutual Information
    # Seeded like every other step so repeated runs give the same scores.
    importance_dict["Mutual Information"] = mutual_info_classif(
        X_train_scaled, y_train, random_state=42
    )

    return importance_dict
# Streamlit app
# Page flow: upload a CSV, pick the target column, then on "Analyze" show the
# correlation heatmap and a table of feature-importance scores.
st.title('Heart Disease Feature Analysis')

# File upload (this line defines `uploaded_file`)
uploaded_file = st.file_uploader("Choose a CSV file", type="csv")

if uploaded_file is not None:
    data = pd.read_csv(uploaded_file)
    st.write("Data Preview:")
    st.write(data.head())

    # Select target variable
    target_col = st.selectbox("Select the target variable", data.columns)

    if st.button('Analyze'):
        y = data[target_col]
        features = data.drop(target_col, axis=1)

        # Scaling and the importance estimators need numeric input, so
        # text/categorical feature columns are set aside (and reported)
        # instead of letting the analysis fail on them.
        X = features.select_dtypes(include=["number", "bool"])
        skipped = [str(col) for col in features.columns if col not in X.columns]
        if skipped:
            st.warning(f"Ignoring non-numeric feature columns: {', '.join(skipped)}")

        # Show original target values for debugging
        st.write("Original Target Values:", y.unique())

        # Correlation Matrix
        st.subheader('Correlation Matrix')
        numeric_data = data.select_dtypes(include=["number", "bool"])
        if numeric_data.shape[1] == 0:
            st.warning("No numeric columns available for a correlation matrix.")
        else:
            plot_correlation_matrix(numeric_data)

        # Feature Importance
        st.subheader('Feature Importance')
        if X.shape[1] == 0:
            st.error("No numeric feature columns are available for the analysis.")
        else:
            importance_dict = calculate_feature_importance(X, y)
            # Create a DataFrame with all feature importances
            # (one row per feature, one column per method).
            importance_df = pd.DataFrame(importance_dict, index=X.columns)
            st.write(importance_df)
else:
    st.write("Please upload a CSV file to begin the analysis.")