# Spaces: Sleeping
# (Page-status text captured together with the source; it is not part of the program.)
import streamlit as st | |
import numpy as np | |
import pandas as pd | |
import io | |
import matplotlib.pyplot as plt | |
from matplotlib.ticker import PercentFormatter | |
import seaborn as sns | |
from sklearn.preprocessing import ( | |
OneHotEncoder, | |
OrdinalEncoder, | |
StandardScaler, | |
MinMaxScaler, | |
) | |
from sklearn.model_selection import train_test_split | |
from imblearn.under_sampling import RandomUnderSampler | |
from imblearn.over_sampling import RandomOverSampler, SMOTE | |
from sklearn.linear_model import Ridge, Lasso, LogisticRegression | |
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier | |
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier | |
from sklearn.svm import SVR, SVC | |
from sklearn.naive_bayes import MultinomialNB | |
from xgboost import XGBRFRegressor, XGBRFClassifier | |
from lightgbm import LGBMRegressor, LGBMClassifier | |
from sklearn.metrics import ( | |
mean_absolute_error, | |
mean_squared_error, | |
mean_squared_error, | |
r2_score, | |
) | |
from sklearn.metrics import ( | |
accuracy_score, | |
f1_score, | |
confusion_matrix, | |
precision_score, | |
recall_score, | |
) | |
import pickle | |
# Browser-tab title and icon for the app.
st.set_page_config(page_title="Data Analytics", page_icon="📊")

# Global seaborn look shared by every chart in this file.
sns.set_style("white")
sns.set_context("poster", font_scale=0.7)

# Colour cycle used by the plotting helpers (indexed and sliced by position,
# so the order of the entries matters).
palette = [
    "#1d7874", "#679289", "#f4c095", "#ee2e31", "#ffb563",
    "#918450", "#f85e00", "#a41623", "#9a031e", "#d6d6d6",
    "#ffee32", "#ffd100", "#333533", "#202020",
]
def main():
    """Run the Streamlit app: load a CSV and render the selected page.

    The uploaded file is parsed only on the run in which "Process Files" is
    clicked; the resulting DataFrame is kept in ``st.session_state["data"]``
    and reused on later runs. Until a DataFrame has been stored, only the
    landing title is shown.
    """
    file = st.sidebar.file_uploader("Upload Your CSV File Here: ")
    process = st.sidebar.button("Process Files")
    option = st.sidebar.radio(
        "Select Data Analysis: ",
        (
            "Exploratory Data Analysis",
            "Univariate Analysis",
            "Bivariate Analysis",
        ),
    )

    placeholder = st.empty()
    placeholder.markdown(
        "<h1 style='text-align: center;'>Tabular/CSV Data Analytics📊</h1>",
        unsafe_allow_html=True,
    )

    if file is not None and process:
        st.session_state["data"] = load_csv(file)
    if "data" not in st.session_state:
        return

    data = st.session_state["data"]
    placeholder.empty()

    # NOTE: the sidebar radio above offers only the three analysis pages, so
    # "Preprocess" and "Training and Evaluation" cannot be selected today.
    # They stay registered so that adding their labels to the radio is all
    # that is needed to enable them.
    pages = {
        "Exploratory Data Analysis": _render_eda,
        "Univariate Analysis": _render_univariate,
        "Bivariate Analysis": _render_bivariate,
        "Preprocess": _render_preprocess,
        "Training and Evaluation": _render_training,
    }
    render_page = pages.get(option)
    if render_page is not None:
        render_page(data)


# Model name -> (factory, download file name). Factories are lambdas so that
# only the estimator the user picked is ever constructed.
_REGRESSORS = {
    "Ridge Regression": (
        lambda: Ridge(alpha=1.0),
        "ridge_regression_model.pkl",
    ),
    "Decision Tree Regressor": (
        lambda: DecisionTreeRegressor(max_depth=10),
        "decision_tree_regression_model.pkl",
    ),
    "Random Forest Regressor": (
        lambda: RandomForestRegressor(max_depth=10, n_estimators=100),
        "random_forest_regression_model.pkl",
    ),
    "SVR": (
        lambda: SVR(C=1.0, epsilon=0.2),
        "svr_model.pkl",
    ),
    "XGBRF Regressor": (
        lambda: XGBRFRegressor(reg_lambda=1),
        "xgbrf_regression_model.pkl",
    ),
    "LGBM Regressor": (
        lambda: LGBMRegressor(reg_lambda=1),
        "lgbm_regression_model.pkl",
    ),
}

# Model name -> (factory, download file name, confusion-matrix title).
_CLASSIFIERS = {
    "Logistic Regression": (
        lambda: LogisticRegression(penalty="l2"),
        "logistic_regression_model.pkl",
        "Logistic Regression Confusion Matrix ",
    ),
    "Decision Tree Classifier": (
        lambda: DecisionTreeClassifier(max_depth=5),
        "decision_tree_classifier_model.pkl",
        "DecisionTree Classifier Confusion Matrix ",
    ),
    "Random Forest Classifier": (
        lambda: RandomForestClassifier(n_estimators=100, max_depth=5),
        "random_forest_classifier_model.pkl",
        "RandomForest Classifier Confusion Matrix ",
    ),
    "SVC": (
        lambda: SVC(C=1.5),
        "svc_model.pkl",
        "SVC Confusion Matrix ",
    ),
    "XGBRF Classifier": (
        lambda: XGBRFClassifier(reg_lambda=1.0),
        "xgbrf_classifier_model.pkl",
        "XGBRF Classifier Confusion Matrix ",
    ),
    "LGBM Classifier": (
        lambda: LGBMClassifier(reg_lambda=1.0),
        "lgbm_classifier_model.pkl",
        "LGBM Classifier Confusion Matrix ",
    ),
}

# Numeric imputation choice -> function producing the fill values from the
# numeric part of the frame (per-column scalars or a filled frame).
_NUMERIC_FILLS = {
    "mean": lambda frame: frame.mean(),
    "median": lambda frame: frame.median(),
    "backward fill": lambda frame: frame.bfill(),
    "forward fill": lambda frame: frame.ffill(),
}


def _page_title(text):
    """Render a centred <h1> page heading."""
    st.markdown(
        f"<h1 style='text-align: center;'>{text}</h1>",
        unsafe_allow_html=True,
    )


def _render_eda(data):
    """Show the overview, dtype, missing-value and summary-statistics sections."""
    _page_title("Exploratory Data Analysis")
    st.subheader("Data Overview")
    st.write(data_overview(data))
    st.write(duplicate(data))
    st.dataframe(data.head())
    st.subheader("Data Types and Unique Value Counts")
    display_data_info(data)
    st.subheader("Missing Data")
    missing_data(data)
    st.subheader("Value Counts")
    value_counts(data)
    st.subheader("Descriptive Statistics")
    st.write(data.describe().T)


def _render_univariate(data):
    """Let the user pick a single-column chart and the column to draw."""
    _page_title("Univariate Analysis")
    plot = st.radio(
        "Select a chart: ",
        ("Count Plot", "Pie Chart", "Histogram", "Violin Plot", "Scatter Plot"),
    )
    categorical = list(data.select_dtypes("O"))
    # "number" covers every numeric width, not only the platform default
    # int/float sizes.
    numeric = list(data.select_dtypes(include="number"))
    charts = {
        "Count Plot": (categorical, countplot),
        "Pie Chart": (categorical, piechart),
        "Histogram": (numeric, histogram),
        "Violin Plot": (numeric, violinplot),
        "Scatter Plot": (numeric, scatterplot),
    }
    choices, draw = charts[plot]
    # The leading "" entry keeps the chart hidden until a column is chosen.
    column = st.selectbox("Select a column", [""] + choices)
    if column:
        draw(data, column)


def _render_bivariate(data):
    """Let the user pick a two-column chart (or a Pareto chart) and draw it."""
    _page_title("Bivariate Analysis")
    plot = st.radio(
        "Select a chart: ",
        ("Scatter Plot", "Bar Plot", "Box Plot", "Pareto Chart"),
    )
    if plot == "Pareto Chart":
        column = st.selectbox(
            "Select a columns",
            [""] + list(data.select_dtypes(include="object")),
        )
        if column:
            paretoplot(data, column)
        return

    all_columns = list(data.columns)
    pair_charts = {
        "Scatter Plot": (
            list(data.select_dtypes(include="number")),
            biscatterplot,
        ),
        "Bar Plot": (all_columns, bibarplot),
        "Box Plot": (all_columns, biboxplot),
    }
    choices, draw = pair_charts[plot]
    # No "" placeholder here: in a multiselect it would be a selectable
    # (and invalid) column name.
    columns = st.multiselect("Select two columns", choices)
    if len(columns) >= 2:
        draw(data, columns)
    elif columns:
        st.info("Select two columns to draw the chart.")


def _render_preprocess(data):
    """Run the chosen preprocessing step, store the result and offer a CSV."""
    _page_title("Data Preprocessing")
    operation = st.radio(
        "Select preprocessing step: ",
        (
            "Drop Columns",
            "Handling Missing Values",
            "Encode Categorical Features",
        ),
    )
    steps = {
        "Drop Columns": _drop_columns,
        "Handling Missing Values": _handle_missing_values,
        "Encode Categorical Features": _encode_categoricals,
    }
    data = steps[operation](data)
    st.session_state["data"] = data

    if st.download_button(
        label="Download Preprocessed Data",
        key="preprocessed_data",
        data=data.to_csv(index=False).encode(),
        file_name="preprocessed_data.csv",
        mime="text/csv",
    ):
        st.success('Data Downloaded')


def _drop_columns(data):
    """Return ``data`` without the columns the user selected and confirmed."""
    columns = st.multiselect("Select Columns to drop: ", list(data.columns))
    if st.button("Drop Columns"):
        data = data.drop(columns=columns)
        st.success("Dropped selected columns✅✅✅")
    return data


def _handle_missing_values(data):
    """Return ``data`` with missing values dropped or imputed as selected."""
    num_missing = st.selectbox(
        "Select a Approach (Numerical columns only): ",
        ("", "Drop", "Backward Fill", "Forward Fill", "Mean", "Median"),
    ).lower()
    cat_missing = st.selectbox(
        "Select a Approach (Categorical columns only): ",
        ("", "Drop", "Most Frequent Values", "Replace with 'Unknown'"),
    ).lower()
    if not st.button("Handle Missing Values"):
        return data

    if num_missing:
        numeric = data.select_dtypes(include="number")
        if num_missing == "drop":
            data = data.dropna(subset=list(numeric.columns))
        else:
            # The fill values are keyed by numeric column names, so the other
            # columns are left untouched.
            data = data.fillna(value=_NUMERIC_FILLS[num_missing](numeric))
        st.success(
            "Imputed missing values in numerical columns with selected approach."
        )

    if cat_missing:
        categorical = list(data.select_dtypes(exclude="number").columns)
        if cat_missing == "drop":
            data = data.dropna(subset=categorical)
        elif categorical and cat_missing == "most frequent values":
            modes = data[categorical].mode()
            # mode() is empty when there is nothing to count; skip instead of
            # failing on iloc[0].
            if not modes.empty:
                data = data.fillna(value=modes.iloc[0])
        elif categorical and cat_missing == "replace with 'unknown'":
            data = data.fillna(value={col: "Unknown" for col in categorical})
        st.success(
            "Imputed missing values in categorical columns with selected approach."
        )
    return data


def _bools_to_int(frame):
    """Return a copy of ``frame`` with every bool column cast to int."""
    bool_columns = frame.select_dtypes(include=bool).columns
    return frame.astype({col: int for col in bool_columns})


def _encode_categoricals(data):
    """Return ``data`` with object columns ordinal- or one-hot-encoded."""
    oe_columns = st.multiselect(
        "Choose Columns for Ordinal Encoding",
        list(data.select_dtypes(include="object")),
    )
    st.info("Other columns will be One Hot Encoded.")
    if not st.button("Encode Columns"):
        return data

    # _bools_to_int returns a new frame, so the assignments below never touch
    # the DataFrame held in session state.
    data = _bools_to_int(data)
    if oe_columns:
        data[oe_columns] = OrdinalEncoder().fit_transform(
            data[oe_columns].astype("str")
        )
    remaining_cat_cols = [
        col
        for col in data.select_dtypes(include="object")
        if col not in oe_columns
    ]
    if remaining_cat_cols:
        data = pd.get_dummies(
            data, columns=remaining_cat_cols, drop_first=False
        )
    st.success("Encoded categorical columns")
    # get_dummies may emit bool indicator columns; normalise them to 0/1.
    return _bools_to_int(data)


def _render_training(data):
    """Ask for the algorithm family and run the matching training flow."""
    _page_title("Training and Evaluation")
    algo = st.selectbox(
        "Choose Algorithm Type:", ("", "Regression", "Classification")
    )
    if algo == "Regression":
        _train_regression(data)
    elif algo == "Classification":
        _train_classification(data)


def _split_on_target(data):
    """Ask for the target column and return an 80/20 train/test split.

    Returns:
        ``(target, X_train, X_test, y_train, y_test)``, or ``None`` when the
        target could not be separated from the features (the error text is
        shown to the user).
    """
    target = st.selectbox("Chose Target Variable (Y): ", list(data.columns))
    try:
        features = data.drop(target, axis=1)
        labels = data[target]
    except (KeyError, ValueError) as e:
        st.write(str(e))
        return None
    st.write(
        "80% of the data will be used for training the model, rest of 20% data will be used for evaluating the model."
    )
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=42
    )
    return target, X_train, X_test, y_train, y_test


def _scale_features(X_train, X_test):
    """Apply the scaler chosen by the user, fitted on the training set only."""
    scale = st.selectbox(
        "Choose how do you want to scale features:",
        ("", "Standard Scaler", "Min Max Scaler"),
    )
    scalers = {
        "Standard Scaler": StandardScaler,
        "Min Max Scaler": MinMaxScaler,
    }
    scaler_cls = scalers.get(scale)
    if scaler_cls is None:
        return X_train, X_test
    scaler = scaler_cls()
    return scaler.fit_transform(X_train), scaler.transform(X_test)


def _balance_training_set(data, target, X_train, y_train):
    """Optionally resample the training set with the chosen sampler."""
    balance = st.selectbox(
        "Do you want to balance dataset?", ("", "Yes", "No")
    )
    if balance != "Yes":
        return X_train, y_train
    piechart(data, target)
    sample = st.selectbox(
        "Which approach you want to use?",
        ("", "Random Under Sampling", "Random Over Sampling", "SMOTE"),
    )
    samplers = {
        "Random Under Sampling": RandomUnderSampler,
        "Random Over Sampling": RandomOverSampler,
        "SMOTE": SMOTE,
    }
    sampler_cls = samplers.get(sample)
    if sampler_cls is None:
        return X_train, y_train
    return sampler_cls(random_state=42).fit_resample(X_train, y_train)


def _report_regression(y_true, y_pred):
    """Write MAE, MSE, RMSE and R² for the predictions."""
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    st.write(f"Mean Absolute Error (MAE): {mae:.4f}")
    st.write(f"Mean Squared Error (MSE): {mse:.4f}")
    # Take the root explicitly rather than relying on the ``squared``
    # keyword, which newer scikit-learn releases no longer accept.
    st.write(f"Root Mean Squared Error (RMSE): {np.sqrt(mse):.4f}")
    # R² is not symmetric: the ground truth must be the first argument.
    st.write(f"R-squared (R²): {r2_score(y_true, y_pred):.4f}")


def _report_classification(y_true, y_pred):
    """Write accuracy plus F1/precision/recall (macro-averaged as fallback)."""
    st.write(f"Accuracy Score: {accuracy_score(y_true, y_pred):.4f}")
    try:
        scores = [
            ("F1 Score", f1_score(y_true, y_pred)),
            ("Precision Score", precision_score(y_true, y_pred)),
            ("Recall Score", recall_score(y_true, y_pred)),
        ]
    except ValueError:
        # The default binary averaging raised (e.g. more than two classes);
        # fall back to macro averages.
        scores = [
            (
                "Macro Precision Score",
                precision_score(y_true, y_pred, average="macro"),
            ),
            (
                "Macro Recall Score",
                recall_score(y_true, y_pred, average="macro"),
            ),
            ("Macro F1 Score", f1_score(y_true, y_pred, average="macro")),
        ]
    for label, value in scores:
        st.write(f"{label}: {value:.4f}")


def _offer_model_download(model, file_name):
    """Offer the pickled model for download under ``file_name``."""
    payload = pickle.dumps(model)
    clicked = st.download_button(
        label="Download Trained Model",
        key="trained_model",
        data=payload,
        file_name=file_name,
        mime="application/octet-stream",
    )
    if clicked:
        # Kept from the original flow: a copy is also written to the app's
        # working directory when the button is pressed.
        with open(file_name, "wb") as model_file:
            model_file.write(payload)


def _train_regression(data):
    """Split, scale, fit the chosen regressor and report its metrics."""
    split = _split_on_target(data)
    if split is None:
        return
    _, X_train, X_test, y_train, y_test = split
    X_train, X_test = _scale_features(X_train, X_test)

    model = st.selectbox(
        "Choose Regression Model for training: ", ("", *_REGRESSORS)
    )
    if not model:
        return
    build, file_name = _REGRESSORS[model]
    reg = build()
    reg.fit(X_train, y_train)
    pred = reg.predict(X_test)
    _report_regression(y_test, pred)
    _offer_model_download(reg, file_name)


def _train_classification(data):
    """Split, balance, scale, fit the chosen classifier and report metrics."""
    split = _split_on_target(data)
    if split is None:
        return
    target, X_train, X_test, y_train, y_test = split
    X_train, y_train = _balance_training_set(data, target, X_train, y_train)
    X_train, X_test = _scale_features(X_train, X_test)

    model = st.selectbox(
        "Choose Classification Model for training: ", ("", *_CLASSIFIERS)
    )
    if not model:
        return
    build, file_name, matrix_title = _CLASSIFIERS[model]
    clf = build()
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    _report_classification(y_test, pred)
    # plot_confusion_matrix takes (y_true, y_pred, title).
    plot_confusion_matrix(y_test, pred, matrix_title)
    _offer_model_download(clf, file_name)
def load_csv(file):
    """Parse ``file`` (a path or file-like object) as CSV into a DataFrame."""
    return pd.read_csv(file)
def data_overview(data):
    """Write the row count to the page and return the column-count line.

    The row count is emitted directly with ``st.write``; the column count is
    returned as text for the caller to display.
    """
    n_rows, n_cols = data.shape
    st.write("Number of Rows: {}".format(n_rows))
    return "Number of Columns: {}".format(n_cols)
def missing_data(data):
    """Render an HTML table of the columns that contain missing values.

    For each such column the table lists the number of missing entries and
    that number as a percentage of all rows (two decimals, "%" suffix).

    Returns:
        Whatever ``st.markdown`` returns for the rendered table.
    """
    na_counts = data.isna().sum()
    na_counts = na_counts[na_counts > 0]
    na_share = ((na_counts / data.shape[0]) * 100).round(2).astype(str) + "%"
    table = pd.DataFrame(
        {"Missing Values": na_counts, "Percentage": na_share}
    )
    table_html = table.to_html(classes="table table-striped", justify="center")
    return st.markdown(table_html, unsafe_allow_html=True)
def display_data_info(data):
    """Render an HTML table with each column's dtype and unique-value count.

    Returns:
        Whatever ``st.markdown`` returns for the rendered table.
    """
    dtype_table = pd.DataFrame(data.dtypes, columns=["Data Type"]).reset_index()
    dtype_table.columns = ["Column", "Data Type"]

    unique_table = pd.DataFrame(
        data.nunique(), columns=["Unique Counts"]
    ).reset_index()
    unique_table.columns = ["Column", "Unique Counts"]

    # Join the two per-column summaries on the column name.
    summary = pd.merge(dtype_table, unique_table, on="Column")
    summary_html = summary.to_html(
        classes="table table-striped", justify="center"
    )
    return st.markdown(summary_html, unsafe_allow_html=True)
def value_counts(data):
    """Show the value counts of a column picked from a select box.

    Nothing is shown while the blank first entry is selected.
    """
    chosen = st.selectbox("Select a Column", ["", *data.columns])
    if not chosen:
        return
    st.write(data[chosen].value_counts())
def duplicate(data):
    """Report duplicate rows and drop them from ``data`` in place.

    Returns:
        An empty string when duplicates were found (the notice is written to
        the page directly), otherwise a message saying there are none.
    """
    dup_mask = data.duplicated()
    if not dup_mask.any():
        return "There are no duplicate rows in the DataFrame."

    st.write(
        f"There is/are {dup_mask.sum()} duplicate rows in the DataFrame. Duplicated values will be dropped."
    )
    # Mutates the caller's DataFrame; the first occurrence of each row stays.
    data.drop_duplicates(keep="first", inplace=True)
    return ""
def countplot(data, col):
    """Draw a horizontal count plot of column ``col`` on the page.

    Args:
        data: DataFrame holding the column.
        col: Name of the column whose values are counted.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        sns.countplot(
            y=data[col],
            palette=palette[1:],
            edgecolor="#1c1c1c",
            linewidth=2,
            ax=ax,
        )
        ax.set_title(f"Countplot of {col} Column")
        st.pyplot(fig)
    finally:
        # pyplot keeps every figure registered until it is closed; without
        # this each rerun of the script would leave another figure behind.
        plt.close(fig)
def piechart(data, col):
    """Draw a pie chart of the value shares of column ``col`` on the page.

    Args:
        data: DataFrame holding the column.
        col: Name of the column whose value counts form the wedges.
    """
    # Named ``counts`` so the module-level ``value_counts`` helper is not
    # shadowed inside this function.
    counts = data[col].value_counts()
    fig, ax = plt.subplots(figsize=(8, 6))
    try:
        ax.pie(
            counts,
            labels=counts.index,
            autopct="%1.1f%%",
            colors=palette,
            shadow=False,
            wedgeprops=dict(edgecolor="#1c1c1c"),
        )
        ax.set_title(f"Pie Chart of {col} Column")
        st.pyplot(fig)
    finally:
        # pyplot keeps every figure registered until it is closed; without
        # this each rerun of the script would leave another figure behind.
        plt.close(fig)
def histogram(data, col):
    """Draw a histogram with a KDE overlay for column ``col`` on the page.

    Args:
        data: DataFrame holding the column.
        col: Name of the column to plot.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        sns.histplot(
            data[col],
            kde=True,
            color=palette[4],
            fill=True,
            edgecolor="#1c1c1c",
            linewidth=2,
            ax=ax,
        )
        ax.set_title(f"Histogram of {col} Column")
        st.pyplot(fig)
    finally:
        # pyplot keeps every figure registered until it is closed; without
        # this each rerun of the script would leave another figure behind.
        plt.close(fig)
def violinplot(data, col):
    """Draw a violin plot of column ``col`` on the page.

    Args:
        data: DataFrame holding the column.
        col: Name of the column to plot.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        sns.violinplot(data[col], color=palette[8], ax=ax)
        ax.set_title(f"Violin Plot of {col} Column")
        st.pyplot(fig)
    finally:
        # pyplot keeps every figure registered until it is closed; without
        # this each rerun of the script would leave another figure behind.
        plt.close(fig)
def scatterplot(data, col):
    """Draw a scatter plot of the single column ``col`` on the page.

    Args:
        data: DataFrame holding the column.
        col: Name of the column to plot.
    """
    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        sns.scatterplot(data[col], color=palette[3], ax=ax)
        ax.set_title(f"Scatter Plot of {col} Column")
        st.pyplot(fig)
    finally:
        # pyplot keeps every figure registered until it is closed; without
        # this each rerun of the script would leave another figure behind.
        plt.close(fig)
def biscatterplot(data, cols):
    """Draw a scatter plot of ``cols[0]`` (x) against ``cols[1]`` (y).

    Args:
        data: DataFrame holding both columns.
        cols: Sequence of column names; the first two are used.

    Any error raised while plotting is shown on the page as text instead of
    being propagated.
    """
    if len(cols) < 2:
        # Ask for the missing selection instead of surfacing an IndexError.
        st.info("Select two columns to draw the chart.")
        return

    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        sns.scatterplot(
            data=data,
            x=cols[0],
            y=cols[1],
            palette=palette[1:],
            edgecolor="#1c1c1c",
            linewidth=2,
            ax=ax,
        )
        ax.set_title(f"Scatter Plot of {cols[0]} and {cols[1]} Columns")
        st.pyplot(fig)
    except Exception as e:
        st.write(str(e))
    finally:
        # Close the figure even when plotting failed so figures do not
        # accumulate across reruns.
        plt.close(fig)
def bibarplot(data, cols):
    """Draw a bar plot of ``cols[1]`` (y) grouped by ``cols[0]`` (x).

    Args:
        data: DataFrame holding both columns.
        cols: Sequence of column names; the first two are used.

    Any error raised while plotting is shown on the page as text instead of
    being propagated.
    """
    if len(cols) < 2:
        # Ask for the missing selection instead of surfacing an IndexError.
        st.info("Select two columns to draw the chart.")
        return

    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        sns.barplot(
            data=data,
            x=cols[0],
            y=cols[1],
            palette=palette[1:],
            edgecolor="#1c1c1c",
            linewidth=2,
            ax=ax,
        )
        ax.set_title(f"Bar Plot of {cols[0]} and {cols[1]} Columns")
        st.pyplot(fig)
    except Exception as e:
        st.write(str(e))
    finally:
        # Close the figure even when plotting failed so figures do not
        # accumulate across reruns.
        plt.close(fig)
def biboxplot(data, cols):
    """Draw a box plot of ``cols[1]`` (y) grouped by ``cols[0]`` (x).

    Args:
        data: DataFrame holding both columns.
        cols: Sequence of column names; the first two are used.

    Any error raised while plotting is shown on the page as text instead of
    being propagated.
    """
    if len(cols) < 2:
        # Ask for the missing selection instead of surfacing an IndexError.
        st.info("Select two columns to draw the chart.")
        return

    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        sns.boxplot(
            data=data,
            x=cols[0],
            y=cols[1],
            palette=palette[1:],
            linewidth=2,
            ax=ax,
        )
        ax.set_title(f"Box Plot of {cols[0]} and {cols[1]} Columns")
        st.pyplot(fig)
    except Exception as e:
        st.write(str(e))
    finally:
        # Close the figure even when plotting failed so figures do not
        # accumulate across reruns.
        plt.close(fig)
def paretoplot(data, categorical_col):
    """Draw a Pareto chart (frequency bars plus cumulative-percentage line).

    Args:
        data: DataFrame holding the column.
        categorical_col: Name of the column whose categories are counted.

    Any error raised while building the chart is shown on the page as text,
    matching the other two-axis plot helpers, instead of being swallowed.
    """
    fig = None
    try:
        # value_counts() already returns the categories in descending
        # frequency, so the cumulative share is computed in plotting order
        # and no re-sort (which could reshuffle tied categories) is needed.
        counts = data[categorical_col].value_counts()
        pareto_df = pd.DataFrame(
            {
                "Categories": counts.index,
                "Frequency": counts.values,
                "Cumulative Percentage": (counts / counts.sum()).cumsum().values
                * 100,
            }
        )

        fig, ax1 = plt.subplots(figsize=(10, 8))
        ax1.bar(
            pareto_df["Categories"],
            pareto_df["Frequency"],
            color=palette[1:],
            edgecolor="#1c1c1c",
            linewidth=2,
        )
        # Second y-axis, formatted as percentages, for the cumulative line.
        ax2 = ax1.twinx()
        ax2.yaxis.set_major_formatter(PercentFormatter())
        ax2.plot(
            pareto_df["Categories"],
            pareto_df["Cumulative Percentage"],
            color=palette[3],
            marker="D",
            ms=10,
        )
        ax1.set_xlabel(categorical_col)
        ax1.set_ylabel("Frequency", color=palette[0])
        ax2.set_ylabel("Cumulative Percentage", color=palette[3])
        st.pyplot(fig)
    except Exception as e:
        st.write(str(e))
    finally:
        # Close the figure (if one was created) so figures do not accumulate
        # across reruns.
        if fig is not None:
            plt.close(fig)
def plot_confusion_matrix(y_true, y_pred, title):
    """Draw the confusion matrix of ``y_pred`` against ``y_true`` as a heatmap.

    Args:
        y_true: Ground-truth labels (rows of the matrix, "True Label" axis).
        y_pred: Predicted labels (columns, "Predicted Label" axis).
        title: Title shown above the heatmap.
    """
    cm = confusion_matrix(y_true, y_pred)
    fig, ax = plt.subplots(figsize=(6, 4))
    try:
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False, ax=ax)
        ax.set_xlabel("Predicted Label")
        ax.set_ylabel("True Label")
        ax.set_title(title)
        st.pyplot(fig)
    finally:
        # pyplot keeps every figure registered until it is closed; without
        # this each rerun of the script would leave another figure behind.
        plt.close(fig)
# Script entry point: build the page only when this file is executed directly.
if __name__ == "__main__":
    main()