# Spaces: Sleeping  (hosting-page banner captured with the source; not part of the program)
from __future__ import annotations

import pandas as pd
import streamlit as st
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
# LICENSE.streamlit.Apachev2 - Copyright (c) Streamlit Inc. (2018-2022) Snowflake Inc. (2022-2024) (https://github.com/streamlit/streamlit/blob/develop/LICENSE)
# LICENSE.pandas.BSD-3 - Copyright (c) 2008-2011, AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team (https://github.com/pandas-dev/pandas/blob/main/LICENSE)
# LICENSE.sklearn.BSD-3 - Copyright (c) 2007-2024 The scikit-learn developers (https://github.com/scikit-learn/scikit-learn/blob/main/COPYING)
# Rater columns that are averaged into 'Average_score'.
RATER_COLUMNS = [
    'Boss_score', 'Colleague_score', 'Colleague_other_score',
    'Report_score', 'Customer_score',
]

# Columns the uploaded CSV itself must provide; the remaining output columns
# ('Average_score', 'Self_score', 'Confidence_score (%)') are derived here.
INPUT_COLUMNS = ['Title', 'Code', 'Dimensions', *RATER_COLUMNS, 'Benchmark_score']

# Columns shown in the app and written to the downloadable CSV, in order.
OUTPUT_COLUMNS = [
    'Title', 'Code', 'Dimensions', 'Boss_score',
    'Colleague_score', 'Colleague_other_score', 'Report_score',
    'Customer_score', 'Benchmark_score', 'Average_score',
    'Self_score', 'Confidence_score (%)',
]


def self_score(average, benchmark) -> str:
    """Classify *average* relative to *benchmark*.

    Returns "High" when the average is above the benchmark, "Low" when it is
    below, and "Equal" otherwise.

    NOTE(review): if either value is NaN both comparisons are false, so the
    result is "Equal" -- confirm that is the intended label for missing data.
    """
    if average > benchmark:
        return "High"
    if average < benchmark:
        return "Low"
    return "Equal"


def missing_columns(df: pd.DataFrame) -> list[str]:
    """Return the names in INPUT_COLUMNS that *df* does not contain."""
    return [name for name in INPUT_COLUMNS if name not in df.columns]


def add_scores(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with 'Average_score' and 'Self_score' added.

    'Average_score' is the row-wise mean of RATER_COLUMNS, ignoring NaN
    values, rounded to one decimal place.  'Self_score' compares that
    average with the row's 'Benchmark_score' via ``self_score``.
    """
    scored = df.copy()
    scored['Average_score'] = (
        scored[RATER_COLUMNS].mean(axis=1, skipna=True).round(1)
    )
    # A plain zip avoids building a full row Series per record and also
    # behaves sensibly on an empty frame.
    scored['Self_score'] = [
        self_score(average, benchmark)
        for average, benchmark in zip(
            scored['Average_score'], scored['Benchmark_score']
        )
    ]
    return scored


def encode_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a numeric copy of *df* suitable for model fitting.

    Every object-dtype column is label-encoded (after casting to ``str``)
    and any remaining missing values are replaced with 0.
    """
    encoded = df.copy()
    for column in encoded.select_dtypes(include=['object']).columns:
        # A fresh encoder per column: each column has its own vocabulary.
        encoded[column] = LabelEncoder().fit_transform(
            encoded[column].astype(str)
        )
    return encoded.fillna(0)


def train_and_score(encoded_df: pd.DataFrame):
    """Fit a Gaussian Naive Bayes model that predicts 'Self_score'.

    Returns ``(accuracy, confidence)``: the accuracy on a held-out 15% test
    split, and for every row of *encoded_df* the highest predicted class
    probability expressed as a percentage.

    NOTE(review): the features include 'Average_score' and
    'Benchmark_score', the two columns the label was derived from --
    confirm this is intended.
    """
    features = encoded_df.drop(columns=['Self_score'])
    labels = encoded_df['Self_score']

    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.15, random_state=42
    )

    model = GaussianNB()
    model.fit(X_train, y_train)

    accuracy = accuracy_score(y_test, model.predict(X_test))
    confidence = model.predict_proba(features).max(axis=1) * 100
    return accuracy, confidence


def main() -> None:
    """Render the Scoring Engine app: upload a CSV, score it, offer a download."""
    # Title of the app
    st.title("Scoring Engine")

    # File upload section
    uploaded_file = st.file_uploader("Upload your dataset (CSV format)", type="csv")
    if uploaded_file is None:
        st.write("Please upload a dataset to begin.")
        return

    # Load the dataset; the first CSV column is used as the index.
    df = pd.read_csv(uploaded_file, index_col=0)

    # Report a bad upload clearly instead of failing with a KeyError traceback.
    missing = missing_columns(df)
    if missing:
        st.error(
            "The uploaded dataset is missing required column(s): "
            + ", ".join(missing)
        )
        return
    if len(df) < 2:
        # train_test_split cannot produce a non-empty train and test set.
        st.error("The uploaded dataset needs at least two rows to train the model.")
        return

    df = add_scores(df)
    accuracy, confidence = train_and_score(encode_features(df))
    df['Confidence_score (%)'] = confidence

    # Selecting OUTPUT_COLUMNS also discards any extra input columns
    # (such as 'All_raters_Score'), so no separate drop is needed.
    df = df[OUTPUT_COLUMNS]

    st.write("### Processed Dataset")
    st.write(df)
    st.write(f"### Model Accuracy: {accuracy:.2f}")

    # Download button for the processed dataset
    csv_bytes = df.to_csv(index=False).encode('utf-8')
    st.download_button(
        label="Download Processed Dataset",
        data=csv_bytes,
        file_name="processed_dataset.csv",
        mime="text/csv"
    )


# `streamlit run` executes the script as the "__main__" module, so the app
# still renders, while a plain import no longer triggers UI calls.
if __name__ == "__main__":
    main()