# scoring-engine / app.py
# Vineedhar's picture
# Update app.py
# 96e20c2 verified
import streamlit as st
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
# LICENSE.streamlit.Apachev2 - Copyright (c) Streamlit Inc. (2018-2022) Snowflake Inc. (2022-2024) (https://github.com/streamlit/streamlit/blob/develop/LICENSE)
# LICENSE.pandas.BSD-3 - Copyright (c) 2008-2011, AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team (https://github.com/pandas-dev/pandas/blob/main/LICENSE)
# LICENSE.sklearn.BSD-3 - Copyright (c) 2007-2024 The scikit-learn developers (https://github.com/scikit-learn/scikit-learn/blob/main/COPYING)
# Scoring Engine: a Streamlit app that takes an uploaded CSV of rater scores,
# derives an average score and a High/Low/Equal label against a benchmark,
# trains a Gaussian Naive Bayes classifier on the result, and shows/downloads
# the processed table together with the model's hold-out accuracy.

# Rater columns that are averaged into 'Average_score'.
RATER_SCORE_COLUMNS = [
    'Boss_score', 'Colleague_score', 'Colleague_other_score',
    'Report_score', 'Customer_score',
]

# Columns (and their order) shown to the user and written to the download.
OUTPUT_COLUMNS = [
    'Title', 'Code', 'Dimensions', 'Boss_score',
    'Colleague_score', 'Colleague_other_score', 'Report_score',
    'Customer_score', 'Benchmark_score', 'Average_score',
    'Self_score', 'Confidence_score (%)',
]

# Columns the uploaded CSV must provide; the remaining output columns
# ('Average_score', 'Self_score', 'Confidence_score (%)') are computed here.
REQUIRED_COLUMNS = (
    ['Title', 'Code', 'Dimensions'] + RATER_SCORE_COLUMNS + ['Benchmark_score']
)


def self_score(average, benchmark):
    """Label an average score relative to its benchmark.

    Returns "High" when ``average > benchmark``, "Low" when
    ``average < benchmark`` and "Equal" otherwise.

    NOTE: if either value is NaN both comparisons are False, so the result
    is "Equal" as well.
    """
    if average > benchmark:
        return "High"
    if average < benchmark:
        return "Low"
    return "Equal"


def _score_dataset(df):
    """Return ``(processed_df, accuracy)`` for an uploaded dataset.

    ``df`` must contain every column in ``REQUIRED_COLUMNS``. The input frame
    is not modified. ``processed_df`` holds exactly ``OUTPUT_COLUMNS``;
    ``accuracy`` is the classifier's accuracy on the 15% hold-out split.
    """
    df = df.copy()

    # Mean of the rater scores, ignoring NaN values, rounded to 1 decimal place.
    df['Average_score'] = (
        df[RATER_SCORE_COLUMNS].mean(axis=1, skipna=True).round(1)
    )

    # Only these two columns are needed for the label, so iterate over them
    # directly instead of building a full row object per record.
    df['Self_score'] = [
        self_score(average, benchmark)
        for average, benchmark in zip(df['Average_score'], df['Benchmark_score'])
    ]

    # Encode object-type columns (including the 'Self_score' label) as integers.
    encoded_df = df.copy()
    for column in encoded_df.select_dtypes(include=['object']).columns:
        encoded_df[column] = LabelEncoder().fit_transform(
            encoded_df[column].astype(str)
        )

    # Fill missing values with 0
    encoded_df = encoded_df.fillna(0)

    # Every column except the label is used as a feature. This includes any
    # extra columns present in the upload (e.g. 'All_raters_Score') as well as
    # 'Average_score' and 'Benchmark_score'.
    # NOTE(review): the label is derived from two of these features, so the
    # reported accuracy measures how well the model recovers that rule.
    X = encoded_df.drop(columns=['Self_score'])
    y = encoded_df['Self_score']

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.15, random_state=42
    )

    # Train the Gaussian Naive Bayes model and evaluate it on the hold-out set.
    gnb = GaussianNB()
    gnb.fit(X_train, y_train)
    accuracy = accuracy_score(y_test, gnb.predict(X_test))

    # Confidence is the highest class probability for each row of the entire
    # dataset (training rows included), expressed as a percentage.
    df['Confidence_score (%)'] = gnb.predict_proba(X).max(axis=1) * 100

    # Selecting the output columns also discards any extra input columns,
    # so no separate drop of 'All_raters_Score' is needed (an explicit drop
    # would raise KeyError when that column is absent).
    return df[OUTPUT_COLUMNS], accuracy


# Title of the app
st.title("Scoring Engine")

# File upload section
uploaded_file = st.file_uploader("Upload your dataset (CSV format)", type="csv")

if uploaded_file is not None:
    # Load the dataset. The first CSV column becomes the row index, so it is
    # not available as a regular column and is not written to the download.
    df = pd.read_csv(uploaded_file, index_col=0)

    # Report schema problems to the user instead of failing with a KeyError.
    missing_columns = [
        column for column in REQUIRED_COLUMNS if column not in df.columns
    ]
    if missing_columns:
        st.error(
            "The uploaded dataset is missing required column(s): "
            + ", ".join(missing_columns)
        )
        st.stop()

    # train_test_split cannot produce a non-empty train and test set from
    # fewer than two rows.
    if len(df) < 2:
        st.error("The uploaded dataset needs at least two rows to train and evaluate the model.")
        st.stop()

    df, accuracy = _score_dataset(df)

    st.write("### Processed Dataset")
    st.write(df)
    st.write(f"### Model Accuracy: {accuracy:.2f}")

    # Download button for the processed dataset
    csv = df.to_csv(index=False).encode('utf-8')
    st.download_button(
        label="Download Processed Dataset",
        data=csv,
        file_name="processed_dataset.csv",
        mime="text/csv"
    )
else:
    st.write("Please upload a dataset to begin.")