"""Streamlit "Scoring Engine" app.

Reads an uploaded CSV of rater scores, derives an average score and a
High/Low/Equal label against the benchmark, trains a Gaussian Naive Bayes
classifier on the label-encoded table, and shows the annotated table with a
download button.
"""

import streamlit as st
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# LICENSE.streamlit.Apachev2 - Copyright (c) Streamlit Inc. (2018-2022) Snowflake Inc. (2022-2024) (https://github.com/streamlit/streamlit/blob/develop/LICENSE)
# LICENSE.pandas.BSD-3 - Copyright (c) 2008-2011, AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team (https://github.com/pandas-dev/pandas/blob/main/LICENSE)
# LICENSE.sklearn.BSD-3 - Copyright (c) 2007-2024 The scikit-learn developers (https://github.com/scikit-learn/scikit-learn/blob/main/COPYING)

# Rater columns that are averaged into 'Average_score'.
RATER_COLUMNS = [
    'Boss_score',
    'Colleague_score',
    'Colleague_other_score',
    'Report_score',
    'Customer_score',
]

# Columns the uploaded CSV must provide for the pipeline to run.
REQUIRED_COLUMNS = ['Title', 'Code', 'Dimensions', *RATER_COLUMNS, 'Benchmark_score']

# Columns (in display order) of the table that is shown and downloaded.
OUTPUT_COLUMNS = [
    'Title',
    'Code',
    'Dimensions',
    'Boss_score',
    'Colleague_score',
    'Colleague_other_score',
    'Report_score',
    'Customer_score',
    'Benchmark_score',
    'Average_score',
    'Self_score',
    'Confidence_score (%)',
]

# train_test_split with test_size=0.15 needs one row for each side of the split.
MIN_ROWS = 2


def self_score(average, benchmark):
    """Label *average* relative to *benchmark*.

    Returns "High" when ``average > benchmark``, "Low" when
    ``average < benchmark`` and "Equal" otherwise.
    """
    if average > benchmark:
        return "High"
    if average < benchmark:
        return "Low"
    # NOTE(review): NaN compares False in both tests above, so a row whose
    # average or benchmark is missing is also labelled "Equal". Kept as-is to
    # preserve the existing output; confirm whether that is intended.
    return "Equal"


def _add_scores(df):
    """Add the 'Average_score' and 'Self_score' columns to *df* in place."""
    # Row-wise mean of the rater columns, ignoring NaN, rounded to 1 decimal place.
    df['Average_score'] = df[RATER_COLUMNS].mean(axis=1, skipna=True).round(1)
    df['Self_score'] = [
        self_score(average, benchmark)
        for average, benchmark in zip(df['Average_score'], df['Benchmark_score'])
    ]
    return df


def _encode(df):
    """Return a copy of *df* with object columns label-encoded and NaN set to 0."""
    encoded = df.copy()
    for column in encoded.select_dtypes(include=['object']).columns:
        # A fresh encoder per column; values are stringified first so that
        # missing entries become an ordinary category.
        encoded[column] = LabelEncoder().fit_transform(encoded[column].astype(str))
    return encoded.fillna(0)


def _train_and_score(encoded):
    """Fit GaussianNB on *encoded* and return ``(accuracy, confidence_percent)``.

    ``accuracy`` is measured on the held-out 15% split; ``confidence_percent``
    is the highest predicted class probability for every row of *encoded*,
    scaled to 0-100.
    """
    features = encoded.drop(columns=['Self_score'])
    labels = encoded['Self_score']

    x_train, x_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.15, random_state=42
    )

    model = GaussianNB()
    model.fit(x_train, y_train)

    accuracy = accuracy_score(y_test, model.predict(x_test))
    confidence_percent = model.predict_proba(features).max(axis=1) * 100
    return accuracy, confidence_percent


def main():
    """Render the app: upload, score, train, display and offer the download."""
    st.title("Scoring Engine")

    uploaded_file = st.file_uploader("Upload your dataset (CSV format)", type="csv")
    if uploaded_file is None:
        st.write("Please upload a dataset to begin.")
        return

    # The first CSV column is used as the row index, not as a data column.
    df = pd.read_csv(uploaded_file, index_col=0)

    missing = [column for column in REQUIRED_COLUMNS if column not in df.columns]
    if missing:
        st.error(
            "The uploaded file is missing required column(s): "
            f"{', '.join(missing)}. "
            "Note that the first CSV column is read as the row index."
        )
        return

    if len(df) < MIN_ROWS:
        st.error(
            f"The dataset needs at least {MIN_ROWS} rows to train and evaluate the model."
        )
        return

    df = _add_scores(df)
    accuracy, confidence_percent = _train_and_score(_encode(df))
    df['Confidence_score (%)'] = confidence_percent

    # Selecting the output columns also discards any extra input columns
    # (e.g. 'All_raters_Score'), so no separate drop is needed.
    df = df[OUTPUT_COLUMNS]

    st.write("### Processed Dataset")
    st.write(df)
    st.write(f"### Model Accuracy: {accuracy:.2f}")

    # Download button for the processed dataset
    csv = df.to_csv(index=False).encode('utf-8')
    st.download_button(
        label="Download Processed Dataset",
        data=csv,
        file_name="processed_dataset.csv",
        mime="text/csv",
    )


# Streamlit executes the script top to bottom on every rerun, so the entry
# point is called unconditionally, as the original flat script did.
main()