import streamlit as st
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder


# LICENSE.streamlit.Apachev2 	- 	Copyright (c) Streamlit Inc. (2018-2022) Snowflake Inc. (2022-2024) (https://github.com/streamlit/streamlit/blob/develop/LICENSE)

# LICENSE.pandas.BSD-3	 		- 	Copyright (c) 2008-2011, AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team (https://github.com/pandas-dev/pandas/blob/main/LICENSE)

# LICENSE.sklearn.BSD-3 		- 	Copyright (c) 2007-2024 The scikit-learn developers (https://github.com/scikit-learn/scikit-learn/blob/main/COPYING)


# Title of the app
st.title("Scoring Engine")

# File upload section: the widget returns None until the user supplies a file.
uploaded_file = st.file_uploader("Upload your dataset (CSV format)", type="csv")

if uploaded_file is not None:
    # Load the dataset; the first CSV column becomes the row index.
    try:
        df = pd.read_csv(uploaded_file, index_col=0)
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
        # Report unreadable uploads in the page instead of a raw traceback,
        # then halt this script run.
        st.error(f"Could not read the uploaded CSV file: {exc}")
        st.stop()

    # Columns holding the individual rater scores that feed the average.
    rater_columns = ['Boss_score', 'Colleague_score', 'Colleague_other_score',
                     'Report_score', 'Customer_score']

    # Fail early, with a readable message, when a column used later in the
    # script is absent; otherwise the user only sees a KeyError traceback.
    required_columns = rater_columns + ['Title', 'Code', 'Dimensions', 'Benchmark_score']
    missing_columns = [name for name in required_columns if name not in df.columns]
    if missing_columns:
        st.error(f"The uploaded dataset is missing required columns: {', '.join(missing_columns)}")
        st.stop()

    # Row-wise mean of the rater scores, ignoring NaN values, rounded to
    # 1 decimal place. A row whose rater scores are all NaN yields NaN.
    df['Average_score'] = df[rater_columns].mean(axis=1, skipna=True).round(1)

    # Function to calculate self-score
    def self_score(average, benchmark):
        """Classify an average score relative to its benchmark.

        Args:
            average: Average rater score for one row.
            benchmark: Benchmark score the average is compared against.

        Returns:
            "High" when the average is above the benchmark, "Low" when it
            is below, "Equal" when the two match, and "Unknown" when either
            value is missing (None or NaN).
        """
        # NaN compares False under both > and <, so without this guard a
        # missing value would fall through and be reported as "Equal".
        if pd.isna(average) or pd.isna(benchmark):
            return "Unknown"
        if average > benchmark:
            return "High"
        if average < benchmark:
            return "Low"
        return "Equal"

    # Apply the function to calculate 'Self_score'
    df['Self_score'] = df.apply(lambda row: self_score(row['Average_score'], row['Benchmark_score']), axis=1)

    # Encode object-type columns
    encoded_df = df.copy()
    le = LabelEncoder()
    for column in encoded_df.select_dtypes(include=['object']).columns:
        encoded_df[column] = le.fit_transform(encoded_df[column].astype(str))

    # Fill missing values with 0
    encoded_df = encoded_df.fillna(0)

    # Prepare features (X) and labels (y)
    X = encoded_df.drop(columns=['Self_score'])
    y = encoded_df['Self_score']

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

    # Train the Gaussian Naive Bayes model
    gnb = GaussianNB()
    gnb.fit(X_train, y_train)

    # Make predictions and calculate confidence scores
    y_pred = gnb.predict(X_test)
    confidence_scores = gnb.predict_proba(X_test).max(axis=1)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    

    # Predict confidence scores for the entire dataset
    y_prob = gnb.predict_proba(X)
    confidence_scores = y_prob.max(axis=1)
    df['Confidence_score (%)'] = confidence_scores
    df['Confidence_score (%)']  = df['Confidence_score (%)'] * 100
    df =df.drop('All_raters_Score', axis = 1)
    df = df[[ 'Title', 'Code', 'Dimensions', 'Boss_score',
             'Colleague_score', 'Colleague_other_score', 'Report_score',
             'Customer_score', 'Benchmark_score','Average_score',
             'Self_score', 'Confidence_score (%)']]
    st.write("### Processed Dataset")
    st.write(df)
    st.write(f"### Model Accuracy: {accuracy:.2f}")
    # Download button for the processed dataset
    csv = df.to_csv(index=False).encode('utf-8')
    st.download_button(
        label="Download Processed Dataset",
        data=csv,
        file_name="processed_dataset.csv",
        mime="text/csv"
    )
else:
    st.write("Please upload a dataset to begin.")