import streamlit as st
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
import altair as alt

# 🎓 App Title
# 📚 About the App: the introductory text shown directly under the title.
ABOUT_TEXT = """
    ## About This App
    This application predicts whether a student will pass or fail based on their exam scores and demographic data.
    ### Features:
    - **Dataset Overview**: View the number of students categorized by performance.
    - **Model Evaluation**: Check the model's accuracy on the test set.
    - **Student Performance Prediction**: Enter student details and get a prediction.
    
    The app uses **Streamlit** for the UI and **XGBoostClassifier** for predictions.
    """

# Title first, then the description, so the page reads top-down.
st.title("📊 Student Performance Prediction App")
st.write(ABOUT_TEXT)

# 📌 Load and preprocess data
def load_data():
    """Read ``exams.csv`` and turn it into a model-ready table.

    Steps, in order:
      1. Label each row ``Passed`` = 1 when the mean of the three exam
         scores is at least 50, else 0.
      2. Drop the helper average plus the ``lunch``, ``race/ethnicity`` and
         ``gender`` columns.
      3. One-hot encode parental education and test-preparation course.
      4. Standardize the three score columns.

    Returns:
        A ``(frame, scaler, score_columns)`` tuple: the processed DataFrame,
        the fitted ``StandardScaler``, and the list of scaled column names.

    NOTE(review): the scaler is fitted on every row, i.e. before any
    train/test split happens downstream.
    """
    score_columns = ["math score", "reading score", "writing score"]
    dropped_columns = ["Average Score", "lunch", "race/ethnicity", "gender"]
    categorical_columns = ["parental level of education", "test preparation course"]

    frame = pd.read_csv("exams.csv")

    # Target: pass when the average of the three scores reaches 50.
    frame["Average Score"] = frame[score_columns].mean(axis=1)
    frame["Passed"] = frame["Average Score"].ge(50).astype(int)

    # Remove the helper column and the attributes the model does not use.
    frame = frame.drop(columns=dropped_columns)

    # One indicator column per category value.
    frame = pd.get_dummies(frame, columns=categorical_columns)

    # Zero-mean / unit-variance scores; the fitted scaler is returned so the
    # same transformation can be applied to user input later.
    scaler = StandardScaler()
    frame[score_columns] = scaler.fit_transform(frame[score_columns])

    return frame, scaler, score_columns

# Train the model
def train_model(df, scaler=None):
    """Fit an XGBoost classifier that predicts the ``Passed`` column.

    The data is split 80/20 (``random_state=42``) and the model is fitted on
    the training part only.

    Args:
        df: Processed DataFrame containing the feature columns and a
            ``Passed`` target column.
        scaler: Optional fitted scaler to hand back to the caller. The
            original implementation returned a module-level ``scaler``
            without receiving it as an argument; passing it explicitly
            removes that hidden dependency. When omitted, the module-level
            ``scaler`` (if any) is still returned so existing
            ``train_model(df)`` calls keep working.

    Returns:
        ``(model, X_test, y_test, feature_names, X_train, scaler)`` where
        ``feature_names`` is the column index of the training features.
    """
    features = df.drop(columns=["Passed"])
    target = df["Passed"]

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    model = XGBClassifier(
        n_estimators=1000,
        learning_rate=0.03,
        max_depth=10,
        colsample_bytree=0.9,
        subsample=0.9,
        random_state=42,
    )
    model.fit(X_train, y_train)

    if scaler is None:
        # Backward compatibility: fall back to the module-level scaler that
        # the script defines before calling this function.
        scaler = globals().get("scaler")

    return model, X_test, y_test, X_train.columns, X_train, scaler

# Evaluate the model
def evaluate_model(model, X_test, y_test):
    """Return whatever ``model.score`` reports for the held-out data.

    Args:
        model: Fitted estimator exposing ``score(X, y)``.
        X_test: Held-out feature rows.
        y_test: Labels matching ``X_test``.

    Returns:
        The value returned by ``model.score(X_test, y_test)``.
    """
    return model.score(X_test, y_test)

# Streamlit re-executes this script from top to bottom on every widget
# interaction. Without caching, each click would re-read the CSV and refit the
# 1000-tree model. st.cache_resource keeps the results in memory so the work
# is done once per server process.
# NOTE: a modified exams.csv is only picked up after the cache is cleared or
# the app is restarted. Requires a Streamlit version that provides
# st.cache_resource.
cached_load_data = st.cache_resource(load_data)
cached_train_model = st.cache_resource(train_model)

df, scaler, numerical_features = cached_load_data()
# train_model also returns a scaler; rebinding it here keeps the original
# unpacking (and therefore every name used further down) unchanged.
model, X_test, y_test, feature_names, X_train, scaler = cached_train_model(df)
accuracy = evaluate_model(model, X_test, y_test)

# 🏡 Streamlit Tabs
tab1, tab2, tab3 = st.tabs(["💁 Dataset Overview", "📊 Model Performance", "🎓 Predict Performance"])

# 📁 Tab 1: Dataset Overview
with tab1:
    # Summary statistics of the processed table.
    st.write("### Dataset Summary")
    summary = df.describe()
    st.write(summary)

    # Two-column table (label, number of rows) feeding the bar chart.
    st.write("### Distribution of Passed Students")
    pass_counts = (
        df["Passed"]
        .value_counts()
        .reset_index()
        .set_axis(["Passed", "Count"], axis=1)
    )

    # One bar per label, coloured by label.
    x_axis = alt.X("Passed:N", title="Passed (0 = No, 1 = Yes)")
    chart = (
        alt.Chart(pass_counts)
        .mark_bar()
        .encode(x=x_axis, y="Count", color="Passed:N")
    )
    st.altair_chart(chart, use_container_width=True)

# 📊 Tab 2: Model Performance
with tab2:
    st.write("### Model Evaluation")
    # Test-set score expressed as a percentage with two decimals.
    accuracy_pct = accuracy * 100
    st.write(f"✅ **Model Accuracy:** {accuracy_pct:.2f}%")

# 🎓 Tab 3: Predict Performance
with tab3:
    st.write("### Enter Student Details")

    math_score = st.number_input("Math Score", min_value=0, max_value=100, value=70)
    reading_score = st.number_input("Reading Score", min_value=0, max_value=100, value=70)
    writing_score = st.number_input("Writing Score", min_value=0, max_value=100, value=70)
    parent_education = st.selectbox("Parental Level of Education", ["Some high school", "High school", "Some college", "Associate's degree", "Bachelor's degree", "Master's degree"])
    test_prep = st.selectbox("Test Preparation Course", ["None", "Completed"])

    # Single-row frame holding the raw scores.
    input_data = pd.DataFrame({
        "math score": [math_score],
        "reading score": [reading_score],
        "writing score": [writing_score]
    })

    # Apply the same scaling that was fitted on the training data.
    input_data[numerical_features] = scaler.transform(input_data[numerical_features])

    # Start every one-hot column the model was trained on at 0.
    encoded_prefixes = ("parental level of education_", "test preparation course_")
    for col in feature_names:
        if col.startswith(encoded_prefixes):
            input_data[col] = 0

    # The selectbox labels are capitalised for display, while the one-hot
    # column names are derived from the raw CSV values, whose casing may
    # differ. Assigning to an exact-name column that the model does not know
    # would be silently discarded by the reindex() below, so the matching
    # trained column is looked up case-insensitively instead.
    columns_by_folded_name = {str(col).casefold(): col for col in feature_names}
    selections = {
        "parental level of education": parent_education,
        "test preparation course": test_prep,
    }
    unmatched_choices = []
    for prefix, choice in selections.items():
        trained_column = columns_by_folded_name.get(f"{prefix}_{choice}".casefold())
        if trained_column is None:
            unmatched_choices.append(choice)
        else:
            input_data[trained_column] = 1

    # Align column order with training; anything still missing becomes 0.
    input_data = input_data.reindex(columns=feature_names, fill_value=0)

    if unmatched_choices:
        # Surface the mismatch instead of predicting on all-zero indicators
        # without telling the user.
        st.warning(
            "No trained feature matches: "
            + ", ".join(unmatched_choices)
            + ". The prediction will ignore this input."
        )

    if st.button("Predict"):
        prediction = model.predict(input_data)[0]
        result = "Pass" if prediction == 1 else "Fail"
        st.subheader(f"Prediction: {result}")