File size: 4,830 Bytes
55b17cf
3bbe6c1
 
 
55b17cf
 
3bbe6c1
 
55b17cf
 
3bbe6c1
55b17cf
 
 
 
 
 
 
 
 
 
 
 
 
3bbe6c1
55b17cf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3bbe6c1
55b17cf
 
 
 
 
3bbe6c1
55b17cf
 
 
 
 
3bbe6c1
55b17cf
3bbe6c1
55b17cf
 
 
 
3bbe6c1
55b17cf
 
 
3bbe6c1
55b17cf
 
3bbe6c1
55b17cf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3bbe6c1
55b17cf
 
 
 
3bbe6c1
55b17cf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
import altair as alt

# 🎓 Page title shown at the top of the app
st.title("πŸ“Š Student Performance Prediction App")

# 📚 About section. The Markdown is spelled out one literal per line so the
# exact whitespace of the text (including the indented blank line) is visible.
st.write(
    "\n"
    "    ## About This App\n"
    "    This application predicts whether a student will pass or fail based on their exam scores and demographic data.\n"
    "    ### Features:\n"
    "    - **Dataset Overview**: View the number of students categorized by performance.\n"
    "    - **Model Evaluation**: Check the model's accuracy on the test set.\n"
    "    - **Student Performance Prediction**: Enter student details and get a prediction.\n"
    "    \n"
    "    The app uses **Streamlit** for the UI and **XGBoostClassifier** for predictions.\n"
    "    "
)

# 📌 Load and preprocess data
def load_data(file_path="exams.csv"):
    """Load the exam dataset and turn it into a model-ready feature table.

    Steps, in order: read the CSV, derive the binary ``Passed`` target
    (1 when the mean of the three exam scores is >= 50, else 0), drop the
    columns the model does not use, one-hot encode the remaining categorical
    columns, and standardize the three score columns.

    Args:
        file_path: Path of the CSV file to read. Defaults to ``"exams.csv"``,
            the location that was previously hard-coded, so calls without
            arguments behave exactly as before.

    Returns:
        A ``(df, scaler, numerical_features)`` tuple: the processed DataFrame
        (still containing the ``Passed`` column), the fitted
        ``StandardScaler``, and the list of column names it was fitted on.
    """
    df = pd.read_csv(file_path)

    numerical_features = ["math score", "reading score", "writing score"]

    # Define target variable: Pass if average score >= 50
    df["Average Score"] = df[numerical_features].mean(axis=1)
    df["Passed"] = (df["Average Score"] >= 50).astype(int)

    # Drop the helper column and the demographic columns the model ignores
    df = df.drop(columns=["Average Score", "lunch", "race/ethnicity", "gender"])

    # Encode categorical variables (one indicator column per category value)
    cat_cols = ["parental level of education", "test preparation course"]
    df = pd.get_dummies(df, columns=cat_cols)

    # Standardize numerical features.
    # NOTE: the scaler is fitted on every row of the file, so any split into
    # train/test done by the caller afterwards shares these scaling statistics.
    scaler = StandardScaler()
    df[numerical_features] = scaler.fit_transform(df[numerical_features])

    return df, scaler, numerical_features

# Train the model
def train_model(df, scaler=None):
    """Split the data, fit an XGBoost classifier and return the artifacts.

    Args:
        df: Feature table containing a ``Passed`` column, which is used as
            the target; every other column is used as a feature.
        scaler: Optional scaler object to hand back as the last element of
            the result. The function used to read a module-level ``scaler``
            implicitly; when this argument is omitted that global is still
            used if it exists, otherwise ``None`` is returned in its place
            instead of raising ``NameError``.

    Returns:
        A tuple ``(model, X_test, y_test, feature_names, X_train, scaler)``
        where ``feature_names`` is the column index of ``X_train``.
    """
    if scaler is None:
        # Backward compatibility with callers that rely on the global scaler.
        scaler = globals().get("scaler")

    X = df.drop(columns=["Passed"])
    y = df["Passed"]

    # Fixed seed keeps the 80/20 split reproducible between runs.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    model = XGBClassifier(
        n_estimators=1000,
        learning_rate=0.03,
        max_depth=10,
        colsample_bytree=0.9,
        subsample=0.9,
        random_state=42,
    )
    model.fit(X_train, y_train)
    return model, X_test, y_test, X_train.columns, X_train, scaler

# Evaluate the model
def evaluate_model(model, X_test, y_test):
    """Return the score the fitted model reports on the held-out data.

    Delegates to ``model.score(X_test, y_test)`` and returns its result
    unchanged (for scikit-learn style classifiers this is the mean accuracy).
    """
    return model.score(X_test, y_test)

# Run the pipeline on every script execution: prepare data, fit, evaluate.
df, scaler, numerical_features = load_data()
(
    model,
    X_test,
    y_test,
    feature_names,
    X_train,
    scaler,  # rebinds the name assigned by load_data() just above
) = train_model(df)
accuracy = evaluate_model(model, X_test, y_test)

# Streamlit tabs, one per section of the app
tab1, tab2, tab3 = st.tabs(
    [
        "πŸ’ Dataset Overview",
        "πŸ“Š Model Performance",
        "πŸŽ“ Predict Performance",
    ]
)

# πŸ“ Tab 1: Dataset Overview
with tab1:
    st.write("### Dataset Summary")
    st.write(df.describe())
    
    st.write("### Distribution of Passed Students")
    pass_counts = df["Passed"].value_counts().reset_index()
    pass_counts.columns = ["Passed", "Count"]
    
    chart = alt.Chart(pass_counts).mark_bar().encode(
        x=alt.X("Passed:N", title="Passed (0 = No, 1 = Yes)"),
        y="Count",
        color="Passed:N"
    )
    st.altair_chart(chart, use_container_width=True)

# 📊 Tab 2: model performance
with tab2:
    st.write("### Model Evaluation")
    # Accuracy is a fraction; show it as a percentage with two decimals.
    st.write("βœ… **Model Accuracy:** {:.2f}%".format(accuracy * 100))

# 🎓 Tab 3: predict performance for a single, user-described student
with tab3:
    st.write("### Enter Student Details")

    math_score = st.number_input("Math Score", min_value=0, max_value=100, value=70)
    reading_score = st.number_input("Reading Score", min_value=0, max_value=100, value=70)
    writing_score = st.number_input("Writing Score", min_value=0, max_value=100, value=70)
    parent_education = st.selectbox("Parental Level of Education", ["Some high school", "High school", "Some college", "Associate's degree", "Bachelor's degree", "Master's degree"])
    test_prep = st.selectbox("Test Preparation Course", ["None", "Completed"])

    # Numeric inputs, scaled with the scaler that was fitted on the dataset
    input_data = pd.DataFrame({
        "math score": [math_score],
        "reading score": [reading_score],
        "writing score": [writing_score]
    })
    input_data[numerical_features] = scaler.transform(input_data[numerical_features])

    # One-hot columns. The widget labels above are capitalized, whereas the
    # indicator column names are built from the raw CSV values, whose casing
    # may differ. A column set under a name that is not in feature_names is
    # silently discarded by the reindex below, so the selection would never
    # reach the model. Prefer an exact match and fall back to a
    # case-insensitive one.
    known_columns = set(feature_names)
    columns_by_lower = {str(col).lower(): col for col in feature_names}
    selections = (
        ("parental level of education", parent_education),
        ("test preparation course", test_prep),
    )
    for prefix, choice in selections:
        wanted = f"{prefix}_{choice}"
        if wanted in known_columns:
            column = wanted
        else:
            column = columns_by_lower.get(wanted.lower(), wanted)
        input_data[column] = 1

    # Align with the training layout: same columns, same order; every
    # indicator column that was not selected is filled with 0.
    input_data = input_data.reindex(columns=feature_names, fill_value=0)

    if st.button("Predict"):
        prediction = model.predict(input_data)[0]
        result = "Pass" if prediction == 1 else "Fail"
        st.subheader(f"Prediction: {result}")