# chiichann's picture
# Update app.py
# 55b17cf verified
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
import altair as alt
# 🎓 App title
# The leading emoji was stored as mis-decoded UTF-8 bytes; it is written here
# as the intended character (U+1F4CA, bar chart) so the page title renders
# correctly.
st.title("📊 Student Performance Prediction App")
# 📚 About the app
# Markdown shown directly under the title. Kept in a named constant so the
# rendering call stays a one-liner.
ABOUT_MARKDOWN = """
## About This App
This application predicts whether a student will pass or fail based on their exam scores and demographic data.
### Features:
- **Dataset Overview**: View the number of students categorized by performance.
- **Model Evaluation**: Check the model's accuracy on the test set.
- **Student Performance Prediction**: Enter student details and get a prediction.
The app uses **Streamlit** for the UI and **XGBoostClassifier** for predictions.
"""
st.write(ABOUT_MARKDOWN)
# 📌 Load and preprocess data
def load_data(file_path="exams.csv"):
    """Load the exam CSV and return a model-ready frame plus its scaler.

    Steps, in order: derive the binary ``Passed`` target (mean of the three
    score columns >= 50), drop the columns the model does not use, one-hot
    encode the two categorical columns, and standardize the score columns.

    Args:
        file_path: CSV file to read. Defaults to ``"exams.csv"``, the path
            that used to be hard-coded, so a bare ``load_data()`` call
            behaves exactly as before.

    Returns:
        A tuple ``(df, scaler, numerical_features)``: the processed frame
        (feature columns plus ``Passed``), the fitted ``StandardScaler``,
        and the list of column names that were standardized.
    """
    numerical_features = ["math score", "reading score", "writing score"]
    df = pd.read_csv(file_path)

    # Target: pass when the mean of the three scores is at least 50. The mean
    # is used directly, so no temporary "Average Score" column is needed.
    df["Passed"] = (df[numerical_features].mean(axis=1) >= 50).astype(int)

    # Columns that are not fed to the model.
    df = df.drop(columns=["lunch", "race/ethnicity", "gender"])

    # One-hot encode the categorical columns. The resulting column names are
    # "<column>_<raw value>", so they carry the CSV's own spelling and case.
    cat_cols = ["parental level of education", "test preparation course"]
    df = pd.get_dummies(df, columns=cat_cols)

    # Standardize the score columns.
    # NOTE(review): the scaler is fitted on every row, i.e. before any
    # train/test split happens downstream.
    scaler = StandardScaler()
    df[numerical_features] = scaler.fit_transform(df[numerical_features])
    return df, scaler, numerical_features
# Train the model
def train_model(df, scaler=None):
    """Fit an XGBoost classifier that predicts the ``Passed`` column.

    Args:
        df: Processed frame holding the feature columns plus ``Passed``.
        scaler: Fitted scaler that is handed back to the caller unchanged.
            Optional for backward compatibility: the original body returned
            the module-level ``scaler`` without receiving it, so when this
            argument is omitted that module-level name is still used.

    Returns:
        A tuple ``(model, X_test, y_test, feature_columns, X_train, scaler)``
        where ``feature_columns`` is ``X_train.columns``.

    Raises:
        NameError: If no scaler is passed and no module-level ``scaler``
            exists. This is checked before training, so the failure is not
            delayed until after the model has been fitted.
    """
    if scaler is None:
        # Legacy call style ``train_model(df)``: resolve the global explicitly
        # instead of relying on an undeclared free variable in the return.
        try:
            scaler = globals()["scaler"]
        except KeyError:
            raise NameError(
                "name 'scaler' is not defined; pass scaler=... to train_model()"
            ) from None

    X = df.drop(columns=["Passed"])
    y = df["Passed"]

    # 80/20 split with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    model = XGBClassifier(
        n_estimators=1000,
        learning_rate=0.03,
        max_depth=10,
        colsample_bytree=0.9,
        subsample=0.9,
        random_state=42,
    )
    model.fit(X_train, y_train)
    return model, X_test, y_test, X_train.columns, X_train, scaler
# Evaluate the model
def evaluate_model(model, X_test, y_test):
    """Return whatever ``model.score(X_test, y_test)`` reports.

    Args:
        model: Any object exposing ``score(X, y)``.
        X_test: Features passed through to ``score``.
        y_test: Labels passed through to ``score``.

    Returns:
        The value returned by ``model.score`` (the caller displays it as an
        accuracy percentage).
    """
    return model.score(X_test, y_test)
# Build everything the tabs below need: processed data, fitted scaler,
# trained model and its test-set score.
# NOTE(review): this runs at module top level, so it executes on every run of
# the script; Streamlit re-runs the script on widget interaction, which would
# repeat the training each time — consider Streamlit's caching decorators.
df, scaler, numerical_features = load_data()
training_outputs = train_model(df)
model, X_test, y_test, feature_names, X_train, scaler = training_outputs
accuracy = evaluate_model(model, X_test, y_test)
# Streamlit tabs
# The second and third labels carried mis-decoded emoji; they are restored
# here (U+1F4CA bar chart, U+1F393 graduation cap).
# NOTE(review): the first label's emoji bytes are incomplete in the source and
# cannot be decoded unambiguously, so that label is left untouched — replace
# its prefix with the intended emoji.
tab1, tab2, tab3 = st.tabs([
    "πŸ’ Dataset Overview",
    "📊 Model Performance",
    "🎓 Predict Performance",
])
# πŸ“ Tab 1: Dataset Overview
with tab1:
st.write("### Dataset Summary")
st.write(df.describe())
st.write("### Distribution of Passed Students")
pass_counts = df["Passed"].value_counts().reset_index()
pass_counts.columns = ["Passed", "Count"]
chart = alt.Chart(pass_counts).mark_bar().encode(
x=alt.X("Passed:N", title="Passed (0 = No, 1 = Yes)"),
y="Count",
color="Passed:N"
)
st.altair_chart(chart, use_container_width=True)
# 📊 Tab 2: Model Performance
with tab2:
    st.write("### Model Evaluation")
    # `accuracy` is a fraction; it is shown as a percentage with two decimals.
    # The leading check mark was stored as mis-decoded bytes and is restored
    # here as U+2705.
    st.write(f"✅ **Model Accuracy:** {accuracy*100:.2f}%")
# 🎓 Tab 3: Predict Performance
with tab3:
    st.write("### Enter Student Details")
    math_score = st.number_input("Math Score", min_value=0, max_value=100, value=70)
    reading_score = st.number_input("Reading Score", min_value=0, max_value=100, value=70)
    writing_score = st.number_input("Writing Score", min_value=0, max_value=100, value=70)
    parent_education = st.selectbox(
        "Parental Level of Education",
        ["Some high school", "High school", "Some college", "Associate's degree", "Bachelor's degree", "Master's degree"],
    )
    test_prep = st.selectbox("Test Preparation Course", ["None", "Completed"])

    # Numeric inputs, transformed with the same scaler used on the dataset.
    input_data = pd.DataFrame({
        "math score": [math_score],
        "reading score": [reading_score],
        "writing score": [writing_score],
    })
    input_data[numerical_features] = scaler.transform(input_data[numerical_features])

    # One-hot inputs. The trained column names are "<column>_<raw CSV value>"
    # (built by pd.get_dummies), so their letter case need not match the
    # display labels offered above. Previously a differently-cased selection
    # produced a brand-new column that reindex() then discarded, leaving every
    # categorical input at 0. Matching case-insensitively against the trained
    # columns sets the right one.
    dummy_prefixes = ("parental level of education_", "test preparation course_")
    wanted = {
        f"parental level of education_{parent_education}".casefold(): parent_education,
        f"test preparation course_{test_prep}".casefold(): test_prep,
    }
    matched = set()
    for col in feature_names:
        if col.startswith(dummy_prefixes):
            key = col.casefold()
            is_selected = key in wanted
            input_data[col] = int(is_selected)
            if is_selected:
                matched.add(key)

    # Make an unmatched selection visible instead of silently encoding it as
    # all zeros.
    for key, choice in wanted.items():
        if key not in matched:
            st.warning(
                f'"{choice}" does not match any category seen in training; '
                "it is encoded as all zeros."
            )

    # Put the columns in training order; anything still missing becomes 0.
    input_data = input_data.reindex(columns=feature_names, fill_value=0)

    if st.button("Predict"):
        prediction = model.predict(input_data)[0]
        result = "Pass" if prediction == 1 else "Fail"
        st.subheader(f"Prediction: {result}")