Spaces:

chiichann
/

student_performance_prediction_app

Sleeping

App Files Files Community

chiichann committed on Mar 4

Commit

55b17cf

verified ·

1 Parent(s): 2a8426f

Update app.py

Browse files

Files changed (1) hide show

app.py +114 -64

app.py CHANGED Viewed

@@ -1,80 +1,130 @@
 import pandas as pd
 import numpy as np
-from sklearn.linear_model import LinearRegression
 from sklearn.model_selection import train_test_split
-from sklearn.metrics import mean_squared_error, r2_score
-import streamlit as st
 import altair as alt
-# Streamlit app title
-st.title("📊 Student Performance Predictor")
-# Load dataset
-try:
-    df = pd.read_csv("student_performance_data.csv")  # Ensure the file is in the same directory
-    st.write("### Preview of Dataset")
-    st.write(df.head())  # Show first few rows
-except FileNotFoundError:
-    st.error("File 'student_performance_data.csv' not found! Please upload the dataset.")
-# Ensure dataset contains the required columns
-required_columns = ['Study Hours', 'Attendance Rate', 'Assignment Grades', 'Final Exam Score']
-if not all(col in df.columns for col in required_columns):
-    st.error("Dataset must contain the following columns: " + ", ".join(required_columns))
-else:
-    # Prepare data for training
-    X = df[['Study Hours', 'Attendance Rate', 'Assignment Grades']]
-    y = df['Final Exam Score']
-    # Split data into training and testing sets
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
-    # Train a Linear Regression model
-    model = LinearRegression()
     model.fit(X_train, y_train)
-    # Make predictions
-    y_pred = model.predict(X_test)
-    # Evaluate the model
-    mse = mean_squared_error(y_test, y_pred)
-    r2 = r2_score(y_test, y_pred)
-    # Create tabs
-    tab1, tab2, tab3 = st.tabs(["📈 Data Visualization", "📊 Model Performance", "🎯 Prediction"])
-    # Tab 1: Data Visualization
-    with tab1:
-        st.write("### Data Visualization")
-        # Scatter plots
-        for col in ['Study Hours', 'Attendance Rate', 'Assignment Grades']:
-            st.write(f"**{col} vs Final Exam Score**")
-            chart = alt.Chart(df).mark_circle().encode(
-                x=col,
-                y='Final Exam Score',
-                tooltip=[col, 'Final Exam Score']
-            ).interactive()
-            st.altair_chart(chart, use_container_width=True)
-    # Tab 2: Model Performance
-    with tab2:
-        st.write("### Model Performance")
-        st.write(f"✅ Mean Squared Error (MSE): {mse:.2f}")
-        st.write(f"✅ R-squared Score: {r2:.2f}")
-    # Tab 3: Prediction
-    with tab3:
-        st.write("### Predict Final Exam Score")
-        study_hours = st.number_input("📚 Study Hours", min_value=0, value=10, step=1)
-        attendance_rate = st.slider("🎟️ Attendance Rate", min_value=0.0, max_value=1.0, step=0.01, value=0.85)
-        assignment_grades = st.number_input("📝 Average Assignment Grade", min_value=0, max_value=100, value=80, step=1)
-        if st.button("🔮 Predict"):
-            # Prepare input for prediction
-            input_data = np.array([[study_hours, attendance_rate, assignment_grades]])
-            # Make prediction
-            predicted_score = model.predict(input_data)[0]
-            st.success(f"🎯 Predicted Final Exam Score: {predicted_score:.2f}")

+import streamlit as st
 import pandas as pd
 import numpy as np
 from sklearn.model_selection import train_test_split
+from xgboost import XGBClassifier
+from sklearn.preprocessing import StandardScaler
 import altair as alt
+# 🎓 Page heading
+st.title("📊 Student Performance Prediction App")
+# 📚 Introductory text rendered below the heading (kept in a variable so the
+# call site stays short)
+about_text = """
+    ## About This App
+    This application predicts whether a student will pass or fail based on their exam scores and demographic data.
+    ### Features:
+    - **Dataset Overview**: View the number of students categorized by performance.
+    - **Model Evaluation**: Check the model's accuracy on the test set.
+    - **Student Performance Prediction**: Enter student details and get a prediction.
+    The app uses **Streamlit** for the UI and **XGBoostClassifier** for predictions.
+    """
+st.write(about_text)
+# 📌 Load and preprocess data
+def load_data():
+    """Read ``exams.csv`` and turn it into a model-ready frame.
+
+    A binary ``Passed`` label is derived from the three score columns
+    (1 when their row-wise mean is at least 50). The ``lunch``,
+    ``race/ethnicity`` and ``gender`` columns are discarded, the two remaining
+    categorical columns are one-hot encoded, and the score columns are
+    standardized with a scaler fitted on the whole frame.
+
+    Returns:
+        A ``(df, scaler, numerical_features)`` tuple: the processed frame, the
+        fitted ``StandardScaler``, and the list of score column names it was
+        fitted on.
+    """
+    score_cols = ["math score", "reading score", "writing score"]
+    frame = pd.read_csv("exams.csv")
+    # Label: pass when the mean of the three scores reaches 50
+    frame["Average Score"] = frame[score_cols].mean(axis=1)
+    frame["Passed"] = (frame["Average Score"] >= 50).astype(int)
+    # The helper average and the unused attributes are not model inputs
+    frame = frame.drop(columns=["Average Score", "lunch", "race/ethnicity", "gender"])
+    # One-hot encode what is left of the categorical data
+    frame = pd.get_dummies(
+        frame,
+        columns=["parental level of education", "test preparation course"],
+    )
+    # Scale the scores; the fitted scaler is returned so new inputs can be
+    # transformed the same way
+    fitted_scaler = StandardScaler()
+    frame[score_cols] = fitted_scaler.fit_transform(frame[score_cols])
+    return frame, fitted_scaler, score_cols
+# Train the model
+def train_model(df):
+    """Fit an XGBoost classifier that predicts the ``Passed`` column of *df*.
+
+    Every column other than ``Passed`` is used as a feature. 20% of the rows
+    are held out as a test set (``random_state=42`` keeps the split
+    reproducible).
+
+    Returns:
+        ``(model, X_test, y_test, feature_names, X_train, scaler)`` where
+        ``feature_names`` is ``X_train.columns``.
+    """
+    X = df.drop(columns=["Passed"])
+    y = df["Passed"]
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+    model = XGBClassifier(
+        n_estimators=1000, learning_rate=0.03, max_depth=10,
+        colsample_bytree=0.9, subsample=0.9, random_state=42
+    )
     model.fit(X_train, y_train)
+    # NOTE(review): `scaler` is neither a parameter nor a local here; it is
+    # looked up in module scope when this line runs, so a module-level
+    # `scaler` must already exist before train_model() is called.
+    return model, X_test, y_test, X_train.columns, X_train, scaler
+# Evaluate the model
+def evaluate_model(model, X_test, y_test):
+    """Return ``model.score(X_test, y_test)`` for the held-out data.
+
+    The caller reports this value as the model's accuracy.
+    """
+    return model.score(X_test, y_test)
+# Build the dataset, fit the classifier and score it.
+# Order matters: load_data() must bind the module-level `scaler` first,
+# because train_model() returns that global and it is re-bound here.
+# NOTE(review): these calls sit at module top level, so loading and training
+# happen every time the script is executed; consider caching if that is slow.
+df, scaler, numerical_features = load_data()
+model, X_test, y_test, feature_names, X_train, scaler = train_model(df)
+accuracy = evaluate_model(model, X_test, y_test)
+# 🏡 Lay the app out as three tabs
+tab_labels = ["💁 Dataset Overview", "📊 Model Performance", "🎓 Predict Performance"]
+tab1, tab2, tab3 = st.tabs(tab_labels)
+# 📁 Tab 1: describe() output for the processed frame plus the class balance
+with tab1:
+    st.write("### Dataset Summary")
+    st.write(df.describe())
+    st.write("### Distribution of Passed Students")
+    # One row per outcome value with the number of students in it; columns
+    # are renamed positionally
+    outcome_counts = df["Passed"].value_counts().reset_index()
+    outcome_counts.columns = ["Passed", "Count"]
+    bars = alt.Chart(outcome_counts).mark_bar()
+    passed_axis = alt.X("Passed:N", title="Passed (0 = No, 1 = Yes)")
+    st.altair_chart(
+        bars.encode(x=passed_axis, y="Count", color="Passed:N"),
+        use_container_width=True,
+    )
+# 📊 Tab 2: the score computed on the held-out split, shown as a percentage
+with tab2:
+    st.write("### Model Evaluation")
+    accuracy_pct = accuracy * 100
+    st.write(f"✅ **Model Accuracy:** {accuracy_pct:.2f}%")
+# 🎓 Tab 3: Predict Performance
+# Collects one student's details, encodes them exactly like the training
+# frame (scaled scores + one-hot categorical columns, in `feature_names`
+# order) and shows the model's pass/fail prediction on demand.
+with tab3:
+    st.write("### Enter Student Details")
+    math_score = st.number_input("Math Score", min_value=0, max_value=100, value=70)
+    reading_score = st.number_input("Reading Score", min_value=0, max_value=100, value=70)
+    writing_score = st.number_input("Writing Score", min_value=0, max_value=100, value=70)
+    parent_education = st.selectbox(
+        "Parental Level of Education",
+        [
+            "Some high school",
+            "High school",
+            "Some college",
+            "Associate's degree",
+            "Bachelor's degree",
+            "Master's degree",
+        ],
+    )
+    test_prep = st.selectbox("Test Preparation Course", ["None", "Completed"])
+    # Start from the raw scores, then apply the scaler fitted on the training data
+    input_data = pd.DataFrame({
+        "math score": [math_score],
+        "reading score": [reading_score],
+        "writing score": [writing_score]
+    })
+    input_data[numerical_features] = scaler.transform(input_data[numerical_features])
+    # One-hot encode the two selections. The widget labels are capitalized for
+    # display, so they are matched against the trained column names
+    # case-insensitively; using the label verbatim would create a column that
+    # reindex() below silently discards whenever the casing differs, leaving
+    # every categorical feature at 0.
+    trained_columns = {str(col).casefold(): col for col in feature_names}
+    selections = (
+        ("parental level of education", parent_education),
+        ("test preparation course", test_prep),
+    )
+    for prefix, choice in selections:
+        trained_col = trained_columns.get(f"{prefix}_{choice}".casefold())
+        # A choice with no matching trained column is left encoded as all zeros
+        if trained_col is not None:
+            input_data[trained_col] = 1
+    # Align with the training layout; encoded columns not set above become 0
+    input_data = input_data.reindex(columns=feature_names, fill_value=0)
+    if st.button("Predict"):
+        prediction = model.predict(input_data)[0]
+        result = "Pass" if prediction == 1 else "Fail"
+        st.subheader(f"Prediction: {result}")