# Page-header residue from the hosting site, kept as comments so the file parses:
# narinsak's picture / Update app.py / commit fbb40ed (verified)
import streamlit as st
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
# Read the penguin dataset from a CSV expected in the working directory.
# If the file is missing, report it in the app and halt this script run so
# nothing further down executes without `df`.
csv_path = 'penguins_lter.csv'
try:
    df = pd.read_csv(csv_path)
except FileNotFoundError:
    st.error("Error: 'penguins_lter.csv' not found. Please upload the file or adjust the path.")
    st.stop()
# Data preprocessing: impute missing values in place on `df`.
#
# Each filled column is assigned back to `df` rather than calling
# `df[col].fillna(..., inplace=True)`. That form mutates a temporary column
# selection: pandas 2.x flags it as chained assignment and, with
# Copy-on-Write enabled, `df` is never updated, so the NaNs would survive.

# Numeric columns: replace NaN with the column mean.
numeric_cols = df.select_dtypes(include=['number']).columns
for col in numeric_cols:
    df[col] = df[col].fillna(df[col].mean())

# Non-numeric columns: replace NaN with the most frequent value.
categorical_cols = df.select_dtypes(exclude=['number']).columns
for col in categorical_cols:
    modes = df[col].mode()
    # mode() ignores NaN, so it is empty for an all-NaN column; skip such a
    # column instead of failing on the first-element lookup.
    if not modes.empty:
        df[col] = df[col].fillna(modes.iloc[0])
# Model training and evaluation data.
# 'Species' is the label; every other column of `df` is used as a feature.
target_col = 'Species'
y = df[target_col]

# One-hot encode the non-numeric feature columns (first level of each dropped).
# NOTE(review): this encodes *every* non-numeric column, including any
# ID/date/free-text columns the CSV may contain -- confirm that is intended.
X = pd.get_dummies(df.drop(columns=[target_col]), drop_first=True)

# Hold out 20% of the rows for evaluation; fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardise features, then classify with 5-nearest-neighbours.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier(n_neighbors=5)),
])

# Fit on the training split and predict labels for the held-out split.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)
# Streamlit page: heading and description first, then the evaluation report
# for the held-out split, then a preview of the data.
st.title("Penguin Species Classification")
st.write("This app predicts the species of a penguin based on its physical characteristics.")

# classification_report returns a plain-text table; st.text renders it as
# fixed-width text.
st.subheader("Classification Report")
report_text = classification_report(y_test, y_pred)
st.text(report_text)

# First rows of `df` as it stands at this point in the script.
st.dataframe(df.head())