# Streamlit app: trains a k-nearest-neighbours classifier on 'penguins_lter.csv'
# to predict the 'Species' column, then shows the classification report for a
# held-out test split together with the first rows of the (imputed) data.

import pandas as pd
import streamlit as st
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


def _fill_missing_values(frame: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *frame* with missing values imputed column by column.

    Numeric columns are filled with the column mean; every other column is
    filled with its most frequent value. A non-numeric column that has no
    observed values at all is left unchanged because it has no mode.
    """
    filled = frame.copy()

    # Assign the result back instead of calling ``fillna(inplace=True)`` on
    # ``frame[col]``: that chained form operates on a temporary object and does
    # not update the frame when pandas Copy-on-Write is active.
    numeric_cols = filled.select_dtypes(include=['number']).columns
    for col in numeric_cols:
        filled[col] = filled[col].fillna(filled[col].mean())

    categorical_cols = filled.select_dtypes(exclude=['number']).columns
    for col in categorical_cols:
        modes = filled[col].mode()
        if modes.empty:
            # Entirely missing column: nothing to impute with.
            continue
        filled[col] = filled[col].fillna(modes.iloc[0])

    return filled


# Load the data; the CSV is expected in the current working directory.
try:
    df = pd.read_csv('penguins_lter.csv')
except FileNotFoundError:
    st.error("Error: 'penguins_lter.csv' not found. Please upload the file or adjust the path.")
    st.stop()

# Data preprocessing (handle missing values).
# NOTE(review): imputation runs on the full frame before the train/test split,
# and it also fills the 'Species' target itself — confirm this is intended.
df = _fill_missing_values(df)

# 'Species' is the target variable; every other column is used as a feature.
X = df.drop('Species', axis=1)
y = df['Species']

# Convert categorical features to numerical using one-hot encoding.
# NOTE(review): this encodes *all* non-numeric columns; if the CSV contains
# identifier or free-text columns they will become features — verify.
X = pd.get_dummies(X, drop_first=True)

# Hold out 20% of the rows for evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scale the features, then classify with 5-nearest-neighbours.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier(n_neighbors=5)),
])

# Train the pipeline and predict on the held-out split.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Streamlit app
st.title("Penguin Species Classification")
st.write("This app predicts the species of a penguin based on its physical characteristics.")

# Display classification report
st.subheader("Classification Report")
st.text(classification_report(y_test, y_pred))
st.dataframe(df.head())