import streamlit as st import pandas as pd from sklearn.model_selection import train_test_split from sklearn.pipeline import Pipeline from sklearn.preprocessing import StandardScaler, OneHotEncoder from sklearn.compose import ColumnTransformer from sklearn.neighbors import KNeighborsClassifier from sklearn.metrics import accuracy_score # Load your data (replace with your actual data loading) penguins = pd.read_csv('penguins_lter.csv') # Data Cleaning (same as your existing code) penguins_cleaned = penguins.dropna() penguins_cleaned = penguins_cleaned.drop_duplicates() # Fill missing values (same as your existing code) numerical_cols = penguins.select_dtypes(include=['number']).columns penguins[numerical_cols] = penguins[numerical_cols].fillna(penguins[numerical_cols].mean()) categorical_cols = penguins.select_dtypes(include=['object']).columns penguins[categorical_cols] = penguins[categorical_cols].fillna(penguins[categorical_cols].mode().iloc[0]) # Feature Engineering and Model Training (same as your existing code) X = penguins.drop('Species', axis=1) y = penguins['Species'] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) numerical_features = ['Culmen Length (mm)', 'Culmen Depth (mm)', 'Flipper Length (mm)', 'Body Mass (g)'] categorical_features = ['Island', 'Sex'] numerical_transformer = Pipeline(steps=[('scaler', StandardScaler())]) categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]) preprocessor = ColumnTransformer( transformers=[ ('num', numerical_transformer, numerical_features), ('cat', categorical_transformer, categorical_features) ]) pipeline = Pipeline(steps=[ ('preprocessor', preprocessor), ('classifier', KNeighborsClassifier()) ]) pipeline.fit(X_train, y_train) y_pred = pipeline.predict(X_test) accuracy = accuracy_score(y_test, y_pred) # Streamlit App st.title("Penguin Species Classification") st.write("This app predicts the species of a penguin based on its features.") # Display the accuracy st.write(f"Model Accuracy: {accuracy}") # Input features for prediction culmen_length = st.number_input("Culmen Length (mm)", min_value=0.0) culmen_depth = st.number_input("Culmen Depth (mm)", min_value=0.0) flipper_length = st.number_input("Flipper Length (mm)", min_value=0.0) body_mass = st.number_input("Body Mass (g)", min_value=0.0) island = st.selectbox("Island", penguins['Island'].unique()) sex = st.selectbox("Sex", penguins['Sex'].unique()) # Create a DataFrame for prediction new_penguin = pd.DataFrame({ 'Culmen Length (mm)': [culmen_length], 'Culmen Depth (mm)': [culmen_depth], 'Flipper Length (mm)': [flipper_length], 'Body Mass (g)': [body_mass], 'Island': [island], 'Sex': [sex] }) # Make prediction if st.button("Predict Species"): prediction = pipeline.predict(new_penguin) st.write(f"Predicted Species: {prediction[0]}")