"""Streamlit app: predict an animal's diet and cluster animals by traits.

Tab 1 trains a RandomForestClassifier on the numeric traits of every animal
in "Animal Dataset.csv" and predicts the diet for the slider-selected values.
Tab 2 filters animals by a lifespan range and clusters the remaining animals
with DBSCAN on the standardized numeric traits (lifespan excluded).
"""
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import DBSCAN
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import seaborn as sns


def load_data():
    """Read the animal dataset from the CSV file in the working directory."""
    return pd.read_csv("Animal Dataset.csv")


def convert_range_to_float(val):
    """Convert a cell to a float, collapsing "low-high" ranges to their midpoint.

    Args:
        val: A number, a numeric string, or a string of the form "low-high".

    Returns:
        The parsed float, the midpoint of a two-part range, or ``np.nan``
        when the value cannot be interpreted as either.
    """
    # Try a plain number first so values such as "-5" or "1e-5" are not
    # mistaken for ranges just because they contain a dash.
    try:
        return float(val)
    except (TypeError, ValueError, OverflowError):
        pass

    if isinstance(val, str):
        parts = val.split('-')
        if len(parts) == 2:
            try:
                low, high = float(parts[0]), float(parts[1])
            except ValueError:
                return np.nan
            return (low + high) / 2
    return np.nan


def _slider_bounds(series):
    """Return integer (lower, upper) slider bounds that cover every value."""
    lower = int(np.floor(series.min()))
    # Round the maximum up: truncating it would make the largest rows
    # unreachable from the slider.
    upper = int(np.ceil(series.max()))
    if upper <= lower:
        # A single-valued column would otherwise produce a zero-width slider.
        upper = lower + 1
    return lower, upper


# Title
st.title("🐾 Animal Diet Prediction and Lifespan Clustering")
st.markdown("#### Check animal diet and group them according to lifespan using Random Forest Classifier and DBSCAN")

tabs = st.tabs(["📊 Supervised Learning", "🔍 Unsupervised Learning"])

data = load_data()

# Select features to use
numeric_features = [
    'Height (cm)',
    'Weight (kg)',
    'Average Speed (km/h)',
    'Top Speed (km/h)',
    'Gestation Period (days)',
    'Offspring per Birth',
    'Lifespan (years)',
]
# Lifespan is only used for filtering/plotting, never as a model input.
model_features = numeric_features[:-1]

# Clean the features
for col in numeric_features:
    data[col] = data[col].apply(convert_range_to_float)
# Copy so later column assignments operate on an independent frame.
data = data.dropna(subset=numeric_features).copy()

if data.empty:
    st.error("No usable rows remain in the dataset after cleaning the numeric columns.")
    st.stop()

# Encode the target for supervised learning
has_diet = 'Diet' in data.columns
if has_diet:
    label_encoder = LabelEncoder()
    data['Diet_encoded'] = label_encoder.fit_transform(data['Diet'])

# Tab 1: Supervised Learning
with tabs[0]:
    st.header("Supervised: Predict Animal Diet Type")

    # Sliders
    st.subheader("Adjust the animal's characteristics to predict its diet")
    feature_help = {
        'Height (cm)': "How tall is the animal?",
        'Weight (kg)': "How much does the animal weigh?",
        'Average Speed (km/h)': "How fast can the animal move?",
        'Top Speed (km/h)': "What is the maximum speed the animal can reach?",
        'Gestation Period (days)': "How long is the animal's pregnancy?",
        'Offspring per Birth': "How many offspring does the animal give birth to at once?",
    }

    # Prepare input data for prediction; columns follow the training order.
    selected_values = {}
    for col in model_features:
        lower, upper = _slider_bounds(data[col])
        selected_values[col] = [
            st.slider(col, min_value=lower, max_value=upper, help=feature_help[col])
        ]
    input_data = pd.DataFrame(selected_values)

    if has_diet:
        # Train a RandomForest Classifier for supervised learning.
        # A fixed seed keeps the prediction stable across Streamlit reruns.
        X = data[model_features]
        y = data['Diet_encoded']
        clf = RandomForestClassifier(random_state=42)
        clf.fit(X, y)
        diet_pred = clf.predict(input_data)

        # Show prediction result
        predicted_diet = label_encoder.inverse_transform(diet_pred)
        st.subheader(f"Predicted Diet: {predicted_diet[0]}")
    else:
        st.warning("The dataset has no 'Diet' column, so diet prediction is unavailable.")

# Tab 2: Unsupervised Learning
with tabs[1]:
    st.header("Unsupervised: Group Animals Based on Lifespan")
    st.write("In this part, we will group animals based on their lifespan. This helps to identify patterns in how different animals live and how long they survive.")

    # Select minimum and maximum Lifespan
    st.subheader("Choose the lifespan range to group animals")
    lifespan_lower, lifespan_upper = _slider_bounds(data['Lifespan (years)'])
    lifespan_min = st.slider(
        "Lifespan Min (years)",
        min_value=lifespan_lower,
        max_value=lifespan_upper,
        help="The minimum lifespan of the animals you are interested in.",
    )
    # Start the upper slider at the top of the range so the initial view
    # includes every animal instead of an almost-empty selection.
    lifespan_max = st.slider(
        "Lifespan Max (years)",
        min_value=lifespan_lower,
        max_value=lifespan_upper,
        value=lifespan_upper,
        help="The maximum lifespan of the animals you are interested in.",
    )

    # Filter dataset by lifespan; copy so adding 'Cluster' does not write
    # into a view of `data`.
    in_range = (data['Lifespan (years)'] >= lifespan_min) & (data['Lifespan (years)'] <= lifespan_max)
    filtered_data = data[in_range].copy()

    # Check if there is data available after filtering
    if filtered_data.empty:
        st.warning("No animals found with the selected lifespan range. Please adjust the sliders.")
    else:
        # Scale the features for DBSCAN; 'Lifespan (years)' is not a feature.
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(filtered_data[model_features])

        # Run DBSCAN with fixed, hand-picked parameters.
        db = DBSCAN(eps=1.5, min_samples=5)
        clusters = db.fit_predict(X_scaled)
        filtered_data['Cluster'] = clusters

        st.subheader("Clustered Animals by Lifespan")
        st.write("Animals are grouped based on their characteristics (except lifespan). The animals in the same cluster share similarities.")

        # Show the animals matching the selected lifespan range
        st.subheader("Animals in the Selected Lifespan Range")
        st.dataframe(filtered_data[['Animal', 'Lifespan (years)', 'Weight (kg)', 'Cluster']])

        # Plot lifespan against weight, coloured by cluster label.
        st.subheader("Cluster Plot (Lifespan vs Weight)")
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.scatterplot(
            data=filtered_data,
            x='Lifespan (years)',
            y='Weight (kg)',
            hue='Cluster',
            palette="tab10",
            ax=ax,
        )
        st.pyplot(fig)
        # Release the figure; otherwise one accumulates on every rerun.
        plt.close(fig)