Michael Rey
added all files
74c7b8b
raw
history blame contribute delete
5.81 kB
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import DBSCAN
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import seaborn as sns
def load_data():
    """Read the animal dataset from the CSV file next to the app.

    Returns:
        The ``pandas.DataFrame`` that ``pd.read_csv`` produces for
        "Animal Dataset.csv" (path is relative to the working directory).
    """
    dataset_path = "Animal Dataset.csv"
    animals = pd.read_csv(dataset_path)
    return animals
def convert_range_to_float(val):
    """Convert a raw cell value to a float, averaging "low-high" ranges.

    Args:
        val: A number, or a string holding either a single number
            (e.g. "12", "-3.5") or a dash-separated range (e.g. "10-20").

    Returns:
        The numeric value, the midpoint of the range, or ``np.nan`` when
        the value cannot be interpreted as either.
    """
    # Plain numbers parse directly. Trying this first keeps negative values
    # such as "-5" from being mistaken for a range just because they
    # contain a dash.
    try:
        return float(val)
    except (TypeError, ValueError):
        pass

    if not isinstance(val, str):
        return np.nan

    # Search for the range separator only after the first character so a
    # leading minus sign stays attached to the lower bound.
    text = val.strip()
    low_tail, dash, high = text[1:].partition('-')
    if not dash:
        return np.nan
    try:
        return (float(text[:1] + low_tail) + float(high)) / 2
    except ValueError:
        # Malformed bounds (e.g. "10-20-30", "5-") are treated as missing.
        return np.nan
# Page title and short description
st.title("🐾 Animal Diet Prediction and Lifespan Clustering")
st.markdown("#### Check animal diet and group them according to lifespan using Random Forest Classifier and DBSCAN")

# Tab labels use real Unicode emoji so they render correctly in the browser.
tabs = st.tabs(["📊 Supervised Learning", "🔍 Unsupervised Learning"])

data = load_data()

# Numeric columns used by the app. 'Lifespan (years)' is kept last because
# the tabs take every column except the last one as model features.
numeric_features = [
    'Height (cm)',
    'Weight (kg)',
    'Average Speed (km/h)',
    'Top Speed (km/h)',
    'Gestation Period (days)',
    'Offspring per Birth',
    'Lifespan (years)',
]

# Normalise every numeric column to floats (ranges become their midpoint),
# then drop rows where any of those columns could not be parsed.
for col in numeric_features:
    data[col] = data[col].apply(convert_range_to_float)
data = data.dropna(subset=numeric_features)

# Encode the target for supervised learning; skipped when the dataset has
# no 'Diet' column, in which case 'Diet_encoded' is not created.
if 'Diet' in data.columns:
    label_encoder = LabelEncoder()
    data['Diet_encoded'] = label_encoder.fit_transform(data['Diet'])
# Tab 1: Supervised Learning
with tabs[0]:
    st.header("Supervised: Predict Animal Diet Type")

    st.subheader("Adjust the animal's characteristics to predict its diet")

    # Predictor column -> slider help text. The key order defines the column
    # order used for both training and prediction, so the two always match.
    slider_help = {
        'Height (cm)': "How tall is the animal?",
        'Weight (kg)': "How much does the animal weigh?",
        'Average Speed (km/h)': "How fast can the animal move?",
        'Top Speed (km/h)': "What is the maximum speed the animal can reach?",
        'Gestation Period (days)': "How long is the animal's pregnancy?",
        'Offspring per Birth': "How many offspring does the animal give birth to at once?",
    }
    feature_columns = list(slider_help)

    # One slider per predictor, bounded by the range observed in the dataset.
    slider_values = {}
    for feature, help_text in slider_help.items():
        slider_values[feature] = [
            st.slider(
                feature,
                min_value=int(data[feature].min()),
                max_value=int(data[feature].max()),
                help=help_text,
            )
        ]

    # Single-row frame holding the user's choices, in training column order.
    input_data = pd.DataFrame(slider_values)

    if 'Diet_encoded' not in data.columns:
        # The encoded target only exists when the dataset has a 'Diet'
        # column; without it there is nothing to train on.
        st.warning("The dataset has no 'Diet' column, so diet prediction is unavailable.")
    else:
        X = data[feature_columns]
        y = data['Diet_encoded']

        # A fixed seed keeps the prediction stable across Streamlit reruns,
        # since the model is refit every time a slider moves.
        clf = RandomForestClassifier(random_state=42)
        clf.fit(X, y)
        diet_pred = clf.predict(input_data)

        # Map the encoded class back to its original diet label.
        predicted_diet = label_encoder.inverse_transform(diet_pred)
        st.subheader(f"Predicted Diet: {predicted_diet[0]}")
# Tab 2: Unsupervised Learning
with tabs[1]:
    st.header("Unsupervised: Group Animals Based on Lifespan")
    st.write("In this part, we will group animals based on their lifespan. This helps to identify patterns in how different animals live and how long they survive.")

    st.subheader("Choose the lifespan range to group animals")

    # Slider bounds: round the upper bound up so an animal with a fractional
    # maximum lifespan can still be included in the selected range.
    lifespan_floor = int(data['Lifespan (years)'].min())
    lifespan_ceiling = int(np.ceil(data['Lifespan (years)'].max()))

    lifespan_min = st.slider(
        "Lifespan Min (years)",
        min_value=lifespan_floor,
        max_value=lifespan_ceiling,
        help="The minimum lifespan of the animals you are interested in.",
    )
    # Start the upper slider at the top of the range; otherwise both sliders
    # default to the minimum and the initial selection is a single value.
    lifespan_max = st.slider(
        "Lifespan Max (years)",
        min_value=lifespan_floor,
        max_value=lifespan_ceiling,
        value=lifespan_ceiling,
        help="The maximum lifespan of the animals you are interested in.",
    )

    # Keep only animals inside the chosen range. The explicit copy lets the
    # 'Cluster' column be added below without writing into a slice of `data`.
    in_range = (data['Lifespan (years)'] >= lifespan_min) & (data['Lifespan (years)'] <= lifespan_max)
    filtered_data = data[in_range].copy()

    if filtered_data.empty:
        st.warning("No animals found with the selected lifespan range. Please adjust the sliders.")
    else:
        # Standardise the features (every numeric column except lifespan)
        # so DBSCAN's distance threshold treats them on a common scale.
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(filtered_data[numeric_features[:-1]])

        # DBSCAN with explicitly chosen parameters; per the scikit-learn
        # documentation, points labelled -1 are noise rather than a cluster.
        db = DBSCAN(eps=1.5, min_samples=5)
        clusters = db.fit_predict(X_scaled)
        filtered_data['Cluster'] = clusters

        st.subheader("Clustered Animals by Lifespan")
        st.write("Animals are grouped based on their characteristics (except lifespan). The animals in the same cluster share similarities.")

        # Table of the animals matching the selected lifespan range
        st.subheader("Animals in the Selected Lifespan Range")
        st.dataframe(filtered_data[['Animal', 'Lifespan (years)', 'Weight (kg)', 'Cluster']])

        # Scatter plot of lifespan against weight, coloured by cluster label
        st.subheader("Cluster Plot (Lifespan vs Weight)")
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.scatterplot(
            data=filtered_data,
            x='Lifespan (years)',
            y='Weight (kg)',
            hue='Cluster',
            palette="tab10",
            ax=ax,
        )
        st.pyplot(fig)
        # Release the figure; the script reruns on every interaction and
        # pyplot would otherwise keep each figure alive.
        plt.close(fig)