import streamlit as st
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import DBSCAN
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
def load_data():
    """Read the animal dataset from ``Animal Dataset.csv`` into a DataFrame.

    The path is relative, so the file is resolved against the current
    working directory of the running process.

    Returns:
        The parsed CSV contents as a ``pandas.DataFrame``.
    """
    csv_path = "Animal Dataset.csv"
    animals = pd.read_csv(csv_path)
    return animals

# Convert ranges to float (if needed)
def convert_range_to_float(val):
    """Convert a raw cell value to a float, averaging ``"low-high"`` ranges.

    Args:
        val: A number, a numeric string (``"7"``, ``"-5"``, ``"1e-3"``), or a
            hyphen-separated range such as ``"10-20"``.

    Returns:
        The numeric value, the midpoint of the range, or ``np.nan`` when the
        value cannot be interpreted as either.
    """
    # Try a plain conversion first so a leading minus sign or an exponent
    # ("-5", "1e-3") is not mistaken for a range separator.
    try:
        return float(val)
    except (TypeError, ValueError, OverflowError):
        if not isinstance(val, str):
            return np.nan

    text = val.strip()
    # A separator hyphen can never be the first character (that would be a
    # sign). Try each later hyphen in turn and accept the first split whose
    # two sides both parse, which also handles ranges like "-10-20".
    for pos in range(1, len(text)):
        if text[pos] != '-':
            continue
        try:
            low = float(text[:pos])
            high = float(text[pos + 1:])
        except ValueError:
            continue
        return (low + high) / 2
    return np.nan

# Page header: title, short description, and the two top-level tabs.
st.title("🐾 Animal Diet Prediction and Lifespan Clustering")
st.markdown("#### Check animal diet and group them according to lifespan using Random Forest Classifier and DBSCAN")

tabs = st.tabs(["📊 Supervised Learning", "🔍 Unsupervised Learning"])

data = load_data()

# Numeric columns used by both tabs. 'Lifespan (years)' is kept last because
# the tabs drop it with numeric_features[:-1] when building model inputs.
numeric_features = [
    'Height (cm)',
    'Weight (kg)',
    'Average Speed (km/h)',
    'Top Speed (km/h)',
    'Gestation Period (days)',
    'Offspring per Birth',
    'Lifespan (years)',
]

# Normalise every numeric column: "a-b" ranges become their midpoint and
# unparseable cells become NaN.
for col in numeric_features:
    data[col] = data[col].apply(convert_range_to_float)

# Drop rows with any unusable numeric value. The explicit copy makes `data`
# an independent frame, so the column added below never writes into a slice.
data = data.dropna(subset=numeric_features).copy()

# The supervised tab needs the 'Diet' target. Stop with a readable message
# instead of failing later on an undefined encoder / missing encoded column.
if 'Diet' not in data.columns:
    st.error("The dataset has no 'Diet' column, so the diet prediction cannot be trained.")
    st.stop()

# Encode the target for supervised learning
label_encoder = LabelEncoder()
data['Diet_encoded'] = label_encoder.fit_transform(data['Diet'])

# Tab 1: Supervised Learning
def _feature_slider(column, help_text):
    """Render an integer slider spanning the observed range of ``column``."""
    return st.slider(
        column,
        min_value=int(data[column].min()),
        max_value=int(data[column].max()),
        help=help_text,
    )


with tabs[0]:
    st.header("Supervised: Predict Animal Diet Type")

    # Sliders: one per model feature, labelled with the dataset column name.
    st.subheader("Adjust the animal's characteristics to predict its diet")
    height = _feature_slider("Height (cm)", "How tall is the animal?")
    weight = _feature_slider("Weight (kg)", "How much does the animal weigh?")
    speed = _feature_slider("Average Speed (km/h)", "How fast can the animal move?")
    top_speed = _feature_slider("Top Speed (km/h)", "What is the maximum speed the animal can reach?")
    gestation = _feature_slider("Gestation Period (days)", "How long is the animal's pregnancy?")
    offspring = _feature_slider("Offspring per Birth", "How many offspring does the animal give birth to at once?")

    # Every numeric feature except the trailing 'Lifespan (years)'.
    feature_columns = numeric_features[:-1]

    # Prepare input data for prediction. Re-selecting by feature_columns keeps
    # the column order identical to the training matrix.
    input_data = pd.DataFrame({
        'Height (cm)': [height],
        'Weight (kg)': [weight],
        'Average Speed (km/h)': [speed],
        'Top Speed (km/h)': [top_speed],
        'Gestation Period (days)': [gestation],
        'Offspring per Birth': [offspring],
    })[feature_columns]

    # Train a RandomForest Classifier for supervised learning.
    # The script re-executes on every widget interaction, so the forest is
    # refit each time; a fixed seed keeps the prediction for a given set of
    # slider values stable across those reruns.
    X = data[feature_columns]
    y = data['Diet_encoded']
    clf = RandomForestClassifier(random_state=42)
    clf.fit(X, y)
    diet_pred = clf.predict(input_data)

    # Show prediction result (decoded back to the original diet label)
    predicted_diet = label_encoder.inverse_transform(diet_pred)
    st.subheader(f"Predicted Diet: {predicted_diet[0]}")

# Tab 2: Unsupervised Learning
with tabs[1]:
    st.header("Unsupervised: Group Animals Based on Lifespan")
    st.write("In this part, we will group animals based on their lifespan. This helps to identify patterns in how different animals live and how long they survive.")

    # Select minimum and maximum Lifespan
    st.subheader("Choose the lifespan range to group animals")
    # Round the bounds outward so animals with a fractional lifespan (e.g. a
    # range midpoint such as 12.5) remain inside the selectable range.
    lifespan_lower = int(np.floor(data['Lifespan (years)'].min()))
    lifespan_upper = int(np.ceil(data['Lifespan (years)'].max()))
    lifespan_min = st.slider(
        "Lifespan Min (years)",
        min_value=lifespan_lower,
        max_value=lifespan_upper,
        value=lifespan_lower,
        help="The minimum lifespan of the animals you are interested in.",
    )
    # Without an explicit value a slider starts at its minimum, which would
    # make the initial range collapse to a single year; start at the top.
    lifespan_max = st.slider(
        "Lifespan Max (years)",
        min_value=lifespan_lower,
        max_value=lifespan_upper,
        value=lifespan_upper,
        help="The maximum lifespan of the animals you are interested in.",
    )

    # Filter dataset by lifespan. Copy the selection so the 'Cluster' column
    # added below is written to an independent frame, not a slice of `data`.
    in_range = (data['Lifespan (years)'] >= lifespan_min) & (data['Lifespan (years)'] <= lifespan_max)
    filtered_data = data[in_range].copy()

    # Check if there is data available after filtering (also the case when
    # the chosen minimum exceeds the chosen maximum)
    if filtered_data.empty:
        st.warning("No animals found with the selected lifespan range. Please adjust the sliders.")
    else:
        # Standardise the features so no single unit dominates the distances
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(filtered_data[numeric_features[:-1]])  # Don't use 'Lifespan (years)' in unsupervised learning

        # Run DBSCAN with fixed, hand-picked parameters (not user-tunable).
        # Label -1 marks points DBSCAN treats as noise.
        db = DBSCAN(eps=1.5, min_samples=5)
        clusters = db.fit_predict(X_scaled)
        filtered_data['Cluster'] = clusters

        st.subheader("Clustered Animals by Lifespan")
        st.write("Animals are grouped based on their characteristics (except lifespan). The animals in the same cluster share similarities.")

        # Show the animals matching the selected lifespan range
        st.subheader("Animals in the Selected Lifespan Range")
        st.dataframe(filtered_data[['Animal', 'Lifespan (years)', 'Weight (kg)', 'Cluster']])

        # Scatter plot of the filtered animals, coloured by cluster label
        st.subheader("Cluster Plot (Lifespan vs Weight)")
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.scatterplot(x=filtered_data['Lifespan (years)'], y=filtered_data['Weight (kg)'], hue=clusters, palette="tab10", ax=ax)
        st.pyplot(fig)
        # pyplot keeps every figure it creates registered until closed; the
        # script reruns on each interaction, so release it explicitly.
        plt.close(fig)