# Spaces: Sleeping
import gradio as gr | |
import pandas as pd | |
import numpy as np | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
from sklearn.ensemble import RandomForestClassifier | |
from sklearn.impute import SimpleImputer | |
from sklearn.model_selection import train_test_split | |
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score | |
import joblib | |
from datetime import datetime | |
# Load and prepare data
df = pd.read_csv("water_potability (1).csv")

# The label column is about to pass through the imputer together with the
# features. A missing label would be replaced by the column mean (a
# fractional, meaningless class), so refuse such data up front.
if df['Potability'].isna().any():
    raise ValueError("'Potability' contains missing labels; cannot train on them.")

# Replace every missing value with its column mean.
# NOTE(review): the imputer is fit on the full dataset before the train/test
# split below, so test rows influence the imputed values. Consider fitting it
# on the training portion only.
imputer = SimpleImputer(strategy='mean')
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

X = df_imputed.drop('Potability', axis=1)
# The imputer hands back floating-point values for every column, label
# included. Cast the label back to int so the model's classes are 0/1 rather
# than 0.0/1.0 (a float label cannot be used as an array index downstream).
y = df_imputed['Potability'].astype(int)

# Hold out 20% of the rows for evaluation; fixed seeds keep runs reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Per-feature importances from the fitted forest, most important first.
feature_importance = pd.Series(model.feature_importances_, index=X.columns).sort_values(ascending=False)
def predict(ph, hardness, solids, chloramines, sulfate,
            conductivity, organic_carbon, trihalomethanes, turbidity):
    """Classify one water sample and assemble the text and plots for the UI.

    The nine arguments are placed, in the order given, into a single-row
    frame labelled with ``X.columns``; they must therefore follow the
    training feature order.

    Returns:
        A 4-tuple ``(text, fig1, fig2, fig3)``: the verdict followed by the
        held-out test metrics and the confidence of this prediction, then the
        feature-importance bar chart, the class-count plot and the
        correlation heatmap as Matplotlib figures.
    """
    input_data = pd.DataFrame([[
        ph, hardness, solids, chloramines, sulfate,
        conductivity, organic_carbon, trihalomethanes, turbidity,
    ]], columns=X.columns)

    # Pick the winning class by position instead of indexing the probability
    # vector with the predicted label: the label is not guaranteed to be an
    # integer, and a float such as 1.0 is not a valid array index.
    class_probabilities = model.predict_proba(input_data)[0]
    winner = int(np.argmax(class_probabilities))
    prediction = model.classes_[winner]
    proba = class_probabilities[winner]
    label = "✅ Safe to Drink" if prediction == 1 else "❌ Not Safe to Drink"

    # --- Visuals ---
    # Feature Importance
    fig1, ax1 = plt.subplots(figsize=(6, 5))
    feature_importance.plot(kind='barh', ax=ax1)
    ax1.set_title("Feature Importance")
    ax1.set_xlabel("Importance")
    ax1.invert_yaxis()  # largest importance at the top

    # Class Distribution
    fig2, ax2 = plt.subplots(figsize=(4, 4))
    sns.countplot(x='Potability', data=df_imputed, ax=ax2)
    ax2.set_title("Potability Class Distribution")

    # Heatmap
    fig3, ax3 = plt.subplots(figsize=(6, 5))
    sns.heatmap(df_imputed.corr(), cmap='coolwarm', annot=False, ax=ax3)
    ax3.set_title("Feature Correlation Heatmap")

    # pyplot keeps a reference to every figure it creates until that figure is
    # closed, so without this each call would leave three more figures behind.
    # Closing only unregisters them; the Figure objects returned below can
    # still be rendered by the caller.
    for figure in (fig1, fig2, fig3):
        plt.close(figure)

    # Metrics Text
    metrics_info = (
        f"📊 Model Performance on Test Set:\n\n"
        f"- Accuracy : {accuracy:.2f}\n"
        f"- Precision: {precision:.2f}\n"
        f"- Recall : {recall:.2f}\n"
        f"- F1 Score : {f1:.2f}\n\n"
        f"Prediction Confidence: {proba:.2f}"
    )
    return f"{label}\n\n{metrics_info}", fig1, fig2, fig3
# Gradio Interface
# Two-column layout: sample inputs and the trigger button on the left, the
# prediction text and the three plots on the right.
with gr.Blocks() as demo:
    gr.Markdown("# 💧 Trustworthy Water Quality Predictor")
    gr.Markdown("Uses Random Forest with data imputation, performance metrics, and feature insights.")
    with gr.Row():
        with gr.Column():
            # One slider per model feature; they are passed to predict() in
            # this same order.
            ph = gr.Slider(0, 14, step=0.1, label="pH")
            hardness = gr.Slider(50, 300, step=1, label="Hardness")
            solids = gr.Slider(3000, 50000, step=100, label="Solids")
            chloramines = gr.Slider(0, 15, step=0.1, label="Chloramines")
            sulfate = gr.Slider(100, 500, step=1, label="Sulfate")
            conductivity = gr.Slider(100, 800, step=1, label="Conductivity")
            organic_carbon = gr.Slider(2, 30, step=0.1, label="Organic Carbon")
            trihalomethanes = gr.Slider(0, 120, step=1, label="Trihalomethanes")
            turbidity = gr.Slider(0, 7, step=0.1, label="Turbidity")
            submit = gr.Button("Predict")
        with gr.Column():
            result = gr.Textbox(label="Prediction + Metrics")
            fig1 = gr.Plot(label="Feature Importance")
            fig2 = gr.Plot(label="Potability Class Distribution")
            fig3 = gr.Plot(label="Correlation Heatmap")

    # Event wiring has to happen while the Blocks context is still open.
    submit.click(
        fn=predict,
        inputs=[ph, hardness, solids, chloramines, sulfate,
                conductivity, organic_carbon, trihalomethanes, turbidity],
        outputs=[result, fig1, fig2, fig3],
    )

# Start the server only when this file is executed as a script, so importing
# the module (e.g. to reuse predict) does not launch the app.
if __name__ == "__main__":
    demo.launch()