# bhagwandas's picture
# Create app.py
# c96b110 verified
import gradio as gr
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import joblib
from datetime import datetime
# Load and prepare data
df = pd.read_csv("water_potability (1).csv")

# A row without a label cannot be used for supervised training or scoring,
# and mean-imputing the label would yield a fractional value that is not a class.
df = df.dropna(subset=['Potability'])

features = df.drop('Potability', axis=1)
# Integer class labels (0/1) instead of the floats an imputer would produce
y = df['Potability'].astype(int)

# Split BEFORE imputing so the held-out rows never influence the column means
# used for preprocessing (otherwise the test metrics are optimistically biased).
raw_train, raw_test, y_train, y_test = train_test_split(
    features, y, test_size=0.2, random_state=42
)

imputer = SimpleImputer(strategy='mean')
imputer.fit(raw_train)


def _impute(frame):
    """Return *frame* with missing values replaced by the training-set column means."""
    return pd.DataFrame(
        imputer.transform(frame), columns=frame.columns, index=frame.index
    )


X_train = _impute(raw_train)
X_test = _impute(raw_test)

# Imputed views of the complete dataset: X supplies the feature column names,
# df_imputed (features + label, original column order) feeds the plots.
X = _impute(features)
df_imputed = X.assign(Potability=y)[df.columns]

model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Metrics (computed once on the held-out 20% split)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Per-feature importance from the fitted forest, most important first
feature_importance = pd.Series(
    model.feature_importances_, index=X.columns
).sort_values(ascending=False)
def predict(ph, hardness, solids, chloramines, sulfate,
            conductivity, organic_carbon, trihalomethanes, turbidity):
    """Classify one water sample and build the report shown in the UI.

    The nine arguments are the sample's feature values, given in the same
    order as the training columns (``X.columns``).

    Returns:
        A 4-tuple ``(report, fig_importance, fig_classes, fig_corr)``:
        the text report (verdict, test-set metrics, confidence of this
        prediction) followed by three Matplotlib figures (feature importance,
        class distribution, correlation heatmap).
    """
    input_data = pd.DataFrame([[
        ph, hardness, solids, chloramines, sulfate,
        conductivity, organic_carbon, trihalomethanes, turbidity
    ]], columns=X.columns)

    prediction = model.predict(input_data)[0]
    label = "✅ Safe to Drink" if prediction == 1 else "❌ Not Safe to Drink"

    # predict_proba columns are ordered like model.classes_. The predicted
    # label may be a float (e.g. 1.0), which is not a valid NumPy index, so
    # resolve its column position explicitly instead of indexing with it.
    class_index = int(np.flatnonzero(model.classes_ == prediction)[0])
    proba = model.predict_proba(input_data)[0][class_index]

    # --- Visuals ---
    # Feature Importance
    fig1, ax1 = plt.subplots(figsize=(6, 5))
    feature_importance.plot(kind='barh', ax=ax1)
    ax1.set_title("Feature Importance")
    ax1.set_xlabel("Importance")
    ax1.invert_yaxis()  # most important feature at the top

    # Class Distribution
    fig2, ax2 = plt.subplots(figsize=(4, 4))
    sns.countplot(x='Potability', data=df_imputed, ax=ax2)
    ax2.set_title("Potability Class Distribution")

    # Heatmap
    fig3, ax3 = plt.subplots(figsize=(6, 5))
    sns.heatmap(df_imputed.corr(), cmap='coolwarm', annot=False, ax=ax3)
    ax3.set_title("Feature Correlation Heatmap")

    # Unregister the figures from pyplot's global registry so repeated calls
    # do not accumulate open figures; the Figure objects themselves remain
    # valid and can still be rendered by the caller.
    for figure in (fig1, fig2, fig3):
        plt.close(figure)

    # Metrics Text
    metrics_info = (
        "📊 Model Performance on Test Set:\n\n"
        f"- Accuracy : {accuracy:.2f}\n"
        f"- Precision: {precision:.2f}\n"
        f"- Recall : {recall:.2f}\n"
        f"- F1 Score : {f1:.2f}\n\n"
        f"Prediction Confidence: {proba:.2f}"
    )

    return f"{label}\n\n{metrics_info}", fig1, fig2, fig3
# Gradio Interface
# Layout: a title, then one row with two columns -- input sliders and the
# Predict button on the left, the text report and three plots on the right.
with gr.Blocks() as demo:
    gr.Markdown("# 💧 Trustworthy Water Quality Predictor")
    gr.Markdown("Uses Random Forest with data imputation, performance metrics, and feature insights.")

    with gr.Row():
        with gr.Column():
            # One slider per model feature, declared in training-column order
            ph = gr.Slider(0, 14, step=0.1, label="pH")
            hardness = gr.Slider(50, 300, step=1, label="Hardness")
            solids = gr.Slider(3000, 50000, step=100, label="Solids")
            chloramines = gr.Slider(0, 15, step=0.1, label="Chloramines")
            sulfate = gr.Slider(100, 500, step=1, label="Sulfate")
            conductivity = gr.Slider(100, 800, step=1, label="Conductivity")
            organic_carbon = gr.Slider(2, 30, step=0.1, label="Organic Carbon")
            trihalomethanes = gr.Slider(0, 120, step=1, label="Trihalomethanes")
            turbidity = gr.Slider(0, 7, step=0.1, label="Turbidity")
            submit = gr.Button("Predict")

        with gr.Column():
            result = gr.Textbox(label="Prediction + Metrics")
            fig1 = gr.Plot(label="Feature Importance")
            fig2 = gr.Plot(label="Potability Class Distribution")
            fig3 = gr.Plot(label="Correlation Heatmap")

    # The input order must match predict()'s positional parameters, and the
    # output order must match the 4-tuple it returns.
    submit.click(
        fn=predict,
        inputs=[ph, hardness, solids, chloramines, sulfate,
                conductivity, organic_carbon, trihalomethanes, turbidity],
        outputs=[result, fig1, fig2, fig3]
    )

demo.launch()