import gradio as gr import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns from sklearn.ensemble import RandomForestClassifier from sklearn.impute import SimpleImputer from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score import joblib from datetime import datetime # Load and prepare data df = pd.read_csv("water_potability (1).csv") imputer = SimpleImputer(strategy='mean') df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns) X = df_imputed.drop('Potability', axis=1) y = df_imputed['Potability'] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) model = RandomForestClassifier(random_state=42) model.fit(X_train, y_train) y_pred = model.predict(X_test) # Metrics accuracy = accuracy_score(y_test, y_pred) precision = precision_score(y_test, y_pred) recall = recall_score(y_test, y_pred) f1 = f1_score(y_test, y_pred) feature_importance = pd.Series(model.feature_importances_, index=X.columns).sort_values(ascending=False) def predict(ph, hardness, solids, chloramines, sulfate, conductivity, organic_carbon, trihalomethanes, turbidity): input_data = pd.DataFrame([[ ph, hardness, solids, chloramines, sulfate, conductivity, organic_carbon, trihalomethanes, turbidity ]], columns=X.columns) prediction = model.predict(input_data)[0] label = "✅ Safe to Drink" if prediction == 1 else "❌ Not Safe to Drink" proba = model.predict_proba(input_data)[0][prediction] # --- Visuals --- # Feature Importance fig1, ax1 = plt.subplots(figsize=(6, 5)) feature_importance.plot(kind='barh', ax=ax1) ax1.set_title("Feature Importance") ax1.set_xlabel("Importance") ax1.invert_yaxis() # Class Distribution fig2, ax2 = plt.subplots(figsize=(4, 4)) sns.countplot(x='Potability', data=df_imputed, ax=ax2) ax2.set_title("Potability Class Distribution") # Heatmap fig3, ax3 = plt.subplots(figsize=(6, 5)) sns.heatmap(df_imputed.corr(), cmap='coolwarm', annot=False, ax=ax3) ax3.set_title("Feature Correlation Heatmap") # Metrics Text metrics_info = ( f"📊 Model Performance on Test Set:\n\n" f"- Accuracy : {accuracy:.2f}\n" f"- Precision: {precision:.2f}\n" f"- Recall : {recall:.2f}\n" f"- F1 Score : {f1:.2f}\n\n" f"Prediction Confidence: {proba:.2f}" ) return f"{label}\n\n{metrics_info}", fig1, fig2, fig3 # Gradio Interface with gr.Blocks() as demo: gr.Markdown("# 💧 Trustworthy Water Quality Predictor") gr.Markdown("Uses Random Forest with data imputation, performance metrics, and feature insights.") with gr.Row(): with gr.Column(): ph = gr.Slider(0, 14, step=0.1, label="pH") hardness = gr.Slider(50, 300, step=1, label="Hardness") solids = gr.Slider(3000, 50000, step=100, label="Solids") chloramines = gr.Slider(0, 15, step=0.1, label="Chloramines") sulfate = gr.Slider(100, 500, step=1, label="Sulfate") conductivity = gr.Slider(100, 800, step=1, label="Conductivity") organic_carbon = gr.Slider(2, 30, step=0.1, label="Organic Carbon") trihalomethanes = gr.Slider(0, 120, step=1, label="Trihalomethanes") turbidity = gr.Slider(0, 7, step=0.1, label="Turbidity") submit = gr.Button("Predict") with gr.Column(): result = gr.Textbox(label="Prediction + Metrics") fig1 = gr.Plot(label="Feature Importance") fig2 = gr.Plot(label="Potability Class Distribution") fig3 = gr.Plot(label="Correlation Heatmap") submit.click( fn=predict, inputs=[ph, hardness, solids, chloramines, sulfate, conductivity, organic_carbon, trihalomethanes, turbidity], outputs=[result, fig1, fig2, fig3] ) demo.launch()