import gradio as gr
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
from datasets import load_dataset
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
matplotlib.use('Agg') # Avoid issues in some remote environments
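
# Note: the imports above assume the following packages are installed
# (PyPI names): gradio, numpy, matplotlib, pandas, datasets, scikit-learn.
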
# Pre-populate a short list of "recommended" Hugging Face datasets
# (Replace "datasorg/iris" etc. with real dataset IDs you want to showcase)
SUGGESTED_DATASETS = [
    "datasorg/iris",          # hypothetical ID
    "uciml/wine_quality-red", # example from the HF Hub
    "SKIP/ENTER_CUSTOM",      # sentinel value: selecting this means "use the custom dataset ID below"
]


def load_and_prepare_dataset(dataset_id, label_column, feature_columns):
    """
    Loads a dataset from the Hugging Face Hub,
    converts it to a pandas DataFrame,
    and returns X, y as NumPy arrays for modeling.
    """
    # Load only the "train" split for simplicity.
    # Many datasets have "train", "test", and "validation" splits.
    ds = load_dataset(dataset_id, split="train")
    # Convert to a DataFrame for easy manipulation
    df = pd.DataFrame(ds)
    # Validate the selected columns
    if label_column not in df.columns:
        raise ValueError(f"Label column '{label_column}' not in dataset columns: {df.columns.to_list()}")
    for col in feature_columns:
        if col not in df.columns:
            raise ValueError(f"Feature column '{col}' not in dataset columns: {df.columns.to_list()}")
    # Split into X and y
    X = df[feature_columns].values
    y = df[label_column].values
    return X, y, df.columns.tolist()
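
# Example usage of load_and_prepare_dataset (hypothetical dataset ID and column names):
#   X, y, cols = load_and_prepare_dataset("username/my_dataset", "label", ["feat_a", "feat_b"])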


def train_model(dataset_id, custom_dataset_id, label_column, feature_columns,
                learning_rate, n_estimators, max_depth, test_size):
    """
    1. Determine the final dataset ID (either from the dropdown or the custom text box).
    2. Load dataset -> DataFrame -> X, y.
    3. Train a GradientBoostingClassifier.
    4. Generate plots & metrics (accuracy and confusion matrix).
    """
    # Decide which dataset ID to use
    if dataset_id != "SKIP/ENTER_CUSTOM":
        final_id = dataset_id
    else:
        # Use the user-supplied "custom_dataset_id"
        final_id = custom_dataset_id.strip()

    # Prepare data. Note: GradientBoostingClassifier expects numeric features,
    # so the selected feature columns should be numeric (encode text columns first).
    X, y, columns_available = load_and_prepare_dataset(
        final_id,
        label_column,
        feature_columns
    )

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42
    )

    # Train model
    clf = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=int(n_estimators),
        max_depth=int(max_depth),
        random_state=42
    )
    clf.fit(X_train, y_train)

    # Evaluate
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    # Plot figure
    fig, axs = plt.subplots(1, 2, figsize=(10, 4))

    # Subplot 1: Feature Importances
    importances = clf.feature_importances_
    axs[0].barh(range(len(feature_columns)), importances, color='skyblue')
    axs[0].set_yticks(range(len(feature_columns)))
    axs[0].set_yticklabels(feature_columns)
    axs[0].set_xlabel("Importance")
    axs[0].set_title("Feature Importances")

    # Subplot 2: Confusion Matrix Heatmap
    im = axs[1].imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    axs[1].set_title("Confusion Matrix")
    plt.colorbar(im, ax=axs[1])
    axs[1].set_xlabel("Predicted")
    axs[1].set_ylabel("True")
    # Annotate each cell with its count
    thresh = cm.max() / 2.0
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            color = "white" if cm[i, j] > thresh else "black"
            axs[1].text(j, i, format(cm[i, j], "d"), ha="center", va="center", color=color)
    plt.tight_layout()

    output_text = f"**Dataset used:** {final_id}\n\n"
    output_text += f"**Accuracy:** {accuracy:.3f}\n\n"
    output_text += "**Confusion Matrix** (raw counts shown in the plot)."

    # Format the available columns as a string so it renders cleanly in a Markdown output
    columns_text = f"Columns available in the dataset: {columns_available}"
    return output_text, fig, columns_text


def update_columns(dataset_id, custom_dataset_id):
    """
    Callback to dynamically fetch the columns from the dataset
    so the user can pick which columns to use as features/labels.
    """
    if dataset_id != "SKIP/ENTER_CUSTOM":
        final_id = dataset_id
    else:
        final_id = custom_dataset_id.strip()

    # Try to load the dataset and return its columns
    try:
        ds = load_dataset(final_id, split="train")
        df = pd.DataFrame(ds)
        cols = df.columns.tolist()
        # Return the columns as selectable options for both column selectors
        return gr.update(choices=cols), gr.update(choices=cols), f"Columns found: {cols}"
    except Exception as e:
        return gr.update(choices=[]), gr.update(choices=[]), f"Error loading {final_id}: {e}"


with gr.Blocks() as demo:
    gr.Markdown("## Train GradientBoostingClassifier on a Hugging Face dataset of your choice")

    with gr.Row():
        dataset_dropdown = gr.Dropdown(
            choices=SUGGESTED_DATASETS,
            value=SUGGESTED_DATASETS[0],
            label="Choose a dataset"
        )
        custom_dataset_id = gr.Textbox(
            label="Or enter HF dataset (user/dataset)",
            value="",
            placeholder="e.g. 'username/my_custom_dataset'"
        )

    # Button to load columns from the chosen dataset
    load_cols_btn = gr.Button("Load columns")
    load_cols_info = gr.Markdown()

    with gr.Row():
        label_col = gr.Dropdown(choices=[], label="Label column (choose 1)")
        feature_cols = gr.CheckboxGroup(choices=[], label="Feature columns (choose 1 or more)")

    # Once columns are chosen, the user can set hyperparameters
    learning_rate_slider = gr.Slider(0.01, 1.0, value=0.1, step=0.01, label="learning_rate")
    n_estimators_slider = gr.Slider(50, 300, value=100, step=50, label="n_estimators")
    max_depth_slider = gr.Slider(1, 10, value=3, step=1, label="max_depth")
    test_size_slider = gr.Slider(0.1, 0.9, value=0.3, step=0.1, label="test_size (fraction)")

    train_button = gr.Button("Train & Evaluate")
    output_text = gr.Markdown()
    output_plot = gr.Plot()
    # Also show the dataset columns for reference after training
    columns_return = gr.Markdown()

    # When "Load columns" is clicked, call update_columns to fetch the dataset columns
    load_cols_btn.click(
        fn=update_columns,
        inputs=[dataset_dropdown, custom_dataset_id],
        outputs=[label_col, feature_cols, load_cols_info]
    )

    # When "Train & Evaluate" is clicked, train the model and report the results
    train_button.click(
        fn=train_model,
        inputs=[
            dataset_dropdown,
            custom_dataset_id,
            label_col,
            feature_cols,
            learning_rate_slider,
            n_estimators_slider,
            max_depth_slider,
            test_size_slider
        ],
        outputs=[output_text, output_plot, columns_return]
    )
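
# Launch the app. Note: share=True (an optional Gradio argument) could be passed to
# demo.launch() to additionally expose a temporary public link.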
demo.launch()