Spaces:
Running
Running
import gradio as gr | |
import pandas as pd | |
from huggingface_hub.hf_api import create_repo, upload_folder, upload_file | |
from huggingface_hub.repository import Repository | |
import subprocess | |
import os | |
import tempfile | |
from uuid import uuid4 | |
import pickle | |
import sweetviz as sv | |
import dabl | |
import re | |
def analyze_datasets(dataset, dataset_name, username, token, column=None, pairwise="off"):
    """Build a sweetviz report for a CSV file and publish it as a static Space.

    Args:
        dataset: Uploaded file object; its ``name`` attribute is read as the
            path of the CSV file.
        dataset_name: Name of the Space to create under ``username``.
        username: Hugging Face Hub user name that will own the Space.
        token: Hugging Face Hub token used to create the repo and upload.
        column: Optional target column to compare the other features against.
            ``None`` and blank strings both mean "no target".
        pairwise: Value forwarded to sweetviz as ``pairwise_analysis``; a
            falsy value falls back to ``"off"``.

    Returns:
        A message containing the URL returned by ``create_repo``.
    """
    df = pd.read_csv(dataset.name)

    # A text input hands over "" (not None) when left blank, so test for
    # real content instead of `is not None`; otherwise the empty string
    # would be forwarded to sweetviz as the target feature.
    analyze_kwargs = {"pairwise_analysis": pairwise or "off"}
    if column and column.strip():
        analyze_kwargs["target_feat"] = column
    analyze_report = sv.analyze(df, **analyze_kwargs)

    repo_id = f"{username}/{dataset_name}"
    # \u2728 is the sparkles emoji; written as an escape so the Space card
    # metadata survives any re-encoding of this source file.
    readme = (
        f"---\ntitle: {dataset_name}\nemoji: \u2728\ncolorFrom: green\n"
        "colorTo: red\nsdk: static\npinned: false\ntags:\n- dataset-report\n---"
    )

    # Work in a private temporary directory so that simultaneous requests do
    # not overwrite each other's index.html / README.md in the working dir.
    with tempfile.TemporaryDirectory() as workdir:
        report_path = os.path.join(workdir, "index.html")
        readme_path = os.path.join(workdir, "README.md")

        analyze_report.show_html(report_path, open_browser=False)
        with open(readme_path, "w", encoding="utf-8") as readme_file:
            readme_file.write(readme)

        repo_url = create_repo(
            repo_id,
            repo_type="space",
            token=token,
            space_sdk="static",
            private=False,
        )
        upload_file(
            path_or_fileobj=report_path,
            path_in_repo="index.html",
            repo_id=repo_id,
            repo_type="space",
            token=token,
        )
        upload_file(
            path_or_fileobj=readme_path,
            path_in_repo="README.md",
            repo_id=repo_id,
            repo_type="space",
            token=token,
        )

    return f"Your dataset report will be ready at {repo_url}"
from sklearn.utils import estimator_html_repr | |
def extract_estimator_config(model):
    """Render an estimator's hyperparameters as a Markdown table.

    Args:
        model: Object exposing ``get_params(deep=True)`` that returns a
            mapping of hyperparameter names to values.

    Returns:
        A two-column Markdown table (header, alignment row, then one row per
        hyperparameter), each line terminated by a newline.
    """

    def _cell(value):
        # A raw "|" or a line break inside a cell would split the table row,
        # so multi-line reprs are folded onto one line and pipes are escaped.
        text = str(value)
        if "\n" in text or "\r" in text:
            text = " ".join(piece.strip() for piece in text.splitlines())
        return text.replace("|", "\\|")

    rows = [
        f"| {_cell(name)} | {_cell(value)} |\n"
        for name, value in model.get_params(deep=True).items()
    ]
    return "| Hyperparameters | Value |\n| :-- | :-- |\n" + "".join(rows)
def train_baseline(dataset, username, dataset_name, token, column):
    """Fit a dabl baseline classifier on a CSV file and push it to the Hub.

    The CSV is passed through ``dabl.clean``, ``column`` is used as the
    target, and a ``dabl.SimpleClassifier`` is fitted on the remaining
    columns. A model card (README.md), the pickled classifier (clf.pkl) and
    the captured training output (logs.txt) are written to a temporary
    directory, which is then uploaded to ``{username}/{dataset_name}``.

    Args:
        dataset: Uploaded file object; its ``name`` attribute is read as the
            path of the CSV file.
        username: Hugging Face Hub user name that will own the repo.
        dataset_name: Name of the repo to create under ``username``.
        token: Hugging Face Hub token used to create the repo and upload.
        column: Name of the target column in the cleaned data frame.

    Returns:
        A message containing the URL returned by ``create_repo``.
    """
    from contextlib import redirect_stdout

    df = pd.read_csv(dataset.name)
    fc = dabl.SimpleClassifier(random_state=0)
    df_clean = dabl.clean(df)
    X = df_clean.drop(column, axis=1)
    y = df_clean[column]
    repo_id = f"{username}/{dataset_name}"

    with tempfile.TemporaryDirectory() as tmpdirname:
        # Keep the training log inside this call's temporary directory: a
        # fixed "logs.txt" in the working directory is shared by every
        # request and is never part of the uploaded folder.
        log_path = os.path.join(tmpdirname, "logs.txt")
        with open(log_path, "w", encoding="utf-8") as log_file:
            with redirect_stdout(log_file):
                print('Logging training')
                fc.fit(X, y)

        repo_url = create_repo(repo_id=repo_id, token=token)

        readme_parts = [
            "---\nlicense: apache-2.0\nlibrary_name: sklearn\n---\n\n",
            f"## Baseline Model trained on {dataset_name} to predict {column}\n\n",
            "Metrics of the best model:\n\n",
        ]
        # Each line of the summary becomes its own Markdown paragraph.
        readme_parts.extend(
            f"{line}\n\n" for line in str(fc.current_best_).split("\n")
        )
        readme_parts.append("\n\nSee model plot below:\n\n")
        # Newlines followed by indentation are removed from the HTML repr,
        # presumably so Markdown does not treat indented lines as code.
        readme_parts.append(
            re.sub(r"\n\s+", "", str(estimator_html_repr(fc.est_)))
        )

        with open(os.path.join(tmpdirname, "README.md"), "w", encoding="utf-8") as readme_file:
            readme_file.write("".join(readme_parts))
        with open(os.path.join(tmpdirname, "clf.pkl"), mode="bw") as model_file:
            pickle.dump(fc, file=model_file)

        upload_folder(
            repo_id=repo_id,
            folder_path=tmpdirname,
            repo_type="model",
            token=token,
            path_in_repo="./",
        )

    return f"Your model will be ready at {repo_url}"
# Two-tab Gradio UI: one tab trains and uploads a baseline model, the other
# builds and uploads a dataset report.
# NOTE(review): the nesting below was reconstructed from syntax because the
# source had lost its indentation; in particular, confirm whether each
# output box belongs beside the input column (as here) or inside it.
with gr.Blocks() as demo:
    # NOTE(review): the glyphs after "Baseline Trainer" look like
    # mis-encoded emoji; confirm against the original file.
    main_title = gr.Markdown("# Baseline Trainer πͺπβ¨")
    main_desc = gr.Markdown(
        "This app trains a baseline model for a given dataset and pushes it to your Hugging Face Hub Profile with a model card."
    )

    with gr.Tabs():
        # --- Tab 1: train a baseline model and push it to the Hub ---
        with gr.TabItem("Baseline Trainer") as baseline_trainer:
            with gr.Row():
                with gr.Column():
                    trainer_title = gr.Markdown(" ## Train a supervised baseline model")
                    trainer_description = gr.Markdown(
                        "This app trains a model and pushes it to your Hugging Face Hub Profile."
                    )
                    trainer_dataset = gr.File(label="Dataset")
                    trainer_column = gr.Text(label="Enter target variable:")
                    trainer_dataset_name = gr.Text(label="Enter dataset name:")
                    trainer_pushing_desc = gr.Markdown(
                        "This app needs your Hugging Face Hub user name, token and a unique name for your dataset report."
                    )
                    trainer_token = gr.Textbox(label="Your Hugging Face Token")
                    trainer_username = gr.Textbox(label="Your Hugging Face User Name")
                    train_button = gr.Button("Train")
                    train_progress = gr.StatusTracker(cover_container=True)
                train_outcome = gr.outputs.Textbox(label="Progress")
                # Input order follows train_baseline's parameter order.
                train_button.click(
                    fn=train_baseline,
                    inputs=[
                        trainer_dataset,
                        trainer_username,
                        trainer_dataset_name,
                        trainer_token,
                        trainer_column,
                    ],
                    outputs=train_outcome,
                    status_tracker=train_progress,
                )

        # --- Tab 2: build a dataset report and push it as a Space ---
        with gr.TabItem("Analyze") as analyze:
            with gr.Row():
                with gr.Column():
                    analyze_title = gr.Markdown(" ## Analyze Dataset ")
                    analyze_description = gr.Markdown(
                        "Analyze a dataset or predictive variables against a target variable in a dataset (enter a column name to column section if you want to compare against target value). You can also do pairwise analysis, but it has quadratic complexity."
                    )
                    analyze_dataset = gr.File(label="Dataset")
                    analyze_column = gr.Text(
                        label="Compare dataset against a target variable (Optional)"
                    )
                    analyze_pairwise = gr.Radio(["off", "on"], label="Enable pairwise analysis")
                    analyze_token = gr.Textbox(label="Your Hugging Face Token")
                    analyze_username = gr.Textbox(label="Your Hugging Face User Name")
                    analyze_dataset_name = gr.Textbox(label="Dataset Name")
                    analyze_pushing_desc = gr.Markdown(
                        "This app needs your Hugging Face Hub user name, token and a unique name for your dataset report."
                    )
                    analyze_button = gr.Button("Infer")
                    analyze_progress = gr.StatusTracker(cover_container=True)
                analyze_outcome = gr.outputs.Textbox()
                # Input order follows analyze_datasets' parameter order.
                analyze_button.click(
                    fn=analyze_datasets,
                    inputs=[
                        analyze_dataset,
                        analyze_dataset_name,
                        analyze_username,
                        analyze_token,
                        analyze_column,
                        analyze_pairwise,
                    ],
                    outputs=analyze_outcome,
                    status_tracker=analyze_progress,
                )

demo.launch(debug=True)