|
import gradio as gr |
|
import pandas as pd |
|
from huggingface_hub.hf_api import create_repo, upload_file, HfApi |
|
from huggingface_hub.repository import Repository |
|
import subprocess |
|
import os |
|
import tempfile |
|
import sweetviz as sv |
|
|
|
def analyze_datasets(dataset, dataset_name, token, column=None, pairwise="off"):
    """Build a SweetViz analysis report for a CSV file and publish it as a static Space.

    Args:
        dataset: Uploaded file object; its ``name`` attribute is the path of
            the CSV file to read.
        dataset_name: Name of the Space to create under the token owner's
            account.
        token: Hugging Face Hub access token used to look up the account
            name, create the Space and upload the files.
        column: Optional target column to analyze the other features against.
            ``None`` and blank strings both mean "no target".
        pairwise: Value forwarded to SweetViz as ``pairwise_analysis``;
            ``None`` falls back to ``"off"``.

    Returns:
        A message that embeds the value returned by ``create_repo``.

    Raises:
        ValueError: If no dataset was provided.
    """
    # Fail with a readable message before touching the disk or the network.
    if dataset is None:
        raise ValueError("Please upload a CSV dataset to analyze.")

    # The optional text field can deliver a blank string instead of None;
    # forwarding "" as the target feature would make SweetViz look up a
    # column that does not exist.
    if isinstance(column, str) and not column.strip():
        column = None
    # A radio button without a selection yields None; keep the documented default.
    pairwise = pairwise or "off"

    df = pd.read_csv(dataset.name)
    username = HfApi().whoami(token=token)["name"]
    repo_id = f"{username}/{dataset_name}"

    if column is not None:
        analyze_report = sv.analyze(df, target_feat=column, pairwise_analysis=pairwise)
    else:
        analyze_report = sv.analyze(df, pairwise_analysis=pairwise)

    # Render into a private temporary directory: a fixed path in the working
    # directory would let concurrent requests overwrite each other's report.
    with tempfile.TemporaryDirectory() as workdir:
        report_path = os.path.join(workdir, "index.html")
        analyze_report.show_html(report_path, open_browser=False)

        repo_url = create_repo(
            repo_id,
            repo_type="space",
            token=token,
            space_sdk="static",
            private=False,
        )
        upload_file(
            path_or_fileobj=report_path,
            path_in_repo="index.html",
            repo_id=repo_id,
            repo_type="space",
            token=token,
        )

    readme = f"---\ntitle: {dataset_name}\nemoji: ✨\ncolorFrom: green\ncolorTo: red\nsdk: static\npinned: false\ntags:\n- dataset-report\n---"
    # Upload the front matter straight from memory, explicitly UTF-8 encoded:
    # it contains an emoji that a non-UTF-8 default file encoding cannot write.
    upload_file(
        path_or_fileobj=readme.encode("utf-8"),
        path_in_repo="README.md",
        repo_id=repo_id,
        repo_type="space",
        token=token,
    )

    return f"Your dataset report will be ready at {repo_url}"
|
|
|
def compare_column_values(dataset, dataset_name, token, column, category):
    """Compare the rows of one category against all other rows and publish the report.

    The CSV is split into the rows whose ``column`` value equals ``category``
    and the remaining rows; SweetViz compares the two subsets and the
    resulting HTML report is pushed to a static Space.

    Args:
        dataset: Uploaded file object; its ``name`` attribute is the path of
            the CSV file to read.
        dataset_name: Name of the Space to create under the token owner's
            account.
        token: Hugging Face Hub access token used to look up the account
            name, create the Space and upload the files.
        column: Name of the column that holds the categories.
        category: Value of ``column`` that selects the first subset.

    Returns:
        A message that embeds the value returned by ``create_repo``.

    Raises:
        ValueError: If no dataset was provided, or if the category matches
            none or all of the rows (one side of the comparison is empty).
        KeyError: If ``column`` is not a column of the dataset.
    """
    # All local validation happens before any network call so that bad input
    # is reported immediately.
    if dataset is None:
        raise ValueError("Please upload a CSV dataset to compare.")

    df = pd.read_csv(dataset.name)
    if column not in df.columns:
        raise KeyError(f"Column {column!r} was not found in the dataset.")

    in_category = df[column] == category
    if not in_category.any():
        # The category is typed as text, so it never equals numeric cells;
        # retry on the string form of the values before giving up.
        in_category = df[column].astype(str) == str(category)
    if not in_category.any():
        raise ValueError(
            f"No rows have {category!r} in column {column!r}; nothing to compare."
        )
    if in_category.all():
        raise ValueError(
            f"Every row has {category!r} in column {column!r}; nothing to compare against."
        )

    # compare_intra expects a pair of display names (matching rows first,
    # remaining rows second). When exactly one other value exists, use it as
    # the second label; otherwise fall back to a generic one.
    remaining = df.loc[~in_category, column].dropna().unique()
    other_label = str(remaining[0]) if len(remaining) == 1 else f"not {category}"
    subset_names = (str(category), other_label)

    username = HfApi().whoami(token=token)["name"]
    repo_id = f"{username}/{dataset_name}"

    compare_report = sv.compare_intra(df, in_category, subset_names)

    # Render into a private temporary directory: a fixed path in the working
    # directory would let concurrent requests overwrite each other's report.
    with tempfile.TemporaryDirectory() as workdir:
        report_path = os.path.join(workdir, "index.html")
        compare_report.show_html(report_path, open_browser=False)

        repo_url = create_repo(
            repo_id,
            repo_type="space",
            token=token,
            space_sdk="static",
            private=False,
        )
        upload_file(
            path_or_fileobj=report_path,
            path_in_repo="index.html",
            repo_id=repo_id,
            repo_type="space",
            token=token,
        )

    readme = f"---\ntitle: {dataset_name}\nemoji: ✨\ncolorFrom: green\ncolorTo: red\nsdk: static\npinned: false\ntags:\n- dataset-report\n---"
    # Upload the front matter straight from memory, explicitly UTF-8 encoded:
    # it contains an emoji that a non-UTF-8 default file encoding cannot write.
    upload_file(
        path_or_fileobj=readme.encode("utf-8"),
        path_in_repo="README.md",
        repo_id=repo_id,
        repo_type="space",
        token=token,
    )

    return f"Your dataset report will be ready at {repo_url}"
|
|
|
def compare_dataset_splits(dataset, dataset_name, token, splits):
    """Randomly split a CSV into train/test parts, compare them and publish the report.

    Args:
        dataset: Uploaded file object; its ``name`` attribute is the path of
            the CSV file to read.
        dataset_name: Name of the Space to create under the token owner's
            account.
        token: Hugging Face Hub access token used to look up the account
            name, create the Space and upload the files.
        splits: Fraction of rows sampled into the training part; must lie
            strictly between 0 and 1 (for example ``0.8``).

    Returns:
        A message that embeds the value returned by ``create_repo``.

    Raises:
        ValueError: If ``splits`` is missing or outside ``(0, 1)``, if no
            dataset was provided, or if the split leaves one part empty.
    """
    # Validate locally before reading the file or calling the Hub. NaN also
    # fails the chained comparison and is rejected here.
    if splits is None or not 0 < splits < 1:
        raise ValueError(
            "Split ratio must be a fraction strictly between 0 and 1, e.g. 0.8."
        )
    if dataset is None:
        raise ValueError("Please upload a CSV dataset to split.")

    df = pd.read_csv(dataset.name)
    train = df.sample(frac=splits)
    # Everything that was not sampled into the training part is the test part.
    test = df.loc[df.index.difference(train.index)]
    if train.empty or test.empty:
        # Happens on very small files where rounding puts every row on one side.
        raise ValueError(
            "The dataset is too small for this split ratio: one of the splits is empty."
        )

    username = HfApi().whoami(token=token)["name"]
    repo_id = f"{username}/{dataset_name}"

    compare_report = sv.compare([train, "Training Data"], [test, "Test Data"])

    # Render into a private temporary directory: a fixed path in the working
    # directory would let concurrent requests overwrite each other's report.
    with tempfile.TemporaryDirectory() as workdir:
        report_path = os.path.join(workdir, "index.html")
        compare_report.show_html(report_path, open_browser=False)

        repo_url = create_repo(
            repo_id,
            repo_type="space",
            token=token,
            space_sdk="static",
            private=False,
        )
        upload_file(
            path_or_fileobj=report_path,
            path_in_repo="index.html",
            repo_id=repo_id,
            repo_type="space",
            token=token,
        )

    readme = f"---\ntitle: {dataset_name}\nemoji: ✨\ncolorFrom: green\ncolorTo: red\nsdk: static\npinned: false\ntags:\n- dataset-report\n---"
    # Upload the front matter straight from memory, explicitly UTF-8 encoded:
    # it contains an emoji that a non-UTF-8 default file encoding cannot write.
    upload_file(
        path_or_fileobj=readme.encode("utf-8"),
        path_in_repo="README.md",
        repo_id=repo_id,
        repo_type="space",
        token=token,
    )

    return f"Your dataset report will be ready at {repo_url}"
|
|
|
|
|
|
|
# Gradio front end: one tab per analysis, each wiring its inputs to the
# matching handler and showing the handler's message in a text box.
#
# NOTE(review): the indentation of this section was lost in the source, so the
# nesting below is a reconstruction. In particular, the placement of each
# `outcome` box (inside the row, next to the input column) and of each
# `.click(...)` call is an assumption; confirm against the intended layout.
with gr.Blocks() as demo:
    main_title = gr.Markdown("""# Easy Analysis🪄🌟✨""")
    main_desc = gr.Markdown("""This app enables you to run three type of dataset analysis and pushes the interactive reports to your Hugging Face Hub profile as a Space. It uses SweetViz in the back.""")

    with gr.Tabs():
        # Tab 1: whole-dataset analysis, optionally against a target column.
        with gr.TabItem("Analyze") as analyze:
            with gr.Row():
                with gr.Column():
                    title = gr.Markdown(""" ## Analyze Dataset """)
                    description = gr.Markdown("Analyze a dataset or predictive variables against a target variable in a dataset (enter a column name to column section if you want to compare against target value). You can also do pairwise analysis, but it has quadratic complexity.")
                    dataset = gr.File(label="Dataset")
                    column = gr.Text(label="Compare dataset against a target variable (Optional)")
                    # Pre-select "off" so the handler never receives None when
                    # the user leaves the radio untouched.
                    pairwise = gr.Radio(["off", "on"], value="off", label="Enable pairwise analysis")
                    # NOTE(review): the token is shown in clear text; consider a
                    # masked input if the installed Gradio version supports it.
                    token = gr.Textbox(label="Your Hugging Face Token")
                    dataset_name = gr.Textbox(label="Dataset Name")
                    pushing_desc = gr.Markdown("This app needs your Hugging Face Hub token and a unique name for your dataset report.")
                    inference_run = gr.Button("Infer")
                    inference_progress = gr.StatusTracker(cover_container=True)
                outcome = gr.outputs.Textbox()
            inference_run.click(
                analyze_datasets,
                inputs=[dataset, dataset_name, token, column, pairwise],
                outputs=outcome,
                status_tracker=inference_progress,
            )

        # Tab 2: random train/test split comparison.
        with gr.TabItem("Compare Splits") as compare_splits:
            with gr.Row():
                with gr.Column():
                    title = gr.Markdown(""" ## Compare Splits""")
                    description = gr.Markdown("Split a dataset and compare splits. You need to give a fraction, e.g. 0.8.")
                    dataset = gr.File(label="Dataset")
                    split_ratio = gr.Number(label="Split Ratios")
                    pushing_desc = gr.Markdown("This app needs your Hugging Face Hub token and a unique name for your dataset report.")
                    token = gr.Textbox(label="Your Hugging Face Token")
                    dataset_name = gr.Textbox(label="Dataset Name")
                    inference_run = gr.Button("Infer")
                    inference_progress = gr.StatusTracker(cover_container=True)

                outcome = gr.outputs.Textbox()
            inference_run.click(
                compare_dataset_splits,
                inputs=[dataset, dataset_name, token, split_ratio],
                outputs=outcome,
                status_tracker=inference_progress,
            )

        # Tab 3: one category of a column versus the remaining rows.
        with gr.TabItem("Compare Subsets") as compare_subsets:
            with gr.Row():
                with gr.Column():
                    title = gr.Markdown(""" ## Compare Subsets""")
                    description = gr.Markdown("Compare subsets of a dataset, e.g. you can pick Age Group column and compare adult category against young.")
                    dataset = gr.File(label="Dataset")
                    column = gr.Text(label="Enter column:")
                    category = gr.Text(label="Enter category:")
                    pushing_desc = gr.Markdown("This app needs your Hugging Face Hub token and a unique name for your dataset report.")
                    token = gr.Textbox(label="Your Hugging Face Token")
                    dataset_name = gr.Textbox(label="Dataset Name")
                    inference_run = gr.Button("Run Analysis")
                    inference_progress = gr.StatusTracker(cover_container=True)

                outcome = gr.outputs.Textbox()
            inference_run.click(
                compare_column_values,
                inputs=[dataset, dataset_name, token, column, category],
                outputs=outcome,
                status_tracker=inference_progress,
            )

demo.launch(debug=True)