Spaces:

huggingface-projects
/

easy-analysis

Running

File size: 7,921 Bytes

0d87668

import gradio as gr
import pandas as pd
from huggingface_hub.hf_api import create_repo, upload_file
from huggingface_hub.repository import Repository
import subprocess
import os
import tempfile
import sweetviz as sv

def analyze_datasets(dataset, dataset_name, username, token, column=None, pairwise="off"):
    """Profile a CSV with SweetViz and publish the HTML report as a static Space.

    Args:
        dataset: Uploaded file object; only its ``name`` attribute is used,
            as the path handed to ``pandas.read_csv``.
        dataset_name: Name of the Space to create under ``username``; also
            used as the ``title`` in the generated README front matter.
        username: Hub namespace that will own the Space.
        token: Hugging Face token passed to ``create_repo`` / ``upload_file``.
        column: Optional target column for the analysis. ``None`` or a blank
            string means "no target".
        pairwise: Forwarded as SweetViz's ``pairwise_analysis``; a falsy
            value (e.g. nothing selected) falls back to ``"off"``.

    Returns:
        A message containing the URL returned by ``create_repo``.

    Raises:
        KeyError: If ``column`` is given but is not a column of the CSV.
    """
    df = pd.read_csv(dataset.name)

    # Treat a blank string like None so an empty optional field does not
    # become a target feature named "".
    target = column
    if isinstance(target, str) and not target.strip():
        target = None
    if target is not None and target not in df.columns:
        raise KeyError(f"Target column {target!r} was not found in the dataset.")

    pairwise = pairwise or "off"
    analyze_report = sv.analyze(df, target_feat=target, pairwise_analysis=pairwise)

    repo_id = f"{username}/{dataset_name}"
    readme = f"---\ntitle: {dataset_name}\nemoji: ✨\ncolorFrom: green\ncolorTo: red\nsdk: static\npinned: false\ntags:\n- dataset-report\n---"

    # Render into a private temporary directory instead of fixed paths in the
    # working directory, so concurrent requests cannot overwrite (or upload)
    # each other's files and nothing is left behind afterwards.
    with tempfile.TemporaryDirectory() as workdir:
        report_path = os.path.join(workdir, "index.html")
        readme_path = os.path.join(workdir, "README.md")

        # The report is rendered for BOTH the target and no-target cases, and
        # before the repo is created so a rendering failure leaves no empty Space.
        analyze_report.show_html(report_path, open_browser=False)
        # Explicit UTF-8: the front matter contains a non-ASCII emoji.
        with open(readme_path, "w", encoding="utf-8") as f:
            f.write(readme)

        repo_url = create_repo(repo_id, repo_type="space", token=token, space_sdk="static", private=False)
        upload_file(path_or_fileobj=report_path, path_in_repo="index.html", repo_id=repo_id, repo_type="space", token=token)
        upload_file(path_or_fileobj=readme_path, path_in_repo="README.md", repo_id=repo_id, repo_type="space", token=token)

    return f"Your dataset report will be ready at {repo_url}"

def compare_column_values(dataset, dataset_name, username, token, column, category):
    """Compare rows where ``column == category`` against all other rows.

    The comparison report is built with ``sv.compare_intra`` and published as
    a static Space.

    Args:
        dataset: Uploaded file object; only its ``name`` attribute is used,
            as the path handed to ``pandas.read_csv``.
        dataset_name: Name of the Space to create under ``username``; also
            used as the ``title`` in the generated README front matter.
        username: Hub namespace that will own the Space.
        token: Hugging Face token passed to ``create_repo`` / ``upload_file``.
        column: Column whose values define the two subsets.
        category: Value of ``column`` selecting the first subset.

    Returns:
        A message containing the URL returned by ``create_repo``.

    Raises:
        KeyError: If ``column`` is not a column of the CSV.
        ValueError: If no row, or every row, matches ``category`` (one of the
            two subsets would be empty).
    """
    df = pd.read_csv(dataset.name)
    if column not in df.columns:
        raise KeyError(f"Column {column!r} was not found in the dataset.")

    selected = df[column] == category
    if not selected.any():
        # Nothing matched exactly; retry on the string form so a textual
        # category such as "3" can still select rows of a numeric column.
        selected = df[column].astype(str) == str(category)
    if not selected.any():
        raise ValueError(f"No rows have {column!r} equal to {category!r}.")
    if selected.all():
        raise ValueError(f"Every row has {column!r} equal to {category!r}; nothing to compare against.")

    # compare_intra takes a PAIR of display names (matching subset, rest).
    # Name the rest after its value when there is exactly one, otherwise
    # use a generic "Not <category>" label.
    remaining = df.loc[~selected, column].unique()
    other_name = str(remaining[0]) if len(remaining) == 1 else f"Not {category}"
    compare_report = sv.compare_intra(df, selected, [str(category), other_name])

    repo_id = f"{username}/{dataset_name}"
    readme = f"---\ntitle: {dataset_name}\nemoji: ✨\ncolorFrom: green\ncolorTo: red\nsdk: static\npinned: false\ntags:\n- dataset-report\n---"

    # Render into a private temporary directory instead of fixed paths in the
    # working directory, so concurrent requests cannot overwrite (or upload)
    # each other's files and nothing is left behind afterwards.
    with tempfile.TemporaryDirectory() as workdir:
        report_path = os.path.join(workdir, "index.html")
        readme_path = os.path.join(workdir, "README.md")

        compare_report.show_html(report_path, open_browser=False)
        # Explicit UTF-8: the front matter contains a non-ASCII emoji.
        with open(readme_path, "w", encoding="utf-8") as f:
            f.write(readme)

        repo_url = create_repo(repo_id, repo_type="space", token=token, space_sdk="static", private=False)
        upload_file(path_or_fileobj=report_path, path_in_repo="index.html", repo_id=repo_id, repo_type="space", token=token)
        upload_file(path_or_fileobj=readme_path, path_in_repo="README.md", repo_id=repo_id, repo_type="space", token=token)

    return f"Your dataset report will be ready at {repo_url}"

def compare_dataset_splits(dataset, dataset_name, username, token, splits):
    """Randomly split a CSV into train/test parts and compare them with SweetViz.

    The comparison report is published as a static Space.

    Args:
        dataset: Uploaded file object; only its ``name`` attribute is used,
            as the path handed to ``pandas.read_csv``.
        dataset_name: Name of the Space to create under ``username``; also
            used as the ``title`` in the generated README front matter.
        username: Hub namespace that will own the Space.
        token: Hugging Face token passed to ``create_repo`` / ``upload_file``.
        splits: Fraction of rows sampled into the training part; must be
            strictly between 0 and 1 (e.g. ``0.8``).

    Returns:
        A message containing the URL returned by ``create_repo``.

    Raises:
        ValueError: If ``splits`` is missing or outside ``(0, 1)``, or if the
            dataset is too small for both parts to be non-empty.
    """
    # Validate up front: a missing fraction would make DataFrame.sample fall
    # back to a single row, and 0 or 1 would leave one side empty.
    if splits is None or not 0 < splits < 1:
        raise ValueError(f"Split ratio must be a number strictly between 0 and 1, got {splits!r}.")

    df = pd.read_csv(dataset.name)
    train = df.sample(frac=splits)
    # Everything that was not sampled into the training part.
    test = df.loc[df.index.difference(train.index)]
    if train.empty or test.empty:
        raise ValueError("The dataset is too small to produce two non-empty splits with this ratio.")

    compare_report = sv.compare([train, "Training Data"], [test, "Test Data"])

    repo_id = f"{username}/{dataset_name}"
    readme = f"---\ntitle: {dataset_name}\nemoji: ✨\ncolorFrom: green\ncolorTo: red\nsdk: static\npinned: false\ntags:\n- dataset-report\n---"

    # Render into a private temporary directory instead of fixed paths in the
    # working directory, so concurrent requests cannot overwrite (or upload)
    # each other's files and nothing is left behind afterwards.
    with tempfile.TemporaryDirectory() as workdir:
        report_path = os.path.join(workdir, "index.html")
        readme_path = os.path.join(workdir, "README.md")

        compare_report.show_html(report_path, open_browser=False)
        # Explicit UTF-8: the front matter contains a non-ASCII emoji.
        with open(readme_path, "w", encoding="utf-8") as f:
            f.write(readme)

        repo_url = create_repo(repo_id, repo_type="space", token=token, space_sdk="static", private=False)
        upload_file(path_or_fileobj=report_path, path_in_repo="index.html", repo_id=repo_id, repo_type="space", token=token)
        upload_file(path_or_fileobj=readme_path, path_in_repo="README.md", repo_id=repo_id, repo_type="space", token=token)

    return f"Your dataset report will be ready at {repo_url}"



# UI definition: one Blocks app with three tabs ("Analyze", "Compare Splits",
# "Compare Subsets"). Each tab builds its own set of input widgets, a button,
# a status tracker and an output text box, then wires the button to one of the
# report functions defined above.
#
# Note that the names `dataset`, `token`, `username`, `dataset_name`,
# `inference_run`, `inference_progress` and `outcome` (and `column` in two
# tabs) are rebound in every tab. Each `click(...)` call is made before the
# next tab rebinds them, so it receives the components created in its own tab.
with gr.Blocks() as demo:
    main_title = gr.Markdown("""# Easy Analysis🪄🌟✨""")
    main_desc = gr.Markdown("""This app enables you to run three type of dataset analysis and pushes the interactive reports to your Hugging Face Hub profile as a Space. It uses SweetViz in the back.""")
    with gr.Tabs():
        # --- Tab 1: whole-dataset analysis, optionally against a target column.
        with gr.TabItem("Analyze") as analyze:
            with gr.Row():
                with gr.Column():
                    title = gr.Markdown(""" ## Analyze Dataset """)
                    description = gr.Markdown("Analyze a dataset or predictive variables against a target variable in a dataset (enter a column name to column section if you want to compare against target value). You can also do pairwise analysis, but it has quadratic complexity.")
                    dataset = gr.File(label = "Dataset")
                    column = gr.Text(label = "Compare dataset against a target variable (Optional)")
                    pairwise = gr.Radio(["off", "on"], label = "Enable pairwise analysis")
                    token = gr.Textbox(label = "Your Hugging Face Token")
                    username = gr.Textbox(label = "Your Hugging Face User Name")
                    dataset_name = gr.Textbox(label = "Dataset Name")
                    pushing_desc = gr.Markdown("This app needs your Hugging Face Hub user name, token and a unique name for your dataset report.")
                    inference_run = gr.Button("Infer")
                    inference_progress = gr.StatusTracker(cover_container=True)
                # Output box is created inside the Row but outside the Column.
                # NOTE(review): gr.StatusTracker and gr.outputs.Textbox look like
                # older Gradio APIs — confirm against the pinned gradio version.
                outcome = gr.outputs.Textbox()
                # `inputs` order follows analyze_datasets' positional parameters:
                # (dataset, dataset_name, username, token, column, pairwise).
                inference_run.click(
                    analyze_datasets,
                    inputs=[dataset, dataset_name, username, token, column, pairwise],
                    outputs=outcome,
                    status_tracker=inference_progress,
                )
        # --- Tab 2: random split of one dataset, then comparison of the two parts.
        with gr.TabItem("Compare Splits") as compare_splits:
            with gr.Row():
                with gr.Column():
                    title = gr.Markdown(""" ## Compare Splits""")
                    description = gr.Markdown("Split a dataset and compare splits. You need to give a fraction, e.g. 0.8.")
                    dataset = gr.File(label = "Dataset")
                    split_ratio = gr.Number(label = "Split Ratios")
                    pushing_desc = gr.Markdown("This app needs your Hugging Face Hub user name, token and a unique name for your dataset report.")
                    token = gr.Textbox(label = "Your Hugging Face Token")
                    username = gr.Textbox(label = "Your Hugging Face User Name")
                    dataset_name = gr.Textbox(label = "Dataset Name")
                    inference_run = gr.Button("Infer")
                    inference_progress = gr.StatusTracker(cover_container=True)

                outcome = gr.outputs.Textbox()
                # `inputs` order follows compare_dataset_splits' parameters:
                # (dataset, dataset_name, username, token, splits).
                inference_run.click(
                    compare_dataset_splits,
                    inputs=[dataset, dataset_name, username, token, split_ratio],
                    outputs=outcome,
                    status_tracker=inference_progress,
                )
        
        # --- Tab 3: compare rows of one category of a column against the rest.
        with gr.TabItem("Compare Subsets") as compare_subsets:
            with gr.Row():
                with gr.Column():
                    title = gr.Markdown(""" ## Compare Subsets""")
                    description = gr.Markdown("Compare subsets of a dataset, e.g. you can pick Age Group column and compare adult category against young.")
                    dataset = gr.File(label = "Dataset")
                    column = gr.Text(label = "Enter column:")
                    category = gr.Text(label = "Enter category:")
                    pushing_desc = gr.Markdown("This app needs your Hugging Face Hub user name, token and a unique name for your dataset report.")
                    token = gr.Textbox(label = "Your Hugging Face Token")
                    username = gr.Textbox(label = "Your Hugging Face User Name")
                    dataset_name = gr.Textbox(label = "Dataset Name")
                    inference_run = gr.Button("Run Analysis")
                    inference_progress = gr.StatusTracker(cover_container=True)

                outcome = gr.outputs.Textbox()
                # `inputs` order follows compare_column_values' parameters:
                # (dataset, dataset_name, username, token, column, category).
                inference_run.click(
                    compare_column_values,
                    inputs=[dataset, dataset_name, username, token, column, category ],
                    outputs=outcome,
                    status_tracker=inference_progress,
                )

# Start the app at import time, with Gradio's debug flag enabled.
demo.launch(debug=True)