# File size: 7,921 Bytes
# 0d87668
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import gradio as gr
import pandas as pd
from huggingface_hub.hf_api import create_repo, upload_file
from huggingface_hub.repository import Repository
import subprocess
import os
import tempfile
import sweetviz as sv

def analyze_datasets(dataset, dataset_name, username, token, column=None, pairwise="off"):
    """Build a SweetViz analysis report for a CSV file and publish it as a static Space.

    Args:
        dataset: Uploaded file object; its ``name`` attribute is the path of
            the CSV file to read.
        dataset_name: Name of the Space created under ``username``; also used
            as the README title.
        username: Hugging Face Hub account that will own the Space.
        token: Hugging Face Hub token passed to ``create_repo`` and
            ``upload_file``.
        column: Optional target column to analyze the other features against.
            ``None`` or a blank string means "no target".
        pairwise: Value forwarded as SweetViz's ``pairwise_analysis`` option;
            a falsy value falls back to ``"off"``.

    Returns:
        A message containing the repository URL returned by ``create_repo``.

    Raises:
        ValueError: If a target column is given but is not present in the CSV.
    """
    df = pd.read_csv(dataset.name)

    # The UI may hand over an empty string (blank textbox) or None (no radio
    # option selected); normalize both so SweetViz receives sensible values.
    if isinstance(column, str) and not column.strip():
        column = None
    pairwise = pairwise or "off"

    if column is not None and column not in df.columns:
        raise ValueError(f"Column {column!r} not found in the dataset.")

    if column is not None:
        analyze_report = sv.analyze(df, target_feat=column, pairwise_analysis=pairwise)
    else:
        analyze_report = sv.analyze(df, pairwise_analysis=pairwise)

    repo_id = f"{username}/{dataset_name}"
    readme = f"---\ntitle: {dataset_name}\nemoji: ✨\ncolorFrom: green\ncolorTo: red\nsdk: static\npinned: false\ntags:\n- dataset-report\n---"

    # Work in a private temporary directory: fixed paths in the working
    # directory would be shared (and overwritten) by concurrent requests.
    with tempfile.TemporaryDirectory() as workdir:
        report_path = os.path.join(workdir, "index.html")
        readme_path = os.path.join(workdir, "README.md")

        # The report must be rendered for both the targeted and untargeted case.
        analyze_report.show_html(report_path, open_browser=False)
        # Explicit UTF-8: the README front matter contains an emoji.
        with open(readme_path, "w", encoding="utf-8") as f:
            f.write(readme)

        repo_url = create_repo(repo_id, repo_type="space", token=token, space_sdk="static", private=False)
        upload_file(path_or_fileobj=report_path, path_in_repo="index.html", repo_id=repo_id, repo_type="space", token=token)
        upload_file(path_or_fileobj=readme_path, path_in_repo="README.md", repo_id=repo_id, repo_type="space", token=token)

    return f"Your dataset report will be ready at {repo_url}"

def compare_column_values(dataset, dataset_name, username, token, column, category):
    """Compare the rows where ``column == category`` against all other rows.

    The comparison report is built with ``sv.compare_intra`` and published as
    a static Hugging Face Space.

    Args:
        dataset: Uploaded file object; its ``name`` attribute is the path of
            the CSV file to read.
        dataset_name: Name of the Space created under ``username``; also used
            as the README title.
        username: Hugging Face Hub account that will own the Space.
        token: Hugging Face Hub token passed to ``create_repo`` and
            ``upload_file``.
        column: Name of the column whose values define the two subsets.
        category: Value of ``column`` selecting the first subset.

    Returns:
        A message containing the repository URL returned by ``create_repo``.

    Raises:
        ValueError: If ``column`` is not in the CSV, or if the condition does
            not split the data into two non-empty subsets.
    """
    df = pd.read_csv(dataset.name)
    if column not in df.columns:
        raise ValueError(f"Column {column!r} not found in the dataset.")

    values = df[column]
    mask = values == category
    if not mask.any():
        # The category arrives as text; fall back to comparing string forms so
        # that e.g. "3" can select rows in a numeric column.
        mask = values.astype(str) == str(category)
    if not mask.any():
        raise ValueError(f"No rows where {column!r} equals {category!r}.")
    if mask.all():
        raise ValueError(f"Every row has {column!r} equal to {category!r}; nothing to compare against.")

    # compare_intra expects a pair of display names: one for the rows matching
    # the condition and one for the remainder.
    remaining = values[~mask].unique()
    other_name = str(remaining[0]) if len(remaining) == 1 else f"Not {category}"
    compare_report = sv.compare_intra(df, mask, [str(category), other_name])

    repo_id = f"{username}/{dataset_name}"
    readme = f"---\ntitle: {dataset_name}\nemoji: ✨\ncolorFrom: green\ncolorTo: red\nsdk: static\npinned: false\ntags:\n- dataset-report\n---"

    # Work in a private temporary directory: fixed paths in the working
    # directory would be shared (and overwritten) by concurrent requests.
    with tempfile.TemporaryDirectory() as workdir:
        report_path = os.path.join(workdir, "index.html")
        readme_path = os.path.join(workdir, "README.md")

        compare_report.show_html(report_path, open_browser=False)
        # Explicit UTF-8: the README front matter contains an emoji.
        with open(readme_path, "w", encoding="utf-8") as f:
            f.write(readme)

        repo_url = create_repo(repo_id, repo_type="space", token=token, space_sdk="static", private=False)
        upload_file(path_or_fileobj=report_path, path_in_repo="index.html", repo_id=repo_id, repo_type="space", token=token)
        upload_file(path_or_fileobj=readme_path, path_in_repo="README.md", repo_id=repo_id, repo_type="space", token=token)

    return f"Your dataset report will be ready at {repo_url}"

def compare_dataset_splits(dataset, dataset_name, username, token, splits):
    """Randomly split a CSV into train/test parts and publish a comparison report.

    A fraction ``splits`` of the rows is sampled as the training split and the
    remaining rows form the test split. The two are compared with
    ``sv.compare`` and the report is published as a static Hugging Face Space.

    Args:
        dataset: Uploaded file object; its ``name`` attribute is the path of
            the CSV file to read.
        dataset_name: Name of the Space created under ``username``; also used
            as the README title.
        username: Hugging Face Hub account that will own the Space.
        token: Hugging Face Hub token passed to ``create_repo`` and
            ``upload_file``.
        splits: Fraction of rows for the training split, strictly between 0
            and 1 (e.g. 0.8).

    Returns:
        A message containing the repository URL returned by ``create_repo``.

    Raises:
        ValueError: If ``splits`` is not a number strictly between 0 and 1, or
            if the dataset is too small for both splits to be non-empty.
    """
    # Validate the cheap argument first, before touching the file or the Hub.
    if isinstance(splits, bool) or not isinstance(splits, (int, float)) or not 0 < splits < 1:
        raise ValueError(f"Split ratio must be a number strictly between 0 and 1, got {splits!r}.")

    df = pd.read_csv(dataset.name)
    train = df.sample(frac=splits)
    test = df.loc[df.index.difference(train.index)]
    if train.empty or test.empty:
        raise ValueError("The dataset is too small to produce two non-empty splits with this ratio.")

    compare_report = sv.compare([train, "Training Data"], [test, "Test Data"])

    repo_id = f"{username}/{dataset_name}"
    readme = f"---\ntitle: {dataset_name}\nemoji: ✨\ncolorFrom: green\ncolorTo: red\nsdk: static\npinned: false\ntags:\n- dataset-report\n---"

    # Work in a private temporary directory: fixed paths in the working
    # directory would be shared (and overwritten) by concurrent requests.
    with tempfile.TemporaryDirectory() as workdir:
        report_path = os.path.join(workdir, "index.html")
        readme_path = os.path.join(workdir, "README.md")

        compare_report.show_html(report_path, open_browser=False)
        # Explicit UTF-8: the README front matter contains an emoji.
        with open(readme_path, "w", encoding="utf-8") as f:
            f.write(readme)

        repo_url = create_repo(repo_id, repo_type="space", token=token, space_sdk="static", private=False)
        upload_file(path_or_fileobj=report_path, path_in_repo="index.html", repo_id=repo_id, repo_type="space", token=token)
        upload_file(path_or_fileobj=readme_path, path_in_repo="README.md", repo_id=repo_id, repo_type="space", token=token)

    return f"Your dataset report will be ready at {repo_url}"



# Gradio UI: one tab per analysis type. Each tab collects a CSV upload, the
# analysis-specific options and the Hub credentials, then wires its button to
# the matching function defined above.
#
# The names `dataset`, `token`, `username`, `dataset_name`, `inference_run`,
# `inference_progress` and `outcome` are rebound in every tab. Each `.click()`
# is registered before the next tab rebinds them, so every handler is wired to
# the components of its own tab.
with gr.Blocks() as demo:
    main_title = gr.Markdown("""# Easy Analysis🪄🌟✨""")
    main_desc = gr.Markdown("""This app enables you to run three type of dataset analysis and pushes the interactive reports to your Hugging Face Hub profile as a Space. It uses SweetViz in the back.""")
    with gr.Tabs():
        # Tab 1: whole-dataset analysis, optionally against a target column.
        with gr.TabItem("Analyze") as analyze:
            with gr.Row():
                with gr.Column():
                    title = gr.Markdown(""" ## Analyze Dataset """)
                    description = gr.Markdown("Analyze a dataset or predictive variables against a target variable in a dataset (enter a column name to column section if you want to compare against target value). You can also do pairwise analysis, but it has quadratic complexity.")
                    dataset = gr.File(label = "Dataset")
                    column = gr.Text(label = "Compare dataset against a target variable (Optional)")
                    # NOTE(review): no default is set for this radio, so the handler
                    # presumably receives None when nothing is selected -- confirm.
                    pairwise = gr.Radio(["off", "on"], label = "Enable pairwise analysis")
                    # NOTE(review): the token is entered in a plain textbox; consider a
                    # masked input if the installed gradio version supports one.
                    token = gr.Textbox(label = "Your Hugging Face Token")
                    username = gr.Textbox(label = "Your Hugging Face User Name")
                    dataset_name = gr.Textbox(label = "Dataset Name")
                    pushing_desc = gr.Markdown("This app needs your Hugging Face Hub user name, token and a unique name for your dataset report.")
                    inference_run = gr.Button("Infer")
                    # NOTE(review): gr.StatusTracker and gr.outputs.Textbox look tied to a
                    # specific gradio version -- confirm they exist in the pinned release.
                    inference_progress = gr.StatusTracker(cover_container=True)
                # Created inside the Row but outside the Column above.
                outcome = gr.outputs.Textbox()
                # Input order matches the parameters of analyze_datasets.
                inference_run.click(
                    analyze_datasets,
                    inputs=[dataset, dataset_name, username, token, column, pairwise],
                    outputs=outcome,
                    status_tracker=inference_progress,
                )
        # Tab 2: random split of the dataset, then comparison of the two parts.
        with gr.TabItem("Compare Splits") as compare_splits:
            with gr.Row():
                with gr.Column():
                    title = gr.Markdown(""" ## Compare Splits""")
                    description = gr.Markdown("Split a dataset and compare splits. You need to give a fraction, e.g. 0.8.")
                    dataset = gr.File(label = "Dataset")
                    split_ratio = gr.Number(label = "Split Ratios")
                    pushing_desc = gr.Markdown("This app needs your Hugging Face Hub user name, token and a unique name for your dataset report.")
                    token = gr.Textbox(label = "Your Hugging Face Token")
                    username = gr.Textbox(label = "Your Hugging Face User Name")
                    dataset_name = gr.Textbox(label = "Dataset Name")
                    inference_run = gr.Button("Infer")
                    inference_progress = gr.StatusTracker(cover_container=True)

                outcome = gr.outputs.Textbox()
                # Input order matches the parameters of compare_dataset_splits.
                inference_run.click(
                    compare_dataset_splits,
                    inputs=[dataset, dataset_name, username, token, split_ratio],
                    outputs=outcome,
                    status_tracker=inference_progress,
                )
        
        # Tab 3: compare the rows of one category of a column against the rest.
        with gr.TabItem("Compare Subsets") as compare_subsets:
            with gr.Row():
                with gr.Column():
                    title = gr.Markdown(""" ## Compare Subsets""")
                    description = gr.Markdown("Compare subsets of a dataset, e.g. you can pick Age Group column and compare adult category against young.")
                    dataset = gr.File(label = "Dataset")
                    column = gr.Text(label = "Enter column:")
                    category = gr.Text(label = "Enter category:")
                    pushing_desc = gr.Markdown("This app needs your Hugging Face Hub user name, token and a unique name for your dataset report.")
                    token = gr.Textbox(label = "Your Hugging Face Token")
                    username = gr.Textbox(label = "Your Hugging Face User Name")
                    dataset_name = gr.Textbox(label = "Dataset Name")
                    inference_run = gr.Button("Run Analysis")
                    inference_progress = gr.StatusTracker(cover_container=True)

                outcome = gr.outputs.Textbox()
                # Input order matches the parameters of compare_column_values.
                inference_run.click(
                    compare_column_values,
                    inputs=[dataset, dataset_name, username, token, column, category ],
                    outputs=outcome,
                    status_tracker=inference_progress,
                )

# Start the app at import time with debug mode enabled.
demo.launch(debug=True)