File size: 5,442 Bytes
a66cfba
 
 
 
 
 
 
2b0a4af
a66cfba
 
2b0a4af
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a66cfba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
from typing import List, Dict
import httpx
import gradio as gr
import pandas as pd
import json

async def get_splits(dataset_name: str) -> Dict[str, List[Dict]]:
    """Fetch the config/split combinations advertised for a Hub dataset.

    Args:
        dataset_name: Dataset repository id, interpolated into the
            ``/api/datasets/`` URL.

    Returns:
        ``{"splits": [...]}`` where every entry is a dict with the keys
        ``"dataset"``, ``"config"`` and ``"split"``. The list is empty when
        the payload has no ``"config"`` mapping.
    """
    URL = f"https://huggingface.co/api/datasets/{dataset_name}"
    async with httpx.AsyncClient() as session:
        response = await session.get(URL)
        dataset_info = response.json()
        # NOTE(review): this relies on a top-level "config" mapping whose values
        # carry a "splits" iterable -- confirm against the current Hub API.
        return {
            "splits": [
                # "dataset" is included because update_splits reads it back
                # from the first entry of this list.
                {"dataset": dataset_name, "config": config_name, "split": split_name}
                for config_name, config_info in dataset_info.get("config", {}).items()
                for split_name in config_info.get("splits", [])
            ]
        }

async def get_first_rows(dataset: str, config: str, split: str) -> Dict[str, Dict[str, List[Dict]]]:
    """Download ``dataset_info.json`` for *dataset* and wrap up to ten examples.

    Args:
        dataset: Dataset repository id, interpolated into the resolve URL.
        config: Accepted for signature parity with the callers; not read here.
        split: Key looked up under the payload's ``"splits"`` mapping.

    Returns:
        ``{"rows": [{"row": example}, ...]}`` holding at most the first ten
        items of the split's ``"examples"`` sequence.

    Raises:
        KeyError: If ``"splits"``, *split* or ``"examples"`` is absent from
            the downloaded JSON.
    """
    info_url = f"https://huggingface.co/datasets/{dataset}/resolve/main/dataset_info.json"
    async with httpx.AsyncClient() as client:
        reply = await client.get(info_url)
    payload = reply.json()
    examples = payload["splits"][split]["examples"]
    return {"rows": [{"row": example} for example in examples[:10]]}

# Guido van Rossum: https://www.youtube.com/watch?v=-DVyjdw4t9I
async def update_URL(dataset: str, config: str, split: str) -> str:
    """Return the Hub "tree" URL for the given dataset, config and split.

    The three arguments are interpolated verbatim, in that order, as path
    segments after ``/datasets/`` and ``/tree/main/``.
    """
    return f"https://huggingface.co/datasets/{dataset}/tree/main/{config}/{split}"

    


async def get_valid_datasets() -> List[str]:
    """Return the ``"id"`` of every dataset listed by the Hub datasets API.

    Returns:
        The ids in the order the API lists them, or an empty list when the
        response body is not JSON or does not have the expected shape (a
        sequence of mappings that each carry an ``"id"`` key).
    """
    URL = "https://huggingface.co/api/datasets"
    async with httpx.AsyncClient() as session:
        response = await session.get(URL)
    try:
        return [dataset["id"] for dataset in response.json()]
    except (KeyError, TypeError, json.JSONDecodeError):
        # TypeError covers payloads that are not a list of mappings, e.g. a
        # JSON object: iterating it yields string keys, and key["id"] fails.
        return []


def get_df_from_rows(api_output):
    """Build a DataFrame from a ``{"rows": [{"row": {...}}, ...]}`` payload.

    Args:
        api_output: Mapping whose ``"rows"`` entry is an iterable of mappings,
            each holding one record under the ``"row"`` key.

    Returns:
        A ``pandas.DataFrame`` with one row per record. No sorting is applied:
        records keep the order in which they appear in ``api_output["rows"]``.

    Raises:
        KeyError: If ``"rows"`` or a record's ``"row"`` key is missing.
    """
    # The previous sort_values(..., inplace=False) call discarded its result,
    # so it never affected the returned frame; it is dropped rather than
    # "fixed" to keep the displayed data unchanged.
    return pd.DataFrame([row["row"] for row in api_output["rows"]])

async def update_configs(dataset_name: str):
    """Refresh the config dropdown after a dataset has been chosen.

    Args:
        dataset_name: Dataset id passed straight to ``get_splits``.

    Returns:
        A ``(dropdown_update, splits)`` tuple: the update lists the sorted,
        de-duplicated config names and pre-selects the first one (or nothing
        when there are none); ``splits`` is the raw ``get_splits`` result.
    """
    splits = await get_splits(dataset_name)
    all_configs = sorted({s["config"] for s in splits["splits"]})
    # Guard against datasets that report no configs: indexing [0] would raise.
    default_config = all_configs[0] if all_configs else None
    return (gr.Dropdown.update(choices=all_configs, value=default_config),
            splits)

async def update_splits(config_name: str, state: gr.State):
    """Refresh the split dropdown and preview table after a config change.

    Args:
        config_name: Config whose splits should be offered.
        state: Value of the splits state; expected to be a mapping with a
            ``"splits"`` list of dicts carrying ``"config"`` and ``"split"``
            (and optionally ``"dataset"``). May be ``None`` before any
            dataset has been selected.

    Returns:
        A ``(dropdown_update, dataframe)`` tuple. The update lists the sorted
        split names for *config_name* and pre-selects the first one. The
        dataframe holds the preview for that split, or is empty when there is
        no split to show or the state entries do not name their dataset.
    """
    entries = (state or {}).get("splits", [])
    splits_for_config = sorted({s["split"] for s in entries if s["config"] == config_name})
    first_split = splits_for_config[0] if splits_for_config else None
    dropdown = gr.Dropdown.update(choices=splits_for_config, value=first_split)

    # The dataset id is read from the state entries; fall back to an empty
    # preview instead of raising when it (or any split) is unavailable.
    dataset_name = entries[0].get("dataset") if entries else None
    if first_split is None or dataset_name is None:
        return (dropdown, pd.DataFrame())

    dataset = await update_dataset(first_split, config_name, dataset_name)
    return (dropdown, dataset)

async def update_dataset(split_name: str, config_name: str, dataset_name: str):
    """Fetch the first rows for the selection and return them as a DataFrame.

    Note the parameter order (split, config, dataset) is the reverse of the
    order in which the values are forwarded to ``get_first_rows``.
    """
    return get_df_from_rows(
        await get_first_rows(dataset_name, config_name, split_name)
    )

   
async def openurl(URL: str) -> str:
    """Render *URL* as an HTML anchor that opens in a new tab.

    Args:
        URL: Link target; also used as the visible link text. It is
            HTML-escaped before interpolation because it may come from a
            free-text input.

    Returns:
        An ``<a>`` element string with quoted ``href``, ``target`` and
        ``rel`` attributes.
    """
    from html import escape  # local import: html is not imported at file level

    # Escape quotes as well so the value cannot break out of the href attribute.
    safe_url = escape(str(URL), quote=True)
    return f'<a href="{safe_url}" target="_blank" rel="noopener noreferrer">{safe_url}</a>'

async def _load_dataset_choices():
    """Populate the dataset dropdown's choices with the ids from the Hub."""
    # A plain list returned to a Dropdown output is taken as its value, so the
    # ids are wrapped in an update that targets the choices instead.
    return gr.Dropdown.update(choices=await get_valid_datasets())


with gr.Blocks() as demo:
    gr.Markdown("<h1><center>🥫Datasetter📊 Datasets Analyzer and Transformer</center></h1>")
    gr.Markdown("""<div align="center">Curated Datasets: <a href = "https://www.kaggle.com/datasets">Kaggle</a>. <a href="https://www.nlm.nih.gov/research/umls/index.html">NLM UMLS</a>.  <a href="https://loinc.org/downloads/">LOINC</a>. <a href="https://www.cms.gov/medicare/icd-10/2022-icd-10-cm">ICD10 Diagnosis</a>. <a href="https://icd.who.int/dev11/downloads">ICD11</a>.  <a href="https://paperswithcode.com/datasets?q=medical&v=lst&o=newest">Papers,Code,Datasets for SOTA in Medicine</a>.   <a href="https://paperswithcode.com/datasets?q=mental&v=lst&o=newest">Mental</a>.  <a href="https://paperswithcode.com/datasets?q=behavior&v=lst&o=newest">Behavior</a>. <a href="https://www.cms.gov/medicare-coverage-database/downloads/downloads.aspx">CMS Downloads</a>.  <a href="https://www.cms.gov/medicare/fraud-and-abuse/physicianselfreferral/list_of_codes">CMS CPT and HCPCS Procedures and Services</a>  """)

    # Holds the dict returned by get_splits for the currently selected dataset.
    splits_data = gr.State()

    with gr.Row():
        dataset_name = gr.Dropdown(label="Dataset", interactive=True)
        config = gr.Dropdown(label="Subset", interactive=True)
        split = gr.Dropdown(label="Split", interactive=True)

    with gr.Row():
        URLcenter = gr.Textbox(label="Dataset URL", placeholder="URL")
        btn = gr.Button("Use Dataset")
        URLoutput = gr.HTML(label="Output")

    with gr.Row():
        dataset = gr.DataFrame(wrap=True, interactive=True)

    demo.load(_load_dataset_choices, inputs=None, outputs=[dataset_name])

    # Cascade: dataset -> configs -> splits -> preview table.
    dataset_name.change(update_configs, inputs=[dataset_name], outputs=[config, splits_data])
    config.change(update_splits, inputs=[config, splits_data], outputs=[split, dataset])
    split.change(update_dataset, inputs=[split, config, dataset_name], outputs=[dataset])

    # update_URL takes (dataset, config, split), so the inputs are listed in
    # that order. It is re-run on every selection change so the URL does not
    # keep the config/split of the previously selected dataset.
    for selector in (dataset_name, config, split):
        selector.change(update_URL, inputs=[dataset_name, config, split], outputs=[URLcenter])

    btn.click(openurl, [URLcenter], URLoutput)

demo.launch(debug=True)