from typing import Optional

import gradio as gr
import pandas as pd
import pyarrow.parquet as pq
from gradio_huggingfacehub_search import HuggingfaceHubSearch
from huggingface_hub import HfFileSystem

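# Light CSS tweaks so the Settings accordion blends into the page background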
css = """
.settings {
    background: transparent;
}
.settings button span {
    color: var(--body-text-color-subdued);
}
"""

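# UI layout: dataset search box, revision and Parquet file dropdowns, and a dataframe preview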
with gr.Blocks(css=css) as demo:
    with gr.Row():
        with gr.Column(scale=10):
            gr.Markdown("# πŸ‘€ Parquet Viewer πŸ“š")
            gr.Markdown("View the content of Parquet files inside a dataset repository or pull request.")
            dataset_search = HuggingfaceHubSearch(
                label="Hub Dataset ID",
                placeholder="Search for dataset id on Huggingface",
                search_type="dataset",
            )
            with gr.Row():
                revision_dropdown = gr.Dropdown("main", label="Revision", allow_custom_value=True)
                parquet_file_dropdown = gr.Dropdown(label="Parquet file", allow_custom_value=True)
            gr.Markdown("Parquet content:")
            output_dataframe = gr.DataFrame()
        with gr.Column(scale=4, min_width="200px"):
            with gr.Accordion("Settings", open=False, elem_classes="settings"):
                gr.Markdown("Access private/gated repos")
                gr.LoginButton()

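    # When a dataset is selected, list its revisions: the main branch plus any open pull requests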
    @dataset_search.change(inputs=[dataset_search], outputs=[revision_dropdown, parquet_file_dropdown, output_dataframe])
    def dataset_update(dataset, oauth_token: Optional[gr.OAuthToken] = None):
        fs = HfFileSystem(token=oauth_token.token if oauth_token else None)
        if "/" not in dataset:
            return {revision_dropdown: gr.Dropdown(choices=[], value="", info="")}
        try:
            prs = [f"{dataset}@refs/pr/{pr.num}" for pr in fs._api.get_repo_discussions(dataset, repo_type="dataset", discussion_type="pull_request")]
            revision = f"{dataset}@main"
            return {revision_dropdown: gr.Dropdown(choices=[revision] + prs, value=revision, info=f"{len(prs)} pull request{'s' if len(prs) > 1 else ''} available" if prs else None)}
        except Exception:
            return {revision_dropdown: gr.Dropdown(choices=[], value="", info="no revisions available")}

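    # When a revision is selected, list the Parquet files it contains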
    @revision_dropdown.change(inputs=[revision_dropdown], outputs=[parquet_file_dropdown, output_dataframe])
    def revision_update(dataset_and_revision, oauth_token: Optional[gr.OAuthToken] = None):
        fs = HfFileSystem(token=oauth_token.token if oauth_token else None)
        try:
            parquet_files = ["hf://" + path for path in fs.glob(f"datasets/{dataset_and_revision}/**/*.parquet")]
            parquet_file = parquet_files[0] if parquet_files else None
            return {parquet_file_dropdown: gr.Dropdown(choices=parquet_files, value=parquet_file, info=f"{len(parquet_files)} parquet file{'s' if len(parquet_files) > 1 else ''} available")}
        except Exception:
            return {parquet_file_dropdown: gr.Dropdown(choices=[], value="", info="")}
        
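    # When a Parquet file is selected, show its first row group in the dataframe preview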
    @parquet_file_dropdown.change(inputs=[parquet_file_dropdown], outputs=[output_dataframe])
    def parquet_file_update(parquet_file, oauth_token: Optional[gr.OAuthToken] = None):
        fs = HfFileSystem(token=oauth_token.token if oauth_token else None)
        try:
            if not parquet_file:
                return {output_dataframe: pd.DataFrame()}
            # Read only the first row group and truncate long cell values to 1000 characters for display
            first_row_group = pq.ParquetFile(parquet_file, filesystem=fs).read_row_group(0)
            rows = [{k: str(v)[:1000] for k, v in row.items()} for row in first_row_group.to_pylist()]
            return {output_dataframe: pd.DataFrame(rows)}
        except Exception:
            return {output_dataframe: []}


demo.launch()