# NOTE: the "Spaces: Sleeping" page banner that preceded this code was scrape residue, not program text.
from typing import Optional

import gradio as gr
import pandas as pd
import pyarrow.parquet as pq
from gradio_huggingfacehub_search import HuggingfaceHubSearch
from huggingface_hub import HfApi, HfFileSystem
# Custom stylesheet handed to gr.Blocks(css=...): it targets elements carrying
# the "settings" class, giving them a transparent background and subdued
# button-label text.
css = """
.settings {
    background: transparent;
}
.settings button span {
    color: var(--body-text-color-subdued);
}
"""
# UI layout: a wide main column (dataset search, revision / parquet-file
# dropdowns, dataframe preview) next to a narrow column holding a collapsed
# "Settings" accordion with the login button.
with gr.Blocks(css=css) as demo:
    with gr.Row():
        with gr.Column(scale=10):
            # NOTE(review): the "π" characters look like a mis-decoded emoji from
            # the original title; left untouched because the intended glyphs
            # cannot be recovered from this file.
            gr.Markdown("# π Parquet Viewer π")
            gr.Markdown("View the content of Parquet files inside a dataset repository or pull request.")
            dataset_search = HuggingfaceHubSearch(
                label="Hub Dataset ID",
                placeholder="Search for dataset id on Huggingface",
                search_type="dataset",
            )
            with gr.Row():
                # The first positional parameter of gr.Dropdown is `choices`, which
                # must be a list: a bare "main" string would be iterated into the
                # single-character choices "m", "a", "i", "n".
                revision_dropdown = gr.Dropdown(choices=["main"], label="Revision", allow_custom_value=True)
                parquet_file_dropdown = gr.Dropdown(label="Parquet file", allow_custom_value=True)
            gr.Markdown("Parquet content:")
            output_dataframe = gr.DataFrame()
        # `min_width` is an integer number of pixels, not a CSS length string.
        with gr.Column(scale=4, min_width=200):
            with gr.Accordion("Settings", open=False, elem_classes="settings"):
                gr.Markdown("Access private/gated repos")
                gr.LoginButton()
def dataset_update(dataset, oauth_token: Optional[gr.OAuthToken] = None):
    """Build the revision dropdown update for a dataset repo id.

    Args:
        dataset: Dataset repo id. Only values containing a "/" are looked up
            on the Hub; anything else (including ``None`` or an empty string)
            resets the dropdown.
        oauth_token: Optional OAuth token object; its ``.token`` is forwarded
            to the Hub client. When ``None``, no explicit token is passed.

    Returns:
        A ``{revision_dropdown: gr.Dropdown(...)}`` mapping. On success the
        choices are ``"<dataset>@main"`` followed by one
        ``"<dataset>@refs/pr/<num>"`` entry per pull request, with
        ``"<dataset>@main"`` selected. On any lookup error the dropdown is
        emptied and its info text reads "no revisions available".
    """
    # Validate first: avoids a TypeError on None and skips building a Hub
    # client for input that cannot be a repo id.
    if not dataset or "/" not in dataset:
        return {revision_dropdown: gr.Dropdown(choices=[], value="", info="")}

    token = oauth_token.token if oauth_token else None
    try:
        # Public HfApi entry point instead of the filesystem's private `_api`.
        discussions = HfApi(token=token).get_repo_discussions(
            dataset, repo_type="dataset", discussion_type="pull_request"
        )
        prs = [f"{dataset}@refs/pr/{pr.num}" for pr in discussions]
    except Exception:
        # Deliberate best-effort: any Hub failure (missing repo, no access,
        # network error) is surfaced to the user as an empty dropdown.
        return {revision_dropdown: gr.Dropdown(choices=[], value="", info="no revisions available")}

    revision = f"{dataset}@main"
    # No info line at all when the repo has no pull requests.
    info = f"{len(prs)} pull request{'s' if len(prs) > 1 else ''} available" if prs else None
    return {revision_dropdown: gr.Dropdown(choices=[revision] + prs, value=revision, info=info)}
def revision_update(dataset_and_revision, oauth_token: Optional[gr.OAuthToken] = None):
    """Build the parquet-file dropdown update for a ``<dataset>@<revision>`` value.

    Args:
        dataset_and_revision: String interpolated into the glob pattern
            ``datasets/<dataset_and_revision>/**/*.parquet``.
        oauth_token: Optional OAuth token object; its ``.token`` is forwarded
            to ``HfFileSystem``. When ``None``, no explicit token is passed.

    Returns:
        A ``{parquet_file_dropdown: gr.Dropdown(...)}`` mapping. On success
        the choices are the matching files as ``hf://`` paths, with the first
        one selected (``None`` when there are no matches). On an empty input
        or any listing error the dropdown is emptied.
    """
    empty = {parquet_file_dropdown: gr.Dropdown(choices=[], value="", info="")}
    # Nothing selected: skip the Hub round-trip entirely.
    if not dataset_and_revision:
        return empty

    fs = HfFileSystem(token=oauth_token.token if oauth_token else None)
    try:
        paths = fs.glob(f"datasets/{dataset_and_revision}/**/*.parquet")
    except Exception:
        # Deliberate best-effort: a failed listing just clears the dropdown.
        return empty

    parquet_files = ["hf://" + path for path in paths]
    parquet_file = parquet_files[0] if parquet_files else None
    count = len(parquet_files)
    # Singular only for exactly one file, so zero reads "0 parquet files".
    plural = "" if count == 1 else "s"
    return {
        parquet_file_dropdown: gr.Dropdown(
            choices=parquet_files,
            value=parquet_file,
            info=f"{count} parquet file{plural} available",
        )
    }
def parquet_file_update(parquet_file, oauth_token: Optional[gr.OAuthToken] = None):
    """Build the dataframe update previewing the first row group of a Parquet file.

    Args:
        parquet_file: Path of the Parquet file to read through ``HfFileSystem``;
            a falsy value yields an empty dataframe.
        oauth_token: Optional OAuth token object; its ``.token`` is forwarded
            to ``HfFileSystem``. When ``None``, no explicit token is passed.

    Returns:
        A ``{output_dataframe: ...}`` mapping. On success the value is a
        ``pandas.DataFrame`` of row group 0 with every cell converted to a
        string and truncated to 1000 characters. On any read error the value
        is an empty list.
    """
    max_cell_chars = 1000  # cap on the string length of each previewed cell

    # Nothing selected: no need to build a filesystem client.
    if not parquet_file:
        return {output_dataframe: pd.DataFrame([])}

    fs = HfFileSystem(token=oauth_token.token if oauth_token else None)
    try:
        # Only the first row group is read, not the whole file.
        rows = pq.ParquetFile(parquet_file, filesystem=fs).read_row_group(0).to_pylist()
        preview = [{k: str(v)[:max_cell_chars] for k, v in row.items()} for row in rows]
        return {output_dataframe: pd.DataFrame(preview)}
    except Exception:
        # Deliberate best-effort: unreadable or inaccessible files show an
        # empty table instead of raising.
        return {output_dataframe: []}
# Start serving the Blocks app.
demo.launch()