Spaces:
Sleeping
Sleeping
import gradio as gr | |
from functools import lru_cache | |
from hffs.fs import HfFileSystem | |
from typing import List, Tuple, Callable | |
import pandas as pd | |
import numpy as np | |
import pyarrow as pa | |
import pyarrow.parquet as pq | |
from functools import partial | |
from io import StringIO | |
from tqdm.contrib.concurrent import thread_map | |
from datasets import Features | |
class AppError(RuntimeError):
    """Error whose message is meant to be shown to the user in the UI."""
# Number of rows shown per page of results.
PAGE_SIZE: int = 20
def get_parquet_fs(dataset: str) -> HfFileSystem:
    """Open the ``refs/convert/parquet`` revision of a dataset repository.

    Args:
        dataset: Name of the dataset repository on the Hub.

    Returns:
        A filesystem rooted at the Parquet export of ``dataset``.

    Raises:
        AppError: If the filesystem cannot be opened or listed, or if the
            root of the revision contains a non-hidden regular file.
    """
    not_found = f"Parquet export doesn't exist for '{dataset}'."
    try:
        fs = HfFileSystem(dataset, repo_type="dataset", revision="refs/convert/parquet")
        # NOTE(review): presumably a valid export only has config directories
        # at its root, so a plain file there means this is not an export.
        has_root_files = any(fs.isfile(path) for path in fs.ls("") if not path.startswith("."))
    except Exception as err:
        # `Exception` (not a bare except) so Ctrl-C / SystemExit still propagate;
        # the original cause is chained for debugging.
        raise AppError(not_found) from err
    if has_root_files:
        raise AppError(not_found)
    return fs
def get_parquet_configs(dataset: str) -> List[str]:
    """List the directories found at the root of the dataset's Parquet export.

    Each root-level directory is treated as one config name.
    """
    parquet_fs = get_parquet_fs(dataset)
    configs = []
    for entry in parquet_fs.ls(""):
        if parquet_fs.isdir(entry):
            configs.append(entry)
    return configs
def _sorted_split_key(split: str) -> str: | |
return split if not split.startswith("train") else chr(0) + split # always "train" first | |
def get_parquet_splits(dataset: str, config: str) -> List[str]:
    """Return the split names found under ``config``, "train"-like splits first.

    The split name is parsed from each ``*.parquet`` file name after removing
    the extension and splitting on "-": for sharded files
    (``...-<split>-<i>-of-<n>``) it is the fourth piece from the end, otherwise
    it is the last piece.
    """
    parquet_fs = get_parquet_fs(dataset)
    split_names = set()
    for path in parquet_fs.glob(f"{config}/*.parquet"):
        stem = path.rsplit(".", 1)[0]
        pieces = stem.split("-")
        is_sharded = len(pieces) > 3 and pieces[-2] == "of"
        split_names.add(pieces[-4] if is_sharded else pieces[-1])
    return sorted(split_names, key=_sorted_split_key)
def sanitize_inputs(dataset: str, config: str, split: str, page: str) -> Tuple[str, str, str, int]:
    """Validate the raw UI inputs and convert the page to an integer.

    Args:
        dataset: Dataset name; must be non-empty.
        config: Config name; must be non-empty.
        split: Split name; must be non-empty.
        page: Page number as text; must parse to an integer greater than 0.

    Returns:
        ``(dataset, config, split, page)`` with ``page`` as an ``int``.

    Raises:
        AppError: If the page is not a positive integer, or if the dataset,
            config or split is empty. The page is checked first.
    """
    try:
        page_number = int(page)
    except (TypeError, ValueError, OverflowError):
        raise AppError(f"Bad page: {page}") from None
    # Explicit check rather than `assert`, which is stripped under `python -O`.
    if page_number <= 0:
        raise AppError(f"Bad page: {page_number}")
    if not dataset:
        raise AppError("Empty dataset name")
    if not config:
        raise AppError(f"Empty config. Available configs are: {', '.join(get_parquet_configs(dataset))}.")
    if not split:
        raise AppError(f"Empty split. Available splits are: {', '.join(get_parquet_splits(dataset, config))}.")
    return dataset, config, split, page_number
# One zero-argument callable per row group; calling it yields that row group
# as a pyarrow Table.
RowGroupReaders = List[Callable[[], pa.Table]]
def index(dataset: str, config: str, split: str) -> Tuple[np.ndarray, RowGroupReaders, int, str]:
    """Build a row-group index over the Parquet files of one split.

    Args:
        dataset: Dataset name.
        config: Config (root directory of the Parquet export).
        split: Split name used to match Parquet file names.

    Returns:
        A tuple ``(rg_offsets, rg_readers, max_page, info)``:
        ``rg_offsets`` is the cumulative row count after each row group,
        ``rg_readers`` holds one lazy reader per row group (same order),
        ``max_page`` is the number of ``PAGE_SIZE``-row pages, counting a
        trailing partial page, and ``info`` is a message naming unsupported
        columns (empty if all columns are supported).

    Raises:
        AppError: If no Parquet file matches the config/split.
    """
    fs = get_parquet_fs(dataset)
    # Match both the single-file and the sharded (`-XXXXX-of-YYYYY`) layouts.
    sources = fs.glob(f"{config}/*-{split}.parquet") + fs.glob(f"{config}/*-{split}-*-of-*.parquet")
    if not sources:
        configs = get_parquet_configs(dataset)
        if config not in configs:
            raise AppError(f"Invalid config {config}. Available configs are: {', '.join(configs)}.")
        raise AppError(f"Invalid split {split}. Available splits are: {', '.join(get_parquet_splits(dataset, config))}.")
    all_pf: List[pq.ParquetFile] = thread_map(partial(pq.ParquetFile, filesystem=fs), sources)
    # The schema is read from the first file only.
    features = Features.from_arrow_schema(all_pf[0].schema.to_arrow_schema())
    unsupported_markers = ("Image(", "Audio(", "'binary'")
    columns = [
        col
        for col in features
        if not any(marker in str(features[col]) for marker in unsupported_markers)
    ]
    info = "" if len(columns) == len(features) else f"Some columns are not supported yet: {sorted(set(features) - set(columns))}"
    # Enumerate row groups once so offsets and readers are guaranteed aligned.
    row_groups = [(pf, i) for pf in all_pf for i in range(pf.metadata.num_row_groups)]
    rg_offsets = np.cumsum([pf.metadata.row_group(i).num_rows for pf, i in row_groups])
    rg_readers = [partial(pf.read_row_group, i, columns=columns) for pf, i in row_groups]
    # Guard against files with no row groups (empty cumsum has no last item).
    total_rows = int(rg_offsets[-1]) if len(rg_offsets) else 0
    # Ceiling division: a trailing partial page still counts as a page.
    max_page = -(-total_rows // PAGE_SIZE)
    return rg_offsets, rg_readers, max_page, info
def query(page: int, page_size: int, rg_offsets: np.ndarray, rg_readers: RowGroupReaders) -> pd.DataFrame:
    """Read one page of rows using the row-group index.

    Args:
        page: 1-based page number.
        page_size: Number of rows per page.
        rg_offsets: Cumulative row count after each row group.
        rg_readers: One lazy reader per row group, aligned with ``rg_offsets``.

    Returns:
        The rows of the requested page as a DataFrame. The last page may
        contain fewer than ``page_size`` rows.

    Raises:
        AppError: If the page is below 1 or starts past the last row.
    """
    total_rows = int(rg_offsets[-1]) if len(rg_offsets) else 0
    start_row = (page - 1) * page_size
    if page < 1 or start_row >= total_rows:
        raise AppError(f"Page {page} does not exist")
    # Clamp so the final (possibly partial) page is readable.
    end_row = min(page * page_size, total_rows)
    # Row groups holding the first and the last *included* row. Looking up
    # `end_row` itself would point one past the final row group on the last page.
    start_rg, end_rg = np.searchsorted(rg_offsets, [start_row, end_row - 1], side="right")
    pa_table = pa.concat_tables([rg_readers[i]() for i in range(start_rg, end_rg + 1)])
    # Position of `start_row` relative to the first row group that was read.
    offset = start_row - int(rg_offsets[start_rg - 1]) if start_rg else start_row
    pa_table = pa_table.slice(offset, end_row - start_row)
    return pa_table.to_pandas()
def get_page(dataset: str, config: str, split: str, page: str) -> Tuple[str, int, str]:
    """Fetch one page of a dataset split, serialized as JSON Lines.

    Returns:
        ``(jsonl, max_page, info)``: the page rows with one JSON record per
        line, the page count from ``index``, and its info message.

    Raises:
        AppError: Propagated from input validation, indexing or querying.
    """
    dataset, config, split, page_number = sanitize_inputs(dataset, config, split, page)
    offsets, readers, max_page, info = index(dataset, config, split)
    page_df = query(page_number, PAGE_SIZE, rg_offsets=offsets, rg_readers=readers)
    jsonl_buffer = StringIO()
    page_df.to_json(jsonl_buffer, lines=True, orient="records")
    jsonl = jsonl_buffer.getvalue()
    return jsonl, max_page, info
# UI definition: components, the handlers that update them, and event wiring.
# Every handler returns a dict mapping components to `gr.update(...)` values.
with gr.Blocks() as demo:
    gr.Markdown("# 📖 Dataset Explorer\n\nAccess any slice of data of any dataset on the [Hugging Face Dataset Hub](https://huggingface.co/datasets)")
    cp_dataset = gr.Textbox("squad", label="Pick a dataset", placeholder="squad")
    cp_go = gr.Button("Explore")
    cp_config = gr.Dropdown(["plain_text"], value="plain_text", label="Config", visible=False)
    cp_split = gr.Dropdown(["train", "validation"], value="train", label="Split", visible=False)
    # NOTE(review): row membership inferred (page box + button side by side) — confirm.
    with gr.Row():
        cp_page = gr.Textbox("1", label="Page", placeholder="1", visible=False)
        cp_goto_page = gr.Button("Go to page", visible=False)
    cp_error = gr.Markdown("", visible=False)
    cp_info = gr.Markdown("", visible=False)
    cp_result = gr.Markdown("", visible=False)

    def show_error(message: str) -> dict:
        """Show ``message`` in the error area and hide the info and result areas."""
        return {
            cp_error: gr.update(visible=True, value=f"## ❌ Error:\n\n{message}"),
            cp_info: gr.update(visible=False, value=""),
            cp_result: gr.update(visible=False, value=""),
        }

    def show_dataset_at_config_and_split_and_page(dataset: str, config: str, split: str, page: str) -> dict:
        """Render one page of results, or the error message if it cannot be loaded."""
        try:
            jsonl_result, max_page, info = get_page(dataset, config, split, page)
            info = f"({info})" if info else ""
            return {
                cp_result: gr.update(visible=True, value=f"```json\n{jsonl_result}\n```"),
                cp_info: gr.update(visible=True, value=f"Page {page}/{max_page} {info}"),
                cp_error: gr.update(visible=False, value=""),
            }
        except AppError as err:
            return show_error(str(err))

    def show_dataset_at_config_and_split(dataset: str, config: str, split: str) -> dict:
        """Show page 1 of the given split and reveal the paging controls."""
        try:
            return {
                **show_dataset_at_config_and_split_and_page(dataset, config, split, "1"),
                cp_page: gr.update(value="1", visible=True),
                cp_goto_page: gr.update(visible=True),
            }
        except AppError as err:
            return show_error(str(err))

    def show_dataset_at_config(dataset: str, config: str) -> dict:
        """Select the first split of ``config`` and show its first page.

        The split dropdown is only made visible when there is more than one split.
        """
        try:
            splits = get_parquet_splits(dataset, config)
            if not splits:
                raise AppError(f"Dataset {dataset} with config {config} has no splits.")
            split = splits[0]
            return {
                **show_dataset_at_config_and_split(dataset, config, split),
                cp_split: gr.update(value=split, choices=splits, visible=len(splits) > 1),
            }
        except AppError as err:
            return show_error(str(err))

    def show_dataset(dataset: str) -> dict:
        """Select the first config of ``dataset`` and show its first split.

        The config dropdown is only made visible when there is more than one config.
        """
        try:
            configs = get_parquet_configs(dataset)
            if not configs:
                raise AppError(f"Dataset {dataset} has no configs.")
            config = configs[0]
            return {
                **show_dataset_at_config(dataset, config),
                cp_config: gr.update(value=config, choices=configs, visible=len(configs) > 1),
            }
        except AppError as err:
            return show_error(str(err))

    # Every handler may update any of these components.
    all_outputs = [cp_config, cp_split, cp_page, cp_goto_page, cp_result, cp_info, cp_error]
    cp_go.click(show_dataset, inputs=[cp_dataset], outputs=all_outputs)
    cp_config.change(show_dataset_at_config, inputs=[cp_dataset, cp_config], outputs=all_outputs)
    cp_split.change(show_dataset_at_config_and_split, inputs=[cp_dataset, cp_config, cp_split], outputs=all_outputs)
    cp_goto_page.click(show_dataset_at_config_and_split_and_page, inputs=[cp_dataset, cp_config, cp_split, cp_page], outputs=all_outputs)
# Start the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()