support images
app.py
CHANGED
@@ -7,18 +7,72 @@ import numpy as np
 import pyarrow as pa
 import pyarrow.parquet as pq
 from functools import partial
-from io import StringIO
 from tqdm.contrib.concurrent import thread_map
-from datasets import Features
-from fastapi import FastAPI
+from datasets import Features, Image, Audio
+from fastapi import FastAPI, Response
 import uvicorn
+import os
 
 
 class AppError(RuntimeError):
     pass
 
 
+APP_URL = "http://127.0.0.1:7860" if os.getenv("DEV") else "https://lhoestq-datasets-explorer.hf.space"
 PAGE_SIZE = 20
+MAX_CACHED_BLOBS = PAGE_SIZE * 10
+_blobs_cache = {}
+
+#####################################################
+# Define routes for image and audio files
+#####################################################
+
+app = FastAPI()
+
+
+@app.get(
+    "/image",
+    responses={200: {"content": {"image/png": {}}}},
+    response_class=Response,
+)
+def image(id: str):
+    blob = get_blob(id)
+    return Response(content=blob, media_type="image/png")
+
+
+@app.get(
+    "/audio",
+    responses={200: {"content": {"audio/wav": {}}}},
+    response_class=Response,
+)
+def audio(id: str):
+    blob = get_blob(id)
+    return Response(content=blob, media_type="audio/wav")
+
+
+def push_blob(blob: bytes, blob_id: str) -> str:
+    global _blobs_cache
+    if blob_id in _blobs_cache:
+        del _blobs_cache[blob_id]
+    _blobs_cache[blob_id] = blob
+    if len(_blobs_cache) > MAX_CACHED_BLOBS:
+        del _blobs_cache[next(iter(_blobs_cache))]
+    return blob_id
+
+
+def get_blob(blob_id: str) -> bytes:
+    global _blobs_cache
+    return _blobs_cache[blob_id]
+
+
+def blobs_to_urls(blobs: List[bytes], type: str, prefix: str) -> List[str]:
+    image_blob_ids = [push_blob(blob, f"{prefix}-{i}") for i, blob in enumerate(blobs)]
+    return [APP_URL + f"/{type}?id={blob_id}" for blob_id in image_blob_ids]
+
+
+#####################################################
+# List configs, splits and parquet files
+#####################################################
 
 
 @lru_cache(maxsize=128)
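Note on the new blob cache: Python dicts preserve insertion order, so `next(iter(_blobs_cache))` is the oldest entry and `push_blob` evicts first-in-first-out once `MAX_CACHED_BLOBS` is exceeded, while re-pushing an existing id refreshes its position. A standalone sketch of that behavior, with the limit shrunk to 3 for illustration:

```python
# Standalone sketch of the blob cache above, with the limit shrunk to 3.
_blobs_cache = {}
MAX_CACHED_BLOBS = 3

def push_blob(blob: bytes, blob_id: str) -> str:
    if blob_id in _blobs_cache:
        del _blobs_cache[blob_id]  # re-pushing refreshes the entry's position
    _blobs_cache[blob_id] = blob
    if len(_blobs_cache) > MAX_CACHED_BLOBS:
        del _blobs_cache[next(iter(_blobs_cache))]  # evict the oldest entry
    return blob_id

for i in range(4):
    push_blob(b"...", f"blob-{i}")
print(list(_blobs_cache))  # ['blob-1', 'blob-2', 'blob-3'] -- blob-0 was evicted
```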
@@ -48,26 +102,17 @@ def get_parquet_splits(dataset: str, config: str) -> List[str]:
     all_parts = [path.rsplit(".", 1)[0].split("-") for path in fs.glob(f"{config}/*.parquet")]
     return sorted(set(parts[-4] if len(parts) > 3 and parts[-2] == "of" else parts[-1] for parts in all_parts), key=_sorted_split_key)
 
-def sanitize_inputs(dataset: str, config: str, split: str, page: str) -> Tuple[str, str, str, int]:
-    try:
-        page = int(page)
-        assert page > 0
-    except:
-        raise AppError(f"Bad page: {page}")
-    if not dataset:
-        raise AppError("Empty dataset name")
-    if not config:
-        raise AppError(f"Empty config. Available configs are: {', '.join(get_parquet_configs(dataset))}.")
-    if not split:
-        raise AppError(f"Empty split. Available splits are: {', '.join(get_parquet_splits(dataset, config))}.")
-    return dataset, config, split, int(page)
+
+#####################################################
+# Index and query Parquet data
+#####################################################
 
 
 RowGroupReaders = List[Callable[[], pa.Table]]
 
 
 @lru_cache(maxsize=128)
-def index(dataset: str, config: str, split: str) -> Tuple[np.ndarray, RowGroupReaders, int, str]:
+def index(dataset: str, config: str, split: str) -> Tuple[np.ndarray, RowGroupReaders, int, Features]:
     fs = get_parquet_fs(dataset)
     sources = fs.glob(f"{config}/*-{split}.parquet") + fs.glob(f"{config}/*-{split}-*-of-*.parquet")
     if not sources:
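For context, the expression in `get_parquet_splits` (the context lines at the top of this hunk) recovers split names from the Hub's parquet shard naming, which is either `*-<split>.parquet` or `*-<split>-<i>-of-<n>.parquet`. A quick illustration, with made-up file names:

```python
# Illustration of the split-name extraction; the file names are made up.
paths = [
    "plain_text/ds-train-00000-of-00002.parquet",
    "plain_text/ds-train-00001-of-00002.parquet",
    "plain_text/ds-validation.parquet",
]
all_parts = [path.rsplit(".", 1)[0].split("-") for path in paths]
splits = set(parts[-4] if len(parts) > 3 and parts[-2] == "of" else parts[-1] for parts in all_parts)
print(splits)  # {'train', 'validation'}
```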
@@ -78,12 +123,10 @@ def index(dataset: str, config: str, split: str) -> Tuple[np.ndarray, RowGroupRe
     desc = f"{dataset}/{config}/{split}"
     all_pf: List[pq.ParquetFile] = thread_map(partial(pq.ParquetFile, filesystem=fs), sources, desc=desc, unit="pq")
     features = Features.from_arrow_schema(all_pf[0].schema.to_arrow_schema())
-    columns = [col for col in features if all(bad_type not in str(features[col]) for bad_type in ["Image(", "Audio(", "'binary'"])]
-    info = "" if len(columns) == len(features) else f"Some columns are not supported yet: {sorted(set(features) - set(columns))}"
     rg_offsets = np.cumsum([pf.metadata.row_group(i).num_rows for pf in all_pf for i in range(pf.metadata.num_row_groups)])
-    rg_readers = [partial(pf.read_row_group, i, columns=columns) for pf in all_pf for i in range(pf.metadata.num_row_groups)]
+    rg_readers = [partial(pf.read_row_group, i) for pf in all_pf for i in range(pf.metadata.num_row_groups)]
     max_page = 1 + (rg_offsets[-1] - 1) // PAGE_SIZE
-    return rg_offsets, rg_readers, max_page, info
+    return rg_offsets, rg_readers, max_page, features
 
 
 def query(page: int, page_size: int, rg_offsets: np.ndarray, rg_readers: RowGroupReaders) -> pd.DataFrame:
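The pagination bookkeeping in `index` is dense; with invented row-group sizes, here is what `rg_offsets` and `max_page` evaluate to:

```python
import numpy as np

PAGE_SIZE = 20
# Invented example: three Parquet row groups holding 100, 80 and 120 rows.
rg_offsets = np.cumsum([100, 80, 120])            # array([100, 180, 300])
max_page = 1 + (rg_offsets[-1] - 1) // PAGE_SIZE  # 1 + 299 // 20 == 15
print(rg_offsets, max_page)
```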
@@ -97,30 +140,70 @@ def query(page: int, page_size: int, rg_offsets: np.ndarray, rg_readers: RowGrou
     return pa_table.to_pandas()
 
 
+def sanitize_inputs(dataset: str, config: str, split: str, page: str) -> Tuple[str, str, str, int]:
+    try:
+        page = int(page)
+        assert page > 0
+    except:
+        raise AppError(f"Bad page: {page}")
+    if not dataset:
+        raise AppError("Empty dataset name")
+    if not config:
+        raise AppError(f"Empty config. Available configs are: {', '.join(get_parquet_configs(dataset))}.")
+    if not split:
+        raise AppError(f"Empty split. Available splits are: {', '.join(get_parquet_splits(dataset, config))}.")
+    return dataset, config, split, int(page)
+
+
 @lru_cache(maxsize=128)
-def get_page(dataset: str, config: str, split: str, page: str) -> Tuple[str, int, str]:
+def get_page_df(dataset: str, config: str, split: str, page: str) -> Tuple[pd.DataFrame, int, Features]:
     dataset, config, split, page = sanitize_inputs(dataset, config, split, page)
-    rg_offsets, rg_readers, max_page, info = index(dataset, config, split)
+    rg_offsets, rg_readers, max_page, features = index(dataset, config, split)
     if page > max_page:
         raise AppError(f"Page {page} does not exist")
     df = query(page, PAGE_SIZE, rg_offsets=rg_offsets, rg_readers=rg_readers)
-    return df.to_markdown(index=False), max_page, info
-
-
+    return df, max_page, features
+
+
+#####################################################
+# Format results
+#####################################################
+
+
+def get_page(dataset: str, config: str, split: str, page: str) -> Tuple[str, int, str]:
+    df, max_page, features = get_page_df(dataset, config, split, page)
+    unsupported_columns = []
+    for column, feature in features.items():
+        if isinstance(feature, Image):
+            blob_type = "image"  # TODO: support audio - right now it seems that the markdown renderer in gradio doesn't support audio and shows nothing
+            blob_urls = blobs_to_urls([item.get("bytes") if isinstance(item, dict) else None for item in df[column]], blob_type, prefix=f"{dataset}-{config}-{split}-{page}-{column}")
+            df = df.drop([column], axis=1)
+            df[column] = [f'<img src="{url}"/>' for url in blob_urls]
+        elif any(bad_type in str(feature) for bad_type in ["Image(", "Audio(", "'binary'"]):
+            unsupported_columns.append(column)
+            df = df.drop([column], axis=1)
+    info = "" if not unsupported_columns else f"Some columns are not supported yet: {unsupported_columns}"
+    return df.to_markdown(index=False), max_page, info
+
+
+#####################################################
+# Gradio app
+#####################################################
 
 
 with gr.Blocks() as demo:
     gr.Markdown("# 📖 Datasets Explorer\n\nAccess any slice of data of any dataset on the [Hugging Face Dataset Hub](https://huggingface.co/datasets)")
-    cp_dataset = gr.Textbox("
+    cp_dataset = gr.Textbox("competitions/aiornot", label="Pick a dataset", placeholder="competitions/aiornot")
     cp_go = gr.Button("Explore")
     cp_config = gr.Dropdown(["plain_text"], value="plain_text", label="Config", visible=False)
     cp_split = gr.Dropdown(["train", "validation"], value="train", label="Split", visible=False)
-
-    cp_page = gr.Textbox("1", label="Page", placeholder="1", visible=False)
-    cp_goto_page = gr.Button("Go to page", visible=False)
+    cp_goto_next_page = gr.Button("Next page", visible=False)
     cp_error = gr.Markdown("", visible=False)
     cp_info = gr.Markdown("", visible=False)
    cp_result = gr.Markdown("", visible=False)
+    with gr.Row():
+        cp_page = gr.Textbox("1", label="Page", placeholder="1", visible=False)
+        cp_goto_page = gr.Button("Go to page", visible=False)
 
     def show_error(message: str) -> dict():
         return {
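End to end, the new formatting path can be exercised outside the UI; the arguments below simply mirror the Gradio defaults, and running this needs the Space's dependencies plus network access to the Hub:

```python
# Hypothetical smoke test of the new get_page path, mirroring the UI defaults.
markdown_result, max_page, info = get_page("competitions/aiornot", "plain_text", "train", "1")
print(max_page, info)         # total pages, plus a note about any dropped columns
print(markdown_result[:300])  # a markdown table; image cells point at /image?id=...
```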
@@ -131,15 +214,25 @@ with gr.Blocks() as demo:
 
     def show_dataset_at_config_and_split_and_page(dataset: str, config: str, split: str, page: str) -> dict:
         try:
-
+            markdown_result, max_page, info = get_page(dataset, config, split, page)
             info = f"({info})" if info else ""
             return {
-                cp_result: gr.update(visible=True, value=
+                cp_result: gr.update(visible=True, value=markdown_result),
                 cp_info: gr.update(visible=True, value=f"Page {page}/{max_page} {info}"),
                 cp_error: gr.update(visible=False, value="")
             }
         except AppError as err:
             return show_error(str(err))
+
+    def show_dataset_at_config_and_split_and_next_page(dataset: str, config: str, split: str, page: str) -> dict:
+        try:
+            next_page = str(int(page) + 1)
+            return {
+                **show_dataset_at_config_and_split_and_page(dataset, config, split, next_page),
+                cp_page: gr.update(value=next_page, visible=True),
+            }
+        except AppError as err:
+            return show_error(str(err))
 
     def show_dataset_at_config_and_split(dataset: str, config: str, split: str) -> dict:
         try:
@@ -147,6 +240,7 @@ with gr.Blocks() as demo:
                 **show_dataset_at_config_and_split_and_page(dataset, config, split, "1"),
                 cp_page: gr.update(value="1", visible=True),
                 cp_goto_page: gr.update(visible=True),
+                cp_goto_next_page: gr.update(visible=True),
             }
         except AppError as err:
             return show_error(str(err))
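The next-page handler builds on the page handler by spreading its output dict and overriding `cp_page`; since later keys win in a dict literal, the override is safe. The pattern in isolation:

```python
# The dict-spread pattern used by show_dataset_at_config_and_split_and_next_page:
# later keys win, so "cp_page" from the literal overrides the spread value.
base = {"cp_result": "<table markdown>", "cp_page": "1"}
merged = {**base, "cp_page": "2"}
print(merged["cp_page"])  # '2'
```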
@@ -179,18 +273,12 @@ with gr.Blocks() as demo:
         except AppError as err:
             return show_error(str(err))
 
-    all_outputs = [cp_config, cp_split, cp_page, cp_goto_page, cp_result, cp_info, cp_error]
+    all_outputs = [cp_config, cp_split, cp_page, cp_goto_page, cp_goto_next_page, cp_result, cp_info, cp_error]
     cp_go.click(show_dataset, inputs=[cp_dataset], outputs=all_outputs)
     cp_config.change(show_dataset_at_config, inputs=[cp_dataset, cp_config], outputs=all_outputs)
     cp_split.change(show_dataset_at_config_and_split, inputs=[cp_dataset, cp_config, cp_split], outputs=all_outputs)
     cp_goto_page.click(show_dataset_at_config_and_split_and_page, inputs=[cp_dataset, cp_config, cp_split, cp_page], outputs=all_outputs)
-
-
-app = FastAPI()
-
-@app.get("/lol")
-def read_main():
-    return {"message": "This is your main app"}
+    cp_goto_next_page.click(show_dataset_at_config_and_split_and_next_page, inputs=[cp_dataset, cp_config, cp_split, cp_page], outputs=all_outputs)
 
 
 if __name__ == "__main__":