lhoestq committed on
Commit ca78cff · 1 Parent(s): aaf8e92

support images

Files changed (1):
  1. app.py +128 -40

app.py CHANGED
@@ -7,18 +7,72 @@ import numpy as np
 import pyarrow as pa
 import pyarrow.parquet as pq
 from functools import partial
-from io import StringIO
 from tqdm.contrib.concurrent import thread_map
-from datasets import Features
-from fastapi import FastAPI
+from datasets import Features, Image, Audio
+from fastapi import FastAPI, Response
 import uvicorn
+import os
 
 
 class AppError(RuntimeError):
     pass
 
 
+APP_URL = "http://127.0.0.1:7860" if os.getenv("DEV") else "https://lhoestq-datasets-explorer.hf.space"
 PAGE_SIZE = 20
+MAX_CACHED_BLOBS = PAGE_SIZE * 10
+_blobs_cache = {}
+
+#####################################################
+# Define routes for image and audio files
+#####################################################
+
+app = FastAPI()
+
+
+@app.get(
+    "/image",
+    responses={200: {"content": {"image/png": {}}}},
+    response_class=Response,
+)
+def image(id: str):
+    blob = get_blob(id)
+    return Response(content=blob, media_type="image/png")
+
+
+@app.get(
+    "/audio",
+    responses={200: {"content": {"audio/wav": {}}}},
+    response_class=Response,
+)
+def audio(id: str):
+    blob = get_blob(id)
+    return Response(content=blob, media_type="audio/wav")
+
+
+def push_blob(blob: bytes, blob_id: str) -> str:
+    global _blobs_cache
+    if blob_id in _blobs_cache:
+        del _blobs_cache[blob_id]
+    _blobs_cache[blob_id] = blob
+    if len(_blobs_cache) > MAX_CACHED_BLOBS:
+        del _blobs_cache[next(iter(_blobs_cache))]
+    return blob_id
+
+
+def get_blob(blob_id: str) -> bytes:
+    global _blobs_cache
+    return _blobs_cache[blob_id]
+
+
+def blobs_to_urls(blobs: List[bytes], type: str, prefix: str) -> List[str]:
+    image_blob_ids = [push_blob(blob, f"{prefix}-{i}") for i, blob in enumerate(blobs)]
+    return [APP_URL + f"/{type}?id={blob_id}" for blob_id in image_blob_ids]
+
+
+#####################################################
+# List configs, splits and parquet files
+#####################################################
 
 
 @lru_cache(maxsize=128)
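The blob cache added above is a small insertion-ordered store: `push_blob` caps it at `MAX_CACHED_BLOBS` entries and evicts the oldest insertion first, and `blobs_to_urls` turns one page of raw bytes into URLs that the new `/image` route can serve back. A minimal sketch of that flow (the byte strings and prefix are placeholders, not from the commit):

```python
# Hypothetical usage of the helpers added above; the PNG bytes are placeholders.
page_of_blobs = [b"\x89PNG...0", b"\x89PNG...1"]  # e.g. the "image" column of one page
urls = blobs_to_urls(page_of_blobs, "image", prefix="demo-plain_text-train-1-image")
# urls == [f"{APP_URL}/image?id=demo-plain_text-train-1-image-0",
#          f"{APP_URL}/image?id=demo-plain_text-train-1-image-1"]

# Dicts preserve insertion order, so next(iter(_blobs_cache)) is the oldest
# entry and the cache behaves like a FIFO once it is full.
for i in range(MAX_CACHED_BLOBS + 1):
    push_blob(b"...", f"blob-{i}")
assert "blob-0" not in _blobs_cache  # the oldest blob was evicted
```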
@@ -48,26 +102,17 @@ def get_parquet_splits(dataset: str, config: str) -> List[str]:
     all_parts = [path.rsplit(".", 1)[0].split("-") for path in fs.glob(f"{config}/*.parquet")]
     return sorted(set(parts[-4] if len(parts) > 3 and parts[-2] == "of" else parts[-1] for parts in all_parts), key=_sorted_split_key)
 
-def sanitize_inputs(dataset: str, config: str, split: str, page: str) -> Tuple[str, str, str, int]:
-    try:
-        page = int(page)
-        assert page > 0
-    except:
-        raise AppError(f"Bad page: {page}")
-    if not dataset:
-        raise AppError("Empty dataset name")
-    if not config:
-        raise AppError(f"Empty config. Available configs are: {', '.join(get_parquet_configs(dataset))}.")
-    if not split:
-        raise AppError(f"Empty split. Available splits are: {', '.join(get_parquet_splits(dataset, config))}.")
-    return dataset, config, split, int(page)
+
+#####################################################
+# Index and query Parquet data
+#####################################################
 
 
 RowGroupReaders = List[Callable[[], pa.Table]]
 
 
 @lru_cache(maxsize=128)
-def index(dataset: str, config: str, split: str) -> Tuple[np.ndarray, RowGroupReaders, int, str]:
+def index(dataset: str, config: str, split: str) -> Tuple[np.ndarray, RowGroupReaders, int, Features]:
     fs = get_parquet_fs(dataset)
     sources = fs.glob(f"{config}/*-{split}.parquet") + fs.glob(f"{config}/*-{split}-*-of-*.parquet")
     if not sources:
@@ -78,12 +123,10 @@ def index(dataset: str, config: str, split: str) -> Tuple[np.ndarray, RowGroupReaders, int, str]:
     desc = f"{dataset}/{config}/{split}"
     all_pf: List[pq.ParquetFile] = thread_map(partial(pq.ParquetFile, filesystem=fs), sources, desc=desc, unit="pq")
     features = Features.from_arrow_schema(all_pf[0].schema.to_arrow_schema())
-    columns = [col for col in features if all(bad_type not in str(features[col]) for bad_type in ["Image(", "Audio(", "'binary'"])]
-    info = "" if len(columns) == len(features) else f"Some columns are not supported yet: {sorted(set(features) - set(columns))}"
     rg_offsets = np.cumsum([pf.metadata.row_group(i).num_rows for pf in all_pf for i in range(pf.metadata.num_row_groups)])
-    rg_readers = [partial(pf.read_row_group, i, columns=columns) for pf in all_pf for i in range(pf.metadata.num_row_groups)]
+    rg_readers = [partial(pf.read_row_group, i) for pf in all_pf for i in range(pf.metadata.num_row_groups)]
     max_page = 1 + (rg_offsets[-1] - 1) // PAGE_SIZE
-    return rg_offsets, rg_readers, max_page, info
+    return rg_offsets, rg_readers, max_page, features
 
 
 def query(page: int, page_size: int, rg_offsets: np.ndarray, rg_readers: RowGroupReaders) -> pd.DataFrame:
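Worth spelling out the arithmetic here: `rg_offsets` is the cumulative row count across all row groups, and `max_page = 1 + (rg_offsets[-1] - 1) // PAGE_SIZE` is ceiling division of total rows by the page size. The body of `query` is elided by the diff; the `searchsorted` mapping below is an illustration of how a page can be located within the offsets, not a copy of `query`:

```python
import numpy as np

rg_offsets = np.cumsum([100, 100, 50])  # three row groups -> [100, 200, 250]
page, page_size = 6, 20                 # page 6 covers rows 100..119 (0-based)
start, end = (page - 1) * page_size, page * page_size
first_rg = int(np.searchsorted(rg_offsets, start, side="right"))
last_rg = int(np.searchsorted(rg_offsets, end - 1, side="right"))
assert (first_rg, last_rg) == (1, 1)    # the whole page sits in row group 1
max_page = 1 + (rg_offsets[-1] - 1) // page_size
assert max_page == 13                   # == ceil(250 / 20)
```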
@@ -97,30 +140,70 @@ def query(page: int, page_size: int, rg_offsets: np.ndarray, rg_readers: RowGroupReaders) -> pd.DataFrame:
     return pa_table.to_pandas()
 
 
+def sanitize_inputs(dataset: str, config: str, split: str, page: str) -> Tuple[str, str, str, int]:
+    try:
+        page = int(page)
+        assert page > 0
+    except:
+        raise AppError(f"Bad page: {page}")
+    if not dataset:
+        raise AppError("Empty dataset name")
+    if not config:
+        raise AppError(f"Empty config. Available configs are: {', '.join(get_parquet_configs(dataset))}.")
+    if not split:
+        raise AppError(f"Empty split. Available splits are: {', '.join(get_parquet_splits(dataset, config))}.")
+    return dataset, config, split, int(page)
+
+
 @lru_cache(maxsize=128)
-def get_page(dataset: str, config: str, split: str, page: str) -> Tuple[str, int, str]:
+def get_page_df(dataset: str, config: str, split: str, page: str) -> Tuple[pd.DataFrame, int, Features]:
     dataset, config, split, page = sanitize_inputs(dataset, config, split, page)
-    rg_offsets, rg_readers, max_page, info = index(dataset, config, split)
+    rg_offsets, rg_readers, max_page, features = index(dataset, config, split)
     if page > max_page:
         raise AppError(f"Page {page} does not exist")
     df = query(page, PAGE_SIZE, rg_offsets=rg_offsets, rg_readers=rg_readers)
-    buf = StringIO()
-    df.to_json(buf, lines=True, orient="records")
-    return buf.getvalue(), max_page, info
+    return df, max_page, features
+
+
+#####################################################
+# Format results
+#####################################################
+
+
+def get_page(dataset: str, config: str, split: str, page: str) -> Tuple[str, int, str]:
+    df, max_page, features = get_page_df(dataset, config, split, page)
+    unsupported_columns = []
+    for column, feature in features.items():
+        if isinstance(feature, Image):
+            blob_type = "image"  # TODO: support audio - right now it seems that the markdown renderer in gradio doesn't support audio and shows nothing
+            blob_urls = blobs_to_urls([item.get("bytes") if isinstance(item, dict) else None for item in df[column]], blob_type, prefix=f"{dataset}-{config}-{split}-{page}-{column}")
+            df = df.drop([column], axis=1)
+            df[column] = [f"![]({url})" for url in blob_urls]
+        elif any(bad_type in str(feature) for bad_type in ["Image(", "Audio(", "'binary'"]):
+            unsupported_columns.append(column)
+            df = df.drop([column], axis=1)
+    info = "" if not unsupported_columns else f"Some columns are not supported yet: {unsupported_columns}"
+    return df.to_markdown(index=False), max_page, info
+
+
+#####################################################
+# Gradio app
+#####################################################
 
 
 with gr.Blocks() as demo:
     gr.Markdown("# 📖 Datasets Explorer\n\nAccess any slice of data of any dataset on the [Hugging Face Dataset Hub](https://huggingface.co/datasets)")
-    cp_dataset = gr.Textbox("squad", label="Pick a dataset", placeholder="squad")
+    cp_dataset = gr.Textbox("competitions/aiornot", label="Pick a dataset", placeholder="competitions/aiornot")
     cp_go = gr.Button("Explore")
     cp_config = gr.Dropdown(["plain_text"], value="plain_text", label="Config", visible=False)
     cp_split = gr.Dropdown(["train", "validation"], value="train", label="Split", visible=False)
-    with gr.Row():
-        cp_page = gr.Textbox("1", label="Page", placeholder="1", visible=False)
-        cp_goto_page = gr.Button("Go to page", visible=False)
+    cp_goto_next_page = gr.Button("Next page", visible=False)
     cp_error = gr.Markdown("", visible=False)
     cp_info = gr.Markdown("", visible=False)
     cp_result = gr.Markdown("", visible=False)
+    with gr.Row():
+        cp_page = gr.Textbox("1", label="Page", placeholder="1", visible=False)
+        cp_goto_page = gr.Button("Go to page", visible=False)
 
     def show_error(message: str) -> dict():
         return {
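Net effect of the formatting change: instead of dumping JSON lines, `get_page` now returns a markdown table in which every `Image` cell is an inline `![](...)` link back to the app's own `/image` route, which gradio's markdown component can render. A toy illustration of that last step (hypothetical blob ids; `to_markdown` requires the `tabulate` package):

```python
import pandas as pd

# An Image column's bytes get swapped for markdown image links before rendering.
df = pd.DataFrame({"label": [0, 1]})
blob_urls = [f"{APP_URL}/image?id=demo-0", f"{APP_URL}/image?id=demo-1"]
df["image"] = [f"![]({url})" for url in blob_urls]
print(df.to_markdown(index=False))
# | label | image                    |
# |------:|:-------------------------|
# |     0 | ![](.../image?id=demo-0) |
# |     1 | ![](.../image?id=demo-1) |
```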
@@ -131,15 +214,25 @@ with gr.Blocks() as demo:
 
     def show_dataset_at_config_and_split_and_page(dataset: str, config: str, split: str, page: str) -> dict:
         try:
-            jsonl_result, max_page, info = get_page(dataset, config, split, page)
+            markdown_result, max_page, info = get_page(dataset, config, split, page)
             info = f"({info})" if info else ""
             return {
-                cp_result: gr.update(visible=True, value=f"```json\n{jsonl_result}\n```"),
+                cp_result: gr.update(visible=True, value=markdown_result),
                 cp_info: gr.update(visible=True, value=f"Page {page}/{max_page} {info}"),
                 cp_error: gr.update(visible=False, value="")
             }
         except AppError as err:
             return show_error(str(err))
+
+    def show_dataset_at_config_and_split_and_next_page(dataset: str, config: str, split: str, page: str) -> dict:
+        try:
+            next_page = str(int(page) + 1)
+            return {
+                **show_dataset_at_config_and_split_and_page(dataset, config, split, next_page),
+                cp_page: gr.update(value=next_page, visible=True),
+            }
+        except AppError as err:
+            return show_error(str(err))
 
     def show_dataset_at_config_and_split(dataset: str, config: str, split: str) -> dict:
         try:
@@ -147,6 +240,7 @@ with gr.Blocks() as demo:
             **show_dataset_at_config_and_split_and_page(dataset, config, split, "1"),
             cp_page: gr.update(value="1", visible=True),
             cp_goto_page: gr.update(visible=True),
+            cp_goto_next_page: gr.update(visible=True),
         }
     except AppError as err:
         return show_error(str(err))
@@ -179,18 +273,12 @@ with gr.Blocks() as demo:
         except AppError as err:
             return show_error(str(err))
 
-    all_outputs = [cp_config, cp_split, cp_page, cp_goto_page, cp_result, cp_info, cp_error]
+    all_outputs = [cp_config, cp_split, cp_page, cp_goto_page, cp_goto_next_page, cp_result, cp_info, cp_error]
     cp_go.click(show_dataset, inputs=[cp_dataset], outputs=all_outputs)
     cp_config.change(show_dataset_at_config, inputs=[cp_dataset, cp_config], outputs=all_outputs)
     cp_split.change(show_dataset_at_config_and_split, inputs=[cp_dataset, cp_config, cp_split], outputs=all_outputs)
     cp_goto_page.click(show_dataset_at_config_and_split_and_page, inputs=[cp_dataset, cp_config, cp_split, cp_page], outputs=all_outputs)
-
-
-    app = FastAPI()
-
-    @app.get("/lol")
-    def read_main():
-        return {"message": "This is your main app"}
+    cp_goto_next_page.click(show_dataset_at_config_and_split_and_next_page, inputs=[cp_dataset, cp_config, cp_split, cp_page], outputs=all_outputs)
 
 
 if __name__ == "__main__":
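The `__main__` block itself is outside the visible hunks. Given the module-level FastAPI `app` and the `uvicorn` import, the usual pattern is to mount the gradio `Blocks` onto the FastAPI app so the `/image` and `/audio` routes and the UI are served by one server; a sketch under that assumption:

```python
# Assumed serving setup (the __main__ body is not shown in this diff):
# mount the gradio UI on the FastAPI app and serve both with uvicorn.
import gradio as gr
import uvicorn

app = gr.mount_gradio_app(app, demo, path="/")
uvicorn.run(app, host="0.0.0.0", port=7860)
```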