kevinconka committed
Commit 69f95c1 · 1 Parent(s): aa9ad30

HuggingFaceDatasetSaver changes

Files changed (2)
  1. app.py +2 -2
  2. flagging.py +282 -4
app.py CHANGED
@@ -14,7 +14,7 @@ from utils import (
     load_badges,
     FlaggedCounter,
 )
-from flagging import myHuggingFaceDatasetSaver
+from flagging import HuggingFaceDatasetSaver
 from model_yolov5 import load_model, inference
 
 
@@ -50,7 +50,7 @@ model.agnostic = True # NMS class-agnostic
 
 # Flagging
 dataset_name = "SEA-AI/crowdsourced-sea-images"
-hf_writer = myHuggingFaceDatasetSaver(get_token(), dataset_name)
+hf_writer = HuggingFaceDatasetSaver(get_token(), dataset_name)
 flagged_counter = FlaggedCounter(dataset_name)
 
 
flagging.py CHANGED
@@ -1,12 +1,290 @@
-import os
+import csv
 import json
+import uuid
 from collections import OrderedDict
 from pathlib import Path
-from typing import Any
-import gradio as gr
-from gradio.flagging import HuggingFaceDatasetSaver, client_utils
+from typing import TYPE_CHECKING, Any, Sequence
+
+import filelock
 import huggingface_hub
 
+import gradio as gr
+from gradio import utils
+from gradio.flagging import client_utils, FlaggingCallback
+from gradio_client.documentation import document
+from gradio.components import Component
+
+
+@document()
+class HuggingFaceDatasetSaver(FlaggingCallback):
+    """
+    A callback that saves each flagged sample (both the input and output data) to a HuggingFace dataset.
+
+    Example:
+        import gradio as gr
+        hf_writer = gr.HuggingFaceDatasetSaver(HF_API_TOKEN, "image-classification-mistakes")
+        def image_classifier(inp):
+            return {'cat': 0.3, 'dog': 0.7}
+        demo = gr.Interface(fn=image_classifier, inputs="image", outputs="label",
+                            allow_flagging="manual", flagging_callback=hf_writer)
+    Guides: using-flagging
+    """
+
+    def __init__(
+        self,
+        hf_token: str,
+        dataset_name: str,
+        private: bool = False,
+        info_filename: str = "dataset_info.json",
+        separate_dirs: bool = False,
+    ):
+        """
+        Parameters:
+            hf_token: The HuggingFace token to use to create (and write the flagged sample to) the HuggingFace dataset (defaults to the registered one).
+            dataset_name: The repo_id of the dataset to save the data to, e.g. "image-classifier-1" or "username/image-classifier-1".
+            private: Whether the dataset should be private (defaults to False).
+            info_filename: The name of the file to save the dataset info (defaults to "dataset_info.json").
+            separate_dirs: If True, each flagged item will be saved in a separate directory. This makes the flagging more robust to concurrent editing, but may be less convenient to use.
+        """
+        self.hf_token = hf_token
+        self.dataset_id = dataset_name  # TODO: rename parameter (but ensure backward compatibility somehow)
+        self.dataset_private = private
+        self.info_filename = info_filename
+        self.separate_dirs = separate_dirs
+
+    def setup(self, components: Sequence[Component], flagging_dir: str):
+        """
+        Params:
+        flagging_dir (str): local directory where the dataset is cloned,
+        updated, and pushed from.
+        """
+        # Setup dataset on the Hub
+        self.dataset_id = huggingface_hub.create_repo(
+            repo_id=self.dataset_id,
+            token=self.hf_token,
+            private=self.dataset_private,
+            repo_type="dataset",
+            exist_ok=True,
+        ).repo_id
+        path_glob = "**/*.jsonl" if self.separate_dirs else "data.csv"
+        huggingface_hub.metadata_update(
+            repo_id=self.dataset_id,
+            repo_type="dataset",
+            metadata={
+                "configs": [
+                    {
+                        "config_name": "default",
+                        "data_files": [{"split": "train", "path": path_glob}],
+                    }
+                ]
+            },
+            overwrite=True,
+            token=self.hf_token,
+        )
+
+        # Setup flagging dir
+        self.components = components
+        self.dataset_dir = (
+            Path(flagging_dir).absolute() / self.dataset_id.split("/")[-1]
+        )
+        self.dataset_dir.mkdir(parents=True, exist_ok=True)
+        self.infos_file = self.dataset_dir / self.info_filename
+
+        # Download remote files to local
+        remote_files = [self.info_filename]
+        if not self.separate_dirs:
+            # No separate dirs => means all data is in the same CSV file => download it to get its current content
+            remote_files.append("data.csv")
+
+        for filename in remote_files:
+            try:
+                huggingface_hub.hf_hub_download(
+                    repo_id=self.dataset_id,
+                    repo_type="dataset",
+                    filename=filename,
+                    local_dir=self.dataset_dir,
+                    token=self.hf_token,
+                )
+            except huggingface_hub.utils.EntryNotFoundError:
+                pass
+
+    def flag(
+        self,
+        flag_data: list[Any],
+        flag_option: str = "",
+        username: str | None = None,
+    ) -> int:
+        if self.separate_dirs:
+            # JSONL files to support dataset preview on the Hub
+            unique_id = str(uuid.uuid4())
+            components_dir = self.dataset_dir / unique_id
+            data_file = components_dir / "metadata.jsonl"
+            path_in_repo = unique_id  # upload in sub folder (safer for concurrency)
+        else:
+            # Unique CSV file
+            components_dir = self.dataset_dir
+            data_file = components_dir / "data.csv"
+            path_in_repo = None  # upload at root level
+
+        return self._flag_in_dir(
+            data_file=data_file,
+            components_dir=components_dir,
+            path_in_repo=path_in_repo,
+            flag_data=flag_data,
+            flag_option=flag_option,
+            username=username or "",
+        )
+
+    def _flag_in_dir(
+        self,
+        data_file: Path,
+        components_dir: Path,
+        path_in_repo: str | None,
+        flag_data: list[Any],
+        flag_option: str = "",
+        username: str = "",
+    ) -> int:
+        # Deserialize components (write images/audio to files)
+        features, row = self._deserialize_components(
+            components_dir, flag_data, flag_option, username
+        )
+
+        # Write generic info to dataset_infos.json + upload
+        with filelock.FileLock(str(self.infos_file) + ".lock"):
+            if not self.infos_file.exists():
+                self.infos_file.write_text(
+                    json.dumps({"flagged": {"features": features}})
+                )
+
+                huggingface_hub.upload_file(
+                    repo_id=self.dataset_id,
+                    repo_type="dataset",
+                    token=self.hf_token,
+                    path_in_repo=self.infos_file.name,
+                    path_or_fileobj=self.infos_file,
+                )
+
+        headers = list(features.keys())
+
+        if not self.separate_dirs:
+            with filelock.FileLock(components_dir / ".lock"):
+                sample_nb = self._save_as_csv(data_file, headers=headers, row=row)
+                sample_name = str(sample_nb)
+                huggingface_hub.upload_folder(
+                    repo_id=self.dataset_id,
+                    repo_type="dataset",
+                    commit_message=f"Flagged sample #{sample_name}",
+                    path_in_repo=path_in_repo,
+                    ignore_patterns="*.lock",
+                    folder_path=components_dir,
+                    token=self.hf_token,
+                )
+        else:
+            sample_name = self._save_as_jsonl(data_file, headers=headers, row=row)
+            sample_nb = len(
+                [path for path in self.dataset_dir.iterdir() if path.is_dir()]
+            )
+            huggingface_hub.upload_folder(
+                repo_id=self.dataset_id,
+                repo_type="dataset",
+                commit_message=f"Flagged sample #{sample_name}",
+                path_in_repo=path_in_repo,
+                ignore_patterns="*.lock",
+                folder_path=components_dir,
+                token=self.hf_token,
+            )
+
+        return sample_nb
+
+    @staticmethod
+    def _save_as_csv(data_file: Path, headers: list[str], row: list[Any]) -> int:
+        """Save data as CSV and return the sample name (row number)."""
+        is_new = not data_file.exists()
+
+        with data_file.open("a", newline="", encoding="utf-8") as csvfile:
+            writer = csv.writer(csvfile)
+
+            # Write CSV headers if new file
+            if is_new:
+                writer.writerow(utils.sanitize_list_for_csv(headers))
+
+            # Write CSV row for flagged sample
+            writer.writerow(utils.sanitize_list_for_csv(row))
+
+        with data_file.open(encoding="utf-8") as csvfile:
+            return sum(1 for _ in csv.reader(csvfile)) - 1
+
+    @staticmethod
+    def _save_as_jsonl(data_file: Path, headers: list[str], row: list[Any]) -> str:
+        """Save data as JSONL and return the sample name (uuid)."""
+        Path.mkdir(data_file.parent, parents=True, exist_ok=True)
+        with open(data_file, "w", encoding="utf-8") as f:
+            json.dump(dict(zip(headers, row)), f)
+        return data_file.parent.name
+
+    def _deserialize_components(
+        self,
+        data_dir: Path,
+        flag_data: list[Any],
+        flag_option: str = "",
+        username: str = "",
+    ) -> tuple[dict[Any, Any], list[Any]]:
+        """Deserialize components and return the corresponding row for the flagged sample.
+
+        Images/audio are saved to disk as individual files.
+        """
+        # Components that can have a preview on dataset repos
+        file_preview_types = {gr.Audio: "Audio", gr.Image: "Image"}
+
+        # Generate the row corresponding to the flagged sample
+        features = OrderedDict()
+        row = []
+        for component, sample in zip(self.components, flag_data):
+            # Get deserialized object (will save sample to disk if applicable -file, audio, image,...-)
+            label = component.label or ""
+            save_dir = data_dir / client_utils.strip_invalid_filename_characters(label)
+            save_dir.mkdir(exist_ok=True, parents=True)
+            deserialized = utils.simplify_file_data_in_str(
+                component.flag(sample, save_dir)
+            )
+
+            # Add deserialized object to row
+            features[label] = {"dtype": "string", "_type": "Value"}
+            try:
+                deserialized_path = Path(deserialized)
+                if not deserialized_path.exists():
+                    raise FileNotFoundError(f"File {deserialized} not found")
+                row.append(str(deserialized_path.relative_to(self.dataset_dir)))
+            except (FileNotFoundError, TypeError, ValueError, OSError):
+                deserialized = "" if deserialized is None else str(deserialized)
+                row.append(deserialized)
+
+            # If component is eligible for a preview, add the URL of the file
+            # Be mindful that images and audio can be None
+            if isinstance(component, tuple(file_preview_types)):  # type: ignore
+                for _component, _type in file_preview_types.items():
+                    if isinstance(component, _component):
+                        features[label + " file"] = {"_type": _type}
+                        break
+                if deserialized:
+                    path_in_repo = str(  # returned filepath is absolute, we want it relative to compute URL
+                        Path(deserialized).relative_to(self.dataset_dir)
+                    ).replace("\\", "/")
+                    row.append(
+                        huggingface_hub.hf_hub_url(
+                            repo_id=self.dataset_id,
+                            filename=path_in_repo,
+                            repo_type="dataset",
+                        )
+                    )
+                else:
+                    row.append("")
+        features["flag"] = {"dtype": "string", "_type": "Value"}
+        features["username"] = {"dtype": "string", "_type": "Value"}
+        row.append(flag_option)
+        row.append(username)
+        return features, row
+
 
 class myHuggingFaceDatasetSaver(HuggingFaceDatasetSaver):
     """