davanstrien HF staff committed on
Commit
9d24b08
·
1 Parent(s): c4b6820

Refactor dataset migration tool for GitHub and Kaggle datasets

Browse files
Files changed (1) hide show
  1. app.py +121 -61
app.py CHANGED
@@ -2,13 +2,15 @@ import contextlib
2
  import re
3
  import tempfile
4
  from functools import lru_cache
 
5
 
6
  import gradio as gr
7
  from git import Repo
8
  from httpx import Client
9
- from typing import Optional
10
  from huggingface_hub import create_repo, upload_folder
11
  from toolz import groupby
 
 
12
 
13
  client = Client()
14
 
@@ -46,7 +48,7 @@ def upload_directory_to_hf(
46
  commit_message="Migrated from GitHub",
47
  ignore_patterns=[
48
  "*.git*",
49
- # "*README.md*",
50
  "*.DS_Store",
51
  "*.env",
52
  ], # ignore git files and .env files
@@ -132,6 +134,34 @@ def show_files_and_directories(url: str):
132
  )
133
 
134
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
  html_text_app_description = """
136
  Whilst GitHub is great for hosting code the Hugging Face Datasets Hub is a better place to host datasets.
137
  Some of the benefits of hosting datasets on the Hugging Face Datasets Hub are:
@@ -148,71 +178,101 @@ This app will help you migrate a dataset currently hosted on GitHub to the Huggi
148
 
149
  with gr.Blocks(theme=gr.themes.Base()) as demo:
150
  gr.HTML(
151
- """<h1 style='text-align: center;'> GitHub to Hugging Face Hub Dataset Migration Tool</h1>
152
- <center><i> &#x2728; Migrate a dataset in a few steps &#x2728;</i></center>"""
153
- )
154
- gr.HTML(
155
- """<center> GitHub is a great place for sharing code but the Hugging Face Hub has many advantages for sharing datasets.
156
- <br> This Space will guide you through the process of migrating a dataset from GitHub to the Hugging Face Hub. </center>"""
157
  )
 
158
  with gr.Row():
159
  gr.LoginButton(size="sm")
160
-
161
- gr.Markdown("### Location of existing dataset")
162
- gr.Markdown("URL for the GitHub repository where the dataset is currently hosted")
163
- source_github_repository = gr.Textbox(lines=1, label="Source GitHub Repository URL")
164
- gr.Markdown(
165
- "Use advanced options to select specific files and folders to migrate. Currently this app supports migrating specific subfolder(s) or top level files. If this is not sufficient for your use case please open a discussion!"
166
- )
167
- with gr.Accordion("Advanced Options", open=False):
168
- gr.Markdown("### Select files and folder to migrate")
169
- gr.Markdown(
170
- "(Optional): select a specific folder and/or files to migrate from the GitHub repository. If you select a folder all the files in that folder will be migrated."
171
- )
172
- folder_in_github_repo = gr.Dropdown(
173
- None,
174
- label="Folder in the GitHub Repository to migrate",
175
- allow_custom_value=True,
176
- visible=True,
177
- )
178
- files_in_github_repo = gr.Dropdown(
179
- None,
180
- label="Files in GitHub Repository to migrate",
181
- allow_custom_value=True,
182
- visible=True,
183
- )
184
- source_github_repository.change(
185
- show_files_and_directories,
186
- [source_github_repository],
187
- [folder_in_github_repo, files_in_github_repo],
188
- )
189
- gr.Markdown("### Destination for your migrated dataset")
190
- gr.Markdown("Destination repository for your dataset on the Hugging Face Hub")
191
- destination_hf_hub_repository = gr.Textbox(
192
- label="Destination Hugging Face Repository",
193
- placeholder="i.e. <hugging face username>/<repository_name>",
194
- )
195
- # gr.Markdown("## Authentication")
196
- # gr.Markdown(
197
- # """You need to provide a token with write access to the namespace you want to upload to.
198
- # You can generate/access your Hugging FAce token from [here](https://huggingface.co/settings/token)."""
199
- # )
200
- # hf_token = gr.Textbox(label="Hugging Face Token", type="password")
201
- summit_btn = gr.Button("Migrate Dataset")
202
- result = gr.Markdown(label="Summary", visible=True)
203
- summit_btn.click(
204
- push_to_hf,
205
- [
206
- source_github_repository,
207
- destination_hf_hub_repository,
208
- folder_in_github_repo,
209
- ],
210
- [result],
211
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
212
  gr.Markdown(
213
  """You should add a dataset card for your dataset to help people discover and understand your dataset. You can find instructions for creating a dataset card [here](https://huggingface.co/docs/datasets/dataset_card).
214
  If you have any questions or feedback feel free to reach out to us on using the [Discussion tab](https://huggingface.co/spaces/librarian-bots/github-to-huggingface-dataset-migration-tool/discussions/1)"""
215
  )
216
 
217
-
218
  demo.launch()
 
2
  import re
3
  import tempfile
4
  from functools import lru_cache
5
+ from typing import Optional
6
 
7
  import gradio as gr
8
  from git import Repo
9
  from httpx import Client
 
10
  from huggingface_hub import create_repo, upload_folder
11
  from toolz import groupby
12
+ import kagglehub
13
+ from kagglehub import KaggleDatasetAdapter
14
 
15
  client = Client()
16
 
 
48
  commit_message="Migrated from GitHub",
49
  ignore_patterns=[
50
  "*.git*",
51
+ # "*README.md*",
52
  "*.DS_Store",
53
  "*.env",
54
  ], # ignore git files and .env files
 
134
  )
135
 
136
 
def push_kaggle_to_hf(
    source_kaggle_dataset: str,
    destination_hf_hub_repository: str,
    file_path: str,
    oauth_token: gr.OAuthToken,
):
    """Load one file from a Kaggle dataset and push it to the Hugging Face Hub.

    The file is loaded through ``kagglehub.load_dataset`` with the
    ``KaggleDatasetAdapter.HUGGING_FACE`` adapter and the resulting object is
    uploaded with its ``push_to_hub`` method. Progress is reported to the UI
    with ``gr.Info`` toasts.

    Args:
        source_kaggle_dataset: Handle of the Kaggle dataset to load.
        destination_hf_hub_repository: Repository id on the Hugging Face Hub
            that receives the dataset.
        file_path: Path of the file inside the Kaggle dataset to load.
        oauth_token: Gradio OAuth token; its ``token`` attribute is passed to
            ``push_to_hub`` for authentication.

    Returns:
        A Markdown string linking to
        ``https://huggingface.co/datasets/<destination_hf_hub_repository>``.

    Raises:
        ValueError: If the file path, the Kaggle dataset handle or the
            destination repository is missing or contains only whitespace.
    """
    # Normalise the free-text inputs first so that stray whitespace typed into
    # the text boxes is neither accepted as a value nor forwarded downstream.
    file_path = (file_path or "").strip()
    source_kaggle_dataset = (source_kaggle_dataset or "").strip()
    destination_hf_hub_repository = (destination_hf_hub_repository or "").strip()

    # Fail early with a clear message instead of letting the remote calls
    # reject an empty argument. The file path is checked first, as before.
    if not file_path:
        raise ValueError("File path must be specified for Kaggle datasets")
    if not source_kaggle_dataset:
        raise ValueError("Source Kaggle dataset must be specified")
    if not destination_hf_hub_repository:
        raise ValueError("Destination Hugging Face repository must be specified")

    gr.Info("Loading Kaggle dataset...")
    dataset = kagglehub.load_dataset(
        KaggleDatasetAdapter.HUGGING_FACE,
        source_kaggle_dataset,
        file_path,
    )
    gr.Info("Loading Kaggle dataset...Done")

    gr.Info("Pushing to Hugging Face Hub...")
    dataset.push_to_hub(
        destination_hf_hub_repository,
        token=oauth_token.token,
    )
    gr.Info("Pushing to Hugging Face Hub...Done")

    return f"Pushed the dataset to [{destination_hf_hub_repository}](https://huggingface.co/datasets/{destination_hf_hub_repository})"
163
+
164
+
165
  html_text_app_description = """
166
  Whilst GitHub is great for hosting code the Hugging Face Datasets Hub is a better place to host datasets.
167
  Some of the benefits of hosting datasets on the Hugging Face Datasets Hub are:
 
178
 
with gr.Blocks(theme=gr.themes.Base()) as demo:
    # NOTE(review): the nesting below is reconstructed from a diff view whose
    # indentation was lost -- confirm against the real app.py before merging.

    # Page header.
    gr.HTML(
        """<h1 style='text-align: center;'> Dataset Migration Tool</h1>
        <center><i> &#x2728; Migrate datasets to Hugging Face Hub in a few steps &#x2728;</i></center>"""
    )

    # Hugging Face login button.
    with gr.Row():
        gr.LoginButton(size="sm")

    # One tab per supported source platform.
    with gr.Tabs() as tabs:
        # ---- GitHub source ----
        with gr.Tab("GitHub"):
            gr.Markdown("### Location of existing dataset")
            gr.Markdown(
                "URL for the GitHub repository where the dataset is currently hosted"
            )
            source_github_repository = gr.Textbox(
                lines=1,
                label="Source GitHub Repository URL",
            )

            with gr.Accordion("Advanced Options", open=False):
                gr.Markdown("### Select files and folder to migrate")
                gr.Markdown(
                    "(Optional): select a specific folder and/or files to migrate from the GitHub repository. If you select a folder all the files in that folder will be migrated."
                )
                folder_in_github_repo = gr.Dropdown(
                    choices=None,
                    label="Folder in the GitHub Repository to migrate",
                    allow_custom_value=True,
                    visible=True,
                )
                files_in_github_repo = gr.Dropdown(
                    choices=None,
                    label="Files in GitHub Repository to migrate",
                    allow_custom_value=True,
                    visible=True,
                )

            # Refresh both dropdowns whenever the repository URL changes.
            source_github_repository.change(
                fn=show_files_and_directories,
                inputs=[source_github_repository],
                outputs=[folder_in_github_repo, files_in_github_repo],
            )

            gr.Markdown("### Destination for your migrated dataset")
            destination_hf_hub_repository = gr.Textbox(
                label="Destination Hugging Face Repository",
                placeholder="i.e. <hugging face username>/<repository_name>",
            )

            github_submit_btn = gr.Button("Migrate GitHub Dataset")
            github_result = gr.Markdown(label="Summary", visible=True)

            # Only the folder selection is forwarded to push_to_hf; the files
            # dropdown is displayed but not wired into the migration call.
            github_submit_btn.click(
                fn=push_to_hf,
                inputs=[
                    source_github_repository,
                    destination_hf_hub_repository,
                    folder_in_github_repo,
                ],
                outputs=[github_result],
            )

        # ---- Kaggle source ----
        with gr.Tab("Kaggle"):
            gr.Markdown("### Source Kaggle Dataset")
            gr.Markdown("Enter the Kaggle dataset name and file path")
            source_kaggle_dataset = gr.Textbox(
                lines=1,
                label="Source Kaggle Dataset",
                placeholder="username/dataset-name",
            )
            kaggle_file_path = gr.Textbox(
                label="File path in dataset",
                placeholder="e.g., train.csv",
                info="Specify the file to migrate from the dataset",
            )

            gr.Markdown("### Destination for your migrated dataset")
            kaggle_destination_hf_hub = gr.Textbox(
                label="Destination Hugging Face Repository",
                placeholder="i.e. <hugging face username>/<repository_name>",
            )

            kaggle_submit_btn = gr.Button("Migrate Kaggle Dataset")
            kaggle_result = gr.Markdown(label="Summary", visible=True)

            # Input order matches push_kaggle_to_hf's positional parameters:
            # dataset handle, destination repository, file path.
            kaggle_submit_btn.click(
                fn=push_kaggle_to_hf,
                inputs=[
                    source_kaggle_dataset,
                    kaggle_destination_hf_hub,
                    kaggle_file_path,
                ],
                outputs=[kaggle_result],
            )

    # Footer shown below the tabs: dataset card reminder and feedback link.
    gr.Markdown(
        """You should add a dataset card for your dataset to help people discover and understand your dataset. You can find instructions for creating a dataset card [here](https://huggingface.co/docs/datasets/dataset_card).
        If you have any questions or feedback feel free to reach out to us on using the [Discussion tab](https://huggingface.co/spaces/librarian-bots/github-to-huggingface-dataset-migration-tool/discussions/1)"""
    )


demo.launch()