Spaces:
Sleeping
Sleeping
Commit
·
9d24b08
1
Parent(s):
c4b6820
Refactor dataset migration tool for GitHub and Kaggle datasets
Browse files
app.py
CHANGED
@@ -2,13 +2,15 @@ import contextlib
|
|
2 |
import re
|
3 |
import tempfile
|
4 |
from functools import lru_cache
|
|
|
5 |
|
6 |
import gradio as gr
|
7 |
from git import Repo
|
8 |
from httpx import Client
|
9 |
-
from typing import Optional
|
10 |
from huggingface_hub import create_repo, upload_folder
|
11 |
from toolz import groupby
|
|
|
|
|
12 |
|
13 |
client = Client()
|
14 |
|
@@ -46,7 +48,7 @@ def upload_directory_to_hf(
|
|
46 |
commit_message="Migrated from GitHub",
|
47 |
ignore_patterns=[
|
48 |
"*.git*",
|
49 |
-
|
50 |
"*.DS_Store",
|
51 |
"*.env",
|
52 |
], # ignore git files and .env files
|
@@ -132,6 +134,34 @@ def show_files_and_directories(url: str):
|
|
132 |
)
|
133 |
|
134 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
135 |
html_text_app_description = """
|
136 |
Whilst GitHub is great for hosting code the Hugging Face Datasets Hub is a better place to host datasets.
|
137 |
Some of the benefits of hosting datasets on the Hugging Face Datasets Hub are:
|
@@ -148,71 +178,101 @@ This app will help you migrate a dataset currently hosted on GitHub to the Huggi
|
|
148 |
|
149 |
with gr.Blocks(theme=gr.themes.Base()) as demo:
|
150 |
gr.HTML(
|
151 |
-
"""<h1 style='text-align: center;'>
|
152 |
-
<center><i> ✨ Migrate
|
153 |
-
)
|
154 |
-
gr.HTML(
|
155 |
-
"""<center> GitHub is a great place for sharing code but the Hugging Face Hub has many advantages for sharing datasets.
|
156 |
-
<br> This Space will guide you through the process of migrating a dataset from GitHub to the Hugging Face Hub. </center>"""
|
157 |
)
|
|
|
158 |
with gr.Row():
|
159 |
gr.LoginButton(size="sm")
|
160 |
-
|
161 |
-
gr.
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
212 |
gr.Markdown(
|
213 |
"""You should add a dataset card for your dataset to help people discover and understand your dataset. You can find instructions for creating a dataset card [here](https://huggingface.co/docs/datasets/dataset_card).
|
214 |
If you have any questions or feedback feel free to reach out to us on using the [Discussion tab](https://huggingface.co/spaces/librarian-bots/github-to-huggingface-dataset-migration-tool/discussions/1)"""
|
215 |
)
|
216 |
|
217 |
-
|
218 |
demo.launch()
|
|
|
2 |
import re
|
3 |
import tempfile
|
4 |
from functools import lru_cache
|
5 |
+
from typing import Optional
|
6 |
|
7 |
import gradio as gr
|
8 |
from git import Repo
|
9 |
from httpx import Client
|
|
|
10 |
from huggingface_hub import create_repo, upload_folder
|
11 |
from toolz import groupby
|
12 |
+
import kagglehub
|
13 |
+
from kagglehub import KaggleDatasetAdapter
|
14 |
|
15 |
client = Client()
|
16 |
|
|
|
48 |
commit_message="Migrated from GitHub",
|
49 |
ignore_patterns=[
|
50 |
"*.git*",
|
51 |
+
# "*README.md*",
|
52 |
"*.DS_Store",
|
53 |
"*.env",
|
54 |
], # ignore git files and .env files
|
|
|
134 |
)
|
135 |
|
136 |
|
137 |
+
def push_kaggle_to_hf(
    source_kaggle_dataset: str,
    destination_hf_hub_repository: str,
    file_path: str,
    oauth_token: gr.OAuthToken,
):
    """Load one file of a Kaggle dataset and push it to the Hugging Face Hub.

    The file is loaded through kagglehub's Hugging Face adapter and the
    resulting object is uploaded with its ``push_to_hub`` method.

    Args:
        source_kaggle_dataset: Kaggle dataset handle (the UI suggests the
            ``username/dataset-name`` form).
        destination_hf_hub_repository: Target repository id on the Hub (the
            UI suggests ``<username>/<repository_name>``).
        file_path: Path of the file inside the Kaggle dataset to migrate,
            e.g. ``train.csv``.
        oauth_token: OAuth token of the logged-in user; its ``token``
            attribute authenticates the upload.

    Returns:
        A Markdown string linking to the destination dataset repository.

    Raises:
        ValueError: If the file path, the Kaggle dataset handle or the
            destination repository is empty or only whitespace.
    """
    # Textbox values can be empty or padded with stray whitespace; normalize
    # once so validation and the downstream calls all see the same values.
    source_kaggle_dataset = (source_kaggle_dataset or "").strip()
    destination_hf_hub_repository = (destination_hf_hub_repository or "").strip()
    file_path = (file_path or "").strip()

    # Fail fast with a readable message instead of letting an empty value
    # surface later as an opaque error from kagglehub or the Hub client.
    if not file_path:
        raise ValueError("File path must be specified for Kaggle datasets")
    if not source_kaggle_dataset:
        raise ValueError("Source Kaggle dataset must be specified")
    if not destination_hf_hub_repository:
        raise ValueError("Destination Hugging Face repository must be specified")

    gr.Info("Loading Kaggle dataset...")
    # NOTE(review): newer kagglehub releases appear to expose this call as
    # `dataset_load` -- confirm against the pinned kagglehub version.
    dataset = kagglehub.load_dataset(
        KaggleDatasetAdapter.HUGGING_FACE,
        source_kaggle_dataset,
        file_path,
    )
    gr.Info("Loading Kaggle dataset...Done")

    gr.Info("Pushing to Hugging Face Hub...")
    dataset.push_to_hub(
        destination_hf_hub_repository,
        token=oauth_token.token,
    )
    gr.Info("Pushing to Hugging Face Hub...Done")

    return f"Pushed the dataset to [{destination_hf_hub_repository}](https://huggingface.co/datasets/{destination_hf_hub_repository})"
|
163 |
+
|
164 |
+
|
165 |
html_text_app_description = """
|
166 |
Whilst GitHub is great for hosting code the Hugging Face Datasets Hub is a better place to host datasets.
|
167 |
Some of the benefits of hosting datasets on the Hugging Face Datasets Hub are:
|
|
|
178 |
|
179 |
# Gradio UI: one tab per supported source (GitHub, Kaggle), each with its own
# inputs, submit button and result area.
with gr.Blocks(theme=gr.themes.Base()) as demo:
    gr.HTML(
        """<h1 style='text-align: center;'> Dataset Migration Tool</h1>
        <center><i> ✨ Migrate datasets to Hugging Face Hub in a few steps ✨</i></center>"""
    )

    with gr.Row():
        # The Kaggle handler takes a `gr.OAuthToken` parameter, so the user
        # signs in through this button.
        gr.LoginButton(size="sm")

    with gr.Tabs():
        with gr.Tab("GitHub"):
            gr.Markdown("### Location of existing dataset")
            gr.Markdown(
                "URL for the GitHub repository where the dataset is currently hosted"
            )
            source_github_repository = gr.Textbox(
                lines=1, label="Source GitHub Repository URL"
            )

            with gr.Accordion("Advanced Options", open=False):
                gr.Markdown("### Select files and folder to migrate")
                gr.Markdown(
                    "(Optional): select a specific folder and/or files to migrate from the GitHub repository. If you select a folder all the files in that folder will be migrated."
                )
                folder_in_github_repo = gr.Dropdown(
                    None,
                    label="Folder in the GitHub Repository to migrate",
                    allow_custom_value=True,
                    visible=True,
                )
                files_in_github_repo = gr.Dropdown(
                    None,
                    label="Files in GitHub Repository to migrate",
                    allow_custom_value=True,
                    visible=True,
                )
                # Update both dropdowns whenever the repository URL changes.
                source_github_repository.change(
                    show_files_and_directories,
                    [source_github_repository],
                    [folder_in_github_repo, files_in_github_repo],
                )

            gr.Markdown("### Destination for your migrated dataset")
            destination_hf_hub_repository = gr.Textbox(
                label="Destination Hugging Face Repository",
                placeholder="e.g. <hugging face username>/<repository_name>",
            )

            github_submit_btn = gr.Button("Migrate GitHub Dataset")
            github_result = gr.Markdown(label="Summary", visible=True)

            # NOTE(review): `files_in_github_repo` is populated above but is
            # not passed to `push_to_hf`; only the folder selection is
            # forwarded. Confirm this is intended.
            github_submit_btn.click(
                push_to_hf,
                [
                    source_github_repository,
                    destination_hf_hub_repository,
                    folder_in_github_repo,
                ],
                [github_result],
            )

        with gr.Tab("Kaggle"):
            gr.Markdown("### Source Kaggle Dataset")
            gr.Markdown("Enter the Kaggle dataset name and file path")
            source_kaggle_dataset = gr.Textbox(
                lines=1,
                label="Source Kaggle Dataset",
                placeholder="username/dataset-name",
            )
            kaggle_file_path = gr.Textbox(
                label="File path in dataset",
                placeholder="e.g., train.csv",
                info="Specify the file to migrate from the dataset",
            )

            gr.Markdown("### Destination for your migrated dataset")
            kaggle_destination_hf_hub = gr.Textbox(
                label="Destination Hugging Face Repository",
                placeholder="e.g. <hugging face username>/<repository_name>",
            )

            kaggle_submit_btn = gr.Button("Migrate Kaggle Dataset")
            kaggle_result = gr.Markdown(label="Summary", visible=True)

            # Input order matches the positional parameters of
            # `push_kaggle_to_hf`: dataset handle, destination repo, file path.
            kaggle_submit_btn.click(
                push_kaggle_to_hf,
                [
                    source_kaggle_dataset,
                    kaggle_destination_hf_hub,
                    kaggle_file_path,
                ],
                [kaggle_result],
            )

    # Footer declared after the tabs so it is not part of either tab.
    gr.Markdown(
        """You should add a dataset card for your dataset to help people discover and understand your dataset. You can find instructions for creating a dataset card [here](https://huggingface.co/docs/datasets/dataset_card).
    If you have any questions or feedback feel free to reach out to us using the [Discussion tab](https://huggingface.co/spaces/librarian-bots/github-to-huggingface-dataset-migration-tool/discussions/1)"""
    )

demo.launch()
|