davanstrien's picture
davanstrien HF Staff
Refactor dataset migration tool for GitHub and Kaggle datasets
8614aa9
raw
history blame
9.85 kB
import contextlib
import re
import tempfile
from functools import lru_cache
from typing import Optional
import gradio as gr
from git import Repo
from httpx import Client
from huggingface_hub import create_repo, upload_folder
from toolz import groupby
import kagglehub
from kagglehub import KaggleDatasetAdapter
# Module-level HTTP client, reused for all GitHub API requests.
client = Client()
def clone_into_temp_dir(github_repo_url: str):
    """Clone a GitHub repository into a fresh temporary directory.

    Returns a ``(Repo, TemporaryDirectory)`` pair. The caller is responsible
    for calling ``.cleanup()`` on the TemporaryDirectory once the clone is no
    longer needed.
    """
    temp_dir = tempfile.TemporaryDirectory()
    # Pass the directory *path* (``.name``) — the TemporaryDirectory object
    # itself is not os.PathLike and would make clone_from fail.
    return Repo.clone_from(github_repo_url, temp_dir.name), temp_dir
def upload_directory_to_hf(
    repo_id: str,
    directory: str,
    oauth_token: str,
):
    """Create (if needed) a public dataset repo on the Hub and upload a folder.

    Args:
        repo_id: Target repository in ``user/name`` form.
        directory: Local folder whose contents are uploaded under ``data/``.
        oauth_token: Hugging Face token used for both creation and upload.

    Returns:
        The commit URL returned by ``upload_folder``.
    """
    create_repo(
        repo_id,
        token=oauth_token,
        exist_ok=True,
        repo_type="dataset",
        private=False,
    )
    # Skip git metadata and local environment files when uploading.
    return upload_folder(
        repo_id=repo_id,
        folder_path=directory,
        path_in_repo="data",
        repo_type="dataset",
        token=oauth_token,
        commit_message="Migrated from GitHub",
        ignore_patterns=[
            "*.git*",
            "*.DS_Store",
            "*.env",
        ],
    )
def push_to_hf(
    source_github_repository,
    destination_hf_hub_repository,
    subdirectory,
    oauth_token: gr.OAuthToken,
):
    """Clone a GitHub repository and push it to a Hugging Face dataset repo.

    Args:
        source_github_repository: URL of the GitHub repository to migrate.
        destination_hf_hub_repository: Target Hub repo in ``user/name`` form.
        subdirectory: Selection from the folder dropdown (a list because the
            dropdown is multiselect); only the first entry is used, and an
            empty/None selection migrates the whole repository.
        oauth_token: Login token injected by Gradio's OAuth integration.

    Returns:
        A Markdown string linking to the migrated dataset.
    """
    gr.Info("Cloning source GitHub repository...")
    repo, temporary_directory = clone_into_temp_dir(source_github_repository)
    gr.Info("Cloning source GitHub repository...Done")
    try:
        gr.Info("Syncing with Hugging Face Hub...")
        if subdirectory:
            src_directory = f"{repo.working_dir}/{subdirectory[0]}"
        else:
            src_directory = repo.working_dir
        upload_directory_to_hf(
            repo_id=destination_hf_hub_repository,
            directory=src_directory,
            oauth_token=oauth_token.token,
        )
        gr.Info("Syncing with Hugging Face Hub...Done")
    finally:
        # Remove the clone even when the upload fails, so the temp dir
        # doesn't leak on error.
        temporary_directory.cleanup()
    return f"Pushed the dataset to [{destination_hf_hub_repository}](https://huggingface.co/datasets/{destination_hf_hub_repository})"
def extract_user_name_and_repo_from_url(github_url: str) -> Optional[tuple]:
    """Extract the ``(owner, repository)`` pair from a GitHub URL.

    A trailing ``.git`` suffix (clone-style URLs) is stripped so the result
    can be used directly against the GitHub REST API.

    Returns:
        ``(owner, repo)`` on success, or ``None`` when the URL does not look
        like a GitHub repository URL.
    """
    pattern = r"https://github.com/([^/]+)/([^/]+)"
    if match := re.search(pattern, github_url):
        # Strip ".git" so clone URLs resolve to the actual repo name.
        return match[1], match[2].removesuffix(".git")
    print("No match found in the GitHub URL.")
    return None
def get_files_and_directories(response):
    """Split a GitHub git-trees API response into file and directory paths.

    Args:
        response: An HTTP response whose ``.json()`` payload contains a
            ``"tree"`` list of items with ``"type"`` (``"blob"`` for files,
            ``"tree"`` for directories) and ``"path"`` keys.

    Returns:
        ``{"files": [...], "directories": [...]}`` with path strings.
    """
    tree = response.json()["tree"]
    # Plain comprehensions replace the previous toolz.groupby: same output,
    # no third-party dependency inside the function.
    files = [item["path"] for item in tree if item["type"] == "blob"]
    directories = [item["path"] for item in tree if item["type"] == "tree"]
    return {"files": files, "directories": directories}
@lru_cache(maxsize=128)
def list_git_repo_files_and_directories(repo_url: str, branch: str = "main"):
    """List top-level files and directories of a GitHub repo branch.

    Results are cached (up to 128 URL/branch pairs) to avoid repeated API
    calls while the user edits the URL textbox.

    Returns:
        ``{"files": [...], "directories": [...]}`` on success, or ``None``
        when the URL cannot be parsed or the API call does not return 200.
    """
    user_name_and_repo = extract_user_name_and_repo_from_url(repo_url)
    if user_name_and_repo is None:
        return None
    user_name, repo_name = user_name_and_repo
    url = f"https://api.github.com/repos/{user_name}/{repo_name}/git/trees/{branch}"
    response = client.get(url)
    if response.status_code == 200:
        return get_files_and_directories(response)
    # Non-200 (unknown branch, private repo, rate limit): make the None
    # return explicit rather than falling off the end of the function.
    return None
def show_files_and_directories(url: str):
    """Populate the folder/file dropdowns for the GitHub repo at ``url``.

    Any exception (partial URL while the user is typing, network failure,
    listing returned ``None``) is deliberately suppressed; in that case the
    function returns ``None`` and the dropdowns are not updated.
    """
    with contextlib.suppress(Exception):
        files_and_directories = list_git_repo_files_and_directories(url)
        directories = files_and_directories.get("directories", [])
        files = files_and_directories.get("files", [])
        # Removed leftover debug print of the directory list.
        return gr.Dropdown(
            label="Directories",
            choices=directories,
            max_choices=1,
            visible=True,
            interactive=True,
            multiselect=True,
        ), gr.Dropdown(
            label="Files",
            choices=files,
            max_choices=None,
            visible=True,
            interactive=True,
            multiselect=True,
        )
def push_kaggle_to_hf(
    source_kaggle_dataset: str,
    destination_hf_hub_repository: str,
    file_path: str,
    oauth_token: gr.OAuthToken,
):
    """Migrate one file from a Kaggle dataset to the Hugging Face Hub.

    Loads ``file_path`` from the Kaggle dataset via kagglehub's Hugging Face
    adapter, then pushes the resulting dataset to the destination repo using
    the logged-in user's OAuth token. Returns a Markdown link to the result.
    """
    if not file_path:
        raise ValueError("File path must be specified for Kaggle datasets")
    gr.Info("Loading Kaggle dataset...")
    hf_dataset = kagglehub.load_dataset(
        KaggleDatasetAdapter.HUGGING_FACE,
        source_kaggle_dataset,
        file_path,
    )
    gr.Info("Loading Kaggle dataset...Done")
    gr.Info("Pushing to Hugging Face Hub...")
    hf_dataset.push_to_hub(destination_hf_hub_repository, token=oauth_token.token)
    gr.Info("Pushing to Hugging Face Hub...Done")
    return f"Pushed the dataset to [{destination_hf_hub_repository}](https://huggingface.co/datasets/{destination_hf_hub_repository})"
# Intro copy for the app (raw HTML).
# NOTE(review): this constant is not referenced anywhere in the UI code
# below — confirm whether it should be rendered (e.g. via gr.HTML) or removed.
html_text_app_description = """
While GitHub and Kaggle are great platforms, the Hugging Face Datasets Hub is a better place to host and share datasets.
Some of the benefits of hosting datasets on the Hugging Face Datasets Hub are:
<br>
<ul>
<li>Hosting for large datasets</li>
<li>An interactive preview of your dataset</li>
<li>Access to the dataset via many tools and libraries including; datasets, pandas, polars, dask and DuckDB</li>
<li>Seamless integration with machine learning workflows</li>
<li>Version control and dataset versioning</li>
</ul>
<br>
This app will help you migrate datasets currently hosted on GitHub or Kaggle to the Hugging Face Datasets Hub.
"""
# --- Gradio UI --------------------------------------------------------------
# Two tabs: "GitHub" (clone a repo/subfolder and upload it) and "Kaggle"
# (load a single file via kagglehub and push it as a dataset).
with gr.Blocks(theme=gr.themes.Base()) as demo:
    gr.HTML(
        """<h1 style='text-align: center;'> Dataset Migration Tool</h1>
        <center><i> &#x2728; Migrate datasets to Hugging Face Hub in a few steps &#x2728;</i></center>"""
    )
    with gr.Row():
        # Login is required: the click handlers declare a gr.OAuthToken
        # parameter, which Gradio fills in from this login session.
        gr.LoginButton(size="sm")
    with gr.Tabs() as tabs:
        with gr.Tab("GitHub"):
            gr.Markdown("### Location of existing dataset")
            gr.Markdown(
                "URL for the GitHub repository where the dataset is currently hosted"
            )
            source_github_repository = gr.Textbox(
                lines=1, label="Source GitHub Repository URL"
            )
            with gr.Accordion("Advanced Options", open=False):
                gr.Markdown("### Select files and folder to migrate")
                gr.Markdown(
                    "(Optional): select a specific folder and/or files to migrate from the GitHub repository. If you select a folder all the files in that folder will be migrated."
                )
                # Both dropdowns start empty; show_files_and_directories
                # fills their choices from the GitHub trees API.
                folder_in_github_repo = gr.Dropdown(
                    None,
                    label="Folder in the GitHub Repository to migrate",
                    allow_custom_value=True,
                    visible=True,
                )
                files_in_github_repo = gr.Dropdown(
                    None,
                    label="Files in GitHub Repository to migrate",
                    allow_custom_value=True,
                    visible=True,
                )
            # Repopulate the dropdowns whenever the repo URL changes.
            source_github_repository.change(
                show_files_and_directories,
                [source_github_repository],
                [folder_in_github_repo, files_in_github_repo],
            )
            gr.Markdown("### Destination for your migrated dataset")
            destination_hf_hub_repository = gr.Textbox(
                label="Destination Hugging Face Repository",
                placeholder="i.e. <hugging face username>/<repository_name>",
            )
            github_submit_btn = gr.Button("Migrate GitHub Dataset")
            github_result = gr.Markdown(label="Summary", visible=True)
            # push_to_hf's oauth_token parameter is injected by Gradio's
            # OAuth integration and is deliberately not listed as an input.
            github_submit_btn.click(
                push_to_hf,
                [
                    source_github_repository,
                    destination_hf_hub_repository,
                    folder_in_github_repo,
                ],
                [github_result],
            )
        with gr.Tab("Kaggle"):
            gr.Markdown("### Source Kaggle Dataset")
            gr.Markdown("Enter the Kaggle dataset name and file path")
            source_kaggle_dataset = gr.Textbox(
                lines=1,
                label="Source Kaggle Dataset",
                placeholder="username/dataset-name",
            )
            kaggle_file_path = gr.Textbox(
                label="File path in dataset",
                placeholder="e.g., train.csv",
                info="Specify the file to migrate from the dataset",
            )
            gr.Markdown("### Destination for your migrated dataset")
            kaggle_destination_hf_hub = gr.Textbox(
                label="Destination Hugging Face Repository",
                placeholder="i.e. <hugging face username>/<repository_name>",
            )
            kaggle_submit_btn = gr.Button("Migrate Kaggle Dataset")
            kaggle_result = gr.Markdown(label="Summary", visible=True)
            # As above, the oauth_token parameter is injected by Gradio.
            kaggle_submit_btn.click(
                push_kaggle_to_hf,
                [
                    source_kaggle_dataset,
                    kaggle_destination_hf_hub,
                    kaggle_file_path,
                ],
                [kaggle_result],
            )
    gr.Markdown(
        """You should add a dataset card for your dataset to help people discover and understand your dataset. You can find instructions for creating a dataset card [here](https://huggingface.co/docs/datasets/dataset_card).
        If you have any questions or feedback feel free to reach out to us on using the [Discussion tab](https://huggingface.co/spaces/librarian-bots/github-to-huggingface-dataset-migration-tool/discussions/1)"""
    )

demo.launch()