Spaces:
Sleeping
Sleeping
import contextlib | |
import re | |
import tempfile | |
from functools import lru_cache | |
import gradio as gr | |
from git import Repo | |
from httpx import Client | |
from typing import Optional | |
from huggingface_hub import create_repo, upload_folder | |
from toolz import groupby | |
# Shared module-level HTTP client; list_git_repo_files_and_directories uses it
# to query the GitHub API.
client = Client()
def clone_into_temp_dir(github_repo_url):
    """Clone a GitHub repository into a fresh temporary directory.

    Args:
        github_repo_url: URL of the repository to clone.

    Returns:
        A ``(repo, temp_dir)`` tuple: the object returned by
        ``Repo.clone_from`` and the ``tempfile.TemporaryDirectory`` holding
        the checkout. The caller is responsible for calling
        ``temp_dir.cleanup()`` once it is done with the checkout.

    Raises:
        Whatever ``Repo.clone_from`` raises; the temporary directory is
        removed before the exception propagates.
    """
    temp_dir = tempfile.TemporaryDirectory()
    try:
        # Pass the directory *path*: the TemporaryDirectory object itself is
        # not a path, so handing it over would not clone into the temp dir.
        repo = Repo.clone_from(github_repo_url, temp_dir.name)
    except Exception:
        # Do not leave an orphaned temp directory behind on a failed clone.
        temp_dir.cleanup()
        raise
    return repo, temp_dir
# repo = clone_into_temp_dir("https://github.com/chen-zichen/XplainLLM_dataset/") | |
# clone_into_temp_dir("https://github.com/chen-zichen/XplainLLM_dataset/") | |
def upload_directory_to_hf(
    repo_id: str,
    directory: str,
    oauth_token: str,
):
    """Upload a local directory to a public dataset repository on the Hub.

    The dataset repository is created first if it does not already exist
    (``exist_ok=True``), then the contents of ``directory`` are uploaded
    under the ``data`` path of that repository in a single commit.

    Args:
        repo_id: Destination repository id passed to ``create_repo`` and
            ``upload_folder``.
        directory: Local folder whose contents are uploaded.
        oauth_token: Token used to authenticate both Hub calls.

    Returns:
        The value returned by ``upload_folder`` for the upload commit
        (previously discarded).
    """
    create_repo(
        repo_id,
        token=oauth_token,
        exist_ok=True,
        repo_type="dataset",
        private=False,
    )
    return upload_folder(
        repo_id=repo_id,
        folder_path=directory,
        path_in_repo="data",
        repo_type="dataset",
        token=oauth_token,
        commit_message="Migrated from GitHub",
        # Skip git metadata, READMEs, macOS metadata and .env files.
        ignore_patterns=[
            "*.git*",
            "*README.md*",
            "*.DS_Store",
            "*.env",
        ],
    )
def push_to_hf(
    source_github_repository,
    destination_hf_hub_repository,
    subdirectory,
    oauth_token: gr.OAuthToken,
):
    """Clone a GitHub repository and upload it to a Hub dataset repository.

    Args:
        source_github_repository: URL of the GitHub repository to clone.
        destination_hf_hub_repository: Destination dataset repository id.
        subdirectory: Optional folder inside the clone to upload. May be a
            plain string or a list/tuple (only the first entry is used);
            a falsy value uploads the whole working tree.
        oauth_token: OAuth token object; its ``token`` attribute is used to
            authenticate the upload.

    Returns:
        A Markdown string linking to the destination dataset.
    """
    gr.Info("Cloning source GitHub repository...")
    repo, temporary_directory = clone_into_temp_dir(source_github_repository)
    try:
        gr.Info("Cloning source GitHub repository...Done")
        gr.Info("Syncing with Hugging Face Hub...")
        # The folder dropdown yields a list once it is in multiselect mode but
        # a plain string before that; indexing a string with [0] would keep
        # only its first character, so normalise both shapes here.
        if isinstance(subdirectory, (list, tuple)):
            subdirectory = subdirectory[0] if subdirectory else None
        if subdirectory:
            src_directory = f"{repo.working_dir}/{subdirectory}"
        else:
            src_directory = repo.working_dir
        upload_directory_to_hf(
            repo_id=destination_hf_hub_repository,
            directory=src_directory,
            oauth_token=oauth_token.token,
        )
        gr.Info("Syncing with Hugging Face Hub...Done")
    finally:
        # Always remove the clone, including when the upload fails.
        temporary_directory.cleanup()
    return f"Pushed the dataset to [{destination_hf_hub_repository}](https://huggingface.co/datasets/{destination_hf_hub_repository})"
def extract_user_name_and_repo_from_url(github_url: str):
    """Extract the owner and repository name from a GitHub URL.

    Anything after the repository segment (sub-paths, query string,
    fragment) is ignored, and a trailing ``.git`` clone suffix is removed
    from the repository name.

    Args:
        github_url: Text containing an ``https://github.com/<owner>/<repo>``
            URL. ``None`` or an empty string is treated as "no match".

    Returns:
        A ``(owner, repo)`` tuple of strings, or ``None`` when no GitHub
        repository URL is found.
    """
    # Dots are escaped so that only the literal host "github.com" matches;
    # the repo segment stops at "/", "?", "#" or whitespace.
    pattern = r"https://github\.com/([^/\s]+)/([^/\s?#]+)"
    if match := re.search(pattern, github_url or ""):
        user_name, repo_name = match[1], match[2]
        if repo_name.endswith(".git"):
            repo_name = repo_name[: -len(".git")]
        if repo_name:
            return user_name, repo_name
    print("No match found in the GitHub URL.")
    return None
def get_files_and_directories(response):
    """Split a tree listing response into file paths and directory paths.

    Args:
        response: Object whose ``json()`` returns a mapping with a ``"tree"``
            list; each entry is read for its ``"type"`` and ``"path"`` keys.

    Returns:
        A dict with ``"files"`` (paths of entries typed ``"blob"``) and
        ``"directories"`` (paths of entries typed ``"tree"``); either list
        is empty when no entry of that type is present.
    """
    tree_entries = response.json()["tree"]
    by_kind = groupby(lambda entry: entry["type"], tree_entries)
    return {
        "files": [blob["path"] for blob in by_kind.get("blob", [])],
        "directories": [subtree["path"] for subtree in by_kind.get("tree", [])],
    }
def list_git_repo_files_and_directories(repo_url: str, branch: str = "main"):
    """List the top-level files and directories of a GitHub repository.

    Args:
        repo_url: GitHub repository URL; parsed with
            ``extract_user_name_and_repo_from_url``.
        branch: Name placed in the ``git/trees/<branch>`` API path.

    Returns:
        The dict produced by ``get_files_and_directories`` when the API
        answers with status 200; ``None`` when the URL cannot be parsed or
        the API returns any other status.
    """
    owner_and_repo = extract_user_name_and_repo_from_url(repo_url)
    if owner_and_repo is None:
        return None
    owner, repository = owner_and_repo
    tree_endpoint = (
        f"https://api.github.com/repos/{owner}/{repository}/git/trees/{branch}"
    )
    api_response = client.get(tree_endpoint)
    if api_response.status_code != 200:
        return None
    return get_files_and_directories(api_response)
def show_files_and_directories(url: str):
    """Build the folder and file dropdowns for a GitHub repository URL.

    Listing the repository is best-effort: if the lookup raises or yields
    nothing (unparseable URL, non-200 API answer), both dropdowns are
    returned with empty choices so the UI callback always produces its two
    outputs.

    Args:
        url: GitHub repository URL entered by the user.

    Returns:
        A ``(directories_dropdown, files_dropdown)`` tuple of
        ``gr.Dropdown`` components.
    """
    try:
        listing = list_git_repo_files_and_directories(url)
    except Exception:
        # Deliberately broad: the field changes while the user is typing, so
        # any lookup failure just means "nothing to offer yet".
        listing = None
    listing = listing or {}
    directories = listing.get("directories", [])
    files = listing.get("files", [])
    return (
        gr.Dropdown(
            label="Directories",
            choices=directories,
            max_choices=1,
            visible=True,
            interactive=True,
            multiselect=True,
        ),
        gr.Dropdown(
            label="Files",
            choices=files,
            max_choices=None,
            visible=True,
            interactive=True,
            multiselect=True,
        ),
    )
# HTML blurb describing why datasets are better hosted on the Hugging Face Hub.
# NOTE(review): not referenced anywhere in the visible code -- confirm whether
# it should be rendered in the UI or removed.
html_text_app_description = """
Whilst GitHub is great for hosting code the Hugging Face Datasets Hub is a better place to host datasets.
Some of the benefits of hosting datasets on the Hugging Face Datasets Hub are:
<br>
<ul>
<li>Hosting for large datasets</li>
<li>An interactive preview of your dataset</li>
<li>Access to the dataset via many tools and libraries including; datasets, pandas, polars, dask and DuckDB</li>
</ul>
<br>
This app will help you migrate a dataset currently hosted on GitHub to the Hugging Face Datasets Hub.
"""
# UI definition: a single-page Gradio app that collects the source GitHub URL,
# an optional folder/file selection and the destination Hub repository, then
# runs push_to_hf when the button is clicked.
with gr.Blocks(theme=gr.themes.Base()) as demo:
    gr.HTML(
        """<h1 style='text-align: center;'> GitHub to Hugging Face Hub Dataset Migration Tool</h1>
<center><i> &#x2728; Migrate a dataset in a few steps &#x2728;</i></center>"""
    )
    gr.HTML(
        """<center> GitHub is a great place for sharing code but the Hugging Face Hub has many advantages for sharing datasets.
<br> This Space will guide you through the process of migrating a dataset from GitHub to the Hugging Face Hub. </center>"""
    )
    # Login/logout buttons; push_to_hf receives the user's token through its
    # gr.OAuthToken-annotated parameter rather than through an input widget.
    with gr.Row():
        gr.LoginButton(size="sm")
        gr.LogoutButton(size="sm")
    gr.Markdown("### Location of existing dataset")
    gr.Markdown("URL for the GitHub repository where the dataset is currently hosted")
    source_github_repository = gr.Textbox(lines=1, label="Source GitHub Repository URL")
    with gr.Accordion("Advanced Options", open=False):
        gr.Markdown("### Select files and folder to migrate")
        gr.Markdown(
            "(Optional): select a specific folder and/or files to migrate from the GitHub repository. If you select a folder all the files in that folder will be migrated."
        )
        folder_in_github_repo = gr.Dropdown(
            None,
            label="Folder in the GitHub Repository to migrate",
            allow_custom_value=True,
            visible=True,
        )
        files_in_github_repo = gr.Dropdown(
            None,
            label="Files in GitHub Repository to migrate",
            allow_custom_value=True,
            visible=True,
        )
    # Refresh both dropdowns whenever the source URL changes.
    source_github_repository.change(
        show_files_and_directories,
        [source_github_repository],
        [folder_in_github_repo, files_in_github_repo],
    )
    gr.Markdown("### Destination for your migrated dataset")
    gr.Markdown("Destination repository for your dataset on the Hugging Face Hub")
    destination_hf_hub_repository = gr.Textbox(
        label="Destination Hugging Face Repository",
        placeholder="i.e. <hugging face username>/<repository_name>",
    )
    # gr.Markdown("## Authentication")
    # gr.Markdown(
    #     """You need to provide a token with write access to the namespace you want to upload to.
    #     You can generate/access your Hugging Face token from [here](https://huggingface.co/settings/token)."""
    # )
    # hf_token = gr.Textbox(label="Hugging Face Token", type="password")
    summit_btn = gr.Button("Migrate Dataset")
    result = gr.Markdown(label="Summary", visible=True)
    # Only the three visible inputs are wired here; the files dropdown is not
    # passed to push_to_hf.
    summit_btn.click(
        push_to_hf,
        [
            source_github_repository,
            destination_hf_hub_repository,
            folder_in_github_repo,
        ],
        [result],
    )
    # Footer link: the original Markdown was missing the "(" after the link
    # text, so it rendered as plain text instead of a link.
    gr.Markdown(
        "If you have any questions or feedback feel free to reach out to us using the [Discussion tab](https://huggingface.co/spaces/librarian-bots/github-to-huggingface-dataset-migration-tool/discussions/1)"
    )
demo.launch()