import os
import time
from multiprocessing import Pool

from huggingface_hub import Repository
from tqdm import tqdm

def save_shard(shard_tuple):
    """Save a single (filename, shard) pair as a parquet file."""
    filename, shard = shard_tuple
    # use shard.to_json(filename) instead to save the shard as a JSON file
    shard.to_parquet(filename)

def save_manual_shards(ds, user="loubnabnl", remote_dataset_repo="bigcode-pii-pjj"):
    """Save a dataset in manual shards inside a local clone of a Hub dataset repo.

    Args:
        ds (Dataset): dataset to be saved
        user (str): Hub user (or organization) that owns the dataset repository
        remote_dataset_repo (str): name of the remote dataset repository;
            also used as the local output path for the shards
    """
    # this will create a folder out_path that is a clone of remote_dataset_repo;
    # you can save the shards inside it and do git add/commit/push to push the data to the hub
    out_path = remote_dataset_repo
    # clone the remote repository only if out_path doesn't already exist
    if not os.path.exists(out_path):
        Repository(
            local_dir=out_path,
            clone_from=f"{user}/{remote_dataset_repo}",
            repo_type="dataset",
            private=True,
            use_auth_token=True,
            git_user=user,
        )
    # the files will be numerous, so we save them in a folder called data inside out_path
    os.makedirs(f"{out_path}/data", exist_ok=True)
    # target shard size: 1000 MiB (1000 * 2**20 bytes)
    SHARD_SIZE = 1000 << 20
    if ds._indices is not None:
        # the dataset is a view over its Arrow table (e.g. after select/filter),
        # so scale the table size by the fraction of rows the view actually uses
        dataset_nbytes = ds.data.nbytes * len(ds._indices) / len(ds.data)
    else:
        dataset_nbytes = ds.data.nbytes
    num_shards = int(dataset_nbytes / SHARD_SIZE) + 1
print(f"Number of shards: {num_shards}") | |
print("sharding the dataset") | |
t_start = time.time() | |
shards = (ds.shard(num_shards=num_shards, index=i, contiguous=True) for i in range(num_shards)) | |
# use f"{OUT_PATH}/data/train-{index:05d}-of-{num_shards:05d}.json" instead for json files | |
filenames = (f"{out_path}/data/train-{index:05d}-of-{num_shards:05d}.parquet" for index in range(num_shards)) | |
with Pool(16) as p: | |
list(tqdm(p.imap_unordered(save_shard, zip(filenames, shards), chunksize=4), total=num_shards)) | |
print(f"Time to save dataset: {time.time()-t_start:.2f}") | |
# to push dataset to hub do: git add/commit/push inside OUT_PATH |
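

# A minimal usage sketch, assuming a datasets.Dataset loaded from the Hub;
# the dataset id below is a placeholder, not part of the original script.
if __name__ == "__main__":
    from datasets import load_dataset

    # hypothetical dataset id; substitute your own
    ds = load_dataset("user/my-dataset", split="train")
    save_manual_shards(ds, user="loubnabnl", remote_dataset_repo="bigcode-pii-pjj")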