"""
Upload Data Files to HuggingFace

This script uploads key data files to a private HuggingFace dataset repository:

1. all_sources_data.jsonl - The raw document data
2. all_sources_contextual_nodes.pkl - The processed nodes with added context

This is useful for new team members who need the latest version of the data.

Usage:
    python upload_data_to_hf.py [--repo REPO_ID]

Arguments:
    --repo REPO_ID    HuggingFace dataset repository ID (default: towardsai-tutors/ai-tutor-data)
"""

import argparse
import os

from dotenv import load_dotenv
from huggingface_hub import HfApi
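
# Load environment variables (including HF_TOKEN) from a local .env file.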
load_dotenv()


def upload_files_to_huggingface(repo_id="towardsai-tutors/ai-tutor-data"):
    """Upload data files to a private HuggingFace repository."""
    files_to_upload = [
        "data/all_sources_data.jsonl",
        "data/all_sources_contextual_nodes.pkl",

        "data/transformers_data.jsonl",
        "data/peft_data.jsonl",
        "data/trl_data.jsonl",
        "data/llama_index_data.jsonl",
        "data/langchain_data.jsonl",
        "data/openai_cookbooks_data.jsonl",

        "data/tai_blog_data.jsonl",
        "data/8-hour_primer_data.jsonl",
        "data/llm_developer_data.jsonl",
        "data/python_primer_data.jsonl",
    ]
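
    # Determine which of the requested files exist locally and which are missing.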
    existing_files = []
    missing_files = []

    for file_path in files_to_upload:
        if os.path.exists(file_path):
            existing_files.append(file_path)
        else:
            missing_files.append(file_path)
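
    # The combined files are critical; abort if either one is missing.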
    critical_files = [
        "data/all_sources_data.jsonl",
        "data/all_sources_contextual_nodes.pkl",
    ]
    critical_missing = [f for f in critical_files if f in missing_files]

    if critical_missing:
        print(
            f"Error: The following critical files were not found: {', '.join(critical_missing)}"
        )
        return False

    if missing_files:
        print(
            f"Warning: The following files were not found and will not be uploaded: {', '.join(missing_files)}"
        )
        print("This is normal if you're only updating certain sources.")
    try:
        api = HfApi(token=os.getenv("HF_TOKEN"))
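
        # Make sure the target dataset repository already exists.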
        try:
            api.repo_info(repo_id=repo_id, repo_type="dataset")
            print(f"Repository {repo_id} exists")
        except Exception:
            print(
                f"Repository {repo_id} doesn't exist. Please create it first on the HuggingFace platform."
            )
            print("Make sure to set it as private if needed.")
            return False
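
        # Upload each available file to the root of the dataset repository.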
        for file_path in existing_files:
            try:
                file_name = os.path.basename(file_path)
                print(f"Uploading {file_name}...")

                api.upload_file(
                    path_or_fileobj=file_path,
                    path_in_repo=file_name,
                    repo_id=repo_id,
                    repo_type="dataset",
                )
                print(
                    f"Successfully uploaded {file_name} to HuggingFace repository {repo_id}"
                )
            except Exception as e:
                print(f"Error uploading {file_path}: {e}")

        return True
    except Exception as e:
        print(f"Error uploading files: {e}")
        return False


def main():
    parser = argparse.ArgumentParser(description="Upload Data Files to HuggingFace")
    parser.add_argument(
        "--repo",
        default="towardsai-tutors/ai-tutor-data",
        help="HuggingFace dataset repository ID",
    )

    args = parser.parse_args()
    upload_files_to_huggingface(args.repo)


if __name__ == "__main__":
    main()