#!/usr/bin/env python
"""
Upload Data Files to HuggingFace
This script uploads key data files to a private HuggingFace dataset repository:
1. all_sources_data.jsonl - The raw document data
2. all_sources_contextual_nodes.pkl - The processed nodes with added context
This is useful for new team members who need the latest version of the data.
Usage:
python upload_data_to_hf.py [--repo REPO_ID]
Arguments:
--repo REPO_ID HuggingFace dataset repository ID (default: towardsai-tutors/ai-tutor-data)
"""
import argparse
import os

from dotenv import load_dotenv
from huggingface_hub import HfApi
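
# Load a local .env file so HF_TOKEN (a token with write access to the target
# dataset repo) is available via os.getenv below.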
load_dotenv()


def upload_files_to_huggingface(repo_id="towardsai-tutors/ai-tutor-data"):
"""Upload data files to a private HuggingFace repository."""
# Main files to upload
files_to_upload = [
# Combined data and vector store
"data/all_sources_data.jsonl",
"data/all_sources_contextual_nodes.pkl",
# Individual source files
"data/transformers_data.jsonl",
"data/peft_data.jsonl",
"data/trl_data.jsonl",
"data/llama_index_data.jsonl",
"data/langchain_data.jsonl",
"data/openai_cookbooks_data.jsonl",
# Course files
"data/tai_blog_data.jsonl",
"data/8-hour_primer_data.jsonl",
"data/llm_developer_data.jsonl",
"data/python_primer_data.jsonl",
]

    # Filter to only include files that exist
    existing_files = []
    missing_files = []
    for file_path in files_to_upload:
        if os.path.exists(file_path):
            existing_files.append(file_path)
        else:
            missing_files.append(file_path)

    # Critical files must exist; abort if either is missing
    critical_files = [
        "data/all_sources_data.jsonl",
        "data/all_sources_contextual_nodes.pkl",
    ]
    critical_missing = [f for f in critical_files if f in missing_files]
    if critical_missing:
        print(
            f"Error: The following critical files were not found: {', '.join(critical_missing)}"
        )
        return False

    if missing_files:
        print(
            f"Warning: The following files were not found and will not be uploaded: {', '.join(missing_files)}"
        )
        print("This is normal if you're only updating certain sources.")

    try:
        api = HfApi(token=os.getenv("HF_TOKEN"))

        # Check that the repository exists; ask the user to create it if not
        try:
            api.repo_info(repo_id=repo_id, repo_type="dataset")
            print(f"Repository {repo_id} exists")
        except Exception:
            print(
                f"Repository {repo_id} doesn't exist. Please create it first on the HuggingFace platform."
            )
            print("Make sure to set it as private if needed.")
            return False

        # Upload all existing files
        for file_path in existing_files:
            try:
                file_name = os.path.basename(file_path)
                print(f"Uploading {file_name}...")
                api.upload_file(
                    path_or_fileobj=file_path,
                    path_in_repo=file_name,
                    repo_id=repo_id,
                    repo_type="dataset",
                )
                print(
                    f"Successfully uploaded {file_name} to HuggingFace repository {repo_id}"
                )
            except Exception as e:
                print(f"Error uploading {file_path}: {e}")
                # Continue with other files even if one fails

        return True
    except Exception as e:
        print(f"Error uploading files: {e}")
        return False


def main():
    parser = argparse.ArgumentParser(description="Upload Data Files to HuggingFace")
    parser.add_argument(
        "--repo",
        default="towardsai-tutors/ai-tutor-data",
        help="HuggingFace dataset repository ID",
    )
    args = parser.parse_args()
    upload_files_to_huggingface(args.repo)


if __name__ == "__main__":
    main()
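

# Example of programmatic use (a sketch; "my-org/ai-tutor-data-fork" is a
# hypothetical repo ID, not one from this project):
#
#   from upload_data_to_hf import upload_files_to_huggingface
#   upload_files_to_huggingface(repo_id="my-org/ai-tutor-data-fork")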