#!/usr/bin/env python
"""
Upload Data Files to HuggingFace
This script uploads key data files to a private HuggingFace dataset repository:
1. all_sources_data.jsonl - The raw document data
2. all_sources_contextual_nodes.pkl - The processed nodes with added context
This is useful for new team members who need the latest version of the data.
Usage:
python upload_data_to_hf.py [--repo REPO_ID]
Arguments:
--repo REPO_ID HuggingFace dataset repository ID (default: towardsai-tutors/ai-tutor-data)
"""
import argparse
import os

from dotenv import load_dotenv
from huggingface_hub import HfApi
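# Load environment variables (including HF_TOKEN used for authentication) from a local .env file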
load_dotenv()


def upload_files_to_huggingface(repo_id="towardsai-tutors/ai-tutor-data"):
"""Upload data files to a private HuggingFace repository."""
    # Main files to upload
    files_to_upload = [
        # Combined data and vector store
        "data/all_sources_data.jsonl",
        "data/all_sources_contextual_nodes.pkl",
        # Individual source files
        "data/transformers_data.jsonl",
        "data/peft_data.jsonl",
        "data/trl_data.jsonl",
        "data/llama_index_data.jsonl",
        "data/langchain_data.jsonl",
        "data/openai_cookbooks_data.jsonl",
        # Course files
        "data/tai_blog_data.jsonl",
        "data/8-hour_primer_data.jsonl",
        "data/llm_developer_data.jsonl",
        "data/python_primer_data.jsonl",
    ]
    # Filter to only include files that exist
    existing_files = []
    missing_files = []
    for file_path in files_to_upload:
        if os.path.exists(file_path):
            existing_files.append(file_path)
        else:
            missing_files.append(file_path)
    # Critical files that should normally be present; warn loudly if they are missing,
    # but keep going so partial updates are still possible.
    critical_files = [
        "data/all_sources_data.jsonl",
        "data/all_sources_contextual_nodes.pkl",
    ]
    critical_missing = [f for f in critical_files if f in missing_files]
    if critical_missing:
        print(
            f"Error: The following critical files were not found: {', '.join(critical_missing)}"
        )
        # return False
    if missing_files:
        print(
            f"Warning: The following files were not found and will not be uploaded: {', '.join(missing_files)}"
        )
        print("This is normal if you're only updating certain sources.")
    try:
        api = HfApi(token=os.getenv("HF_TOKEN"))

        # Check that the repository exists; if not, ask the user to create it manually
        try:
            api.repo_info(repo_id=repo_id, repo_type="dataset")
            print(f"Repository {repo_id} exists")
        except Exception:
            print(
                f"Repository {repo_id} doesn't exist. Please create it first on the HuggingFace platform."
            )
            print("Make sure to set it as private if needed.")
            return False
        # Upload all existing files
        for file_path in existing_files:
            try:
                file_name = os.path.basename(file_path)
                print(f"Uploading {file_name}...")
                api.upload_file(
                    path_or_fileobj=file_path,
                    path_in_repo=file_name,
                    repo_id=repo_id,
                    repo_type="dataset",
                )
                print(
                    f"Successfully uploaded {file_name} to HuggingFace repository {repo_id}"
                )
            except Exception as e:
                print(f"Error uploading {file_path}: {e}")
                # Continue with other files even if one fails

        return True
    except Exception as e:
        print(f"Error uploading files: {e}")
        return False


def main():
    parser = argparse.ArgumentParser(description="Upload Data Files to HuggingFace")
    parser.add_argument(
        "--repo",
        default="towardsai-tutors/ai-tutor-data",
        help="HuggingFace dataset repository ID",
    )
    args = parser.parse_args()
    upload_files_to_huggingface(args.repo)


if __name__ == "__main__":
    main()