#!/usr/bin/env python
"""
Upload Data Files to HuggingFace

This script uploads key data files to a private HuggingFace dataset repository:
1. all_sources_data.jsonl - The raw document data
2. all_sources_contextual_nodes.pkl - The processed nodes with added context

This is useful for new team members who need the latest version of the data.

Usage:
    python upload_data_to_hf.py [--repo REPO_ID]

Arguments:
    --repo REPO_ID     HuggingFace dataset repository ID (default: towardsai-tutors/ai-tutor-data)
"""

import argparse
import os

from dotenv import load_dotenv
from huggingface_hub import HfApi

# Read HF_TOKEN (and any other settings) from a local .env file, if present
load_dotenv()


def upload_files_to_huggingface(repo_id="towardsai-tutors/ai-tutor-data"):
    """Upload data files to a private HuggingFace repository."""
    # Main files to upload
    files_to_upload = [
        # Combined data and vector store
        "data/all_sources_data.jsonl",
        "data/all_sources_contextual_nodes.pkl",
        # Individual source files
        "data/transformers_data.jsonl",
        "data/peft_data.jsonl",
        "data/trl_data.jsonl",
        "data/llama_index_data.jsonl",
        "data/langchain_data.jsonl",
        "data/openai_cookbooks_data.jsonl",
        # Course files
        "data/tai_blog_data.jsonl",
        "data/8-hour_primer_data.jsonl",
        "data/llm_developer_data.jsonl",
        "data/python_primer_data.jsonl",
    ]

    # Filter to only include files that exist
    existing_files = []
    missing_files = []

    for file_path in files_to_upload:
        if os.path.exists(file_path):
            existing_files.append(file_path)
        else:
            missing_files.append(file_path)

    # Critical files must exist
    critical_files = [
        "data/all_sources_data.jsonl",
        "data/all_sources_contextual_nodes.pkl",
    ]
    critical_missing = [f for f in critical_files if f in missing_files]

    if critical_missing:
        print(
            f"Error: The following critical files were not found: {', '.join(critical_missing)}"
        )
        return False

    if missing_files:
        print(
            f"Warning: The following files were not found and will not be uploaded: {', '.join(missing_files)}"
        )
        print("This is normal if you're only updating certain sources.")

    try:
        api = HfApi(token=os.getenv("HF_TOKEN"))
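        # If HF_TOKEN is unset, huggingface_hub falls back to any locally
        # cached credentials (e.g. from `huggingface-cli login`).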

        # Check if repository exists, create if it doesn't
        try:
            api.repo_info(repo_id=repo_id, repo_type="dataset")
            print(f"Repository {repo_id} exists")
        except Exception:
            print(
                f"Repository {repo_id} doesn't exist. Please create it first on the HuggingFace platform."
            )
            print("Make sure to set it as private if needed.")
            return False

        # Upload all existing files
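        # Note: each upload_file call creates a separate commit;
        # HfApi.upload_folder could batch these, at the cost of the
        # per-file error handling below.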
        for file_path in existing_files:
            try:
                file_name = os.path.basename(file_path)
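                # basename drops the local "data/" prefix, so files land at
                # the root of the dataset repo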
                print(f"Uploading {file_name}...")

                api.upload_file(
                    path_or_fileobj=file_path,
                    path_in_repo=file_name,
                    repo_id=repo_id,
                    repo_type="dataset",
                )
                print(
                    f"Successfully uploaded {file_name} to HuggingFace repository {repo_id}"
                )
            except Exception as e:
                print(f"Error uploading {file_path}: {e}")
                # Continue with other files even if one fails

        return True
    except Exception as e:
        print(f"Error uploading files: {e}")
        return False


def main():
    parser = argparse.ArgumentParser(description="Upload Data Files to HuggingFace")
    parser.add_argument(
        "--repo",
        default="towardsai-tutors/ai-tutor-data",
        help="HuggingFace dataset repository ID",
    )

    args = parser.parse_args()
    success = upload_files_to_huggingface(args.repo)
    # Surface failure to the shell as a non-zero exit code
    raise SystemExit(0 if success else 1)


if __name__ == "__main__":
    main()