omarsol committed on
Commit
0b1b256
·
1 Parent(s): 4ecfe75

add course workflow and update data to hf scripts

data/scraping_scripts/add_course_workflow.py ADDED
@@ -0,0 +1,541 @@
+ #!/usr/bin/env python
+ """
+ AI Tutor App - Course Addition Workflow
+
+ This script guides you through the complete process of adding a new course to the AI Tutor App:
+
+ 1. Process course markdown files to create JSONL data
+ 2. MANDATORY MANUAL STEP: Add URLs to course content in the generated JSONL
+ 3. Merge course JSONL into all_sources_data.jsonl
+ 4. Add contextual information to document nodes
+ 5. Create vector stores
+ 6. Upload databases to HuggingFace
+ 7. Update UI configuration
+
+ Usage:
+     python add_course_workflow.py --course [COURSE_NAME]
+
+ Additional flags to run specific steps (if you want to restart from a specific point):
+     --skip-process-md       Skip the markdown processing step
+     --skip-merge            Skip merging into all_sources_data.jsonl
+     --process-all-context   Process all content when adding context (default: only new content)
+     --skip-context          Skip the context addition step entirely
+     --skip-vectors          Skip vector store creation
+     --skip-upload           Skip uploading to HuggingFace
+     --skip-ui-update        Skip updating the UI configuration
+     --skip-data-upload      Skip uploading data files to the private HuggingFace repo
+ """
+
+ import argparse
+ import json
+ import logging
+ import os
+ import pickle
+ import subprocess
+ import sys
+ import time
+ from pathlib import Path
+ from typing import Dict, List, Set
+
+ from dotenv import load_dotenv
+ from huggingface_hub import HfApi, hf_hub_download
+
+ # Load environment variables from .env file
+ load_dotenv()
+
+ # Configure logging
+ logging.basicConfig(
+     level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+ )
+ logger = logging.getLogger(__name__)
+
+
+ def ensure_required_files_exist():
+     """Download required data files from HuggingFace if they don't exist locally."""
+     # List of files to check and download
+     required_files = {
+         # Critical files
+         "data/all_sources_data.jsonl": "all_sources_data.jsonl",
+         "data/all_sources_contextual_nodes.pkl": "all_sources_contextual_nodes.pkl",
+         # Documentation source files
+         "data/transformers_data.jsonl": "transformers_data.jsonl",
+         "data/peft_data.jsonl": "peft_data.jsonl",
+         "data/trl_data.jsonl": "trl_data.jsonl",
+         "data/llama_index_data.jsonl": "llama_index_data.jsonl",
+         "data/langchain_data.jsonl": "langchain_data.jsonl",
+         "data/openai_cookbooks_data.jsonl": "openai_cookbooks_data.jsonl",
+         # Course files
+         "data/tai_blog_data.jsonl": "tai_blog_data.jsonl",
+         "data/8-hour_primer_data.jsonl": "8-hour_primer_data.jsonl",
+         "data/llm_developer_data.jsonl": "llm_developer_data.jsonl",
+         "data/python_primer_data.jsonl": "python_primer_data.jsonl",
+     }
+
+     # Critical files that must be downloaded
+     critical_files = [
+         "data/all_sources_data.jsonl",
+         "data/all_sources_contextual_nodes.pkl",
+     ]
+
+     # Check and download each file
+     for local_path, remote_filename in required_files.items():
+         if not os.path.exists(local_path):
+             logger.info(f"{remote_filename} not found. Attempting to download from HuggingFace...")
+             try:
+                 hf_hub_download(
+                     token=os.getenv("HF_TOKEN"),
+                     repo_id="towardsai-tutors/ai-tutor-data",
+                     filename=remote_filename,
+                     repo_type="dataset",
+                     local_dir="data",
+                 )
+                 logger.info(f"Successfully downloaded {remote_filename} from HuggingFace")
+             except Exception as e:
+                 logger.warning(f"Could not download {remote_filename}: {e}")
+
+                 # Only create an empty file for all_sources_data.jsonl if it's missing
+                 if local_path == "data/all_sources_data.jsonl":
+                     logger.warning("Creating a new all_sources_data.jsonl file. This will not include previously existing data.")
+                     with open(local_path, "w") as f:
+                         pass
+
+                 # If a critical file is missing, print a more serious warning
+                 if local_path in critical_files:
+                     logger.warning(f"Critical file {remote_filename} is missing. The workflow may not function correctly.")
+
+                     if local_path == "data/all_sources_contextual_nodes.pkl":
+                         logger.warning("The context addition step will process all documents since no existing contexts were found.")
+
+
+ def load_jsonl(file_path: str) -> List[Dict]:
+     """Load data from a JSONL file."""
+     data = []
+     with open(file_path, "r", encoding="utf-8") as f:
+         for line in f:
+             data.append(json.loads(line))
+     return data
+
+
+ def save_jsonl(data: List[Dict], file_path: str) -> None:
+     """Save data to a JSONL file."""
+     with open(file_path, "w", encoding="utf-8") as f:
+         for item in data:
+             json.dump(item, f, ensure_ascii=False)
+             f.write("\n")
+
+
+ def process_markdown_files(course_name: str) -> str:
+     """Process markdown files for a specific course. Returns path to output JSONL."""
+     logger.info(f"Processing markdown files for course: {course_name}")
+     cmd = ["python", "data/scraping_scripts/process_md_files.py", course_name]
+     result = subprocess.run(cmd)
+
+     if result.returncode != 0:
+         logger.error("Error processing markdown files - check output above")
+         sys.exit(1)
+
+     logger.info(f"Successfully processed markdown files for {course_name}")
+
+     # Determine the output file path from process_md_files.py
+     from data.scraping_scripts.process_md_files import SOURCE_CONFIGS
+
+     if course_name not in SOURCE_CONFIGS:
+         logger.error(f"Course {course_name} not found in SOURCE_CONFIGS")
+         sys.exit(1)
+
+     output_file = SOURCE_CONFIGS[course_name]["output_file"]
+     return output_file
+
+
+ def manual_url_addition(jsonl_path: str) -> None:
+     """Guide the user through manually adding URLs to the course JSONL."""
+     logger.info("=== MANDATORY MANUAL STEP: URL ADDITION ===")
+     logger.info(f"Please add the URLs to the course content in: {jsonl_path}")
+     logger.info("For each document in the JSONL file:")
+     logger.info("1. Open the file in a text editor")
+     logger.info("2. Find the empty 'url' field for each document")
+     logger.info("3. Add the appropriate URL from the live course platform")
+     logger.info("   Example URL format: https://academy.towardsai.net/courses/take/python-for-genai/multimedia/62515980-course-structure")
+     logger.info("4. Save the file when done")
+
+     # Check if URLs are present
+     data = load_jsonl(jsonl_path)
+     missing_urls = sum(1 for item in data if not item.get("url"))
+
+     if missing_urls > 0:
+         logger.warning(f"Found {missing_urls} documents without URLs in {jsonl_path}")
+
+         answer = input(
+             f"\n{missing_urls} documents are missing URLs. Have you added all the URLs? (yes/no): "
+         )
+         if answer.lower() not in ["yes", "y"]:
+             logger.info("Please add the URLs and run the script again.")
+             sys.exit(0)
+     else:
+         logger.info("All documents have URLs. Continuing with the workflow.")
+
+
+ def merge_into_all_sources(course_jsonl_path: str) -> None:
+     """Merge the course JSONL into all_sources_data.jsonl."""
+     all_sources_path = "data/all_sources_data.jsonl"
+     logger.info(f"Merging {course_jsonl_path} into {all_sources_path}")
+
+     # Load course data
+     course_data = load_jsonl(course_jsonl_path)
+
+     # Load existing all_sources data if it exists
+     all_data = []
+     if os.path.exists(all_sources_path):
+         all_data = load_jsonl(all_sources_path)
+
+     # Get doc_ids from existing data
+     existing_ids = {item["doc_id"] for item in all_data}
+
+     # Add new course data (avoiding duplicates)
+     new_items = 0
+     for item in course_data:
+         if item["doc_id"] not in existing_ids:
+             all_data.append(item)
+             existing_ids.add(item["doc_id"])
+             new_items += 1
+
+     # Save the combined data
+     save_jsonl(all_data, all_sources_path)
+     logger.info(f"Added {new_items} new documents to {all_sources_path}")
+
+
+ def get_processed_doc_ids() -> Set[str]:
+     """Get set of doc_ids that have already been processed with context."""
+     if not os.path.exists("data/all_sources_contextual_nodes.pkl"):
+         return set()
+
+     try:
+         with open("data/all_sources_contextual_nodes.pkl", "rb") as f:
+             nodes = pickle.load(f)
+         return {node.source_node.node_id for node in nodes}
+     except Exception as e:
+         logger.error(f"Error loading processed doc_ids: {e}")
+         return set()
+
+
+ def add_context_to_nodes(new_only: bool = False) -> None:
+     """Add context to document nodes, optionally processing only new content."""
+     logger.info("Adding context to document nodes")
+
+     if new_only:
+         # Load all documents
+         all_docs = load_jsonl("data/all_sources_data.jsonl")
+         processed_ids = get_processed_doc_ids()
+
+         # Filter for unprocessed documents
+         new_docs = [doc for doc in all_docs if doc["doc_id"] not in processed_ids]
+
+         if not new_docs:
+             logger.info("No new documents to process")
+             return
+
+         # Save temporary JSONL with only new documents
+         temp_file = "data/new_docs_temp.jsonl"
+         save_jsonl(new_docs, temp_file)
+
+         # Run an inline script that adds context only to the new documents and merges
+         # the results into the existing pickle of contextual nodes
+         cmd = [
+             "python",
+             "-c",
+             f"""
+ import asyncio
+ import os
+ import pickle
+ import json
+ from data.scraping_scripts.add_context_to_nodes import create_docs, process
+
+ async def main():
+     # First, get the list of sources being updated from the temp file
+     updated_sources = set()
+     with open("{temp_file}", "r") as f:
+         for line in f:
+             data = json.loads(line)
+             updated_sources.add(data["source"])
+
+     print(f"Updating nodes for sources: {{updated_sources}}")
+
+     # Process new documents
+     documents = create_docs("{temp_file}")
+     enhanced_nodes = await process(documents)
+     print(f"Generated context for {{len(enhanced_nodes)}} new nodes")
+
+     # Load existing nodes if they exist
+     existing_nodes = []
+     if os.path.exists("data/all_sources_contextual_nodes.pkl"):
+         with open("data/all_sources_contextual_nodes.pkl", "rb") as f:
+             existing_nodes = pickle.load(f)
+
+     # Filter out existing nodes for sources we're updating
+     filtered_nodes = []
+     removed_count = 0
+
+     for node in existing_nodes:
+         # Try to extract source from node metadata
+         try:
+             source = None
+             if hasattr(node, 'source_node') and hasattr(node.source_node, 'metadata'):
+                 source = node.source_node.metadata.get("source")
+             elif hasattr(node, 'metadata'):
+                 source = node.metadata.get("source")
+
+             if source not in updated_sources:
+                 filtered_nodes.append(node)
+             else:
+                 removed_count += 1
+         except Exception:
+             # Keep nodes where we can't determine the source
+             filtered_nodes.append(node)
+
+     print(f"Removed {{removed_count}} existing nodes for updated sources")
+     existing_nodes = filtered_nodes
+
+     # Combine filtered existing nodes with new nodes
+     all_nodes = existing_nodes + enhanced_nodes
+
+     # Save all nodes
+     with open("data/all_sources_contextual_nodes.pkl", "wb") as f:
+         pickle.dump(all_nodes, f)
+
+     print(f"Total nodes in updated file: {{len(all_nodes)}}")
+
+ asyncio.run(main())
+ """,
+         ]
+     else:
+         # Process all documents
+         cmd = ["python", "data/scraping_scripts/add_context_to_nodes.py"]
+
+     result = subprocess.run(cmd)
+
+     if result.returncode != 0:
+         logger.error("Error adding context to nodes - check output above")
+         sys.exit(1)
+
+     logger.info("Successfully added context to nodes")
+
+     # Clean up temp file if it exists
+     if new_only and os.path.exists("data/new_docs_temp.jsonl"):
+         os.remove("data/new_docs_temp.jsonl")
+
+
+ def create_vector_stores() -> None:
+     """Create vector stores from processed documents."""
+     logger.info("Creating vector stores")
+     cmd = ["python", "data/scraping_scripts/create_vector_stores.py", "all_sources"]
+     result = subprocess.run(cmd)
+
+     if result.returncode != 0:
+         logger.error("Error creating vector stores - check output above")
+         sys.exit(1)
+
+     logger.info("Successfully created vector stores")
+
+
+ def upload_to_huggingface(upload_jsonl: bool = False) -> None:
+     """Upload databases to HuggingFace."""
+     logger.info("Uploading databases to HuggingFace")
+     cmd = ["python", "data/scraping_scripts/upload_dbs_to_hf.py"]
+     result = subprocess.run(cmd)
+
+     if result.returncode != 0:
+         logger.error("Error uploading databases - check output above")
+         sys.exit(1)
+
+     logger.info("Successfully uploaded databases to HuggingFace")
+
+     if upload_jsonl:
+         logger.info("Uploading data files to HuggingFace")
+
+         try:
+             # Note: This uses a separate private repository
+             cmd = ["python", "data/scraping_scripts/upload_data_to_hf.py"]
+             result = subprocess.run(cmd)
+
+             if result.returncode != 0:
+                 logger.error("Error uploading data files - check output above")
+                 sys.exit(1)
+
+             logger.info("Successfully uploaded data files to HuggingFace")
+         except Exception as e:
+             logger.error(f"Error uploading JSONL file: {e}")
+             sys.exit(1)
+
+
+ def update_ui_files(course_name: str) -> None:
+     """Update main.py and setup.py with the new source."""
+     logger.info(f"Updating UI files with new course: {course_name}")
+
+     # Get the source configuration for display name
+     from data.scraping_scripts.process_md_files import SOURCE_CONFIGS
+
+     if course_name not in SOURCE_CONFIGS:
+         logger.error(f"Course {course_name} not found in SOURCE_CONFIGS")
+         return
+
+     # Get a readable display name for the UI
+     display_name = course_name.replace("_", " ").title()
+
+     # Update setup.py - add to AVAILABLE_SOURCES and AVAILABLE_SOURCES_UI
+     setup_path = Path("scripts/setup.py")
+     if setup_path.exists():
+         setup_content = setup_path.read_text()
+
+         # Check if already added
+         if f'"{course_name}"' in setup_content:
+             logger.info(f"Course {course_name} already in setup.py")
+         else:
+             # Add to AVAILABLE_SOURCES_UI
+             ui_list_start = setup_content.find("AVAILABLE_SOURCES_UI = [")
+             ui_list_end = setup_content.find("]", ui_list_start)
+             new_ui_content = (
+                 setup_content[:ui_list_end]
+                 + f'    "{display_name}",\n'
+                 + setup_content[ui_list_end:]
+             )
+
+             # Add to AVAILABLE_SOURCES
+             sources_list_start = new_ui_content.find("AVAILABLE_SOURCES = [")
+             sources_list_end = new_ui_content.find("]", sources_list_start)
+             new_content = (
+                 new_ui_content[:sources_list_end]
+                 + f'    "{course_name}",\n'
+                 + new_ui_content[sources_list_end:]
+             )
+
+             # Write updated content
+             setup_path.write_text(new_content)
+             logger.info(f"Updated setup.py with {course_name}")
+     else:
+         logger.warning(f"setup.py not found at {setup_path}")
+
+     # Update main.py - add to source_mapping
+     main_path = Path("scripts/main.py")
+     if main_path.exists():
+         main_content = main_path.read_text()
+
+         # Check if already added
+         if f'"{display_name}": "{course_name}"' in main_content:
+             logger.info(f"Course {course_name} already in main.py")
+         else:
+             # Add to source_mapping
+             mapping_start = main_content.find("source_mapping = {")
+             mapping_end = main_content.find("}", mapping_start)
+             new_main_content = (
+                 main_content[:mapping_end]
+                 + f'    "{display_name}": "{course_name}",\n'
+                 + main_content[mapping_end:]
+             )
+
+             # Add to default selected sources if not there
+             value_start = new_main_content.find("value=[")
+             value_end = new_main_content.find("]", value_start)
+
+             if f'"{display_name}"' not in new_main_content[value_start:value_end]:
+                 new_main_content = (
+                     new_main_content[: value_start + 7]
+                     + f' "{display_name}",\n'
+                     + new_main_content[value_start + 7 :]
+                 )
+
+             # Write updated content
+             main_path.write_text(new_main_content)
+             logger.info(f"Updated main.py with {course_name}")
+     else:
+         logger.warning(f"main.py not found at {main_path}")
+
+
+ def main():
+     parser = argparse.ArgumentParser(
+         description="AI Tutor App Course Addition Workflow"
+     )
+     parser.add_argument(
+         "--course",
+         required=True,
+         help="Name of the course to process (must match SOURCE_CONFIGS)",
+     )
+     parser.add_argument(
+         "--skip-process-md",
+         action="store_true",
+         help="Skip the markdown processing step",
+     )
+     parser.add_argument(
+         "--skip-merge",
+         action="store_true",
+         help="Skip merging into all_sources_data.jsonl",
+     )
+     parser.add_argument(
+         "--process-all-context",
+         action="store_true",
+         help="Process all content when adding context (default: only process new content)",
+     )
+     parser.add_argument(
+         "--skip-context",
+         action="store_true",
+         help="Skip the context addition step entirely",
+     )
+     parser.add_argument(
+         "--skip-vectors", action="store_true", help="Skip vector store creation"
+     )
+     parser.add_argument(
+         "--skip-upload", action="store_true", help="Skip uploading to HuggingFace"
+     )
+     parser.add_argument(
+         "--skip-ui-update",
+         action="store_true",
+         help="Skip updating the UI configuration",
+     )
+     parser.add_argument(
+         "--skip-data-upload",
+         action="store_true",
+         help="Skip uploading data files to private HuggingFace repo (they are uploaded by default)",
+     )
+
+     args = parser.parse_args()
+     course_name = args.course
+
+     # Ensure required data files exist before proceeding
+     ensure_required_files_exist()
+
+     # Get the output file path
+     from data.scraping_scripts.process_md_files import SOURCE_CONFIGS
+
+     if course_name not in SOURCE_CONFIGS:
+         logger.error(f"Course {course_name} not found in SOURCE_CONFIGS")
+         sys.exit(1)
+
+     course_jsonl_path = SOURCE_CONFIGS[course_name]["output_file"]
+
+     # Execute the workflow steps
+     if not args.skip_process_md:
+         course_jsonl_path = process_markdown_files(course_name)
+
+     # Always do the manual URL addition step for courses
+     manual_url_addition(course_jsonl_path)
+
+     if not args.skip_merge:
+         merge_into_all_sources(course_jsonl_path)
+
+     if not args.skip_context:
+         add_context_to_nodes(not args.process_all_context)
+
+     if not args.skip_vectors:
+         create_vector_stores()
+
+     if not args.skip_upload:
+         # By default, also upload the data files (JSONL and PKL) unless explicitly skipped
+         upload_to_huggingface(not args.skip_data_upload)
+
+     if not args.skip_ui_update:
+         update_ui_files(course_name)
+
+     logger.info("Course addition workflow completed successfully")
+
+
+ if __name__ == "__main__":
+     main()
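
Editor's aside on the mandatory URL step above: the snippet below is a minimal sketch (not part of the commit) for listing course documents whose "url" field is still empty before resuming the workflow. It reuses the load_jsonl helper defined in add_course_workflow.py; the import path assumes the repository root is on PYTHONPATH, and the course file shown is only an example.

from data.scraping_scripts.add_course_workflow import load_jsonl

docs = load_jsonl("data/python_primer_data.jsonl")  # example course file; substitute your course's output JSONL
missing = [doc["doc_id"] for doc in docs if not doc.get("url")]
print(f"{len(missing)} documents still need URLs: {missing}")
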
data/scraping_scripts/upload_data_to_hf.py ADDED
@@ -0,0 +1,129 @@
+ #!/usr/bin/env python
+ """
+ Upload Data Files to HuggingFace
+
+ This script uploads key data files to a private HuggingFace dataset repository:
+
+ 1. all_sources_data.jsonl - The raw document data
+ 2. all_sources_contextual_nodes.pkl - The processed nodes with added context
+
+ This is useful for new team members who need the latest version of the data.
+
+ Usage:
+     python upload_data_to_hf.py [--repo REPO_ID]
+
+ Arguments:
+     --repo REPO_ID    HuggingFace dataset repository ID (default: towardsai-tutors/ai-tutor-data)
+ """
+
+ import argparse
+ import os
+
+ from dotenv import load_dotenv
+ from huggingface_hub import HfApi
+
+ load_dotenv()
+
+
+ def upload_files_to_huggingface(repo_id="towardsai-tutors/ai-tutor-data"):
+     """Upload data files to a private HuggingFace repository."""
+     # Main files to upload
+     files_to_upload = [
+         # Combined data and vector store
+         "data/all_sources_data.jsonl",
+         "data/all_sources_contextual_nodes.pkl",
+         # Individual source files
+         "data/transformers_data.jsonl",
+         "data/peft_data.jsonl",
+         "data/trl_data.jsonl",
+         "data/llama_index_data.jsonl",
+         "data/langchain_data.jsonl",
+         "data/openai_cookbooks_data.jsonl",
+         # Course files
+         "data/tai_blog_data.jsonl",
+         "data/8-hour_primer_data.jsonl",
+         "data/llm_developer_data.jsonl",
+         "data/python_primer_data.jsonl",
+     ]
+
+     # Filter to only include files that exist
+     existing_files = []
+     missing_files = []
+
+     for file_path in files_to_upload:
+         if os.path.exists(file_path):
+             existing_files.append(file_path)
+         else:
+             missing_files.append(file_path)
+
+     # Critical files must exist
+     critical_files = [
+         "data/all_sources_data.jsonl",
+         "data/all_sources_contextual_nodes.pkl",
+     ]
+     critical_missing = [f for f in critical_files if f in missing_files]
+
+     if critical_missing:
+         print(
+             f"Error: The following critical files were not found: {', '.join(critical_missing)}"
+         )
+         # return False
+
+     if missing_files:
+         print(
+             f"Warning: The following files were not found and will not be uploaded: {', '.join(missing_files)}"
+         )
+         print("This is normal if you're only updating certain sources.")
+
+     try:
+         api = HfApi(token=os.getenv("HF_TOKEN"))
+
+         # Check that the repository exists; ask the user to create it if it doesn't
+         try:
+             api.repo_info(repo_id=repo_id, repo_type="dataset")
+             print(f"Repository {repo_id} exists")
+         except Exception:
+             print(
+                 f"Repository {repo_id} doesn't exist. Please create it first on the HuggingFace platform."
+             )
+             print("Make sure to set it as private if needed.")
+             return False
+
+         # Upload all existing files
+         for file_path in existing_files:
+             try:
+                 file_name = os.path.basename(file_path)
+                 print(f"Uploading {file_name}...")
+
+                 api.upload_file(
+                     path_or_fileobj=file_path,
+                     path_in_repo=file_name,
+                     repo_id=repo_id,
+                     repo_type="dataset",
+                 )
+                 print(
+                     f"Successfully uploaded {file_name} to HuggingFace repository {repo_id}"
+                 )
+             except Exception as e:
+                 print(f"Error uploading {file_path}: {e}")
+                 # Continue with other files even if one fails
+
+         return True
+     except Exception as e:
+         print(f"Error uploading files: {e}")
+         return False
+
+
+ def main():
+     parser = argparse.ArgumentParser(description="Upload Data Files to HuggingFace")
+     parser.add_argument(
+         "--repo",
+         default="towardsai-tutors/ai-tutor-data",
+         help="HuggingFace dataset repository ID",
+     )
+
+     args = parser.parse_args()
+     upload_files_to_huggingface(args.repo)
+
+
+ if __name__ == "__main__":
+     main()
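
Editor's aside: a minimal sketch (not part of the commit) of invoking the uploader from Python rather than the command line, assuming the repository root is on PYTHONPATH and HF_TOKEN is set in .env; the repo ID shown is the script's default.

from data.scraping_scripts.upload_data_to_hf import upload_files_to_huggingface

ok = upload_files_to_huggingface(repo_id="towardsai-tutors/ai-tutor-data")
if not ok:
    raise SystemExit("Upload did not complete - see the messages above")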