omarsol committed
Commit 4ecfe75 · 1 Parent(s): a54e637

add update docs workflow

data/scraping_scripts/update_docs_workflow.py ADDED
@@ -0,0 +1,409 @@
#!/usr/bin/env python
"""
AI Tutor App - Documentation Update Workflow

This script automates the process of updating documentation from GitHub repositories:
1. Download documentation from GitHub using the API
2. Process markdown files to create JSONL data
3. Add contextual information to document nodes
4. Create vector stores
5. Upload databases to HuggingFace

This workflow is specific to updating library documentation (Transformers, PEFT, LlamaIndex, etc.).
For adding courses, use the add_course_workflow.py script instead.

Usage:
    python update_docs_workflow.py --sources [SOURCE1] [SOURCE2] ...

Additional flags to run specific steps (if you want to restart from a specific point):
    --skip-download        Skip the GitHub download step
    --skip-process         Skip the markdown processing step
    --process-all-context  Process all content when adding context (default: only new content)
    --skip-context         Skip the context addition step entirely
    --skip-vectors         Skip vector store creation
    --skip-upload          Skip uploading to HuggingFace
    --skip-data-upload     Skip uploading data files to the private HuggingFace repo
"""
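
# Example: refresh only the Transformers and PEFT documentation end to end:
#   python update_docs_workflow.py --sources transformers peft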

import argparse
import json
import logging
import os
import pickle
import subprocess
import sys
from typing import Dict, List, Set

from dotenv import load_dotenv
from huggingface_hub import HfApi, hf_hub_download

# Load environment variables from .env file
load_dotenv()

# Configure logging
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)


def ensure_required_files_exist():
    """Download required data files from HuggingFace if they don't exist locally."""
    # List of files to check and download
    required_files = {
        # Critical files
        "data/all_sources_data.jsonl": "all_sources_data.jsonl",
        "data/all_sources_contextual_nodes.pkl": "all_sources_contextual_nodes.pkl",
        # Documentation source files
        "data/transformers_data.jsonl": "transformers_data.jsonl",
        "data/peft_data.jsonl": "peft_data.jsonl",
        "data/trl_data.jsonl": "trl_data.jsonl",
        "data/llama_index_data.jsonl": "llama_index_data.jsonl",
        "data/langchain_data.jsonl": "langchain_data.jsonl",
        "data/openai_cookbooks_data.jsonl": "openai_cookbooks_data.jsonl",
        # Course files
        "data/tai_blog_data.jsonl": "tai_blog_data.jsonl",
        "data/8-hour_primer_data.jsonl": "8-hour_primer_data.jsonl",
        "data/llm_developer_data.jsonl": "llm_developer_data.jsonl",
        "data/python_primer_data.jsonl": "python_primer_data.jsonl",
    }

    # Critical files that must be downloaded
    critical_files = [
        "data/all_sources_data.jsonl",
        "data/all_sources_contextual_nodes.pkl",
    ]

    # Check and download each file
    for local_path, remote_filename in required_files.items():
        if not os.path.exists(local_path):
            logger.info(
                f"{remote_filename} not found. Attempting to download from HuggingFace..."
            )
            try:
                hf_hub_download(
                    token=os.getenv("HF_TOKEN"),
                    repo_id="towardsai-tutors/ai-tutor-data",
                    filename=remote_filename,
                    repo_type="dataset",
                    local_dir="data",
                )
                logger.info(
                    f"Successfully downloaded {remote_filename} from HuggingFace"
                )
            except Exception as e:
                logger.warning(f"Could not download {remote_filename}: {e}")

                # Only create empty file for all_sources_data.jsonl if it's missing
                if local_path == "data/all_sources_data.jsonl":
                    logger.warning(
                        "Creating a new all_sources_data.jsonl file. This will not include previously existing data."
                    )
                    with open(local_path, "w") as f:
                        pass

                # If critical file is missing, print a more serious warning
                if local_path in critical_files:
                    logger.warning(
                        f"Critical file {remote_filename} is missing. The workflow may not function correctly."
                    )

                    if local_path == "data/all_sources_contextual_nodes.pkl":
                        logger.warning(
                            "The context addition step will process all documents since no existing contexts were found."
                        )


# Documentation sources that can be updated via GitHub API
GITHUB_SOURCES = [
    "transformers",
    "peft",
    "trl",
    "llama_index",
    "openai_cookbooks",
    "langchain",
]


def load_jsonl(file_path: str) -> List[Dict]:
    """Load data from a JSONL file."""
    data = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            data.append(json.loads(line))
    return data


def save_jsonl(data: List[Dict], file_path: str) -> None:
    """Save data to a JSONL file."""
    with open(file_path, "w", encoding="utf-8") as f:
        for item in data:
            json.dump(item, f, ensure_ascii=False)
            f.write("\n")


def download_from_github(sources: List[str]) -> None:
    """Download documentation from GitHub repositories."""
    logger.info(f"Downloading documentation from GitHub for sources: {sources}")

    for source in sources:
        if source not in GITHUB_SOURCES:
            logger.warning(f"Source {source} is not a GitHub source, skipping download")
            continue

        logger.info(f"Downloading {source} documentation")
        cmd = ["python", "data/scraping_scripts/github_to_markdown_ai_docs.py", source]
        result = subprocess.run(cmd)

        if result.returncode != 0:
            logger.error(
                f"Error downloading {source} documentation - check output above"
            )
            # Continue with other sources instead of exiting
            continue

        logger.info(f"Successfully downloaded {source} documentation")


def process_markdown_files(sources: List[str]) -> None:
    """Process markdown files for specific sources."""
    logger.info(f"Processing markdown files for sources: {sources}")

    cmd = ["python", "data/scraping_scripts/process_md_files.py"] + sources
    result = subprocess.run(cmd)

    if result.returncode != 0:
        logger.error("Error processing markdown files - check output above")
        sys.exit(1)

    logger.info("Successfully processed markdown files")


def get_processed_doc_ids() -> Set[str]:
    """Get set of doc_ids that have already been processed with context."""
    if not os.path.exists("data/all_sources_contextual_nodes.pkl"):
        return set()

    try:
        with open("data/all_sources_contextual_nodes.pkl", "rb") as f:
            nodes = pickle.load(f)
        return {node.source_node.node_id for node in nodes}
    except Exception as e:
        logger.error(f"Error loading processed doc_ids: {e}")
        return set()


def add_context_to_nodes(new_only: bool = False) -> None:
    """Add context to document nodes, optionally processing only new content."""
    logger.info("Adding context to document nodes")

    if new_only:
        # Load all documents
        all_docs = load_jsonl("data/all_sources_data.jsonl")
        processed_ids = get_processed_doc_ids()

        # Filter for unprocessed documents
        new_docs = [doc for doc in all_docs if doc["doc_id"] not in processed_ids]

        if not new_docs:
            logger.info("No new documents to process")
            return

        # Save temporary JSONL with only new documents
        temp_file = "data/new_docs_temp.jsonl"
        save_jsonl(new_docs, temp_file)
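
        # The inline script below (run via `python -c`) generates context for the
        # new documents only and merges the resulting nodes into the existing
        # data/all_sources_contextual_nodes.pkl, replacing nodes from the updated sources.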
        # Reuse create_docs()/process() from add_context_to_nodes.py on the temp file
        cmd = [
            "python",
            "-c",
            f"""
import asyncio
import os
import pickle
import json
from data.scraping_scripts.add_context_to_nodes import create_docs, process

async def main():
    # First, get the list of sources being updated from the temp file
    updated_sources = set()
    with open("{temp_file}", "r") as f:
        for line in f:
            data = json.loads(line)
            updated_sources.add(data["source"])

    print(f"Updating nodes for sources: {{updated_sources}}")

    # Process new documents
    documents = create_docs("{temp_file}")
    enhanced_nodes = await process(documents)
    print(f"Generated context for {{len(enhanced_nodes)}} new nodes")

    # Load existing nodes if they exist
    existing_nodes = []
    if os.path.exists("data/all_sources_contextual_nodes.pkl"):
        with open("data/all_sources_contextual_nodes.pkl", "rb") as f:
            existing_nodes = pickle.load(f)

        # Filter out existing nodes for sources we're updating
        filtered_nodes = []
        removed_count = 0

        for node in existing_nodes:
            # Try to extract source from node metadata
            try:
                source = None
                if hasattr(node, 'source_node') and hasattr(node.source_node, 'metadata'):
                    source = node.source_node.metadata.get("source")
                elif hasattr(node, 'metadata'):
                    source = node.metadata.get("source")

                if source not in updated_sources:
                    filtered_nodes.append(node)
                else:
                    removed_count += 1
            except Exception:
                # Keep nodes where we can't determine the source
                filtered_nodes.append(node)

        print(f"Removed {{removed_count}} existing nodes for updated sources")
        existing_nodes = filtered_nodes

    # Combine filtered existing nodes with new nodes
    all_nodes = existing_nodes + enhanced_nodes

    # Save all nodes
    with open("data/all_sources_contextual_nodes.pkl", "wb") as f:
        pickle.dump(all_nodes, f)

    print(f"Total nodes in updated file: {{len(all_nodes)}}")

asyncio.run(main())
""",
        ]
    else:
        # Process all documents
        logger.info("Adding context to all nodes")
        cmd = ["python", "data/scraping_scripts/add_context_to_nodes.py"]

    result = subprocess.run(cmd)

    if result.returncode != 0:
        logger.error("Error adding context to nodes - check output above")
        sys.exit(1)

    logger.info("Successfully added context to nodes")

    # Clean up temp file if it exists
    if new_only and os.path.exists("data/new_docs_temp.jsonl"):
        os.remove("data/new_docs_temp.jsonl")


def create_vector_stores() -> None:
    """Create vector stores from processed documents."""
    logger.info("Creating vector stores")
    cmd = ["python", "data/scraping_scripts/create_vector_stores.py", "all_sources"]
    result = subprocess.run(cmd)

    if result.returncode != 0:
        logger.error("Error creating vector stores - check output above")
        sys.exit(1)

    logger.info("Successfully created vector stores")


def upload_to_huggingface(upload_jsonl: bool = False) -> None:
    """Upload databases to HuggingFace."""
    logger.info("Uploading databases to HuggingFace")
    cmd = ["python", "data/scraping_scripts/upload_dbs_to_hf.py"]
    result = subprocess.run(cmd)

    if result.returncode != 0:
        logger.error("Error uploading databases - check output above")
        sys.exit(1)

    logger.info("Successfully uploaded databases to HuggingFace")

    if upload_jsonl:
        logger.info("Uploading data files to HuggingFace")

        try:
            # Note: This uses a separate private repository
            cmd = ["python", "data/scraping_scripts/upload_data_to_hf.py"]
            result = subprocess.run(cmd)

            if result.returncode != 0:
                logger.error("Error uploading data files - check output above")
                sys.exit(1)

            logger.info("Successfully uploaded data files to HuggingFace")
        except Exception as e:
            logger.error(f"Error uploading JSONL file: {e}")
            sys.exit(1)


def main():
    parser = argparse.ArgumentParser(
        description="AI Tutor App Documentation Update Workflow"
    )
    parser.add_argument(
        "--sources",
        nargs="+",
        choices=GITHUB_SOURCES,
        default=GITHUB_SOURCES,
        help="GitHub documentation sources to update",
    )
    parser.add_argument(
        "--skip-download", action="store_true", help="Skip downloading from GitHub"
    )
    parser.add_argument(
        "--skip-process", action="store_true", help="Skip processing markdown files"
    )
    parser.add_argument(
        "--process-all-context",
        action="store_true",
        help="Process all content when adding context (default: only process new content)",
    )
    parser.add_argument(
        "--skip-context",
        action="store_true",
        help="Skip the context addition step entirely",
    )
    parser.add_argument(
        "--skip-vectors", action="store_true", help="Skip vector store creation"
    )
    parser.add_argument(
        "--skip-upload", action="store_true", help="Skip uploading to HuggingFace"
    )
    parser.add_argument(
        "--skip-data-upload",
        action="store_true",
        help="Skip uploading data files (.jsonl and .pkl) to private HuggingFace repo (they are uploaded by default)",
    )

    args = parser.parse_args()

    # Ensure required data files exist before proceeding
    ensure_required_files_exist()

    # Execute the workflow steps
    if not args.skip_download:
        download_from_github(args.sources)

    if not args.skip_process:
        process_markdown_files(args.sources)

    if not args.skip_context:
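        # new_only is True unless --process-all-context was passed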
        add_context_to_nodes(not args.process_all_context)

    if not args.skip_vectors:
        create_vector_stores()

    if not args.skip_upload:
        # By default, also upload the data files (JSONL and PKL) unless explicitly skipped
        upload_to_huggingface(not args.skip_data_upload)

    logger.info("Documentation update workflow completed successfully")


if __name__ == "__main__":
    main()