#!/usr/bin/env python
"""
AI Tutor App - Documentation Update Workflow

This script automates the process of updating documentation from GitHub repositories:

1. Download documentation from GitHub using the API
2. Process markdown files to create JSONL data
3. Add contextual information to document nodes
4. Create vector stores
5. Upload databases to HuggingFace

This workflow is specific to updating library documentation (Transformers, PEFT, LlamaIndex, etc.).
For adding courses, use the add_course_workflow.py script instead.

Usage:
    python update_docs_workflow.py --sources [SOURCE1] [SOURCE2] ...

Additional flags to skip steps (useful when restarting from a specific point):
    --skip-download        Skip the GitHub download step
    --skip-process         Skip the markdown processing step
    --process-all-context  Add context to all content, not only new content
    --skip-context         Skip the context addition step entirely
    --skip-vectors         Skip vector store creation
    --skip-upload          Skip uploading to HuggingFace
    --skip-data-upload     Skip uploading data files (.jsonl and .pkl) to the private HuggingFace repo
"""
import argparse
import json
import logging
import os
import pickle
import subprocess
import sys
from typing import Dict, List, Set

from dotenv import load_dotenv
from huggingface_hub import HfApi, hf_hub_download

# Load environment variables from .env file
load_dotenv()

# Configure logging
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)


def ensure_required_files_exist():
    """Download required data files from HuggingFace if they don't exist locally."""
    # List of files to check and download
    required_files = {
        # Critical files
        "data/all_sources_data.jsonl": "all_sources_data.jsonl",
        "data/all_sources_contextual_nodes.pkl": "all_sources_contextual_nodes.pkl",
        # Documentation source files
        "data/transformers_data.jsonl": "transformers_data.jsonl",
        "data/peft_data.jsonl": "peft_data.jsonl",
        "data/trl_data.jsonl": "trl_data.jsonl",
        "data/llama_index_data.jsonl": "llama_index_data.jsonl",
        "data/langchain_data.jsonl": "langchain_data.jsonl",
        "data/openai_cookbooks_data.jsonl": "openai_cookbooks_data.jsonl",
        # Course files
        "data/tai_blog_data.jsonl": "tai_blog_data.jsonl",
        "data/8-hour_primer_data.jsonl": "8-hour_primer_data.jsonl",
        "data/llm_developer_data.jsonl": "llm_developer_data.jsonl",
        "data/python_primer_data.jsonl": "python_primer_data.jsonl",
    }

    # Critical files that must be downloaded
    critical_files = [
        "data/all_sources_data.jsonl",
        "data/all_sources_contextual_nodes.pkl",
    ]

    # Check and download each file
    for local_path, remote_filename in required_files.items():
        if not os.path.exists(local_path):
            logger.info(
                f"{remote_filename} not found. Attempting to download from HuggingFace..."
            )
            try:
                hf_hub_download(
                    token=os.getenv("HF_TOKEN"),
                    repo_id="towardsai-tutors/ai-tutor-data",
                    filename=remote_filename,
                    repo_type="dataset",
                    local_dir="data",
                )
                logger.info(
                    f"Successfully downloaded {remote_filename} from HuggingFace"
                )
            except Exception as e:
                logger.warning(f"Could not download {remote_filename}: {e}")

                # Only create an empty file for all_sources_data.jsonl if it's missing
                if local_path == "data/all_sources_data.jsonl":
                    logger.warning(
                        "Creating a new all_sources_data.jsonl file. This will not include previously existing data."
                    )
                    with open(local_path, "w") as f:
                        pass

                # If a critical file is missing, print a more serious warning
                if local_path in critical_files:
                    logger.warning(
                        f"Critical file {remote_filename} is missing. The workflow may not function correctly."
                    )
                    if local_path == "data/all_sources_contextual_nodes.pkl":
                        logger.warning(
                            "The context addition step will process all documents since no existing contexts were found."
                        )
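
# Note: with local_dir="data", hf_hub_download places each root-level repo file
# at data/<filename>, which is why the keys of required_files above point into
# the data/ directory. The towardsai-tutors/ai-tutor-data dataset is private,
# so HF_TOKEN must be set (e.g. via .env) for these downloads to succeed.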


# Documentation sources that can be updated via GitHub API
GITHUB_SOURCES = [
    "transformers",
    "peft",
    "trl",
    "llama_index",
    "openai_cookbooks",
    "langchain",
]


def load_jsonl(file_path: str) -> List[Dict]:
    """Load data from a JSONL file."""
    data = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            data.append(json.loads(line))
    return data


def save_jsonl(data: List[Dict], file_path: str) -> None:
    """Save data to a JSONL file."""
    with open(file_path, "w", encoding="utf-8") as f:
        for item in data:
            json.dump(item, f, ensure_ascii=False)
            f.write("\n")


def download_from_github(sources: List[str]) -> None:
    """Download documentation from GitHub repositories."""
    logger.info(f"Downloading documentation from GitHub for sources: {sources}")

    for source in sources:
        if source not in GITHUB_SOURCES:
            logger.warning(f"Source {source} is not a GitHub source, skipping download")
            continue

        logger.info(f"Downloading {source} documentation")
        cmd = ["python", "data/scraping_scripts/github_to_markdown_ai_docs.py", source]
        result = subprocess.run(cmd)

        if result.returncode != 0:
            logger.error(
                f"Error downloading {source} documentation - check output above"
            )
            # Continue with other sources instead of exiting
            continue

        logger.info(f"Successfully downloaded {source} documentation")


def process_markdown_files(sources: List[str]) -> None:
    """Process markdown files for specific sources."""
    logger.info(f"Processing markdown files for sources: {sources}")
    cmd = ["python", "data/scraping_scripts/process_md_files.py"] + sources
    result = subprocess.run(cmd)

    if result.returncode != 0:
        logger.error("Error processing markdown files - check output above")
        sys.exit(1)

    logger.info("Successfully processed markdown files")


def get_processed_doc_ids() -> Set[str]:
    """Get set of doc_ids that have already been processed with context."""
    if not os.path.exists("data/all_sources_contextual_nodes.pkl"):
        return set()

    try:
        with open("data/all_sources_contextual_nodes.pkl", "rb") as f:
            nodes = pickle.load(f)
        return {node.source_node.node_id for node in nodes}
    except Exception as e:
        logger.error(f"Error loading processed doc_ids: {e}")
        return set()
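
# The pickle above is assumed to hold LlamaIndex-style nodes whose `source_node`
# relationship carries the id of the originating document; the workflow relies
# on that id matching the "doc_id" field in all_sources_data.jsonl so that
# already-contextualised documents can be skipped in add_context_to_nodes below.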


def add_context_to_nodes(new_only: bool = False) -> None:
    """Add context to document nodes, optionally processing only new content."""
    logger.info("Adding context to document nodes")

    if new_only:
        # Load all documents
        all_docs = load_jsonl("data/all_sources_data.jsonl")
        processed_ids = get_processed_doc_ids()

        # Filter for unprocessed documents
        new_docs = [doc for doc in all_docs if doc["doc_id"] not in processed_ids]

        if not new_docs:
            logger.info("No new documents to process")
            return

        # Save a temporary JSONL with only the new documents
        temp_file = "data/new_docs_temp.jsonl"
        save_jsonl(new_docs, temp_file)

        # Run an inline script that adds context to the new documents only and
        # merges the resulting nodes into the existing pickle
        cmd = [
            "python",
            "-c",
            f"""
import asyncio
import os
import pickle
import json

from data.scraping_scripts.add_context_to_nodes import create_docs, process

async def main():
    # First, get the list of sources being updated from the temp file
    updated_sources = set()
    with open("{temp_file}", "r") as f:
        for line in f:
            data = json.loads(line)
            updated_sources.add(data["source"])

    print(f"Updating nodes for sources: {{updated_sources}}")

    # Process new documents
    documents = create_docs("{temp_file}")
    enhanced_nodes = await process(documents)
    print(f"Generated context for {{len(enhanced_nodes)}} new nodes")

    # Load existing nodes if they exist
    existing_nodes = []
    if os.path.exists("data/all_sources_contextual_nodes.pkl"):
        with open("data/all_sources_contextual_nodes.pkl", "rb") as f:
            existing_nodes = pickle.load(f)

    # Filter out existing nodes for sources we're updating
    filtered_nodes = []
    removed_count = 0

    for node in existing_nodes:
        # Try to extract source from node metadata
        try:
            source = None
            if hasattr(node, 'source_node') and hasattr(node.source_node, 'metadata'):
                source = node.source_node.metadata.get("source")
            elif hasattr(node, 'metadata'):
                source = node.metadata.get("source")

            if source not in updated_sources:
                filtered_nodes.append(node)
            else:
                removed_count += 1
        except Exception:
            # Keep nodes where we can't determine the source
            filtered_nodes.append(node)

    print(f"Removed {{removed_count}} existing nodes for updated sources")
    existing_nodes = filtered_nodes

    # Combine filtered existing nodes with new nodes
    all_nodes = existing_nodes + enhanced_nodes

    # Save all nodes
    with open("data/all_sources_contextual_nodes.pkl", "wb") as f:
        pickle.dump(all_nodes, f)

    print(f"Total nodes in updated file: {{len(all_nodes)}}")

asyncio.run(main())
""",
        ]
    else:
        # Process all documents
        logger.info("Adding context to all nodes")
        cmd = ["python", "data/scraping_scripts/add_context_to_nodes.py"]

    result = subprocess.run(cmd)

    if result.returncode != 0:
        logger.error("Error adding context to nodes - check output above")
        sys.exit(1)

    logger.info("Successfully added context to nodes")

    # Clean up the temp file if it exists
    if new_only and os.path.exists("data/new_docs_temp.jsonl"):
        os.remove("data/new_docs_temp.jsonl")
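
# Quick sanity check for the merged pickle (illustrative one-liner; it assumes
# nothing beyond the node list this function writes out):
#
#   python -c "import pickle; print(len(pickle.load(open('data/all_sources_contextual_nodes.pkl', 'rb'))))"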


def create_vector_stores() -> None:
    """Create vector stores from processed documents."""
    logger.info("Creating vector stores")
    cmd = ["python", "data/scraping_scripts/create_vector_stores.py", "all_sources"]
    result = subprocess.run(cmd)

    if result.returncode != 0:
        logger.error("Error creating vector stores - check output above")
        sys.exit(1)

    logger.info("Successfully created vector stores")


def upload_to_huggingface(upload_jsonl: bool = False) -> None:
    """Upload databases to HuggingFace."""
    logger.info("Uploading databases to HuggingFace")
    cmd = ["python", "data/scraping_scripts/upload_dbs_to_hf.py"]
    result = subprocess.run(cmd)

    if result.returncode != 0:
        logger.error("Error uploading databases - check output above")
        sys.exit(1)

    logger.info("Successfully uploaded databases to HuggingFace")

    if upload_jsonl:
        logger.info("Uploading data files to HuggingFace")

        try:
            # Note: This uses a separate private repository
            cmd = ["python", "data/scraping_scripts/upload_data_to_hf.py"]
            result = subprocess.run(cmd)

            if result.returncode != 0:
                logger.error("Error uploading data files - check output above")
                sys.exit(1)

            logger.info("Successfully uploaded data files to HuggingFace")
        except Exception as e:
            logger.error(f"Error uploading JSONL file: {e}")
            sys.exit(1)


def main():
    parser = argparse.ArgumentParser(
        description="AI Tutor App Documentation Update Workflow"
    )
    parser.add_argument(
        "--sources",
        nargs="+",
        choices=GITHUB_SOURCES,
        default=GITHUB_SOURCES,
        help="GitHub documentation sources to update",
    )
    parser.add_argument(
        "--skip-download", action="store_true", help="Skip downloading from GitHub"
    )
    parser.add_argument(
        "--skip-process", action="store_true", help="Skip processing markdown files"
    )
    parser.add_argument(
        "--process-all-context",
        action="store_true",
        help="Process all content when adding context (default: only process new content)",
    )
    parser.add_argument(
        "--skip-context",
        action="store_true",
        help="Skip the context addition step entirely",
    )
    parser.add_argument(
        "--skip-vectors", action="store_true", help="Skip vector store creation"
    )
    parser.add_argument(
        "--skip-upload", action="store_true", help="Skip uploading to HuggingFace"
    )
    parser.add_argument(
        "--skip-data-upload",
        action="store_true",
        help="Skip uploading data files (.jsonl and .pkl) to private HuggingFace repo (they are uploaded by default)",
    )

    args = parser.parse_args()

    # Ensure required data files exist before proceeding
    ensure_required_files_exist()

    # Execute the workflow steps
    if not args.skip_download:
        download_from_github(args.sources)

    if not args.skip_process:
        process_markdown_files(args.sources)

    if not args.skip_context:
        add_context_to_nodes(not args.process_all_context)

    if not args.skip_vectors:
        create_vector_stores()

    if not args.skip_upload:
        # By default, also upload the data files (JSONL and PKL) unless explicitly skipped
        upload_to_huggingface(not args.skip_data_upload)

    logger.info("Documentation update workflow completed successfully")


if __name__ == "__main__":
    main()