omarsol committed
Commit 4ecfe75 · 1 Parent(s): a54e637

add update docs workflow

data/scraping_scripts/update_docs_workflow.py ADDED
@@ -0,0 +1,409 @@
#!/usr/bin/env python
"""
AI Tutor App - Documentation Update Workflow

This script automates the process of updating documentation from GitHub repositories:
1. Download documentation from GitHub using the API
2. Process markdown files to create JSONL data
3. Add contextual information to document nodes
4. Create vector stores
5. Upload databases to HuggingFace

This workflow is specific to updating library documentation (Transformers, PEFT, LlamaIndex, etc.).
For adding courses, use the add_course_workflow.py script instead.

Usage:
    python update_docs_workflow.py --sources [SOURCE1] [SOURCE2] ...

Additional flags to run specific steps (if you want to restart from a specific point):
    --skip-download        Skip the GitHub download step
    --skip-process         Skip the markdown processing step
    --process-all-context  Process all content when adding context (default: only new content)
    --skip-context         Skip the context addition step entirely
    --skip-vectors         Skip vector store creation
    --skip-upload          Skip uploading to HuggingFace
    --skip-data-upload     Skip uploading data files to the private HuggingFace repo
"""
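
# Example: refresh only the Transformers and PEFT documentation end to end:
#   python update_docs_workflow.py --sources transformers peft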

import argparse
import json
import logging
import os
import pickle
import subprocess
import sys
from typing import Dict, List, Set

from dotenv import load_dotenv
from huggingface_hub import HfApi, hf_hub_download

# Load environment variables from .env file
load_dotenv()

# Configure logging
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)


def ensure_required_files_exist():
    """Download required data files from HuggingFace if they don't exist locally."""
    # List of files to check and download
    required_files = {
        # Critical files
        "data/all_sources_data.jsonl": "all_sources_data.jsonl",
        "data/all_sources_contextual_nodes.pkl": "all_sources_contextual_nodes.pkl",
        # Documentation source files
        "data/transformers_data.jsonl": "transformers_data.jsonl",
        "data/peft_data.jsonl": "peft_data.jsonl",
        "data/trl_data.jsonl": "trl_data.jsonl",
        "data/llama_index_data.jsonl": "llama_index_data.jsonl",
        "data/langchain_data.jsonl": "langchain_data.jsonl",
        "data/openai_cookbooks_data.jsonl": "openai_cookbooks_data.jsonl",
        # Course files
        "data/tai_blog_data.jsonl": "tai_blog_data.jsonl",
        "data/8-hour_primer_data.jsonl": "8-hour_primer_data.jsonl",
        "data/llm_developer_data.jsonl": "llm_developer_data.jsonl",
        "data/python_primer_data.jsonl": "python_primer_data.jsonl",
    }

    # Critical files that must be downloaded
    critical_files = [
        "data/all_sources_data.jsonl",
        "data/all_sources_contextual_nodes.pkl",
    ]

    # Check and download each file
    for local_path, remote_filename in required_files.items():
        if not os.path.exists(local_path):
            logger.info(
                f"{remote_filename} not found. Attempting to download from HuggingFace..."
            )
            try:
                hf_hub_download(
                    token=os.getenv("HF_TOKEN"),
                    repo_id="towardsai-tutors/ai-tutor-data",
                    filename=remote_filename,
                    repo_type="dataset",
                    local_dir="data",
                )
                logger.info(
                    f"Successfully downloaded {remote_filename} from HuggingFace"
                )
            except Exception as e:
                logger.warning(f"Could not download {remote_filename}: {e}")

                # Only create empty file for all_sources_data.jsonl if it's missing
                if local_path == "data/all_sources_data.jsonl":
                    logger.warning(
                        "Creating a new all_sources_data.jsonl file. This will not include previously existing data."
                    )
                    with open(local_path, "w") as f:
                        pass

                # If critical file is missing, print a more serious warning
                if local_path in critical_files:
                    logger.warning(
                        f"Critical file {remote_filename} is missing. The workflow may not function correctly."
                    )

                    if local_path == "data/all_sources_contextual_nodes.pkl":
                        logger.warning(
                            "The context addition step will process all documents since no existing contexts were found."
                        )


# Documentation sources that can be updated via GitHub API
GITHUB_SOURCES = [
    "transformers",
    "peft",
    "trl",
    "llama_index",
    "openai_cookbooks",
    "langchain",
]


def load_jsonl(file_path: str) -> List[Dict]:
    """Load data from a JSONL file."""
    data = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            data.append(json.loads(line))
    return data


def save_jsonl(data: List[Dict], file_path: str) -> None:
    """Save data to a JSONL file."""
    with open(file_path, "w", encoding="utf-8") as f:
        for item in data:
            json.dump(item, f, ensure_ascii=False)
            f.write("\n")


def download_from_github(sources: List[str]) -> None:
    """Download documentation from GitHub repositories."""
    logger.info(f"Downloading documentation from GitHub for sources: {sources}")

    for source in sources:
        if source not in GITHUB_SOURCES:
            logger.warning(f"Source {source} is not a GitHub source, skipping download")
            continue

        logger.info(f"Downloading {source} documentation")
        cmd = ["python", "data/scraping_scripts/github_to_markdown_ai_docs.py", source]
        result = subprocess.run(cmd)

        if result.returncode != 0:
            logger.error(
                f"Error downloading {source} documentation - check output above"
            )
            # Continue with other sources instead of exiting
            continue

        logger.info(f"Successfully downloaded {source} documentation")


def process_markdown_files(sources: List[str]) -> None:
    """Process markdown files for specific sources."""
    logger.info(f"Processing markdown files for sources: {sources}")

    cmd = ["python", "data/scraping_scripts/process_md_files.py"] + sources
    result = subprocess.run(cmd)

    if result.returncode != 0:
        logger.error("Error processing markdown files - check output above")
        sys.exit(1)

    logger.info("Successfully processed markdown files")


def get_processed_doc_ids() -> Set[str]:
    """Get set of doc_ids that have already been processed with context."""
    if not os.path.exists("data/all_sources_contextual_nodes.pkl"):
        return set()

    try:
        with open("data/all_sources_contextual_nodes.pkl", "rb") as f:
            nodes = pickle.load(f)
        return {node.source_node.node_id for node in nodes}
    except Exception as e:
        logger.error(f"Error loading processed doc_ids: {e}")
        return set()


def add_context_to_nodes(new_only: bool = False) -> None:
    """Add context to document nodes, optionally processing only new content."""
    logger.info("Adding context to document nodes")

    if new_only:
        # Load all documents
        all_docs = load_jsonl("data/all_sources_data.jsonl")
        processed_ids = get_processed_doc_ids()

        # Filter for unprocessed documents
        new_docs = [doc for doc in all_docs if doc["doc_id"] not in processed_ids]

        if not new_docs:
            logger.info("No new documents to process")
            return

        # Save temporary JSONL with only new documents
        temp_file = "data/new_docs_temp.jsonl"
        save_jsonl(new_docs, temp_file)
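
        # The inline script below (run via `python -c`) generates context for the
        # new documents only and merges the resulting nodes into the existing
        # data/all_sources_contextual_nodes.pkl, replacing nodes from the updated sources.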
        # Reuse create_docs()/process() from add_context_to_nodes.py on the temp file
        cmd = [
            "python",
            "-c",
            f"""
import asyncio
import os
import pickle
import json
from data.scraping_scripts.add_context_to_nodes import create_docs, process

async def main():
    # First, get the list of sources being updated from the temp file
    updated_sources = set()
    with open("{temp_file}", "r") as f:
        for line in f:
            data = json.loads(line)
            updated_sources.add(data["source"])

    print(f"Updating nodes for sources: {{updated_sources}}")

    # Process new documents
    documents = create_docs("{temp_file}")
    enhanced_nodes = await process(documents)
    print(f"Generated context for {{len(enhanced_nodes)}} new nodes")

    # Load existing nodes if they exist
    existing_nodes = []
    if os.path.exists("data/all_sources_contextual_nodes.pkl"):
        with open("data/all_sources_contextual_nodes.pkl", "rb") as f:
            existing_nodes = pickle.load(f)

        # Filter out existing nodes for sources we're updating
        filtered_nodes = []
        removed_count = 0

        for node in existing_nodes:
            # Try to extract source from node metadata
            try:
                source = None
                if hasattr(node, 'source_node') and hasattr(node.source_node, 'metadata'):
                    source = node.source_node.metadata.get("source")
                elif hasattr(node, 'metadata'):
                    source = node.metadata.get("source")

                if source not in updated_sources:
                    filtered_nodes.append(node)
                else:
                    removed_count += 1
            except Exception:
                # Keep nodes where we can't determine the source
                filtered_nodes.append(node)

        print(f"Removed {{removed_count}} existing nodes for updated sources")
        existing_nodes = filtered_nodes

    # Combine filtered existing nodes with new nodes
    all_nodes = existing_nodes + enhanced_nodes

    # Save all nodes
    with open("data/all_sources_contextual_nodes.pkl", "wb") as f:
        pickle.dump(all_nodes, f)

    print(f"Total nodes in updated file: {{len(all_nodes)}}")

asyncio.run(main())
""",
        ]
    else:
        # Process all documents
        logger.info("Adding context to all nodes")
        cmd = ["python", "data/scraping_scripts/add_context_to_nodes.py"]

    result = subprocess.run(cmd)

    if result.returncode != 0:
        logger.error("Error adding context to nodes - check output above")
        sys.exit(1)

    logger.info("Successfully added context to nodes")

    # Clean up temp file if it exists
    if new_only and os.path.exists("data/new_docs_temp.jsonl"):
        os.remove("data/new_docs_temp.jsonl")


def create_vector_stores() -> None:
    """Create vector stores from processed documents."""
    logger.info("Creating vector stores")
    cmd = ["python", "data/scraping_scripts/create_vector_stores.py", "all_sources"]
    result = subprocess.run(cmd)

    if result.returncode != 0:
        logger.error("Error creating vector stores - check output above")
        sys.exit(1)

    logger.info("Successfully created vector stores")


def upload_to_huggingface(upload_jsonl: bool = False) -> None:
    """Upload databases to HuggingFace."""
    logger.info("Uploading databases to HuggingFace")
    cmd = ["python", "data/scraping_scripts/upload_dbs_to_hf.py"]
    result = subprocess.run(cmd)

    if result.returncode != 0:
        logger.error("Error uploading databases - check output above")
        sys.exit(1)

    logger.info("Successfully uploaded databases to HuggingFace")

    if upload_jsonl:
        logger.info("Uploading data files to HuggingFace")

        try:
            # Note: This uses a separate private repository
            cmd = ["python", "data/scraping_scripts/upload_data_to_hf.py"]
            result = subprocess.run(cmd)

            if result.returncode != 0:
                logger.error("Error uploading data files - check output above")
                sys.exit(1)

            logger.info("Successfully uploaded data files to HuggingFace")
        except Exception as e:
            logger.error(f"Error uploading JSONL file: {e}")
            sys.exit(1)


def main():
    parser = argparse.ArgumentParser(
        description="AI Tutor App Documentation Update Workflow"
    )
    parser.add_argument(
        "--sources",
        nargs="+",
        choices=GITHUB_SOURCES,
        default=GITHUB_SOURCES,
        help="GitHub documentation sources to update",
    )
    parser.add_argument(
        "--skip-download", action="store_true", help="Skip downloading from GitHub"
    )
    parser.add_argument(
        "--skip-process", action="store_true", help="Skip processing markdown files"
    )
    parser.add_argument(
        "--process-all-context",
        action="store_true",
        help="Process all content when adding context (default: only process new content)",
    )
    parser.add_argument(
        "--skip-context",
        action="store_true",
        help="Skip the context addition step entirely",
    )
    parser.add_argument(
        "--skip-vectors", action="store_true", help="Skip vector store creation"
    )
    parser.add_argument(
        "--skip-upload", action="store_true", help="Skip uploading to HuggingFace"
    )
    parser.add_argument(
        "--skip-data-upload",
        action="store_true",
        help="Skip uploading data files (.jsonl and .pkl) to private HuggingFace repo (they are uploaded by default)",
    )

    args = parser.parse_args()

    # Ensure required data files exist before proceeding
    ensure_required_files_exist()

    # Execute the workflow steps
    if not args.skip_download:
        download_from_github(args.sources)

    if not args.skip_process:
        process_markdown_files(args.sources)

    if not args.skip_context:
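        # new_only is True unless --process-all-context was passed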
        add_context_to_nodes(not args.process_all_context)

    if not args.skip_vectors:
        create_vector_stores()

    if not args.skip_upload:
        # By default, also upload the data files (JSONL and PKL) unless explicitly skipped
        upload_to_huggingface(not args.skip_data_upload)

    logger.info("Documentation update workflow completed successfully")


if __name__ == "__main__":
    main()