import traceback
from typing import List, Optional

from llama_index.core import StorageContext, VectorStoreIndex
from llama_index.core.schema import Document

from .config import get_vector_store, store_ingested_repo


async def ingest_documents_async(documents: List[Document], repo_name: Optional[str] = None):
    """Async version of document ingestion with detailed logging and repo tracking"""
    print(f"πŸ”„ Starting async ingestion of {len(documents)} documents")
    
    if repo_name:
        print(f"πŸ“ Repository: {repo_name}")

    try:
        # Get vector store
        vector_store = get_vector_store()
        print(f"βœ… Vector store retrieved: {type(vector_store)}")

        # Create storage context backed by the vector store
        storage_context = StorageContext.from_defaults(vector_store=vector_store)
        print(f"βœ… Storage context created: {type(storage_context)}")

        # Process documents and ensure repo metadata
        print("πŸ”„ Processing documents through pipeline...")
        ingested_files = []
        
        for i, doc in enumerate(documents):
            print(f"πŸ“„ Doc {i + 1}: {doc.doc_id} - {len(doc.text)} chars")
            print(f"   Metadata: {doc.metadata}")
            
            # Ensure repo metadata is properly set
            if repo_name and "repo" not in doc.metadata:
                doc.metadata["repo"] = repo_name
                print(f"   βœ… Added repo metadata: {repo_name}")
            
            # Track ingested file paths
            file_path = doc.metadata.get("file_path", doc.doc_id)
            if file_path not in ingested_files:
                ingested_files.append(file_path)

        # Run the ingestion
        print("πŸš€ Starting vector store ingestion...")
        vc_store_index = VectorStoreIndex.from_documents(
            documents=documents,
            storage_context=storage_context,
            show_progress=True,
        )
        print("βœ… Document Ingestion completed Successfully")

        # Store repository metadata if repo_name is provided
        if repo_name and ingested_files:
            store_success = store_ingested_repo(repo_name, ingested_files)
            if store_success:
                print(f"βœ… Repository metadata stored for {repo_name}")
            else:
                print(f"⚠️ Failed to store repository metadata for {repo_name}")

        return vc_store_index

    except Exception as e:
        print(f"❌ Error in async ingestion: {e}")
        traceback.print_exc()
        raise
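

# Illustrative usage sketch (an assumption, not part of the original module):
# drives the coroutine with a single placeholder Document. The repo name
# "example-repo", the file path, and the document text are made up for the
# example; since this module uses relative imports, run it through the
# package entry point (python -m <package>.<module>) rather than directly.
if __name__ == "__main__":
    import asyncio

    sample_docs = [
        Document(
            text="def hello():\n    return 'world'",
            metadata={"file_path": "src/hello.py"},
        )
    ]
    asyncio.run(ingest_documents_async(sample_docs, repo_name="example-repo"))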