# doc-mcp/rag/ingest.py
# Commit 56f7920 ("created doc-mcp") by mdabidhussain
from typing import List, Optional

from llama_index.core import StorageContext, VectorStoreIndex
from llama_index.core.schema import Document

from .config import get_vector_store, store_ingested_repo
async def ingest_documents_async(
    documents: List[Document], repo_name: Optional[str] = None
) -> VectorStoreIndex:
    """Index ``documents`` in the configured vector store and record the repo.

    Progress is reported with ``print`` at every step. When ``repo_name`` is
    given, it is written to ``doc.metadata["repo"]`` for every document that
    does not already carry a ``"repo"`` key (the documents are mutated in
    place), and after a successful ingestion the distinct file paths are
    passed to ``store_ingested_repo``.

    Args:
        documents: Documents to index.
        repo_name: Optional repository name used to tag documents and to
            store the list of ingested files. Falsy values disable both.

    Returns:
        The ``VectorStoreIndex`` built from ``documents``.

    Raises:
        Exception: Any error raised while ingesting is printed together with
            its traceback and then re-raised unchanged.
    """
    import traceback

    print(f"πŸ”„ Starting async ingestion of {len(documents)} documents")
    if repo_name:
        print(f"πŸ“ Repository: {repo_name}")

    try:
        # Get vector store
        vector_store = get_vector_store()
        print(f"βœ… Vector store retrieved: {type(vector_store)}")

        # Create storage context
        vector_store_context = StorageContext.from_defaults(vector_store=vector_store)
        print(f"βœ… Vector Store context created: {type(vector_store_context)}")

        # Process documents and ensure repo metadata
        print("πŸ”„ Processing documents through pipeline...")
        # Dict keys give O(1) de-duplication while preserving first-seen
        # order (assumes file paths are hashable, i.e. strings).
        seen_paths = {}
        for position, doc in enumerate(documents, start=1):
            print(f"πŸ“„ Doc {position}: {doc.doc_id} - {len(doc.text)} chars")
            print(f" Metadata: {doc.metadata}")

            # Ensure repo metadata is properly set; an existing "repo" value
            # on the document is never overwritten.
            if repo_name and "repo" not in doc.metadata:
                doc.metadata["repo"] = repo_name
                print(f" βœ… Added repo metadata: {repo_name}")

            # Track ingested file paths, falling back to the document id when
            # the metadata has no "file_path" entry.
            file_path = doc.metadata.get("file_path", doc.doc_id)
            seen_paths.setdefault(file_path, None)
        ingested_files = list(seen_paths)

        # Run the ingestion
        # NOTE(review): this call is not awaited, so it runs inline on the
        # caller's event loop for its whole duration — confirm that is
        # acceptable for the callers of this coroutine.
        print("πŸš€ Starting vector store ingestion...")
        vc_store_index = VectorStoreIndex.from_documents(
            documents=documents,
            storage_context=vector_store_context,
            show_progress=True,
        )
        print("βœ… Document Ingestion completed Successfully")

        # Store repository metadata if repo_name is provided
        if repo_name and ingested_files:
            store_success = store_ingested_repo(repo_name, ingested_files)
            if store_success:
                print(f"βœ… Repository metadata stored for {repo_name}")
            else:
                # A failed metadata write is reported but does not fail the
                # ingestion: the index is still returned.
                print(f"⚠️ Failed to store repository metadata for {repo_name}")

        return vc_store_index
    except Exception as e:
        print(f"❌ Error in async ingestion: {e}")
        traceback.print_exc()
        # Bare raise re-raises the active exception with its original traceback.
        raise