"""MongoDB Atlas persistence helpers for the documentation RAG store.

Defines the shared LLM / embedding clients, the Atlas search index models,
and helper functions that track which repositories have been ingested.
"""

import os
from datetime import datetime, timezone
from typing import List

from llama_index.embeddings.nebius import NebiusEmbedding
from llama_index.llms.nebius import NebiusLLM
from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch
from pymongo import MongoClient
from pymongo.operations import SearchIndexModel

llm = NebiusLLM(
    model="meta-llama/Llama-3.3-70B-Instruct-fast",
    api_key=os.getenv("NEBIUS_API_KEY"),
)

embed_model = NebiusEmbedding(
    model_name="BAAI/bge-en-icl",
    api_key=os.getenv("NEBIUS_API_KEY"),
    embed_batch_size=10,
)

MONGO_DB_URI = os.getenv("MONGO_DB_URI")
mongo_client = MongoClient(MONGO_DB_URI)

# Database and collection names
DB_NAME = "docmcp"
COLLECTION_NAME = "doc_rag"
REPOS_COLLECTION_NAME = "ingested_repos"

# Atlas search index names
VS_INDEX_NAME = "vector_index"
FTS_INDEX_NAME = "fts_index"

# Vector search index over the "embedding" field, filterable by repository.
# NOTE(review): numDimensions presumably matches the output size of
# `embed_model` -- confirm before switching embedding models.
vs_model = SearchIndexModel(
    definition={
        "fields": [
            {
                "type": "vector",
                "path": "embedding",
                "numDimensions": 4096,
                "similarity": "cosine",
            },
            {"type": "filter", "path": "metadata.repo"},
        ]
    },
    name=VS_INDEX_NAME,
    type="vectorSearch",
)

# Full-text search index over the "text" field only (dynamic mapping off).
fts_model = SearchIndexModel(
    definition={
        "mappings": {"dynamic": False, "fields": {"text": {"type": "string"}}}
    },
    name=FTS_INDEX_NAME,
    type="search",
)


def _ensure_search_indexes(collection) -> None:
    """Create the vector and full-text search indexes that do not exist yet."""
    existing = {index["name"] for index in collection.list_search_indexes()}
    wanted = ((VS_INDEX_NAME, vs_model), (FTS_INDEX_NAME, fts_model))
    missing = [model for name, model in wanted if name not in existing]
    if missing:
        collection.create_search_indexes(models=missing)


def get_vector_store():
    """Return a vector store bound to the document collection.

    Also makes sure both search indexes exist on the collection. Only the
    missing indexes are created, so the function can be called repeatedly
    without asking the server to re-create an index that is already there.

    Returns:
        A ``MongoDBAtlasVectorSearch`` configured with the module-level
        client, database/collection names and index names.
    """
    collection = mongo_client[DB_NAME][COLLECTION_NAME]
    vector_store = MongoDBAtlasVectorSearch(
        mongodb_client=mongo_client,
        db_name=DB_NAME,
        collection_name=COLLECTION_NAME,
        vector_index_name=VS_INDEX_NAME,
        fulltext_index_name=FTS_INDEX_NAME,
        embedding_key="embedding",
        text_key="text",
    )
    _ensure_search_indexes(collection)
    return vector_store


def get_repos_collection():
    """Return the collection that tracks ingested repositories."""
    return mongo_client[DB_NAME][REPOS_COLLECTION_NAME]


def store_ingested_repo(repo_name: str, ingested_files: List[str]) -> bool:
    """Insert or replace the tracking record for an ingested repository.

    Args:
        repo_name: Repository name; also used as the document ``_id``.
        ingested_files: Paths of the files that were ingested.

    Returns:
        True if the record was written, False if any error occurred (the
        error is printed, not raised).
    """
    try:
        repos_collection = get_repos_collection()

        repo_doc = {
            "_id": repo_name,  # Use repo name as unique ID
            "repo_name": repo_name,
            "ingested_files": ingested_files,
            "file_count": len(ingested_files),
            # Read back by get_repo_details(); stored as a UTC ISO-8601 string.
            "last_updated": datetime.now(timezone.utc).isoformat(),
        }

        # Upsert the document (update if exists, insert if not)
        repos_collection.replace_one({"_id": repo_name}, repo_doc, upsert=True)

        print(f"✅ Stored repository: {repo_name} with {len(ingested_files)} files")
        return True
    except Exception as e:
        print(f"❌ Error storing repository data: {e}")
        return False


def get_available_repos() -> List[str]:
    """Return the sorted names of all tracked repositories.

    Returns:
        Sorted list of ``repo_name`` values, or an empty list when there are
        no repositories or the lookup fails (the error is printed).
    """
    try:
        repos = get_repos_collection().find({}, {"repo_name": 1})
        return sorted(repo["repo_name"] for repo in repos)
    except Exception as e:
        print(f"Error getting repos from database: {e}")
        return []


def get_repo_details() -> List[dict]:
    """Get detailed information about all repositories.

    Returns:
        One dict per tracked repository with the keys ``repo_name``,
        ``file_count``, ``last_updated`` and ``ingested_files``; missing
        fields fall back to placeholder values. Returns an empty list if the
        lookup fails (the error is printed).
    """
    try:
        repos = get_repos_collection().find({})
        return [
            {
                "repo_name": repo.get("repo_name", "Unknown"),
                "file_count": repo.get("file_count", 0),
                "last_updated": repo.get("last_updated", "Unknown"),
                "ingested_files": repo.get("ingested_files", []),
            }
            for repo in repos
        ]
    except Exception as e:
        print(f"Error getting repo details: {e}")
        return []


def delete_repository_data(repo_name: str) -> dict:
    """Delete a repository's documents and its tracking record.

    Args:
        repo_name: Name of the repository to remove.

    Returns:
        A dict with the keys ``success``, ``message``,
        ``vector_docs_deleted`` and ``repo_record_deleted``. ``success`` is
        True when at least one document or the tracking record was removed.
        On error ``success`` is False and the counts reflect whatever had
        been deleted before the failure.
    """
    result = {
        "success": False,
        "message": "",
        "vector_docs_deleted": 0,
        "repo_record_deleted": False,
    }

    try:
        # Delete from vector store (documents with this repo metadata)
        collection = mongo_client[DB_NAME][COLLECTION_NAME]
        vector_delete_result = collection.delete_many({"metadata.repo": repo_name})
        result["vector_docs_deleted"] = vector_delete_result.deleted_count

        # Delete from repos tracking collection
        repo_delete_result = get_repos_collection().delete_one({"_id": repo_name})
        result["repo_record_deleted"] = repo_delete_result.deleted_count > 0

        if result["vector_docs_deleted"] > 0 or result["repo_record_deleted"]:
            result["success"] = True
            result["message"] = f"✅ Successfully deleted repository '{repo_name}'"
            if result["vector_docs_deleted"] > 0:
                result["message"] += (
                    f" ({result['vector_docs_deleted']} documents removed)"
                )
        else:
            result["message"] = (
                f"⚠️ Repository '{repo_name}' not found or already deleted"
            )

        print(result["message"])
        return result
    except Exception as e:
        error_msg = f"❌ Error deleting repository '{repo_name}': {str(e)}"
        print(error_msg)
        # Keep the counts gathered so far: a failure in the second delete
        # must not hide documents that were already removed.
        result["success"] = False
        result["message"] = error_msg
        return result


def get_repository_stats() -> dict:
    """Return aggregate counts across the tracking and document collections.

    Returns:
        A dict with ``total_repositories`` (tracking records),
        ``total_documents`` (documents in the vector collection) and
        ``total_files`` (sum of every record's ``file_count``). All three are
        0 if the lookup fails (the error is printed).
    """
    try:
        repos_collection = get_repos_collection()
        collection = mongo_client[DB_NAME][COLLECTION_NAME]

        total_repos = repos_collection.count_documents({})
        total_docs = collection.count_documents({})

        # Only the file_count field is needed to total the files.
        total_files = sum(
            repo.get("file_count", 0)
            for repo in repos_collection.find({}, {"file_count": 1})
        )

        return {
            "total_repositories": total_repos,
            "total_documents": total_docs,
            "total_files": total_files,
        }
    except Exception as e:
        print(f"Error getting repository stats: {e}")
        return {"total_repositories": 0, "total_documents": 0, "total_files": 0}