doc-mcp / rag /config.py
mdabidhussain's picture
created doc-mcp
56f7920
import os
from datetime import datetime, timezone
from typing import List

from llama_index.embeddings.nebius import NebiusEmbedding
from llama_index.llms.nebius import NebiusLLM
from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch
from pymongo import MongoClient
from pymongo.operations import SearchIndexModel
# Language model client. The API key is looked up in the environment at import
# time; os.environ.get yields None when NEBIUS_API_KEY is not set.
llm = NebiusLLM(
    model="meta-llama/Llama-3.3-70B-Instruct-fast",
    api_key=os.environ.get("NEBIUS_API_KEY"),
)

# Embedding client, configured with the same environment key and a batch size
# of 10 texts per embedding request.
embed_model = NebiusEmbedding(
    model_name="BAAI/bge-en-icl",
    api_key=os.environ.get("NEBIUS_API_KEY"),
    embed_batch_size=10,
)
# MongoDB connection string, taken from the environment (None when unset).
MONGO_DB_URI = os.environ.get("MONGO_DB_URI")

# Single client instance shared by every helper in this module.
mongo_client = MongoClient(MONGO_DB_URI)

# Database name, plus the two collections used below: one for the embedded
# document chunks and one for per-repository ingestion records.
DB_NAME = "docmcp"
COLLECTION_NAME = "doc_rag"
REPOS_COLLECTION_NAME = "ingested_repos"

# Names of the vector-search and full-text search indexes on COLLECTION_NAME.
VS_INDEX_NAME = "vector_index"
FTS_INDEX_NAME = "fts_index"
# Vector-search index definition: cosine similarity over the "embedding" field,
# with "metadata.repo" declared as a filter field.
# NOTE(review): 4096 presumably matches the output size of `embed_model` —
# confirm before switching embedding models.
vs_model = SearchIndexModel(
    name=VS_INDEX_NAME,
    type="vectorSearch",
    definition={
        "fields": [
            {
                "type": "vector",
                "path": "embedding",
                "numDimensions": 4096,
                "similarity": "cosine",
            },
            {"type": "filter", "path": "metadata.repo"},
        ]
    },
)

# Full-text search index definition: dynamic mapping is off, so only the
# "text" field is indexed (as a string).
fts_model = SearchIndexModel(
    name=FTS_INDEX_NAME,
    type="search",
    definition={
        "mappings": {
            "dynamic": False,
            "fields": {"text": {"type": "string"}},
        }
    },
)
def get_vector_store():
    """Build the vector store and make sure its search indexes exist.

    The store is bound to ``DB_NAME``/``COLLECTION_NAME`` on the shared
    ``mongo_client`` and configured with the vector and full-text index names
    defined in this module.

    Returns:
        The configured ``MongoDBAtlasVectorSearch`` instance.

    Raises:
        Any pymongo error raised while listing or creating the search indexes
        is propagated to the caller.
    """
    collection = mongo_client[DB_NAME][COLLECTION_NAME]
    vector_store = MongoDBAtlasVectorSearch(
        mongodb_client=mongo_client,
        db_name=DB_NAME,
        collection_name=COLLECTION_NAME,
        vector_index_name=VS_INDEX_NAME,
        fulltext_index_name=FTS_INDEX_NAME,
        embedding_key="embedding",
        text_key="text",
    )

    # A create request for an index name that already exists is rejected by
    # Atlas, so calling this function twice used to fail. Only request the
    # indexes that are not there yet.
    existing_names = {
        index.get("name") for index in collection.list_search_indexes()
    }
    wanted = ((VS_INDEX_NAME, vs_model), (FTS_INDEX_NAME, fts_model))
    missing_models = [
        model for name, model in wanted if name not in existing_names
    ]
    if missing_models:
        collection.create_search_indexes(models=missing_models)

    return vector_store
def get_repos_collection():
    """Return the collection holding one tracking record per ingested repo."""
    database = mongo_client[DB_NAME]
    return database[REPOS_COLLECTION_NAME]
def store_ingested_repo(repo_name: str, ingested_files: List[str]) -> bool:
    """Insert or replace the tracking record for an ingested repository.

    The record is keyed by the repository name (used as ``_id``), so storing
    the same repository again overwrites the previous record.

    Args:
        repo_name: Repository identifier; doubles as the document ``_id``.
        ingested_files: Paths of the files that were ingested.

    Returns:
        True when the record was written, False when any error occurred (the
        error is printed, not raised).
    """
    try:
        repo_doc = {
            "_id": repo_name,  # repo name is the unique key
            "repo_name": repo_name,
            "ingested_files": ingested_files,
            "file_count": len(ingested_files),
            # get_repo_details() reads this field; without it every repository
            # is reported as last updated "Unknown". Stored as an ISO-8601 UTC
            # string so it stays the same type as that fallback.
            "last_updated": datetime.now(timezone.utc).isoformat(),
        }
        # Upsert: replace the existing record or insert a new one.
        get_repos_collection().replace_one(
            {"_id": repo_name}, repo_doc, upsert=True
        )
        print(f"✅ Stored repository: {repo_name} with {len(ingested_files)} files")
        return True
    except Exception as e:
        print(f"❌ Error storing repository data: {e}")
        return False
def get_available_repos():
    """Return the names of all tracked repositories in sorted order.

    Records that have no ``repo_name`` value are skipped, so a single
    malformed record does not blank out the whole listing.

    Returns:
        A sorted list of repository names; an empty list when nothing is
        tracked or when the lookup fails (the error is printed, not raised).
    """
    try:
        # Only the repo_name field is needed from each record.
        records = get_repos_collection().find({}, {"repo_name": 1})
        names = [
            record["repo_name"]
            for record in records
            if record.get("repo_name") is not None
        ]
        return sorted(names)
    except Exception as e:
        print(f"Error getting repos from database: {e}")
        # There is no fallback list; report "no repositories" instead.
        return []
def get_repo_details():
    """Return one summary dict per tracked repository.

    Each dict carries ``repo_name``, ``file_count``, ``last_updated`` and
    ``ingested_files``; fields missing from a stored record fall back to
    "Unknown", 0, "Unknown" and an empty list respectively. On any error the
    problem is printed and an empty list is returned.
    """
    try:
        return [
            {
                "repo_name": record.get("repo_name", "Unknown"),
                "file_count": record.get("file_count", 0),
                "last_updated": record.get("last_updated", "Unknown"),
                "ingested_files": record.get("ingested_files", []),
            }
            for record in get_repos_collection().find({})
        ]
    except Exception as e:
        print(f"Error getting repo details: {e}")
        return []
def delete_repository_data(repo_name):
    """Remove a repository's stored chunks and its tracking record.

    Args:
        repo_name: Name of the repository to delete. It is matched against
            ``metadata.repo`` in the document collection and against ``_id``
            in the tracking collection.

    Returns:
        A dict with the keys ``success`` (bool), ``message`` (str),
        ``vector_docs_deleted`` (int) and ``repo_record_deleted`` (bool).
        Errors are reported through ``success``/``message`` rather than
        raised; the counters reflect whatever was deleted before the error.
    """
    result = {
        "success": False,
        "message": "",
        "vector_docs_deleted": 0,
        "repo_record_deleted": False,
    }

    # A None filter value matches every document where the field is null or
    # missing, so an empty name must never reach delete_many.
    if not repo_name:
        result["message"] = "⚠️ No repository name provided; nothing was deleted"
        print(result["message"])
        return result

    try:
        # Delete the document chunks tagged with this repository.
        collection = mongo_client[DB_NAME][COLLECTION_NAME]
        vector_delete_result = collection.delete_many({"metadata.repo": repo_name})
        result["vector_docs_deleted"] = vector_delete_result.deleted_count

        # Delete the tracking record.
        repo_delete_result = get_repos_collection().delete_one({"_id": repo_name})
        result["repo_record_deleted"] = repo_delete_result.deleted_count > 0
    except Exception as e:
        # Keep the counters as they are: if the chunks were already removed
        # before the failure, the caller should see that.
        result["message"] = f"❌ Error deleting repository '{repo_name}': {e}"
        print(result["message"])
        return result

    if result["vector_docs_deleted"] > 0 or result["repo_record_deleted"]:
        result["success"] = True
        result["message"] = f"✅ Successfully deleted repository '{repo_name}'"
        if result["vector_docs_deleted"] > 0:
            result["message"] += (
                f" ({result['vector_docs_deleted']} documents removed)"
            )
    else:
        result["message"] = (
            f"⚠️ Repository '{repo_name}' not found or already deleted"
        )

    print(result["message"])
    return result
def get_repository_stats():
    """Return aggregate counts across the tracking and document collections.

    The result has three integer entries: ``total_repositories`` (tracking
    records), ``total_documents`` (entries in the document collection) and
    ``total_files`` (sum of each tracking record's ``file_count``, treating a
    missing value as 0). On any error the problem is printed and all three
    counts are reported as 0.
    """
    try:
        tracked = get_repos_collection()
        chunks = mongo_client[DB_NAME][COLLECTION_NAME]

        repo_count = tracked.count_documents({})
        chunk_count = chunks.count_documents({})
        # Only file_count is needed from each tracking record.
        file_total = sum(
            entry.get("file_count", 0)
            for entry in tracked.find({}, {"file_count": 1})
        )

        return {
            "total_repositories": repo_count,
            "total_documents": chunk_count,
            "total_files": file_total,
        }
    except Exception as e:
        print(f"Error getting repository stats: {e}")
        return {"total_repositories": 0, "total_documents": 0, "total_files": 0}