# Spaces: Running  (status banner carried over from the hosting page; not part of the module)
import os
from datetime import datetime, timezone
from typing import List

from llama_index.embeddings.nebius import NebiusEmbedding
from llama_index.llms.nebius import NebiusLLM
from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch
from pymongo import MongoClient
from pymongo.operations import SearchIndexModel
# LLM client; the Nebius API key is taken from the NEBIUS_API_KEY environment variable.
llm = NebiusLLM(
    model="meta-llama/Llama-3.3-70B-Instruct-fast",
    api_key=os.environ.get("NEBIUS_API_KEY"),
)

# Embedding client using the same API key; embed_batch_size is set to 10.
embed_model = NebiusEmbedding(
    model_name="BAAI/bge-en-icl",
    api_key=os.environ.get("NEBIUS_API_KEY"),
    embed_batch_size=10,
)
# Database, collection, and search-index names used throughout this module.
DB_NAME = "docmcp"
COLLECTION_NAME = "doc_rag"  # collection backing the vector store
REPOS_COLLECTION_NAME = "ingested_repos"  # one tracking record per ingested repository
VS_INDEX_NAME = "vector_index"
FTS_INDEX_NAME = "fts_index"

# The connection string is read from the environment; a single client is shared module-wide.
MONGO_DB_URI = os.environ.get("MONGO_DB_URI")
mongo_client = MongoClient(MONGO_DB_URI)
# Vector-search definition: cosine similarity over the 4096-dimensional
# "embedding" field, with "metadata.repo" declared as a filter field.
_VS_DEFINITION = {
    "fields": [
        {
            "type": "vector",
            "path": "embedding",
            "numDimensions": 4096,
            "similarity": "cosine",
        },
        {"type": "filter", "path": "metadata.repo"},
    ]
}

# Full-text definition: static mapping that indexes only the "text" field as a string.
_FTS_DEFINITION = {
    "mappings": {
        "dynamic": False,
        "fields": {"text": {"type": "string"}},
    }
}

vs_model = SearchIndexModel(
    name=VS_INDEX_NAME,
    type="vectorSearch",
    definition=_VS_DEFINITION,
)

fts_model = SearchIndexModel(
    name=FTS_INDEX_NAME,
    type="search",
    definition=_FTS_DEFINITION,
)
def get_vector_store():
    """Return the Atlas vector store for the document collection.

    The store is bound to ``DB_NAME``/``COLLECTION_NAME`` and reads embeddings
    from the ``"embedding"`` field and text from the ``"text"`` field. Before
    returning, the vector-search and full-text search indexes are created if
    they are not already present on the collection.

    Returns:
        The configured ``MongoDBAtlasVectorSearch`` instance.

    Raises:
        Any exception raised by the MongoDB driver while listing or creating
        search indexes is propagated to the caller.
    """
    collection = mongo_client[DB_NAME][COLLECTION_NAME]
    vector_store = MongoDBAtlasVectorSearch(
        mongodb_client=mongo_client,
        db_name=DB_NAME,
        collection_name=COLLECTION_NAME,
        vector_index_name=VS_INDEX_NAME,
        fulltext_index_name=FTS_INDEX_NAME,
        embedding_key="embedding",
        text_key="text",
    )

    # Creating an index whose name already exists is rejected by Atlas, so a
    # second call to this function used to raise. Only submit what is missing.
    existing_names = {index["name"] for index in collection.list_search_indexes()}
    wanted = ((VS_INDEX_NAME, vs_model), (FTS_INDEX_NAME, fts_model))
    missing_models = [model for name, model in wanted if name not in existing_names]
    if missing_models:
        collection.create_search_indexes(models=missing_models)

    return vector_store
def get_repos_collection():
    """Return the collection that holds one tracking record per ingested repository."""
    database = mongo_client[DB_NAME]
    return database[REPOS_COLLECTION_NAME]
def store_ingested_repo(repo_name: str, ingested_files: List[str]) -> bool:
    """Create or overwrite the tracking record for an ingested repository.

    Args:
        repo_name: Repository name; also used as the document ``_id`` so each
            repository has exactly one record.
        ingested_files: Paths of the files that were ingested for this repo.

    Returns:
        ``True`` when the record was written, ``False`` when any error
        occurred (the error is printed, not raised).
    """
    try:
        repo_doc = {
            "_id": repo_name,  # repo name doubles as the unique key
            "repo_name": repo_name,
            "ingested_files": ingested_files,
            "file_count": len(ingested_files),
            # get_repo_details() reports this field; it was never written
            # before, so every repository showed "Unknown". Stored as an ISO
            # string so it stays as easy to display as that fallback.
            "last_updated": datetime.now(timezone.utc).isoformat(),
        }

        # replace_one + upsert: update the record if it exists, insert otherwise.
        get_repos_collection().replace_one({"_id": repo_name}, repo_doc, upsert=True)

        print(f"✅ Stored repository: {repo_name} with {len(ingested_files)} files")
        return True
    except Exception as e:
        print(f"❌ Error storing repository data: {e}")
        return False
def get_available_repos():
    """Return the names of all tracked repositories, sorted.

    Returns:
        A sorted list of ``repo_name`` values. The list is empty when no
        repository has been recorded or when the lookup fails (the error is
        printed, not raised).
    """
    try:
        # Project only the name field; nothing else is needed here.
        cursor = get_repos_collection().find({}, {"repo_name": 1})
        names = [doc["repo_name"] for doc in cursor]
        # Sorting an empty list leaves it empty, so no separate branch is needed.
        names.sort()
        return names
    except Exception as e:
        print(f"Error getting repos from database: {e}")
        return []
def get_repo_details():
    """Return a summary dict for every tracked repository.

    Each dict carries ``repo_name``, ``file_count``, ``last_updated`` and
    ``ingested_files``. Fields missing from a stored record fall back to
    ``"Unknown"``, ``0``, ``"Unknown"`` and ``[]`` respectively.

    Returns:
        A list of summary dicts, or an empty list when the lookup fails
        (the error is printed, not raised).
    """
    try:
        return [
            {
                "repo_name": record.get("repo_name", "Unknown"),
                "file_count": record.get("file_count", 0),
                "last_updated": record.get("last_updated", "Unknown"),
                "ingested_files": record.get("ingested_files", []),
            }
            for record in get_repos_collection().find({})
        ]
    except Exception as e:
        print(f"Error getting repo details: {e}")
        return []
def delete_repository_data(repo_name):
    """Delete a repository's vector-store documents and its tracking record.

    Args:
        repo_name: Name of the repository to remove. It is matched against
            ``metadata.repo`` in the document collection and against ``_id``
            in the tracking collection.

    Returns:
        A dict with keys ``success`` (bool), ``message`` (str),
        ``vector_docs_deleted`` (int) and ``repo_record_deleted`` (bool).
        ``success`` is ``True`` only when at least one document or the
        tracking record was removed. Errors are reported through ``message``
        and printed; they are not raised.
    """
    result = {
        "success": False,
        "message": "",
        "vector_docs_deleted": 0,
        "repo_record_deleted": False,
    }

    # An empty name must never reach MongoDB: the filter {"metadata.repo": None}
    # matches every document where that field is null or missing, so it could
    # wipe documents that belong to no repository at all.
    if not repo_name:
        result["message"] = "⚠️ No repository name provided; nothing was deleted"
        print(result["message"])
        return result

    try:
        # Remove the documents tagged with this repository.
        collection = mongo_client[DB_NAME][COLLECTION_NAME]
        vector_delete_result = collection.delete_many({"metadata.repo": repo_name})
        result["vector_docs_deleted"] = vector_delete_result.deleted_count

        # Remove the tracking record.
        repo_delete_result = get_repos_collection().delete_one({"_id": repo_name})
        result["repo_record_deleted"] = repo_delete_result.deleted_count > 0

        if result["vector_docs_deleted"] > 0 or result["repo_record_deleted"]:
            result["success"] = True
            result["message"] = f"✅ Successfully deleted repository '{repo_name}'"
            if result["vector_docs_deleted"] > 0:
                result["message"] += (
                    f" ({result['vector_docs_deleted']} documents removed)"
                )
        else:
            result["message"] = (
                f"⚠️ Repository '{repo_name}' not found or already deleted"
            )

        print(result["message"])
        return result
    except Exception as e:
        error_msg = f"❌ Error deleting repository '{repo_name}': {e}"
        print(error_msg)
        # Keep whatever was already deleted in the counts so a failure midway
        # (documents gone, tracking record still present) is reported truthfully.
        result["success"] = False
        result["message"] = error_msg
        return result
def get_repository_stats():
    """Return aggregate counts across the tracking and document collections.

    Returns:
        A dict with ``total_repositories`` (tracking records),
        ``total_documents`` (documents in the vector-store collection) and
        ``total_files`` (sum of each record's ``file_count``, treating a
        missing value as 0). All three are 0 when the lookup fails (the error
        is printed, not raised).
    """
    try:
        tracked_repos = get_repos_collection()
        documents = mongo_client[DB_NAME][COLLECTION_NAME]

        # Only file_count is needed to total the files.
        file_count_docs = tracked_repos.find({}, {"file_count": 1})

        return {
            "total_repositories": tracked_repos.count_documents({}),
            "total_documents": documents.count_documents({}),
            "total_files": sum(
                doc.get("file_count", 0) for doc in file_count_docs
            ),
        }
    except Exception as e:
        print(f"Error getting repository stats: {e}")
        return {"total_repositories": 0, "total_documents": 0, "total_files": 0}