Spaces:
Running
Running
Log Google Drive documents in MongoDB, add the source of each document, and make chunks overlap text.
acdfaa9
# src/db/mongodb_store.py | |
from motor.motor_asyncio import AsyncIOMotorClient | |
from datetime import datetime | |
from typing import List, Dict, Optional, Any | |
from bson import ObjectId | |
class MongoDBStore:
    """Async MongoDB persistence for documents, conversations and chat history.

    All collections live in the ``db_chatbot`` database:
    ``knowledge_base`` (document metadata), ``conversations``,
    ``chat_history`` and ``vector_metadata``.

    NOTE(review): timestamps are produced with ``datetime.now()`` (naive,
    local time). Confirm whether UTC is expected before changing this, since
    existing stored data would have to be migrated consistently.
    """

    def __init__(self, mongo_uri: str = "mongodb://localhost:27017"):
        """Open the Motor client and bind the collections used by the store.

        Args:
            mongo_uri (str): MongoDB connection string.
        """
        self.client = AsyncIOMotorClient(mongo_uri)
        self.db = self.client.db_chatbot
        self.chat_history = self.db.chat_history
        self.conversations = self.db.conversations
        self.documents = self.db.knowledge_base

    # ------------------------------------------------------------------
    # Document-related methods
    # ------------------------------------------------------------------
    async def store_document(
        self,
        document_id: str,
        filename: str,
        content_type: str,
        file_size: int,
        url_path: str,
        source: str
    ) -> str:
        """Store document metadata in MongoDB.

        Args:
            document_id (str): Application-level document identifier.
            filename (str): Original file name.
            content_type (str): Content type of the file.
            file_size (int): File size.
            url_path (str): Path/URL under which the file is reachable.
            source (str): Origin of the document.

        Returns:
            str: The ``document_id`` that was passed in.
        """
        document = {
            "document_id": document_id,
            "filename": filename,
            "content_type": content_type,
            "file_size": file_size,
            "url_path": url_path,
            "source": source,
            "upload_timestamp": datetime.now(),
        }
        await self.documents.insert_one(document)
        return document_id

    async def get_document(self, document_id: str) -> Optional[Dict]:
        """Retrieve one document by ``document_id`` (without ``_id``).

        Returns:
            Optional[Dict]: The stored document, or None when not found.
        """
        return await self.documents.find_one(
            {"document_id": document_id},
            {"_id": 0},
        )

    async def get_all_documents(self) -> List[Dict]:
        """Retrieve the metadata fields of every stored document."""
        projection = {
            "_id": 0,
            "document_id": 1,
            "filename": 1,
            "content_type": 1,
            "file_size": 1,
            "url_path": 1,
            "upload_timestamp": 1,
            "source": 1,
        }
        cursor = self.documents.find({}, projection)
        return await cursor.to_list(length=None)

    async def delete_document(self, document_id: str) -> bool:
        """Delete a document; return True if something was removed."""
        result = await self.documents.delete_one({"document_id": document_id})
        return result.deleted_count > 0

    async def find_existing_user(
        self,
        email: str,
        phone_number: str
    ) -> Optional[str]:
        """Find an existing conversation by email or phone number.

        Args:
            email (str): User's email.
            phone_number (str): User's phone number.

        Returns:
            Optional[str]: Conversation ID if found, None otherwise.
        """
        # Only search on values that were actually supplied. A clause such as
        # {"email": None} also matches documents that have no "email" field at
        # all, which would hand back an unrelated user's conversation.
        criteria = []
        if email:
            criteria.append({"email": email})
        if phone_number:
            criteria.append({"phone_number": phone_number})
        if not criteria:
            return None

        result = await self.conversations.find_one({"$or": criteria})
        return result["conversation_id"] if result else None

    # ------------------------------------------------------------------
    # Conversation and chat history methods
    # ------------------------------------------------------------------
    async def create_conversation(
        self,
        conversation_id: str,
        metadata: Optional[Dict] = None,
        full_name: Optional[str] = None,
        email: Optional[str] = None,
        phone_number: Optional[str] = None
    ) -> str:
        """Create a new conversation.

        Args:
            conversation_id (str): Unique conversation ID.
            metadata (Optional[Dict]): Additional metadata.
            full_name (Optional[str]): User's full name.
            email (Optional[str]): User's email.
            phone_number (Optional[str]): User's phone number.

        Returns:
            str: Conversation ID.
        """
        # One timestamp so created_at and last_updated start out identical.
        now = datetime.now()
        conversation = {
            "conversation_id": conversation_id,
            "created_at": now,
            "last_updated": now,
            "message_count": 0,
            "metadata": metadata or {},
        }

        # User information is stored only when provided (falsy values skipped).
        if full_name:
            conversation["full_name"] = full_name
        if email:
            conversation["email"] = email
        if phone_number:
            conversation["phone_number"] = phone_number

        await self.conversations.insert_one(conversation)
        return conversation_id

    async def get_conversation_metadata(
        self,
        conversation_id: str
    ) -> Optional[Dict]:
        """Return the conversation document with ``_id`` converted to str.

        Returns:
            Optional[Dict]: The conversation document, or None when not found.
        """
        result = await self.conversations.find_one(
            {"conversation_id": conversation_id}
        )
        if result:
            result["_id"] = str(result["_id"])
        return result

    async def update_conversation_metadata(
        self,
        conversation_id: str,
        metadata: Dict
    ) -> bool:
        """Replace the conversation's ``metadata`` object.

        The whole ``metadata`` field is overwritten with the given dict and
        ``last_updated`` is refreshed.

        Returns:
            bool: True if a conversation document was modified.
        """
        result = await self.conversations.update_one(
            {"conversation_id": conversation_id},
            {
                "$set": {
                    "metadata": metadata,
                    "last_updated": datetime.now(),
                }
            },
        )
        return result.modified_count > 0

    async def store_message(
        self,
        conversation_id: str,
        query: str,
        response: str,
        context: List[str],
        sources: List[Dict],
        llm_provider: str
    ) -> str:
        """Store one user/assistant exchange as two chat-history documents.

        Args:
            conversation_id (str): Conversation the exchange belongs to.
            query (str): User's message.
            response (str): Assistant's reply.
            context (List[str]): Context passed along with the exchange.
            sources (List[Dict]): Sources attached to the exchange.
            llm_provider (str): Name of the LLM provider used.

        Returns:
            str: Inserted ``_id`` of the assistant message, as a string.
        """
        user_message = {
            "conversation_id": conversation_id,
            "timestamp": datetime.now(),
            "role": "user",
            "content": query,
            "query": query,  # Keep for backward compatibility
            "response": None,
            "context": context,
            "sources": sources,
            "llm_provider": llm_provider,
            "feedback": None,
            "rating": None,
        }
        await self.chat_history.insert_one(user_message)

        assistant_message = {
            "conversation_id": conversation_id,
            "timestamp": datetime.now(),
            "role": "assistant",
            "content": response,
            "query": None,
            "response": response,  # Keep for backward compatibility
            "context": context,
            "sources": sources,
            "llm_provider": llm_provider,
            "feedback": None,
            "rating": None,
        }
        result = await self.chat_history.insert_one(assistant_message)

        now = datetime.now()
        await self.conversations.update_one(
            {"conversation_id": conversation_id},
            {
                "$set": {"last_updated": now},
                # Increment by 2 since we store both messages
                "$inc": {"message_count": 2},
                # When the upsert has to create the conversation, give it the
                # same base fields create_conversation() would have written.
                "$setOnInsert": {"created_at": now, "metadata": {}},
            },
            upsert=True,
        )
        return str(result.inserted_id)

    async def get_conversation_history(self, conversation_id: str) -> List[Dict]:
        """Retrieve the complete conversation history, oldest first.

        Returns:
            List[Dict]: Raw chat-history documents with ``_id`` as str.
        """
        # _id is a tie-breaker for messages whose timestamps compare equal.
        cursor = self.chat_history.find(
            {"conversation_id": conversation_id}
        ).sort([("timestamp", 1), ("_id", 1)])

        history = []
        async for document in cursor:
            document["_id"] = str(document["_id"])
            history.append(document)
        return history

    async def get_recent_messages(
        self,
        conversation_id: str,
        limit: int = 5
    ) -> List[Dict]:
        """Get the most recent messages of a conversation, oldest first.

        Args:
            conversation_id (str): Conversation ID.
            limit (int): Number of user/assistant pairs; up to ``limit * 2``
                messages are returned.

        Returns:
            List[Dict]: Formatted messages in chronological order.
        """
        # A cursor limit of 0 means "no limit" in MongoDB, which would return
        # the whole history instead of nothing.
        if limit <= 0:
            return []

        cursor = (
            self.chat_history.find({"conversation_id": conversation_id})
            .sort([("timestamp", -1), ("_id", -1)])
            # Multiply limit by 2 to account for user-assistant pairs
            .limit(limit * 2)
        )
        messages = [self._format_message(doc) async for doc in cursor]
        messages.reverse()
        return messages

    async def update_feedback(
        self,
        conversation_id: str,
        feedback: Optional[str],
        rating: Optional[int]
    ) -> bool:
        """Update feedback for every message of a conversation.

        Args:
            conversation_id (str): Conversation ID.
            feedback (Optional[str]): Feedback text.
            rating (Optional[int]): Numeric rating.

        Returns:
            bool: True if at least one message was modified; False when
            neither feedback nor rating was given or nothing changed.
        """
        update_fields: Dict[str, Any] = {}
        formatted_rating: Optional[str] = None

        if feedback is not None:
            update_fields["feedback"] = feedback
        if rating is not None:
            # Imported here rather than at module level, as in the original.
            from config.config import settings

            formatted_rating = f"{rating}/{settings.MAX_RATING}"
            update_fields["rating"] = rating  # Store numeric value
            update_fields["formatted_rating"] = formatted_rating  # Formatted string

        if not update_fields:
            return False

        result = await self.chat_history.update_many(
            {"conversation_id": conversation_id},
            {"$set": update_fields},
        )
        updated = result.modified_count > 0

        if updated:
            # Set individual metadata keys via dotted paths so that metadata
            # stored when the conversation was created is not wiped out.
            now = datetime.now()
            await self.conversations.update_one(
                {"conversation_id": conversation_id},
                {
                    "$set": {
                        "metadata.last_feedback": now,
                        "metadata.last_rating": rating,
                        "metadata.formatted_rating": formatted_rating,
                        "last_updated": now,
                    }
                },
            )
        return updated

    async def get_messages_for_summary(
        self,
        conversation_id: str
    ) -> List[Dict]:
        """Get messages in a format suitable for summarization.

        Returns:
            List[Dict]: Dicts with ``role``, ``content``, ``timestamp`` and
            ``sources`` only, oldest first.
        """
        cursor = self.chat_history.find(
            {"conversation_id": conversation_id}
        ).sort([("timestamp", 1), ("_id", 1)])

        messages = []
        async for doc in cursor:
            formatted = self._format_message(doc)
            # For summary, we only need specific fields
            messages.append({
                'role': formatted['role'],
                'content': formatted['content'],
                'timestamp': formatted['timestamp'],
                'sources': formatted['sources'],
            })
        return messages

    def _format_message(self, doc: Dict) -> Dict:
        """Format a chat-history document into a consistent message dict.

        Documents without ``role``/``content`` keys fall back to the
        ``query``/``response`` fields kept for backward compatibility.
        """
        # Without an explicit role, a non-empty "query" marks a user message.
        fallback_role = "user" if doc.get("query") else "assistant"
        fallback_content = doc.get("query") or doc.get("response", "")
        return {
            "_id": str(doc["_id"]) if "_id" in doc else None,
            "conversation_id": doc.get("conversation_id"),
            "timestamp": doc.get("timestamp"),
            "role": doc.get("role", fallback_role),
            "content": doc.get("content", fallback_content),
            "context": doc.get("context", []),
            "sources": doc.get("sources", []),
            "llm_provider": doc.get("llm_provider"),
            "feedback": doc.get("feedback"),
            "rating": doc.get("rating"),
        }

    # ------------------------------------------------------------------
    # Vector store related methods
    # ------------------------------------------------------------------
    async def store_vector_metadata(
        self,
        document_id: str,
        chunk_id: str,
        metadata: Dict[str, Any]
    ) -> str:
        """Store metadata for one vector chunk.

        Returns:
            str: Inserted ``_id`` as a string.
        """
        vector_metadata = {
            "document_id": document_id,
            "chunk_id": chunk_id,
            "metadata": metadata,
            "created_at": datetime.now(),
        }
        result = await self.db.vector_metadata.insert_one(vector_metadata)
        return str(result.inserted_id)

    async def get_vector_metadata(
        self,
        document_id: str
    ) -> List[Dict]:
        """Get all vector metadata entries for a document."""
        cursor = self.db.vector_metadata.find(
            {"document_id": document_id}
        )
        return await cursor.to_list(length=None)

    async def delete_vector_metadata(
        self,
        document_id: str
    ) -> bool:
        """Delete vector metadata for a document; True if anything was removed."""
        result = await self.db.vector_metadata.delete_many(
            {"document_id": document_id}
        )
        return result.deleted_count > 0