TalatMasood commited on
Commit
4daad35
·
1 Parent(s): d161383

Updated knowledge base with upload, get, and delete endpoints

Browse files
src/__pycache__/main.cpython-312.pyc CHANGED
Binary files a/src/__pycache__/main.cpython-312.pyc and b/src/__pycache__/main.cpython-312.pyc differ
 
src/db/__pycache__/mongodb_store.cpython-312.pyc CHANGED
Binary files a/src/db/__pycache__/mongodb_store.cpython-312.pyc and b/src/db/__pycache__/mongodb_store.cpython-312.pyc differ
 
src/db/mongodb_store.py CHANGED
@@ -1,7 +1,6 @@
1
  # src/db/mongodb_store.py
2
  from motor.motor_asyncio import AsyncIOMotorClient
3
  from datetime import datetime
4
- import json
5
  from typing import List, Dict, Optional, Any
6
  from bson import ObjectId
7
 
@@ -9,25 +8,25 @@ class MongoDBStore:
9
  def __init__(self, mongo_uri: str = "mongodb://localhost:27017"):
10
  """Initialize MongoDB connection"""
11
  self.client = AsyncIOMotorClient(mongo_uri)
12
- self.db = self.client.rag_chatbot
13
  self.chat_history = self.db.chat_history
14
- self.documents = self.db.documents # Collection for original documents
15
 
16
  async def store_document(
17
  self,
18
  document_id: str,
19
  filename: str,
20
- content: str,
21
  content_type: str,
22
- file_size: int
 
23
  ) -> str:
24
- """Store original document in MongoDB"""
25
  document = {
26
  "document_id": document_id,
27
  "filename": filename,
28
- "content": content,
29
  "content_type": content_type,
30
  "file_size": file_size,
 
31
  "upload_timestamp": datetime.now()
32
  }
33
 
@@ -38,12 +37,23 @@ class MongoDBStore:
38
  """Retrieve document by ID"""
39
  return await self.documents.find_one(
40
  {"document_id": document_id},
41
- {"_id": 0} # Exclude MongoDB's _id
42
  )
43
 
44
  async def get_all_documents(self) -> List[Dict]:
45
  """Retrieve all documents"""
46
- cursor = self.documents.find({}, {"_id": 0})
 
 
 
 
 
 
 
 
 
 
 
47
  return await cursor.to_list(length=None)
48
 
49
  async def store_message(
@@ -117,4 +127,9 @@ class MongoDBStore:
117
  'sources': doc['sources']
118
  })
119
 
120
- return messages
 
 
 
 
 
 
1
  # src/db/mongodb_store.py
2
  from motor.motor_asyncio import AsyncIOMotorClient
3
  from datetime import datetime
 
4
  from typing import List, Dict, Optional, Any
5
  from bson import ObjectId
6
 
 
8
  def __init__(self, mongo_uri: str = "mongodb://localhost:27017"):
9
  """Initialize MongoDB connection"""
10
  self.client = AsyncIOMotorClient(mongo_uri)
11
+ self.db = self.client.db_chatbot
12
  self.chat_history = self.db.chat_history
13
+ self.documents = self.db.knowledge_base
14
 
15
  async def store_document(
16
  self,
17
  document_id: str,
18
  filename: str,
 
19
  content_type: str,
20
+ file_size: int,
21
+ url_path: str
22
  ) -> str:
23
+ """Store document metadata in MongoDB"""
24
  document = {
25
  "document_id": document_id,
26
  "filename": filename,
 
27
  "content_type": content_type,
28
  "file_size": file_size,
29
+ "url_path": url_path,
30
  "upload_timestamp": datetime.now()
31
  }
32
 
 
37
  """Retrieve document by ID"""
38
  return await self.documents.find_one(
39
  {"document_id": document_id},
40
+ {"_id": 0}
41
  )
42
 
43
  async def get_all_documents(self) -> List[Dict]:
44
  """Retrieve all documents"""
45
+ cursor = self.documents.find(
46
+ {},
47
+ {
48
+ "_id": 0,
49
+ "document_id": 1,
50
+ "filename": 1,
51
+ "content_type": 1,
52
+ "file_size": 1,
53
+ "url_path": 1,
54
+ "upload_timestamp": 1
55
+ }
56
+ )
57
  return await cursor.to_list(length=None)
58
 
59
  async def store_message(
 
127
  'sources': doc['sources']
128
  })
129
 
130
+ return messages
131
+
132
+ async def delete_document(self, document_id: str) -> bool:
133
+ """Delete document from MongoDB"""
134
+ result = await self.documents.delete_one({"document_id": document_id})
135
+ return result.deleted_count > 0
src/implementations/__pycache__/document_service.cpython-312.pyc CHANGED
Binary files a/src/implementations/__pycache__/document_service.cpython-312.pyc and b/src/implementations/__pycache__/document_service.cpython-312.pyc differ
 
src/implementations/document_service.py CHANGED
@@ -1,4 +1,5 @@
1
  # src/implementations/document_service.py
 
2
  from pathlib import Path
3
  import shutil
4
  import os
@@ -20,8 +21,21 @@ class DocumentService:
20
  ):
21
  self.doc_processor = doc_processor
22
  self.mongodb = mongodb
23
- self.upload_dir = Path("temp_uploads")
24
- self.upload_dir.mkdir(exist_ok=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
  async def process_documents(
27
  self,
@@ -54,6 +68,14 @@ class DocumentService:
54
 
55
  for file in files:
56
  try:
 
 
 
 
 
 
 
 
57
  if not self._is_supported_format(file.filename):
58
  failed_files.append(self._create_failed_file_entry(
59
  file.filename,
@@ -86,30 +108,31 @@ class DocumentService:
86
  """Process a single file upload"""
87
  # Generate UUID for document
88
  document_id = str(uuid4())
89
- temp_path = self.upload_dir / f"{document_id}_{file.filename}"
 
 
90
 
91
  try:
92
- # Save file temporarily
93
- with open(temp_path, "wb") as buffer:
94
  shutil.copyfileobj(file.file, buffer)
95
 
96
- # Process the document to get content and metadata
97
- processed_doc = await self.doc_processor.process_document(temp_path)
98
- content = processed_doc['content']
99
 
100
- # First, store in MongoDB
101
  await self.mongodb.store_document(
102
  document_id=document_id,
103
  filename=file.filename,
104
- content=content,
105
  content_type=file.content_type,
106
- file_size=os.path.getsize(temp_path)
 
107
  )
108
 
109
- # Then process for vector store in background
110
  background_tasks.add_task(
111
  self._process_for_vector_store,
112
- processed_doc['chunks'], # Use the chunks from processed document
113
  vector_store,
114
  document_id,
115
  file.filename
@@ -121,18 +144,32 @@ class DocumentService:
121
  status="processing",
122
  document_info=DocumentInfo(
123
  original_filename=file.filename,
124
- size=os.path.getsize(temp_path),
125
- content_type=file.content_type
 
126
  )
127
  )
128
- finally:
129
- # Clean up temporary file
130
- if temp_path.exists():
131
- temp_path.unlink()
 
 
 
 
 
 
 
 
 
 
 
 
 
132
 
133
  async def _process_for_vector_store(
134
  self,
135
- chunks: List[str], # Now accepting pre-processed chunks
136
  vector_store: ChromaVectorStore,
137
  document_id: str,
138
  filename: str
@@ -147,7 +184,7 @@ class DocumentService:
147
 
148
  # Prepare metadata for each chunk
149
  metadatas = [{
150
- 'document_id': document_id, # MongoDB document ID
151
  'source_file': filename,
152
  'chunk_index': i,
153
  'total_chunks': len(chunks)
@@ -179,7 +216,29 @@ class DocumentService:
179
  "error": error
180
  }
181
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
182
  def cleanup(self):
183
- """Clean up upload directory"""
184
- if self.upload_dir.exists() and not any(self.upload_dir.iterdir()):
185
- self.upload_dir.rmdir()
 
1
  # src/implementations/document_service.py
2
+ from fastapi import HTTPException
3
  from pathlib import Path
4
  import shutil
5
  import os
 
21
  ):
22
  self.doc_processor = doc_processor
23
  self.mongodb = mongodb
24
+ self.permanent_dir = Path("uploads")
25
+ self.permanent_dir.mkdir(exist_ok=True)
26
+
27
+ async def check_duplicate_filename(self, filename: str) -> bool:
28
+ """
29
+ Check if a file with the same name exists
30
+
31
+ Args:
32
+ filename (str): Original filename to check
33
+
34
+ Returns:
35
+ bool: True if duplicate exists, False otherwise
36
+ """
37
+ documents = await self.mongodb.get_all_documents()
38
+ return any(doc.get('filename') == filename for doc in documents)
39
 
40
  async def process_documents(
41
  self,
 
68
 
69
  for file in files:
70
  try:
71
+ # Check for duplicate filename
72
+ if await self.check_duplicate_filename(file.filename):
73
+ failed_files.append(self._create_failed_file_entry(
74
+ file.filename,
75
+ "A document with this name already exists. Please upload another document."
76
+ ))
77
+ continue
78
+
79
  if not self._is_supported_format(file.filename):
80
  failed_files.append(self._create_failed_file_entry(
81
  file.filename,
 
108
  """Process a single file upload"""
109
  # Generate UUID for document
110
  document_id = str(uuid4())
111
+ filename = f"{document_id}_{file.filename}"
112
+ file_path = self.permanent_dir / filename
113
+ url_path = f"/docs/{filename}"
114
 
115
  try:
116
+ # Save file to permanent location
117
+ with open(file_path, "wb") as buffer:
118
  shutil.copyfileobj(file.file, buffer)
119
 
120
+ # Process the document for vector store
121
+ processed_doc = await self.doc_processor.process_document(file_path)
 
122
 
123
+ # Store in MongoDB with url_path
124
  await self.mongodb.store_document(
125
  document_id=document_id,
126
  filename=file.filename,
 
127
  content_type=file.content_type,
128
+ file_size=os.path.getsize(file_path),
129
+ url_path=url_path
130
  )
131
 
132
+ # Process for vector store in background
133
  background_tasks.add_task(
134
  self._process_for_vector_store,
135
+ processed_doc['chunks'],
136
  vector_store,
137
  document_id,
138
  file.filename
 
144
  status="processing",
145
  document_info=DocumentInfo(
146
  original_filename=file.filename,
147
+ size=os.path.getsize(file_path),
148
+ content_type=file.content_type,
149
+ url_path=url_path
150
  )
151
  )
152
+
153
+ except Exception as e:
154
+ # Clean up file if it was created
155
+ if file_path.exists():
156
+ try:
157
+ file_path.unlink()
158
+ except Exception as cleanup_error:
159
+ logger.error(f"Error cleaning up file {file_path}: {str(cleanup_error)}")
160
+
161
+ # Clean up from MongoDB if document was created
162
+ try:
163
+ await self.mongodb.delete_document(document_id)
164
+ except Exception as db_cleanup_error:
165
+ logger.error(f"Error cleaning up MongoDB document {document_id}: {str(db_cleanup_error)}")
166
+
167
+ logger.error(f"Error processing file {file.filename}: {str(e)}")
168
+ raise
169
 
170
  async def _process_for_vector_store(
171
  self,
172
+ chunks: List[str],
173
  vector_store: ChromaVectorStore,
174
  document_id: str,
175
  filename: str
 
184
 
185
  # Prepare metadata for each chunk
186
  metadatas = [{
187
+ 'document_id': document_id,
188
  'source_file': filename,
189
  'chunk_index': i,
190
  'total_chunks': len(chunks)
 
216
  "error": error
217
  }
218
 
219
+ async def delete_document(self, document_id: str) -> bool:
220
+ """Delete document from storage and MongoDB"""
221
+ try:
222
+ # Get document details from MongoDB
223
+ doc = await self.mongodb.get_document(document_id)
224
+ if doc:
225
+ # Get filename from url_path
226
+ filename = doc['url_path'].split('/')[-1]
227
+ file_path = self.permanent_dir / filename
228
+
229
+ # Delete physical file if it exists
230
+ if file_path.exists():
231
+ file_path.unlink()
232
+
233
+ # Delete from MongoDB
234
+ return await self.mongodb.delete_document(document_id)
235
+ return False
236
+
237
+ except Exception as e:
238
+ logger.error(f"Error deleting document: {str(e)}")
239
+ raise
240
+
241
  def cleanup(self):
242
+ """Clean up permanent directory if empty"""
243
+ if self.permanent_dir.exists() and not any(self.permanent_dir.iterdir()):
244
+ self.permanent_dir.rmdir()
src/main.py CHANGED
@@ -1,9 +1,12 @@
1
  # src/main.py
2
  from fastapi import FastAPI, UploadFile, File, HTTPException, BackgroundTasks
3
- from fastapi.responses import StreamingResponse
 
4
  from typing import List
5
  import uuid
6
  from datetime import datetime
 
 
7
 
8
  # Import custom modules
9
  from src.agents.rag_agent import RAGAgent
@@ -25,20 +28,82 @@ from src.models import (
25
  )
26
  from config.config import settings
27
 
28
- app = FastAPI(title="RAG Chatbot API")
29
 
30
  # Initialize MongoDB
31
  mongodb = MongoDBStore(settings.MONGODB_URI)
32
 
33
  # Initialize core components
34
- doc_processor = DocumentProcessor(
35
- chunk_size=1000,
36
- chunk_overlap=200,
37
- max_file_size=10 * 1024 * 1024
38
- )
39
  summarizer = ConversationSummarizer()
40
  document_service = DocumentService(doc_processor, mongodb)
41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  @app.post("/documents/upload", response_model=BatchUploadResponse)
43
  async def upload_documents(
44
  files: List[UploadFile] = File(...),
@@ -55,36 +120,7 @@ async def upload_documents(
55
  return response
56
  except Exception as e:
57
  logger.error(f"Error in document upload: {str(e)}")
58
- raise HTTPException(status_code=500, detail=str(e))
59
- finally:
60
- document_service.cleanup()
61
-
62
- @app.get("/documents", response_model=AllDocumentsResponse)
63
- async def get_all_documents(include_embeddings: bool = False):
64
- """
65
- Get all documents stored in the system
66
-
67
- Args:
68
- include_embeddings (bool): Whether to include embeddings in the response
69
- """
70
- try:
71
- vector_store, _ = await get_vector_store()
72
- documents = vector_store.get_all_documents(include_embeddings=include_embeddings)
73
-
74
- return AllDocumentsResponse(
75
- total_documents=len(documents),
76
- documents=[
77
- StoredDocument(
78
- id=doc['id'],
79
- text=doc['text'],
80
- embedding=doc.get('embedding'),
81
- metadata=doc.get('metadata')
82
- ) for doc in documents
83
- ]
84
- )
85
- except Exception as e:
86
- logger.error(f"Error retrieving documents: {str(e)}")
87
- raise HTTPException(status_code=500, detail=str(e))
88
 
89
  @app.get("/documentchunks/{document_id}")
90
  async def get_document_chunks(document_id: str):
@@ -104,6 +140,47 @@ async def get_document_chunks(document_id: str):
104
  except Exception as e:
105
  logger.error(f"Error retrieving document chunks: {str(e)}")
106
  raise HTTPException(status_code=500, detail=str(e))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
 
108
  @app.post("/chat", response_model=ChatResponse)
109
  async def chat_endpoint(
 
1
  # src/main.py
2
  from fastapi import FastAPI, UploadFile, File, HTTPException, BackgroundTasks
3
+ from fastapi.responses import StreamingResponse, FileResponse
4
+ from fastapi.staticfiles import StaticFiles
5
  from typing import List
6
  import uuid
7
  from datetime import datetime
8
+ from pathlib import Path
9
+ import os
10
 
11
  # Import custom modules
12
  from src.agents.rag_agent import RAGAgent
 
28
  )
29
  from config.config import settings
30
 
31
+ app = FastAPI(title="Chatbot API")
32
 
33
  # Initialize MongoDB
34
  mongodb = MongoDBStore(settings.MONGODB_URI)
35
 
36
  # Initialize core components
37
+ doc_processor = DocumentProcessor()
 
 
 
 
38
  summarizer = ConversationSummarizer()
39
  document_service = DocumentService(doc_processor, mongodb)
40
 
41
+ # Create uploads directory if it doesn't exist
42
+ UPLOADS_DIR = Path("uploads")
43
+ UPLOADS_DIR.mkdir(exist_ok=True)
44
+
45
+ # Mount the uploads directory for static file serving
46
+ app.mount("/docs", StaticFiles(directory=str(UPLOADS_DIR)), name="documents")
47
+
48
+ @app.get("/documents")
49
+ async def get_all_documents():
50
+ """Get all documents from MongoDB"""
51
+ try:
52
+ documents = await mongodb.get_all_documents()
53
+
54
+ formatted_documents = []
55
+ for doc in documents:
56
+ try:
57
+ formatted_doc = {
58
+ "document_id": doc.get("document_id"),
59
+ "filename": doc.get("filename"),
60
+ "content_type": doc.get("content_type"),
61
+ "file_size": doc.get("file_size"),
62
+ "url_path": doc.get("url_path"),
63
+ "upload_timestamp": doc.get("upload_timestamp")
64
+ }
65
+ formatted_documents.append(formatted_doc)
66
+ except Exception as e:
67
+ logger.error(f"Error formatting document {doc.get('document_id', 'unknown')}: {str(e)}")
68
+ continue
69
+
70
+ return {
71
+ "total_documents": len(formatted_documents),
72
+ "documents": formatted_documents
73
+ }
74
+ except Exception as e:
75
+ logger.error(f"Error retrieving documents: {str(e)}")
76
+ raise HTTPException(status_code=500, detail=str(e))
77
+
78
+ @app.get("/documents/{document_id}/download")
79
+ async def get_document_file(document_id: str):
80
+ """Serve a document file by its ID"""
81
+ try:
82
+ # Get document info from MongoDB
83
+ doc = await mongodb.get_document(document_id)
84
+ if not doc:
85
+ raise HTTPException(status_code=404, detail="Document not found")
86
+
87
+ # Extract filename from url_path
88
+ filename = doc["url_path"].split("/")[-1]
89
+ file_path = UPLOADS_DIR / filename
90
+
91
+ if not file_path.exists():
92
+ raise HTTPException(
93
+ status_code=404,
94
+ detail=f"File not found on server: {filename}"
95
+ )
96
+
97
+ return FileResponse(
98
+ path=str(file_path),
99
+ filename=doc["filename"],
100
+ media_type=doc["content_type"]
101
+ )
102
+
103
+ except Exception as e:
104
+ logger.error(f"Error serving document file: {str(e)}")
105
+ raise HTTPException(status_code=500, detail=str(e))
106
+
107
  @app.post("/documents/upload", response_model=BatchUploadResponse)
108
  async def upload_documents(
109
  files: List[UploadFile] = File(...),
 
120
  return response
121
  except Exception as e:
122
  logger.error(f"Error in document upload: {str(e)}")
123
+ raise HTTPException(status_code=500, detail=str(e))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
 
125
  @app.get("/documentchunks/{document_id}")
126
  async def get_document_chunks(document_id: str):
 
140
  except Exception as e:
141
  logger.error(f"Error retrieving document chunks: {str(e)}")
142
  raise HTTPException(status_code=500, detail=str(e))
143
+
144
+ @app.delete("/documents/{document_id}")
145
+ async def delete_document(document_id: str):
146
+ """Delete document from MongoDB, ChromaDB, and physical storage"""
147
+ try:
148
+ # First get document details from MongoDB to get file path
149
+ document = await mongodb.get_document(document_id)
150
+ if not document:
151
+ raise HTTPException(status_code=404, detail="Document not found")
152
+
153
+ # Get vector store instance
154
+ vector_store, _ = await get_vector_store()
155
+
156
+ # Delete physical file using document service
157
+ deletion_success = await document_service.delete_document(document_id)
158
+ if not deletion_success:
159
+ logger.warning(f"Failed to delete physical file for document {document_id}")
160
+
161
+ # Delete from vector store
162
+ try:
163
+ vector_store.delete_document(document_id)
164
+ except Exception as e:
165
+ logger.error(f"Error deleting document from vector store: {str(e)}")
166
+ raise HTTPException(
167
+ status_code=500,
168
+ detail=f"Failed to delete document from vector store: {str(e)}"
169
+ )
170
+
171
+ # Delete from MongoDB - don't check return value since document might already be deleted
172
+ await mongodb.delete_document(document_id)
173
+
174
+ return {
175
+ "status": "success",
176
+ "message": f"Document {document_id} successfully deleted from all stores"
177
+ }
178
+
179
+ except HTTPException:
180
+ raise
181
+ except Exception as e:
182
+ logger.error(f"Error in delete_document endpoint: {str(e)}")
183
+ raise HTTPException(status_code=500, detail=str(e))
184
 
185
  @app.post("/chat", response_model=ChatResponse)
186
  async def chat_endpoint(
src/models/__pycache__/document.cpython-312.pyc CHANGED
Binary files a/src/models/__pycache__/document.cpython-312.pyc and b/src/models/__pycache__/document.cpython-312.pyc differ
 
src/models/document.py CHANGED
@@ -1,4 +1,4 @@
1
- # src/models/document.py
2
  from pydantic import BaseModel
3
  from typing import Optional, List, Dict, Any
4
 
@@ -7,6 +7,7 @@ class DocumentInfo(BaseModel):
7
  original_filename: str
8
  size: int
9
  content_type: str
 
10
 
11
  class DocumentResponse(BaseModel):
12
  """Response model for document processing"""
 
1
+ # src/models/document.py
2
  from pydantic import BaseModel
3
  from typing import Optional, List, Dict, Any
4
 
 
7
  original_filename: str
8
  size: int
9
  content_type: str
10
+ url_path: str
11
 
12
  class DocumentResponse(BaseModel):
13
  """Response model for document processing"""
src/utils/__pycache__/logger.cpython-312.pyc CHANGED
Binary files a/src/utils/__pycache__/logger.cpython-312.pyc and b/src/utils/__pycache__/logger.cpython-312.pyc differ
 
src/utils/logger.py CHANGED
@@ -4,7 +4,7 @@ import sys
4
  from typing import Optional
5
 
6
  def setup_logger(
7
- name: str = "rag_chatbot",
8
  log_level: str = "INFO",
9
  log_file: Optional[str] = None
10
  ) -> logging.Logger:
 
4
  from typing import Optional
5
 
6
  def setup_logger(
7
+ name: str = "chatbot",
8
  log_level: str = "INFO",
9
  log_file: Optional[str] = None
10
  ) -> logging.Logger:
src/vectorstores/__pycache__/chroma_vectorstore.cpython-312.pyc CHANGED
Binary files a/src/vectorstores/__pycache__/chroma_vectorstore.cpython-312.pyc and b/src/vectorstores/__pycache__/chroma_vectorstore.cpython-312.pyc differ