TalatMasood committed on
Commit
415595f
·
1 Parent(s): f36ab64

Update chatbot with deployment configurations on Render

.gitignore ADDED
@@ -0,0 +1,49 @@
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ env/
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+
+ # Virtual Environment
+ venv/
+ ENV/
+
+ # Environment Variables
+ .env
+ .env.local
+ .env.*.local
+
+ # IDE
+ .idea/
+ .vscode/
+ *.swp
+ *.swo
+
+ # Logs
+ *.log
+
+ # Database
+ chroma_db/
+ uploads/
+ temp_downloads/
+
+ # OS
+ .DS_Store
+ Thumbs.db
DockerComposeConfiguration DELETED
@@ -1,33 +0,0 @@
- version: '3.8'
-
- services:
-   app:
-     build: .
-     ports:
-       - "8000:8000"
-     env_file:
-       - .env
-     volumes:
-       - ./:/app
-     depends_on:
-       - ollama
-
-   ollama:
-     image: ollama/ollama
-     ports:
-       - "11434:11434"
-     volumes:
-       - ollama-data:/root/.ollama
-
-   chroma:
-     image: chromadb/chroma
-     ports:
-       - "8000:8000"
-     volumes:
-       - chroma-data:/chroma
-     environment:
-       - PERSIST_DIRECTORY=/chroma
-
- volumes:
-   ollama-data:
-   chroma-data:
Dockerfile DELETED
@@ -1,25 +0,0 @@
- # Use an official Python runtime as a parent image
- FROM python:3.9-slim
-
- # Set the working directory in the container
- WORKDIR /app
-
- # Install system dependencies
- RUN apt-get update && apt-get install -y \
-     build-essential \
-     && rm -rf /var/lib/apt/lists/*
-
- # Copy the current directory contents into the container at /app
- COPY . /app
-
- # Install any needed packages specified in requirements.txt
- RUN pip install --no-cache-dir -r requirements.txt
-
- # Make port 8000 available to the world outside this container
- EXPOSE 8000
-
- # Define environment variable
- ENV NAME RAGChatbot
-
- # Run the application
- CMD ["uvicorn", "src.main:app", "--host", "0.0.0.0", "--port", "8000"]
config/__pycache__/config.cpython-312.pyc CHANGED
Binary files a/config/__pycache__/config.cpython-312.pyc and b/config/__pycache__/config.cpython-312.pyc differ
 
config/config.py CHANGED
@@ -7,32 +7,33 @@ from google_auth_oauthlib.flow import Flow
7
  # Load environment variables
8
  load_dotenv()
9
 
 
10
  class Settings:
11
  # OpenAI Configuration
12
  OPENAI_API_KEY = os.getenv('OPENAI_API_KEY', '')
13
  OPENAI_MODEL = os.getenv('OPENAI_MODEL', 'gpt-3.5-turbo')
14
-
15
  ADMIN_API_KEY = 'aca4081f-6ff2-434c-843b-98f60285c499'
16
 
17
  # Ollama Configuration
18
  OLLAMA_BASE_URL = os.getenv('OLLAMA_BASE_URL', 'http://localhost:11434')
19
  OLLAMA_MODEL = os.getenv('OLLAMA_MODEL', 'llama2')
20
-
21
  # Anthropic Configuration
22
  ANTHROPIC_API_KEY = os.getenv('ANTHROPIC_API_KEY', '')
23
-
24
  # Embedding Configuration
25
  EMBEDDING_MODEL = os.getenv('EMBEDDING_MODEL', 'all-MiniLM-L6-v2')
26
-
27
  # Vector Store Configuration
28
  CHROMA_PATH = os.getenv('CHROMA_PATH', './chroma_db')
29
-
30
  # MongoDB Configuration
31
  MONGODB_URI = os.getenv('MONGODB_URI', 'mongodb://localhost:27017')
32
-
33
  # Feedback Configuration
34
  MAX_RATING = int(os.getenv('MAX_RATING', '5'))
35
-
36
  # Temporary directory for downloaded files
37
  TEMP_DOWNLOAD_DIR = os.getenv('TEMP_DOWNLOAD_DIR', './temp_downloads')
38
 
@@ -40,27 +41,44 @@ class Settings:
40
  DEBUG = os.getenv('DEBUG', 'False') == 'True'
41
 
42
  # Google Drive Configuration
43
- GOOGLE_DRIVE_FOLDER_ID=os.getenv('GOOGLE_DRIVE_FOLDER_ID', '')
44
- GOOGLE_SERVICE_ACCOUNT_PATH = os.getenv('GOOGLE_SERVICE_ACCOUNT_PATH', 'service_account.json')
45
-
46
- # GOOGLE_DRIVE_FOLDER_ID = os.getenv('GOOGLE_DRIVE_FOLDER_ID', '')
47
- # GOOGLE_OAUTH_CLIENT_ID = os.getenv('GOOGLE_OAUTH_CLIENT_ID', '')
48
- # GOOGLE_OAUTH_CLIENT_SECRET = os.getenv('GOOGLE_OAUTH_CLIENT_SECRET', '')
49
- # GOOGLE_OAUTH_REDIRECT_URI = os.getenv('GOOGLE_OAUTH_REDIRECT_URI', 'http://127.0.0.1:8000/google/oauth2callback')
50
-
51
- # @property
52
- # def google_oauth_flow(self):
53
- # flow = Flow.from_client_config({
54
- # "web": {
55
- # "client_id": self.GOOGLE_OAUTH_CLIENT_ID,
56
- # "client_secret": self.GOOGLE_OAUTH_CLIENT_SECRET,
57
- # "auth_uri": "https://accounts.google.com/o/oauth2/auth",
58
- # "token_uri": "https://oauth2.googleapis.com/token",
59
- # "redirect_uris": [self.GOOGLE_OAUTH_REDIRECT_URI],
60
- # "javascript_origins": ["http://localhost:8000", "http://127.0.0.1:8000"]
61
- # }
62
- # }, scopes=['https://www.googleapis.com/auth/drive.readonly'])
63
- # flow.redirect_uri = self.GOOGLE_OAUTH_REDIRECT_URI
64
- # return flow
65
-
66
- settings = Settings()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  # Load environment variables
8
  load_dotenv()
9
 
10
+
11
  class Settings:
12
  # OpenAI Configuration
13
  OPENAI_API_KEY = os.getenv('OPENAI_API_KEY', '')
14
  OPENAI_MODEL = os.getenv('OPENAI_MODEL', 'gpt-3.5-turbo')
15
+
16
  ADMIN_API_KEY = 'aca4081f-6ff2-434c-843b-98f60285c499'
17
 
18
  # Ollama Configuration
19
  OLLAMA_BASE_URL = os.getenv('OLLAMA_BASE_URL', 'http://localhost:11434')
20
  OLLAMA_MODEL = os.getenv('OLLAMA_MODEL', 'llama2')
21
+
22
  # Anthropic Configuration
23
  ANTHROPIC_API_KEY = os.getenv('ANTHROPIC_API_KEY', '')
24
+
25
  # Embedding Configuration
26
  EMBEDDING_MODEL = os.getenv('EMBEDDING_MODEL', 'all-MiniLM-L6-v2')
27
+
28
  # Vector Store Configuration
29
  CHROMA_PATH = os.getenv('CHROMA_PATH', './chroma_db')
30
+
31
  # MongoDB Configuration
32
  MONGODB_URI = os.getenv('MONGODB_URI', 'mongodb://localhost:27017')
33
+
34
  # Feedback Configuration
35
  MAX_RATING = int(os.getenv('MAX_RATING', '5'))
36
+
37
  # Temporary directory for downloaded files
38
  TEMP_DOWNLOAD_DIR = os.getenv('TEMP_DOWNLOAD_DIR', './temp_downloads')
39
 
 
41
  DEBUG = os.getenv('DEBUG', 'False') == 'True'
42
 
43
  # Google Drive Configuration
44
+ GOOGLE_DRIVE_FOLDER_ID = os.getenv('GOOGLE_DRIVE_FOLDER_ID', '')
45
+ GOOGLE_SERVICE_ACCOUNT_PATH = os.getenv(
46
+ 'GOOGLE_SERVICE_ACCOUNT_PATH', 'service_account.json')
47
+
48
+ # Use explicit type conversion to ensure correct types
49
+ DOCUMENT_PROCESSOR = {
50
+ 'chunk_size': int(os.getenv('DOCUMENT_CHUNK_SIZE', '1000')),
51
+ 'chunk_overlap': int(os.getenv('DOCUMENT_CHUNK_OVERLAP', '200')),
52
+ # 20MB in bytes
53
+ 'max_file_size': int(os.getenv('DOCUMENT_MAX_FILE_SIZE', str(20 * 1024 * 1024))),
54
+ 'supported_formats': [
55
+ '.txt', '.pdf', '.docx', '.csv', '.json',
56
+ '.html', '.md', '.xml', '.rtf', '.xlsx', '.xls'
57
+ ]
58
+ }
59
+
60
+ @classmethod
61
+ def get_document_processor_settings(cls) -> dict:
62
+ """
63
+ Get document processor settings with validation
64
+
65
+ Returns:
66
+ dict: Validated document processor settings
67
+ """
68
+ settings = cls.DOCUMENT_PROCESSOR.copy()
69
+
70
+ # Ensure positive values for numeric settings
71
+ settings['chunk_size'] = max(
72
+ 100, settings['chunk_size']) # Minimum 100
73
+ settings['chunk_overlap'] = min(
74
+ settings['chunk_overlap'],
75
+ # Ensure overlap is less than chunk size
76
+ settings['chunk_size'] - 50
77
+ )
78
+ settings['max_file_size'] = max(
79
+ 1024 * 1024, settings['max_file_size']) # Minimum 1MB
80
+
81
+ return settings
82
+
83
+
84
+ settings = Settings()
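The new DOCUMENT_PROCESSOR block and the get_document_processor_settings() classmethod clamp whatever the environment provides before the values reach the document pipeline. A minimal usage sketch of that validation follows; it assumes the module imports as config.config (matching the config/config.py path above), and the environment values are invented for illustration.

import os

# Hypothetical overrides, set before the module is imported so os.getenv picks them up
os.environ["DOCUMENT_CHUNK_SIZE"] = "80"      # below the enforced minimum of 100
os.environ["DOCUMENT_CHUNK_OVERLAP"] = "500"  # larger than the chunk size

from config.config import Settings

cfg = Settings.get_document_processor_settings()
assert cfg["chunk_size"] == 100               # raised to the 100 floor
assert cfg["chunk_overlap"] == 50             # capped at chunk_size - 50
assert cfg["max_file_size"] >= 1024 * 1024    # never below 1 MB

Keeping the clamping inside Settings means a mistyped Render environment variable degrades to safe defaults instead of breaking chunking at request time.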
render.yaml ADDED
@@ -0,0 +1,25 @@
+ services:
+   - type: web
+     name: chatbot-backend
+     env: python
+     region: ohio # Choose appropriate region
+     plan: starter # Or choose appropriate plan
+     buildCommand: pip install -r requirements.txt
+     startCommand: uvicorn src.main:app --host 0.0.0.0 --port $PORT
+     envVars:
+       - key: MONGODB_URI
+         sync: false
+       - key: OPENAI_API_KEY
+         sync: false
+       - key: ANTHROPIC_API_KEY
+         sync: false
+       - key: ADMIN_API_KEY
+         sync: false
+       - key: CHROMA_PATH
+         value: ./chroma_db
+       - key: DEBUG
+         value: "False"
+       - key: ENVIRONMENT
+         value: "production"
+     healthCheckPath: /health
+     autoDeploy: true
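The healthCheckPath entry tells Render to probe /health during deploys and only route traffic once the service responds successfully. src/main.py is not part of this commit, so the following is only a sketch of the kind of route that setting assumes, given that the startCommand serves a FastAPI app via uvicorn src.main:app.

# Hypothetical /health route for the app referenced by `uvicorn src.main:app`;
# the handler name and payload are illustrative, not taken from this commit.
from fastapi import FastAPI

app = FastAPI()

@app.get("/health")
async def health() -> dict:
    # A plain HTTP 200 response is enough for Render's health check to pass.
    return {"status": "ok"}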
requirements.txt CHANGED
@@ -1,23 +1,38 @@
- # Requirements for RAG Chatbot
- fastapi==0.109.0
- uvicorn==0.24.0
- pydantic==2.6.1
- python-dotenv==1.0.0
-
- # LLM Providers
- openai==1.12.0
- anthropic==0.18.0
- ollama==0.1.6
-
- # Embedding and Vector Store
- sentence-transformers==2.3.1
- chromadb==0.4.22
- huggingface_hub==0.20.3
-
- # Optional: Additional dependencies
- numpy==1.26.3
- torch==2.1.2
-
- PyPDF2==3.0.1
- python-docx==1.0.1
- requests==2.31.0
+ fastapi
+ uvicorn
+ torch
+ transformers
+ openai
+ anthropic
+ sentence-transformers
+ accelerate
+ bitsandbytes
+ pydantic
+ email-validator
+ numpy
+ pandas
+ scipy
+ scikit-learn
+ pymongo
+ motor
+ chromadb
+ aiosqlite
+ python-dotenv
+ box
+ PyPDF2
+ python-docx
+ python-magic-bin==0.4.14
+ openpyxl
+ xlrd
+ striprtf
+ beautifulsoup4
+ pydrive2==1.14.0
+ google-auth-oauthlib==0.4.6
+ requests
+ tqdm
+ matplotlib
+ plotly
+ tiktoken
+ psutil
+ huggingface_hub
+ setuptools
runtime.txt ADDED
@@ -0,0 +1 @@
+ python-3.12
setup.py DELETED
@@ -1,53 +0,0 @@
- from setuptools import setup, find_packages
-
- setup(
-     name="chatbot",
-     version="1.0.0",
-     packages=find_packages(),
-     install_requires=[
-         # Web Framework
-         "fastapi",
-         "uvicorn",
-
-         # AI/ML
-         "torch",
-         "transformers",
-         "sentence-transformers",
-         "huggingface_hub",
-
-         # LLM Providers
-         "openai",
-         "anthropic",
-         "ollama",
-
-         # Data Validation & Processing
-         "pydantic",
-         "email-validator",
-         "numpy",
-         "pandas",
-
-         # Database & Storage
-         "pymongo",
-         "motor",
-         "chromadb",
-         "aiosqlite",
-
-         # Document Processing
-         "PyPDF2",
-         "python-docx",
-         "python-magic-bin==0.4.14",
-         "openpyxl",
-         "xlrd",
-         "striprtf",
-         "beautifulsoup4",
-
-         # Utilities
-         "python-dotenv",
-         "requests",
-         "tiktoken",
-         "psutil",
-
-         # Google Integration
-         "google-auth-oauthlib==0.4.6"
-     ]
- )
src/__pycache__/main.cpython-312.pyc CHANGED
Binary files a/src/__pycache__/main.cpython-312.pyc and b/src/__pycache__/main.cpython-312.pyc differ
 
src/agents/__pycache__/enhanced_context_manager.cpython-312.pyc CHANGED
Binary files a/src/agents/__pycache__/enhanced_context_manager.cpython-312.pyc and b/src/agents/__pycache__/enhanced_context_manager.cpython-312.pyc differ
 
src/agents/__pycache__/rag_agent.cpython-312.pyc CHANGED
Binary files a/src/agents/__pycache__/rag_agent.cpython-312.pyc and b/src/agents/__pycache__/rag_agent.cpython-312.pyc differ
 
src/agents/__pycache__/rag_agent_manager.cpython-312.pyc CHANGED
Binary files a/src/agents/__pycache__/rag_agent_manager.cpython-312.pyc and b/src/agents/__pycache__/rag_agent_manager.cpython-312.pyc differ
 
src/agents/__pycache__/system_instructions_rag.cpython-312.pyc CHANGED
Binary files a/src/agents/__pycache__/system_instructions_rag.cpython-312.pyc and b/src/agents/__pycache__/system_instructions_rag.cpython-312.pyc differ
 
src/agents/enhanced_context_manager.py DELETED
@@ -1,202 +0,0 @@
1
- from typing import List, Dict, Optional, Tuple
2
- import spacy
3
- from collections import defaultdict
4
-
5
- class EnhancedContextManager:
6
- def __init__(self):
7
- """Initialize the context manager with NLP components"""
8
- # Load spaCy model for NER and dependency parsing
9
- self.nlp = spacy.load("en_core_web_sm")
10
- # Track entities and their mentions across conversation
11
- self.entity_mentions = defaultdict(list)
12
- # Track conversation turns
13
- self.conversation_turns = []
14
- # Track last processed entity
15
- self.last_entity = None
16
- # Track last full response context
17
- self.last_full_context = None
18
-
19
- def process_turn(self, query: str, response: str) -> None:
20
- """Process a conversation turn to extract and track entities"""
21
- # Parse query and response
22
- query_doc = self.nlp(query)
23
- response_doc = self.nlp(response)
24
-
25
- # Extract and track entities from both query and response
26
- turn_entities = self._extract_entities(query_doc, response_doc)
27
-
28
- # Store the turn with its entities
29
- self.conversation_turns.append({
30
- 'query': query,
31
- 'response': response,
32
- 'entities': turn_entities
33
- })
34
-
35
- # Update entity mentions
36
- for entity, info in turn_entities.items():
37
- self.entity_mentions[entity].append({
38
- 'turn_index': len(self.conversation_turns) - 1,
39
- 'info': info
40
- })
41
-
42
- # Update last entity and full context
43
- if turn_entities:
44
- # Prioritize entities in response, then query
45
- primary_entity = (
46
- list(turn_entities.keys())[0] if turn_entities
47
- else None
48
- )
49
- self.last_entity = primary_entity
50
-
51
- # Store full context for potential reference
52
- self.last_full_context = f"{query} {response}"
53
-
54
- def _extract_entities(self, query_doc, response_doc) -> Dict:
55
- """Extract named entities and their properties"""
56
- entities = {}
57
-
58
- # Process both query and response documents
59
- for doc in [query_doc, response_doc]:
60
- for ent in doc.ents:
61
- # Store entity with its type and text
62
- entities[ent.text] = {
63
- 'type': ent.label_,
64
- 'text': ent.text,
65
- 'mentions': [tok.text for tok in doc if tok.head == ent.root]
66
- }
67
-
68
- return entities
69
-
70
- def resolve_pronouns(self, current_query: str) -> Optional[str]:
71
- """
72
- Resolve pronouns in the current query based on conversation history
73
-
74
- Args:
75
- current_query (str): Current query with potential pronouns
76
-
77
- Returns:
78
- Optional[str]: Query with resolved pronouns, or None if no resolution needed
79
- """
80
- if not self.conversation_turns:
81
- return None
82
-
83
- query_doc = self.nlp(current_query)
84
-
85
- # Find pronouns in current query
86
- pronouns = [token for token in query_doc if token.pos_ == "PRON"]
87
- if not pronouns:
88
- return None
89
-
90
- for pronoun in pronouns:
91
- replacement = self._find_antecedent(pronoun.text)
92
- if replacement:
93
- # Replace the pronoun with the most likely antecedent
94
- new_query = current_query.replace(pronoun.text, replacement)
95
- return new_query
96
-
97
- return None
98
-
99
- def _find_antecedent(self, pronoun: str) -> Optional[str]:
100
- """
101
- Find the most recent matching entity for a pronoun
102
-
103
- Args:
104
- pronoun (str): Pronoun to resolve
105
-
106
- Returns:
107
- Optional[str]: Resolved entity or None
108
- """
109
- # Pronoun to gender/number mapping
110
- pronoun_properties = {
111
- 'he': {'gender': 'male', 'number': 'singular'},
112
- 'she': {'gender': 'female', 'number': 'singular'},
113
- 'they': {'gender': None, 'number': 'plural'},
114
- 'his': {'gender': 'male', 'number': 'singular'},
115
- 'her': {'gender': 'female', 'number': 'singular'},
116
- 'their': {'gender': None, 'number': 'plural'}
117
- }
118
-
119
- # Normalize pronoun
120
- pronoun_lower = pronoun.lower().rstrip('s')
121
-
122
- # If not a known pronoun, return None
123
- if pronoun_lower not in pronoun_properties:
124
- return None
125
-
126
- # If a named entity was recently mentioned, use it first
127
- if self.last_entity:
128
- return self.last_entity
129
-
130
- # Fallback to last full context if no specific entity found
131
- if self.last_full_context:
132
- return self.last_full_context.split()[0]
133
-
134
- return None
135
-
136
- def enhance_query(self, current_query: str) -> str:
137
- """
138
- Enhance current query with context and resolved pronouns
139
-
140
- Args:
141
- current_query (str): Original query
142
-
143
- Returns:
144
- str: Enhanced query with additional context
145
- """
146
- # First try to resolve pronouns
147
- resolved_query = self.resolve_pronouns(current_query)
148
-
149
- # If pronouns are resolved, use the resolved query
150
- if resolved_query:
151
- return resolved_query
152
-
153
- # Get relevant context
154
- context = self._get_relevant_context(current_query)
155
-
156
- # If context found, prepend it to the query
157
- if context:
158
- return f"{context} {current_query}"
159
-
160
- # If no context resolution, return original query
161
- return current_query
162
-
163
- def _get_relevant_context(self, query: str) -> Optional[str]:
164
- """
165
- Get relevant context from conversation history
166
-
167
- Args:
168
- query (str): Current query
169
-
170
- Returns:
171
- Optional[str]: Relevant context or None
172
- """
173
- if not self.conversation_turns:
174
- return None
175
-
176
- # Get the most recent turn
177
- recent_turn = self.conversation_turns[-1]
178
-
179
- # If the current query contains a pronoun and we have last full context
180
- if any(token.pos_ == "PRON" for token in self.nlp(query)):
181
- return self.last_full_context
182
-
183
- return None
184
-
185
- def get_conversation_context(self) -> List[Dict]:
186
- """Get processed conversation context"""
187
- return self.conversation_turns
188
-
189
- def record_last_context(self, last_context: Optional[str] = None) -> None:
190
- """
191
- Manually record last context if needed
192
-
193
- Args:
194
- last_context (Optional[str]): Last context to manually set
195
- """
196
- if last_context:
197
- self.last_full_context = last_context
198
- # Try to extract an entity from the context
199
- doc = self.nlp(last_context)
200
- entities = [ent.text for ent in doc.ents]
201
- if entities:
202
- self.last_entity = entities[0]
src/agents/rag_agent.py CHANGED
@@ -1,8 +1,8 @@
1
- from typing import List, Dict, Optional, Tuple
 
2
  import uuid
3
 
4
  from .excel_aware_rag import ExcelAwareRAGAgent
5
- from .enhanced_context_manager import EnhancedContextManager
6
  from ..llms.base_llm import BaseLLM
7
  from src.embeddings.base_embedding import BaseEmbedding
8
  from src.vectorstores.base_vectorstore import BaseVectorStore
@@ -11,6 +11,7 @@ from src.db.mongodb_store import MongoDBStore
11
  from src.models.rag import RAGResponse
12
  from src.utils.logger import logger
13
 
 
14
  class RAGAgent(ExcelAwareRAGAgent):
15
  def __init__(
16
  self,
@@ -21,7 +22,17 @@ class RAGAgent(ExcelAwareRAGAgent):
21
  max_history_tokens: int = 4000,
22
  max_history_messages: int = 10
23
  ):
24
- """Initialize RAG Agent with enhanced context management"""
 
 
 
 
 
 
 
 
 
 
25
  super().__init__() # Initialize ExcelAwareRAGAgent
26
  self.llm = llm
27
  self.embedding = embedding
@@ -31,9 +42,6 @@ class RAGAgent(ExcelAwareRAGAgent):
31
  max_tokens=max_history_tokens,
32
  max_messages=max_history_messages
33
  )
34
- # Add enhanced context management while preserving existing functionality
35
- self.context_manager = EnhancedContextManager()
36
- logger.info("RAGAgent initialized with enhanced context management")
37
 
38
  async def generate_response(
39
  self,
@@ -41,46 +49,19 @@ class RAGAgent(ExcelAwareRAGAgent):
41
  conversation_id: Optional[str],
42
  temperature: float,
43
  max_tokens: Optional[int] = None,
44
- context_docs: Optional[List[str]] = None,
45
- stream: bool = False,
46
- custom_roles: Optional[List[Dict[str, str]]] = None
47
  ) -> RAGResponse:
48
- """
49
- Generate a response with comprehensive context and role management
50
-
51
- Args:
52
- query (str): User query
53
- conversation_id (Optional[str]): Conversation identifier
54
- temperature (float): LLM temperature for response generation
55
- max_tokens (Optional[int]): Maximum tokens for response
56
- context_docs (Optional[List[str]]): Pre-retrieved context documents
57
- stream (bool): Whether to stream the response
58
- custom_roles (Optional[List[Dict[str, str]]]): Custom role instructions
59
-
60
- Returns:
61
- RAGResponse: Generated response with context and metadata
62
- """
63
  try:
64
- logger.info(f"Generating response for query: {query}")
65
-
66
- # Apply custom roles if provided
67
- if custom_roles:
68
- for role in custom_roles:
69
- # Modify query or context based on role
70
- if role.get('name') == 'introduction_specialist':
71
- query += " Provide a concise, welcoming response."
72
- elif role.get('name') == 'knowledge_based_specialist':
73
- query += " Ensure response is precise and directly from available knowledge."
74
-
75
- # Introduction Handling
76
  is_introduction = (
77
- "wants support" in query and
78
- "This is Introduction" in query and
79
  ("A new user with name:" in query or "An old user with name:" in query)
80
  )
81
 
82
  if is_introduction:
83
- logger.info("Processing introduction message")
84
  welcome_message = self._handle_contact_query(query)
85
  return RAGResponse(
86
  response=welcome_message,
@@ -89,118 +70,70 @@ class RAGAgent(ExcelAwareRAGAgent):
89
  scores=None
90
  )
91
 
92
- # Conversation History Processing
93
  history = []
94
- last_context = None
95
  if conversation_id:
96
- logger.info(f"Retrieving conversation history for ID: {conversation_id}")
97
  history = await self.mongodb.get_recent_messages(
98
  conversation_id,
99
  limit=self.conversation_manager.max_messages
100
  )
101
-
102
- # Process history for conversation manager
103
  history = self.conversation_manager.get_relevant_history(
104
  messages=history,
105
  current_query=query
106
  )
107
-
108
- # Process in enhanced context manager
109
- for msg in history:
110
- self.context_manager.process_turn(
111
- msg.get('query', ''),
112
- msg.get('response', '')
113
- )
114
-
115
- # Get last context if available
116
- if history and history[-1].get('response'):
117
- last_context = history[-1]['response']
118
-
119
- # Query Enhancement
120
- enhanced_query = self.context_manager.enhance_query(query)
121
-
122
- # Manual Pronoun Handling Fallback
123
- if enhanced_query == query:
124
- pronoun_map = {
125
- 'his': 'he',
126
- 'her': 'she',
127
- 'their': 'they'
128
- }
129
- words = query.lower().split()
130
- for pronoun, replacement in pronoun_map.items():
131
- if pronoun in words:
132
- # Try to use last context
133
- if last_context:
134
- self.context_manager.record_last_context(last_context)
135
- enhanced_query = self.context_manager.enhance_query(query)
136
- break
137
-
138
- logger.info(f"Enhanced query: {enhanced_query}")
139
-
140
- # Context Retrieval
141
  if not context_docs:
142
- logger.info("Retrieving context for enhanced query")
143
  context_docs, sources, scores = await self.retrieve_context(
144
- enhanced_query,
145
  conversation_history=history
146
  )
147
  else:
148
- sources = []
149
  scores = None
150
 
151
- # Context Fallback Mechanism
152
  if not context_docs:
153
- # If no context and last context exists, use it
154
- if last_context:
155
- context_docs = [last_context]
156
- sources = [{"source": "previous_context"}]
157
- scores = [1.0]
158
- else:
159
- logger.info("No relevant context found")
160
- return RAGResponse(
161
- response="Information about this is not available, do you want to inquire about something else?",
162
- context_docs=[],
163
- sources=[],
164
- scores=None
165
- )
166
 
167
- # Excel-specific Content Handling
168
  has_excel_content = any('Sheet:' in doc for doc in context_docs)
169
  if has_excel_content:
170
- logger.info("Processing Excel-specific content")
171
  try:
172
- context_docs = self._process_excel_context(context_docs, enhanced_query)
 
173
  except Exception as e:
174
  logger.warning(f"Error processing Excel context: {str(e)}")
175
 
176
- # Prompt Generation with Conversation History
177
- prompt = self.conversation_manager.generate_prompt_with_history(
178
- current_query=enhanced_query,
179
  history=history,
180
  context_docs=context_docs
181
  )
182
 
183
- # Streaming Response Generation
184
- if stream:
185
- # TODO: Implement actual streaming logic
186
- # This is a placeholder and needs proper implementation
187
- logger.warning("Streaming not fully implemented")
188
-
189
- # Standard Response Generation
190
  response = self.llm.generate(
191
- prompt=prompt,
192
  temperature=temperature,
193
  max_tokens=max_tokens
194
  )
195
 
196
- # Response Cleaning
197
  cleaned_response = self._clean_response(response)
198
-
199
- # Excel Response Enhancement
200
  if has_excel_content:
201
  try:
202
  enhanced_response = await self.enhance_excel_response(
203
- query=enhanced_query,
204
  response=cleaned_response,
205
  context_docs=context_docs
206
  )
@@ -209,158 +142,122 @@ class RAGAgent(ExcelAwareRAGAgent):
209
  except Exception as e:
210
  logger.warning(f"Error enhancing Excel response: {str(e)}")
211
 
212
- # Context Tracking
213
- self.context_manager.process_turn(query, cleaned_response)
214
-
215
- # Metadata Generation
216
- metadata = {
217
- 'llm_provider': getattr(self.llm, 'model_name', 'unknown'),
218
- 'temperature': temperature,
219
- 'conversation_id': conversation_id,
220
- 'context_sources': sources,
221
- 'has_excel_content': has_excel_content
222
- }
223
-
224
- logger.info("Successfully generated response")
225
  return RAGResponse(
226
  response=cleaned_response,
227
  context_docs=context_docs,
228
  sources=sources,
229
- scores=scores,
230
- metadata=metadata # Added metadata
231
  )
232
 
233
  except Exception as e:
234
- logger.error(f"Error in generate_response: {str(e)}")
235
  raise
236
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
237
  async def retrieve_context(
238
  self,
239
  query: str,
240
  conversation_history: Optional[List[Dict]] = None,
241
  top_k: int = 3
242
  ) -> Tuple[List[str], List[Dict], Optional[List[float]]]:
243
- """Retrieve context with both original and enhanced handling"""
244
- try:
245
- logger.info(f"Retrieving context for query: {query}")
246
-
247
- # Enhance query using both managers
248
- if conversation_history:
249
- # Get the last two messages for immediate context
250
- recent_messages = conversation_history[-2:]
251
-
252
- # Extract queries and responses for context
253
- context_parts = []
254
- for msg in recent_messages:
255
- if msg.get('query'):
256
- context_parts.append(msg['query'])
257
- if msg.get('response'):
258
- response = msg['response']
259
- if "Information about this is not available" not in response:
260
- context_parts.append(response)
261
-
262
- # Combine with current query
263
- enhanced_query = f"{' '.join(context_parts)} {query}".strip()
264
- logger.info(f"Enhanced query with history: {enhanced_query}")
265
- else:
266
- enhanced_query = query
267
-
268
- # Debug log the enhanced query
269
- logger.info(f"Final enhanced query: {enhanced_query}")
270
-
271
- # Embed the enhanced query
272
- query_embedding = self.embedding.embed_query(enhanced_query)
273
-
274
- # Debug log embedding shape
275
- logger.info(f"Query embedding shape: {len(query_embedding)}")
276
-
277
- # Retrieve similar documents
278
- results = self.vector_store.similarity_search(
279
- query_embedding,
280
- top_k=top_k
281
- )
282
 
283
- # Debug log search results
284
- logger.info(f"Number of search results: {len(results)}")
285
- for i, result in enumerate(results):
286
- logger.info(f"Result {i} score: {result.get('score', 'N/A')}")
287
- logger.info(f"Result {i} text preview: {result.get('text', '')[:100]}...")
288
-
289
- if not results:
290
- logger.info("No results found in similarity search")
291
- return [], [], None
292
-
293
- # Process results
294
- documents = [doc['text'] for doc in results]
295
- sources = [self._convert_metadata_to_strings(doc['metadata'])
296
- for doc in results]
297
- scores = [doc['score'] for doc in results
298
- if doc.get('score') is not None]
299
-
300
- # Return scores only if available for all documents
301
- if len(scores) != len(documents):
302
- scores = None
303
 
304
- logger.info(f"Retrieved {len(documents)} relevant documents")
305
- return documents, sources, scores
 
 
 
 
306
 
307
- except Exception as e:
308
- logger.error(f"Error in retrieve_context: {str(e)}")
309
- raise
310
 
311
- def _clean_response(self, response: str) -> str:
312
- """Clean response text while preserving key information"""
313
- if not response:
314
- return response
315
-
316
- # Keep only the most common phrases to remove
317
- phrases_to_remove = [
318
- "Based on the context,",
319
- "According to the documents,",
320
- "From the information available,",
321
- "Based on the provided information,",
322
- "I apologize,"
323
- ]
324
-
325
- cleaned_response = response
326
- for phrase in phrases_to_remove:
327
- cleaned_response = cleaned_response.replace(phrase, "").strip()
328
-
329
- cleaned_response = " ".join(cleaned_response.split())
330
-
331
- if not cleaned_response:
332
- return response
333
-
334
- if cleaned_response[0].islower():
335
- cleaned_response = cleaned_response[0].upper() + cleaned_response[1:]
336
-
337
- return cleaned_response
338
 
339
  def _convert_metadata_to_strings(self, metadata: Dict) -> Dict:
340
- """Convert metadata values to strings"""
341
- try:
342
- return {
343
- key: str(value) if isinstance(value, (int, float)) else value
344
- for key, value in metadata.items()
345
- }
346
- except Exception as e:
347
- logger.error(f"Error converting metadata: {str(e)}")
348
- return metadata
349
-
350
- def _handle_contact_query(self, query: str) -> str:
351
- """Handle contact/introduction queries"""
352
- try:
353
- name_start = query.find('name: "') + 7
354
- name_end = query.find('"', name_start)
355
- name = query[name_start:name_end] if name_start > 6 and name_end != -1 else "there"
356
-
357
- is_returning = (
358
- "An old user with name:" in query and
359
- "wants support again" in query
360
- )
361
-
362
- return f"Welcome back {name}, How can I help you?" if is_returning else f"Welcome {name}, How can I help you?"
363
-
364
- except Exception as e:
365
- logger.error(f"Error handling contact query: {str(e)}")
366
- return "Welcome, How can I help you?"
 
1
+ # src/agents/rag_agent.py
2
+ from typing import List, Optional, Tuple, Dict
3
  import uuid
4
 
5
  from .excel_aware_rag import ExcelAwareRAGAgent
 
6
  from ..llms.base_llm import BaseLLM
7
  from src.embeddings.base_embedding import BaseEmbedding
8
  from src.vectorstores.base_vectorstore import BaseVectorStore
 
11
  from src.models.rag import RAGResponse
12
  from src.utils.logger import logger
13
 
14
+
15
  class RAGAgent(ExcelAwareRAGAgent):
16
  def __init__(
17
  self,
 
22
  max_history_tokens: int = 4000,
23
  max_history_messages: int = 10
24
  ):
25
+ """
26
+ Initialize RAG Agent
27
+
28
+ Args:
29
+ llm (BaseLLM): Language model instance
30
+ embedding (BaseEmbedding): Embedding model instance
31
+ vector_store (BaseVectorStore): Vector store instance
32
+ mongodb (MongoDBStore): MongoDB store instance
33
+ max_history_tokens (int): Maximum tokens in conversation history
34
+ max_history_messages (int): Maximum messages to keep in history
35
+ """
36
  super().__init__() # Initialize ExcelAwareRAGAgent
37
  self.llm = llm
38
  self.embedding = embedding
 
42
  max_tokens=max_history_tokens,
43
  max_messages=max_history_messages
44
  )
 
 
 
45
 
46
  async def generate_response(
47
  self,
 
49
  conversation_id: Optional[str],
50
  temperature: float,
51
  max_tokens: Optional[int] = None,
52
+ context_docs: Optional[List[str]] = None
 
 
53
  ) -> RAGResponse:
54
+ """Generate response with specific handling for different query types"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  try:
56
+ # First, check if this is an introduction/welcome message query
 
 
 
 
 
 
 
 
 
 
 
57
  is_introduction = (
58
+ "wants support" in query and
59
+ "This is Introduction" in query and
60
  ("A new user with name:" in query or "An old user with name:" in query)
61
  )
62
 
63
  if is_introduction:
64
+ # Handle introduction message - no context needed
65
  welcome_message = self._handle_contact_query(query)
66
  return RAGResponse(
67
  response=welcome_message,
 
70
  scores=None
71
  )
72
 
73
+ # Get conversation history if conversation_id exists
74
  history = []
 
75
  if conversation_id:
 
76
  history = await self.mongodb.get_recent_messages(
77
  conversation_id,
78
  limit=self.conversation_manager.max_messages
79
  )
80
+
81
+ # Get relevant history within token limits
82
  history = self.conversation_manager.get_relevant_history(
83
  messages=history,
84
  current_query=query
85
  )
86
+
87
+ # Retrieve context if not provided
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
  if not context_docs:
 
89
  context_docs, sources, scores = await self.retrieve_context(
90
+ query=query,
91
  conversation_history=history
92
  )
93
  else:
94
+ sources = None
95
  scores = None
96
 
97
+ # Check if we have any relevant context
98
  if not context_docs:
99
+ return RAGResponse(
100
+ response="Information about this is not available, do you want to inquire about something else?",
101
+ context_docs=[],
102
+ sources=[],
103
+ scores=None
104
+ )
 
 
 
 
 
 
 
105
 
106
+ # Check if this is an Excel-related query
107
  has_excel_content = any('Sheet:' in doc for doc in context_docs)
108
  if has_excel_content:
 
109
  try:
110
+ context_docs = self._process_excel_context(
111
+ context_docs, query)
112
  except Exception as e:
113
  logger.warning(f"Error processing Excel context: {str(e)}")
114
 
115
+ # Generate prompt with context and history
116
+ augmented_prompt = self.conversation_manager.generate_prompt_with_history(
117
+ current_query=query,
118
  history=history,
119
  context_docs=context_docs
120
  )
121
 
122
+ # Generate initial response
 
 
 
 
 
 
123
  response = self.llm.generate(
124
+ prompt=augmented_prompt,
125
  temperature=temperature,
126
  max_tokens=max_tokens
127
  )
128
 
129
+ # Clean the response
130
  cleaned_response = self._clean_response(response)
131
+
132
+ # For Excel queries, enhance the response
133
  if has_excel_content:
134
  try:
135
  enhanced_response = await self.enhance_excel_response(
136
+ query=query,
137
  response=cleaned_response,
138
  context_docs=context_docs
139
  )
 
142
  except Exception as e:
143
  logger.warning(f"Error enhancing Excel response: {str(e)}")
144
 
145
+ # Return the final response
 
 
 
 
 
 
 
 
 
 
 
 
146
  return RAGResponse(
147
  response=cleaned_response,
148
  context_docs=context_docs,
149
  sources=sources,
150
+ scores=scores
 
151
  )
152
 
153
  except Exception as e:
154
+ logger.error(f"Error in RAGAgent.generate_response: {str(e)}")
155
  raise
156
 
157
+ def _create_response_prompt(self, query: str, context_docs: List[str]) -> str:
158
+ """
159
+ Create prompt for generating response from context
160
+
161
+ Args:
162
+ query (str): User query
163
+ context_docs (List[str]): Retrieved context documents
164
+
165
+ Returns:
166
+ str: Formatted prompt for the LLM
167
+ """
168
+ if not context_docs:
169
+ return f"Query: {query}\nResponse: Information about this is not available, do you want to inquire about something else?"
170
+
171
+ # Format context documents
172
+ formatted_context = "\n\n".join(
173
+ f"Context {i+1}:\n{doc.strip()}"
174
+ for i, doc in enumerate(context_docs)
175
+ if doc and doc.strip()
176
+ )
177
+
178
+ # Build the prompt with detailed instructions
179
+ prompt = f"""You are a knowledgeable assistant. Use the following context to answer the query accurately and informatively.
180
+
181
+ Context Information:
182
+ {formatted_context}
183
+
184
+ Query: {query}
185
+
186
+ Instructions:
187
+ 1. Base your response ONLY on the information provided in the context above
188
+ 2. If the context contains numbers, statistics, or specific details, include them in your response
189
+ 3. Keep your response focused and relevant to the query
190
+ 4. Use clear and professional language
191
+ 5. If the context includes technical terms, explain them appropriately
192
+ 6. Do not make assumptions or add information not present in the context
193
+ 7. If specific sections of a report are mentioned, maintain their original structure
194
+ 8. Format the response in a clear, readable manner
195
+ 9. If the context includes chronological information, maintain the proper sequence
196
+
197
+ Response:"""
198
+
199
+ return prompt
200
+
201
  async def retrieve_context(
202
  self,
203
  query: str,
204
  conversation_history: Optional[List[Dict]] = None,
205
  top_k: int = 3
206
  ) -> Tuple[List[str], List[Dict], Optional[List[float]]]:
207
+ """
208
+ Retrieve context with conversation history enhancement
209
+ """
210
+ # Enhance query with conversation history
211
+ if conversation_history:
212
+ recent_queries = [
213
+ msg['query'] for msg in conversation_history[-2:]
214
+ if msg.get('query')
215
+ ]
216
+ enhanced_query = " ".join([*recent_queries, query])
217
+ else:
218
+ enhanced_query = query
219
+
220
+ # Debug log the enhanced query
221
+ logger.info(f"Enhanced query: {enhanced_query}")
222
+
223
+ # Embed the enhanced query
224
+ query_embedding = self.embedding.embed_query(enhanced_query)
225
+
226
+ # Debug log embedding shape
227
+ logger.info(f"Query embedding shape: {len(query_embedding)}")
228
+
229
+ # Retrieve similar documents
230
+ results = self.vector_store.similarity_search(
231
+ query_embedding,
232
+ top_k=top_k
233
+ )
 
 
 
 
 
 
 
 
 
 
 
 
234
 
235
+ # Debug log search results
236
+ logger.info(f"Number of search results: {len(results)}")
237
+ for i, result in enumerate(results):
238
+ logger.info(f"Result {i} score: {result.get('score', 'N/A')}")
239
+ logger.info(
240
+ f"Result {i} text preview: {result.get('text', '')[:100]}...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
241
 
242
+ # Process results
243
+ documents = [doc['text'] for doc in results]
244
+ sources = [self._convert_metadata_to_strings(doc['metadata'])
245
+ for doc in results]
246
+ scores = [doc['score'] for doc in results
247
+ if doc.get('score') is not None]
248
 
249
+ # Return scores only if available for all documents
250
+ if len(scores) != len(documents):
251
+ scores = None
252
 
253
+ return documents, sources, scores
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
254
 
255
  def _convert_metadata_to_strings(self, metadata: Dict) -> Dict:
256
+ """Convert numeric metadata values to strings"""
257
+ converted = {}
258
+ for key, value in metadata.items():
259
+ if isinstance(value, (int, float)):
260
+ converted[key] = str(value)
261
+ else:
262
+ converted[key] = value
263
+ return converted
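With the enhanced context manager removed, RAGAgent is now a plain retrieve-then-generate flow. Below is a minimal wiring sketch against the signature shown in this diff; the llm, embedding, vector_store, and mongodb arguments are placeholders for the concrete BaseLLM, BaseEmbedding, BaseVectorStore, and MongoDBStore implementations defined elsewhere in the repo.

# Hypothetical usage of RAGAgent as defined in this diff; concrete provider
# classes and their constructor arguments are not part of this commit.
from src.agents.rag_agent import RAGAgent

async def answer(llm, embedding, vector_store, mongodb, question: str) -> str:
    agent = RAGAgent(
        llm=llm,
        embedding=embedding,
        vector_store=vector_store,
        mongodb=mongodb,
        max_history_tokens=4000,
        max_history_messages=10,
    )
    result = await agent.generate_response(
        query=question,
        conversation_id=None,   # one-off query, so no MongoDB history is loaded
        temperature=0.7,
        max_tokens=512,
        context_docs=None,      # let retrieve_context pull from the vector store
    )
    return result.response      # RAGResponse also carries context_docs, sources, scores

Run it with asyncio.run(answer(...)) once concrete instances exist; with conversation_id=None the MongoDB history lookup is skipped entirely.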
src/agents/rag_agent_manager.py DELETED
@@ -1,77 +0,0 @@
1
- # src/agents/rag_agent_manager.py
2
- from typing import Optional
3
- import weakref
4
-
5
- from src.agents.system_instructions_rag import SystemInstructionsRAGAgent
6
- from src.llms.base_llm import BaseLLM
7
- from src.embeddings.base_embedding import BaseEmbedding
8
- from src.vectorstores.base_vectorstore import BaseVectorStore
9
- from src.db.mongodb_store import MongoDBStore
10
- from src.utils.logger import logger
11
-
12
- class RAGAgentManager:
13
- """
14
- Singleton manager for RAG Agent instances with intelligent caching
15
- """
16
- _instance = None
17
-
18
- def __new__(cls):
19
- if not cls._instance:
20
- cls._instance = super().__new__(cls)
21
- return cls._instance
22
-
23
- def __init__(self):
24
- # Ensure this is only initialized once
25
- if not hasattr(self, '_initialized'):
26
- self._rag_agent = None
27
- self._initialized = True
28
-
29
- def get_rag_agent(
30
- self,
31
- llm: BaseLLM,
32
- embedding_model: BaseEmbedding,
33
- vector_store: BaseVectorStore,
34
- mongodb: MongoDBStore
35
- ) -> SystemInstructionsRAGAgent:
36
- """
37
- Get or create a singleton RAG agent instance with intelligent caching
38
-
39
- Args:
40
- llm: Language Model instance
41
- embedding_model: Embedding model instance
42
- vector_store: Vector store instance
43
- mongodb: MongoDB store instance
44
-
45
- Returns:
46
- SystemInstructionsRAGAgent: Singleton instance of the RAG agent
47
- """
48
- # If RAG agent exists and all dependencies are the same, return it
49
- if self._rag_agent is not None:
50
- logger.info("Reusing existing RAG agent instance")
51
- return self._rag_agent
52
-
53
- try:
54
- logger.info("Creating new RAG agent instance")
55
- # Create the agent
56
- self._rag_agent = SystemInstructionsRAGAgent(
57
- llm=llm,
58
- embedding=embedding_model,
59
- vector_store=vector_store,
60
- mongodb=mongodb
61
- )
62
-
63
- return self._rag_agent
64
-
65
- except Exception as e:
66
- logger.error(f"Error creating RAG agent: {str(e)}")
67
- raise
68
-
69
- def reset_rag_agent(self):
70
- """
71
- Reset the RAG agent instance
72
- """
73
- logger.info("Resetting RAG agent instance")
74
- self._rag_agent = None
75
-
76
- # Create a global instance for easy import
77
- rag_agent_manager = RAGAgentManager()
src/agents/system_instructions_rag.py CHANGED
@@ -1,34 +1,12 @@
1
  # src/agents/system_instructions_rag.py
2
- from typing import List, Dict, Optional, Tuple
3
- import spacy
4
- from src.agents.rag_agent import RAGAgent
5
- from src.llms.base_llm import BaseLLM
6
- from src.embeddings.base_embedding import BaseEmbedding
7
- from src.vectorstores.base_vectorstore import BaseVectorStore
8
- from src.db.mongodb_store import MongoDBStore
9
- from src.models.rag import RAGResponse
10
  from src.utils.logger import logger
 
 
11
 
12
  class SystemInstructionsRAGAgent(RAGAgent):
13
- def __init__(
14
- self,
15
- llm: BaseLLM,
16
- embedding: BaseEmbedding,
17
- vector_store: BaseVectorStore,
18
- mongodb: MongoDBStore,
19
- max_history_tokens: int = 4000,
20
- max_history_messages: int = 10
21
- ):
22
- """Initialize SystemInstructionsRAGAgent with enhanced context management"""
23
- super().__init__(
24
- llm=llm,
25
- embedding=embedding,
26
- vector_store=vector_store,
27
- mongodb=mongodb,
28
- max_history_tokens=max_history_tokens,
29
- max_history_messages=max_history_messages
30
- )
31
- self.nlp = spacy.load("en_core_web_sm")
32
 
33
  async def generate_response(
34
  self,
@@ -36,18 +14,19 @@ class SystemInstructionsRAGAgent(RAGAgent):
36
  conversation_id: Optional[str] = None,
37
  temperature: float = 0.7,
38
  max_tokens: Optional[int] = None,
39
- context_docs: Optional[List[str]] = None,
40
- stream: bool = False
41
  ) -> RAGResponse:
42
- """Generate response with guaranteed context handling"""
43
  try:
44
- logger.info(f"Processing query: {query}")
45
-
46
- # Store original context if provided
47
- original_context = context_docs
 
 
48
 
49
- # Handle introduction queries
50
- if self._is_introduction_query(query):
51
  welcome_message = self._handle_contact_query(query)
52
  return RAGResponse(
53
  response=welcome_message,
@@ -56,282 +35,200 @@ class SystemInstructionsRAGAgent(RAGAgent):
56
  scores=None
57
  )
58
 
59
- # Get and process conversation history
60
- history = []
61
  if conversation_id:
62
- history = await self.mongodb.get_recent_messages(
63
- conversation_id,
64
- limit=self.conversation_manager.max_messages
65
- )
66
-
67
- # Process history in context manager
68
- for msg in history:
69
- if msg.get('query') and msg.get('response'):
70
- self.context_manager.process_turn(msg['query'], msg['response'])
71
-
72
- # Initialize context tracking
73
- current_context = None
74
- sources = []
75
- scores = None
76
-
77
- # Multi-stage context retrieval
78
- if original_context:
79
- current_context = original_context
80
- else:
81
- # Try with original query first
82
- current_context, sources, scores = await self.retrieve_context(
83
  query,
84
- conversation_history=history
85
  )
86
 
87
- # If no context, try with enhanced query
88
- if not current_context:
89
- enhanced_query = self.context_manager.enhance_query(query)
90
- if enhanced_query != query:
91
- current_context, sources, scores = await self.retrieve_context(
92
- enhanced_query,
93
- conversation_history=history
94
- )
95
-
96
- # If still no context, try history fallback
97
- if not current_context:
98
- current_context, sources = self._get_context_from_history(history)
99
-
100
- logger.info(f"Retrieved {len(current_context) if current_context else 0} context documents")
101
-
102
- # Check context relevance
103
- has_relevant_context = self._check_context_relevance(query, current_context or [])
104
- logger.info(f"Context relevance check result: {has_relevant_context}")
105
-
106
- # Handle no context case
107
  if not has_relevant_context:
108
- return self._create_no_info_response()
 
 
 
 
 
 
 
 
 
 
 
 
109
 
110
- # Generate response
111
- prompt = self._create_response_prompt(query, current_context)
112
  response_text = self.llm.generate(
113
- prompt=prompt,
114
  temperature=temperature,
115
  max_tokens=max_tokens
116
  )
117
 
118
- # Process and validate response
119
  cleaned_response = self._clean_response(response_text)
120
  if self._is_no_info_response(cleaned_response):
121
- return self._create_no_info_response()
122
-
123
- # Update context tracking
124
- self.context_manager.process_turn(query, cleaned_response)
125
-
126
- # For Excel content, enhance the response
127
- if any('Sheet:' in doc for doc in (current_context or [])):
128
- try:
129
- cleaned_response = await self.enhance_excel_response(
130
- query=query,
131
- response=cleaned_response,
132
- context_docs=current_context
133
- )
134
- except Exception as e:
135
- logger.warning(f"Error enhancing Excel response: {str(e)}")
136
 
137
  return RAGResponse(
138
  response=cleaned_response,
139
- context_docs=current_context,
140
  sources=sources,
141
  scores=scores
142
  )
143
 
144
  except Exception as e:
145
- logger.error(f"Error in generate_response: {str(e)}")
146
  raise
147
 
148
- def _convert_metadata_to_strings(self, metadata: Dict) -> Dict:
149
- """Convert all metadata values to strings"""
150
- return {
151
- key: str(value) if value is not None else None
152
- for key, value in metadata.items()
153
- }
154
-
155
- async def retrieve_context(
156
  self,
157
  query: str,
 
158
  conversation_history: Optional[List[Dict]] = None
159
- ) -> Tuple[List[str], List[Dict], Optional[List[float]]]:
160
- """Enhanced context retrieval with proper metadata type handling"""
161
- try:
162
- logger.info(f"Processing query for context retrieval: {query}")
163
-
164
- collection_data = self.vector_store.collection.get()
165
-
166
- if not collection_data or 'documents' not in collection_data:
167
- logger.warning("No documents found in ChromaDB")
168
- return [], [], None
169
-
170
- documents = collection_data['documents']
171
- metadatas = collection_data.get('metadatas', [])
172
-
173
- # Clean and enhance query with date variations
174
- clean_query = query.lower().strip()
175
-
176
- # Extract and enhance date information
177
- import re
178
- from datetime import datetime
179
-
180
- date_pattern = r'(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]* \d{1,2},? \d{4}'
181
- dates = re.findall(date_pattern, clean_query.lower())
182
-
183
- enhanced_query = clean_query
184
- target_date = None
185
-
186
- if dates:
187
- try:
188
- date_obj = datetime.strptime(dates[0], '%b %d, %Y')
189
- target_date = date_obj.strftime('%b %d, %Y')
190
-
191
- date_variations = [
192
- date_obj.strftime('%B %d, %Y'),
193
- date_obj.strftime('%d/%m/%Y'),
194
- date_obj.strftime('%Y-%m-%d'),
195
- target_date
196
- ]
197
-
198
- enhanced_query = f"{clean_query} {' '.join(date_variations)}"
199
-
200
- except ValueError as e:
201
- logger.warning(f"Error parsing date: {str(e)}")
202
-
203
- # First try exact date matching
204
- exact_matches = []
205
- exact_metadata = []
206
-
207
- if target_date:
208
- for i, doc in enumerate(documents):
209
- if target_date in doc:
210
- logger.info(f"Found exact date match in document {i}")
211
- exact_matches.append(doc)
212
- if metadatas:
213
- # Convert metadata values to strings
214
- exact_metadata.append(self._convert_metadata_to_strings(metadatas[i]))
215
-
216
- if exact_matches:
217
- logger.info(f"Found {len(exact_matches)} exact date matches")
218
- document_id = exact_metadata[0].get('document_id') if exact_metadata else None
219
-
220
- if document_id:
221
- all_related_chunks = []
222
- all_related_metadata = []
223
- all_related_scores = []
224
-
225
- for i, doc in enumerate(documents):
226
- if metadatas[i].get('document_id') == document_id:
227
- all_related_chunks.append(doc)
228
- # Convert metadata values to strings
229
- all_related_metadata.append(self._convert_metadata_to_strings(metadatas[i]))
230
- all_related_scores.append(1.0)
231
-
232
- # Sort chunks by their index
233
- sorted_results = sorted(
234
- zip(all_related_chunks, all_related_metadata, all_related_scores),
235
- key=lambda x: int(x[1].get('chunk_index', '0')) # Convert to int for sorting
236
- )
237
-
238
- sorted_chunks, sorted_metadata, sorted_scores = zip(*sorted_results)
239
-
240
- logger.info(f"Returning {len(sorted_chunks)} chunks from document {document_id}")
241
- return list(sorted_chunks), list(sorted_metadata), list(sorted_scores)
242
-
243
- # If no exact matches, use enhanced query for embedding search
244
- logger.info("No exact matches found, using enhanced query for embedding search")
245
- query_embedding = self.embedding.embed_query(enhanced_query)
246
-
247
- results = self.vector_store.similarity_search(
248
- query_embedding,
249
- top_k=5
250
- )
251
-
252
- if not results:
253
- logger.warning("No results found in similarity search")
254
- return [], [], None
255
-
256
- context_docs = []
257
- sources = []
258
- scores = []
259
-
260
- sorted_results = sorted(results, key=lambda x: x.get('score', 0), reverse=True)
261
-
262
- for result in sorted_results:
263
- score = result.get('score', 0)
264
- if score > 0.3:
265
- context_docs.append(result.get('text', ''))
266
- # Convert metadata values to strings
267
- sources.append(self._convert_metadata_to_strings(result.get('metadata', {})))
268
- scores.append(score)
269
-
270
- if context_docs:
271
- logger.info(f"Returning {len(context_docs)} documents from similarity search")
272
- return context_docs, sources, scores
273
-
274
- logger.warning("No relevant documents found")
275
- return [], [], None
276
-
277
- except Exception as e:
278
- logger.error(f"Error in retrieve_context: {str(e)}")
279
- logger.exception("Full traceback:")
280
- return [], [], None
281
-
282
- def _is_introduction_query(self, query: str) -> bool:
283
- """Check if query is an introduction message"""
284
- return (
285
- "wants support" in query and
286
- "This is Introduction" in query and
287
- ("A new user with name:" in query or "An old user with name:" in query)
288
- )
289
-
290
- def _get_context_from_history(
291
- self,
292
- history: List[Dict]
293
- ) -> Tuple[Optional[List[str]], Optional[List[Dict]]]:
294
- """Extract context from conversation history"""
295
- for msg in reversed(history):
296
- if msg.get('context') and not self._is_no_info_response(msg.get('response', '')):
297
- return msg['context'], msg.get('sources', [])
298
- return None, None
299
-
300
- def _create_response_prompt(self, query: str, context_docs: List[str]) -> str:
301
- """Create prompt for response generation"""
302
  formatted_context = '\n\n'.join(
303
  f"Context {i+1}:\n{doc.strip()}"
304
  for i, doc in enumerate(context_docs)
305
  if doc and doc.strip()
306
  )
307
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
308
  return f"""
309
- Use ONLY the following context to provide information about: {query}
310
 
 
311
  {formatted_context}
 
312
 
313
  Instructions:
314
- 1. Use ONLY information present in the context above
315
- 2. If the information is found in the context, provide a direct and concise response
316
  3. Do not make assumptions or add information not present in the context
317
  4. Ensure the response is clear and complete based on available information
318
- 5. If you cannot find relevant information about the specific query in the context,
319
  respond exactly with: "Information about this is not available, do you want to inquire about something else?"
320
 
321
  Query: {query}
322
  Response:"""
323
 
324
- def _create_no_info_response(self) -> RAGResponse:
325
- """Create standard response for no information case"""
326
- return RAGResponse(
327
- response="Information about this is not available, do you want to inquire about something else?",
328
- context_docs=[],
329
- sources=[],
330
- scores=None
331
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
332
 
333
  def _clean_response(self, response: str) -> str:
334
- """Clean response text"""
335
  if not response:
336
  return response
337
 
@@ -351,6 +248,7 @@ Response:"""
351
  "Here's what I found:",
352
  "Here's the information you requested:",
353
  "According to the provided information,",
 
354
  "The information suggests that",
355
  "From what I can see,",
356
  "Let me explain",
@@ -359,85 +257,26 @@ Response:"""
359
  "I can see that",
360
  "Sure,",
361
  "Well,",
 
 
 
 
 
 
362
  "I apologize,"
363
  ]
364
-
365
  cleaned_response = response
366
  for phrase in phrases_to_remove:
367
  cleaned_response = cleaned_response.replace(phrase, "").strip()
368
-
369
  cleaned_response = " ".join(cleaned_response.split())
370
-
371
  if not cleaned_response:
372
  return response
373
-
374
- if cleaned_response[0].islower():
375
- cleaned_response = cleaned_response[0].upper() + cleaned_response[1:]
376
-
377
- return cleaned_response
378
-
379
- def _is_no_info_response(self, response: str) -> bool:
380
- """Check if response indicates no information available"""
381
- no_info_indicators = [
382
- "i do not have",
383
- "i don't have",
384
- "no information",
385
- "not available",
386
- "could not find",
387
- "couldn't find",
388
- "cannot find",
389
- "don't know",
390
- "do not know",
391
- "unable to find",
392
- "no data",
393
- "no relevant"
394
- ]
395
- response_lower = response.lower()
396
- return any(indicator in response_lower for indicator in no_info_indicators)
397
-
398
- def _check_context_relevance(self, query: str, context_docs: List[str]) -> bool:
399
- """Enhanced context relevance checking"""
400
- if not context_docs:
401
- return False
402
-
403
- # Clean and prepare query
404
- clean_query = query.lower().strip()
405
- query_terms = set(word for word in clean_query.split()
406
- if word not in {'tell', 'me', 'about', 'what', 'is', 'the'})
407
-
408
- for doc in context_docs:
409
- if not doc:
410
- continue
411
-
412
- doc_lower = doc.lower()
413
-
414
- # For CSV-like content, check each line
415
- lines = doc_lower.split('\n')
416
- for line in lines:
417
- # Check if any query term appears in the line
418
- if any(term in line for term in query_terms):
419
- return True
420
-
421
- # Also check the whole document for good measure
422
- if any(term in doc_lower for term in query_terms):
423
- return True
424
-
425
- return False
426
-
427
- def _handle_contact_query(self, query: str) -> str:
428
- """Handle contact/introduction queries"""
429
- try:
430
- name_start = query.find('name: "') + 7
431
- name_end = query.find('"', name_start)
432
- name = query[name_start:name_end] if name_start > 6 and name_end != -1 else "there"
433
 
434
- is_returning = (
435
- "An old user with name:" in query and
436
- "wants support again" in query
437
- )
438
-
439
- return f"Welcome back {name}, How can I help you?" if is_returning else f"Welcome {name}, How can I help you?"
440
 
441
- except Exception as e:
442
- logger.error(f"Error handling contact query: {str(e)}")
443
- return "Welcome, How can I help you?"
 
1
  # src/agents/system_instructions_rag.py
2
+ from typing import List, Dict, Optional
3
+ from src.agents.rag_agent import RAGResponse
 
 
 
 
 
 
4
  from src.utils.logger import logger
5
+ from src.agents.rag_agent import RAGAgent
6
+
7
 
8
  class SystemInstructionsRAGAgent(RAGAgent):
9
+ """RAG Agent with enhanced system instructions for specific use cases"""
 
 
 
 
10
 
11
  async def generate_response(
12
  self,
 
14
  conversation_id: Optional[str] = None,
15
  temperature: float = 0.7,
16
  max_tokens: Optional[int] = None,
17
+ context_docs: Optional[List[str]] = None
 
18
  ) -> RAGResponse:
19
+ """Generate response with specific handling for introduction and no-context cases"""
20
  try:
21
+ # First, check if this is an introduction/welcome message query
22
+ is_introduction = (
23
+ "wants support" in query and
24
+ "This is Introduction" in query and
25
+ ("A new user with name:" in query or "An old user with name:" in query)
26
+ )
27
 
28
+ if is_introduction:
29
+ # Handle introduction message - no context needed
30
  welcome_message = self._handle_contact_query(query)
31
  return RAGResponse(
32
  response=welcome_message,
 
35
  scores=None
36
  )
37
 
38
+ # Get conversation history if conversation_id exists
39
+ conversation_history = []
40
  if conversation_id:
41
+ try:
42
+ conversation_history = await self.mongodb.get_recent_messages(
43
+ conversation_id,
44
+ limit=self.conversation_manager.max_messages
45
+ )
46
+
47
+ # Get relevant history within token limits
48
+ conversation_history = self.conversation_manager.get_relevant_history(
49
+ messages=conversation_history,
50
+ current_query=query
51
+ )
52
+ except Exception as e:
53
+ logger.warning(
54
+ f"Error fetching conversation history: {str(e)}")
55
+
56
+ # For all other queries, proceed with context-based response
57
+ if not context_docs:
58
+ context_docs, sources, scores = await self.retrieve_context(
 
 
 
59
  query,
60
+ conversation_history=conversation_history
61
  )
62
 
63
+ # Check if we have relevant context
64
+ has_relevant_context = self._check_context_relevance(
65
+ query, context_docs or []
66
+ )
67
+
68
+ # If no relevant context found, return the standard message
 
 
 
 
69
  if not has_relevant_context:
70
+ return RAGResponse(
71
+ response="Information about this is not available, do you want to inquire about something else?",
72
+ context_docs=[],
73
+ sources=[],
74
+ scores=None
75
+ )
76
+
77
+ # Generate response using context and conversation history
78
+ prompt = self._create_response_prompt(
79
+ query=query,
80
+ context_docs=context_docs,
81
+ conversation_history=conversation_history
82
+ )
83
 
 
 
84
  response_text = self.llm.generate(
85
+ prompt,
86
  temperature=temperature,
87
  max_tokens=max_tokens
88
  )
89
 
90
+ # Check if the generated response indicates no information
91
  cleaned_response = self._clean_response(response_text)
92
  if self._is_no_info_response(cleaned_response):
93
+ return RAGResponse(
94
+ response="Information about this is not available, do you want to inquire about something else?",
95
+ context_docs=[],
96
+ sources=[],
97
+ scores=None
98
+ )
 
 
 
 
 
 
 
 
 
99
 
100
  return RAGResponse(
101
  response=cleaned_response,
102
+ context_docs=context_docs,
103
  sources=sources,
104
  scores=scores
105
  )
106
 
107
  except Exception as e:
108
+ logger.error(f"Error in SystemInstructionsRAGAgent: {str(e)}")
109
  raise
110
 
111
+ def _create_response_prompt(
 
 
 
 
 
 
 
112
  self,
113
  query: str,
114
+ context_docs: List[str],
115
  conversation_history: Optional[List[Dict]] = None
116
+ ) -> str:
117
+ """Create prompt for generating response from context and conversation history"""
118
+ # Format context documents
 
 
 
 
119
  formatted_context = '\n\n'.join(
120
  f"Context {i+1}:\n{doc.strip()}"
121
  for i, doc in enumerate(context_docs)
122
  if doc and doc.strip()
123
  )
124
 
125
+ # Format conversation history if available
126
+ history_context = ""
127
+ if conversation_history:
128
+ history_messages = []
129
+ # Use last 3 messages for context
130
+ for msg in conversation_history[-3:]:
131
+ role = msg.get('role', 'unknown')
132
+ content = msg.get('content', '')
133
+ history_messages.append(f"{role.capitalize()}: {content}")
134
+
135
+ if history_messages:
136
+ history_context = "\nPrevious Conversation:\n" + \
137
+ "\n".join(history_messages)
138
+
139
  return f"""
140
+ Use the following context and conversation history to provide information about: {query}
141
 
142
+ Context Information:
143
  {formatted_context}
144
+ {history_context}
145
 
146
  Instructions:
147
+ 1. Use information from both the context and conversation history
148
+ 2. If the information is found, provide a direct and concise response
149
  3. Do not make assumptions or add information not present in the context
150
  4. Ensure the response is clear and complete based on available information
151
+ 5. If you cannot find relevant information about the specific query,
152
  respond exactly with: "Information about this is not available, do you want to inquire about something else?"
153
 
154
  Query: {query}
155
  Response:"""
156
 
157
+ def _is_no_info_response(self, response: str) -> bool:
158
+ """Check if the response indicates no information available"""
159
+ no_info_indicators = [
160
+ "i do not have",
161
+ "i don't have",
162
+ "no information",
163
+ "not available",
164
+ "could not find",
165
+ "couldn't find",
166
+ "cannot find"
167
+ ]
168
+ response_lower = response.lower()
169
+ return any(indicator in response_lower for indicator in no_info_indicators)
170
+
171
+ def _check_context_relevance(self, query: str, context_docs: List[str]) -> bool:
172
+ """Check if context contains information relevant to the query"""
173
+ if not context_docs:
174
+ return False
175
+
176
+ # Extract key terms from query (keeping important words)
177
+ query_words = query.lower().split()
178
+ stop_words = {'me', 'a', 'about', 'what', 'is',
179
+ 'are', 'the', 'in', 'how', 'why', 'when', 'where'}
180
+
181
+ # Remove only basic stop words, keep important terms like "report", "share", etc.
182
+ query_terms = {word for word in query_words if word not in stop_words}
183
+
184
+ # Add additional relevant terms that might appear in the content
185
+ related_terms = {
186
+ 'comprehensive',
187
+ 'report',
188
+ 'overview',
189
+ 'summary',
190
+ 'details',
191
+ 'information'
192
+ }
193
+ query_terms.update(
194
+ word for word in query_words if word in related_terms)
195
+
196
+ # Check each context document for relevance
197
+ for doc in context_docs:
198
+ if not doc:
199
+ continue
200
+ doc_lower = doc.lower()
201
+
202
+ # Consider document relevant if it contains any query terms
203
+ # or if it starts with common report headers
204
+ if any(term in doc_lower for term in query_terms) or \
205
+ any(header in doc_lower for header in ['overview', 'comprehensive report', 'summary']):
206
+ return True
207
+
208
+ return False
209
+
210
+ def _handle_contact_query(self, query: str) -> str:
211
+ """Handle queries from /user/contact endpoint"""
212
+ try:
213
+ name_start = query.find('name: "') + 7
214
+ name_end = query.find('"', name_start)
215
+ name = query[name_start:name_end] if name_start > 6 and name_end != -1 else "there"
216
+
217
+ is_returning = (
218
+ "An old user with name:" in query and
219
+ "wants support again" in query
220
+ )
221
+
222
+ if is_returning:
223
+ return f"Welcome back {name}, How can I help you?"
224
+ return f"Welcome {name}, How can I help you?"
225
+
226
+ except Exception as e:
227
+ logger.error(f"Error handling contact query: {str(e)}")
228
+ return "Welcome, How can I help you?"
229
 
230
  def _clean_response(self, response: str) -> str:
231
+ """Clean response by removing unwanted phrases"""
232
  if not response:
233
  return response
234
 
 
248
  "Here's what I found:",
249
  "Here's the information you requested:",
250
  "According to the provided information,",
251
+ "Based on the documents,",
252
  "The information suggests that",
253
  "From what I can see,",
254
  "Let me explain",
 
257
  "I can see that",
258
  "Sure,",
259
  "Well,",
260
+ "Based on the given context,",
261
+ "The available information shows that",
262
+ "From the context provided,",
263
+ "The documentation mentions that",
264
+ "According to the context,",
265
+ "As shown in the context,",
266
  "I apologize,"
267
  ]
268
+
269
  cleaned_response = response
270
  for phrase in phrases_to_remove:
271
  cleaned_response = cleaned_response.replace(phrase, "").strip()
272
+
273
  cleaned_response = " ".join(cleaned_response.split())
274
+
275
  if not cleaned_response:
276
  return response
 
 
 
 
277
 
278
+ if cleaned_response[0].islower():
279
+ cleaned_response = cleaned_response[0].upper(
280
+ ) + cleaned_response[1:]
 
 
 
281
 
282
+ return cleaned_response
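
Reviewer note: a minimal sketch of exercising the rewritten SystemInstructionsRAGAgent outside the API follows. It assumes the same wiring that src/main.py uses later in this commit (get_vector_store, get_llm_instance, MongoDBStore, settings); the sample query string is made up, and the exact constructor keywords are taken from the main.py hunk, not verified against rag_agent.py.

# sketch_system_instructions_agent.py -- hypothetical smoke test, not part of this commit
import asyncio

from config.config import settings
from src.agents.system_instructions_rag import SystemInstructionsRAGAgent
from src.db.mongodb_store import MongoDBStore
from src.utils.llm_utils import get_llm_instance, get_vector_store


async def main() -> None:
    # Same initialization order as the /chat endpoint in src/main.py.
    vector_store, embedding_model = await get_vector_store()
    agent = SystemInstructionsRAGAgent(
        llm=get_llm_instance("openai"),
        embedding=embedding_model,
        vector_store=vector_store,
        mongodb=MongoDBStore(settings.MONGODB_URI),
    )

    result = await agent.generate_response(
        query="What does the onboarding document say about billing?",
        conversation_id=None,
        temperature=0.7,
    )

    # RAGResponse carries the cleaned answer plus the supporting chunks.
    print(result.response)
    print(result.sources)


if __name__ == "__main__":
    asyncio.run(main())
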
 
 
src/implementations/__pycache__/document_service.cpython-312.pyc CHANGED
Binary files a/src/implementations/__pycache__/document_service.cpython-312.pyc and b/src/implementations/__pycache__/document_service.cpython-312.pyc differ
 
src/main.py CHANGED
@@ -1,4 +1,31 @@
1
  # src/main.py
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  from fastapi import FastAPI, UploadFile, File, HTTPException, BackgroundTasks
3
  from fastapi.responses import StreamingResponse, FileResponse
4
  from fastapi.staticfiles import StaticFiles
@@ -9,53 +36,36 @@ from datetime import datetime
9
  from pathlib import Path
10
  import os
11
  import asyncio
12
- os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = '1'
13
- #os.environ["OAUTHLIB_RELAX_TOKEN_SCOPE"] = "1"
14
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
- from fastapi.responses import RedirectResponse
17
- from google.oauth2.credentials import Credentials
18
- from google_auth_oauthlib.flow import Flow
19
- from src.utils.google_drive_service import GoogleDriveService
20
 
21
  # Import custom modules
22
- #from src.agents.rag_agent import RAGAgent
23
- from src.agents.system_instructions_rag import SystemInstructionsRAGAgent
24
- from src.agents.rag_agent_manager import rag_agent_manager
25
- from src.models.document import AllDocumentsResponse, StoredDocument
26
- from src.models.UserContact import UserContactRequest
27
- from src.utils.document_processor import DocumentProcessor
28
- from src.utils.drive_document_processor import DriveDocumentProcessor
29
- from src.utils.conversation_summarizer import ConversationSummarizer
30
- from src.utils.logger import logger
31
- from src.utils.llm_utils import get_llm_instance, get_vector_store
32
- from src.db.mongodb_store import MongoDBStore
33
- from src.implementations.document_service import DocumentService
34
- from src.models import (
35
- ChatRequest,
36
- ChatResponse,
37
- BatchUploadResponse,
38
- SummarizeRequest,
39
- SummaryResponse,
40
- FeedbackRequest
41
- )
42
- from fastapi import HTTPException, Depends
43
- from fastapi.security import APIKeyHeader
44
- from src.utils.database_cleanup import perform_cleanup
45
 
46
- from config.config import settings
47
 
48
  app = FastAPI(title="Chatbot API")
49
 
50
  app.add_middleware(
51
  CORSMiddleware,
52
- allow_origins=["http://localhost:8080", "http://localhost:3000"], # Add both ports
 
53
  allow_credentials=True,
54
  allow_methods=["*"], # Allows all methods
55
  allow_headers=["*"], # Allows all headers
56
  )
57
 
58
- #google_drive_service = GoogleDriveService()
59
 
60
  # Initialize MongoDB
61
  mongodb = MongoDBStore(settings.MONGODB_URI)
@@ -75,6 +85,7 @@ app.mount("/docs", StaticFiles(directory=str(UPLOADS_DIR)), name="documents")
75
  # Security setup
76
  API_KEY_HEADER = APIKeyHeader(name="ADMIN_API_KEY")
77
 
 
78
  async def verify_api_key(api_key: str = Depends(API_KEY_HEADER)):
79
  """Verify admin API key"""
80
  if not settings.ADMIN_API_KEY or api_key != settings.ADMIN_API_KEY:
@@ -84,41 +95,16 @@ async def verify_api_key(api_key: str = Depends(API_KEY_HEADER)):
84
  )
85
  return api_key
86
 
87
- # @app.get("/google/auth")
88
- # async def google_auth():
89
- # authorization_url, _ = settings.google_oauth_flow.authorization_url(
90
- # access_type='offline',
91
- # prompt='consent',
92
- # include_granted_scopes='true'
93
- # )
94
- # return RedirectResponse(authorization_url)
95
-
96
- # @app.get("/google/oauth2callback")
97
- # async def google_auth_callback(code: str):
98
- # flow = Flow.from_client_config({
99
- # "web": {
100
- # "client_id": settings.GOOGLE_OAUTH_CLIENT_ID,
101
- # "client_secret": settings.GOOGLE_OAUTH_CLIENT_SECRET,
102
- # "auth_uri": "https://accounts.google.com/o/oauth2/auth",
103
- # "token_uri": "https://oauth2.googleapis.com/token",
104
- # "redirect_uris": [settings.GOOGLE_OAUTH_REDIRECT_URI]
105
- # }
106
- # }, scopes=['https://www.googleapis.com/auth/drive.readonly'])
107
-
108
- # flow.redirect_uri = settings.GOOGLE_OAUTH_REDIRECT_URI
109
-
110
- # # Add access type and prompt parameters for refresh token
111
- # flow.fetch_token(
112
- # code=code,
113
- # access_type='offline',
114
- # prompt='consent'
115
- # )
116
- # credentials = flow.credentials
117
-
118
- # return {
119
- # "message": "Authentication successful",
120
- # "credentials": credentials.to_json()
121
- # }
122
 
123
 
124
  @app.get("/documents")
@@ -126,7 +112,7 @@ async def get_all_documents():
126
  """Get all documents from MongoDB"""
127
  try:
128
  documents = await mongodb.get_all_documents()
129
-
130
  formatted_documents = []
131
  for doc in documents:
132
  try:
@@ -140,9 +126,10 @@ async def get_all_documents():
140
  }
141
  formatted_documents.append(formatted_doc)
142
  except Exception as e:
143
- logger.error(f"Error formatting document {doc.get('document_id', 'unknown')}: {str(e)}")
 
144
  continue
145
-
146
  return {
147
  "total_documents": len(formatted_documents),
148
  "documents": formatted_documents
@@ -151,6 +138,7 @@ async def get_all_documents():
151
  logger.error(f"Error retrieving documents: {str(e)}")
152
  raise HTTPException(status_code=500, detail=str(e))
153
 
 
154
  @app.get("/documents/{document_id}/download")
155
  async def get_document_file(document_id: str):
156
  """Serve a document file by its ID"""
@@ -159,27 +147,28 @@ async def get_document_file(document_id: str):
159
  doc = await mongodb.get_document(document_id)
160
  if not doc:
161
  raise HTTPException(status_code=404, detail="Document not found")
162
-
163
  # Extract filename from url_path
164
  filename = doc["url_path"].split("/")[-1]
165
  file_path = UPLOADS_DIR / filename
166
-
167
  if not file_path.exists():
168
  raise HTTPException(
169
- status_code=404,
170
  detail=f"File not found on server: {filename}"
171
  )
172
-
173
  return FileResponse(
174
  path=str(file_path),
175
  filename=doc["filename"],
176
  media_type=doc["content_type"]
177
  )
178
-
179
  except Exception as e:
180
  logger.error(f"Error serving document file: {str(e)}")
181
  raise HTTPException(status_code=500, detail=str(e))
182
 
 
183
  @app.post("/documents/upload", response_model=BatchUploadResponse)
184
  async def upload_documents(
185
  files: List[UploadFile] = File(...),
@@ -189,14 +178,84 @@ async def upload_documents(
189
  try:
190
  vector_store, _ = await get_vector_store()
191
  response = await document_service.process_documents(
192
- files,
193
- vector_store,
194
  background_tasks
195
  )
196
  return response
197
  except Exception as e:
198
  logger.error(f"Error in document upload: {str(e)}")
199
- raise HTTPException(status_code=500, detail=str(e))
 
 
 
 
200
 
201
 
202
  @app.get("/documentChunks/{document_id}")
@@ -205,10 +264,10 @@ async def get_document_chunks(document_id: str):
205
  try:
206
  vector_store, _ = await get_vector_store()
207
  chunks = vector_store.get_document_chunks(document_id)
208
-
209
  if not chunks:
210
  raise HTTPException(status_code=404, detail="Document not found")
211
-
212
  return {
213
  "document_id": document_id,
214
  "total_chunks": len(chunks),
@@ -218,53 +277,57 @@ async def get_document_chunks(document_id: str):
218
  logger.error(f"Error retrieving document chunks: {str(e)}")
219
  raise HTTPException(status_code=500, detail=str(e))
220
 
 
221
  @app.delete("/documents/{document_id}")
222
  async def delete_document(document_id: str):
223
  """Delete document from MongoDB, ChromaDB, and physical storage"""
224
  try:
225
  # First get document details from MongoDB to get file path
226
  document = await mongodb.get_document(document_id)
227
- if not document:
228
- raise HTTPException(status_code=404, detail="Document not found")
229
-
230
  # Get vector store instance
231
  vector_store, _ = await get_vector_store()
232
-
233
  # Delete physical file using document service
234
  deletion_success = await document_service.delete_document(document_id)
235
  if not deletion_success:
236
- logger.warning(f"Failed to delete physical file for document {document_id}")
237
-
 
238
  # Delete from vector store
239
  try:
240
  vector_store.delete_document(document_id)
241
  except Exception as e:
242
- logger.error(f"Error deleting document from vector store: {str(e)}")
 
243
  raise HTTPException(
244
- status_code=500,
245
  detail=f"Failed to delete document from vector store: {str(e)}"
246
  )
247
-
248
  # Delete from MongoDB - don't check return value since document might already be deleted
249
  await mongodb.delete_document(document_id)
250
-
251
  return {
252
  "status": "success",
253
  "message": f"Document {document_id} successfully deleted from all stores"
254
  }
255
-
256
  except HTTPException:
257
  raise
258
  except Exception as e:
259
  logger.error(f"Error in delete_document endpoint: {str(e)}")
260
  raise HTTPException(status_code=500, detail=str(e))
261
 
 
262
  @app.post("/processDriveDocuments")
263
  async def process_drive_documents():
264
  try:
265
  # Initialize vector store
266
  vector_store, _ = await get_vector_store()
267
-
268
  # Initialize Drive document processor
269
  drive_processor = DriveDocumentProcessor(
270
  google_service_account_path=settings.GOOGLE_SERVICE_ACCOUNT_PATH,
@@ -272,18 +335,19 @@ async def process_drive_documents():
272
  temp_dir=settings.TEMP_DOWNLOAD_DIR,
273
  doc_processor=doc_processor
274
  )
275
-
276
  # Process documents
277
  result = await drive_processor.process_documents(vector_store)
278
  return result
279
-
280
  except Exception as e:
281
  logger.error(f"Error in process_drive_documents: {str(e)}")
282
  raise HTTPException(
283
  status_code=500,
284
  detail=str(e)
285
  )
286
-
 
287
  @app.post("/user/contact", response_model=ChatResponse)
288
  async def create_user_contact(
289
  request: UserContactRequest,
@@ -296,7 +360,7 @@ async def create_user_contact(
296
  email=request.email,
297
  phone_number=request.phone_number
298
  )
299
-
300
  if existing_conversation_id:
301
  chat_request = ChatRequest(
302
  query=f'An old user with name: "{request.full_name}", email: "{request.email}" and phone number: "{request.phone_number}" wants support again. This is Introduction Create a welcome back message for him and ask how i can help you today?',
@@ -315,7 +379,7 @@ async def create_user_contact(
315
  email=request.email,
316
  phone_number=request.phone_number
317
  )
318
-
319
  chat_request = ChatRequest(
320
  query=f'A new user with name: "{request.full_name}", email: "{request.email}" and phone number: "{request.phone_number}" wants support. This is Introduction Create a welcome message for him and ask how i can help you today?',
321
  llm_provider="openai",
@@ -324,14 +388,15 @@ async def create_user_contact(
324
  stream=False,
325
  conversation_id=new_conversation_id
326
  )
327
-
328
  # Call chat_endpoint with the prepared request
329
  return await chat_endpoint(chat_request, background_tasks)
330
-
331
  except Exception as e:
332
  logger.error(f"Error in create_user_contact: {str(e)}")
333
  raise HTTPException(status_code=500, detail=str(e))
334
-
 
335
  @app.post("/chat", response_model=ChatResponse)
336
  async def chat_endpoint(
337
  request: ChatRequest,
@@ -340,33 +405,61 @@ async def chat_endpoint(
340
  """Chat endpoint with RAG support and enhanced Excel handling"""
341
  try:
342
  # Initialize core components
343
- logger.info(f"Initializing vector store and embedding: {str(datetime.now())}")
 
344
  vector_store, embedding_model = await get_vector_store()
345
-
346
  logger.info(f"Initializing LLM: {str(datetime.now())}")
347
  llm = get_llm_instance(request.llm_provider)
348
-
349
- # Use RAG agent manager to get singleton RAG agent
350
- rag_agent = rag_agent_manager.get_rag_agent(
 
 
 
 
 
 
 
351
  llm=llm,
352
- embedding_model=embedding_model,
353
  vector_store=vector_store,
354
  mongodb=mongodb
355
  )
356
-
 
 
 
 
357
  # Use provided conversation ID or create new one
358
  conversation_id = request.conversation_id or str(uuid.uuid4())
359
-
360
  # Process the query
361
  query = request.query
362
-
363
  # Add specific instructions for certain types of queries
364
- #if "introduce" in query.lower() or "name" in query.lower() or "email" in query.lower():
365
- #query += ". The response should be short and to the point. Make sure to not add any irrelevant information. make sure to share the response from Vector store, if you do not find information in vector store. Just respond I do not have information. Keep the introduction concise and friendly."
366
-
367
  # Generate response
368
  logger.info(f"Generating response: {str(datetime.now())}")
369
-
370
  max_retries = 3
371
  retry_count = 0
372
  response = None
@@ -378,7 +471,8 @@ async def chat_endpoint(
378
  query=query,
379
  conversation_id=conversation_id,
380
  temperature=request.temperature,
381
- max_tokens=request.max_tokens if hasattr(request, 'max_tokens') else None
 
382
  )
383
  break
384
  except Exception as e:
@@ -388,7 +482,8 @@ async def chat_endpoint(
388
  await asyncio.sleep(1) # Brief pause before retry
389
 
390
  if response is None:
391
- raise last_error or Exception("Failed to generate response after retries")
 
392
 
393
  logger.info(f"Response generated: {str(datetime.now())}")
394
 
@@ -401,13 +496,13 @@ async def chat_endpoint(
401
 
402
  # Add Excel-specific metadata if present
403
  has_excel_content = any(
404
- doc and 'Sheet:' in doc
405
  for doc in (response.context_docs or [])
406
  )
407
  if has_excel_content:
408
  try:
409
  metadata['excel_content'] = True
410
-
411
  # Extract Excel-specific insights if available
412
  if hasattr(rag_agent, 'get_excel_insights'):
413
  excel_insights = rag_agent.get_excel_insights(
@@ -436,13 +531,14 @@ async def chat_endpoint(
436
  sources=response.sources,
437
  conversation_id=conversation_id,
438
  timestamp=datetime.now(),
439
- relevant_doc_scores=response.scores if hasattr(response, 'scores') else None,
 
440
  metadata=metadata
441
  )
442
 
443
  # Log completion
444
  logger.info(f"Chat response completed: {str(datetime.now())}")
445
-
446
  return chat_response
447
 
448
  except Exception as e:
@@ -451,43 +547,48 @@ async def chat_endpoint(
451
  if isinstance(e, ValueError):
452
  raise HTTPException(status_code=400, detail=str(e))
453
  elif isinstance(e, (KeyError, AttributeError)):
454
- raise HTTPException(status_code=500, detail="Internal processing error")
 
455
  else:
456
  raise HTTPException(status_code=500, detail=str(e))
457
 
 
458
  @app.get("/chat/history/{conversation_id}")
459
  async def get_conversation_history(conversation_id: str):
460
  """Get complete conversation history"""
461
  history = await mongodb.get_conversation_history(conversation_id)
462
-
463
  if not history:
464
  raise HTTPException(status_code=404, detail="Conversation not found")
465
-
466
  return {
467
  "conversation_id": conversation_id,
468
  "messages": history
469
  }
470
 
 
471
  @app.post("/chat/summarize", response_model=SummaryResponse)
472
  async def summarize_conversation(request: SummarizeRequest):
473
  """Generate a summary of a conversation"""
474
  try:
475
  messages = await mongodb.get_messages_for_summary(request.conversation_id)
476
-
477
  if not messages:
478
- raise HTTPException(status_code=404, detail="Conversation not found")
479
-
 
480
  summary = await summarizer.summarize_conversation(
481
  messages,
482
  include_metadata=request.include_metadata
483
  )
484
-
485
  return SummaryResponse(**summary)
486
-
487
  except Exception as e:
488
  logger.error(f"Error generating summary: {str(e)}")
489
  raise HTTPException(status_code=500, detail=str(e))
490
 
 
491
  @app.post("/chat/feedback/{conversation_id}")
492
  async def submit_feedback(
493
  conversation_id: str,
@@ -498,21 +599,22 @@ async def submit_feedback(
498
  # Validate conversation exists
499
  conversation = await mongodb.get_conversation_metadata(conversation_id)
500
  if not conversation:
501
- raise HTTPException(status_code=404, detail="Conversation not found")
502
-
 
503
  # Update feedback
504
  success = await mongodb.update_feedback(
505
  conversation_id=conversation_id,
506
  feedback=feedback_request.feedback,
507
  rating=feedback_request.rating
508
  )
509
-
510
  if not success:
511
  raise HTTPException(
512
  status_code=500,
513
  detail="Failed to update feedback"
514
  )
515
-
516
  return {
517
  "status": "success",
518
  "message": "Feedback submitted successfully",
@@ -522,20 +624,21 @@ async def submit_feedback(
522
  "rating": feedback_request.format_rating()
523
  }
524
  }
525
-
526
  except HTTPException:
527
  raise
528
  except Exception as e:
529
  logger.error(f"Error submitting feedback: {str(e)}")
530
  raise HTTPException(status_code=500, detail=str(e))
531
 
 
532
  @app.get("/debug/config")
533
  async def debug_config():
534
  """Debug endpoint to check configuration"""
535
  import os
536
  from config.config import settings
537
  from pathlib import Path
538
-
539
  debug_info = {
540
  "environment_variables": {
541
  "OPENAI_API_KEY": "[SET]" if os.getenv('OPENAI_API_KEY') else "[NOT SET]",
@@ -550,16 +653,17 @@ async def debug_config():
550
  "openai_config_exists": (Path.home() / '.openai' / 'api_key').exists()
551
  }
552
  }
553
-
554
  if settings.OPENAI_API_KEY:
555
  key = settings.OPENAI_API_KEY
556
  debug_info["api_key_info"] = {
557
  "length": len(key),
558
  "preview": f"{key[:4]}...{key[-4:]}" if len(key) > 8 else "[INVALID LENGTH]"
559
  }
560
-
561
  return debug_info
562
 
 
563
  @app.post("/admin/cleanup")
564
  async def cleanup_databases(
565
  include_files: bool = True,
@@ -567,20 +671,36 @@ async def cleanup_databases(
567
  ):
568
  """
569
  Clean up all data from ChromaDB and MongoDB
570
-
571
  Args:
572
  include_files (bool): Whether to also delete uploaded files
 
 
 
573
  """
574
  try:
575
  result = await perform_cleanup(mongodb, include_files)
 
 
 
 
 
 
 
 
 
 
 
576
  return result
 
577
  except Exception as e:
578
  logger.error(f"Error in cleanup operation: {str(e)}")
579
  raise HTTPException(
580
  status_code=500,
581
  detail=f"Error during cleanup: {str(e)}"
582
  )
583
-
 
584
  @app.get("/health")
585
  async def health_check():
586
  """Health check endpoint"""
@@ -588,4 +708,4 @@ async def health_check():
588
 
589
  if __name__ == "__main__":
590
  import uvicorn
591
- uvicorn.run(app, host="0.0.0.0", port=8000)
 
1
  # src/main.py
2
+ from config.config import settings
3
+ from src.utils.database_cleanup import perform_cleanup
4
+ from fastapi.security import APIKeyHeader
5
+ from fastapi import HTTPException, Depends
6
+ from fastapi.responses import JSONResponse
7
+ from src.models import (
8
+ ChatRequest,
9
+ ChatResponse,
10
+ BatchUploadResponse,
11
+ SummarizeRequest,
12
+ SummaryResponse,
13
+ FeedbackRequest
14
+ )
15
+ from src.implementations.document_service import DocumentService
16
+ from src.db.mongodb_store import MongoDBStore
17
+ from src.utils.llm_utils import get_llm_instance, get_vector_store
18
+ from src.utils.logger import logger
19
+ from src.utils.conversation_summarizer import ConversationSummarizer
20
+ from src.utils.drive_document_processor import DriveDocumentProcessor
21
+ from src.utils.document_processor import DocumentProcessor
22
+ from src.models.UserContact import UserContactRequest
23
+ from src.models.document import AllDocumentsResponse, StoredDocument
24
+ from src.agents.system_instructions_rag import SystemInstructionsRAGAgent
25
+ from src.utils.google_drive_service import GoogleDriveService
26
+ from google_auth_oauthlib.flow import Flow
27
+ from google.oauth2.credentials import Credentials
28
+ from fastapi.responses import RedirectResponse
29
  from fastapi import FastAPI, UploadFile, File, HTTPException, BackgroundTasks
30
  from fastapi.responses import StreamingResponse, FileResponse
31
  from fastapi.staticfiles import StaticFiles
 
36
  from pathlib import Path
37
  import os
38
  import asyncio
 
 
39
 
40
+ import chromadb
41
+ from pathlib import Path
42
+ import asyncio
43
+ import gc
44
+ import random
45
+ from typing import List
46
+ from src.utils.logger import logger
47
+ from config.config import settings
48
+
49
+ os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = '1'
50
+ # os.environ["OAUTHLIB_RELAX_TOKEN_SCOPE"] = "1"
51
 
 
 
 
 
52
 
53
  # Import custom modules
54
+ # from src.agents.rag_agent import RAGAgent
 
 
 
 
 
55
 
 
56
 
57
  app = FastAPI(title="Chatbot API")
58
 
59
  app.add_middleware(
60
  CORSMiddleware,
61
+ allow_origins=["http://localhost:8080",
62
+ "http://localhost:3000"], # Add both ports
63
  allow_credentials=True,
64
  allow_methods=["*"], # Allows all methods
65
  allow_headers=["*"], # Allows all headers
66
  )
67
 
68
+ # google_drive_service = GoogleDriveService()
69
 
70
  # Initialize MongoDB
71
  mongodb = MongoDBStore(settings.MONGODB_URI)
 
85
  # Security setup
86
  API_KEY_HEADER = APIKeyHeader(name="ADMIN_API_KEY")
87
 
88
+
89
  async def verify_api_key(api_key: str = Depends(API_KEY_HEADER)):
90
  """Verify admin API key"""
91
  if not settings.ADMIN_API_KEY or api_key != settings.ADMIN_API_KEY:
 
95
  )
96
  return api_key
97
 
98
+
99
+ def get_chroma_client():
100
+ """Get a new ChromaDB client instance"""
101
+ return chromadb.PersistentClient(
102
+ path=settings.CHROMA_PATH,
103
+ settings=chromadb.Settings(
104
+ allow_reset=True,
105
+ is_persistent=True
106
+ )
107
+ )
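
Reviewer note: get_chroma_client is added here but nothing in this diff calls it yet. A rough way to sanity-check it from an interactive session is sketched below; whether list_collections() yields collection objects or bare names depends on the installed chromadb version, so treat the loop as an assumption.

# Ad-hoc inspection of the persisted vector store (importing src.main also
# runs its module-level setup, so do this with the services available).
from src.main import get_chroma_client

client = get_chroma_client()
for collection in client.list_collections():
    # Collection.count() reports how many embeddings are persisted in it.
    print(collection.name, collection.count())
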
 
 
 
 
108
 
109
 
110
  @app.get("/documents")
 
112
  """Get all documents from MongoDB"""
113
  try:
114
  documents = await mongodb.get_all_documents()
115
+
116
  formatted_documents = []
117
  for doc in documents:
118
  try:
 
126
  }
127
  formatted_documents.append(formatted_doc)
128
  except Exception as e:
129
+ logger.error(
130
+ f"Error formatting document {doc.get('document_id', 'unknown')}: {str(e)}")
131
  continue
132
+
133
  return {
134
  "total_documents": len(formatted_documents),
135
  "documents": formatted_documents
 
138
  logger.error(f"Error retrieving documents: {str(e)}")
139
  raise HTTPException(status_code=500, detail=str(e))
140
 
141
+
142
  @app.get("/documents/{document_id}/download")
143
  async def get_document_file(document_id: str):
144
  """Serve a document file by its ID"""
 
147
  doc = await mongodb.get_document(document_id)
148
  if not doc:
149
  raise HTTPException(status_code=404, detail="Document not found")
150
+
151
  # Extract filename from url_path
152
  filename = doc["url_path"].split("/")[-1]
153
  file_path = UPLOADS_DIR / filename
154
+
155
  if not file_path.exists():
156
  raise HTTPException(
157
+ status_code=404,
158
  detail=f"File not found on server: {filename}"
159
  )
160
+
161
  return FileResponse(
162
  path=str(file_path),
163
  filename=doc["filename"],
164
  media_type=doc["content_type"]
165
  )
166
+
167
  except Exception as e:
168
  logger.error(f"Error serving document file: {str(e)}")
169
  raise HTTPException(status_code=500, detail=str(e))
170
 
171
+
172
  @app.post("/documents/upload", response_model=BatchUploadResponse)
173
  async def upload_documents(
174
  files: List[UploadFile] = File(...),
 
178
  try:
179
  vector_store, _ = await get_vector_store()
180
  response = await document_service.process_documents(
181
+ files,
182
+ vector_store,
183
  background_tasks
184
  )
185
  return response
186
  except Exception as e:
187
  logger.error(f"Error in document upload: {str(e)}")
188
+ raise HTTPException(status_code=500, detail=str(e))
189
+
190
+
191
+ @app.get("/documentChunks")
192
+ async def get_all_document_chunks():
193
+ """Get all document chunks from the vector store"""
194
+ try:
195
+ # Get vector store instance
196
+ vector_store, _ = await get_vector_store()
197
+
198
+ # Retrieve all documents
199
+ all_documents = vector_store.get_all_documents()
200
+
201
+ # If no documents, return a structured response instead of raising an exception
202
+ if not all_documents:
203
+ return {
204
+ "total_documents": 0,
205
+ "documents": [],
206
+ "message": "No documents are currently stored in the vector store. Upload some documents to see chunks."
207
+ }
208
+
209
+ # Group chunks by document_id
210
+ document_chunks = {}
211
+ for doc in all_documents:
212
+ # Safely extract document_id
213
+ document_id = doc.get('metadata', {}).get('document_id',
214
+ doc.get('id',
215
+ str(uuid.uuid4())))
216
+
217
+ # Ensure metadata is a dictionary
218
+ metadata = doc.get('metadata', {}) if isinstance(
219
+ doc.get('metadata'), dict) else {}
220
+
221
+ # Create chunk entry
222
+ chunk = {
223
+ 'text': str(doc.get('text', '')),
224
+ 'metadata': metadata
225
+ }
226
+
227
+ # Group chunks by document_id
228
+ if document_id not in document_chunks:
229
+ document_chunks[document_id] = []
230
+
231
+ document_chunks[document_id].append(chunk)
232
+
233
+ # Prepare response
234
+ processed_documents = []
235
+ for doc_id, chunks in document_chunks.items():
236
+ processed_documents.append({
237
+ "document_id": doc_id,
238
+ "total_chunks": len(chunks),
239
+ "chunks": chunks
240
+ })
241
+
242
+ return {
243
+ "total_documents": len(processed_documents),
244
+ "documents": processed_documents,
245
+ "message": f"Successfully retrieved {len(processed_documents)} documents"
246
+ }
247
+
248
+ except Exception as e:
249
+ # Log the full error for debugging
250
+ logger.error(
251
+ f"Error retrieving all document chunks: {str(e)}", exc_info=True)
252
+
253
+ # Return a structured error response
254
+ return {
255
+ "total_documents": 0,
256
+ "documents": [],
257
+ "message": f"An error occurred while retrieving document chunks: {str(e)}"
258
+ }
259
 
260
 
261
  @app.get("/documentChunks/{document_id}")
 
264
  try:
265
  vector_store, _ = await get_vector_store()
266
  chunks = vector_store.get_document_chunks(document_id)
267
+
268
  if not chunks:
269
  raise HTTPException(status_code=404, detail="Document not found")
270
+
271
  return {
272
  "document_id": document_id,
273
  "total_chunks": len(chunks),
 
277
  logger.error(f"Error retrieving document chunks: {str(e)}")
278
  raise HTTPException(status_code=500, detail=str(e))
279
 
280
+
281
  @app.delete("/documents/{document_id}")
282
  async def delete_document(document_id: str):
283
  """Delete document from MongoDB, ChromaDB, and physical storage"""
284
  try:
285
  # First get document details from MongoDB to get file path
286
  document = await mongodb.get_document(document_id)
287
+ # if not document:
288
+ # raise HTTPException(status_code=404, detail="Document not found")
289
+
290
  # Get vector store instance
291
  vector_store, _ = await get_vector_store()
292
+
293
  # Delete physical file using document service
294
  deletion_success = await document_service.delete_document(document_id)
295
  if not deletion_success:
296
+ logger.warning(
297
+ f"Failed to delete physical file for document {document_id}")
298
+
299
  # Delete from vector store
300
  try:
301
  vector_store.delete_document(document_id)
302
  except Exception as e:
303
+ logger.error(
304
+ f"Error deleting document from vector store: {str(e)}")
305
  raise HTTPException(
306
+ status_code=500,
307
  detail=f"Failed to delete document from vector store: {str(e)}"
308
  )
309
+
310
  # Delete from MongoDB - don't check return value since document might already be deleted
311
  await mongodb.delete_document(document_id)
312
+
313
  return {
314
  "status": "success",
315
  "message": f"Document {document_id} successfully deleted from all stores"
316
  }
317
+
318
  except HTTPException:
319
  raise
320
  except Exception as e:
321
  logger.error(f"Error in delete_document endpoint: {str(e)}")
322
  raise HTTPException(status_code=500, detail=str(e))
323
 
324
+
325
  @app.post("/processDriveDocuments")
326
  async def process_drive_documents():
327
  try:
328
  # Initialize vector store
329
  vector_store, _ = await get_vector_store()
330
+
331
  # Initialize Drive document processor
332
  drive_processor = DriveDocumentProcessor(
333
  google_service_account_path=settings.GOOGLE_SERVICE_ACCOUNT_PATH,
 
335
  temp_dir=settings.TEMP_DOWNLOAD_DIR,
336
  doc_processor=doc_processor
337
  )
338
+
339
  # Process documents
340
  result = await drive_processor.process_documents(vector_store)
341
  return result
342
+
343
  except Exception as e:
344
  logger.error(f"Error in process_drive_documents: {str(e)}")
345
  raise HTTPException(
346
  status_code=500,
347
  detail=str(e)
348
  )
349
+
350
+
351
  @app.post("/user/contact", response_model=ChatResponse)
352
  async def create_user_contact(
353
  request: UserContactRequest,
 
360
  email=request.email,
361
  phone_number=request.phone_number
362
  )
363
+
364
  if existing_conversation_id:
365
  chat_request = ChatRequest(
366
  query=f'An old user with name: "{request.full_name}", email: "{request.email}" and phone number: "{request.phone_number}" wants support again. This is Introduction Create a welcome back message for him and ask how i can help you today?',
 
379
  email=request.email,
380
  phone_number=request.phone_number
381
  )
382
+
383
  chat_request = ChatRequest(
384
  query=f'A new user with name: "{request.full_name}", email: "{request.email}" and phone number: "{request.phone_number}" wants support. This is Introduction Create a welcome message for him and ask how i can help you today?',
385
  llm_provider="openai",
 
388
  stream=False,
389
  conversation_id=new_conversation_id
390
  )
391
+
392
  # Call chat_endpoint with the prepared request
393
  return await chat_endpoint(chat_request, background_tasks)
394
+
395
  except Exception as e:
396
  logger.error(f"Error in create_user_contact: {str(e)}")
397
  raise HTTPException(status_code=500, detail=str(e))
398
+
399
+
400
  @app.post("/chat", response_model=ChatResponse)
401
  async def chat_endpoint(
402
  request: ChatRequest,
 
405
  """Chat endpoint with RAG support and enhanced Excel handling"""
406
  try:
407
  # Initialize core components
408
+ logger.info(
409
+ f"Initializing vector store and embedding: {str(datetime.now())}")
410
  vector_store, embedding_model = await get_vector_store()
411
+
412
  logger.info(f"Initializing LLM: {str(datetime.now())}")
413
  llm = get_llm_instance(request.llm_provider)
414
+
415
+ # Initialize RAG agent
416
+ # rag_agent = RAGAgent(
417
+ # llm=llm,
418
+ # embedding=embedding_model,
419
+ # vector_store=vector_store,
420
+ # mongodb=mongodb
421
+ # )
422
+
423
+ rag_agent = SystemInstructionsRAGAgent(
424
  llm=llm,
425
+ embedding=embedding_model,
426
  vector_store=vector_store,
427
  mongodb=mongodb
428
  )
429
+
430
+ # rag_agent.add_custom_role(
431
+ # "Knowledge based chatbot and introduction specialist",
432
+ # """You are a welcome agent with knowledge based specialist focusing on knowledge attached and create a beautiful welcome message.
433
+ # Your role is to:
434
+ # 1. Your response should be short and to the point.
435
+ # 2. Strictly follow this point for If it is an introduction. You strictly respond that "Welcome name of customer to our platform. How can I help you today?"
436
+ # """
437
+ # )
438
+
439
+ # rag_agent.add_custom_role(
440
+ # "Knowledge based chatbot",
441
+ # """You are a knowledge based specialist focusing on knowledge attached.
442
+ # Your role is to:
443
+ # 1. Your response should be short and to the point.
444
+ # 2. if it is not introduction then make sure to share the response from Vector store.
445
+ # 3. If you do not find relevant information. Just say I do not have this information but this do not apply to introduction message.
446
+ # 4. If there is an introduction, you should ignore above roles and connect with LLm to have a welcome message for the user.
447
+ # """
448
+ # )
449
+
450
  # Use provided conversation ID or create new one
451
  conversation_id = request.conversation_id or str(uuid.uuid4())
452
+
453
  # Process the query
454
  query = request.query
455
+
456
  # Add specific instructions for certain types of queries
457
+ # if "introduce" in query.lower() or "name" in query.lower() or "email" in query.lower():
458
+ # query += ". The response should be short and to the point. Make sure to not add any irrelevant information. make sure to share the response from Vector store, if you do not find information in vector store. Just respond I do not have information. Keep the introduction concise and friendly."
459
+
460
  # Generate response
461
  logger.info(f"Generating response: {str(datetime.now())}")
462
+
463
  max_retries = 3
464
  retry_count = 0
465
  response = None
 
471
  query=query,
472
  conversation_id=conversation_id,
473
  temperature=request.temperature,
474
+ max_tokens=request.max_tokens if hasattr(
475
+ request, 'max_tokens') else None
476
  )
477
  break
478
  except Exception as e:
 
482
  await asyncio.sleep(1) # Brief pause before retry
483
 
484
  if response is None:
485
+ raise last_error or Exception(
486
+ "Failed to generate response after retries")
487
 
488
  logger.info(f"Response generated: {str(datetime.now())}")
489
 
 
496
 
497
  # Add Excel-specific metadata if present
498
  has_excel_content = any(
499
+ doc and 'Sheet:' in doc
500
  for doc in (response.context_docs or [])
501
  )
502
  if has_excel_content:
503
  try:
504
  metadata['excel_content'] = True
505
+
506
  # Extract Excel-specific insights if available
507
  if hasattr(rag_agent, 'get_excel_insights'):
508
  excel_insights = rag_agent.get_excel_insights(
 
531
  sources=response.sources,
532
  conversation_id=conversation_id,
533
  timestamp=datetime.now(),
534
+ relevant_doc_scores=response.scores if hasattr(
535
+ response, 'scores') else None,
536
  metadata=metadata
537
  )
538
 
539
  # Log completion
540
  logger.info(f"Chat response completed: {str(datetime.now())}")
541
+
542
  return chat_response
543
 
544
  except Exception as e:
 
547
  if isinstance(e, ValueError):
548
  raise HTTPException(status_code=400, detail=str(e))
549
  elif isinstance(e, (KeyError, AttributeError)):
550
+ raise HTTPException(
551
+ status_code=500, detail="Internal processing error")
552
  else:
553
  raise HTTPException(status_code=500, detail=str(e))
554
 
555
+
556
  @app.get("/chat/history/{conversation_id}")
557
  async def get_conversation_history(conversation_id: str):
558
  """Get complete conversation history"""
559
  history = await mongodb.get_conversation_history(conversation_id)
560
+
561
  if not history:
562
  raise HTTPException(status_code=404, detail="Conversation not found")
563
+
564
  return {
565
  "conversation_id": conversation_id,
566
  "messages": history
567
  }
568
 
569
+
570
  @app.post("/chat/summarize", response_model=SummaryResponse)
571
  async def summarize_conversation(request: SummarizeRequest):
572
  """Generate a summary of a conversation"""
573
  try:
574
  messages = await mongodb.get_messages_for_summary(request.conversation_id)
575
+
576
  if not messages:
577
+ raise HTTPException(
578
+ status_code=404, detail="Conversation not found")
579
+
580
  summary = await summarizer.summarize_conversation(
581
  messages,
582
  include_metadata=request.include_metadata
583
  )
584
+
585
  return SummaryResponse(**summary)
586
+
587
  except Exception as e:
588
  logger.error(f"Error generating summary: {str(e)}")
589
  raise HTTPException(status_code=500, detail=str(e))
590
 
591
+
592
  @app.post("/chat/feedback/{conversation_id}")
593
  async def submit_feedback(
594
  conversation_id: str,
 
599
  # Validate conversation exists
600
  conversation = await mongodb.get_conversation_metadata(conversation_id)
601
  if not conversation:
602
+ raise HTTPException(
603
+ status_code=404, detail="Conversation not found")
604
+
605
  # Update feedback
606
  success = await mongodb.update_feedback(
607
  conversation_id=conversation_id,
608
  feedback=feedback_request.feedback,
609
  rating=feedback_request.rating
610
  )
611
+
612
  if not success:
613
  raise HTTPException(
614
  status_code=500,
615
  detail="Failed to update feedback"
616
  )
617
+
618
  return {
619
  "status": "success",
620
  "message": "Feedback submitted successfully",
 
624
  "rating": feedback_request.format_rating()
625
  }
626
  }
627
+
628
  except HTTPException:
629
  raise
630
  except Exception as e:
631
  logger.error(f"Error submitting feedback: {str(e)}")
632
  raise HTTPException(status_code=500, detail=str(e))
633
 
634
+
635
  @app.get("/debug/config")
636
  async def debug_config():
637
  """Debug endpoint to check configuration"""
638
  import os
639
  from config.config import settings
640
  from pathlib import Path
641
+
642
  debug_info = {
643
  "environment_variables": {
644
  "OPENAI_API_KEY": "[SET]" if os.getenv('OPENAI_API_KEY') else "[NOT SET]",
 
653
  "openai_config_exists": (Path.home() / '.openai' / 'api_key').exists()
654
  }
655
  }
656
+
657
  if settings.OPENAI_API_KEY:
658
  key = settings.OPENAI_API_KEY
659
  debug_info["api_key_info"] = {
660
  "length": len(key),
661
  "preview": f"{key[:4]}...{key[-4:]}" if len(key) > 8 else "[INVALID LENGTH]"
662
  }
663
+
664
  return debug_info
665
 
666
+
667
  @app.post("/admin/cleanup")
668
  async def cleanup_databases(
669
  include_files: bool = True,
 
671
  ):
672
  """
673
  Clean up all data from ChromaDB and MongoDB
674
+
675
  Args:
676
  include_files (bool): Whether to also delete uploaded files
677
+
678
+ Returns:
679
+ Dict: Cleanup operation summary with restart information
680
  """
681
  try:
682
  result = await perform_cleanup(mongodb, include_files)
683
+
684
+ # If restart is needed, return 202 Accepted instead of 200 OK
685
+ if result.get("restart_needed"):
686
+ return JSONResponse(
687
+ status_code=202,
688
+ content={
689
+ **result,
690
+ "message": "Cleanup partially completed. Please restart the server to complete ChromaDB cleanup."
691
+ }
692
+ )
693
+
694
  return result
695
+
696
  except Exception as e:
697
  logger.error(f"Error in cleanup operation: {str(e)}")
698
  raise HTTPException(
699
  status_code=500,
700
  detail=f"Error during cleanup: {str(e)}"
701
  )
702
+
703
+
704
  @app.get("/health")
705
  async def health_check():
706
  """Health check endpoint"""
 
708
 
709
  if __name__ == "__main__":
710
  import uvicorn
711
+ uvicorn.run(app, host="0.0.0.0", port=8000)
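
Reviewer note: a quick manual check of the endpoints touched in this file, assuming the API is running locally on port 8000 and the requests package is installed (neither is part of this commit). The ADMIN_API_KEY header name comes from the APIKeyHeader setup above; the key value is a placeholder, and whether /admin/cleanup actually enforces it depends on wiring not shown in this hunk.

# manual_endpoint_check.py -- hypothetical, for local verification only
import requests

BASE = "http://localhost:8000"

# New endpoint: returns a structured payload even when the store is empty.
chunks = requests.get(f"{BASE}/documentChunks").json()
print(chunks["total_documents"], chunks["message"])

# Cleanup now answers 202 when a server restart is needed to finish clearing
# ChromaDB, and 200 when everything was removed in place.
resp = requests.post(
    f"{BASE}/admin/cleanup",
    params={"include_files": "true"},
    headers={"ADMIN_API_KEY": "<admin key from config>"},
)
print(resp.status_code, resp.json().get("status"))
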
src/utils/__pycache__/database_cleanup.cpython-312.pyc CHANGED
Binary files a/src/utils/__pycache__/database_cleanup.cpython-312.pyc and b/src/utils/__pycache__/database_cleanup.cpython-312.pyc differ
 
src/utils/__pycache__/document_processor.cpython-312.pyc CHANGED
Binary files a/src/utils/__pycache__/document_processor.cpython-312.pyc and b/src/utils/__pycache__/document_processor.cpython-312.pyc differ
 
src/utils/database_cleanup.py CHANGED
@@ -1,131 +1,180 @@
1
  # src/utils/database_cleanup.py
2
- from typing import List, Dict
3
  import chromadb
4
  import shutil
5
  from pathlib import Path
 
 
 
 
6
  from src.utils.logger import logger
7
  from config.config import settings
8
 
9
- async def cleanup_chroma():
10
- """Clean up ChromaDB vector store"""
 
 
 
 
11
  try:
12
- # Initialize client with allow_reset=True
13
  client = chromadb.PersistentClient(
14
  path=settings.CHROMA_PATH,
15
  settings=chromadb.Settings(
16
  allow_reset=True,
17
- is_persistent=True
 
18
  )
19
  )
20
-
21
- # Get collection names
22
- collection_names = client.list_collections()
23
-
24
- # Delete each collection by name
25
- for name in collection_names:
26
- client.delete_collection(name)
27
-
28
- # Reset client
29
- client.reset()
30
-
31
- # Remove persistence directory
32
- path = Path(settings.CHROMA_PATH)
33
- if path.exists():
34
- shutil.rmtree(path)
35
-
36
- return ["All vector store data cleared"]
 
 
 
 
37
  except Exception as e:
38
  raise Exception(f"ChromaDB cleanup failed: {str(e)}")
39
-
 
40
  async def cleanup_mongodb(mongodb) -> List[str]:
41
- """
42
- Clean up MongoDB collections
43
-
44
- Args:
45
- mongodb: MongoDB store instance
46
-
47
- Returns:
48
- List[str]: Details of cleanup operations
49
- """
50
  details = []
51
-
52
  try:
53
- # Drop all collections
54
- await mongodb.chat_history.delete_many({})
55
- details.append("Cleared chat history")
56
-
57
- await mongodb.conversations.delete_many({})
58
- details.append("Cleared conversations")
59
-
60
- await mongodb.documents.delete_many({})
61
- details.append("Cleared document metadata")
62
-
63
- await mongodb.knowledge_base.delete_many({})
64
- details.append("Cleared knowledge base")
65
-
66
- if hasattr(mongodb.db, 'vector_metadata'):
67
- await mongodb.db.vector_metadata.delete_many({})
68
- details.append("Cleared vector metadata")
69
-
 
 
 
 
 
70
  return details
71
  except Exception as e:
72
  raise Exception(f"MongoDB cleanup failed: {str(e)}")
73
 
 
74
  async def cleanup_files() -> List[str]:
75
- """
76
- Clean up uploaded files
77
-
78
- Returns:
79
- List[str]: Details of cleanup operations
80
- """
81
  details = []
82
- uploads_dir = Path("uploads")
83
-
84
- if uploads_dir.exists():
85
- # Get list of files before deletion
86
- files = list(uploads_dir.glob('*'))
87
-
88
- # Delete all files
89
- for file in files:
90
- if file.is_file():
91
- file.unlink()
92
- details.append(f"Deleted file: {file.name}")
93
-
94
- # Try to remove the directory itself
95
- if not any(uploads_dir.iterdir()):
96
- uploads_dir.rmdir()
97
- details.append("Removed empty uploads directory")
98
- else:
99
- details.append("No uploads directory found")
100
-
 
 
 
 
101
  return details
102
 
103
- async def perform_cleanup(
104
- mongodb,
105
- include_files: bool = True
106
- ) -> Dict:
107
  """
108
- Perform comprehensive cleanup of all databases
109
-
110
  Args:
111
  mongodb: MongoDB store instance
112
  include_files (bool): Whether to also delete uploaded files
113
-
114
  Returns:
115
- Dict: Cleanup operation summary
116
  """
117
  cleanup_summary = {
118
  "chroma_db": {"status": "not_started", "details": []},
119
  "mongodb": {"status": "not_started", "details": []},
120
  "files": {"status": "not_started", "details": []}
121
  }
122
-
123
  try:
124
  # Clean ChromaDB
125
  try:
126
- details = await cleanup_chroma()
127
  cleanup_summary["chroma_db"] = {
128
- "status": "success",
129
  "details": details
130
  }
131
  except Exception as e:
@@ -166,17 +215,21 @@ async def perform_cleanup(
166
 
167
  # Determine overall status
168
  overall_status = "success"
169
- if any(item["status"] == "error" for item in cleanup_summary.values()):
 
 
 
170
  overall_status = "partial_success"
171
- if all(item["status"] == "error" for item in cleanup_summary.values()):
172
  overall_status = "error"
173
 
174
  return {
175
  "status": overall_status,
176
- "message": "Cleanup operation completed",
177
- "details": cleanup_summary
 
178
  }
179
 
180
  except Exception as e:
181
  logger.error(f"Error in cleanup operation: {str(e)}")
182
- raise
 
1
  # src/utils/database_cleanup.py
 
2
  import chromadb
3
  import shutil
4
  from pathlib import Path
5
+ import asyncio
6
+ import gc
7
+ import random
8
+ from typing import List, Dict, Tuple
9
  from src.utils.logger import logger
10
  from config.config import settings
11
 
12
+
13
+ async def cleanup_chroma() -> Tuple[List[str], bool]:
14
+ """Clean up ChromaDB data while maintaining connection"""
15
+ details = []
16
+ restart_needed = False
17
+
18
  try:
19
+ # Get existing client
20
  client = chromadb.PersistentClient(
21
  path=settings.CHROMA_PATH,
22
  settings=chromadb.Settings(
23
  allow_reset=True,
24
+ is_persistent=True,
25
+ anonymized_telemetry=False
26
  )
27
  )
28
+
29
+ # Get all collections
30
+ collections = client.list_collections()
31
+
32
+ if not collections:
33
+ details.append("No collections found in ChromaDB")
34
+ return details, restart_needed
35
+
36
+ # Delete data from each collection
37
+ for collection in collections:
38
+ try:
39
+ # Get all IDs in the collection
40
+ all_ids = collection.get()['ids']
41
+
42
+ if all_ids:
43
+ # Delete all documents in the collection
44
+ collection.delete(ids=all_ids)
45
+ details.append(
46
+ f"Deleted {len(all_ids)} documents from collection {collection.name}")
47
+ else:
48
+ details.append(
49
+ f"Collection {collection.name} was already empty")
50
+
51
+ # Delete the collection itself
52
+ client.delete_collection(collection.name)
53
+ details.append(f"Deleted collection {collection.name}")
54
+
55
+ except Exception as e:
56
+ logger.warning(
57
+ f"Error cleaning collection {collection.name}: {str(e)}")
58
+ details.append(
59
+ f"Error cleaning collection {collection.name}: {str(e)}")
60
+ restart_needed = True # Set restart flag if any collection fails
61
+
62
+ # Optional: Add a check to see if a full reset might be necessary
63
+ if len(client.list_collections()) > 0:
64
+ restart_needed = True
65
+ details.append("Some collections might require manual reset")
66
+
67
+ return details, restart_needed
68
+
69
  except Exception as e:
70
  raise Exception(f"ChromaDB cleanup failed: {str(e)}")
71
+
72
+
73
  async def cleanup_mongodb(mongodb) -> List[str]:
74
+ """Clean up MongoDB collections"""
 
 
 
 
 
 
 
 
75
  details = []
76
+
77
  try:
78
+ # Get all collections in the database
79
+ collections = await mongodb.db.list_collection_names()
80
+
81
+ # Core collections from MongoDBStore initialization
82
+ core_collections = {
83
+ 'chat_history': mongodb.chat_history,
84
+ 'conversations': mongodb.conversations,
85
+ 'knowledge_base': mongodb.documents, # documents maps to knowledge_base
86
+ # Direct access to vector_metadata collection
87
+ 'vector_metadata': mongodb.db.vector_metadata,
88
+ }
89
+
90
+ # Clean each core collection
91
+ for name, collection in core_collections.items():
92
+ try:
93
+ result = await collection.delete_many({})
94
+ details.append(
95
+ f"Cleared {name} ({result.deleted_count} documents)")
96
+ except Exception as e:
97
+ logger.error(f"Error clearing {name}: {str(e)}")
98
+ details.append(f"Error clearing {name}: {str(e)}")
99
+
100
+ # Clean any additional collections not in the core set
101
+ for coll_name in collections:
102
+ if coll_name not in core_collections:
103
+ try:
104
+ result = await mongodb.db[coll_name].delete_many({})
105
+ details.append(
106
+ f"Cleared additional collection {coll_name} ({result.deleted_count} documents)")
107
+ except Exception as e:
108
+ logger.error(
109
+ f"Error clearing additional collection {coll_name}: {str(e)}")
110
+
111
  return details
112
  except Exception as e:
113
  raise Exception(f"MongoDB cleanup failed: {str(e)}")
114
 
115
+
116
  async def cleanup_files() -> List[str]:
117
+ """Clean up uploaded files and temporary directories"""
 
 
 
 
 
118
  details = []
119
+
120
+ # Directories to clean
121
+ directories = {
122
+ 'uploads': Path("uploads"),
123
+ 'temp_downloads': Path(settings.TEMP_DOWNLOAD_DIR),
124
+ # Additional temp directory used by some components
125
+ 'temp_dir': Path('./temp')
126
+ }
127
+
128
+ for dir_name, dir_path in directories.items():
129
+ if dir_path.exists():
130
+ try:
131
+ # Delete all files in the directory
132
+ for file in dir_path.glob('*'):
133
+ try:
134
+ if file.is_file():
135
+ file.unlink()
136
+ details.append(
137
+ f"Deleted file: {file.name} from {dir_name}")
138
+ except Exception as e:
139
+ details.append(
140
+ f"Error deleting file {file.name} from {dir_name}: {str(e)}")
141
+
142
+ # Try to remove the empty directory
143
+ if not any(dir_path.iterdir()):
144
+ dir_path.rmdir()
145
+ details.append(f"Removed empty {dir_name} directory")
146
+ except Exception as e:
147
+ details.append(
148
+ f"Error cleaning {dir_name} directory: {str(e)}")
149
+ else:
150
+ details.append(f"No {dir_name} directory found")
151
+
152
  return details
153
 
154
+
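The file cleanup above is plain pathlib; a compact sketch of the per-directory loop it applies to uploads, temp_downloads and ./temp:

    from pathlib import Path

    def clear_directory(dir_path: Path) -> list:
        """Delete regular files in dir_path, then remove it if it ends up empty."""
        if not dir_path.exists():
            return [f"No {dir_path.name} directory found"]
        details = []
        for file in dir_path.glob("*"):
            if file.is_file():
                file.unlink()
                details.append(f"Deleted file: {file.name}")
        if not any(dir_path.iterdir()):
            dir_path.rmdir()
            details.append(f"Removed empty {dir_path.name} directory")
        return details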
155
+ async def perform_cleanup(mongodb, include_files: bool = True) -> Dict:
 
 
156
  """
157
+ Perform comprehensive cleanup of all databases and files
158
+
159
  Args:
160
  mongodb: MongoDB store instance
161
  include_files (bool): Whether to also delete uploaded files
162
+
163
  Returns:
164
+ Dict: Cleanup operation summary with detailed status
165
  """
166
  cleanup_summary = {
167
  "chroma_db": {"status": "not_started", "details": []},
168
  "mongodb": {"status": "not_started", "details": []},
169
  "files": {"status": "not_started", "details": []}
170
  }
171
+
172
  try:
173
  # Clean ChromaDB
174
  try:
175
+ details, restart_needed = await cleanup_chroma()
176
  cleanup_summary["chroma_db"] = {
177
+ "status": "success" if not restart_needed else "partial",
178
  "details": details
179
  }
180
  except Exception as e:
 
215
 
216
  # Determine overall status
217
  overall_status = "success"
218
+ if restart_needed:
219
+ overall_status = "partial_success"
220
+ cleanup_summary["message"] = "Cleanup partially completed. Server restart required to complete ChromaDB cleanup."
221
+ elif any(item["status"] == "error" for item in cleanup_summary.values()):
222
  overall_status = "partial_success"
223
+ elif all(item["status"] == "error" for item in cleanup_summary.values()):
224
  overall_status = "error"
225
 
226
  return {
227
  "status": overall_status,
228
+ "message": cleanup_summary.get("message", "Cleanup operation completed"),
229
+ "details": cleanup_summary,
230
+ "restart_needed": restart_needed
231
  }
232
 
233
  except Exception as e:
234
  logger.error(f"Error in cleanup operation: {str(e)}")
235
+ raise
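A hedged sketch of driving perform_cleanup from an async entry point (the store variable and its construction are placeholders, not shown in this commit):

    import asyncio

    async def run_cleanup(mongodb_store):
        summary = await perform_cleanup(mongodb_store, include_files=True)
        if summary["restart_needed"]:
            print("ChromaDB was only partially cleared; restart the server to finish.")
        for area, outcome in summary["details"].items():
            if isinstance(outcome, dict):  # skip the optional "message" entry
                print(area, outcome["status"], *outcome["details"], sep="\n  ")

    # asyncio.run(run_cleanup(mongodb_store))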
src/utils/document_processor.py CHANGED
@@ -6,7 +6,7 @@ import pandas as pd
6
  import json
7
  from pathlib import Path
8
  import hashlib
9
- import magic # python-magic library for file type detection
10
  from bs4 import BeautifulSoup
11
  import csv
12
  from datetime import datetime
@@ -16,41 +16,92 @@ import tiktoken
16
  from langchain.text_splitter import RecursiveCharacterTextSplitter
17
  import logging
18
  from bs4.element import ProcessingInstruction
 
 
19
  from .enhanced_excel_processor import EnhancedExcelProcessor
20
 
 
21
  class DocumentProcessor:
22
  def __init__(
23
  self,
24
- chunk_size: int = 1000,
25
- chunk_overlap: int = 200,
26
- max_file_size: int = 10 * 1024 * 1024, # 10MB
27
  supported_formats: Optional[List[str]] = None
28
  ):
29
- self.chunk_size = chunk_size
30
- self.chunk_overlap = chunk_overlap
31
- self.max_file_size = max_file_size
32
- self.supported_formats = supported_formats or [
33
- '.txt', '.pdf', '.docx', '.csv', '.json',
34
- '.html', '.md', '.xml', '.rtf', '.xlsx', '.xls'
35
- ]
 
 
36
  self.processing_queue = Queue()
37
  self.processed_docs = {}
38
  self._initialize_text_splitter()
39
-
40
- # Initialize Excel processor
41
  self.excel_processor = EnhancedExcelProcessor()
42
-
43
- # Check for required packages
44
  try:
45
  import striprtf.striprtf
46
  except ImportError:
47
- logging.warning("Warning: striprtf package not found. RTF support will be limited.")
48
-
 
49
  try:
50
  from bs4 import BeautifulSoup
51
  import lxml
52
  except ImportError:
53
- logging.warning("Warning: beautifulsoup4 or lxml package not found. XML support will be limited.")
 
 
54
 
55
  def _initialize_text_splitter(self):
56
  """Initialize the text splitter with custom settings"""
@@ -58,13 +109,241 @@ class DocumentProcessor:
58
  chunk_size=self.chunk_size,
59
  chunk_overlap=self.chunk_overlap,
60
  length_function=len,
61
- separators=["\n\n", "\n", " ", ""]
 
 
 
 
62
  )
63
 
 
 
64
  def _extract_content(self, file_path: Path) -> str:
65
  """Extract content from different file formats"""
66
  suffix = file_path.suffix.lower()
67
-
68
  try:
69
  if suffix == '.pdf':
70
  return self._extract_pdf(file_path)
@@ -87,7 +366,8 @@ class DocumentProcessor:
87
  else:
88
  raise ValueError(f"Unsupported format: {suffix}")
89
  except Exception as e:
90
- raise Exception(f"Error extracting content from {file_path}: {str(e)}")
 
91
 
92
  def _extract_text(self, file_path: Path) -> str:
93
  """Extract content from text-based files"""
@@ -104,31 +384,31 @@ class DocumentProcessor:
104
  with open(file_path, 'rb') as file:
105
  reader = PyPDF2.PdfReader(file)
106
  metadata = reader.metadata
107
-
108
  for page in reader.pages:
109
  text += page.extract_text() + "\n\n"
110
-
111
  # Extract images if available
112
  if '/XObject' in page['/Resources']:
113
  for obj in page['/Resources']['/XObject'].get_object():
114
  if page['/Resources']['/XObject'][obj]['/Subtype'] == '/Image':
115
  pass
116
-
117
  return text.strip()
118
 
119
  def _extract_docx(self, file_path: Path) -> str:
120
  """Extract text from DOCX with formatting"""
121
  doc = docx.Document(file_path)
122
  full_text = []
123
-
124
  for para in doc.paragraphs:
125
  full_text.append(para.text)
126
-
127
  for table in doc.tables:
128
  for row in table.rows:
129
  row_text = [cell.text for cell in row.cells]
130
  full_text.append(" | ".join(row_text))
131
-
132
  return "\n\n".join(full_text)
133
 
134
  def _extract_csv(self, file_path: Path) -> str:
@@ -146,10 +426,10 @@ class DocumentProcessor:
146
  """Extract text from HTML with structure preservation"""
147
  with open(file_path) as f:
148
  soup = BeautifulSoup(f, 'html.parser')
149
-
150
  for script in soup(["script", "style"]):
151
  script.decompose()
152
-
153
  text = soup.get_text(separator='\n')
154
  lines = [line.strip() for line in text.splitlines() if line.strip()]
155
  return "\n\n".join(lines)
@@ -159,12 +439,13 @@ class DocumentProcessor:
159
  try:
160
  with open(file_path, 'r', encoding='utf-8') as f:
161
  soup = BeautifulSoup(f, 'xml')
162
-
163
  for pi in soup.find_all(text=lambda text: isinstance(text, ProcessingInstruction)):
164
  pi.extract()
165
-
166
  text = soup.get_text(separator='\n')
167
- lines = [line.strip() for line in text.splitlines() if line.strip()]
 
168
  return "\n\n".join(lines)
169
  except Exception as e:
170
  raise Exception(f"Error processing XML file: {str(e)}")
@@ -173,12 +454,13 @@ class DocumentProcessor:
173
  """Extract text from RTF files"""
174
  try:
175
  import striprtf.striprtf as striprtf
176
-
177
  with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
178
  rtf_text = f.read()
179
-
180
  plain_text = striprtf.rtf_to_text(rtf_text)
181
- lines = [line.strip() for line in plain_text.splitlines() if line.strip()]
 
182
  return "\n\n".join(lines)
183
  except ImportError:
184
  raise ImportError("striprtf package is required for RTF support.")
@@ -190,14 +472,15 @@ class DocumentProcessor:
190
  try:
191
  # Use enhanced Excel processor
192
  processed_content = self.excel_processor.process_excel(file_path)
193
-
194
  # If processing fails, fall back to basic processing
195
  if not processed_content:
196
- logging.warning(f"Enhanced Excel processing failed for {file_path}, falling back to basic processing")
 
197
  return self._basic_excel_extract(file_path)
198
-
199
  return processed_content
200
-
201
  except Exception as e:
202
  logging.error(f"Error in enhanced Excel processing: {str(e)}")
203
  # Fall back to basic Excel processing
@@ -208,12 +491,12 @@ class DocumentProcessor:
208
  try:
209
  excel_file = pd.ExcelFile(file_path)
210
  sheets_data = []
211
-
212
  for sheet_name in excel_file.sheet_names:
213
  df = pd.read_excel(excel_file, sheet_name=sheet_name)
214
  sheet_content = f"\nSheet: {sheet_name}\n"
215
  sheet_content += "=" * (len(sheet_name) + 7) + "\n"
216
-
217
  if df.empty:
218
  sheet_content += "Empty Sheet\n"
219
  else:
@@ -223,11 +506,11 @@ class DocumentProcessor:
223
  max_cols=None,
224
  line_width=120
225
  ) + "\n"
226
-
227
  sheets_data.append(sheet_content)
228
-
229
  return "\n\n".join(sheets_data)
230
-
231
  except Exception as e:
232
  raise Exception(f"Error in basic Excel processing: {str(e)}")
233
 
@@ -239,7 +522,7 @@ class DocumentProcessor:
239
  ) -> Dict:
240
  """Generate comprehensive metadata"""
241
  file_stat = file_path.stat()
242
-
243
  metadata = {
244
  'filename': file_path.name,
245
  'file_type': file_path.suffix,
@@ -252,7 +535,7 @@ class DocumentProcessor:
252
  'character_count': len(content),
253
  'processing_timestamp': datetime.now().isoformat()
254
  }
255
-
256
  # Add Excel-specific metadata if applicable
257
  if file_path.suffix.lower() in ['.xlsx', '.xls']:
258
  try:
@@ -261,32 +544,42 @@ class DocumentProcessor:
261
  metadata.update({'excel_metadata': excel_metadata})
262
  except Exception as e:
263
  logging.warning(f"Could not extract Excel metadata: {str(e)}")
264
-
265
  if additional_metadata:
266
  metadata.update(additional_metadata)
267
-
268
  return metadata
269
 
270
  def _calculate_hash(self, text: str) -> str:
271
  """Calculate SHA-256 hash of text"""
272
  return hashlib.sha256(text.encode()).hexdigest()
273
 
274
- async def process_document(
275
- self,
276
- file_path: Union[str, Path],
277
- metadata: Optional[Dict] = None
278
- ) -> Dict:
279
  """Process a document with metadata and content extraction"""
280
  file_path = Path(file_path)
281
-
282
  if not self._validate_file(file_path):
283
  raise ValueError(f"Invalid file: {file_path}")
284
 
285
  content = self._extract_content(file_path)
286
  doc_metadata = self._generate_metadata(file_path, content, metadata)
287
- chunks = self.text_splitter.split_text(content)
 
 
288
  chunk_hashes = [self._calculate_hash(chunk) for chunk in chunks]
289
-
290
  return {
291
  'content': content,
292
  'chunks': chunks,
@@ -295,20 +588,28 @@ class DocumentProcessor:
295
  'statistics': self._generate_statistics(content, chunks)
296
  }
297
 
 
 
 
 
 
 
 
 
298
  def _validate_file(self, file_path: Path) -> bool:
299
  """Validate file type, size, and content"""
300
  if not file_path.exists():
301
  raise FileNotFoundError(f"File not found: {file_path}")
302
-
303
  if file_path.suffix.lower() not in self.supported_formats:
304
  raise ValueError(f"Unsupported file format: {file_path.suffix}")
305
-
306
  if file_path.stat().st_size > self.max_file_size:
307
  raise ValueError(f"File too large: {file_path}")
308
-
309
  if file_path.stat().st_size == 0:
310
  raise ValueError(f"Empty file: {file_path}")
311
-
312
  return True
313
 
314
  def _generate_statistics(self, content: str, chunks: List[str]) -> Dict:
@@ -328,7 +629,7 @@ class DocumentProcessor:
328
  ) -> Dict[str, Dict]:
329
  """Process multiple documents in parallel"""
330
  results = {}
331
-
332
  if parallel:
333
  threads = []
334
  for file_path in file_paths:
@@ -338,13 +639,13 @@ class DocumentProcessor:
338
  )
339
  threads.append(thread)
340
  thread.start()
341
-
342
  for thread in threads:
343
  thread.join()
344
  else:
345
  for file_path in file_paths:
346
  await self._process_and_store(file_path, results)
347
-
348
  return results
349
 
350
  async def _process_and_store(
@@ -357,4 +658,4 @@ class DocumentProcessor:
357
  result = await self.process_document(file_path)
358
  results[str(file_path)] = result
359
  except Exception as e:
360
- results[str(file_path)] = {'error': str(e)}
 
6
  import json
7
  from pathlib import Path
8
  import hashlib
9
+ import magic
10
  from bs4 import BeautifulSoup
11
  import csv
12
  from datetime import datetime
 
16
  from langchain.text_splitter import RecursiveCharacterTextSplitter
17
  import logging
18
  from bs4.element import ProcessingInstruction
19
+
20
+ from config.config import Settings
21
  from .enhanced_excel_processor import EnhancedExcelProcessor
22
 
23
+
24
  class DocumentProcessor:
25
  def __init__(
26
  self,
27
+ chunk_size: Optional[int] = None,
28
+ chunk_overlap: Optional[int] = None,
29
+ max_file_size: Optional[int] = None,
30
  supported_formats: Optional[List[str]] = None
31
  ):
32
+ """
33
+ Initialize DocumentProcessor with configurable parameters
34
+
35
+ Args:
36
+ chunk_size (Optional[int]): Size of text chunks
37
+ chunk_overlap (Optional[int]): Overlap between chunks
38
+ max_file_size (Optional[int]): Maximum file size in bytes
39
+ supported_formats (Optional[List[str]]): List of supported file extensions
40
+ """
41
+
42
+ logging.basicConfig(
43
+ level=logging.DEBUG,
44
+ format='%(asctime)s - %(levelname)s - %(message)s'
45
+ )
46
+
47
+ # Get settings with validation
48
+ default_settings = Settings.get_document_processor_settings()
49
+
50
+ # Use provided values or defaults from settings
51
+ self.chunk_size = chunk_size if chunk_size is not None else default_settings[
52
+ 'chunk_size']
53
+ self.chunk_overlap = chunk_overlap if chunk_overlap is not None else default_settings[
54
+ 'chunk_overlap']
55
+ self.max_file_size = max_file_size if max_file_size is not None else default_settings[
56
+ 'max_file_size']
57
+ self.supported_formats = supported_formats if supported_formats is not None else default_settings[
58
+ 'supported_formats']
59
+
60
+ # Validate settings
61
+ self._validate_settings()
62
+
63
+ # Initialize existing components
64
  self.processing_queue = Queue()
65
  self.processed_docs = {}
66
  self._initialize_text_splitter()
 
 
67
  self.excel_processor = EnhancedExcelProcessor()
68
+
69
+ # Check for required packages (keep existing functionality)
70
  try:
71
  import striprtf.striprtf
72
  except ImportError:
73
+ logging.warning(
74
+ "Warning: striprtf package not found. RTF support will be limited.")
75
+
76
  try:
77
  from bs4 import BeautifulSoup
78
  import lxml
79
  except ImportError:
80
+ logging.warning(
81
+ "Warning: beautifulsoup4 or lxml package not found. XML support will be limited.")
82
+
83
+ def _validate_settings(self):
84
+ """Validate and adjust settings if necessary"""
85
+ # Ensure chunk_size is positive and reasonable
86
+ self.chunk_size = max(100, self.chunk_size)
87
+
88
+ # Ensure chunk_overlap is less than chunk_size
89
+ self.chunk_overlap = min(self.chunk_overlap, self.chunk_size - 50)
90
+
91
+ # Ensure max_file_size is reasonable (minimum 1MB)
92
+ self.max_file_size = max(1024 * 1024, self.max_file_size)
93
+
94
+ # Ensure supported_formats contains valid extensions
95
+ if not self.supported_formats:
96
+ # Fallback to default supported formats if empty
97
+ self.supported_formats = Settings.DOCUMENT_PROCESSOR['supported_formats']
98
+
99
+ # Ensure all formats start with a dot
100
+ self.supported_formats = [
101
+ f".{fmt.lower().lstrip('.')}" if not fmt.startswith(
102
+ '.') else fmt.lower()
103
+ for fmt in self.supported_formats
104
+ ]
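A small usage sketch of these overrides (the keyword values are illustrative, not the shipped defaults):

    # Anything omitted falls back to Settings.get_document_processor_settings().
    processor = DocumentProcessor(
        chunk_size=800,                    # floored at 100 by _validate_settings
        chunk_overlap=900,                 # clamped to chunk_size - 50, i.e. 750 here
        supported_formats=["pdf", "md"],   # normalised to ['.pdf', '.md']
    )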
105
 
106
  def _initialize_text_splitter(self):
107
  """Initialize the text splitter with custom settings"""
 
109
  chunk_size=self.chunk_size,
110
  chunk_overlap=self.chunk_overlap,
111
  length_function=len,
112
+ # Modify separators to better handle markdown while maintaining overlap
113
+ separators=["\n\n", "\n", " ", ""],
114
+ keep_separator=True,
115
+ add_start_index=True,
116
+ strip_whitespace=False # Keep whitespace to maintain markdown formatting
117
  )
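For context, the same splitter configured directly; keep_separator, add_start_index and strip_whitespace are standard RecursiveCharacterTextSplitter arguments in recent langchain releases (older releases may not accept strip_whitespace):

    from langchain.text_splitter import RecursiveCharacterTextSplitter

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
        separators=["\n\n", "\n", " ", ""],
        keep_separator=True,      # keep the separator attached to the chunk
        add_start_index=True,     # adds start_index metadata when building Documents
        strip_whitespace=False,   # keep leading/trailing whitespace (markdown tables)
    )
    chunks = splitter.split_text("## Heading\n\nFirst paragraph.\n\nSecond paragraph.")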
118
 
119
+ def split_text(self, text: str) -> List[str]:
120
+ """Split text with enforced overlap while preserving structure"""
121
+ try:
122
+ # Get initial split using RecursiveCharacterTextSplitter
123
+ initial_chunks = self.text_splitter.split_text(text)
124
+ if len(initial_chunks) <= 1:
125
+ return initial_chunks
126
+
127
+ # Process chunks with enforced overlap
128
+ final_chunks = []
129
+
130
+ for i, current_chunk in enumerate(initial_chunks):
131
+ if i == 0:
132
+ final_chunks.append(current_chunk)
133
+ continue
134
+
135
+ prev_chunk = final_chunks[-1]
136
+
137
+ # Get the last part of previous chunk for overlap
138
+ overlap_size = min(self.chunk_overlap, len(prev_chunk))
139
+ overlap_text = prev_chunk[-overlap_size:]
140
+
141
+ # For tables, include the header row
142
+ if '|' in current_chunk and '\n' in current_chunk:
143
+ table_lines = current_chunk.split('\n')
144
+ header_lines = []
145
+ for line in table_lines:
146
+ if line.strip().startswith('|'):
147
+ header_lines.append(line)
148
+ else:
149
+ break
150
+ if header_lines:
151
+ header_text = '\n'.join(header_lines) + '\n'
152
+ overlap_text = header_text + overlap_text
153
+
154
+ # Create new chunk with overlap
155
+ new_chunk = overlap_text + current_chunk
156
+
157
+ # Ensure we don't have duplicate content at the overlap point
158
+ if current_chunk.startswith(overlap_text):
159
+ new_chunk = current_chunk
160
+
161
+ # Add context from previous chunk when needed
162
+ if not any(marker in new_chunk for marker in ['**AGENDA**', '**DISCUSSIONS**', '| No |']):
163
+ context_markers = ['**AGENDA**',
164
+ '**DISCUSSIONS**', '| No |']
165
+ for marker in context_markers:
166
+ if marker in prev_chunk and marker not in new_chunk:
167
+ new_chunk = marker + "\n" + new_chunk
168
+ break
169
+
170
+ final_chunks.append(new_chunk)
171
+
172
+ # Validate and log overlaps
173
+ for i in range(len(final_chunks)-1):
174
+ actual_overlap = self._find_actual_overlap(
175
+ final_chunks[i], final_chunks[i+1])
176
+ logging.debug(
177
+ f"Overlap between chunks {i} and {i+1}: {len(actual_overlap)} characters")
178
+ if len(actual_overlap) < self.chunk_overlap:
179
+ logging.warning(
180
+ f"Insufficient overlap between chunks {i} and {i+1}")
181
+
182
+ return final_chunks
183
+
184
+ for start, end in table_sections:
185
+ # Process text before table if exists
186
+ if start > current_position:
187
+ non_table_text = text[current_position:start]
188
+ if non_table_text.strip():
189
+ text_chunks = self.text_splitter.split_text(
190
+ non_table_text)
191
+ if chunks and text_chunks:
192
+ # Ensure overlap with previous chunk
193
+ prev_chunk = chunks[-1]
194
+ overlap = self._get_overlap_text(prev_chunk)
195
+ text_chunks[0] = overlap + text_chunks[0]
196
+ chunks.extend(text_chunks)
197
+
198
+ # Process table as a single chunk with overlap
199
+ table_text = text[start:end]
200
+ if chunks:
201
+ prev_chunk = chunks[-1]
202
+ overlap = self._get_overlap_text(prev_chunk)
203
+ table_text = overlap + table_text
204
+ chunks.append(table_text)
205
+ current_position = end
206
+
207
+ # Process remaining text after last table
208
+ if current_position < len(text):
209
+ remaining_text = text[current_position:]
210
+ if remaining_text.strip():
211
+ text_chunks = self.text_splitter.split_text(remaining_text)
212
+ if chunks and text_chunks:
213
+ # Ensure overlap with previous chunk
214
+ prev_chunk = chunks[-1]
215
+ overlap = self._get_overlap_text(prev_chunk)
216
+ text_chunks[0] = overlap + text_chunks[0]
217
+ chunks.extend(text_chunks)
218
+
219
+ # Validate and adjust overlaps
220
+ chunks = self._ensure_minimum_overlap(chunks)
221
+
222
+ # Log chunk details for debugging
223
+ for i in range(len(chunks)-1):
224
+ overlap = self._find_actual_overlap(chunks[i], chunks[i+1])
225
+ logging.debug(
226
+ f"Overlap between chunks {i} and {i+1}: {len(overlap)} characters")
227
+ logging.debug(f"End of chunk {i}: {chunks[i][-50:]}")
228
+ logging.debug(f"Start of chunk {i+1}: {chunks[i+1][:50]}")
229
+
230
+ return chunks
231
+
232
+ except Exception as e:
233
+ logging.error(f"Error in split_text: {str(e)}")
234
+ # Fallback to original text splitter
235
+ return self.text_splitter.split_text(text)
236
+
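split_text and the overlap logging above rely on self._find_actual_overlap (and the table path on self._get_overlap_text and self._ensure_minimum_overlap), none of which appear in this diff; a minimal sketch of what the first two could look like, mirroring the suffix/prefix matching used by _calculate_overlap_size further down:

    def _find_actual_overlap(self, chunk1: str, chunk2: str) -> str:
        # Hypothetical helper (not part of this commit): longest suffix of
        # chunk1 that is also a prefix of chunk2.
        for size in range(min(len(chunk1), len(chunk2)), 0, -1):
            if chunk1[-size:] == chunk2[:size]:
                return chunk1[-size:]
        return ""

    def _get_overlap_text(self, prev_chunk: str) -> str:
        # Hypothetical helper: tail of the previous chunk, capped at chunk_overlap.
        return prev_chunk[-min(self.chunk_overlap, len(prev_chunk)):]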
237
+ def _find_break_point(self, text: str, prev_chunk: str) -> int:
238
+ """
239
+ Find suitable breaking point that maintains document structure
240
+
241
+ Args:
242
+ text (str): Text to find break point in (the overlap portion)
243
+ prev_chunk (str): The complete previous chunk for context
244
+
245
+ Returns:
246
+ int: Position of suitable break point
247
+ """
248
+ # Get the context of how the previous chunk ends
249
+ prev_chunk_lines = prev_chunk.split('\n')
250
+
251
+ # Special handling for markdown tables
252
+ if '|' in prev_chunk:
253
+ # Check if we're in the middle of a table
254
+ table_rows = [
255
+ line for line in prev_chunk_lines if line.strip().startswith('|')]
256
+ if table_rows:
257
+ # Find where the current table starts in the text
258
+ table_start = text.find('|')
259
+ if table_start >= 0:
260
+ # Find the next row boundary
261
+ next_row = text.find('\n', table_start)
262
+ if next_row >= 0:
263
+ return next_row + 1 # Include the newline
264
+
265
+ # Define break point markers in order of preference
266
+ break_markers = [
267
+ ('\n\n', True), # Paragraph breaks (keep marker)
268
+ ('\n', True), # Line breaks (keep marker)
269
+ ('. ', True), # Sentence endings (keep marker)
270
+ (', ', True), # Clause breaks (keep marker)
271
+ (' ', False) # Word breaks (don't keep marker)
272
+ ]
273
+
274
+ # Check the structure of the previous chunk end
275
+ last_line = prev_chunk_lines[-1] if prev_chunk_lines else ""
276
+
277
+ # Look for each type of break point
278
+ for marker, keep_marker in break_markers:
279
+ if marker in text:
280
+ # Try to find a break point that maintains document structure
281
+ marker_positions = [i for i in range(
282
+ len(text)) if text[i:i+len(marker)] == marker]
283
+
284
+ for pos in reversed(marker_positions):
285
+ # Check if this break point would maintain document structure
286
+ if self._is_valid_break_point(text, pos, last_line):
287
+ return pos + (len(marker) if keep_marker else 0)
288
+
289
+ # If no suitable break point found, default to exact position
290
+ return min(len(text), self.chunk_overlap)
291
+
292
+ def _is_valid_break_point(self, text: str, position: int, last_line: str) -> bool:
293
+ """
294
+ Check if a break point would maintain document structure
295
+
296
+ Args:
297
+ text (str): Text being checked
298
+ position (int): Potential break position
299
+ last_line (str): Last line of previous chunk
300
+
301
+ Returns:
302
+ bool: True if break point is valid
303
+ """
304
+ # Don't break in the middle of markdown formatting
305
+ markdown_markers = ['*', '_', '`', '[', ']', '(', ')', '#']
306
+ if position > 0 and position < len(text) - 1:
307
+ if text[position-1] in markdown_markers or text[position+1] in markdown_markers:
308
+ return False
309
+
310
+ # Don't break in the middle of a table cell
311
+ if '|' in last_line:
312
+ cell_count = last_line.count('|')
313
+ text_before_break = text[:position]
314
+ if text_before_break.count('|') % cell_count != 0:
315
+ return False
316
+
317
+ # Don't break URLs or code blocks
318
+ url_patterns = ['http://', 'https://', '```', '`']
319
+ for pattern in url_patterns:
320
+ if pattern in text[:position] and pattern not in text[position:]:
321
+ return False
322
+
323
+ return True
324
+
325
+ def _validate_chunks(self, original_text: str, chunks: List[str]) -> bool:
326
+ """Validate that chunks maintain document integrity"""
327
+ try:
328
+ # Remove overlap to check content
329
+ reconstructed = chunks[0]
330
+ for chunk in chunks[1:]:
331
+ if len(chunk) > self.chunk_overlap:
332
+ reconstructed += chunk[self.chunk_overlap:]
333
+
334
+ # Clean both texts for comparison (remove extra whitespace)
335
+ clean_original = ' '.join(original_text.split())
336
+ clean_reconstructed = ' '.join(reconstructed.split())
337
+
338
+ return clean_original == clean_reconstructed
339
+ except Exception as e:
340
+ logging.error(f"Error validating chunks: {str(e)}")
341
+ return False
342
+
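A tiny illustration of the reconstruction _validate_chunks performs, assuming the overlap really is chunk_overlap characters:

    chunk_overlap = 4
    chunks = ["abcdefgh", "efghijkl"]            # second chunk repeats the last 4 characters
    reconstructed = chunks[0] + chunks[1][chunk_overlap:]
    assert reconstructed == "abcdefghijkl"       # compared whitespace-normalised in _validate_chunks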
343
  def _extract_content(self, file_path: Path) -> str:
344
  """Extract content from different file formats"""
345
  suffix = file_path.suffix.lower()
346
+
347
  try:
348
  if suffix == '.pdf':
349
  return self._extract_pdf(file_path)
 
366
  else:
367
  raise ValueError(f"Unsupported format: {suffix}")
368
  except Exception as e:
369
+ raise Exception(
370
+ f"Error extracting content from {file_path}: {str(e)}")
371
 
372
  def _extract_text(self, file_path: Path) -> str:
373
  """Extract content from text-based files"""
 
384
  with open(file_path, 'rb') as file:
385
  reader = PyPDF2.PdfReader(file)
386
  metadata = reader.metadata
387
+
388
  for page in reader.pages:
389
  text += page.extract_text() + "\n\n"
390
+
391
  # Extract images if available
392
  if '/XObject' in page['/Resources']:
393
  for obj in page['/Resources']['/XObject'].get_object():
394
  if page['/Resources']['/XObject'][obj]['/Subtype'] == '/Image':
395
  pass
396
+
397
  return text.strip()
398
 
399
  def _extract_docx(self, file_path: Path) -> str:
400
  """Extract text from DOCX with formatting"""
401
  doc = docx.Document(file_path)
402
  full_text = []
403
+
404
  for para in doc.paragraphs:
405
  full_text.append(para.text)
406
+
407
  for table in doc.tables:
408
  for row in table.rows:
409
  row_text = [cell.text for cell in row.cells]
410
  full_text.append(" | ".join(row_text))
411
+
412
  return "\n\n".join(full_text)
413
 
414
  def _extract_csv(self, file_path: Path) -> str:
 
426
  """Extract text from HTML with structure preservation"""
427
  with open(file_path) as f:
428
  soup = BeautifulSoup(f, 'html.parser')
429
+
430
  for script in soup(["script", "style"]):
431
  script.decompose()
432
+
433
  text = soup.get_text(separator='\n')
434
  lines = [line.strip() for line in text.splitlines() if line.strip()]
435
  return "\n\n".join(lines)
 
439
  try:
440
  with open(file_path, 'r', encoding='utf-8') as f:
441
  soup = BeautifulSoup(f, 'xml')
442
+
443
  for pi in soup.find_all(text=lambda text: isinstance(text, ProcessingInstruction)):
444
  pi.extract()
445
+
446
  text = soup.get_text(separator='\n')
447
+ lines = [line.strip()
448
+ for line in text.splitlines() if line.strip()]
449
  return "\n\n".join(lines)
450
  except Exception as e:
451
  raise Exception(f"Error processing XML file: {str(e)}")
 
454
  """Extract text from RTF files"""
455
  try:
456
  import striprtf.striprtf as striprtf
457
+
458
  with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
459
  rtf_text = f.read()
460
+
461
  plain_text = striprtf.rtf_to_text(rtf_text)
462
+ lines = [line.strip()
463
+ for line in plain_text.splitlines() if line.strip()]
464
  return "\n\n".join(lines)
465
  except ImportError:
466
  raise ImportError("striprtf package is required for RTF support.")
 
472
  try:
473
  # Use enhanced Excel processor
474
  processed_content = self.excel_processor.process_excel(file_path)
475
+
476
  # If processing fails, fall back to basic processing
477
  if not processed_content:
478
+ logging.warning(
479
+ f"Enhanced Excel processing failed for {file_path}, falling back to basic processing")
480
  return self._basic_excel_extract(file_path)
481
+
482
  return processed_content
483
+
484
  except Exception as e:
485
  logging.error(f"Error in enhanced Excel processing: {str(e)}")
486
  # Fall back to basic Excel processing
 
491
  try:
492
  excel_file = pd.ExcelFile(file_path)
493
  sheets_data = []
494
+
495
  for sheet_name in excel_file.sheet_names:
496
  df = pd.read_excel(excel_file, sheet_name=sheet_name)
497
  sheet_content = f"\nSheet: {sheet_name}\n"
498
  sheet_content += "=" * (len(sheet_name) + 7) + "\n"
499
+
500
  if df.empty:
501
  sheet_content += "Empty Sheet\n"
502
  else:
 
506
  max_cols=None,
507
  line_width=120
508
  ) + "\n"
509
+
510
  sheets_data.append(sheet_content)
511
+
512
  return "\n\n".join(sheets_data)
513
+
514
  except Exception as e:
515
  raise Exception(f"Error in basic Excel processing: {str(e)}")
516
 
 
522
  ) -> Dict:
523
  """Generate comprehensive metadata"""
524
  file_stat = file_path.stat()
525
+
526
  metadata = {
527
  'filename': file_path.name,
528
  'file_type': file_path.suffix,
 
535
  'character_count': len(content),
536
  'processing_timestamp': datetime.now().isoformat()
537
  }
538
+
539
  # Add Excel-specific metadata if applicable
540
  if file_path.suffix.lower() in ['.xlsx', '.xls']:
541
  try:
 
544
  metadata.update({'excel_metadata': excel_metadata})
545
  except Exception as e:
546
  logging.warning(f"Could not extract Excel metadata: {str(e)}")
547
+
548
  if additional_metadata:
549
  metadata.update(additional_metadata)
550
+
551
  return metadata
552
 
553
  def _calculate_hash(self, text: str) -> str:
554
  """Calculate SHA-256 hash of text"""
555
  return hashlib.sha256(text.encode()).hexdigest()
556
 
557
+ async def process_document(self, file_path: Union[str, Path], metadata: Optional[Dict] = None) -> Dict:
 
 
 
 
558
  """Process a document with metadata and content extraction"""
559
  file_path = Path(file_path)
560
+
561
  if not self._validate_file(file_path):
562
  raise ValueError(f"Invalid file: {file_path}")
563
 
564
  content = self._extract_content(file_path)
565
  doc_metadata = self._generate_metadata(file_path, content, metadata)
566
+
567
+ # Try enhanced splitting with validation
568
+ chunks = self.split_text(content)
569
+ if not self._validate_chunks(content, chunks):
570
+ logging.warning(
571
+ "Enhanced splitting failed validation, falling back to original splitter")
572
+ chunks = self.text_splitter.split_text(content)
573
+
574
+ # Add logging to verify chunk overlap
575
+ for i in range(len(chunks)-1):
576
+ logging.debug(f"Chunk {i} ends with: {chunks[i][-50:]}")
577
+ logging.debug(f"Chunk {i+1} starts with: {chunks[i+1][:50]}")
578
+ logging.debug(
579
+ f"Overlap size: {self._calculate_overlap_size(chunks[i], chunks[i+1])} characters")
580
+
581
  chunk_hashes = [self._calculate_hash(chunk) for chunk in chunks]
582
+
583
  return {
584
  'content': content,
585
  'chunks': chunks,
 
588
  'statistics': self._generate_statistics(content, chunks)
589
  }
590
 
591
+ def _calculate_overlap_size(self, chunk1: str, chunk2: str) -> int:
592
+ """Calculate the size of overlap between two chunks"""
593
+ min_len = min(len(chunk1), len(chunk2))
594
+ for i in range(min_len, 0, -1):
595
+ if chunk1[-i:] == chunk2[:i]:
596
+ return i
597
+ return 0
598
+
599
  def _validate_file(self, file_path: Path) -> bool:
600
  """Validate file type, size, and content"""
601
  if not file_path.exists():
602
  raise FileNotFoundError(f"File not found: {file_path}")
603
+
604
  if file_path.suffix.lower() not in self.supported_formats:
605
  raise ValueError(f"Unsupported file format: {file_path.suffix}")
606
+
607
  if file_path.stat().st_size > self.max_file_size:
608
  raise ValueError(f"File too large: {file_path}")
609
+
610
  if file_path.stat().st_size == 0:
611
  raise ValueError(f"Empty file: {file_path}")
612
+
613
  return True
614
 
615
  def _generate_statistics(self, content: str, chunks: List[str]) -> Dict:
 
629
  ) -> Dict[str, Dict]:
630
  """Process multiple documents in parallel"""
631
  results = {}
632
+
633
  if parallel:
634
  threads = []
635
  for file_path in file_paths:
 
639
  )
640
  threads.append(thread)
641
  thread.start()
642
+
643
  for thread in threads:
644
  thread.join()
645
  else:
646
  for file_path in file_paths:
647
  await self._process_and_store(file_path, results)
648
+
649
  return results
650
 
651
  async def _process_and_store(
 
658
  result = await self.process_document(file_path)
659
  results[str(file_path)] = result
660
  except Exception as e:
661
+ results[str(file_path)] = {'error': str(e)}
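End to end, the processor is awaited; a short usage sketch (the file path is illustrative):

    import asyncio

    async def main():
        processor = DocumentProcessor()
        result = await processor.process_document("docs/meeting_minutes.docx")
        print(result["metadata"]["filename"], "->", len(result["chunks"]), "chunks")
        print(result["statistics"])

    asyncio.run(main())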
src/vectorstores/__pycache__/chroma_vectorstore.cpython-312.pyc CHANGED
Binary files a/src/vectorstores/__pycache__/chroma_vectorstore.cpython-312.pyc and b/src/vectorstores/__pycache__/chroma_vectorstore.cpython-312.pyc differ
 
src/vectorstores/chroma_vectorstore.py CHANGED
@@ -6,17 +6,18 @@ import logging
6
 
7
  from .base_vectorstore import BaseVectorStore
8
 
 
9
  class ChromaVectorStore(BaseVectorStore):
10
  def __init__(
11
- self,
12
- embedding_function: Callable[[List[str]], List[List[float]]],
13
  persist_directory: str = './chroma_db',
14
  collection_name: str = "documents",
15
  client_settings: Optional[Dict[str, Any]] = None
16
  ):
17
  """
18
  Initialize Chroma Vector Store
19
-
20
  Args:
21
  embedding_function (Callable): Function to generate embeddings
22
  persist_directory (str): Directory to persist the vector store
@@ -31,23 +32,24 @@ class ChromaVectorStore(BaseVectorStore):
31
  self.client = chromadb.PersistentClient(settings=settings)
32
  self.collection = self.client.get_or_create_collection(
33
  name=collection_name,
34
- metadata={"hnsw:space": "cosine"} # Using cosine similarity by default
 
35
  )
36
  self.embedding_function = embedding_function
37
  except Exception as e:
38
  logging.error(f"Error initializing ChromaDB: {str(e)}")
39
  raise
40
-
41
  def add_documents(
42
- self,
43
- documents: List[str],
44
  embeddings: Optional[List[List[float]]] = None,
45
  metadatas: Optional[List[Dict[str, Any]]] = None,
46
  ids: Optional[List[str]] = None
47
  ) -> None:
48
  """
49
  Add documents to the vector store
50
-
51
  Args:
52
  documents (List[str]): List of document texts
53
  embeddings (Optional[List[List[float]]]): Pre-computed embeddings
@@ -63,32 +65,35 @@ class ChromaVectorStore(BaseVectorStore):
63
  embeddings = self.embedding_function(documents)
64
 
65
  if len(documents) != len(embeddings):
66
- raise ValueError("Number of documents and embeddings must match")
67
-
 
68
  # Use provided IDs or generate them
69
- doc_ids = ids if ids is not None else [f"doc_{i}" for i in range(len(documents))]
70
-
 
71
  # Prepare add parameters
72
  add_params = {
73
  "documents": documents,
74
  "embeddings": embeddings,
75
  "ids": doc_ids
76
  }
77
-
78
  # Only include metadatas if provided
79
  if metadatas is not None:
80
  if len(metadatas) != len(documents):
81
- raise ValueError("Number of documents and metadatas must match")
 
82
  add_params["metadatas"] = metadatas
83
-
84
  self.collection.add(**add_params)
85
  except Exception as e:
86
  logging.error(f"Error adding documents to ChromaDB: {str(e)}")
87
  raise
88
-
89
  def similarity_search(
90
- self,
91
- query_embedding: List[float],
92
  top_k: int = 3,
93
  **kwargs
94
  ) -> List[Dict[str, Any]]:
@@ -102,21 +107,24 @@ class ChromaVectorStore(BaseVectorStore):
102
  n_results=10, # Get more initial results
103
  include=['documents', 'metadatas', 'distances']
104
  )
105
-
106
  if not results or 'documents' not in results or not results['documents']:
107
  logging.warning("No results found in similarity search")
108
  return []
109
-
110
  formatted_results = []
111
  documents = results['documents'][0] # First query's results
112
- metadatas = results['metadatas'][0] if results.get('metadatas') else [None] * len(documents)
113
- distances = results['distances'][0] if results.get('distances') else [None] * len(documents)
114
-
 
 
115
  # Process all results
116
  for doc, meta, dist in zip(documents, metadatas, distances):
117
  # Convert distance to similarity score (1 is most similar, 0 is least)
118
- similarity_score = 1.0 - (dist or 0.0) if dist is not None else None
119
-
 
120
  # More permissive threshold and include all results for filtering
121
  if similarity_score is not None and similarity_score > 0.2: # Lower threshold
122
  formatted_results.append({
@@ -124,45 +132,47 @@ class ChromaVectorStore(BaseVectorStore):
124
  'metadata': meta or {},
125
  'score': similarity_score
126
  })
127
-
128
  # Sort by score and get top_k results
129
  formatted_results.sort(key=lambda x: x['score'] or 0, reverse=True)
130
-
131
  # Check if results are from same document and get consecutive chunks
132
  if formatted_results:
133
- first_doc_id = formatted_results[0]['metadata'].get('document_id')
 
134
  all_chunks_same_doc = []
135
-
136
  # Get all chunks from the same document
137
  for result in formatted_results:
138
  if result['metadata'].get('document_id') == first_doc_id:
139
  all_chunks_same_doc.append(result)
140
-
141
  # Sort chunks by their index to maintain document flow
142
  all_chunks_same_doc.sort(
143
  key=lambda x: x['metadata'].get('chunk_index', 0)
144
  )
145
-
146
  # Return either all chunks from same document or top_k results
147
  if len(all_chunks_same_doc) > 0:
148
  return all_chunks_same_doc[:top_k]
149
-
150
  return formatted_results[:top_k]
151
-
152
  except Exception as e:
153
- logging.error(f"Error performing similarity search in ChromaDB: {str(e)}")
 
154
  raise
155
-
156
  def get_all_documents(
157
  self,
158
  include_embeddings: bool = False
159
  ) -> List[Dict[str, Any]]:
160
  """
161
  Retrieve all documents from the vector store
162
-
163
  Args:
164
  include_embeddings (bool): Whether to include embeddings in the response
165
-
166
  Returns:
167
  List[Dict[str, Any]]: List of documents with their IDs and optionally embeddings
168
  """
@@ -170,45 +180,46 @@ class ChromaVectorStore(BaseVectorStore):
170
  include = ["documents", "metadatas"]
171
  if include_embeddings:
172
  include.append("embeddings")
173
-
174
  results = self.collection.get(
175
  include=include
176
  )
177
-
178
  if not results or 'documents' not in results:
179
  return []
180
-
181
  documents = []
182
  for i in range(len(results['documents'])):
183
  doc = {
184
  'id': str(i), # Generate sequential IDs
185
  'text': results['documents'][i],
186
  }
187
-
188
  if include_embeddings and 'embeddings' in results:
189
  doc['embedding'] = results['embeddings'][i]
190
-
191
  if 'metadatas' in results and results['metadatas'][i]:
192
  doc['metadata'] = results['metadatas'][i]
193
-
194
  # Use document_id from metadata if available
195
  if 'document_id' in results['metadatas'][i]:
196
  doc['id'] = results['metadatas'][i]['document_id']
197
-
198
  documents.append(doc)
199
-
200
  return documents
201
  except Exception as e:
202
- logging.error(f"Error retrieving documents from ChromaDB: {str(e)}")
 
203
  raise
204
-
205
  def get_document_chunks(self, document_id: str) -> List[Dict[str, Any]]:
206
  """
207
  Retrieve all chunks for a specific document
208
-
209
  Args:
210
  document_id (str): ID of the document to retrieve chunks for
211
-
212
  Returns:
213
  List[Dict[str, Any]]: List of document chunks with their metadata
214
  """
@@ -217,10 +228,10 @@ class ChromaVectorStore(BaseVectorStore):
217
  where={"document_id": document_id},
218
  include=["documents", "metadatas"]
219
  )
220
-
221
  if not results or 'documents' not in results:
222
  return []
223
-
224
  chunks = []
225
  for i in range(len(results['documents'])):
226
  chunk = {
@@ -228,10 +239,11 @@ class ChromaVectorStore(BaseVectorStore):
228
  'metadata': results['metadatas'][i] if results.get('metadatas') else None
229
  }
230
  chunks.append(chunk)
231
-
232
  # Sort by chunk_index if available
233
- chunks.sort(key=lambda x: x.get('metadata', {}).get('chunk_index', 0))
234
-
 
235
  return chunks
236
  except Exception as e:
237
  logging.error(f"Error retrieving document chunks: {str(e)}")
@@ -240,7 +252,7 @@ class ChromaVectorStore(BaseVectorStore):
240
  def delete_document(self, document_id: str) -> None:
241
  """
242
  Delete all chunks associated with a document_id
243
-
244
  Args:
245
  document_id (str): ID of the document to delete
246
  """
@@ -250,15 +262,17 @@ class ChromaVectorStore(BaseVectorStore):
250
  where={"document_id": document_id},
251
  include=["metadatas"]
252
  )
253
-
254
  if not results or 'ids' not in results:
255
  logging.warning(f"No document found with ID: {document_id}")
256
  return
257
-
258
  # Delete all chunks associated with the document
259
- chunk_ids = [f"{document_id}-chunk-{i}" for i in range(len(results['metadatas']))]
 
260
  self.collection.delete(ids=chunk_ids)
261
-
262
  except Exception as e:
263
- logging.error(f"Error deleting document {document_id} from ChromaDB: {str(e)}")
264
- raise
 
 
6
 
7
  from .base_vectorstore import BaseVectorStore
8
 
9
+
10
  class ChromaVectorStore(BaseVectorStore):
11
  def __init__(
12
+ self,
13
+ embedding_function: Callable[[List[str]], List[List[float]]],
14
  persist_directory: str = './chroma_db',
15
  collection_name: str = "documents",
16
  client_settings: Optional[Dict[str, Any]] = None
17
  ):
18
  """
19
  Initialize Chroma Vector Store
20
+
21
  Args:
22
  embedding_function (Callable): Function to generate embeddings
23
  persist_directory (str): Directory to persist the vector store
 
32
  self.client = chromadb.PersistentClient(settings=settings)
33
  self.collection = self.client.get_or_create_collection(
34
  name=collection_name,
35
+ # Using cosine similarity by default
36
+ metadata={"hnsw:space": "cosine"}
37
  )
38
  self.embedding_function = embedding_function
39
  except Exception as e:
40
  logging.error(f"Error initializing ChromaDB: {str(e)}")
41
  raise
42
+
43
  def add_documents(
44
+ self,
45
+ documents: List[str],
46
  embeddings: Optional[List[List[float]]] = None,
47
  metadatas: Optional[List[Dict[str, Any]]] = None,
48
  ids: Optional[List[str]] = None
49
  ) -> None:
50
  """
51
  Add documents to the vector store
52
+
53
  Args:
54
  documents (List[str]): List of document texts
55
  embeddings (Optional[List[List[float]]]): Pre-computed embeddings
 
65
  embeddings = self.embedding_function(documents)
66
 
67
  if len(documents) != len(embeddings):
68
+ raise ValueError(
69
+ "Number of documents and embeddings must match")
70
+
71
  # Use provided IDs or generate them
72
+ doc_ids = ids if ids is not None else [
73
+ f"doc_{i}" for i in range(len(documents))]
74
+
75
  # Prepare add parameters
76
  add_params = {
77
  "documents": documents,
78
  "embeddings": embeddings,
79
  "ids": doc_ids
80
  }
81
+
82
  # Only include metadatas if provided
83
  if metadatas is not None:
84
  if len(metadatas) != len(documents):
85
+ raise ValueError(
86
+ "Number of documents and metadatas must match")
87
  add_params["metadatas"] = metadatas
88
+
89
  self.collection.add(**add_params)
90
  except Exception as e:
91
  logging.error(f"Error adding documents to ChromaDB: {str(e)}")
92
  raise
93
+
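A short sketch of feeding chunks into the store, assuming a sentence-transformers embedding function (model name illustrative); the IDs follow the {document_id}-chunk-{i} convention that delete_document expects:

    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer("all-MiniLM-L6-v2")
    store = ChromaVectorStore(
        embedding_function=lambda texts: model.encode(texts).tolist(),
        persist_directory="./chroma_db",
    )
    store.add_documents(
        documents=["chunk one", "chunk two"],
        metadatas=[{"document_id": "doc-1", "chunk_index": 0},
                   {"document_id": "doc-1", "chunk_index": 1}],
        ids=["doc-1-chunk-0", "doc-1-chunk-1"],
    )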
94
  def similarity_search(
95
+ self,
96
+ query_embedding: List[float],
97
  top_k: int = 3,
98
  **kwargs
99
  ) -> List[Dict[str, Any]]:
 
107
  n_results=10, # Get more initial results
108
  include=['documents', 'metadatas', 'distances']
109
  )
110
+
111
  if not results or 'documents' not in results or not results['documents']:
112
  logging.warning("No results found in similarity search")
113
  return []
114
+
115
  formatted_results = []
116
  documents = results['documents'][0] # First query's results
117
+ metadatas = results['metadatas'][0] if results.get('metadatas') else [
118
+ None] * len(documents)
119
+ distances = results['distances'][0] if results.get('distances') else [
120
+ None] * len(documents)
121
+
122
  # Process all results
123
  for doc, meta, dist in zip(documents, metadatas, distances):
124
  # Convert distance to similarity score (1 is most similar, 0 is least)
125
+ similarity_score = 1.0 - \
126
+ (dist or 0.0) if dist is not None else None
127
+
128
  # More permissive threshold and include all results for filtering
129
  if similarity_score is not None and similarity_score > 0.2: # Lower threshold
130
  formatted_results.append({
 
132
  'metadata': meta or {},
133
  'score': similarity_score
134
  })
135
+
136
  # Sort by score and get top_k results
137
  formatted_results.sort(key=lambda x: x['score'] or 0, reverse=True)
138
+
139
  # Check if results are from same document and get consecutive chunks
140
  if formatted_results:
141
+ first_doc_id = formatted_results[0]['metadata'].get(
142
+ 'document_id')
143
  all_chunks_same_doc = []
144
+
145
  # Get all chunks from the same document
146
  for result in formatted_results:
147
  if result['metadata'].get('document_id') == first_doc_id:
148
  all_chunks_same_doc.append(result)
149
+
150
  # Sort chunks by their index to maintain document flow
151
  all_chunks_same_doc.sort(
152
  key=lambda x: x['metadata'].get('chunk_index', 0)
153
  )
154
+
155
  # Return either all chunks from same document or top_k results
156
  if len(all_chunks_same_doc) > 0:
157
  return all_chunks_same_doc[:top_k]
158
+
159
  return formatted_results[:top_k]
160
+
161
  except Exception as e:
162
+ logging.error(
163
+ f"Error performing similarity search in ChromaDB: {str(e)}")
164
  raise
165
+
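Because the collection is created with hnsw:space set to cosine, the returned distances are cosine distances and the code folds them into a similarity score; a compact restatement of that filter:

    def to_scored_hits(documents, metadatas, distances, threshold=0.2):
        hits = []
        for doc, meta, dist in zip(documents, metadatas, distances):
            score = 1.0 - dist if dist is not None else None  # cosine distance -> similarity
            if score is not None and score > threshold:
                hits.append({"text": doc, "metadata": meta or {}, "score": score})
        return sorted(hits, key=lambda h: h["score"], reverse=True)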
166
  def get_all_documents(
167
  self,
168
  include_embeddings: bool = False
169
  ) -> List[Dict[str, Any]]:
170
  """
171
  Retrieve all documents from the vector store
172
+
173
  Args:
174
  include_embeddings (bool): Whether to include embeddings in the response
175
+
176
  Returns:
177
  List[Dict[str, Any]]: List of documents with their IDs and optionally embeddings
178
  """
 
180
  include = ["documents", "metadatas"]
181
  if include_embeddings:
182
  include.append("embeddings")
183
+
184
  results = self.collection.get(
185
  include=include
186
  )
187
+
188
  if not results or 'documents' not in results:
189
  return []
190
+
191
  documents = []
192
  for i in range(len(results['documents'])):
193
  doc = {
194
  'id': str(i), # Generate sequential IDs
195
  'text': results['documents'][i],
196
  }
197
+
198
  if include_embeddings and 'embeddings' in results:
199
  doc['embedding'] = results['embeddings'][i]
200
+
201
  if 'metadatas' in results and results['metadatas'][i]:
202
  doc['metadata'] = results['metadatas'][i]
203
+
204
  # Use document_id from metadata if available
205
  if 'document_id' in results['metadatas'][i]:
206
  doc['id'] = results['metadatas'][i]['document_id']
207
+
208
  documents.append(doc)
209
+
210
  return documents
211
  except Exception as e:
212
+ logging.error(
213
+ f"Error retrieving documents from ChromaDB: {str(e)}")
214
  raise
215
+
216
  def get_document_chunks(self, document_id: str) -> List[Dict[str, Any]]:
217
  """
218
  Retrieve all chunks for a specific document
219
+
220
  Args:
221
  document_id (str): ID of the document to retrieve chunks for
222
+
223
  Returns:
224
  List[Dict[str, Any]]: List of document chunks with their metadata
225
  """
 
228
  where={"document_id": document_id},
229
  include=["documents", "metadatas"]
230
  )
231
+
232
  if not results or 'documents' not in results:
233
  return []
234
+
235
  chunks = []
236
  for i in range(len(results['documents'])):
237
  chunk = {
 
239
  'metadata': results['metadatas'][i] if results.get('metadatas') else None
240
  }
241
  chunks.append(chunk)
242
+
243
  # Sort by chunk_index if available
244
+ chunks.sort(key=lambda x: x.get(
245
+ 'metadata', {}).get('chunk_index', 0))
246
+
247
  return chunks
248
  except Exception as e:
249
  logging.error(f"Error retrieving document chunks: {str(e)}")
 
252
  def delete_document(self, document_id: str) -> None:
253
  """
254
  Delete all chunks associated with a document_id
255
+
256
  Args:
257
  document_id (str): ID of the document to delete
258
  """
 
262
  where={"document_id": document_id},
263
  include=["metadatas"]
264
  )
265
+
266
  if not results or 'ids' not in results:
267
  logging.warning(f"No document found with ID: {document_id}")
268
  return
269
+
270
  # Delete all chunks associated with the document
271
+ chunk_ids = [
272
+ f"{document_id}-chunk-{i}" for i in range(len(results['metadatas']))]
273
  self.collection.delete(ids=chunk_ids)
274
+
275
  except Exception as e:
276
+ logging.error(
277
+ f"Error deleting document {document_id} from ChromaDB: {str(e)}")
278
+ raise
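delete_document rebuilds chunk IDs from the {document_id}-chunk-{i} naming used at ingest time; if the IDs ever diverge from that convention, ChromaDB's metadata-filtered delete is an alternative (a hedged sketch, not what this commit does):

    # Let ChromaDB resolve the chunk IDs itself via the metadata filter.
    store.collection.delete(where={"document_id": "doc-1"})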
temp_downloads/1K608-Qr03M6nf5FhB6AajbHm8kjQujx1.xlsx CHANGED
Binary files a/temp_downloads/1K608-Qr03M6nf5FhB6AajbHm8kjQujx1.xlsx and b/temp_downloads/1K608-Qr03M6nf5FhB6AajbHm8kjQujx1.xlsx differ