Commit 415595f
Parent: f36ab64
Update chatbot with deployment configurations for Render
- .gitignore +49 -0
- DockerComposeConfiguration +0 -33
- Dockerfile +0 -25
- config/__pycache__/config.cpython-312.pyc +0 -0
- config/config.py +49 -31
- render.yaml +25 -0
- requirements.txt +38 -23
- runtime.txt +1 -0
- setup.py +0 -53
- src/__pycache__/main.cpython-312.pyc +0 -0
- src/agents/__pycache__/enhanced_context_manager.cpython-312.pyc +0 -0
- src/agents/__pycache__/rag_agent.cpython-312.pyc +0 -0
- src/agents/__pycache__/rag_agent_manager.cpython-312.pyc +0 -0
- src/agents/__pycache__/system_instructions_rag.cpython-312.pyc +0 -0
- src/agents/enhanced_context_manager.py +0 -202
- src/agents/rag_agent.py +144 -247
- src/agents/rag_agent_manager.py +0 -77
- src/agents/system_instructions_rag.py +177 -338
- src/implementations/__pycache__/document_service.cpython-312.pyc +0 -0
- src/main.py +258 -138
- src/utils/__pycache__/database_cleanup.cpython-312.pyc +0 -0
- src/utils/__pycache__/document_processor.cpython-312.pyc +0 -0
- src/utils/database_cleanup.py +144 -91
- src/utils/document_processor.py +365 -64
- src/vectorstores/__pycache__/chroma_vectorstore.cpython-312.pyc +0 -0
- src/vectorstores/chroma_vectorstore.py +75 -61
- temp_downloads/1K608-Qr03M6nf5FhB6AajbHm8kjQujx1.xlsx +0 -0
.gitignore
ADDED
@@ -0,0 +1,49 @@
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# Virtual Environment
+venv/
+ENV/
+
+# Environment Variables
+.env
+.env.local
+.env.*.local
+
+# IDE
+.idea/
+.vscode/
+*.swp
+*.swo
+
+# Logs
+*.log
+
+# Database
+chroma_db/
+uploads/
+temp_downloads/
+
+# OS
+.DS_Store
+Thumbs.db
DockerComposeConfiguration
DELETED
@@ -1,33 +0,0 @@
-version: '3.8'
-
-services:
-  app:
-    build: .
-    ports:
-      - "8000:8000"
-    env_file:
-      - .env
-    volumes:
-      - ./:/app
-    depends_on:
-      - ollama
-
-  ollama:
-    image: ollama/ollama
-    ports:
-      - "11434:11434"
-    volumes:
-      - ollama-data:/root/.ollama
-
-  chroma:
-    image: chromadb/chroma
-    ports:
-      - "8000:8000"
-    volumes:
-      - chroma-data:/chroma
-    environment:
-      - PERSIST_DIRECTORY=/chroma
-
-volumes:
-  ollama-data:
-  chroma-data:
Dockerfile
DELETED
@@ -1,25 +0,0 @@
-# Use an official Python runtime as a parent image
-FROM python:3.9-slim
-
-# Set the working directory in the container
-WORKDIR /app
-
-# Install system dependencies
-RUN apt-get update && apt-get install -y \
-    build-essential \
-    && rm -rf /var/lib/apt/lists/*
-
-# Copy the current directory contents into the container at /app
-COPY . /app
-
-# Install any needed packages specified in requirements.txt
-RUN pip install --no-cache-dir -r requirements.txt
-
-# Make port 8000 available to the world outside this container
-EXPOSE 8000
-
-# Define environment variable
-ENV NAME RAGChatbot
-
-# Run the application
-CMD ["uvicorn", "src.main:app", "--host", "0.0.0.0", "--port", "8000"]
config/__pycache__/config.cpython-312.pyc
CHANGED
Binary files a/config/__pycache__/config.cpython-312.pyc and b/config/__pycache__/config.cpython-312.pyc differ
config/config.py
CHANGED
@@ -7,32 +7,33 @@ from google_auth_oauthlib.flow import Flow
 # Load environment variables
 load_dotenv()

+
 class Settings:
     # OpenAI Configuration
     OPENAI_API_KEY = os.getenv('OPENAI_API_KEY', '')
     OPENAI_MODEL = os.getenv('OPENAI_MODEL', 'gpt-3.5-turbo')
-
+
     ADMIN_API_KEY = 'aca4081f-6ff2-434c-843b-98f60285c499'

     # Ollama Configuration
     OLLAMA_BASE_URL = os.getenv('OLLAMA_BASE_URL', 'http://localhost:11434')
     OLLAMA_MODEL = os.getenv('OLLAMA_MODEL', 'llama2')
-
+
     # Anthropic Configuration
     ANTHROPIC_API_KEY = os.getenv('ANTHROPIC_API_KEY', '')
-
+
     # Embedding Configuration
     EMBEDDING_MODEL = os.getenv('EMBEDDING_MODEL', 'all-MiniLM-L6-v2')
-
+
     # Vector Store Configuration
     CHROMA_PATH = os.getenv('CHROMA_PATH', './chroma_db')
-
+
     # MongoDB Configuration
     MONGODB_URI = os.getenv('MONGODB_URI', 'mongodb://localhost:27017')
-
+
     # Feedback Configuration
     MAX_RATING = int(os.getenv('MAX_RATING', '5'))
-
+
     # Temporary directory for downloaded files
     TEMP_DOWNLOAD_DIR = os.getenv('TEMP_DOWNLOAD_DIR', './temp_downloads')

@@ -40,27 +41,44 @@ class Settings:
     DEBUG = os.getenv('DEBUG', 'False') == 'True'

     # Google Drive Configuration
-    GOOGLE_DRIVE_FOLDER_ID=os.getenv('GOOGLE_DRIVE_FOLDER_ID', '')
-    GOOGLE_SERVICE_ACCOUNT_PATH = os.getenv(
+    GOOGLE_DRIVE_FOLDER_ID = os.getenv('GOOGLE_DRIVE_FOLDER_ID', '')
+    GOOGLE_SERVICE_ACCOUNT_PATH = os.getenv(
+        'GOOGLE_SERVICE_ACCOUNT_PATH', 'service_account.json')
+
+    # Use explicit type conversion to ensure correct types
+    DOCUMENT_PROCESSOR = {
+        'chunk_size': int(os.getenv('DOCUMENT_CHUNK_SIZE', '1000')),
+        'chunk_overlap': int(os.getenv('DOCUMENT_CHUNK_OVERLAP', '200')),
+        # 20MB in bytes
+        'max_file_size': int(os.getenv('DOCUMENT_MAX_FILE_SIZE', str(20 * 1024 * 1024))),
+        'supported_formats': [
+            '.txt', '.pdf', '.docx', '.csv', '.json',
+            '.html', '.md', '.xml', '.rtf', '.xlsx', '.xls'
+        ]
+    }
+
+    @classmethod
+    def get_document_processor_settings(cls) -> dict:
+        """
+        Get document processor settings with validation
+
+        Returns:
+            dict: Validated document processor settings
+        """
+        settings = cls.DOCUMENT_PROCESSOR.copy()
+
+        # Ensure positive values for numeric settings
+        settings['chunk_size'] = max(
+            100, settings['chunk_size'])  # Minimum 100
+        settings['chunk_overlap'] = min(
+            settings['chunk_overlap'],
+            # Ensure overlap is less than chunk size
+            settings['chunk_size'] - 50
+        )
+        settings['max_file_size'] = max(
+            1024 * 1024, settings['max_file_size'])  # Minimum 1MB
+
+        return settings
+
+
+settings = Settings()
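Note: the new get_document_processor_settings() clamps its values rather than raising; a minimal usage sketch of the resulting behavior with the defaults above (assuming the package layout makes this importable as config.config):

# Hypothetical usage sketch for Settings.get_document_processor_settings().
from config.config import settings

processor = settings.get_document_processor_settings()
# chunk_size is floored at 100, chunk_overlap is capped at chunk_size - 50,
# and max_file_size is floored at 1 MB, so the defaults pass through unchanged:
assert processor['chunk_size'] == 1000
assert processor['chunk_overlap'] == 200
assert processor['max_file_size'] == 20 * 1024 * 1024
print(processor['supported_formats'])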
render.yaml
ADDED
@@ -0,0 +1,25 @@
+services:
+  - type: web
+    name: chatbot-backend
+    env: python
+    region: ohio  # Choose appropriate region
+    plan: starter  # Or choose appropriate plan
+    buildCommand: pip install -r requirements.txt
+    startCommand: uvicorn src.main:app --host 0.0.0.0 --port $PORT
+    envVars:
+      - key: MONGODB_URI
+        sync: false
+      - key: OPENAI_API_KEY
+        sync: false
+      - key: ANTHROPIC_API_KEY
+        sync: false
+      - key: ADMIN_API_KEY
+        sync: false
+      - key: CHROMA_PATH
+        value: ./chroma_db
+      - key: DEBUG
+        value: "False"
+      - key: ENVIRONMENT
+        value: "production"
+    healthCheckPath: /health
+    autoDeploy: true
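Note: healthCheckPath: /health assumes src.main:app exposes that route; a minimal sketch of such an endpoint (hypothetical, since the full src/main.py diff is not shown here):

# Hypothetical /health route matching render.yaml's healthCheckPath.
from fastapi import FastAPI

app = FastAPI()

@app.get("/health")
async def health_check():
    # Render treats any 2xx response as healthy.
    return {"status": "ok"}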
requirements.txt
CHANGED
@@ -1,23 +1,38 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
+fastapi
+uvicorn
+torch
+transformers
+openai
+anthropic
+sentence-transformers
+accelerate
+bitsandbytes
+pydantic
+email-validator
+numpy
+pandas
+scipy
+scikit-learn
+pymongo
+motor
+chromadb
+aiosqlite
+python-dotenv
+box
+PyPDF2
+python-docx
+python-magic-bin==0.4.14
+openpyxl
+xlrd
+striprtf
+beautifulsoup4
+pydrive2==1.14.0
+google-auth-oauthlib==0.4.6
+requests
+tqdm
+matplotlib
+plotly
+tiktoken
+psutil
+huggingface_hub
+setuptools
runtime.txt
ADDED
@@ -0,0 +1 @@
+python-3.12
setup.py
DELETED
@@ -1,53 +0,0 @@
|
|
1 |
-
from setuptools import setup, find_packages
|
2 |
-
|
3 |
-
setup(
|
4 |
-
name="chatbot",
|
5 |
-
version="1.0.0",
|
6 |
-
packages=find_packages(),
|
7 |
-
install_requires=[
|
8 |
-
# Web Framework
|
9 |
-
"fastapi",
|
10 |
-
"uvicorn",
|
11 |
-
|
12 |
-
# AI/ML
|
13 |
-
"torch",
|
14 |
-
"transformers",
|
15 |
-
"sentence-transformers",
|
16 |
-
"huggingface_hub",
|
17 |
-
|
18 |
-
# LLM Providers
|
19 |
-
"openai",
|
20 |
-
"anthropic",
|
21 |
-
"ollama",
|
22 |
-
|
23 |
-
# Data Validation & Processing
|
24 |
-
"pydantic",
|
25 |
-
"email-validator",
|
26 |
-
"numpy",
|
27 |
-
"pandas",
|
28 |
-
|
29 |
-
# Database & Storage
|
30 |
-
"pymongo",
|
31 |
-
"motor",
|
32 |
-
"chromadb",
|
33 |
-
"aiosqlite",
|
34 |
-
|
35 |
-
# Document Processing
|
36 |
-
"PyPDF2",
|
37 |
-
"python-docx",
|
38 |
-
"python-magic-bin==0.4.14",
|
39 |
-
"openpyxl",
|
40 |
-
"xlrd",
|
41 |
-
"striprtf",
|
42 |
-
"beautifulsoup4",
|
43 |
-
|
44 |
-
# Utilities
|
45 |
-
"python-dotenv",
|
46 |
-
"requests",
|
47 |
-
"tiktoken",
|
48 |
-
"psutil",
|
49 |
-
|
50 |
-
# Google Integration
|
51 |
-
"google-auth-oauthlib==0.4.6"
|
52 |
-
]
|
53 |
-
)
|
src/__pycache__/main.cpython-312.pyc
CHANGED
Binary files a/src/__pycache__/main.cpython-312.pyc and b/src/__pycache__/main.cpython-312.pyc differ

src/agents/__pycache__/enhanced_context_manager.cpython-312.pyc
CHANGED
Binary files a/src/agents/__pycache__/enhanced_context_manager.cpython-312.pyc and b/src/agents/__pycache__/enhanced_context_manager.cpython-312.pyc differ

src/agents/__pycache__/rag_agent.cpython-312.pyc
CHANGED
Binary files a/src/agents/__pycache__/rag_agent.cpython-312.pyc and b/src/agents/__pycache__/rag_agent.cpython-312.pyc differ

src/agents/__pycache__/rag_agent_manager.cpython-312.pyc
CHANGED
Binary files a/src/agents/__pycache__/rag_agent_manager.cpython-312.pyc and b/src/agents/__pycache__/rag_agent_manager.cpython-312.pyc differ

src/agents/__pycache__/system_instructions_rag.cpython-312.pyc
CHANGED
Binary files a/src/agents/__pycache__/system_instructions_rag.cpython-312.pyc and b/src/agents/__pycache__/system_instructions_rag.cpython-312.pyc differ
src/agents/enhanced_context_manager.py
DELETED
@@ -1,202 +0,0 @@
-from typing import List, Dict, Optional, Tuple
-import spacy
-from collections import defaultdict
-
-class EnhancedContextManager:
-    def __init__(self):
-        """Initialize the context manager with NLP components"""
-        # Load spaCy model for NER and dependency parsing
-        self.nlp = spacy.load("en_core_web_sm")
-        # Track entities and their mentions across conversation
-        self.entity_mentions = defaultdict(list)
-        # Track conversation turns
-        self.conversation_turns = []
-        # Track last processed entity
-        self.last_entity = None
-        # Track last full response context
-        self.last_full_context = None
-
-    def process_turn(self, query: str, response: str) -> None:
-        """Process a conversation turn to extract and track entities"""
-        # Parse query and response
-        query_doc = self.nlp(query)
-        response_doc = self.nlp(response)
-
-        # Extract and track entities from both query and response
-        turn_entities = self._extract_entities(query_doc, response_doc)
-
-        # Store the turn with its entities
-        self.conversation_turns.append({
-            'query': query,
-            'response': response,
-            'entities': turn_entities
-        })
-
-        # Update entity mentions
-        for entity, info in turn_entities.items():
-            self.entity_mentions[entity].append({
-                'turn_index': len(self.conversation_turns) - 1,
-                'info': info
-            })
-
-        # Update last entity and full context
-        if turn_entities:
-            # Prioritize entities in response, then query
-            primary_entity = (
-                list(turn_entities.keys())[0] if turn_entities
-                else None
-            )
-            self.last_entity = primary_entity
-
-        # Store full context for potential reference
-        self.last_full_context = f"{query} {response}"
-
-    def _extract_entities(self, query_doc, response_doc) -> Dict:
-        """Extract named entities and their properties"""
-        entities = {}
-
-        # Process both query and response documents
-        for doc in [query_doc, response_doc]:
-            for ent in doc.ents:
-                # Store entity with its type and text
-                entities[ent.text] = {
-                    'type': ent.label_,
-                    'text': ent.text,
-                    'mentions': [tok.text for tok in doc if tok.head == ent.root]
-                }
-
-        return entities
-
-    def resolve_pronouns(self, current_query: str) -> Optional[str]:
-        """
-        Resolve pronouns in the current query based on conversation history
-
-        Args:
-            current_query (str): Current query with potential pronouns
-
-        Returns:
-            Optional[str]: Query with resolved pronouns, or None if no resolution needed
-        """
-        if not self.conversation_turns:
-            return None
-
-        query_doc = self.nlp(current_query)
-
-        # Find pronouns in current query
-        pronouns = [token for token in query_doc if token.pos_ == "PRON"]
-        if not pronouns:
-            return None
-
-        for pronoun in pronouns:
-            replacement = self._find_antecedent(pronoun.text)
-            if replacement:
-                # Replace the pronoun with the most likely antecedent
-                new_query = current_query.replace(pronoun.text, replacement)
-                return new_query
-
-        return None
-
-    def _find_antecedent(self, pronoun: str) -> Optional[str]:
-        """
-        Find the most recent matching entity for a pronoun
-
-        Args:
-            pronoun (str): Pronoun to resolve
-
-        Returns:
-            Optional[str]: Resolved entity or None
-        """
-        # Pronoun to gender/number mapping
-        pronoun_properties = {
-            'he': {'gender': 'male', 'number': 'singular'},
-            'she': {'gender': 'female', 'number': 'singular'},
-            'they': {'gender': None, 'number': 'plural'},
-            'his': {'gender': 'male', 'number': 'singular'},
-            'her': {'gender': 'female', 'number': 'singular'},
-            'their': {'gender': None, 'number': 'plural'}
-        }
-
-        # Normalize pronoun
-        pronoun_lower = pronoun.lower().rstrip('s')
-
-        # If not a known pronoun, return None
-        if pronoun_lower not in pronoun_properties:
-            return None
-
-        # If a named entity was recently mentioned, use it first
-        if self.last_entity:
-            return self.last_entity
-
-        # Fallback to last full context if no specific entity found
-        if self.last_full_context:
-            return self.last_full_context.split()[0]
-
-        return None
-
-    def enhance_query(self, current_query: str) -> str:
-        """
-        Enhance current query with context and resolved pronouns
-
-        Args:
-            current_query (str): Original query
-
-        Returns:
-            str: Enhanced query with additional context
-        """
-        # First try to resolve pronouns
-        resolved_query = self.resolve_pronouns(current_query)
-
-        # If pronouns are resolved, use the resolved query
-        if resolved_query:
-            return resolved_query
-
-        # Get relevant context
-        context = self._get_relevant_context(current_query)
-
-        # If context found, prepend it to the query
-        if context:
-            return f"{context} {current_query}"
-
-        # If no context resolution, return original query
-        return current_query
-
-    def _get_relevant_context(self, query: str) -> Optional[str]:
-        """
-        Get relevant context from conversation history
-
-        Args:
-            query (str): Current query
-
-        Returns:
-            Optional[str]: Relevant context or None
-        """
-        if not self.conversation_turns:
-            return None
-
-        # Get the most recent turn
-        recent_turn = self.conversation_turns[-1]
-
-        # If the current query contains a pronoun and we have last full context
-        if any(token.pos_ == "PRON" for token in self.nlp(query)):
-            return self.last_full_context
-
-        return None
-
-    def get_conversation_context(self) -> List[Dict]:
-        """Get processed conversation context"""
-        return self.conversation_turns
-
-    def record_last_context(self, last_context: Optional[str] = None) -> None:
-        """
-        Manually record last context if needed
-
-        Args:
-            last_context (Optional[str]): Last context to manually set
-        """
-        if last_context:
-            self.last_full_context = last_context
-            # Try to extract an entity from the context
-            doc = self.nlp(last_context)
-            entities = [ent.text for ent in doc.ents]
-            if entities:
-                self.last_entity = entities[0]
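Note: for reference, the deleted manager was driven roughly like this (a sketch using only methods defined above; assumes spaCy's en_core_web_sm model is installed):

# Hypothetical driver for the removed EnhancedContextManager.
manager = EnhancedContextManager()
manager.process_turn(
    "Who is the project lead?",
    "Alice Johnson leads the project.",
)
# enhance_query() resolves "her" against the most recently tracked entity.
print(manager.enhance_query("What is her role?"))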
src/agents/rag_agent.py
CHANGED
@@ -1,8 +1,8 @@
-
+# src/agents/rag_agent.py
+from typing import List, Optional, Tuple, Dict
 import uuid

 from .excel_aware_rag import ExcelAwareRAGAgent
-from .enhanced_context_manager import EnhancedContextManager
 from ..llms.base_llm import BaseLLM
 from src.embeddings.base_embedding import BaseEmbedding
 from src.vectorstores.base_vectorstore import BaseVectorStore
@@ -11,6 +11,7 @@ from src.db.mongodb_store import MongoDBStore
 from src.models.rag import RAGResponse
 from src.utils.logger import logger

+
 class RAGAgent(ExcelAwareRAGAgent):
     def __init__(
         self,
@@ -21,7 +22,17 @@ class RAGAgent(ExcelAwareRAGAgent):
         max_history_tokens: int = 4000,
         max_history_messages: int = 10
     ):
-        """
+        """
+        Initialize RAG Agent
+
+        Args:
+            llm (BaseLLM): Language model instance
+            embedding (BaseEmbedding): Embedding model instance
+            vector_store (BaseVectorStore): Vector store instance
+            mongodb (MongoDBStore): MongoDB store instance
+            max_history_tokens (int): Maximum tokens in conversation history
+            max_history_messages (int): Maximum messages to keep in history
+        """
         super().__init__()  # Initialize ExcelAwareRAGAgent
         self.llm = llm
         self.embedding = embedding
@@ -31,9 +42,6 @@ class RAGAgent(ExcelAwareRAGAgent):
             max_tokens=max_history_tokens,
             max_messages=max_history_messages
         )
-        # Add enhanced context management while preserving existing functionality
-        self.context_manager = EnhancedContextManager()
-        logger.info("RAGAgent initialized with enhanced context management")

     async def generate_response(
         self,
@@ -41,46 +49,19 @@ class RAGAgent(ExcelAwareRAGAgent):
         conversation_id: Optional[str],
         temperature: float,
         max_tokens: Optional[int] = None,
-        context_docs: Optional[List[str]] = None,
-        stream: bool = False,
-        custom_roles: Optional[List[Dict[str, str]]] = None
+        context_docs: Optional[List[str]] = None
     ) -> RAGResponse:
-        """
-        Generate a response with comprehensive context and role management
-
-        Args:
-            query (str): User query
-            conversation_id (Optional[str]): Conversation identifier
-            temperature (float): LLM temperature for response generation
-            max_tokens (Optional[int]): Maximum tokens for response
-            context_docs (Optional[List[str]]): Pre-retrieved context documents
-            stream (bool): Whether to stream the response
-            custom_roles (Optional[List[Dict[str, str]]]): Custom role instructions
-
-        Returns:
-            RAGResponse: Generated response with context and metadata
-        """
+        """Generate response with specific handling for different query types"""
         try:
-            # Apply custom roles if provided
-            if custom_roles:
-                for role in custom_roles:
-                    # Modify query or context based on role
-                    if role.get('name') == 'introduction_specialist':
-                        query += " Provide a concise, welcoming response."
-                    elif role.get('name') == 'knowledge_based_specialist':
-                        query += " Ensure response is precise and directly from available knowledge."
-
-            # Introduction Handling
+            # First, check if this is an introduction/welcome message query
             is_introduction = (
                 "wants support" in query and
                 "This is Introduction" in query and
                 ("A new user with name:" in query or "An old user with name:" in query)
             )

             if is_introduction:
+                # Handle introduction message - no context needed
                 welcome_message = self._handle_contact_query(query)
                 return RAGResponse(
                     response=welcome_message,
@@ -89,118 +70,70 @@ class RAGAgent(ExcelAwareRAGAgent):
                     scores=None
                 )

+            # Get conversation history if conversation_id exists
             history = []
-            last_context = None
             if conversation_id:
-                logger.info(f"Retrieving conversation history for ID: {conversation_id}")
                 history = await self.mongodb.get_recent_messages(
                     conversation_id,
                     limit=self.conversation_manager.max_messages
                 )

+                # Get relevant history within token limits
                 history = self.conversation_manager.get_relevant_history(
                     messages=history,
                     current_query=query
                 )

-            for msg in history:
-                self.context_manager.process_turn(
-                    msg.get('query', ''),
-                    msg.get('response', '')
-                )
-
-            # Get last context if available
-            if history and history[-1].get('response'):
-                last_context = history[-1]['response']
-
-            # Query Enhancement
-            enhanced_query = self.context_manager.enhance_query(query)
-
-            # Manual Pronoun Handling Fallback
-            if enhanced_query == query:
-                pronoun_map = {
-                    'his': 'he',
-                    'her': 'she',
-                    'their': 'they'
-                }
-                words = query.lower().split()
-                for pronoun, replacement in pronoun_map.items():
-                    if pronoun in words:
-                        # Try to use last context
-                        if last_context:
-                            self.context_manager.record_last_context(last_context)
-                            enhanced_query = self.context_manager.enhance_query(query)
-                        break
-
-            logger.info(f"Enhanced query: {enhanced_query}")
-
-            # Context Retrieval
+            # Retrieve context if not provided
             if not context_docs:
-                logger.info("Retrieving context for enhanced query")
                 context_docs, sources, scores = await self.retrieve_context(
+                    query=query,
                     conversation_history=history
                 )
             else:
-                sources =
+                sources = None
                 scores = None

+            # Check if we have any relevant context
             if not context_docs:
-                logger.info("No relevant context found")
                 return RAGResponse(
                     response="Information about this is not available, do you want to inquire about something else?",
                     context_docs=[],
                     sources=[],
                     scores=None
                 )

+            # Check if this is an Excel-related query
             has_excel_content = any('Sheet:' in doc for doc in context_docs)
             if has_excel_content:
-                logger.info("Processing Excel-specific content")
                 try:
-                    context_docs = self._process_excel_context(
+                    context_docs = self._process_excel_context(
+                        context_docs, query)
                 except Exception as e:
                     logger.warning(f"Error processing Excel context: {str(e)}")

+            # Generate prompt with context and history
+            augmented_prompt = self.conversation_manager.generate_prompt_with_history(
+                current_query=query,
                 history=history,
                 context_docs=context_docs
             )

-            if stream:
-                # TODO: Implement actual streaming logic
-                # This is a placeholder and needs proper implementation
-                logger.warning("Streaming not fully implemented")
-
-            # Standard Response Generation
+            # Generate initial response
             response = self.llm.generate(
-                prompt=
+                prompt=augmented_prompt,
                 temperature=temperature,
                 max_tokens=max_tokens
             )

+            # Clean the response
             cleaned_response = self._clean_response(response)

+            # For Excel queries, enhance the response
             if has_excel_content:
                 try:
                     enhanced_response = await self.enhance_excel_response(
-                        query=
+                        query=query,
                         response=cleaned_response,
                         context_docs=context_docs
                     )
@@ -209,158 +142,122 @@ class RAGAgent(ExcelAwareRAGAgent):
             except Exception as e:
                 logger.warning(f"Error enhancing Excel response: {str(e)}")

-            self.context_manager.process_turn(query, cleaned_response)
-
-            # Metadata Generation
-            metadata = {
-                'llm_provider': getattr(self.llm, 'model_name', 'unknown'),
-                'temperature': temperature,
-                'conversation_id': conversation_id,
-                'context_sources': sources,
-                'has_excel_content': has_excel_content
-            }
-
-            logger.info("Successfully generated response")
+            # Return the final response
             return RAGResponse(
                 response=cleaned_response,
                 context_docs=context_docs,
                 sources=sources,
-                scores=scores,
-                metadata=metadata  # Added metadata
+                scores=scores
             )

         except Exception as e:
-            logger.error(f"Error in
+            logger.error(f"Error in SystemInstructionsRAGAgent: {str(e)}")
             raise

+    def _create_response_prompt(self, query: str, context_docs: List[str]) -> str:
+        """
+        Create prompt for generating response from context
+
+        Args:
+            query (str): User query
+            context_docs (List[str]): Retrieved context documents
+
+        Returns:
+            str: Formatted prompt for the LLM
+        """
+        if not context_docs:
+            return f"Query: {query}\nResponse: Information about this is not available, do you want to inquire about something else?"
+
+        # Format context documents
+        formatted_context = "\n\n".join(
+            f"Context {i+1}:\n{doc.strip()}"
+            for i, doc in enumerate(context_docs)
+            if doc and doc.strip()
+        )
+
+        # Build the prompt with detailed instructions
+        prompt = f"""You are a knowledgeable assistant. Use the following context to answer the query accurately and informatively.
+
+Context Information:
+{formatted_context}
+
+Query: {query}
+
+Instructions:
+1. Base your response ONLY on the information provided in the context above
+2. If the context contains numbers, statistics, or specific details, include them in your response
+3. Keep your response focused and relevant to the query
+4. Use clear and professional language
+5. If the context includes technical terms, explain them appropriately
+6. Do not make assumptions or add information not present in the context
+7. If specific sections of a report are mentioned, maintain their original structure
+8. Format the response in a clear, readable manner
+9. If the context includes chronological information, maintain the proper sequence
+
+Response:"""
+
+        return prompt
+
     async def retrieve_context(
         self,
         query: str,
         conversation_history: Optional[List[Dict]] = None,
         top_k: int = 3
     ) -> Tuple[List[str], List[Dict], Optional[List[float]]]:
-        """
+        """
+        Retrieve context with conversation history enhancement
+        """
+        # Enhance query with conversation history
+        if conversation_history:
+            recent_queries = [
+                msg['query'] for msg in conversation_history[-2:]
+                if msg.get('query')
+            ]
+            enhanced_query = " ".join([*recent_queries, query])
+        else:
+            enhanced_query = query
+
+        # Debug log the enhanced query
+        logger.info(f"Enhanced query: {enhanced_query}")
+
         # Embed the enhanced query
         query_embedding = self.embedding.embed_query(enhanced_query)

         # Debug log embedding shape
         logger.info(f"Query embedding shape: {len(query_embedding)}")

         # Retrieve similar documents
         results = self.vector_store.similarity_search(
             query_embedding,
             top_k=top_k
         )

-        if not results:
-            logger.info("No results found in similarity search")
-            return [], [], None
+        # Debug log search results
+        logger.info(f"Number of search results: {len(results)}")
+        for i, result in enumerate(results):
+            logger.info(f"Result {i} score: {result.get('score', 'N/A')}")
+            logger.info(
+                f"Result {i} text preview: {result.get('text', '')[:100]}...")

         # Process results
         documents = [doc['text'] for doc in results]
         sources = [self._convert_metadata_to_strings(doc['metadata'])
                    for doc in results]
         scores = [doc['score'] for doc in results
                   if doc.get('score') is not None]

         # Return scores only if available for all documents
         if len(scores) != len(documents):
             scores = None

+        return documents, sources, scores

-    def _clean_response(self, response: str) -> str:
-        """Clean response text while preserving key information"""
-        if not response:
-            return response
-
-        # Keep only the most common phrases to remove
-        phrases_to_remove = [
-            "Based on the context,",
-            "According to the documents,",
-            "From the information available,",
-            "Based on the provided information,",
-            "I apologize,"
-        ]
-
-        cleaned_response = response
-        for phrase in phrases_to_remove:
-            cleaned_response = cleaned_response.replace(phrase, "").strip()
-
-        cleaned_response = " ".join(cleaned_response.split())
-
-        if not cleaned_response:
-            return response
-
-        if cleaned_response[0].islower():
-            cleaned_response = cleaned_response[0].upper() + cleaned_response[1:]
-
-        return cleaned_response

     def _convert_metadata_to_strings(self, metadata: Dict) -> Dict:
-        """Convert metadata values to strings"""
-        return metadata
-
-    def _handle_contact_query(self, query: str) -> str:
-        """Handle contact/introduction queries"""
-        try:
-            name_start = query.find('name: "') + 7
-            name_end = query.find('"', name_start)
-            name = query[name_start:name_end] if name_start > 6 and name_end != -1 else "there"
-
-            is_returning = (
-                "An old user with name:" in query and
-                "wants support again" in query
-            )
-
-            return f"Welcome back {name}, How can I help you?" if is_returning else f"Welcome {name}, How can I help you?"
-
-        except Exception as e:
-            logger.error(f"Error handling contact query: {str(e)}")
-            return "Welcome, How can I help you?"
+        """Convert numeric metadata values to strings"""
+        converted = {}
+        for key, value in metadata.items():
+            if isinstance(value, (int, float)):
+                converted[key] = str(value)
+            else:
+                converted[key] = value
+        return converted
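Note: the rewritten _convert_metadata_to_strings now stringifies only numeric values instead of returning the metadata untouched; a standalone copy of the new logic for illustration:

# Standalone copy of the new _convert_metadata_to_strings logic.
def convert_metadata_to_strings(metadata: dict) -> dict:
    converted = {}
    for key, value in metadata.items():
        if isinstance(value, (int, float)):
            converted[key] = str(value)
        else:
            converted[key] = value
    return converted

print(convert_metadata_to_strings({'page': 3, 'score': 0.92, 'source': 'a.pdf'}))
# -> {'page': '3', 'score': '0.92', 'source': 'a.pdf'}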
src/agents/rag_agent_manager.py
DELETED
@@ -1,77 +0,0 @@
-# src/agents/rag_agent_manager.py
-from typing import Optional
-import weakref
-
-from src.agents.system_instructions_rag import SystemInstructionsRAGAgent
-from src.llms.base_llm import BaseLLM
-from src.embeddings.base_embedding import BaseEmbedding
-from src.vectorstores.base_vectorstore import BaseVectorStore
-from src.db.mongodb_store import MongoDBStore
-from src.utils.logger import logger
-
-class RAGAgentManager:
-    """
-    Singleton manager for RAG Agent instances with intelligent caching
-    """
-    _instance = None
-
-    def __new__(cls):
-        if not cls._instance:
-            cls._instance = super().__new__(cls)
-        return cls._instance
-
-    def __init__(self):
-        # Ensure this is only initialized once
-        if not hasattr(self, '_initialized'):
-            self._rag_agent = None
-            self._initialized = True
-
-    def get_rag_agent(
-        self,
-        llm: BaseLLM,
-        embedding_model: BaseEmbedding,
-        vector_store: BaseVectorStore,
-        mongodb: MongoDBStore
-    ) -> SystemInstructionsRAGAgent:
-        """
-        Get or create a singleton RAG agent instance with intelligent caching
-
-        Args:
-            llm: Language Model instance
-            embedding_model: Embedding model instance
-            vector_store: Vector store instance
-            mongodb: MongoDB store instance
-
-        Returns:
-            SystemInstructionsRAGAgent: Singleton instance of the RAG agent
-        """
-        # If RAG agent exists and all dependencies are the same, return it
-        if self._rag_agent is not None:
-            logger.info("Reusing existing RAG agent instance")
-            return self._rag_agent
-
-        try:
-            logger.info("Creating new RAG agent instance")
-            # Create the agent
-            self._rag_agent = SystemInstructionsRAGAgent(
-                llm=llm,
-                embedding=embedding_model,
-                vector_store=vector_store,
-                mongodb=mongodb
-            )
-
-            return self._rag_agent
-
-        except Exception as e:
-            logger.error(f"Error creating RAG agent: {str(e)}")
-            raise
-
-    def reset_rag_agent(self):
-        """
-        Reset the RAG agent instance
-        """
-        logger.info("Resetting RAG agent instance")
-        self._rag_agent = None
-
-# Create a global instance for easy import
-rag_agent_manager = RAGAgentManager()
src/agents/system_instructions_rag.py
CHANGED
@@ -1,34 +1,12 @@
|
|
1 |
# src/agents/system_instructions_rag.py
|
2 |
-
from typing import List, Dict, Optional
|
3 |
-
import
|
4 |
-
from src.agents.rag_agent import RAGAgent
|
5 |
-
from src.llms.base_llm import BaseLLM
|
6 |
-
from src.embeddings.base_embedding import BaseEmbedding
|
7 |
-
from src.vectorstores.base_vectorstore import BaseVectorStore
|
8 |
-
from src.db.mongodb_store import MongoDBStore
|
9 |
-
from src.models.rag import RAGResponse
|
10 |
from src.utils.logger import logger
|
|
|
|
|
11 |
|
12 |
class SystemInstructionsRAGAgent(RAGAgent):
|
13 |
-
|
14 |
-
self,
|
15 |
-
llm: BaseLLM,
|
16 |
-
embedding: BaseEmbedding,
|
17 |
-
vector_store: BaseVectorStore,
|
18 |
-
mongodb: MongoDBStore,
|
19 |
-
max_history_tokens: int = 4000,
|
20 |
-
max_history_messages: int = 10
|
21 |
-
):
|
22 |
-
"""Initialize SystemInstructionsRAGAgent with enhanced context management"""
|
23 |
-
super().__init__(
|
24 |
-
llm=llm,
|
25 |
-
embedding=embedding,
|
26 |
-
vector_store=vector_store,
|
27 |
-
mongodb=mongodb,
|
28 |
-
max_history_tokens=max_history_tokens,
|
29 |
-
max_history_messages=max_history_messages
|
30 |
-
)
|
31 |
-
self.nlp = spacy.load("en_core_web_sm")
|
32 |
|
33 |
async def generate_response(
|
34 |
self,
|
@@ -36,18 +14,19 @@ class SystemInstructionsRAGAgent(RAGAgent):
|
|
36 |
conversation_id: Optional[str] = None,
|
37 |
temperature: float = 0.7,
|
38 |
max_tokens: Optional[int] = None,
|
39 |
-
context_docs: Optional[List[str]] = None
|
40 |
-
stream: bool = False
|
41 |
) -> RAGResponse:
|
42 |
-
"""Generate response with
|
43 |
try:
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
|
|
|
|
48 |
|
49 |
-
|
50 |
-
|
51 |
welcome_message = self._handle_contact_query(query)
|
52 |
return RAGResponse(
|
53 |
response=welcome_message,
|
@@ -56,282 +35,200 @@ class SystemInstructionsRAGAgent(RAGAgent):
|
|
56 |
scores=None
|
57 |
)
|
58 |
|
59 |
-
# Get
|
60 |
-
|
61 |
if conversation_id:
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
#
|
78 |
-
if
|
79 |
-
|
80 |
-
else:
|
81 |
-
# Try with original query first
|
82 |
-
current_context, sources, scores = await self.retrieve_context(
|
83 |
query,
|
84 |
-
conversation_history=
|
85 |
)
|
86 |
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
conversation_history=history
|
94 |
-
)
|
95 |
-
|
96 |
-
# If still no context, try history fallback
|
97 |
-
if not current_context:
|
98 |
-
current_context, sources = self._get_context_from_history(history)
|
99 |
-
|
100 |
-
logger.info(f"Retrieved {len(current_context) if current_context else 0} context documents")
|
101 |
-
|
102 |
-
# Check context relevance
|
103 |
-
has_relevant_context = self._check_context_relevance(query, current_context or [])
|
104 |
-
logger.info(f"Context relevance check result: {has_relevant_context}")
|
105 |
-
|
106 |
-
# Handle no context case
|
107 |
if not has_relevant_context:
|
108 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
109 |
|
110 |
-
# Generate response
|
111 |
-
prompt = self._create_response_prompt(query, current_context)
|
112 |
response_text = self.llm.generate(
|
113 |
-
prompt
|
114 |
temperature=temperature,
|
115 |
max_tokens=max_tokens
|
116 |
)
|
117 |
|
118 |
-
#
|
119 |
cleaned_response = self._clean_response(response_text)
|
120 |
if self._is_no_info_response(cleaned_response):
|
121 |
-
return
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
if any('Sheet:' in doc for doc in (current_context or [])):
|
128 |
-
try:
|
129 |
-
cleaned_response = await self.enhance_excel_response(
|
130 |
-
query=query,
|
131 |
-
response=cleaned_response,
|
132 |
-
context_docs=current_context
|
133 |
-
)
|
134 |
-
except Exception as e:
|
135 |
-
logger.warning(f"Error enhancing Excel response: {str(e)}")
|
136 |
|
137 |
return RAGResponse(
|
138 |
response=cleaned_response,
|
139 |
-
context_docs=
|
140 |
sources=sources,
|
141 |
scores=scores
|
142 |
)
|
143 |
|
144 |
except Exception as e:
|
145 |
-
logger.error(f"Error in
|
146 |
raise
|
147 |
|
148 |
-
def
|
149 |
-
"""Convert all metadata values to strings"""
|
150 |
-
return {
|
151 |
-
key: str(value) if value is not None else None
|
152 |
-
for key, value in metadata.items()
|
153 |
-
}
|
154 |
-
|
155 |
-
async def retrieve_context(
|
156 |
self,
|
157 |
query: str,
|
|
|
158 |
conversation_history: Optional[List[Dict]] = None
|
159 |
-
) ->
|
160 |
-
"""
|
161 |
-
|
162 |
-
logger.info(f"Processing query for context retrieval: {query}")
|
163 |
-
|
164 |
-
collection_data = self.vector_store.collection.get()
|
165 |
-
|
166 |
-
if not collection_data or 'documents' not in collection_data:
|
167 |
-
logger.warning("No documents found in ChromaDB")
|
168 |
-
return [], [], None
|
169 |
-
|
170 |
-
documents = collection_data['documents']
|
171 |
-
metadatas = collection_data.get('metadatas', [])
|
172 |
-
|
173 |
-
# Clean and enhance query with date variations
|
174 |
-
clean_query = query.lower().strip()
|
175 |
-
|
176 |
-
# Extract and enhance date information
|
177 |
-
import re
|
178 |
-
from datetime import datetime
|
179 |
-
|
180 |
-
date_pattern = r'(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]* \d{1,2},? \d{4}'
|
181 |
-
dates = re.findall(date_pattern, clean_query.lower())
|
182 |
-
|
183 |
-
enhanced_query = clean_query
|
184 |
-
target_date = None
|
185 |
-
|
186 |
-
if dates:
|
187 |
-
try:
|
188 |
-
date_obj = datetime.strptime(dates[0], '%b %d, %Y')
|
189 |
-
target_date = date_obj.strftime('%b %d, %Y')
|
190 |
-
|
191 |
-
date_variations = [
|
192 |
-
date_obj.strftime('%B %d, %Y'),
|
193 |
-
date_obj.strftime('%d/%m/%Y'),
|
194 |
-
date_obj.strftime('%Y-%m-%d'),
|
195 |
-
target_date
|
196 |
-
]
|
197 |
-
|
198 |
-
enhanced_query = f"{clean_query} {' '.join(date_variations)}"
|
199 |
-
|
200 |
-
except ValueError as e:
|
201 |
-
logger.warning(f"Error parsing date: {str(e)}")
|
202 |
-
|
203 |
-
# First try exact date matching
|
204 |
-
exact_matches = []
|
205 |
-
exact_metadata = []
|
206 |
-
|
207 |
-
if target_date:
|
208 |
-
for i, doc in enumerate(documents):
|
209 |
-
if target_date in doc:
|
210 |
-
logger.info(f"Found exact date match in document {i}")
|
211 |
-
exact_matches.append(doc)
|
212 |
-
if metadatas:
|
213 |
-
# Convert metadata values to strings
|
214 |
-
exact_metadata.append(self._convert_metadata_to_strings(metadatas[i]))
|
215 |
-
|
216 |
-
if exact_matches:
|
217 |
-
logger.info(f"Found {len(exact_matches)} exact date matches")
|
218 |
-
document_id = exact_metadata[0].get('document_id') if exact_metadata else None
|
219 |
-
|
220 |
-
if document_id:
|
221 |
-
all_related_chunks = []
|
222 |
-
all_related_metadata = []
|
223 |
-
all_related_scores = []
|
224 |
-
|
225 |
-
for i, doc in enumerate(documents):
|
226 |
-
if metadatas[i].get('document_id') == document_id:
|
227 |
-
all_related_chunks.append(doc)
|
228 |
-
# Convert metadata values to strings
|
229 |
-
all_related_metadata.append(self._convert_metadata_to_strings(metadatas[i]))
|
230 |
-
all_related_scores.append(1.0)
|
231 |
-
|
232 |
-
# Sort chunks by their index
|
233 |
-
sorted_results = sorted(
|
234 |
-
zip(all_related_chunks, all_related_metadata, all_related_scores),
|
235 |
-
key=lambda x: int(x[1].get('chunk_index', '0')) # Convert to int for sorting
|
236 |
-
)
|
237 |
-
|
238 |
-
sorted_chunks, sorted_metadata, sorted_scores = zip(*sorted_results)
|
239 |
-
|
240 |
-
logger.info(f"Returning {len(sorted_chunks)} chunks from document {document_id}")
|
241 |
-
return list(sorted_chunks), list(sorted_metadata), list(sorted_scores)
|
242 |
-
|
243 |
-
# If no exact matches, use enhanced query for embedding search
|
244 |
-
logger.info("No exact matches found, using enhanced query for embedding search")
|
245 |
-
-            query_embedding = self.embedding.embed_query(enhanced_query)
-
-            results = self.vector_store.similarity_search(
-                query_embedding,
-                top_k=5
-            )
-
-            if not results:
-                logger.warning("No results found in similarity search")
-                return [], [], None
-
-            context_docs = []
-            sources = []
-            scores = []
-
-            sorted_results = sorted(results, key=lambda x: x.get('score', 0), reverse=True)
-
-            for result in sorted_results:
-                score = result.get('score', 0)
-                if score > 0.3:
-                    context_docs.append(result.get('text', ''))
-                    # Convert metadata values to strings
-                    sources.append(self._convert_metadata_to_strings(result.get('metadata', {})))
-                    scores.append(score)
-
-            if context_docs:
-                logger.info(f"Returning {len(context_docs)} documents from similarity search")
-                return context_docs, sources, scores
-
-            logger.warning("No relevant documents found")
-            return [], [], None
-
-        except Exception as e:
-            logger.error(f"Error in retrieve_context: {str(e)}")
-            logger.exception("Full traceback:")
-            return [], [], None
-
-    def _is_introduction_query(self, query: str) -> bool:
-        """Check if query is an introduction message"""
-        return (
-            "wants support" in query and
-            "This is Introduction" in query and
-            ("A new user with name:" in query or "An old user with name:" in query)
-        )
-
-    def _get_context_from_history(
-        self,
-        history: List[Dict]
-    ) -> Tuple[Optional[List[str]], Optional[List[Dict]]]:
-        """Extract context from conversation history"""
-        for msg in reversed(history):
-            if msg.get('context') and not self._is_no_info_response(msg.get('response', '')):
-                return msg['context'], msg.get('sources', [])
-        return None, None
-
-    def _create_response_prompt(self, query: str, context_docs: List[str]) -> str:
-        """Create prompt for response generation"""
         formatted_context = '\n\n'.join(
             f"Context {i+1}:\n{doc.strip()}"
             for i, doc in enumerate(context_docs)
             if doc and doc.strip()
         )

         return f"""
-Use ...
 
 {formatted_context}
 
 Instructions:
-1. Use ...
-2. If the information is found ...
 3. Do not make assumptions or add information not present in the context
 4. Ensure the response is clear and complete based on available information
-5. If you cannot find relevant information about the specific query
 respond exactly with: "Information about this is not available, do you want to inquire about something else?"
 
 Query: {query}
 Response:"""
 
-    def ...
-        """...
-        ...
 
     def _clean_response(self, response: str) -> str:
-        """Clean response ...
         if not response:
             return response
 
@@ -351,6 +248,7 @@ Response:"""
             "Here's what I found:",
             "Here's the information you requested:",
             "According to the provided information,",
             "The information suggests that",
             "From what I can see,",
             "Let me explain",
@@ -359,85 +257,26 @@ Response:"""
             "I can see that",
             "Sure,",
             "Well,",
             "I apologize,"
         ]
-
         cleaned_response = response
         for phrase in phrases_to_remove:
             cleaned_response = cleaned_response.replace(phrase, "").strip()
-
         cleaned_response = " ".join(cleaned_response.split())
-
         if not cleaned_response:
             return response
-
-        if cleaned_response[0].islower():
-            cleaned_response = cleaned_response[0].upper() + cleaned_response[1:]
-
-        return cleaned_response
-
-    def _is_no_info_response(self, response: str) -> bool:
-        """Check if response indicates no information available"""
-        no_info_indicators = [
-            "i do not have",
-            "i don't have",
-            "no information",
-            "not available",
-            "could not find",
-            "couldn't find",
-            "cannot find",
-            "don't know",
-            "do not know",
-            "unable to find",
-            "no data",
-            "no relevant"
-        ]
-        response_lower = response.lower()
-        return any(indicator in response_lower for indicator in no_info_indicators)
-
-    def _check_context_relevance(self, query: str, context_docs: List[str]) -> bool:
-        """Enhanced context relevance checking"""
-        if not context_docs:
-            return False
-
-        # Clean and prepare query
-        clean_query = query.lower().strip()
-        query_terms = set(word for word in clean_query.split()
-                          if word not in {'tell', 'me', 'about', 'what', 'is', 'the'})
-
-        for doc in context_docs:
-            if not doc:
-                continue
-
-            doc_lower = doc.lower()
-
-            # For CSV-like content, check each line
-            lines = doc_lower.split('\n')
-            for line in lines:
-                # Check if any query term appears in the line
-                if any(term in line for term in query_terms):
-                    return True
-
-            # Also check the whole document for good measure
-            if any(term in doc_lower for term in query_terms):
-                return True
-
-        return False
-
-    def _handle_contact_query(self, query: str) -> str:
-        """Handle contact/introduction queries"""
-        try:
-            name_start = query.find('name: "') + 7
-            name_end = query.find('"', name_start)
-            name = query[name_start:name_end] if name_start > 6 and name_end != -1 else "there"
-
-            is_returning = (
-                "An old user with name:" in query and
-                "wants support again" in query
-            )
-
-            return f"Welcome back {name}, How can I help you?" if is_returning else f"Welcome {name}, How can I help you?"
-
-        except Exception as e:
-            logger.error(f"Error handling contact query: {str(e)}")
-            return "Welcome, How can I help you?"
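The removed retrieve_context above ranks similarity-search hits by score and keeps only those above 0.3. A standalone sketch of that filtering step, using hypothetical result dicts in place of real vector-store output:

# Sketch of the ranking/filtering step from the removed retrieve_context.
# The `results` list below is hypothetical; real entries come from
# vector_store.similarity_search and carry 'text', 'metadata' and 'score'.
results = [
    {"text": "refund policy ...", "metadata": {"document_id": "a"}, "score": 0.82},
    {"text": "unrelated note", "metadata": {"document_id": "b"}, "score": 0.12},
    {"text": "shipping times ...", "metadata": {"document_id": "c"}, "score": 0.45},
]

# Highest score first, as in sorted(..., key=lambda x: x.get('score', 0), reverse=True)
sorted_results = sorted(results, key=lambda x: x.get("score", 0), reverse=True)

context_docs, sources, scores = [], [], []
for result in sorted_results:
    score = result.get("score", 0)
    if score > 0.3:  # same relevance cutoff as the code above
        context_docs.append(result.get("text", ""))
        sources.append(result.get("metadata", {}))
        scores.append(score)

print(context_docs)  # ['refund policy ...', 'shipping times ...']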
 # src/agents/system_instructions_rag.py
+from typing import List, Dict, Optional
+from src.agents.rag_agent import RAGResponse
 from src.utils.logger import logger
+from src.agents.rag_agent import RAGAgent
+
 
 class SystemInstructionsRAGAgent(RAGAgent):
+    """RAG Agent with enhanced system instructions for specific use cases"""
 
     async def generate_response(
         self,
         query: str,
         conversation_id: Optional[str] = None,
         temperature: float = 0.7,
         max_tokens: Optional[int] = None,
+        context_docs: Optional[List[str]] = None
     ) -> RAGResponse:
+        """Generate response with specific handling for introduction and no-context cases"""
         try:
+            # First, check if this is an introduction/welcome message query
+            is_introduction = (
+                "wants support" in query and
+                "This is Introduction" in query and
+                ("A new user with name:" in query or "An old user with name:" in query)
+            )
 
+            if is_introduction:
+                # Handle introduction message - no context needed
                 welcome_message = self._handle_contact_query(query)
                 return RAGResponse(
                     response=welcome_message,
                     context_docs=[],
                     sources=[],
                     scores=None
                 )
 
+            # Get conversation history if conversation_id exists
+            conversation_history = []
             if conversation_id:
+                try:
+                    conversation_history = await self.mongodb.get_recent_messages(
+                        conversation_id,
+                        limit=self.conversation_manager.max_messages
+                    )
+
+                    # Get relevant history within token limits
+                    conversation_history = self.conversation_manager.get_relevant_history(
+                        messages=conversation_history,
+                        current_query=query
+                    )
+                except Exception as e:
+                    logger.warning(
+                        f"Error fetching conversation history: {str(e)}")
+
+            # For all other queries, proceed with context-based response
+            if not context_docs:
+                context_docs, sources, scores = await self.retrieve_context(
                     query,
+                    conversation_history=conversation_history
                 )
 
+            # Check if we have relevant context
+            has_relevant_context = self._check_context_relevance(
+                query, context_docs or []
+            )
+
+            # If no relevant context found, return the standard message
             if not has_relevant_context:
+                return RAGResponse(
+                    response="Information about this is not available, do you want to inquire about something else?",
+                    context_docs=[],
+                    sources=[],
+                    scores=None
+                )
+
+            # Generate response using context and conversation history
+            prompt = self._create_response_prompt(
+                query=query,
+                context_docs=context_docs,
+                conversation_history=conversation_history
+            )
 
             response_text = self.llm.generate(
+                prompt,
                 temperature=temperature,
                 max_tokens=max_tokens
             )
 
+            # Check if the generated response indicates no information
             cleaned_response = self._clean_response(response_text)
             if self._is_no_info_response(cleaned_response):
+                return RAGResponse(
+                    response="Information about this is not available, do you want to inquire about something else?",
+                    context_docs=[],
+                    sources=[],
+                    scores=None
+                )
 
             return RAGResponse(
                 response=cleaned_response,
+                context_docs=context_docs,
                 sources=sources,
                 scores=scores
             )
 
         except Exception as e:
+            logger.error(f"Error in SystemInstructionsRAGAgent: {str(e)}")
             raise
 
+    def _create_response_prompt(
         self,
         query: str,
+        context_docs: List[str],
         conversation_history: Optional[List[Dict]] = None
+    ) -> str:
+        """Create prompt for generating response from context and conversation history"""
+        # Format context documents
         formatted_context = '\n\n'.join(
             f"Context {i+1}:\n{doc.strip()}"
             for i, doc in enumerate(context_docs)
             if doc and doc.strip()
         )
 
+        # Format conversation history if available
+        history_context = ""
+        if conversation_history:
+            history_messages = []
+            # Use last 3 messages for context
+            for msg in conversation_history[-3:]:
+                role = msg.get('role', 'unknown')
+                content = msg.get('content', '')
+                history_messages.append(f"{role.capitalize()}: {content}")
+
+            if history_messages:
+                history_context = "\nPrevious Conversation:\n" + \
+                    "\n".join(history_messages)
+
         return f"""
+Use the following context and conversation history to provide information about: {query}
 
+Context Information:
 {formatted_context}
+{history_context}
 
 Instructions:
+1. Use information from both the context and conversation history
+2. If the information is found, provide a direct and concise response
 3. Do not make assumptions or add information not present in the context
 4. Ensure the response is clear and complete based on available information
+5. If you cannot find relevant information about the specific query,
 respond exactly with: "Information about this is not available, do you want to inquire about something else?"
 
 Query: {query}
 Response:"""
 
+    def _is_no_info_response(self, response: str) -> bool:
+        """Check if the response indicates no information available"""
+        no_info_indicators = [
+            "i do not have",
+            "i don't have",
+            "no information",
+            "not available",
+            "could not find",
+            "couldn't find",
+            "cannot find"
+        ]
+        response_lower = response.lower()
+        return any(indicator in response_lower for indicator in no_info_indicators)
+
+    def _check_context_relevance(self, query: str, context_docs: List[str]) -> bool:
+        """Check if context contains information relevant to the query"""
+        if not context_docs:
+            return False
+
+        # Extract key terms from query (keeping important words)
+        query_words = query.lower().split()
+        stop_words = {'me', 'a', 'about', 'what', 'is',
+                      'are', 'the', 'in', 'how', 'why', 'when', 'where'}
+
+        # Remove only basic stop words, keep important terms like "report", "share", etc.
+        query_terms = {word for word in query_words if word not in stop_words}
+
+        # Add additional relevant terms that might appear in the content
+        related_terms = {
+            'comprehensive',
+            'report',
+            'overview',
+            'summary',
+            'details',
+            'information'
+        }
+        query_terms.update(
+            word for word in query_words if word in related_terms)
+
+        # Check each context document for relevance
+        for doc in context_docs:
+            if not doc:
+                continue
+            doc_lower = doc.lower()
+
+            # Consider document relevant if it contains any query terms
+            # or if it starts with common report headers
+            if any(term in doc_lower for term in query_terms) or \
+               any(header in doc_lower for header in ['overview', 'comprehensive report', 'summary']):
+                return True
+
+        return False
+
+    def _handle_contact_query(self, query: str) -> str:
+        """Handle queries from /user/contact endpoint"""
+        try:
+            name_start = query.find('name: "') + 7
+            name_end = query.find('"', name_start)
+            name = query[name_start:name_end] if name_start > 6 and name_end != -1 else "there"
+
+            is_returning = (
+                "An old user with name:" in query and
+                "wants support again" in query
+            )
+
+            if is_returning:
+                return f"Welcome back {name}, How can I help you?"
+            return f"Welcome {name}, How can I help you?"
+
+        except Exception as e:
+            logger.error(f"Error handling contact query: {str(e)}")
+            return "Welcome, How can I help you?"
 
     def _clean_response(self, response: str) -> str:
+        """Clean response by removing unwanted phrases"""
         if not response:
             return response
 
@@ -351,6 +248,7 @@ Response:"""
             "Here's what I found:",
             "Here's the information you requested:",
             "According to the provided information,",
+            "Based on the documents,",
             "The information suggests that",
             "From what I can see,",
             "Let me explain",
@@ -359,85 +257,26 @@ Response:"""
             "I can see that",
             "Sure,",
             "Well,",
+            "Based on the given context,",
+            "The available information shows that",
+            "From the context provided,",
+            "The documentation mentions that",
+            "According to the context,",
+            "As shown in the context,",
             "I apologize,"
         ]
+
         cleaned_response = response
         for phrase in phrases_to_remove:
            cleaned_response = cleaned_response.replace(phrase, "").strip()
+
         cleaned_response = " ".join(cleaned_response.split())
+
         if not cleaned_response:
             return response
 
+        if cleaned_response[0].islower():
+            cleaned_response = cleaned_response[0].upper(
+            ) + cleaned_response[1:]
 
+        return cleaned_response
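The _check_context_relevance method above is, at heart, a keyword-overlap test. A self-contained sketch of the same idea, runnable outside the agent (names are illustrative):

# Standalone keyword-overlap relevance test, mirroring _check_context_relevance.
STOP_WORDS = {'me', 'a', 'about', 'what', 'is', 'are',
              'the', 'in', 'how', 'why', 'when', 'where'}

def is_relevant(query, context_docs):
    """Return True if any document shares a non-stop-word term with the query."""
    query_terms = {w for w in query.lower().split() if w not in STOP_WORDS}
    for doc in context_docs:
        if doc and any(term in doc.lower() for term in query_terms):
            return True
    return False

print(is_relevant("what is the refund policy", ["Our refund policy lasts 30 days."]))  # True
print(is_relevant("what is the refund policy", ["Weekly team schedule."]))             # False

Because `in` does substring matching, short query terms can over-match; that is presumably why the production method also keeps a whitelist of report-style terms rather than matching on everything.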
src/implementations/__pycache__/document_service.cpython-312.pyc CHANGED
Binary files a/src/implementations/__pycache__/document_service.cpython-312.pyc and b/src/implementations/__pycache__/document_service.cpython-312.pyc differ
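For orientation before the main.py diff below: the new endpoint code builds the agent from the repo's own factories. A minimal construction sketch under those assumptions (the MongoDB URI is a placeholder; the app reads it from settings.MONGODB_URI):

# Sketch of wiring up SystemInstructionsRAGAgent the way /chat does below.
from src.agents.system_instructions_rag import SystemInstructionsRAGAgent
from src.db.mongodb_store import MongoDBStore
from src.utils.llm_utils import get_llm_instance, get_vector_store

async def build_agent():
    vector_store, embedding_model = await get_vector_store()
    llm = get_llm_instance("openai")
    mongodb = MongoDBStore("mongodb://localhost:27017")  # placeholder URI
    return SystemInstructionsRAGAgent(
        llm=llm,
        embedding=embedding_model,
        vector_store=vector_store,
        mongodb=mongodb,
    )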
src/main.py CHANGED
@@ -1,4 +1,31 @@
 # src/main.py
 from fastapi import FastAPI, UploadFile, File, HTTPException, BackgroundTasks
 from fastapi.responses import StreamingResponse, FileResponse
 from fastapi.staticfiles import StaticFiles
@@ -9,53 +36,36 @@ from datetime import datetime
 from pathlib import Path
 import os
 import asyncio
-os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = '1'
-#os.environ["OAUTHLIB_RELAX_TOKEN_SCOPE"] = "1"
 
 
-from fastapi.responses import RedirectResponse
-from google.oauth2.credentials import Credentials
-from google_auth_oauthlib.flow import Flow
-from src.utils.google_drive_service import GoogleDriveService
 
 # Import custom modules1
-#from src.agents.rag_agent import RAGAgent
-from src.agents.system_instructions_rag import SystemInstructionsRAGAgent
-from src.agents.rag_agent_manager import rag_agent_manager
-from src.models.document import AllDocumentsResponse, StoredDocument
-from src.models.UserContact import UserContactRequest
-from src.utils.document_processor import DocumentProcessor
-from src.utils.drive_document_processor import DriveDocumentProcessor
-from src.utils.conversation_summarizer import ConversationSummarizer
-from src.utils.logger import logger
-from src.utils.llm_utils import get_llm_instance, get_vector_store
-from src.db.mongodb_store import MongoDBStore
-from src.implementations.document_service import DocumentService
-from src.models import (
-    ChatRequest,
-    ChatResponse,
-    BatchUploadResponse,
-    SummarizeRequest,
-    SummaryResponse,
-    FeedbackRequest
-)
-from fastapi import HTTPException, Depends
-from fastapi.security import APIKeyHeader
-from src.utils.database_cleanup import perform_cleanup
 
-from config.config import settings
 
 app = FastAPI(title="Chatbot API")
 
 app.add_middleware(
     CORSMiddleware,
-    allow_origins=["http://localhost:8080",
     allow_credentials=True,
     allow_methods=["*"],  # Allows all methods
     allow_headers=["*"],  # Allows all headers
 )
 
-#google_drive_service = GoogleDriveService()
 
 # Initialize MongoDB
 mongodb = MongoDBStore(settings.MONGODB_URI)
@@ -75,6 +85,7 @@ app.mount("/docs", StaticFiles(directory=str(UPLOADS_DIR)), name="documents")
 # Security setup
 API_KEY_HEADER = APIKeyHeader(name="ADMIN_API_KEY")
 
 async def verify_api_key(api_key: str = Depends(API_KEY_HEADER)):
     """Verify admin API key"""
     if not settings.ADMIN_API_KEY or api_key != settings.ADMIN_API_KEY:
@@ -84,41 +95,16 @@ async def verify_api_key(api_key: str = Depends(API_KEY_HEADER)):
         )
     return api_key
 
-
-
-
-
-
-# async def google_auth_callback(code: str):
-#     flow = Flow.from_client_config({
-#         "web": {
-#             "client_id": settings.GOOGLE_OAUTH_CLIENT_ID,
-#             "client_secret": settings.GOOGLE_OAUTH_CLIENT_SECRET,
-#             "auth_uri": "https://accounts.google.com/o/oauth2/auth",
-#             "token_uri": "https://oauth2.googleapis.com/token",
-#             "redirect_uris": [settings.GOOGLE_OAUTH_REDIRECT_URI]
-#         }
-#     }, scopes=['https://www.googleapis.com/auth/drive.readonly'])
-
-#     flow.redirect_uri = settings.GOOGLE_OAUTH_REDIRECT_URI
-
-#     # Add access type and prompt parameters for refresh token
-#     flow.fetch_token(
-#         code=code,
-#         access_type='offline',
-#         prompt='consent'
-#     )
-#     credentials = flow.credentials
-
-#     return {
-#         "message": "Authentication successful",
-#         "credentials": credentials.to_json()
-#     }
 
 
 @app.get("/documents")
@@ -126,7 +112,7 @@ async def get_all_documents():
     """Get all documents from MongoDB"""
     try:
         documents = await mongodb.get_all_documents()
-
         formatted_documents = []
         for doc in documents:
             try:
@@ -140,9 +126,10 @@ async def get_all_documents():
                 }
                 formatted_documents.append(formatted_doc)
             except Exception as e:
-                logger.error(...
                 continue
-
         return {
             "total_documents": len(formatted_documents),
             "documents": formatted_documents
@@ -151,6 +138,7 @@ async def get_all_documents():
         logger.error(f"Error retrieving documents: {str(e)}")
         raise HTTPException(status_code=500, detail=str(e))
 
 @app.get("/documents/{document_id}/download")
 async def get_document_file(document_id: str):
     """Serve a document file by its ID"""
@@ -159,27 +147,28 @@ async def get_document_file(document_id: str):
         doc = await mongodb.get_document(document_id)
         if not doc:
             raise HTTPException(status_code=404, detail="Document not found")
-
         # Extract filename from url_path
         filename = doc["url_path"].split("/")[-1]
         file_path = UPLOADS_DIR / filename
-
         if not file_path.exists():
             raise HTTPException(
-                status_code=404,
                 detail=f"File not found on server: {filename}"
             )
-
         return FileResponse(
             path=str(file_path),
             filename=doc["filename"],
             media_type=doc["content_type"]
         )
-
     except Exception as e:
         logger.error(f"Error serving document file: {str(e)}")
         raise HTTPException(status_code=500, detail=str(e))
 
 @app.post("/documents/upload", response_model=BatchUploadResponse)
 async def upload_documents(
     files: List[UploadFile] = File(...),
@@ -189,14 +178,84 @@ async def upload_documents(
     try:
         vector_store, _ = await get_vector_store()
         response = await document_service.process_documents(
-            files,
-            vector_store,
             background_tasks
         )
         return response
     except Exception as e:
         logger.error(f"Error in document upload: {str(e)}")
-        raise HTTPException(status_code=500, detail=str(e))
 
 
 @app.get("/documentChunks/{document_id}")
@@ -205,10 +264,10 @@ async def get_document_chunks(document_id: str):
     try:
         vector_store, _ = await get_vector_store()
         chunks = vector_store.get_document_chunks(document_id)
-
         if not chunks:
             raise HTTPException(status_code=404, detail="Document not found")
-
         return {
             "document_id": document_id,
             "total_chunks": len(chunks),
@@ -218,53 +277,57 @@ async def get_document_chunks(document_id: str):
         logger.error(f"Error retrieving document chunks: {str(e)}")
         raise HTTPException(status_code=500, detail=str(e))
 
 @app.delete("/documents/{document_id}")
 async def delete_document(document_id: str):
     """Delete document from MongoDB, ChromaDB, and physical storage"""
     try:
         # First get document details from MongoDB to get file path
         document = await mongodb.get_document(document_id)
-        if not document:
-            raise HTTPException(status_code=404, detail="Document not found")
-
         # Get vector store instance
         vector_store, _ = await get_vector_store()
-
         # Delete physical file using document service
         deletion_success = await document_service.delete_document(document_id)
         if not deletion_success:
-            logger.warning(...
 
         # Delete from vector store
         try:
             vector_store.delete_document(document_id)
         except Exception as e:
-            logger.error(...
             raise HTTPException(
-                status_code=500,
                 detail=f"Failed to delete document from vector store: {str(e)}"
             )
-
         # Delete from MongoDB - don't check return value since document might already be deleted
         await mongodb.delete_document(document_id)
-
         return {
             "status": "success",
             "message": f"Document {document_id} successfully deleted from all stores"
         }
-
     except HTTPException:
         raise
     except Exception as e:
         logger.error(f"Error in delete_document endpoint: {str(e)}")
         raise HTTPException(status_code=500, detail=str(e))
 
 @app.post("/processDriveDocuments")
 async def process_drive_documents():
     try:
         # Initialize vector store
         vector_store, _ = await get_vector_store()
-
         # Initialize Drive document processor
         drive_processor = DriveDocumentProcessor(
             google_service_account_path=settings.GOOGLE_SERVICE_ACCOUNT_PATH,
@@ -272,18 +335,19 @@ async def process_drive_documents():
             temp_dir=settings.TEMP_DOWNLOAD_DIR,
             doc_processor=doc_processor
         )
-
         # Process documents
         result = await drive_processor.process_documents(vector_store)
         return result
-
     except Exception as e:
         logger.error(f"Error in process_drive_documents: {str(e)}")
         raise HTTPException(
             status_code=500,
             detail=str(e)
         )
-
 @app.post("/user/contact", response_model=ChatResponse)
 async def create_user_contact(
     request: UserContactRequest,
@@ -296,7 +360,7 @@ async def create_user_contact(
             email=request.email,
             phone_number=request.phone_number
         )
-
         if existing_conversation_id:
             chat_request = ChatRequest(
                 query=f'An old user with name: "{request.full_name}", email: "{request.email}" and phone number: "{request.phone_number}" wants support again. This is Introduction Create a welcome back message for him and ask how i can help you today?',
@@ -315,7 +379,7 @@ async def create_user_contact(
             email=request.email,
             phone_number=request.phone_number
         )
-
         chat_request = ChatRequest(
             query=f'A new user with name: "{request.full_name}", email: "{request.email}" and phone number: "{request.phone_number}" wants support. This is Introduction Create a welcome message for him and ask how i can help you today?',
             llm_provider="openai",
@@ -324,14 +388,15 @@ async def create_user_contact(
             stream=False,
             conversation_id=new_conversation_id
         )
-
         # Call chat_endpoint with the prepared request
         return await chat_endpoint(chat_request, background_tasks)
-
     except Exception as e:
         logger.error(f"Error in create_user_contact: {str(e)}")
         raise HTTPException(status_code=500, detail=str(e))
-
 @app.post("/chat", response_model=ChatResponse)
 async def chat_endpoint(
     request: ChatRequest,
@@ -340,33 +405,61 @@ async def chat_endpoint(
     """Chat endpoint with RAG support and enhanced Excel handling"""
     try:
         # Initialize core components
-        logger.info(...
         vector_store, embedding_model = await get_vector_store()
-
         logger.info(f"Initializing LLM: {str(datetime.now())}")
         llm = get_llm_instance(request.llm_provider)
-
-        # ...
-        rag_agent = ...(
             llm=llm,
-            ...
             vector_store=vector_store,
             mongodb=mongodb
         )
-
         # Use provided conversation ID or create new one
         conversation_id = request.conversation_id or str(uuid.uuid4())
-
         # Process the query
         query = request.query
-
         # Add specific instructions for certain types of queries
-        #if "introduce" in query.lower() or "name" in query.lower() or "email" in query.lower():
-            #query += ". The response should be short and to the point. Make sure to not add any irrelevant information. make sure to share the response from Vector store, if you do not find information in vector store. Just respond I do not have information. Keep the introduction concise and friendly."
-
         # Generate response
         logger.info(f"Generating response: {str(datetime.now())}")
-
         max_retries = 3
         retry_count = 0
         response = None
@@ -378,7 +471,8 @@ async def chat_endpoint(
                     query=query,
                     conversation_id=conversation_id,
                     temperature=request.temperature,
-                    max_tokens=request.max_tokens if hasattr(...
                 )
                 break
             except Exception as e:
@@ -388,7 +482,8 @@ async def chat_endpoint(
                 await asyncio.sleep(1)  # Brief pause before retry
 
         if response is None:
-            raise last_error or Exception(...
 
         logger.info(f"Response generated: {str(datetime.now())}")
@@ -401,13 +496,13 @@ async def chat_endpoint(
 
         # Add Excel-specific metadata if present
         has_excel_content = any(
-            doc and 'Sheet:' in doc
            for doc in (response.context_docs or [])
        )
        if has_excel_content:
            try:
                metadata['excel_content'] = True
-
                # Extract Excel-specific insights if available
                if hasattr(rag_agent, 'get_excel_insights'):
                    excel_insights = rag_agent.get_excel_insights(
@@ -436,13 +531,14 @@ async def chat_endpoint(
            sources=response.sources,
            conversation_id=conversation_id,
            timestamp=datetime.now(),
-            relevant_doc_scores=response.scores if hasattr(...
            metadata=metadata
        )
 
        # Log completion
        logger.info(f"Chat response completed: {str(datetime.now())}")
-
        return chat_response
 
    except Exception as e:
@@ -451,43 +547,48 @@ async def chat_endpoint(
        if isinstance(e, ValueError):
            raise HTTPException(status_code=400, detail=str(e))
        elif isinstance(e, (KeyError, AttributeError)):
-            raise HTTPException(...
        else:
            raise HTTPException(status_code=500, detail=str(e))
 
 @app.get("/chat/history/{conversation_id}")
 async def get_conversation_history(conversation_id: str):
     """Get complete conversation history"""
     history = await mongodb.get_conversation_history(conversation_id)
-
     if not history:
         raise HTTPException(status_code=404, detail="Conversation not found")
-
     return {
         "conversation_id": conversation_id,
         "messages": history
     }
 
 @app.post("/chat/summarize", response_model=SummaryResponse)
 async def summarize_conversation(request: SummarizeRequest):
     """Generate a summary of a conversation"""
     try:
         messages = await mongodb.get_messages_for_summary(request.conversation_id)
-
         if not messages:
-            raise HTTPException(
-                ...
         summary = await summarizer.summarize_conversation(
             messages,
             include_metadata=request.include_metadata
         )
-
         return SummaryResponse(**summary)
-
     except Exception as e:
         logger.error(f"Error generating summary: {str(e)}")
         raise HTTPException(status_code=500, detail=str(e))
 
 @app.post("/chat/feedback/{conversation_id}")
 async def submit_feedback(
     conversation_id: str,
@@ -498,21 +599,22 @@ async def submit_feedback(
         # Validate conversation exists
         conversation = await mongodb.get_conversation_metadata(conversation_id)
         if not conversation:
-            raise HTTPException(
-                ...
         # Update feedback
         success = await mongodb.update_feedback(
             conversation_id=conversation_id,
             feedback=feedback_request.feedback,
             rating=feedback_request.rating
         )
-
         if not success:
             raise HTTPException(
                 status_code=500,
                 detail="Failed to update feedback"
             )
-
         return {
             "status": "success",
             "message": "Feedback submitted successfully",
@@ -522,20 +624,21 @@ async def submit_feedback(
                 "rating": feedback_request.format_rating()
             }
         }
-
     except HTTPException:
         raise
     except Exception as e:
         logger.error(f"Error submitting feedback: {str(e)}")
         raise HTTPException(status_code=500, detail=str(e))
 
 @app.get("/debug/config")
 async def debug_config():
     """Debug endpoint to check configuration"""
     import os
     from config.config import settings
     from pathlib import Path
-
     debug_info = {
         "environment_variables": {
             "OPENAI_API_KEY": "[SET]" if os.getenv('OPENAI_API_KEY') else "[NOT SET]",
@@ -550,16 +653,17 @@ async def debug_config():
             "openai_config_exists": (Path.home() / '.openai' / 'api_key').exists()
         }
     }
-
     if settings.OPENAI_API_KEY:
         key = settings.OPENAI_API_KEY
         debug_info["api_key_info"] = {
             "length": len(key),
             "preview": f"{key[:4]}...{key[-4:]}" if len(key) > 8 else "[INVALID LENGTH]"
         }
-
     return debug_info
 
 @app.post("/admin/cleanup")
 async def cleanup_databases(
     include_files: bool = True,
@@ -567,20 +671,36 @@ async def cleanup_databases(
 ):
     """
     Clean up all data from ChromaDB and MongoDB
-
     Args:
         include_files (bool): Whether to also delete uploaded files
     """
     try:
         result = await perform_cleanup(mongodb, include_files)
         return result
     except Exception as e:
         logger.error(f"Error in cleanup operation: {str(e)}")
         raise HTTPException(
             status_code=500,
             detail=f"Error during cleanup: {str(e)}"
         )
-
 @app.get("/health")
 async def health_check():
     """Health check endpoint"""
@@ -588,4 +708,4 @@ async def health_check():
 
 if __name__ == "__main__":
     import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=8000)

 # src/main.py
+from config.config import settings
+from src.utils.database_cleanup import perform_cleanup
+from fastapi.security import APIKeyHeader
+from fastapi import HTTPException, Depends
+from fastapi.responses import JSONResponse
+from src.models import (
+    ChatRequest,
+    ChatResponse,
+    BatchUploadResponse,
+    SummarizeRequest,
+    SummaryResponse,
+    FeedbackRequest
+)
+from src.implementations.document_service import DocumentService
+from src.db.mongodb_store import MongoDBStore
+from src.utils.llm_utils import get_llm_instance, get_vector_store
+from src.utils.logger import logger
+from src.utils.conversation_summarizer import ConversationSummarizer
+from src.utils.drive_document_processor import DriveDocumentProcessor
+from src.utils.document_processor import DocumentProcessor
+from src.models.UserContact import UserContactRequest
+from src.models.document import AllDocumentsResponse, StoredDocument
+from src.agents.system_instructions_rag import SystemInstructionsRAGAgent
+from src.utils.google_drive_service import GoogleDriveService
+from google_auth_oauthlib.flow import Flow
+from google.oauth2.credentials import Credentials
+from fastapi.responses import RedirectResponse
 from fastapi import FastAPI, UploadFile, File, HTTPException, BackgroundTasks
 from fastapi.responses import StreamingResponse, FileResponse
 from fastapi.staticfiles import StaticFiles
 ...
 from pathlib import Path
 import os
 import asyncio
 
+import chromadb
+from pathlib import Path
+import asyncio
+import gc
+import random
+from typing import List
+from src.utils.logger import logger
+from config.config import settings
+
+os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = '1'
+# os.environ["OAUTHLIB_RELAX_TOKEN_SCOPE"] = "1"
 
 # Import custom modules1
+# from src.agents.rag_agent import RAGAgent
 
 app = FastAPI(title="Chatbot API")
 
 app.add_middleware(
     CORSMiddleware,
+    allow_origins=["http://localhost:8080",
+                   "http://localhost:3000"],  # Add both ports
     allow_credentials=True,
     allow_methods=["*"],  # Allows all methods
     allow_headers=["*"],  # Allows all headers
 )
 
+# google_drive_service = GoogleDriveService()
 
 # Initialize MongoDB
 mongodb = MongoDBStore(settings.MONGODB_URI)
 ...
 # Security setup
 API_KEY_HEADER = APIKeyHeader(name="ADMIN_API_KEY")
 
+
 async def verify_api_key(api_key: str = Depends(API_KEY_HEADER)):
     """Verify admin API key"""
     if not settings.ADMIN_API_KEY or api_key != settings.ADMIN_API_KEY:
 ...
     )
     return api_key
 
+
+def get_chroma_client():
+    """Get a new ChromaDB client instance"""
+    return chromadb.PersistentClient(
+        path=settings.CHROMA_PATH,
+        settings=chromadb.Settings(
+            allow_reset=True,
+            is_persistent=True
+        )
+    )
 
 
 @app.get("/documents")
 ...
     """Get all documents from MongoDB"""
     try:
         documents = await mongodb.get_all_documents()
+
         formatted_documents = []
         for doc in documents:
             try:
 ...
                 }
                 formatted_documents.append(formatted_doc)
             except Exception as e:
+                logger.error(
+                    f"Error formatting document {doc.get('document_id', 'unknown')}: {str(e)}")
                 continue
+
         return {
             "total_documents": len(formatted_documents),
             "documents": formatted_documents
 ...
         logger.error(f"Error retrieving documents: {str(e)}")
         raise HTTPException(status_code=500, detail=str(e))
 
+
 @app.get("/documents/{document_id}/download")
 async def get_document_file(document_id: str):
     """Serve a document file by its ID"""
 ...
         doc = await mongodb.get_document(document_id)
         if not doc:
             raise HTTPException(status_code=404, detail="Document not found")
+
         # Extract filename from url_path
         filename = doc["url_path"].split("/")[-1]
         file_path = UPLOADS_DIR / filename
+
         if not file_path.exists():
             raise HTTPException(
+                status_code=404,
                 detail=f"File not found on server: {filename}"
             )
+
         return FileResponse(
             path=str(file_path),
             filename=doc["filename"],
             media_type=doc["content_type"]
         )
+
     except Exception as e:
         logger.error(f"Error serving document file: {str(e)}")
         raise HTTPException(status_code=500, detail=str(e))
 
+
 @app.post("/documents/upload", response_model=BatchUploadResponse)
 async def upload_documents(
     files: List[UploadFile] = File(...),
 ...
     try:
         vector_store, _ = await get_vector_store()
         response = await document_service.process_documents(
+            files,
+            vector_store,
             background_tasks
         )
         return response
     except Exception as e:
         logger.error(f"Error in document upload: {str(e)}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@app.get("/documentChunks")
+async def get_all_document_chunks():
+    """Get all document chunks from the vector store"""
+    try:
+        # Get vector store instance
+        vector_store, _ = await get_vector_store()
+
+        # Retrieve all documents
+        all_documents = vector_store.get_all_documents()
+
+        # If no documents, return a structured response instead of raising an exception
+        if not all_documents:
+            return {
+                "total_documents": 0,
+                "documents": [],
+                "message": "No documents are currently stored in the vector store. Upload some documents to see chunks."
+            }
+
+        # Group chunks by document_id
+        document_chunks = {}
+        for doc in all_documents:
+            # Safely extract document_id
+            document_id = doc.get('metadata', {}).get('document_id',
+                                                      doc.get('id',
+                                                              str(uuid.uuid4())))
+
+            # Ensure metadata is a dictionary
+            metadata = doc.get('metadata', {}) if isinstance(
+                doc.get('metadata'), dict) else {}
+
+            # Create chunk entry
+            chunk = {
+                'text': str(doc.get('text', '')),
+                'metadata': metadata
+            }
+
+            # Group chunks by document_id
+            if document_id not in document_chunks:
+                document_chunks[document_id] = []
+
+            document_chunks[document_id].append(chunk)
+
+        # Prepare response
+        processed_documents = []
+        for doc_id, chunks in document_chunks.items():
+            processed_documents.append({
+                "document_id": doc_id,
+                "total_chunks": len(chunks),
+                "chunks": chunks
+            })
+
+        return {
+            "total_documents": len(processed_documents),
+            "documents": processed_documents,
+            "message": f"Successfully retrieved {len(processed_documents)} documents"
+        }
+
+    except Exception as e:
+        # Log the full error for debugging
+        logger.error(
+            f"Error retrieving all document chunks: {str(e)}", exc_info=True)
+
+        # Return a structured error response
+        return {
+            "total_documents": 0,
+            "documents": [],
+            "message": f"An error occurred while retrieving document chunks: {str(e)}"
+        }
 
 
 @app.get("/documentChunks/{document_id}")
 ...
     try:
         vector_store, _ = await get_vector_store()
         chunks = vector_store.get_document_chunks(document_id)
+
         if not chunks:
             raise HTTPException(status_code=404, detail="Document not found")
+
         return {
             "document_id": document_id,
             "total_chunks": len(chunks),
 ...
         logger.error(f"Error retrieving document chunks: {str(e)}")
         raise HTTPException(status_code=500, detail=str(e))
 
+
 @app.delete("/documents/{document_id}")
 async def delete_document(document_id: str):
     """Delete document from MongoDB, ChromaDB, and physical storage"""
     try:
         # First get document details from MongoDB to get file path
         document = await mongodb.get_document(document_id)
+        # if not document:
+        #     raise HTTPException(status_code=404, detail="Document not found")
+
         # Get vector store instance
         vector_store, _ = await get_vector_store()
+
         # Delete physical file using document service
         deletion_success = await document_service.delete_document(document_id)
         if not deletion_success:
+            logger.warning(
+                f"Failed to delete physical file for document {document_id}")
+
         # Delete from vector store
         try:
             vector_store.delete_document(document_id)
         except Exception as e:
+            logger.error(
+                f"Error deleting document from vector store: {str(e)}")
             raise HTTPException(
+                status_code=500,
                 detail=f"Failed to delete document from vector store: {str(e)}"
             )
+
         # Delete from MongoDB - don't check return value since document might already be deleted
         await mongodb.delete_document(document_id)
+
         return {
             "status": "success",
             "message": f"Document {document_id} successfully deleted from all stores"
         }
+
     except HTTPException:
         raise
     except Exception as e:
         logger.error(f"Error in delete_document endpoint: {str(e)}")
         raise HTTPException(status_code=500, detail=str(e))
 
+
 @app.post("/processDriveDocuments")
 async def process_drive_documents():
     try:
         # Initialize vector store
         vector_store, _ = await get_vector_store()
+
         # Initialize Drive document processor
         drive_processor = DriveDocumentProcessor(
             google_service_account_path=settings.GOOGLE_SERVICE_ACCOUNT_PATH,
 ...
             temp_dir=settings.TEMP_DOWNLOAD_DIR,
             doc_processor=doc_processor
         )
+
         # Process documents
         result = await drive_processor.process_documents(vector_store)
         return result
+
     except Exception as e:
         logger.error(f"Error in process_drive_documents: {str(e)}")
         raise HTTPException(
             status_code=500,
             detail=str(e)
         )
+
+
 @app.post("/user/contact", response_model=ChatResponse)
 async def create_user_contact(
     request: UserContactRequest,
 ...
             email=request.email,
             phone_number=request.phone_number
         )
+
         if existing_conversation_id:
             chat_request = ChatRequest(
                 query=f'An old user with name: "{request.full_name}", email: "{request.email}" and phone number: "{request.phone_number}" wants support again. This is Introduction Create a welcome back message for him and ask how i can help you today?',
 ...
             email=request.email,
             phone_number=request.phone_number
         )
+
         chat_request = ChatRequest(
             query=f'A new user with name: "{request.full_name}", email: "{request.email}" and phone number: "{request.phone_number}" wants support. This is Introduction Create a welcome message for him and ask how i can help you today?',
             llm_provider="openai",
 ...
             stream=False,
             conversation_id=new_conversation_id
         )
+
         # Call chat_endpoint with the prepared request
         return await chat_endpoint(chat_request, background_tasks)
+
     except Exception as e:
         logger.error(f"Error in create_user_contact: {str(e)}")
         raise HTTPException(status_code=500, detail=str(e))
+
+
 @app.post("/chat", response_model=ChatResponse)
 async def chat_endpoint(
     request: ChatRequest,
 ...
     """Chat endpoint with RAG support and enhanced Excel handling"""
     try:
         # Initialize core components
+        logger.info(
+            f"Initializing vector store and embedding: {str(datetime.now())}")
         vector_store, embedding_model = await get_vector_store()
+
         logger.info(f"Initializing LLM: {str(datetime.now())}")
         llm = get_llm_instance(request.llm_provider)
+
+        # Initialize RAG agent
+        # rag_agent = RAGAgent(
+        #     llm=llm,
+        #     embedding=embedding_model,
+        #     vector_store=vector_store,
+        #     mongodb=mongodb
+        # )
+
+        rag_agent = SystemInstructionsRAGAgent(
             llm=llm,
+            embedding=embedding_model,
             vector_store=vector_store,
             mongodb=mongodb
         )
+
+        # rag_agent.add_custom_role(
+        #     "Knowledge based chatbot and introduction specialist",
+        #     """You are a welcome agent with knowledge based specialist focusing on knowledge attached and create a beautiful welcome message.
+        #     Your role is to:
+        #     1. Your response should be short and to the point.
+        #     2. Strictly follow this point for If it is an introduction. You strictly respond that "Welcome name of customer to our platform. How can I help you today?"
+        #     """
+        # )
+
+        # rag_agent.add_custom_role(
+        #     "Knowledge based chatbot",
+        #     """You are a knowledge based specialist focusing on knowledge attached.
+        #     Your role is to:
+        #     1. Your response should be short and to the point.
+        #     2. if it is not introduction then make sure to share the response from Vector store.
+        #     3. If you do not find relevant information. Just say I do not have this information but this do not apply to introduction message.
+        #     4. If there is an introduction, you should ignore above roles and connect with LLm to have a welcome message for the user.
+        #     """
+        # )
+
         # Use provided conversation ID or create new one
         conversation_id = request.conversation_id or str(uuid.uuid4())
+
         # Process the query
         query = request.query
+
         # Add specific instructions for certain types of queries
+        # if "introduce" in query.lower() or "name" in query.lower() or "email" in query.lower():
+        #     query += ". The response should be short and to the point. Make sure to not add any irrelevant information. make sure to share the response from Vector store, if you do not find information in vector store. Just respond I do not have information. Keep the introduction concise and friendly."
+
         # Generate response
         logger.info(f"Generating response: {str(datetime.now())}")
+
         max_retries = 3
         retry_count = 0
         response = None
 ...
                     query=query,
                     conversation_id=conversation_id,
                     temperature=request.temperature,
+                    max_tokens=request.max_tokens if hasattr(
+                        request, 'max_tokens') else None
                 )
                 break
             except Exception as e:
 ...
                 await asyncio.sleep(1)  # Brief pause before retry
 
         if response is None:
+            raise last_error or Exception(
+                "Failed to generate response after retries")
 
         logger.info(f"Response generated: {str(datetime.now())}")
 ...
 
         # Add Excel-specific metadata if present
         has_excel_content = any(
+            doc and 'Sheet:' in doc
             for doc in (response.context_docs or [])
         )
         if has_excel_content:
             try:
                 metadata['excel_content'] = True
+
                 # Extract Excel-specific insights if available
                 if hasattr(rag_agent, 'get_excel_insights'):
                     excel_insights = rag_agent.get_excel_insights(
 ...
             sources=response.sources,
             conversation_id=conversation_id,
             timestamp=datetime.now(),
+            relevant_doc_scores=response.scores if hasattr(
+                response, 'scores') else None,
             metadata=metadata
         )
 
         # Log completion
         logger.info(f"Chat response completed: {str(datetime.now())}")
+
         return chat_response
 
     except Exception as e:
 ...
         if isinstance(e, ValueError):
             raise HTTPException(status_code=400, detail=str(e))
         elif isinstance(e, (KeyError, AttributeError)):
+            raise HTTPException(
+                status_code=500, detail="Internal processing error")
         else:
             raise HTTPException(status_code=500, detail=str(e))
 
+
 @app.get("/chat/history/{conversation_id}")
 async def get_conversation_history(conversation_id: str):
     """Get complete conversation history"""
     history = await mongodb.get_conversation_history(conversation_id)
+
     if not history:
         raise HTTPException(status_code=404, detail="Conversation not found")
+
     return {
         "conversation_id": conversation_id,
         "messages": history
     }
 
+
 @app.post("/chat/summarize", response_model=SummaryResponse)
 async def summarize_conversation(request: SummarizeRequest):
     """Generate a summary of a conversation"""
     try:
         messages = await mongodb.get_messages_for_summary(request.conversation_id)
+
         if not messages:
+            raise HTTPException(
+                status_code=404, detail="Conversation not found")
+
         summary = await summarizer.summarize_conversation(
             messages,
             include_metadata=request.include_metadata
         )
+
         return SummaryResponse(**summary)
+
     except Exception as e:
         logger.error(f"Error generating summary: {str(e)}")
         raise HTTPException(status_code=500, detail=str(e))
 
+
 @app.post("/chat/feedback/{conversation_id}")
 async def submit_feedback(
     conversation_id: str,
 ...
         # Validate conversation exists
         conversation = await mongodb.get_conversation_metadata(conversation_id)
         if not conversation:
+            raise HTTPException(
+                status_code=404, detail="Conversation not found")
+
         # Update feedback
         success = await mongodb.update_feedback(
             conversation_id=conversation_id,
             feedback=feedback_request.feedback,
             rating=feedback_request.rating
         )
+
         if not success:
             raise HTTPException(
                 status_code=500,
                 detail="Failed to update feedback"
             )
+
         return {
             "status": "success",
             "message": "Feedback submitted successfully",
 ...
                 "rating": feedback_request.format_rating()
             }
         }
+
     except HTTPException:
         raise
     except Exception as e:
         logger.error(f"Error submitting feedback: {str(e)}")
         raise HTTPException(status_code=500, detail=str(e))
 
+
 @app.get("/debug/config")
 async def debug_config():
     """Debug endpoint to check configuration"""
     import os
     from config.config import settings
     from pathlib import Path
+
     debug_info = {
         "environment_variables": {
             "OPENAI_API_KEY": "[SET]" if os.getenv('OPENAI_API_KEY') else "[NOT SET]",
 ...
             "openai_config_exists": (Path.home() / '.openai' / 'api_key').exists()
         }
     }
+
     if settings.OPENAI_API_KEY:
         key = settings.OPENAI_API_KEY
         debug_info["api_key_info"] = {
             "length": len(key),
             "preview": f"{key[:4]}...{key[-4:]}" if len(key) > 8 else "[INVALID LENGTH]"
         }
+
     return debug_info
 
+
 @app.post("/admin/cleanup")
 async def cleanup_databases(
     include_files: bool = True,
 ...
 ):
     """
     Clean up all data from ChromaDB and MongoDB
+
     Args:
         include_files (bool): Whether to also delete uploaded files
+
+    Returns:
+        Dict: Cleanup operation summary with restart information
     """
     try:
         result = await perform_cleanup(mongodb, include_files)
+
+        # If restart is needed, return 202 Accepted instead of 200 OK
+        if result.get("restart_needed"):
+            return JSONResponse(
+                status_code=202,
+                content={
+                    **result,
+                    "message": "Cleanup partially completed. Please restart the server to complete ChromaDB cleanup."
+                }
+            )
+
         return result
+
     except Exception as e:
         logger.error(f"Error in cleanup operation: {str(e)}")
         raise HTTPException(
             status_code=500,
             detail=f"Error during cleanup: {str(e)}"
         )
+
+
 @app.get("/health")
 async def health_check():
     """Health check endpoint"""
 ...
 
 if __name__ == "__main__":
     import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=8000)
src/utils/__pycache__/database_cleanup.cpython-312.pyc CHANGED
Binary files a/src/utils/__pycache__/database_cleanup.cpython-312.pyc and b/src/utils/__pycache__/database_cleanup.cpython-312.pyc differ

src/utils/__pycache__/document_processor.cpython-312.pyc CHANGED
Binary files a/src/utils/__pycache__/document_processor.cpython-312.pyc and b/src/utils/__pycache__/document_processor.cpython-312.pyc differ
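The /chat endpoint above retries generate_response up to three times before giving up. The same pattern in isolation, for any awaitable operation (a sketch, not the endpoint's exact code):

import asyncio

async def with_retries(operation, max_retries=3):
    """Retry an async callable, pausing briefly between attempts,
    mirroring the retry loop in chat_endpoint."""
    last_error = None
    for _ in range(max_retries):
        try:
            return await operation()
        except Exception as e:
            last_error = e
            await asyncio.sleep(1)  # brief pause before retry
    raise last_error or Exception("Failed to generate response after retries")

Keeping the retry count small and the pause fixed matches the endpoint's design choice: transient LLM or vector-store hiccups are retried, while persistent failures surface quickly as a 500.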
src/utils/database_cleanup.py
CHANGED
@@ -1,131 +1,180 @@
 # src/utils/database_cleanup.py
-from typing import List, Dict
 import chromadb
 import shutil
 from pathlib import Path
+import asyncio
+import gc
+import random
+from typing import List, Dict, Tuple
 from src.utils.logger import logger
 from config.config import settings
 
 
+async def cleanup_chroma() -> Tuple[List[str], bool]:
+    """Clean up ChromaDB data while maintaining connection"""
+    details = []
+    restart_needed = False
+
     try:
+        # Get existing client
         client = chromadb.PersistentClient(
             path=settings.CHROMA_PATH,
             settings=chromadb.Settings(
                 allow_reset=True,
-                is_persistent=True
+                is_persistent=True,
+                anonymized_telemetry=False
             )
         )
+
+        # Get all collections
+        collections = client.list_collections()
+
+        if not collections:
+            details.append("No collections found in ChromaDB")
+            return details, restart_needed
+
+        # Delete data from each collection
+        for collection in collections:
+            try:
+                # Get all IDs in the collection
+                all_ids = collection.get()['ids']
+
+                if all_ids:
+                    # Delete all documents in the collection
+                    collection.delete(ids=all_ids)
+                    details.append(
+                        f"Deleted {len(all_ids)} documents from collection {collection.name}")
+                else:
+                    details.append(
+                        f"Collection {collection.name} was already empty")
+
+                # Delete the collection itself
+                client.delete_collection(collection.name)
+                details.append(f"Deleted collection {collection.name}")
+
+            except Exception as e:
+                logger.warning(
+                    f"Error cleaning collection {collection.name}: {str(e)}")
+                details.append(
+                    f"Error cleaning collection {collection.name}: {str(e)}")
+                restart_needed = True  # Set restart flag if any collection fails
+
+        # Optional: check whether a full reset might still be necessary
+        if len(client.list_collections()) > 0:
+            restart_needed = True
+            details.append("Some collections might require manual reset")
+
+        return details, restart_needed
+
     except Exception as e:
         raise Exception(f"ChromaDB cleanup failed: {str(e)}")
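`cleanup_chroma()` empties each collection by ID before dropping it, which keeps the persistent client usable even when a drop fails. The same wipe pattern in isolation (the `./chroma_db` path is an assumption; the service reads it from `settings.CHROMA_PATH`):

```python
import chromadb

client = chromadb.PersistentClient(path="./chroma_db")  # assumed path

for collection in client.list_collections():
    ids = collection.get()["ids"]
    if ids:
        # Remove every stored document before dropping the collection itself
        collection.delete(ids=ids)
    client.delete_collection(collection.name)
```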
+
+
 async def cleanup_mongodb(mongodb) -> List[str]:
-    """
-    Clean up MongoDB collections
-
-    Args:
-        mongodb: MongoDB store instance
-
-    Returns:
-        List[str]: Details of cleanup operations
-    """
+    """Clean up MongoDB collections"""
     details = []
+
     try:
+        # Get all collections in the database
+        collections = await mongodb.db.list_collection_names()
+
+        # Core collections from MongoDBStore initialization
+        core_collections = {
+            'chat_history': mongodb.chat_history,
+            'conversations': mongodb.conversations,
+            'knowledge_base': mongodb.documents,  # documents maps to knowledge_base
+            # Direct access to the vector_metadata collection
+            'vector_metadata': mongodb.db.vector_metadata,
+        }
+
+        # Clean each core collection
+        for name, collection in core_collections.items():
+            try:
+                result = await collection.delete_many({})
+                details.append(
+                    f"Cleared {name} ({result.deleted_count} documents)")
+            except Exception as e:
+                logger.error(f"Error clearing {name}: {str(e)}")
+                details.append(f"Error clearing {name}: {str(e)}")
+
+        # Clean any additional collections not in the core set
+        for coll_name in collections:
+            if coll_name not in core_collections:
+                try:
+                    result = await mongodb.db[coll_name].delete_many({})
+                    details.append(
+                        f"Cleared additional collection {coll_name} ({result.deleted_count} documents)")
+                except Exception as e:
+                    logger.error(
+                        f"Error clearing additional collection {coll_name}: {str(e)}")
+
         return details
     except Exception as e:
         raise Exception(f"MongoDB cleanup failed: {str(e)}")
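The MongoDB cleanup relies on Motor's async `delete_many({})`, which removes every document while leaving the collection and its indexes in place. A standalone sketch, with the connection URI and database name as placeholder assumptions:

```python
import asyncio
from motor.motor_asyncio import AsyncIOMotorClient

async def wipe_database() -> None:
    client = AsyncIOMotorClient("mongodb://localhost:27017")  # assumed URI
    db = client["ragdb"]  # assumed database name
    for name in await db.list_collection_names():
        result = await db[name].delete_many({})
        print(f"Cleared {name} ({result.deleted_count} documents)")

asyncio.run(wipe_database())
```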
 
+
 async def cleanup_files() -> List[str]:
-    """
-    Clean up uploaded files
-
-    Returns:
-        List[str]: Details of cleanup operations
-    """
+    """Clean up uploaded files and temporary directories"""
     details = []
+
+    # Directories to clean
+    directories = {
+        'uploads': Path("uploads"),
+        'temp_downloads': Path(settings.TEMP_DOWNLOAD_DIR),
+        # Additional temp directory used by some components
+        'temp_dir': Path('./temp')
+    }
+
+    for dir_name, dir_path in directories.items():
+        if dir_path.exists():
+            try:
+                # Delete all files in the directory
+                for file in dir_path.glob('*'):
+                    try:
+                        if file.is_file():
+                            file.unlink()
+                            details.append(
+                                f"Deleted file: {file.name} from {dir_name}")
+                    except Exception as e:
+                        details.append(
+                            f"Error deleting file {file.name} from {dir_name}: {str(e)}")
+
+                # Try to remove the empty directory
+                if not any(dir_path.iterdir()):
+                    dir_path.rmdir()
+                    details.append(f"Removed empty {dir_name} directory")
+            except Exception as e:
+                details.append(
+                    f"Error cleaning {dir_name} directory: {str(e)}")
+        else:
+            details.append(f"No {dir_name} directory found")
+
     return details
 
+
-async def perform_cleanup(
-    mongodb,
-    include_files: bool = True
-) -> Dict:
+async def perform_cleanup(mongodb, include_files: bool = True) -> Dict:
     """
-    Perform comprehensive cleanup of all databases
+    Perform comprehensive cleanup of all databases and files
+
     Args:
         mongodb: MongoDB store instance
         include_files (bool): Whether to also delete uploaded files
+
     Returns:
-        Dict: Cleanup operation summary
+        Dict: Cleanup operation summary with detailed status
     """
     cleanup_summary = {
         "chroma_db": {"status": "not_started", "details": []},
         "mongodb": {"status": "not_started", "details": []},
         "files": {"status": "not_started", "details": []}
     }
+
     try:
         # Clean ChromaDB
         try:
-            details = await cleanup_chroma()
+            details, restart_needed = await cleanup_chroma()
             cleanup_summary["chroma_db"] = {
-                "status": "success",
+                "status": "success" if not restart_needed else "partial",
                 "details": details
             }
         except Exception as e:
@@ -166,17 +215,21 @@ async def perform_cleanup(
 
         # Determine overall status
         overall_status = "success"
+        if restart_needed:
+            overall_status = "partial_success"
+            cleanup_summary["message"] = "Cleanup partially completed. Server restart required to complete ChromaDB cleanup."
+        # Check the all-error case before the any-error case so a total
+        # failure is not misreported as partial success.
+        elif all(item["status"] == "error" for item in cleanup_summary.values()):
             overall_status = "error"
+        elif any(item["status"] == "error" for item in cleanup_summary.values()):
             overall_status = "partial_success"
 
         return {
             "status": overall_status,
-            "message": "Cleanup operation completed",
-            "details": cleanup_summary
+            "message": cleanup_summary.get("message", "Cleanup operation completed"),
+            "details": cleanup_summary,
+            "restart_needed": restart_needed
         }
 
     except Exception as e:
         logger.error(f"Error in cleanup operation: {str(e)}")
+        raise
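`perform_cleanup()` rolls the three per-store results into one summary dict. A consumption sketch, assuming an already-initialized MongoDB store instance:

```python
import asyncio

from src.utils.database_cleanup import perform_cleanup

async def run_cleanup(mongodb) -> None:
    # mongodb: an initialized MongoDBStore instance (app-specific)
    summary = await perform_cleanup(mongodb, include_files=True)
    print(summary["status"])  # "success", "partial_success", or "error"
    if summary["restart_needed"]:
        # Mirror the API behaviour: surface the restart requirement
        print(summary["message"])

# asyncio.run(run_cleanup(mongodb))  # requires a configured store
```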
src/utils/document_processor.py
CHANGED
@@ -6,7 +6,7 @@ import pandas as pd
 import json
 from pathlib import Path
 import hashlib
-import magic
+import magic
 from bs4 import BeautifulSoup
 import csv
 from datetime import datetime
@@ -16,41 +16,92 @@ import tiktoken
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 import logging
 from bs4.element import ProcessingInstruction
+
+from config.config import Settings
 from .enhanced_excel_processor import EnhancedExcelProcessor
 
+
 class DocumentProcessor:
     def __init__(
         self,
+        chunk_size: Optional[int] = None,
+        chunk_overlap: Optional[int] = None,
+        max_file_size: Optional[int] = None,
         supported_formats: Optional[List[str]] = None
     ):
+        """
+        Initialize DocumentProcessor with configurable parameters
+
+        Args:
+            chunk_size (Optional[int]): Size of text chunks
+            chunk_overlap (Optional[int]): Overlap between chunks
+            max_file_size (Optional[int]): Maximum file size in bytes
+            supported_formats (Optional[List[str]]): List of supported file extensions
+        """
+
+        logging.basicConfig(
+            level=logging.DEBUG,
+            format='%(asctime)s - %(levelname)s - %(message)s'
+        )
+
+        # Get settings with validation
+        default_settings = Settings.get_document_processor_settings()
+
+        # Use provided values or defaults from settings
+        self.chunk_size = chunk_size if chunk_size is not None else default_settings[
+            'chunk_size']
+        self.chunk_overlap = chunk_overlap if chunk_overlap is not None else default_settings[
+            'chunk_overlap']
+        self.max_file_size = max_file_size if max_file_size is not None else default_settings[
+            'max_file_size']
+        self.supported_formats = supported_formats if supported_formats is not None else default_settings[
+            'supported_formats']
+
+        # Validate settings
+        self._validate_settings()
+
+        # Initialize existing components
         self.processing_queue = Queue()
         self.processed_docs = {}
         self._initialize_text_splitter()
-
-        # Initialize Excel processor
         self.excel_processor = EnhancedExcelProcessor()
+
-        # Check for required packages
+        # Check for required packages (keep existing functionality)
         try:
             import striprtf.striprtf
         except ImportError:
-            logging.warning(
+            logging.warning(
+                "Warning: striprtf package not found. RTF support will be limited.")
+
         try:
             from bs4 import BeautifulSoup
             import lxml
         except ImportError:
-            logging.warning(
+            logging.warning(
+                "Warning: beautifulsoup4 or lxml package not found. XML support will be limited.")
+
+    def _validate_settings(self):
+        """Validate and adjust settings if necessary"""
+        # Ensure chunk_size is positive and reasonable
+        self.chunk_size = max(100, self.chunk_size)
+
+        # Ensure chunk_overlap is less than chunk_size
+        self.chunk_overlap = min(self.chunk_overlap, self.chunk_size - 50)
+
+        # Ensure max_file_size is reasonable (minimum 1MB)
+        self.max_file_size = max(1024 * 1024, self.max_file_size)
+
+        # Ensure supported_formats contains valid extensions
+        if not self.supported_formats:
+            # Fall back to default supported formats if empty
+            self.supported_formats = Settings.DOCUMENT_PROCESSOR['supported_formats']
+
+        # Ensure all formats start with a dot
+        self.supported_formats = [
+            f".{fmt.lower().lstrip('.')}" if not fmt.startswith(
+                '.') else fmt.lower()
+            for fmt in self.supported_formats
+        ]
 
     def _initialize_text_splitter(self):
         """Initialize the text splitter with custom settings"""
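With the new constructor, anything not passed explicitly is resolved from `Settings.get_document_processor_settings()` and then clamped by `_validate_settings()`. A usage sketch with illustrative values (not the project defaults), assuming the project's `Settings` class is importable:

```python
# Explicit values override the configured defaults; anything omitted
# falls back to Settings.get_document_processor_settings().
processor = DocumentProcessor(
    chunk_size=1000,     # clamped to at least 100 by _validate_settings()
    chunk_overlap=200,   # clamped to at most chunk_size - 50
    supported_formats=["pdf", "docx"],  # normalized to ['.pdf', '.docx']
)
assert processor.chunk_overlap <= processor.chunk_size - 50
```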
@@ -58,13 +109,241 @@ class DocumentProcessor:
             chunk_size=self.chunk_size,
             chunk_overlap=self.chunk_overlap,
             length_function=len,
+            # Modify separators to better handle markdown while maintaining overlap
+            separators=["\n\n", "\n", " ", ""],
+            keep_separator=True,
+            add_start_index=True,
+            strip_whitespace=False  # Keep whitespace to maintain markdown formatting
         )
 
+    def split_text(self, text: str) -> List[str]:
+        """Split text with enforced overlap while preserving structure"""
+        try:
+            # Get initial split using RecursiveCharacterTextSplitter
+            initial_chunks = self.text_splitter.split_text(text)
+            if len(initial_chunks) <= 1:
+                return initial_chunks
+
+            # Process chunks with enforced overlap
+            final_chunks = []
+
+            for i, current_chunk in enumerate(initial_chunks):
+                if i == 0:
+                    final_chunks.append(current_chunk)
+                    continue
+
+                prev_chunk = final_chunks[-1]
+
+                # Get the last part of previous chunk for overlap
+                overlap_size = min(self.chunk_overlap, len(prev_chunk))
+                overlap_text = prev_chunk[-overlap_size:]
+
+                # For tables, include the header row
+                if '|' in current_chunk and '\n' in current_chunk:
+                    table_lines = current_chunk.split('\n')
+                    header_lines = []
+                    for line in table_lines:
+                        if line.strip().startswith('|'):
+                            header_lines.append(line)
+                        else:
+                            break
+                    if header_lines:
+                        header_text = '\n'.join(header_lines) + '\n'
+                        overlap_text = header_text + overlap_text
+
+                # Create new chunk with overlap
+                new_chunk = overlap_text + current_chunk
+
+                # Ensure we don't have duplicate content at the overlap point
+                if current_chunk.startswith(overlap_text):
+                    new_chunk = current_chunk
+
+                # Add context from previous chunk when needed
+                if not any(marker in new_chunk for marker in ['**AGENDA**', '**DISCUSSIONS**', '| No |']):
+                    context_markers = ['**AGENDA**',
+                                       '**DISCUSSIONS**', '| No |']
+                    for marker in context_markers:
+                        if marker in prev_chunk and marker not in new_chunk:
+                            new_chunk = marker + "\n" + new_chunk
+                            break
+
+                final_chunks.append(new_chunk)
+
+            # Validate and log overlaps
+            for i in range(len(final_chunks)-1):
+                actual_overlap = self._find_actual_overlap(
+                    final_chunks[i], final_chunks[i+1])
+                logging.debug(
+                    f"Overlap between chunks {i} and {i+1}: {len(actual_overlap)} characters")
+                if len(actual_overlap) < self.chunk_overlap:
+                    logging.warning(
+                        f"Insufficient overlap between chunks {i} and {i+1}")
+
+            return final_chunks
+
+            # NOTE: the block below is unreachable (it follows the return above)
+            # and references names (table_sections, chunks, current_position,
+            # _get_overlap_text, _ensure_minimum_overlap) that are not defined
+            # in this class; it appears to be a leftover from an earlier
+            # table-aware splitting path.
+            for start, end in table_sections:
+                # Process text before table if exists
+                if start > current_position:
+                    non_table_text = text[current_position:start]
+                    if non_table_text.strip():
+                        text_chunks = self.text_splitter.split_text(
+                            non_table_text)
+                        if chunks and text_chunks:
+                            # Ensure overlap with previous chunk
+                            prev_chunk = chunks[-1]
+                            overlap = self._get_overlap_text(prev_chunk)
+                            text_chunks[0] = overlap + text_chunks[0]
+                        chunks.extend(text_chunks)
+
+                # Process table as a single chunk with overlap
+                table_text = text[start:end]
+                if chunks:
+                    prev_chunk = chunks[-1]
+                    overlap = self._get_overlap_text(prev_chunk)
+                    table_text = overlap + table_text
+                chunks.append(table_text)
+                current_position = end
+
+            # Process remaining text after last table
+            if current_position < len(text):
+                remaining_text = text[current_position:]
+                if remaining_text.strip():
+                    text_chunks = self.text_splitter.split_text(remaining_text)
+                    if chunks and text_chunks:
+                        # Ensure overlap with previous chunk
+                        prev_chunk = chunks[-1]
+                        overlap = self._get_overlap_text(prev_chunk)
+                        text_chunks[0] = overlap + text_chunks[0]
+                    chunks.extend(text_chunks)
+
+            # Validate and adjust overlaps
+            chunks = self._ensure_minimum_overlap(chunks)
+
+            # Log chunk details for debugging
+            for i in range(len(chunks)-1):
+                overlap = self._find_actual_overlap(chunks[i], chunks[i+1])
+                logging.debug(
+                    f"Overlap between chunks {i} and {i+1}: {len(overlap)} characters")
+                logging.debug(f"End of chunk {i}: {chunks[i][-50:]}")
+                logging.debug(f"Start of chunk {i+1}: {chunks[i+1][:50]}")
+
+            return chunks
+
+        except Exception as e:
+            logging.error(f"Error in split_text: {str(e)}")
+            # Fall back to the original text splitter
+            return self.text_splitter.split_text(text)
+
+    def _find_break_point(self, text: str, prev_chunk: str) -> int:
+        """
+        Find suitable breaking point that maintains document structure
+
+        Args:
+            text (str): Text to find break point in (the overlap portion)
+            prev_chunk (str): The complete previous chunk for context
+
+        Returns:
+            int: Position of suitable break point
+        """
+        # Get the context of how the previous chunk ends
+        prev_chunk_lines = prev_chunk.split('\n')
+
+        # Special handling for markdown tables
+        if '|' in prev_chunk:
+            # Check if we're in the middle of a table
+            table_rows = [
+                line for line in prev_chunk_lines if line.strip().startswith('|')]
+            if table_rows:
+                # Find where the current table starts in the text
+                table_start = text.find('|')
+                if table_start >= 0:
+                    # Find the next row boundary
+                    next_row = text.find('\n', table_start)
+                    if next_row >= 0:
+                        return next_row + 1  # Include the newline
+
+        # Define break point markers in order of preference
+        break_markers = [
+            ('\n\n', True),  # Paragraph breaks (keep marker)
+            ('\n', True),    # Line breaks (keep marker)
+            ('. ', True),    # Sentence endings (keep marker)
+            (', ', True),    # Clause breaks (keep marker)
+            (' ', False)     # Word breaks (don't keep marker)
+        ]
+
+        # Check the structure of the previous chunk end
+        last_line = prev_chunk_lines[-1] if prev_chunk_lines else ""
+
+        # Look for each type of break point
+        for marker, keep_marker in break_markers:
+            if marker in text:
+                # Try to find a break point that maintains document structure
+                marker_positions = [i for i in range(
+                    len(text)) if text[i:i+len(marker)] == marker]
+
+                for pos in reversed(marker_positions):
+                    # Check if this break point would maintain document structure
+                    if self._is_valid_break_point(text, pos, last_line):
+                        return pos + (len(marker) if keep_marker else 0)
+
+        # If no suitable break point found, default to exact position
+        return min(len(text), self.chunk_overlap)
+
+    def _is_valid_break_point(self, text: str, position: int, last_line: str) -> bool:
+        """
+        Check if a break point would maintain document structure
+
+        Args:
+            text (str): Text being checked
+            position (int): Potential break position
+            last_line (str): Last line of previous chunk
+
+        Returns:
+            bool: True if break point is valid
+        """
+        # Don't break in the middle of markdown formatting
+        markdown_markers = ['*', '_', '`', '[', ']', '(', ')', '#']
+        if position > 0 and position < len(text) - 1:
+            if text[position-1] in markdown_markers or text[position+1] in markdown_markers:
+                return False
+
+        # Don't break in the middle of a table cell
+        if '|' in last_line:
+            cell_count = last_line.count('|')
+            text_before_break = text[:position]
+            if text_before_break.count('|') % cell_count != 0:
+                return False
+
+        # Don't break URLs or code blocks
+        url_patterns = ['http://', 'https://', '```', '`']
+        for pattern in url_patterns:
+            if pattern in text[:position] and pattern not in text[position:]:
+                return False
+
+        return True
+
+    def _validate_chunks(self, original_text: str, chunks: List[str]) -> bool:
+        """Validate that chunks maintain document integrity"""
+        try:
+            # Remove overlap to check content
+            reconstructed = chunks[0]
+            for chunk in chunks[1:]:
+                if len(chunk) > self.chunk_overlap:
+                    reconstructed += chunk[self.chunk_overlap:]
+
+            # Clean both texts for comparison (remove extra whitespace)
+            clean_original = ' '.join(original_text.split())
+            clean_reconstructed = ' '.join(reconstructed.split())
+
+            return clean_original == clean_reconstructed
+        except Exception as e:
+            logging.error(f"Error validating chunks: {str(e)}")
+            return False
+
     def _extract_content(self, file_path: Path) -> str:
         """Extract content from different file formats"""
         suffix = file_path.suffix.lower()
+
         try:
             if suffix == '.pdf':
                 return self._extract_pdf(file_path)
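`split_text` enforces the configured overlap by prepending the tail of the previous chunk to the next one. The overlap check it validates against, reduced to a self-contained function:

```python
def overlap_size(prev_chunk: str, next_chunk: str) -> int:
    """Length of the longest suffix of prev_chunk that prefixes next_chunk."""
    for size in range(min(len(prev_chunk), len(next_chunk)), 0, -1):
        if prev_chunk[-size:] == next_chunk[:size]:
            return size
    return 0

chunks = ["alpha beta gamma", "gamma delta epsilon"]
assert overlap_size(chunks[0], chunks[1]) == len("gamma")
```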
@@ -87,7 +366,8 @@ class DocumentProcessor:
         else:
             raise ValueError(f"Unsupported format: {suffix}")
     except Exception as e:
-        raise Exception(
+        raise Exception(
+            f"Error extracting content from {file_path}: {str(e)}")
 
     def _extract_text(self, file_path: Path) -> str:
         """Extract content from text-based files"""
@@ -104,31 +384,31 @@ class DocumentProcessor:
         with open(file_path, 'rb') as file:
             reader = PyPDF2.PdfReader(file)
             metadata = reader.metadata
+
             for page in reader.pages:
                 text += page.extract_text() + "\n\n"
+
                 # Extract images if available
                 if '/XObject' in page['/Resources']:
                     for obj in page['/Resources']['/XObject'].get_object():
                         if page['/Resources']['/XObject'][obj]['/Subtype'] == '/Image':
                             pass
+
         return text.strip()
 
     def _extract_docx(self, file_path: Path) -> str:
         """Extract text from DOCX with formatting"""
         doc = docx.Document(file_path)
         full_text = []
+
         for para in doc.paragraphs:
             full_text.append(para.text)
+
         for table in doc.tables:
             for row in table.rows:
                 row_text = [cell.text for cell in row.cells]
                 full_text.append(" | ".join(row_text))
+
         return "\n\n".join(full_text)
 
     def _extract_csv(self, file_path: Path) -> str:
@@ -146,10 +426,10 @@ class DocumentProcessor:
         """Extract text from HTML with structure preservation"""
         with open(file_path) as f:
             soup = BeautifulSoup(f, 'html.parser')
+
         for script in soup(["script", "style"]):
             script.decompose()
+
         text = soup.get_text(separator='\n')
         lines = [line.strip() for line in text.splitlines() if line.strip()]
         return "\n\n".join(lines)
@@ -159,12 +439,13 @@ class DocumentProcessor:
         try:
             with open(file_path, 'r', encoding='utf-8') as f:
                 soup = BeautifulSoup(f, 'xml')
+
             for pi in soup.find_all(text=lambda text: isinstance(text, ProcessingInstruction)):
                 pi.extract()
+
             text = soup.get_text(separator='\n')
-            lines = [line.strip()
+            lines = [line.strip()
+                     for line in text.splitlines() if line.strip()]
             return "\n\n".join(lines)
         except Exception as e:
             raise Exception(f"Error processing XML file: {str(e)}")
@@ -173,12 +454,13 @@ class DocumentProcessor:
         """Extract text from RTF files"""
         try:
             import striprtf.striprtf as striprtf
+
             with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                 rtf_text = f.read()
+
             plain_text = striprtf.rtf_to_text(rtf_text)
-            lines = [line.strip()
+            lines = [line.strip()
+                     for line in plain_text.splitlines() if line.strip()]
             return "\n\n".join(lines)
         except ImportError:
             raise ImportError("striprtf package is required for RTF support.")
@@ -190,14 +472,15 @@ class DocumentProcessor:
         try:
             # Use enhanced Excel processor
             processed_content = self.excel_processor.process_excel(file_path)
+
             # If processing fails, fall back to basic processing
             if not processed_content:
-                logging.warning(
+                logging.warning(
+                    f"Enhanced Excel processing failed for {file_path}, falling back to basic processing")
                 return self._basic_excel_extract(file_path)
+
             return processed_content
+
         except Exception as e:
             logging.error(f"Error in enhanced Excel processing: {str(e)}")
             # Fall back to basic Excel processing
@@ -208,12 +491,12 @@ class DocumentProcessor:
         try:
             excel_file = pd.ExcelFile(file_path)
             sheets_data = []
+
             for sheet_name in excel_file.sheet_names:
                 df = pd.read_excel(excel_file, sheet_name=sheet_name)
                 sheet_content = f"\nSheet: {sheet_name}\n"
                 sheet_content += "=" * (len(sheet_name) + 7) + "\n"
+
                 if df.empty:
                     sheet_content += "Empty Sheet\n"
                 else:
@@ -223,11 +506,11 @@ class DocumentProcessor:
                         max_cols=None,
                         line_width=120
                     ) + "\n"
+
                 sheets_data.append(sheet_content)
+
             return "\n\n".join(sheets_data)
+
         except Exception as e:
             raise Exception(f"Error in basic Excel processing: {str(e)}")
 
@@ -239,7 +522,7 @@ class DocumentProcessor:
     ) -> Dict:
         """Generate comprehensive metadata"""
         file_stat = file_path.stat()
+
         metadata = {
             'filename': file_path.name,
             'file_type': file_path.suffix,
@@ -252,7 +535,7 @@ class DocumentProcessor:
             'character_count': len(content),
             'processing_timestamp': datetime.now().isoformat()
         }
+
         # Add Excel-specific metadata if applicable
         if file_path.suffix.lower() in ['.xlsx', '.xls']:
             try:
@@ -261,32 +544,42 @@ class DocumentProcessor:
                 metadata.update({'excel_metadata': excel_metadata})
             except Exception as e:
                 logging.warning(f"Could not extract Excel metadata: {str(e)}")
+
         if additional_metadata:
             metadata.update(additional_metadata)
+
         return metadata
 
     def _calculate_hash(self, text: str) -> str:
         """Calculate SHA-256 hash of text"""
         return hashlib.sha256(text.encode()).hexdigest()
 
-    async def process_document(
-        self,
-        file_path: Union[str, Path],
-        metadata: Optional[Dict] = None
-    ) -> Dict:
+    async def process_document(self, file_path: Union[str, Path], metadata: Optional[Dict] = None) -> Dict:
         """Process a document with metadata and content extraction"""
         file_path = Path(file_path)
+
         if not self._validate_file(file_path):
             raise ValueError(f"Invalid file: {file_path}")
 
         content = self._extract_content(file_path)
         doc_metadata = self._generate_metadata(file_path, content, metadata)
+
+        # Try enhanced splitting with validation
+        chunks = self.split_text(content)
+        if not self._validate_chunks(content, chunks):
+            logging.warning(
+                "Enhanced splitting failed validation, falling back to original splitter")
+            chunks = self.text_splitter.split_text(content)
+
+        # Add logging to verify chunk overlap
+        for i in range(len(chunks)-1):
+            logging.debug(f"Chunk {i} ends with: {chunks[i][-50:]}")
+            logging.debug(f"Chunk {i+1} starts with: {chunks[i+1][:50]}")
+            logging.debug(
+                f"Overlap size: {self._calculate_overlap_size(chunks[i], chunks[i+1])} characters")
+
         chunk_hashes = [self._calculate_hash(chunk) for chunk in chunks]
+
         return {
             'content': content,
             'chunks': chunks,
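`process_document` chains validation, extraction, overlap-aware splitting with a fallback, and per-chunk hashing. A minimal driver sketch (the input path is hypothetical):

```python
import asyncio

async def ingest(processor: DocumentProcessor) -> None:
    # "docs/minutes.docx" is a hypothetical input file
    result = await processor.process_document("docs/minutes.docx")
    print(f"Extracted {len(result['chunks'])} chunks "
          f"from {len(result['content'])} characters")

# asyncio.run(ingest(DocumentProcessor()))
```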
@@ -295,20 +588,28 @@ class DocumentProcessor:
             'statistics': self._generate_statistics(content, chunks)
         }
 
+    def _calculate_overlap_size(self, chunk1: str, chunk2: str) -> int:
+        """Calculate the size of overlap between two chunks"""
+        min_len = min(len(chunk1), len(chunk2))
+        for i in range(min_len, 0, -1):
+            if chunk1[-i:] == chunk2[:i]:
+                return i
+        return 0
+
     def _validate_file(self, file_path: Path) -> bool:
         """Validate file type, size, and content"""
         if not file_path.exists():
             raise FileNotFoundError(f"File not found: {file_path}")
+
         if file_path.suffix.lower() not in self.supported_formats:
             raise ValueError(f"Unsupported file format: {file_path.suffix}")
+
         if file_path.stat().st_size > self.max_file_size:
             raise ValueError(f"File too large: {file_path}")
+
         if file_path.stat().st_size == 0:
             raise ValueError(f"Empty file: {file_path}")
+
         return True
 
     def _generate_statistics(self, content: str, chunks: List[str]) -> Dict:
@@ -328,7 +629,7 @@ class DocumentProcessor:
     ) -> Dict[str, Dict]:
         """Process multiple documents in parallel"""
         results = {}
+
         if parallel:
             threads = []
             for file_path in file_paths:
@@ -338,13 +639,13 @@ class DocumentProcessor:
                 )
                 threads.append(thread)
                 thread.start()
+
             for thread in threads:
                 thread.join()
         else:
             for file_path in file_paths:
                 await self._process_and_store(file_path, results)
+
         return results
 
     async def _process_and_store(
@@ -357,4 +658,4 @@ class DocumentProcessor:
             result = await self.process_document(file_path)
             results[str(file_path)] = result
         except Exception as e:
-            results[str(file_path)] = {'error': str(e)}
+            results[str(file_path)] = {'error': str(e)}
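The batch path isolates failures per file, storing `{'error': ...}` instead of raising. A sketch of the sequential branch; the batch method's actual name sits outside the shown hunks, so `batch_process` here is an assumed name:

```python
import asyncio

async def ingest_folder(processor: DocumentProcessor) -> None:
    results = await processor.batch_process(  # hypothetical method name
        ["docs/a.pdf", "docs/b.xlsx"],        # hypothetical paths
        parallel=False,
    )
    for path, result in results.items():
        status = "failed: " + result["error"] if "error" in result else "ok"
        print(path, status)
```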
src/vectorstores/__pycache__/chroma_vectorstore.cpython-312.pyc
CHANGED
Binary files a/src/vectorstores/__pycache__/chroma_vectorstore.cpython-312.pyc and b/src/vectorstores/__pycache__/chroma_vectorstore.cpython-312.pyc differ
src/vectorstores/chroma_vectorstore.py
CHANGED
@@ -6,17 +6,18 @@ import logging
 
 from .base_vectorstore import BaseVectorStore
 
+
 class ChromaVectorStore(BaseVectorStore):
     def __init__(
-        self,
-        embedding_function: Callable[[List[str]], List[List[float]]],
+        self,
+        embedding_function: Callable[[List[str]], List[List[float]]],
         persist_directory: str = './chroma_db',
         collection_name: str = "documents",
         client_settings: Optional[Dict[str, Any]] = None
     ):
         """
         Initialize Chroma Vector Store
+
         Args:
             embedding_function (Callable): Function to generate embeddings
             persist_directory (str): Directory to persist the vector store
@@ -31,23 +32,24 @@ class ChromaVectorStore(BaseVectorStore):
             self.client = chromadb.PersistentClient(settings=settings)
             self.collection = self.client.get_or_create_collection(
                 name=collection_name,
+                # Using cosine similarity by default
+                metadata={"hnsw:space": "cosine"}
             )
             self.embedding_function = embedding_function
         except Exception as e:
             logging.error(f"Error initializing ChromaDB: {str(e)}")
             raise
+
     def add_documents(
-        self,
-        documents: List[str],
+        self,
+        documents: List[str],
         embeddings: Optional[List[List[float]]] = None,
         metadatas: Optional[List[Dict[str, Any]]] = None,
         ids: Optional[List[str]] = None
     ) -> None:
         """
         Add documents to the vector store
+
         Args:
             documents (List[str]): List of document texts
             embeddings (Optional[List[List[float]]]): Pre-computed embeddings
@@ -63,32 +65,35 @@ class ChromaVectorStore(BaseVectorStore):
                 embeddings = self.embedding_function(documents)
 
             if len(documents) != len(embeddings):
-                raise ValueError(
+                raise ValueError(
+                    "Number of documents and embeddings must match")
+
             # Use provided IDs or generate them
-            doc_ids = ids if ids is not None else [
+            doc_ids = ids if ids is not None else [
+                f"doc_{i}" for i in range(len(documents))]
+
             # Prepare add parameters
             add_params = {
                 "documents": documents,
                 "embeddings": embeddings,
                 "ids": doc_ids
             }
+
             # Only include metadatas if provided
             if metadatas is not None:
                 if len(metadatas) != len(documents):
-                    raise ValueError(
+                    raise ValueError(
+                        "Number of documents and metadatas must match")
                 add_params["metadatas"] = metadatas
+
             self.collection.add(**add_params)
         except Exception as e:
             logging.error(f"Error adding documents to ChromaDB: {str(e)}")
             raise
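A round-trip sketch for the updated store; `toy_embed` is a stand-in for a real embedding model, and the IDs deliberately follow the `{document_id}-chunk-{i}` scheme that `delete_document` reconstructs later in this file:

```python
# Toy embedding function: maps each text to a fixed-size vector of its length.
# Real usage supplies a model-backed callable with the same signature.
def toy_embed(texts):
    return [[float(len(t)), 0.0, 0.0] for t in texts]

store = ChromaVectorStore(embedding_function=toy_embed,
                          persist_directory="./chroma_db")
store.add_documents(
    documents=["first doc", "second doc"],
    metadatas=[{"document_id": "d1", "chunk_index": 0},
               {"document_id": "d1", "chunk_index": 1}],
    ids=["d1-chunk-0", "d1-chunk-1"],
)
```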
+
     def similarity_search(
-        self,
-        query_embedding: List[float],
+        self,
+        query_embedding: List[float],
         top_k: int = 3,
         **kwargs
     ) -> List[Dict[str, Any]]:
@@ -102,21 +107,24 @@ class ChromaVectorStore(BaseVectorStore):
                 n_results=10,  # Get more initial results
                 include=['documents', 'metadatas', 'distances']
             )
+
             if not results or 'documents' not in results or not results['documents']:
                 logging.warning("No results found in similarity search")
                 return []
+
             formatted_results = []
             documents = results['documents'][0]  # First query's results
-            metadatas = results['metadatas'][0] if results.get('metadatas') else [
+            metadatas = results['metadatas'][0] if results.get('metadatas') else [
+                None] * len(documents)
+            distances = results['distances'][0] if results.get('distances') else [
+                None] * len(documents)
+
             # Process all results
             for doc, meta, dist in zip(documents, metadatas, distances):
                 # Convert distance to similarity score (1 is most similar, 0 is least)
-                similarity_score = 1.0 -
+                similarity_score = 1.0 - \
+                    (dist or 0.0) if dist is not None else None
+
                 # More permissive threshold and include all results for filtering
                 if similarity_score is not None and similarity_score > 0.2:  # Lower threshold
                     formatted_results.append({
@@ -124,45 +132,47 @@ class ChromaVectorStore(BaseVectorStore):
                     'metadata': meta or {},
                     'score': similarity_score
                 })
+
             # Sort by score and get top_k results
             formatted_results.sort(key=lambda x: x['score'] or 0, reverse=True)
+
             # Check if results are from same document and get consecutive chunks
             if formatted_results:
-                first_doc_id = formatted_results[0]['metadata'].get(
+                first_doc_id = formatted_results[0]['metadata'].get(
+                    'document_id')
                 all_chunks_same_doc = []
+
                 # Get all chunks from the same document
                 for result in formatted_results:
                     if result['metadata'].get('document_id') == first_doc_id:
                         all_chunks_same_doc.append(result)
+
                 # Sort chunks by their index to maintain document flow
                 all_chunks_same_doc.sort(
                     key=lambda x: x['metadata'].get('chunk_index', 0)
                 )
+
                 # Return either all chunks from same document or top_k results
                 if len(all_chunks_same_doc) > 0:
                     return all_chunks_same_doc[:top_k]
+
             return formatted_results[:top_k]
+
         except Exception as e:
-            logging.error(
+            logging.error(
+                f"Error performing similarity search in ChromaDB: {str(e)}")
             raise
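With `hnsw:space` set to `cosine`, Chroma reports distances where 0.0 means identical, so `1.0 - distance` yields the similarity compared against the 0.2 threshold. A worked check of that conversion:

```python
def to_similarity(distance):
    # Mirrors the conversion in similarity_search: None distances stay None
    return 1.0 - (distance or 0.0) if distance is not None else None

assert to_similarity(0.0) == 1.0    # identical vectors
assert to_similarity(0.7) > 0.2     # kept by the threshold
assert to_similarity(0.9) < 0.2     # filtered out
assert to_similarity(None) is None
```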
+
     def get_all_documents(
         self,
         include_embeddings: bool = False
     ) -> List[Dict[str, Any]]:
         """
         Retrieve all documents from the vector store
+
         Args:
             include_embeddings (bool): Whether to include embeddings in the response
+
         Returns:
             List[Dict[str, Any]]: List of documents with their IDs and optionally embeddings
         """
@@ -170,45 +180,46 @@ class ChromaVectorStore(BaseVectorStore):
             include = ["documents", "metadatas"]
             if include_embeddings:
                 include.append("embeddings")
+
             results = self.collection.get(
                 include=include
             )
+
             if not results or 'documents' not in results:
                 return []
+
             documents = []
             for i in range(len(results['documents'])):
                 doc = {
                     'id': str(i),  # Generate sequential IDs
                     'text': results['documents'][i],
                 }
+
                 if include_embeddings and 'embeddings' in results:
                     doc['embedding'] = results['embeddings'][i]
+
                 if 'metadatas' in results and results['metadatas'][i]:
                     doc['metadata'] = results['metadatas'][i]
+
                     # Use document_id from metadata if available
                     if 'document_id' in results['metadatas'][i]:
                         doc['id'] = results['metadatas'][i]['document_id']
+
                 documents.append(doc)
+
             return documents
         except Exception as e:
-            logging.error(
+            logging.error(
+                f"Error retrieving documents from ChromaDB: {str(e)}")
             raise
+
     def get_document_chunks(self, document_id: str) -> List[Dict[str, Any]]:
         """
         Retrieve all chunks for a specific document
+
         Args:
             document_id (str): ID of the document to retrieve chunks for
+
         Returns:
             List[Dict[str, Any]]: List of document chunks with their metadata
         """
@@ -217,10 +228,10 @@ class ChromaVectorStore(BaseVectorStore):
                 where={"document_id": document_id},
                 include=["documents", "metadatas"]
             )
+
             if not results or 'documents' not in results:
                 return []
+
             chunks = []
             for i in range(len(results['documents'])):
                 chunk = {
@@ -228,10 +239,11 @@ class ChromaVectorStore(BaseVectorStore):
                     'metadata': results['metadatas'][i] if results.get('metadatas') else None
                 }
                 chunks.append(chunk)
+
             # Sort by chunk_index if available; guard against a metadata of None,
             # which x.get('metadata', {}) alone would not handle
-            chunks.sort(key=lambda x: x.get(
+            chunks.sort(key=lambda x: (x.get('metadata') or {}).get('chunk_index', 0))
+
             return chunks
         except Exception as e:
             logging.error(f"Error retrieving document chunks: {str(e)}")
@@ -240,7 +252,7 @@ class ChromaVectorStore(BaseVectorStore):
     def delete_document(self, document_id: str) -> None:
         """
         Delete all chunks associated with a document_id
+
         Args:
             document_id (str): ID of the document to delete
         """
@@ -250,15 +262,17 @@ class ChromaVectorStore(BaseVectorStore):
                 where={"document_id": document_id},
                 include=["metadatas"]
             )
+
             if not results or 'ids' not in results:
                 logging.warning(f"No document found with ID: {document_id}")
                 return
+
             # Delete all chunks associated with the document
+            chunk_ids = [
+                f"{document_id}-chunk-{i}" for i in range(len(results['metadatas']))]
             self.collection.delete(ids=chunk_ids)
+
         except Exception as e:
+            logging.error(
+                f"Error deleting document {document_id} from ChromaDB: {str(e)}")
+            raise
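`delete_document` does not read chunk IDs back from the query; it rebuilds them from the `{document_id}-chunk-{i}` convention, so ingestion must use the same scheme. Continuing the `store` instance from the earlier sketch:

```python
doc_id = "d1"
chunks = ["first chunk", "second chunk"]

# IDs must follow the convention delete_document() later reconstructs.
store.add_documents(
    documents=chunks,
    metadatas=[{"document_id": doc_id, "chunk_index": i}
               for i in range(len(chunks))],
    ids=[f"{doc_id}-chunk-{i}" for i in range(len(chunks))],
)

store.delete_document(doc_id)  # removes d1-chunk-0 and d1-chunk-1
```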
temp_downloads/1K608-Qr03M6nf5FhB6AajbHm8kjQujx1.xlsx
CHANGED
Binary files a/temp_downloads/1K608-Qr03M6nf5FhB6AajbHm8kjQujx1.xlsx and b/temp_downloads/1K608-Qr03M6nf5FhB6AajbHm8kjQujx1.xlsx differ