Spaces:
Sleeping
Sleeping
Commit
·
c2e0ca0
1
Parent(s):
11eddc5
Updated embedding to production, changed the dependency for magic library
Browse files
config/config.py
CHANGED
|
@@ -22,8 +22,17 @@ class Settings:
|
|
| 22 |
# Anthropic Configuration
|
| 23 |
ANTHROPIC_API_KEY = os.getenv('ANTHROPIC_API_KEY', '')
|
| 24 |
|
|
|
|
|
|
|
|
|
|
| 25 |
# Embedding Configuration
|
| 26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
|
| 28 |
# Vector Store Configuration
|
| 29 |
CHROMA_PATH = os.getenv('CHROMA_PATH', './chroma_db')
|
|
|
|
| 22 |
# Anthropic Configuration
|
| 23 |
ANTHROPIC_API_KEY = os.getenv('ANTHROPIC_API_KEY', '')
|
| 24 |
|
| 25 |
+
# Environment Configuration
|
| 26 |
+
ENVIRONMENT = os.getenv('ENVIRONMENT').lower()
|
| 27 |
+
|
| 28 |
# Embedding Configuration
|
| 29 |
+
@property
|
| 30 |
+
def EMBEDDING_MODEL(self):
|
| 31 |
+
if self.ENVIRONMENT == 'production':
|
| 32 |
+
# Better model for demos
|
| 33 |
+
return os.getenv('EMBEDDING_MODEL', 'text-embedding-3-large')
|
| 34 |
+
# Better for development purposes.
|
| 35 |
+
return os.getenv('EMBEDDING_MODEL', 'all-MiniLM-L6-v2')
|
| 36 |
|
| 37 |
# Vector Store Configuration
|
| 38 |
CHROMA_PATH = os.getenv('CHROMA_PATH', './chroma_db')
|
constraints.txt
DELETED
|
@@ -1 +0,0 @@
|
|
| 1 |
-
python-magic-bin==0.4.14
|
|
|
|
|
|
requirements.txt
CHANGED
|
@@ -1,4 +1,3 @@
|
|
| 1 |
-
--constraint constraints.txt
|
| 2 |
fastapi
|
| 3 |
uvicorn
|
| 4 |
torch
|
|
|
|
|
|
|
| 1 |
fastapi
|
| 2 |
uvicorn
|
| 3 |
torch
|
src/agents/__pycache__/rag_agent.cpython-312.pyc
CHANGED
|
Binary files a/src/agents/__pycache__/rag_agent.cpython-312.pyc and b/src/agents/__pycache__/rag_agent.cpython-312.pyc differ
|
|
|
src/agents/__pycache__/system_instructions_rag.cpython-312.pyc
CHANGED
|
Binary files a/src/agents/__pycache__/system_instructions_rag.cpython-312.pyc and b/src/agents/__pycache__/system_instructions_rag.cpython-312.pyc differ
|
|
|
src/utils/__pycache__/document_processor.cpython-312.pyc
CHANGED
|
Binary files a/src/utils/__pycache__/document_processor.cpython-312.pyc and b/src/utils/__pycache__/document_processor.cpython-312.pyc differ
|
|
|
src/utils/document_processor.py
CHANGED
|
@@ -6,7 +6,7 @@ import pandas as pd
|
|
| 6 |
import json
|
| 7 |
from pathlib import Path
|
| 8 |
import hashlib
|
| 9 |
-
import
|
| 10 |
from bs4 import BeautifulSoup
|
| 11 |
import csv
|
| 12 |
from datetime import datetime
|
|
@@ -514,6 +514,46 @@ class DocumentProcessor:
|
|
| 514 |
except Exception as e:
|
| 515 |
raise Exception(f"Error in basic Excel processing: {str(e)}")
|
| 516 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 517 |
def _generate_metadata(
|
| 518 |
self,
|
| 519 |
file_path: Path,
|
|
@@ -530,7 +570,7 @@ class DocumentProcessor:
|
|
| 530 |
'created_at': datetime.fromtimestamp(file_stat.st_ctime),
|
| 531 |
'modified_at': datetime.fromtimestamp(file_stat.st_mtime),
|
| 532 |
'content_hash': self._calculate_hash(content),
|
| 533 |
-
'mime_type':
|
| 534 |
'word_count': len(content.split()),
|
| 535 |
'character_count': len(content),
|
| 536 |
'processing_timestamp': datetime.now().isoformat()
|
|
@@ -550,6 +590,42 @@ class DocumentProcessor:
|
|
| 550 |
|
| 551 |
return metadata
|
| 552 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 553 |
def _calculate_hash(self, text: str) -> str:
|
| 554 |
"""Calculate SHA-256 hash of text"""
|
| 555 |
return hashlib.sha256(text.encode()).hexdigest()
|
|
|
|
| 6 |
import json
|
| 7 |
from pathlib import Path
|
| 8 |
import hashlib
|
| 9 |
+
import mimetypes # Add this instead
|
| 10 |
from bs4 import BeautifulSoup
|
| 11 |
import csv
|
| 12 |
from datetime import datetime
|
|
|
|
| 514 |
except Exception as e:
|
| 515 |
raise Exception(f"Error in basic Excel processing: {str(e)}")
|
| 516 |
|
| 517 |
+
def _get_mime_type(self, file_path: Path) -> str:
|
| 518 |
+
"""
|
| 519 |
+
Get MIME type for a file based on its extension
|
| 520 |
+
|
| 521 |
+
Args:
|
| 522 |
+
file_path (Path): Path to the file
|
| 523 |
+
|
| 524 |
+
Returns:
|
| 525 |
+
str: MIME type of the file
|
| 526 |
+
"""
|
| 527 |
+
# Standard MIME mappings for supported formats
|
| 528 |
+
MIME_MAPPINGS = {
|
| 529 |
+
'.pdf': 'application/pdf',
|
| 530 |
+
'.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
|
| 531 |
+
'.doc': 'application/msword',
|
| 532 |
+
'.csv': 'text/csv',
|
| 533 |
+
'.json': 'application/json',
|
| 534 |
+
'.html': 'text/html',
|
| 535 |
+
'.txt': 'text/plain',
|
| 536 |
+
'.md': 'text/markdown',
|
| 537 |
+
'.xml': 'text/xml',
|
| 538 |
+
'.rtf': 'application/rtf',
|
| 539 |
+
'.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
|
| 540 |
+
'.xls': 'application/vnd.ms-excel'
|
| 541 |
+
}
|
| 542 |
+
|
| 543 |
+
suffix = file_path.suffix.lower()
|
| 544 |
+
|
| 545 |
+
# Verify the file format is supported
|
| 546 |
+
if suffix not in self.supported_formats:
|
| 547 |
+
logging.warning(f"Unsupported file format: {suffix}")
|
| 548 |
+
return 'application/octet-stream'
|
| 549 |
+
|
| 550 |
+
# Return known MIME type or fall back to mimetypes module
|
| 551 |
+
if suffix in MIME_MAPPINGS:
|
| 552 |
+
return MIME_MAPPINGS[suffix]
|
| 553 |
+
|
| 554 |
+
mime_type = mimetypes.guess_type(str(file_path))[0]
|
| 555 |
+
return mime_type if mime_type else 'application/octet-stream'
|
| 556 |
+
|
| 557 |
def _generate_metadata(
|
| 558 |
self,
|
| 559 |
file_path: Path,
|
|
|
|
| 570 |
'created_at': datetime.fromtimestamp(file_stat.st_ctime),
|
| 571 |
'modified_at': datetime.fromtimestamp(file_stat.st_mtime),
|
| 572 |
'content_hash': self._calculate_hash(content),
|
| 573 |
+
'mime_type': self._get_mime_type(file_path),
|
| 574 |
'word_count': len(content.split()),
|
| 575 |
'character_count': len(content),
|
| 576 |
'processing_timestamp': datetime.now().isoformat()
|
|
|
|
| 590 |
|
| 591 |
return metadata
|
| 592 |
|
| 593 |
+
# def _generate_metadata(
|
| 594 |
+
# self,
|
| 595 |
+
# file_path: Path,
|
| 596 |
+
# content: str,
|
| 597 |
+
# additional_metadata: Optional[Dict] = None
|
| 598 |
+
# ) -> Dict:
|
| 599 |
+
# """Generate comprehensive metadata"""
|
| 600 |
+
# file_stat = file_path.stat()
|
| 601 |
+
|
| 602 |
+
# metadata = {
|
| 603 |
+
# 'filename': file_path.name,
|
| 604 |
+
# 'file_type': file_path.suffix,
|
| 605 |
+
# 'file_size': file_stat.st_size,
|
| 606 |
+
# 'created_at': datetime.fromtimestamp(file_stat.st_ctime),
|
| 607 |
+
# 'modified_at': datetime.fromtimestamp(file_stat.st_mtime),
|
| 608 |
+
# 'content_hash': self._calculate_hash(content),
|
| 609 |
+
# 'mime_type': magic.from_file(str(file_path), mime=True),
|
| 610 |
+
# 'word_count': len(content.split()),
|
| 611 |
+
# 'character_count': len(content),
|
| 612 |
+
# 'processing_timestamp': datetime.now().isoformat()
|
| 613 |
+
# }
|
| 614 |
+
|
| 615 |
+
# # Add Excel-specific metadata if applicable
|
| 616 |
+
# if file_path.suffix.lower() in ['.xlsx', '.xls']:
|
| 617 |
+
# try:
|
| 618 |
+
# if hasattr(self.excel_processor, 'get_metadata'):
|
| 619 |
+
# excel_metadata = self.excel_processor.get_metadata()
|
| 620 |
+
# metadata.update({'excel_metadata': excel_metadata})
|
| 621 |
+
# except Exception as e:
|
| 622 |
+
# logging.warning(f"Could not extract Excel metadata: {str(e)}")
|
| 623 |
+
|
| 624 |
+
# if additional_metadata:
|
| 625 |
+
# metadata.update(additional_metadata)
|
| 626 |
+
|
| 627 |
+
# return metadata
|
| 628 |
+
|
| 629 |
def _calculate_hash(self, text: str) -> str:
|
| 630 |
"""Calculate SHA-256 hash of text"""
|
| 631 |
return hashlib.sha256(text.encode()).hexdigest()
|