Spaces:
Running
Running
Commit
·
c2e0ca0
1
Parent(s):
11eddc5
Updated embedding to production, changed the dependency for magic library
Browse files
config/config.py
CHANGED
@@ -22,8 +22,17 @@ class Settings:
|
|
22 |
# Anthropic Configuration
|
23 |
ANTHROPIC_API_KEY = os.getenv('ANTHROPIC_API_KEY', '')
|
24 |
|
|
|
|
|
|
|
25 |
# Embedding Configuration
|
26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
|
28 |
# Vector Store Configuration
|
29 |
CHROMA_PATH = os.getenv('CHROMA_PATH', './chroma_db')
|
|
|
22 |
# Anthropic Configuration
|
23 |
ANTHROPIC_API_KEY = os.getenv('ANTHROPIC_API_KEY', '')
|
24 |
|
25 |
+
# Environment Configuration
|
26 |
+
ENVIRONMENT = os.getenv('ENVIRONMENT').lower()
|
27 |
+
|
28 |
# Embedding Configuration
|
29 |
+
@property
|
30 |
+
def EMBEDDING_MODEL(self):
|
31 |
+
if self.ENVIRONMENT == 'production':
|
32 |
+
# Better model for demos
|
33 |
+
return os.getenv('EMBEDDING_MODEL', 'text-embedding-3-large')
|
34 |
+
# Better for development purposes.
|
35 |
+
return os.getenv('EMBEDDING_MODEL', 'all-MiniLM-L6-v2')
|
36 |
|
37 |
# Vector Store Configuration
|
38 |
CHROMA_PATH = os.getenv('CHROMA_PATH', './chroma_db')
|
constraints.txt
DELETED
@@ -1 +0,0 @@
|
|
1 |
-
python-magic-bin==0.4.14
|
|
|
|
requirements.txt
CHANGED
@@ -1,4 +1,3 @@
|
|
1 |
-
--constraint constraints.txt
|
2 |
fastapi
|
3 |
uvicorn
|
4 |
torch
|
|
|
|
|
1 |
fastapi
|
2 |
uvicorn
|
3 |
torch
|
src/agents/__pycache__/rag_agent.cpython-312.pyc
CHANGED
Binary files a/src/agents/__pycache__/rag_agent.cpython-312.pyc and b/src/agents/__pycache__/rag_agent.cpython-312.pyc differ
|
|
src/agents/__pycache__/system_instructions_rag.cpython-312.pyc
CHANGED
Binary files a/src/agents/__pycache__/system_instructions_rag.cpython-312.pyc and b/src/agents/__pycache__/system_instructions_rag.cpython-312.pyc differ
|
|
src/utils/__pycache__/document_processor.cpython-312.pyc
CHANGED
Binary files a/src/utils/__pycache__/document_processor.cpython-312.pyc and b/src/utils/__pycache__/document_processor.cpython-312.pyc differ
|
|
src/utils/document_processor.py
CHANGED
@@ -6,7 +6,7 @@ import pandas as pd
|
|
6 |
import json
|
7 |
from pathlib import Path
|
8 |
import hashlib
|
9 |
-
import
|
10 |
from bs4 import BeautifulSoup
|
11 |
import csv
|
12 |
from datetime import datetime
|
@@ -514,6 +514,46 @@ class DocumentProcessor:
|
|
514 |
except Exception as e:
|
515 |
raise Exception(f"Error in basic Excel processing: {str(e)}")
|
516 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
517 |
def _generate_metadata(
|
518 |
self,
|
519 |
file_path: Path,
|
@@ -530,7 +570,7 @@ class DocumentProcessor:
|
|
530 |
'created_at': datetime.fromtimestamp(file_stat.st_ctime),
|
531 |
'modified_at': datetime.fromtimestamp(file_stat.st_mtime),
|
532 |
'content_hash': self._calculate_hash(content),
|
533 |
-
'mime_type':
|
534 |
'word_count': len(content.split()),
|
535 |
'character_count': len(content),
|
536 |
'processing_timestamp': datetime.now().isoformat()
|
@@ -550,6 +590,42 @@ class DocumentProcessor:
|
|
550 |
|
551 |
return metadata
|
552 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
553 |
def _calculate_hash(self, text: str) -> str:
|
554 |
"""Calculate SHA-256 hash of text"""
|
555 |
return hashlib.sha256(text.encode()).hexdigest()
|
|
|
6 |
import json
|
7 |
from pathlib import Path
|
8 |
import hashlib
|
9 |
+
import mimetypes # Add this instead
|
10 |
from bs4 import BeautifulSoup
|
11 |
import csv
|
12 |
from datetime import datetime
|
|
|
514 |
except Exception as e:
|
515 |
raise Exception(f"Error in basic Excel processing: {str(e)}")
|
516 |
|
517 |
+
def _get_mime_type(self, file_path: Path) -> str:
|
518 |
+
"""
|
519 |
+
Get MIME type for a file based on its extension
|
520 |
+
|
521 |
+
Args:
|
522 |
+
file_path (Path): Path to the file
|
523 |
+
|
524 |
+
Returns:
|
525 |
+
str: MIME type of the file
|
526 |
+
"""
|
527 |
+
# Standard MIME mappings for supported formats
|
528 |
+
MIME_MAPPINGS = {
|
529 |
+
'.pdf': 'application/pdf',
|
530 |
+
'.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
|
531 |
+
'.doc': 'application/msword',
|
532 |
+
'.csv': 'text/csv',
|
533 |
+
'.json': 'application/json',
|
534 |
+
'.html': 'text/html',
|
535 |
+
'.txt': 'text/plain',
|
536 |
+
'.md': 'text/markdown',
|
537 |
+
'.xml': 'text/xml',
|
538 |
+
'.rtf': 'application/rtf',
|
539 |
+
'.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
|
540 |
+
'.xls': 'application/vnd.ms-excel'
|
541 |
+
}
|
542 |
+
|
543 |
+
suffix = file_path.suffix.lower()
|
544 |
+
|
545 |
+
# Verify the file format is supported
|
546 |
+
if suffix not in self.supported_formats:
|
547 |
+
logging.warning(f"Unsupported file format: {suffix}")
|
548 |
+
return 'application/octet-stream'
|
549 |
+
|
550 |
+
# Return known MIME type or fall back to mimetypes module
|
551 |
+
if suffix in MIME_MAPPINGS:
|
552 |
+
return MIME_MAPPINGS[suffix]
|
553 |
+
|
554 |
+
mime_type = mimetypes.guess_type(str(file_path))[0]
|
555 |
+
return mime_type if mime_type else 'application/octet-stream'
|
556 |
+
|
557 |
def _generate_metadata(
|
558 |
self,
|
559 |
file_path: Path,
|
|
|
570 |
'created_at': datetime.fromtimestamp(file_stat.st_ctime),
|
571 |
'modified_at': datetime.fromtimestamp(file_stat.st_mtime),
|
572 |
'content_hash': self._calculate_hash(content),
|
573 |
+
'mime_type': self._get_mime_type(file_path),
|
574 |
'word_count': len(content.split()),
|
575 |
'character_count': len(content),
|
576 |
'processing_timestamp': datetime.now().isoformat()
|
|
|
590 |
|
591 |
return metadata
|
592 |
|
593 |
+
# def _generate_metadata(
|
594 |
+
# self,
|
595 |
+
# file_path: Path,
|
596 |
+
# content: str,
|
597 |
+
# additional_metadata: Optional[Dict] = None
|
598 |
+
# ) -> Dict:
|
599 |
+
# """Generate comprehensive metadata"""
|
600 |
+
# file_stat = file_path.stat()
|
601 |
+
|
602 |
+
# metadata = {
|
603 |
+
# 'filename': file_path.name,
|
604 |
+
# 'file_type': file_path.suffix,
|
605 |
+
# 'file_size': file_stat.st_size,
|
606 |
+
# 'created_at': datetime.fromtimestamp(file_stat.st_ctime),
|
607 |
+
# 'modified_at': datetime.fromtimestamp(file_stat.st_mtime),
|
608 |
+
# 'content_hash': self._calculate_hash(content),
|
609 |
+
# 'mime_type': magic.from_file(str(file_path), mime=True),
|
610 |
+
# 'word_count': len(content.split()),
|
611 |
+
# 'character_count': len(content),
|
612 |
+
# 'processing_timestamp': datetime.now().isoformat()
|
613 |
+
# }
|
614 |
+
|
615 |
+
# # Add Excel-specific metadata if applicable
|
616 |
+
# if file_path.suffix.lower() in ['.xlsx', '.xls']:
|
617 |
+
# try:
|
618 |
+
# if hasattr(self.excel_processor, 'get_metadata'):
|
619 |
+
# excel_metadata = self.excel_processor.get_metadata()
|
620 |
+
# metadata.update({'excel_metadata': excel_metadata})
|
621 |
+
# except Exception as e:
|
622 |
+
# logging.warning(f"Could not extract Excel metadata: {str(e)}")
|
623 |
+
|
624 |
+
# if additional_metadata:
|
625 |
+
# metadata.update(additional_metadata)
|
626 |
+
|
627 |
+
# return metadata
|
628 |
+
|
629 |
def _calculate_hash(self, text: str) -> str:
|
630 |
"""Calculate SHA-256 hash of text"""
|
631 |
return hashlib.sha256(text.encode()).hexdigest()
|