TalatMasood committed on
Commit
c2e0ca0
·
1 Parent(s): 11eddc5

Updated embedding to production, changed the dependency for magic library

Browse files
config/config.py CHANGED
@@ -22,8 +22,17 @@ class Settings:
22
  # Anthropic Configuration
23
  ANTHROPIC_API_KEY = os.getenv('ANTHROPIC_API_KEY', '')
24
 
 
 
 
25
  # Embedding Configuration
26
- EMBEDDING_MODEL = os.getenv('EMBEDDING_MODEL', 'all-MiniLM-L6-v2')
 
 
 
 
 
 
27
 
28
  # Vector Store Configuration
29
  CHROMA_PATH = os.getenv('CHROMA_PATH', './chroma_db')
 
22
  # Anthropic Configuration
23
  ANTHROPIC_API_KEY = os.getenv('ANTHROPIC_API_KEY', '')
24
 
25
+ # Environment Configuration
26
+ ENVIRONMENT = os.getenv('ENVIRONMENT').lower()
27
+
28
  # Embedding Configuration
29
+ @property
30
+ def EMBEDDING_MODEL(self):
31
+ if self.ENVIRONMENT == 'production':
32
+ # Better model for demos
33
+ return os.getenv('EMBEDDING_MODEL', 'text-embedding-3-large')
34
+ # Better for development purposes.
35
+ return os.getenv('EMBEDDING_MODEL', 'all-MiniLM-L6-v2')
36
 
37
  # Vector Store Configuration
38
  CHROMA_PATH = os.getenv('CHROMA_PATH', './chroma_db')
constraints.txt DELETED
@@ -1 +0,0 @@
1
- python-magic-bin==0.4.14
 
 
requirements.txt CHANGED
@@ -1,4 +1,3 @@
1
- --constraint constraints.txt
2
  fastapi
3
  uvicorn
4
  torch
 
 
1
  fastapi
2
  uvicorn
3
  torch
src/agents/__pycache__/rag_agent.cpython-312.pyc CHANGED
Binary files a/src/agents/__pycache__/rag_agent.cpython-312.pyc and b/src/agents/__pycache__/rag_agent.cpython-312.pyc differ
 
src/agents/__pycache__/system_instructions_rag.cpython-312.pyc CHANGED
Binary files a/src/agents/__pycache__/system_instructions_rag.cpython-312.pyc and b/src/agents/__pycache__/system_instructions_rag.cpython-312.pyc differ
 
src/utils/__pycache__/document_processor.cpython-312.pyc CHANGED
Binary files a/src/utils/__pycache__/document_processor.cpython-312.pyc and b/src/utils/__pycache__/document_processor.cpython-312.pyc differ
 
src/utils/document_processor.py CHANGED
@@ -6,7 +6,7 @@ import pandas as pd
6
  import json
7
  from pathlib import Path
8
  import hashlib
9
- import magic
10
  from bs4 import BeautifulSoup
11
  import csv
12
  from datetime import datetime
@@ -514,6 +514,46 @@ class DocumentProcessor:
514
  except Exception as e:
515
  raise Exception(f"Error in basic Excel processing: {str(e)}")
516
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
517
  def _generate_metadata(
518
  self,
519
  file_path: Path,
@@ -530,7 +570,7 @@ class DocumentProcessor:
530
  'created_at': datetime.fromtimestamp(file_stat.st_ctime),
531
  'modified_at': datetime.fromtimestamp(file_stat.st_mtime),
532
  'content_hash': self._calculate_hash(content),
533
- 'mime_type': magic.from_file(str(file_path), mime=True),
534
  'word_count': len(content.split()),
535
  'character_count': len(content),
536
  'processing_timestamp': datetime.now().isoformat()
@@ -550,6 +590,42 @@ class DocumentProcessor:
550
 
551
  return metadata
552
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
553
  def _calculate_hash(self, text: str) -> str:
554
  """Calculate SHA-256 hash of text"""
555
  return hashlib.sha256(text.encode()).hexdigest()
 
6
  import json
7
  from pathlib import Path
8
  import hashlib
9
+ import mimetypes # Add this instead
10
  from bs4 import BeautifulSoup
11
  import csv
12
  from datetime import datetime
 
514
  except Exception as e:
515
  raise Exception(f"Error in basic Excel processing: {str(e)}")
516
 
517
+ def _get_mime_type(self, file_path: Path) -> str:
518
+ """
519
+ Get MIME type for a file based on its extension
520
+
521
+ Args:
522
+ file_path (Path): Path to the file
523
+
524
+ Returns:
525
+ str: MIME type of the file
526
+ """
527
+ # Standard MIME mappings for supported formats
528
+ MIME_MAPPINGS = {
529
+ '.pdf': 'application/pdf',
530
+ '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
531
+ '.doc': 'application/msword',
532
+ '.csv': 'text/csv',
533
+ '.json': 'application/json',
534
+ '.html': 'text/html',
535
+ '.txt': 'text/plain',
536
+ '.md': 'text/markdown',
537
+ '.xml': 'text/xml',
538
+ '.rtf': 'application/rtf',
539
+ '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
540
+ '.xls': 'application/vnd.ms-excel'
541
+ }
542
+
543
+ suffix = file_path.suffix.lower()
544
+
545
+ # Verify the file format is supported
546
+ if suffix not in self.supported_formats:
547
+ logging.warning(f"Unsupported file format: {suffix}")
548
+ return 'application/octet-stream'
549
+
550
+ # Return known MIME type or fall back to mimetypes module
551
+ if suffix in MIME_MAPPINGS:
552
+ return MIME_MAPPINGS[suffix]
553
+
554
+ mime_type = mimetypes.guess_type(str(file_path))[0]
555
+ return mime_type if mime_type else 'application/octet-stream'
556
+
557
  def _generate_metadata(
558
  self,
559
  file_path: Path,
 
570
  'created_at': datetime.fromtimestamp(file_stat.st_ctime),
571
  'modified_at': datetime.fromtimestamp(file_stat.st_mtime),
572
  'content_hash': self._calculate_hash(content),
573
+ 'mime_type': self._get_mime_type(file_path),
574
  'word_count': len(content.split()),
575
  'character_count': len(content),
576
  'processing_timestamp': datetime.now().isoformat()
 
590
 
591
  return metadata
592
 
593
+ # def _generate_metadata(
594
+ # self,
595
+ # file_path: Path,
596
+ # content: str,
597
+ # additional_metadata: Optional[Dict] = None
598
+ # ) -> Dict:
599
+ # """Generate comprehensive metadata"""
600
+ # file_stat = file_path.stat()
601
+
602
+ # metadata = {
603
+ # 'filename': file_path.name,
604
+ # 'file_type': file_path.suffix,
605
+ # 'file_size': file_stat.st_size,
606
+ # 'created_at': datetime.fromtimestamp(file_stat.st_ctime),
607
+ # 'modified_at': datetime.fromtimestamp(file_stat.st_mtime),
608
+ # 'content_hash': self._calculate_hash(content),
609
+ # 'mime_type': magic.from_file(str(file_path), mime=True),
610
+ # 'word_count': len(content.split()),
611
+ # 'character_count': len(content),
612
+ # 'processing_timestamp': datetime.now().isoformat()
613
+ # }
614
+
615
+ # # Add Excel-specific metadata if applicable
616
+ # if file_path.suffix.lower() in ['.xlsx', '.xls']:
617
+ # try:
618
+ # if hasattr(self.excel_processor, 'get_metadata'):
619
+ # excel_metadata = self.excel_processor.get_metadata()
620
+ # metadata.update({'excel_metadata': excel_metadata})
621
+ # except Exception as e:
622
+ # logging.warning(f"Could not extract Excel metadata: {str(e)}")
623
+
624
+ # if additional_metadata:
625
+ # metadata.update(additional_metadata)
626
+
627
+ # return metadata
628
+
629
  def _calculate_hash(self, text: str) -> str:
630
  """Calculate SHA-256 hash of text"""
631
  return hashlib.sha256(text.encode()).hexdigest()