Spaces:

TalatMasud
/

chatbot-backend

Running

App Files Files Community

TalatMasood committed on Feb 22

Commit

c2e0ca0

1 Parent(s): 11eddc5

Updated embedding to production, changed the dependency for magic library

Browse files

Files changed (7) hide show

config/config.py +10 -1
constraints.txt +0 -1
requirements.txt +0 -1
src/agents/__pycache__/rag_agent.cpython-312.pyc +0 -0
src/agents/__pycache__/system_instructions_rag.cpython-312.pyc +0 -0
src/utils/__pycache__/document_processor.cpython-312.pyc +0 -0
src/utils/document_processor.py +78 -2

config/config.py CHANGED Viewed

@@ -22,8 +22,17 @@ class Settings:
     # Anthropic Configuration
     ANTHROPIC_API_KEY = os.getenv('ANTHROPIC_API_KEY', '')
     # Embedding Configuration
-    EMBEDDING_MODEL = os.getenv('EMBEDDING_MODEL', 'all-MiniLM-L6-v2')
     # Vector Store Configuration
     CHROMA_PATH = os.getenv('CHROMA_PATH', './chroma_db')

     # Anthropic Configuration
     ANTHROPIC_API_KEY = os.getenv('ANTHROPIC_API_KEY', '')
+    # Environment Configuration
+    # Default to 'development' when ENVIRONMENT is unset: os.getenv() would
+    # otherwise return None and .lower() would raise AttributeError while the
+    # class body is being executed.
+    ENVIRONMENT = os.getenv('ENVIRONMENT', 'development').lower()
     # Embedding Configuration
+    @property
+    def EMBEDDING_MODEL(self):
+        """Return the embedding model name for the current environment.
+
+        The EMBEDDING_MODEL environment variable always takes precedence.
+        When it is not set, 'text-embedding-3-large' is used if
+        ``self.ENVIRONMENT`` equals 'production', and 'all-MiniLM-L6-v2'
+        is used for every other environment.
+        """
+        is_production = self.ENVIRONMENT == 'production'
+        # Only the fallback differs between environments; the lookup is shared.
+        fallback_model = 'text-embedding-3-large' if is_production else 'all-MiniLM-L6-v2'
+        return os.getenv('EMBEDDING_MODEL', fallback_model)
     # Vector Store Configuration
     CHROMA_PATH = os.getenv('CHROMA_PATH', './chroma_db')

constraints.txt DELETED Viewed

	@@ -1 +0,0 @@
1	- python-magic-bin==0.4.14

requirements.txt CHANGED Viewed

@@ -1,4 +1,3 @@
---constraint constraints.txt
 fastapi
 uvicorn
 torch

 fastapi
 uvicorn
 torch

src/agents/__pycache__/rag_agent.cpython-312.pyc CHANGED Viewed

Binary files a/src/agents/__pycache__/rag_agent.cpython-312.pyc and b/src/agents/__pycache__/rag_agent.cpython-312.pyc differ

src/agents/__pycache__/system_instructions_rag.cpython-312.pyc CHANGED Viewed

Binary files a/src/agents/__pycache__/system_instructions_rag.cpython-312.pyc and b/src/agents/__pycache__/system_instructions_rag.cpython-312.pyc differ

src/utils/__pycache__/document_processor.cpython-312.pyc CHANGED Viewed

Binary files a/src/utils/__pycache__/document_processor.cpython-312.pyc and b/src/utils/__pycache__/document_processor.cpython-312.pyc differ

src/utils/document_processor.py CHANGED Viewed

@@ -6,7 +6,7 @@ import pandas as pd
 import json
 from pathlib import Path
 import hashlib
-import magic
 from bs4 import BeautifulSoup
 import csv
 from datetime import datetime
@@ -514,6 +514,46 @@ class DocumentProcessor:
         except Exception as e:
             raise Exception(f"Error in basic Excel processing: {str(e)}")
     def _generate_metadata(
         self,
         file_path: Path,
@@ -530,7 +570,7 @@ class DocumentProcessor:
             'created_at': datetime.fromtimestamp(file_stat.st_ctime),
             'modified_at': datetime.fromtimestamp(file_stat.st_mtime),
             'content_hash': self._calculate_hash(content),
-            'mime_type': magic.from_file(str(file_path), mime=True),
             'word_count': len(content.split()),
             'character_count': len(content),
             'processing_timestamp': datetime.now().isoformat()
@@ -550,6 +590,42 @@ class DocumentProcessor:
         return metadata
     def _calculate_hash(self, text: str) -> str:
         """Calculate SHA-256 hash of text"""
         return hashlib.sha256(text.encode()).hexdigest()

 import json
 from pathlib import Path
 import hashlib
+import mimetypes  # Add this instead
 from bs4 import BeautifulSoup
 import csv
 from datetime import datetime
         except Exception as e:
             raise Exception(f"Error in basic Excel processing: {str(e)}")
+    def _get_mime_type(self, file_path: Path) -> str:
+        """
+        Resolve a file's MIME type from its lower-cased extension.
+        The file contents are never read; only the path's suffix is used.
+        Args:
+            file_path (Path): Path to the file
+        Returns:
+            str: The MIME type, or 'application/octet-stream' when the
+                extension is not in ``self.supported_formats`` or no type
+                can be determined.
+        """
+        suffix = file_path.suffix.lower()
+        # Guard clause: anything outside the supported formats is reported
+        # as a generic binary stream.
+        if suffix not in self.supported_formats:
+            logging.warning(f"Unsupported file format: {suffix}")
+            return 'application/octet-stream'
+        # Explicit table for the common formats.
+        known_types = {
+            '.pdf': 'application/pdf',
+            '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+            '.doc': 'application/msword',
+            '.csv': 'text/csv',
+            '.json': 'application/json',
+            '.html': 'text/html',
+            '.txt': 'text/plain',
+            '.md': 'text/markdown',
+            '.xml': 'text/xml',
+            '.rtf': 'application/rtf',
+            '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
+            '.xls': 'application/vnd.ms-excel'
+        }
+        explicit_type = known_types.get(suffix)
+        if explicit_type is not None:
+            return explicit_type
+        # Supported but unmapped extension: defer to the mimetypes module.
+        guessed_type, _encoding = mimetypes.guess_type(str(file_path))
+        return guessed_type or 'application/octet-stream'
     def _generate_metadata(
         self,
         file_path: Path,
             'created_at': datetime.fromtimestamp(file_stat.st_ctime),
             'modified_at': datetime.fromtimestamp(file_stat.st_mtime),
             'content_hash': self._calculate_hash(content),
+            'mime_type': self._get_mime_type(file_path),
             'word_count': len(content.split()),
             'character_count': len(content),
             'processing_timestamp': datetime.now().isoformat()
         return metadata
+    # def _generate_metadata(
+    #     self,
+    #     file_path: Path,
+    #     content: str,
+    #     additional_metadata: Optional[Dict] = None
+    # ) -> Dict:
+    #     """Generate comprehensive metadata"""
+    #     file_stat = file_path.stat()
+    #     metadata = {
+    #         'filename': file_path.name,
+    #         'file_type': file_path.suffix,
+    #         'file_size': file_stat.st_size,
+    #         'created_at': datetime.fromtimestamp(file_stat.st_ctime),
+    #         'modified_at': datetime.fromtimestamp(file_stat.st_mtime),
+    #         'content_hash': self._calculate_hash(content),
+    #         'mime_type': magic.from_file(str(file_path), mime=True),
+    #         'word_count': len(content.split()),
+    #         'character_count': len(content),
+    #         'processing_timestamp': datetime.now().isoformat()
+    #     }
+    #     # Add Excel-specific metadata if applicable
+    #     if file_path.suffix.lower() in ['.xlsx', '.xls']:
+    #         try:
+    #             if hasattr(self.excel_processor, 'get_metadata'):
+    #                 excel_metadata = self.excel_processor.get_metadata()
+    #                 metadata.update({'excel_metadata': excel_metadata})
+    #         except Exception as e:
+    #             logging.warning(f"Could not extract Excel metadata: {str(e)}")
+    #     if additional_metadata:
+    #         metadata.update(additional_metadata)
+    #     return metadata
     def _calculate_hash(self, text: str) -> str:
         """Calculate SHA-256 hash of text"""
         return hashlib.sha256(text.encode()).hexdigest()