Spaces:
Running
Running
Log Google Drive documents in MongoDB, add the source of each document, and make text chunks overlap.
acdfaa9
# config/config.py | |
import os | |
from dotenv import load_dotenv | |
from google.oauth2.credentials import Credentials | |
from google_auth_oauthlib.flow import Flow | |
# Load environment variables | |
load_dotenv() | |
class Settings:
    """Application configuration resolved from environment variables.

    Class attributes are evaluated once, when the class body runs at import
    time. ``EMBEDDING_MODEL`` and ``MONGODB_URI`` are methods: they read the
    environment on every call and branch on ``ENVIRONMENT``.

    NOTE(review): ``EMBEDDING_MODEL`` and ``MONGODB_URI`` are plain methods,
    so attribute-style access (``settings.MONGODB_URI``) yields a bound
    method rather than a string. Converting them to properties would change
    the calling convention; confirm the call sites before doing so.
    """

    # OpenAI Configuration
    OPENAI_API_KEY = os.getenv('OPENAI_API_KEY', '')
    OPENAI_MODEL = os.getenv('OPENAI_MODEL', 'gpt-3.5-turbo')

    # NOTE(review): the fallback below is the key that was previously
    # hard-coded here. It is kept so existing deployments keep working, but
    # it is committed to source control: set ADMIN_API_KEY in the environment
    # and rotate this value.
    ADMIN_API_KEY = os.getenv(
        'ADMIN_API_KEY', 'aca4081f-6ff2-434c-843b-98f60285c499')

    # Ollama Configuration
    OLLAMA_BASE_URL = os.getenv('OLLAMA_BASE_URL', 'http://localhost:11434')
    OLLAMA_MODEL = os.getenv('OLLAMA_MODEL', 'llama2')

    # Anthropic Configuration
    ANTHROPIC_API_KEY = os.getenv('ANTHROPIC_API_KEY', '')

    # Top number of chunks to retrieve.
    TOP_CHUNKS = int(os.getenv('TOP_CHUNKS', '10'))

    # Environment Configuration
    # Default to 'development' so an unset ENVIRONMENT does not raise
    # AttributeError (None has no .lower()) while the module is imported.
    ENVIRONMENT = os.getenv('ENVIRONMENT', 'development').lower()

    # Embedding Configuration
    def EMBEDDING_MODEL(self) -> str:
        """Return the embedding model name for the current environment.

        The ``EMBEDDING_MODEL`` environment variable takes precedence. Both
        branches currently fall back to the same model name; the branch is
        kept so production can be given a different default later.
        """
        if self.ENVIRONMENT == 'production':
            # Larger alternative considered for demos:
            # 'openai/text-embedding-3-large'
            return os.getenv('EMBEDDING_MODEL', 'all-MiniLM-L6-v2')
        # Development default.
        return os.getenv('EMBEDDING_MODEL', 'all-MiniLM-L6-v2')

    # MongoDB Configuration
    def MONGODB_URI(self) -> str:
        """Return the MongoDB connection URI for the current environment.

        The ``MONGODB_URI`` environment variable takes precedence; otherwise
        production falls back to a hosted cluster URI and every other
        environment falls back to a local instance.
        """
        if self.ENVIRONMENT == 'production':
            # NOTE(review): this fallback embeds credentials in source and
            # the literal appears to have been altered by an obfuscation
            # filter - confirm. Prefer setting MONGODB_URI explicitly.
            return os.getenv('MONGODB_URI', 'mongodb+srv://talat:[email protected]/?retryWrites=true&w=majority&appName=Chatbot')
        # Development default.
        return os.getenv('MONGODB_URI', 'mongodb://localhost:27017')

    # Conversation Summarizer Settings
    SUMMARIZER_CONFIG = {
        # 'facebook/bart-large-cnn' is a bigger alternative model.
        'model_name': os.getenv('SUMMARIZER_MODEL', 'facebook/bart-base'),
        'max_length': int(os.getenv('SUMMARIZER_MAX_LENGTH', '130')),
        'min_length': int(os.getenv('SUMMARIZER_MIN_LENGTH', '30')),
        'device': -1,  # CPU
        'model_kwargs': {
            'low_cpu_mem_usage': True,
        },
    }

    # Vector Store Configuration
    CHROMA_PATH = os.getenv('CHROMA_PATH', './chroma_db')

    # Feedback Configuration
    MAX_RATING = int(os.getenv('MAX_RATING', '5'))

    # Temporary directory for downloaded files
    TEMP_DOWNLOAD_DIR = os.getenv('TEMP_DOWNLOAD_DIR', './temp_downloads')

    # Application Configuration
    # Accept 'True', 'true', ' TRUE ' etc.; anything else means disabled.
    DEBUG = os.getenv('DEBUG', 'False').strip().lower() == 'true'

    # Google Drive Configuration
    GOOGLE_DRIVE_FOLDER_ID = os.getenv('GOOGLE_DRIVE_FOLDER_ID', '')
    GOOGLE_SERVICE_ACCOUNT_PATH = os.getenv(
        'GOOGLE_SERVICE_ACCOUNT_PATH', 'service_account.json')

    # Explicit int() conversion: environment values always arrive as strings.
    DOCUMENT_PROCESSOR = {
        'chunk_size': int(os.getenv('DOCUMENT_CHUNK_SIZE', '1000')),
        'chunk_overlap': int(os.getenv('DOCUMENT_CHUNK_OVERLAP', '200')),
        # 20MB in bytes
        'max_file_size': int(
            os.getenv('DOCUMENT_MAX_FILE_SIZE', str(20 * 1024 * 1024))),
        'supported_formats': [
            '.txt', '.pdf', '.docx', '.csv', '.json',
            '.html', '.md', '.xml', '.rtf', '.xlsx', '.xls',
        ],
    }

    @classmethod
    def get_document_processor_settings(cls) -> dict:
        """Return a validated copy of ``DOCUMENT_PROCESSOR``.

        The class-level dict is not modified. The copy is shallow, so the
        ``supported_formats`` list is shared with the original.

        Returns:
            dict: Settings with ``chunk_size`` of at least 100,
            ``chunk_overlap`` between 0 and ``chunk_size - 50``, and
            ``max_file_size`` of at least 1MB.
        """
        settings = cls.DOCUMENT_PROCESSOR.copy()

        # Minimum chunk size of 100.
        settings['chunk_size'] = max(100, settings['chunk_size'])

        # Overlap must stay below the chunk size and must not be negative.
        overlap_ceiling = settings['chunk_size'] - 50
        settings['chunk_overlap'] = max(
            0, min(settings['chunk_overlap'], overlap_ceiling))

        # Minimum file size limit of 1MB.
        settings['max_file_size'] = max(
            1024 * 1024, settings['max_file_size'])

        return settings
# Module-level Settings instance.
settings = Settings()

# Print the active environment when this module is imported.
_active_environment = settings.ENVIRONMENT
print(f"Current Environment: {_active_environment}")
# print(f"Current MongoDB URI: {settings.MONGODB_URI}")