|
from typing import Optional, List, Dict, Literal |
|
|
|
from dotenv import find_dotenv |
|
from pydantic import computed_field |
|
from pydantic_settings import BaseSettings |
|
import torch |
|
|
|
|
|
class Settings(BaseSettings):
    """Application-wide configuration, loaded through pydantic ``BaseSettings``.

    Every upper-case attribute is a settings field with the default shown
    here.  The nested ``Config`` class additionally points pydantic at a
    ``local.env`` file (located with ``find_dotenv``) and ignores extra keys.
    Derived values (device and dtypes) are exposed as computed properties.
    """

    # --- General ---------------------------------------------------------
    # Explicit torch device override; when None the device is auto-detected
    # by ``TORCH_DEVICE_MODEL``.
    TORCH_DEVICE: Optional[str] = None
    IMAGE_DPI: int = 96
    EXTRACT_IMAGES: bool = True

    @computed_field
    @property
    def TORCH_DEVICE_MODEL(self) -> str:
        """Return the torch device string to use.

        An explicit ``TORCH_DEVICE`` always wins.  Otherwise the first
        available backend is chosen in the order CUDA, MPS, CPU.
        """
        if self.TORCH_DEVICE is not None:
            return self.TORCH_DEVICE

        if torch.cuda.is_available():
            return "cuda"

        if torch.backends.mps.is_available():
            return "mps"

        return "cpu"

    INFERENCE_RAM: int = 40
    VRAM_PER_TASK: float = 4.5
    DEFAULT_LANG: str = "English"

    # Maps a MIME type to the short file-type name used for it.
    SUPPORTED_FILETYPES: Dict = {
        "application/pdf": "pdf",
    }

    # --- Detector --------------------------------------------------------
    DETECTOR_BATCH_SIZE: Optional[int] = None
    SURYA_DETECTOR_DPI: int = 96
    DETECTOR_POSTPROCESSING_CPU_WORKERS: int = 4

    # --- OCR -------------------------------------------------------------
    # NOTE(review): the second entry appears to be the same U+FFFD
    # replacement character as ``chr(0xfffd)`` — confirm whether a different
    # character was intended before changing it.
    INVALID_CHARS: List[str] = [chr(0xfffd), "�"]
    OCR_ENGINE: Optional[Literal["surya", "ocrmypdf"]] = "surya"
    OCR_ALL_PAGES: bool = False

    # --- Surya OCR / recognition -----------------------------------------
    SURYA_OCR_DPI: int = 96
    RECOGNITION_BATCH_SIZE: Optional[int] = None

    # --- Tesseract / parallel OCR ----------------------------------------
    OCR_PARALLEL_WORKERS: int = 2
    TESSERACT_TIMEOUT: int = 20
    TESSDATA_PREFIX: str = ""

    # --- Texify ----------------------------------------------------------
    TEXIFY_MODEL_MAX: int = 384
    TEXIFY_TOKEN_BUFFER: int = 256
    TEXIFY_DPI: int = 96
    TEXIFY_BATCH_SIZE: Optional[int] = None
    TEXIFY_MODEL_NAME: str = "vikp/texify"

    # --- Layout ----------------------------------------------------------
    SURYA_LAYOUT_DPI: int = 96
    BAD_SPAN_TYPES: List[str] = ["Caption", "Footnote", "Page-footer", "Page-header", "Picture"]
    LAYOUT_MODEL_CHECKPOINT: str = "vikp/surya_layout2"
    BBOX_INTERSECTION_THRESH: float = 0.7
    LAYOUT_BATCH_SIZE: Optional[int] = None

    # --- Ordering --------------------------------------------------------
    SURYA_ORDER_DPI: int = 96
    ORDER_BATCH_SIZE: Optional[int] = None
    ORDER_MAX_BBOXES: int = 255

    # --- Editor model ----------------------------------------------------
    EDITOR_BATCH_SIZE: Optional[int] = None
    EDITOR_MAX_LENGTH: int = 1024
    EDITOR_MODEL_NAME: str = "vikp/pdf_postprocessor_t5"
    ENABLE_EDITOR_MODEL: bool = False
    EDITOR_CUTOFF_THRESH: float = 0.9

    # --- Ray -------------------------------------------------------------
    RAY_CACHE_PATH: Optional[str] = None
    RAY_CORES_PER_WORKER: int = 1

    # --- Debug -----------------------------------------------------------
    DEBUG: bool = False
    DEBUG_DATA_FOLDER: Optional[str] = None
    DEBUG_LEVEL: int = 0

    @computed_field
    @property
    def CUDA(self) -> bool:
        """Return True when the resolved device string contains ``"cuda"``.

        Substring matching means indexed devices such as ``"cuda:0"`` count.
        """
        return "cuda" in self.TORCH_DEVICE_MODEL

    @computed_field
    @property
    def MODEL_DTYPE(self) -> torch.dtype:
        """Return ``torch.bfloat16`` on CUDA devices, ``torch.float32`` otherwise.

        Uses ``self.CUDA`` rather than an exact ``== "cuda"`` comparison so
        that indexed devices (e.g. ``"cuda:1"``) are treated the same way as
        plain ``"cuda"`` instead of silently falling back to float32.
        """
        if self.CUDA:
            return torch.bfloat16
        return torch.float32

    @computed_field
    @property
    def TEXIFY_DTYPE(self) -> torch.dtype:
        """Return ``torch.float32`` on ``"cpu"``, ``torch.float16`` on any other device."""
        return torch.float32 if self.TORCH_DEVICE_MODEL == "cpu" else torch.float16

    class Config:
        # Path of the ``local.env`` file as resolved by ``find_dotenv``.
        env_file = find_dotenv("local.env")
        # Keys that do not correspond to a declared field are ignored.
        extra = "ignore"
|
|
|
|
|
# Shared module-level configuration object, instantiated once at import time.
settings: Settings = Settings()