"""RAGLite config.""" | |
import contextlib | |
import os | |
from dataclasses import dataclass, field | |
from io import StringIO | |
from llama_cpp import llama_supports_gpu_offload | |
from sqlalchemy.engine import URL | |
from raglite._flashrank import PatchedFlashRankRanker as FlashRankRanker | |
# Suppress rerankers output on import until [1] is fixed. | |
# [1] https://github.com/AnswerDotAI/rerankers/issues/36 | |
with contextlib.redirect_stdout(StringIO()): | |
from rerankers.models.ranker import BaseRanker | |
@dataclass(frozen=True)
class RAGLiteConfig:
    """Configuration for RAGLite.

    Frozen so instances are immutable and hashable: the `reranker` field is
    excluded from comparison (`compare=False`) precisely so that instances
    can serve as stable `lru_cache` keys. The `frozen=True` decorator is also
    why `__post_init__` must mutate via `object.__setattr__`.
    """

    # Database config.
    db_url: str | URL = "sqlite:///raglite.sqlite"
    # LLM config used for generation. Pick a larger-context model only when
    # GPU offload is available; fall back to a smaller model on CPU.
    llm: str = field(
        default_factory=lambda: (
            "llama-cpp-python/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF/*Q4_K_M.gguf@8192"
            if llama_supports_gpu_offload()
            else "llama-cpp-python/bartowski/Llama-3.2-3B-Instruct-GGUF/*Q4_K_M.gguf@4096"
        )
    )
    llm_max_tries: int = 4
    # Embedder config used for indexing. Use the F16 weights on a GPU or a
    # machine with at least 4 CPUs; otherwise use the smaller Q4_K_M quant.
    embedder: str = field(
        default_factory=lambda: (  # Nomic-embed may be better if only English is used.
            "llama-cpp-python/lm-kit/bge-m3-gguf/*F16.gguf"
            if llama_supports_gpu_offload() or (os.cpu_count() or 1) >= 4  # noqa: PLR2004
            else "llama-cpp-python/lm-kit/bge-m3-gguf/*Q4_K_M.gguf"
        )
    )
    embedder_normalize: bool = True
    embedder_sentence_window_size: int = 3
    # Chunk config used to partition documents into chunks.
    chunk_max_size: int = 1440  # Max number of characters per chunk.
    # Vector search config.
    vector_search_index_metric: str = "cosine"  # The query adapter supports "dot" and "cosine".
    vector_search_query_adapter: bool = True
    # Reranking config: a single ranker, or (language code, ranker) pairs
    # keyed by "en"/"other", or None to disable reranking.
    reranker: BaseRanker | tuple[tuple[str, BaseRanker], ...] | None = field(
        default_factory=lambda: (
            ("en", FlashRankRanker("ms-marco-MiniLM-L-12-v2", verbose=0)),
            ("other", FlashRankRanker("ms-marco-MultiBERT-L-12", verbose=0)),
        ),
        compare=False,  # Exclude the reranker from comparison to avoid lru_cache misses.
    )

    def __post_init__(self) -> None:
        """Normalise dependent settings after dataclass initialisation."""
        # Late chunking with llama-cpp-python does not apply sentence windowing.
        if self.embedder.startswith("llama-cpp-python"):
            # object.__setattr__ is required because the dataclass is frozen.
            object.__setattr__(self, "embedder_sentence_window_size", 1)