# ingest.py
"""
Create / rebuild FAISS vector stores for Czech and English PDFs.

Default behaviour (matches main.py):
  • English embeddings : sentence-transformers/all-MiniLM-L6-v2  (384-d)
  • Czech embeddings   : Seznam/retromae-small-cs                (768-d)

Set use_openai=True if you really want to produce an English store
with OpenAI's 3 072-d 'text-embedding-3-large' vectors.

The dimension of a HuggingFace model is queried from the loaded model and
printed at ingest time, so the log output is the authoritative figure.
"""
import argparse
import os
from pathlib import Path
from typing import List

from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import DirectoryLoader, PyPDFLoader
from langchain.embeddings import (
    OpenAIEmbeddings,
    HuggingFaceEmbeddings,
)

# Default output dimensions of the OpenAI embedding models we know about.
# Only used to label the log line; unknown models are logged without one.
_OPENAI_DIMENSIONS = {
    "text-embedding-3-large": 3072,
    "text-embedding-3-small": 1536,
    "text-embedding-ada-002": 1536,
}


class Ingest:
    """Build FAISS stores from the PDFs found in the English and Czech folders.

    All constructor arguments are keyword-only.

    Args:
        english_hf_model: HuggingFace model used for English when
            ``use_openai`` is false.
        czech_hf_model: HuggingFace model used for Czech.
        english_oa_model: OpenAI embedding model used for English when
            ``use_openai`` is true.
        use_openai: Embed the English corpus with OpenAI instead of
            HuggingFace.
        openai_api_key: API key; required only when ``use_openai`` is true.
        chunk: Chunk size passed to the text splitter.
        overlap: Chunk overlap passed to the text splitter.
        english_store: Output folder of the English FAISS store.
        czech_store: Output folder of the Czech FAISS store.
        data_english: Folder scanned (recursively) for English documents.
        data_czech: Folder scanned (recursively) for Czech documents.
    """

    # --------------------------------------------------------------------- #
    def __init__(
        self,
        *,
        # --- embeddings ----------------------------------------------------
        english_hf_model: str = "sentence-transformers/all-MiniLM-L6-v2",
        czech_hf_model: str = "Seznam/retromae-small-cs",
        english_oa_model: str = "text-embedding-3-large",
        use_openai: bool = False,  # flip to keep legacy store
        openai_api_key: str | None = None,
        # --- chunking ------------------------------------------------------
        chunk: int = 512,
        overlap: int = 256,
        # --- paths ---------------------------------------------------------
        english_store: str = "stores/english_512",
        czech_store: str = "stores/czech_512",
        data_english: str = "data/english",
        data_czech: str = "data/czech",
    ):
        self.use_openai = use_openai
        self.oa_key = openai_api_key

        self.english_hf = english_hf_model
        self.czech_hf = czech_hf_model
        self.english_oa = english_oa_model

        self.chunk = chunk
        self.overlap = overlap

        self.english_store = Path(english_store)
        self.czech_store = Path(czech_store)
        self.data_english = Path(data_english)
        self.data_czech = Path(data_czech)

    # --------------------------- helpers ---------------------------------- #
    @staticmethod
    def _loader(folder: Path):
        """Load every file under *folder* (recursively) with ``PyPDFLoader``."""
        return DirectoryLoader(
            str(folder),
            recursive=True,
            show_progress=True,
            loader_cls=PyPDFLoader,
            use_multithreading=True,
        ).load()

    @staticmethod
    def _split(docs: List, chunk: int, overlap: int):
        """Split *docs* into chunks of size *chunk* overlapping by *overlap*."""
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk,
            chunk_overlap=overlap,
        )
        return splitter.split_documents(docs)

    @staticmethod
    def _hf_embedding(model_name: str) -> HuggingFaceEmbeddings:
        """Return a CPU HuggingFace embedder producing un-normalised vectors."""
        return HuggingFaceEmbeddings(
            model_name=model_name,
            model_kwargs={"device": "cpu"},
            encode_kwargs={"normalize_embeddings": False},
        )

    def _build_store(self, embedding, data_dir: Path, store_dir: Path) -> None:
        """Load, split and embed *data_dir*, then save the index to *store_dir*.

        Raises:
            ValueError: If no text chunks could be produced from *data_dir*.
        """
        docs = self._loader(data_dir)
        texts = self._split(docs, self.chunk, self.overlap)
        if not texts:
            # FAISS.from_documents cannot build an index from zero vectors and
            # would otherwise fail with an unhelpful IndexError.
            raise ValueError(
                f"No text chunks could be produced from '{data_dir}'; "
                "check that the folder contains readable PDFs."
            )
        db = FAISS.from_documents(texts, embedding)
        db.save_local(str(store_dir))

    # --------------------------- English ---------------------------------- #
    def ingest_english(self):
        """Build and save the English store (OpenAI or HuggingFace vectors).

        Raises:
            ValueError: If ``use_openai`` is set without an API key, or if the
                English data folder yields no text chunks.
        """
        if self.use_openai:
            if not self.oa_key:
                raise ValueError("OpenAI API key is required for OpenAI embeddings.")
            embedding = OpenAIEmbeddings(
                openai_api_key=self.oa_key,
                model=self.english_oa,
            )
            mode = f"OpenAI ({self.english_oa})"
            # Only state a dimension we actually know for the chosen model.
            known_dim = _OPENAI_DIMENSIONS.get(self.english_oa)
            if known_dim is not None:
                mode += f" {known_dim}-d"
        else:
            embedding = self._hf_embedding(self.english_hf)
            dim = embedding.client.get_sentence_embedding_dimension()
            mode = f"HuggingFace ({self.english_hf}) {dim}-d"

        print(f"\n─ Ingest EN: {mode}")
        self._build_store(embedding, self.data_english, self.english_store)
        print("✓ English store written to", self.english_store, "\n")

    # --------------------------- Czech ------------------------------------ #
    def ingest_czech(self):
        """Build and save the Czech store with the HuggingFace Czech model.

        Raises:
            ValueError: If the Czech data folder yields no text chunks.
        """
        embedding = self._hf_embedding(self.czech_hf)
        dim = embedding.client.get_sentence_embedding_dimension()

        print(f"\n─ Ingest CZ: HuggingFace ({self.czech_hf}) {dim}-d")
        self._build_store(embedding, self.data_czech, self.czech_store)
        print("✓ Czech store written to", self.czech_store, "\n")


# -------------------- quick CLI helper ------------------------------------ #
def main(argv=None) -> None:
    """Command-line entry point.

    Examples:
        # build both stores with default HF encoders (no OpenAI)
        python ingest.py

        # build English store with OpenAI encoder (keeps 3 072-d index)
        OPENAI_API_KEY=sk-... python ingest.py --openai

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--openai", action="store_true",
                        help="Use OpenAI embeddings for English.")
    parser.add_argument("--only", choices=["en", "cz"],
                        help="Ingest only that language.")
    args = parser.parse_args(argv)

    ing = Ingest(use_openai=args.openai,
                 openai_api_key=os.getenv("OPENAI_API_KEY"))

    # With no --only flag both languages are ingested.
    if args.only in (None, "en"):
        ing.ingest_english()
    if args.only in (None, "cz"):
        ing.ingest_czech()


if __name__ == "__main__":
    main()