# ingest.py
"""
Create / rebuild FAISS vector stores for Czech and English PDFs.
Default behaviour (matches main.py):
    • English embeddings : sentence-transformers/all-MiniLM-L6-v2 (384-d)
    • Czech embeddings   : Seznam/retromae-small-cs (768-d)
Set use_openai=True if you really want to produce an English store
with OpenAI's 3072-d 'text-embedding-3-large' vectors.
"""
from pathlib import Path
from typing import List
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import DirectoryLoader, PyPDFLoader
from langchain.embeddings import (
    OpenAIEmbeddings,
    HuggingFaceEmbeddings,
)
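# NOTE (assumption about your environment, not part of the original code): in
# newer langchain releases (roughly 0.2+) the loader and embedding classes above
# live in langchain_community instead; the equivalent imports would be:
#
#     from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
#     from langchain_community.embeddings import HuggingFaceEmbeddings, OpenAIEmbeddings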
class Ingest:
    # --------------------------------------------------------------------- #
    def __init__(
        self,
        *,
        # --- embeddings ----------------------------------------------------
        english_hf_model: str = "sentence-transformers/all-MiniLM-L6-v2",
        czech_hf_model: str = "Seznam/retromae-small-cs",
        english_oa_model: str = "text-embedding-3-large",
        use_openai: bool = False,  # flip to keep legacy store
        openai_api_key: str | None = None,
        # --- chunking ------------------------------------------------------
        chunk: int = 512,
        overlap: int = 256,
        # --- paths ---------------------------------------------------------
        english_store: str = "stores/english_512",
        czech_store: str = "stores/czech_512",
        data_english: str = "data/english",
        data_czech: str = "data/czech",
    ):
        self.use_openai = use_openai
        self.oa_key = openai_api_key
        self.english_hf = english_hf_model
        self.czech_hf = czech_hf_model
        self.english_oa = english_oa_model
        self.chunk = chunk
        self.overlap = overlap
        self.english_store = Path(english_store)
        self.czech_store = Path(czech_store)
        self.data_english = Path(data_english)
        self.data_czech = Path(data_czech)
    # --------------------------- helpers ---------------------------------- #
    @staticmethod
    def _loader(folder: Path):
        return DirectoryLoader(
            str(folder),
            recursive=True,
            show_progress=True,
            loader_cls=PyPDFLoader,
            use_multithreading=True,
        ).load()

    @staticmethod
    def _split(docs: List, chunk: int, overlap: int):
        splitter = RecursiveCharacterTextSplitter(chunk_size=chunk,
                                                  chunk_overlap=overlap)
        return splitter.split_documents(docs)
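    # Note on the defaults (added comment): with chunk=512 and overlap=256,
    # consecutive chunks share roughly half of their text. The recursive
    # splitter cuts on separator boundaries, so the actual chunk lengths are
    # approximate upper bounds rather than exact sizes.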
    # --------------------------- English ---------------------------------- #
    def ingest_english(self):
        if self.use_openai:
            if not self.oa_key:
                raise ValueError("OpenAI API key is required for OpenAI embeddings.")
            embedding = OpenAIEmbeddings(
                openai_api_key=self.oa_key,
                model=self.english_oa,
            )
            mode = f"OpenAI ({self.english_oa}) 3072-d"
        else:
            embedding = HuggingFaceEmbeddings(
                model_name=self.english_hf,
                model_kwargs={"device": "cpu"},
                encode_kwargs={"normalize_embeddings": False},
            )
            mode = f"HuggingFace ({self.english_hf}) " \
                   f"{embedding.client.get_sentence_embedding_dimension()}-d"

        print(f"\n─ Ingest EN: {mode}")
        docs = self._loader(self.data_english)
        texts = self._split(docs, self.chunk, self.overlap)
        db = FAISS.from_documents(texts, embedding)
        db.save_local(str(self.english_store))
        print("✓ English store written to", self.english_store, "\n")
    # --------------------------- Czech ------------------------------------ #
    def ingest_czech(self):
        embedding = HuggingFaceEmbeddings(
            model_name=self.czech_hf,
            model_kwargs={"device": "cpu"},
            encode_kwargs={"normalize_embeddings": False},
        )
        dim = embedding.client.get_sentence_embedding_dimension()
        print(f"\n─ Ingest CZ: HuggingFace ({self.czech_hf}) {dim}-d")
        docs = self._loader(self.data_czech)
        texts = self._split(docs, self.chunk, self.overlap)
        db = FAISS.from_documents(texts, embedding)
        db.save_local(str(self.czech_store))
        print("✓ Czech store written to", self.czech_store, "\n")
# -------------------- quick CLI helper ------------------------------------ #
if __name__ == "__main__":
    """
    Examples:
        # build both stores with default HF encoders (no OpenAI)
        python ingest.py

        # build English store with OpenAI encoder (keeps 3072-d index)
        OPENAI_API_KEY=sk-... python ingest.py --openai
    """
    import argparse, os

    parser = argparse.ArgumentParser()
    parser.add_argument("--openai", action="store_true",
                        help="Use OpenAI embeddings for English.")
    parser.add_argument("--only", choices=["en", "cz"],
                        help="Ingest only that language.")
    args = parser.parse_args()

    ing = Ingest(use_openai=args.openai,
                 openai_api_key=os.getenv("OPENAI_API_KEY"))

    if args.only in (None, "en"):
        ing.ingest_english()
    if args.only in (None, "cz"):
        ing.ingest_czech()
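    # e.g. rebuild just the Czech store:  python ingest.py --only cz
    # (--openai only affects the English store, so it has no effect with --only cz)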