# ingest.py
"""
Create / rebuild FAISS vector stores for Czech and English PDFs.
Default behaviour (matches main.py):
β’ English embeddings : sentence-transformers/all-MiniLM-L6-v2 (384-d)
β’ Czech embeddings : Seznam/retromae-small-cs (768-d)
Set use_openai=True if you really want to produce an English store
with OpenAI's 3 072-d 'text-embedding-3-large' vectors.
"""
from pathlib import Path
from typing import List

from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import DirectoryLoader, PyPDFLoader
from langchain.embeddings import (
    OpenAIEmbeddings,
    HuggingFaceEmbeddings,
)
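
# Programmatic usage (a minimal sketch; equivalent to the CLI helper at the
# bottom of this file):
#
#     from ingest import Ingest
#     Ingest().ingest_czech()                              # default HF encoder
#     Ingest(use_openai=True,
#            openai_api_key="sk-...").ingest_english()     # 3072-d OpenAI store
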
class Ingest:
    # --------------------------------------------------------------------- #
    def __init__(
        self,
        *,
        # --- embeddings ----------------------------------------------------
        english_hf_model: str = "sentence-transformers/all-MiniLM-L6-v2",
        czech_hf_model: str = "Seznam/retromae-small-cs",
        english_oa_model: str = "text-embedding-3-large",
        use_openai: bool = False,          # flip to keep legacy store
        openai_api_key: str | None = None,
        # --- chunking ------------------------------------------------------
        chunk: int = 512,
        overlap: int = 256,
        # --- paths ---------------------------------------------------------
        english_store: str = "stores/english_512",
        czech_store: str = "stores/czech_512",
        data_english: str = "data/english",
        data_czech: str = "data/czech",
    ):
        self.use_openai = use_openai
        self.oa_key = openai_api_key
        self.english_hf = english_hf_model
        self.czech_hf = czech_hf_model
        self.english_oa = english_oa_model
        self.chunk = chunk
        self.overlap = overlap
        self.english_store = Path(english_store)
        self.czech_store = Path(czech_store)
        self.data_english = Path(data_english)
        self.data_czech = Path(data_czech)
    # --------------------------- helpers ---------------------------------- #
    @staticmethod
    def _loader(folder: Path):
        return DirectoryLoader(
            str(folder),
            recursive=True,
            show_progress=True,
            loader_cls=PyPDFLoader,
            use_multithreading=True,
        ).load()

    @staticmethod
    def _split(docs: List, chunk: int, overlap: int):
        splitter = RecursiveCharacterTextSplitter(chunk_size=chunk,
                                                  chunk_overlap=overlap)
        return splitter.split_documents(docs)
    # --------------------------- English ---------------------------------- #
    def ingest_english(self):
        if self.use_openai:
            if not self.oa_key:
                raise ValueError("OpenAI API key is required for OpenAI embeddings.")
            embedding = OpenAIEmbeddings(
                openai_api_key=self.oa_key,
                model=self.english_oa,
            )
            mode = f"OpenAI ({self.english_oa}) 3072-d"
        else:
            embedding = HuggingFaceEmbeddings(
                model_name=self.english_hf,
                model_kwargs={"device": "cpu"},
                encode_kwargs={"normalize_embeddings": False},
            )
            mode = f"HuggingFace ({self.english_hf}) " \
                   f"{embedding.client.get_sentence_embedding_dimension()}-d"

        print(f"\n→ Ingest EN: {mode}")
        docs = self._loader(self.data_english)
        texts = self._split(docs, self.chunk, self.overlap)
        db = FAISS.from_documents(texts, embedding)
        db.save_local(str(self.english_store))
        print("✓ English store written to", self.english_store, "\n")
    # --------------------------- Czech ------------------------------------ #
    def ingest_czech(self):
        embedding = HuggingFaceEmbeddings(
            model_name=self.czech_hf,
            model_kwargs={"device": "cpu"},
            encode_kwargs={"normalize_embeddings": False},
        )
        dim = embedding.client.get_sentence_embedding_dimension()

        print(f"\n→ Ingest CZ: HuggingFace ({self.czech_hf}) {dim}-d")
        docs = self._loader(self.data_czech)
        texts = self._split(docs, self.chunk, self.overlap)
        db = FAISS.from_documents(texts, embedding)
        db.save_local(str(self.czech_store))
        print("✓ Czech store written to", self.czech_store, "\n")
# -------------------- quick CLI helper ------------------------------------ #
if __name__ == "__main__":
    """
    Examples:
        # build both stores with default HF encoders (no OpenAI)
        python ingest.py

        # build English store with OpenAI encoder (keeps 3072-d index)
        OPENAI_API_KEY=sk-... python ingest.py --openai
    """
    import argparse
    import os

    parser = argparse.ArgumentParser()
    parser.add_argument("--openai", action="store_true",
                        help="Use OpenAI embeddings for English.")
    parser.add_argument("--only", choices=["en", "cz"],
                        help="Ingest only that language.")
    args = parser.parse_args()

    ing = Ingest(use_openai=args.openai,
                 openai_api_key=os.getenv("OPENAI_API_KEY"))

    if args.only in (None, "en"):
        ing.ingest_english()
    if args.only in (None, "cz"):
        ing.ingest_czech()