Teapack1 committed on
Commit
90a08b2
·
verified ·
1 Parent(s): d532116

Update ingest.py

Browse files
Files changed (1) hide show
  1. ingest.py +118 -64
ingest.py CHANGED
@@ -1,92 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from langchain_community.vectorstores import FAISS
2
  from langchain.text_splitter import RecursiveCharacterTextSplitter
3
-
4
- from langchain.document_loaders import PyPDFLoader, DirectoryLoader
5
  from langchain.embeddings import (
6
  OpenAIEmbeddings,
7
- HuggingFaceBgeEmbeddings,
8
  HuggingFaceEmbeddings,
9
- HuggingFaceInstructEmbeddings,
10
  )
11
 
12
 
13
  class Ingest:
 
14
  def __init__(
15
  self,
16
- openai_api_key=None,
17
- chunk=512,
18
- overlap=256,
19
- czech_store="stores/czech_512",
20
- english_store="stores/english_512",
21
- data_czech="data/czech",
22
- data_english="data/english",
23
- english_embedding_model="text-embedding-3-large",
24
- czech_embedding_model="Seznam/simcse-dist-mpnet-paracrawl-cs-en",
 
 
 
 
 
 
25
  ):
26
- self.openai_api_key = openai_api_key
27
- self.chunk = chunk
28
- self.overlap = overlap
29
- self.czech_store = czech_store
30
- self.english_store = english_store
31
- self.data_czech = data_czech
32
- self.data_english = data_english
33
- self.english_embedding_model = english_embedding_model
34
- self.czech_embedding_model = czech_embedding_model
35
 
36
- def ingest_english(self):
 
37
 
38
- embedding = OpenAIEmbeddings(
39
- openai_api_key=self.openai_api_key,
40
- model=self.english_embedding_model,
41
- )
42
 
43
- loader = DirectoryLoader(
44
- self.data_english,
 
 
 
 
45
  show_progress=True,
46
  loader_cls=PyPDFLoader,
47
- )
 
48
 
49
- documents = loader.load()
50
- text_splitter = RecursiveCharacterTextSplitter(
51
- chunk_size=self.chunk,
52
- chunk_overlap=self.overlap,
53
- )
54
- texts = text_splitter.split_documents(documents)
55
 
56
- vectordb = FAISS.from_documents(
57
- documents=texts,
58
- embedding=embedding,
59
- )
60
- vectordb.save_local(self.english_store)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
- print("\n English vector Store Created.......\n\n")
 
 
63
 
 
64
  def ingest_czech(self):
65
- embedding_model = self.czech_embedding_model
66
- model_kwargs = {"device": "cpu"}
67
- encode_kwargs = {"normalize_embeddings": False}
68
  embedding = HuggingFaceEmbeddings(
69
- model_name=embedding_model,
70
- model_kwargs=model_kwargs,
71
- encode_kwargs=encode_kwargs,
72
  )
 
 
73
 
74
- loader = DirectoryLoader(
75
- self.data_czech,
76
- show_progress=True,
77
- )
78
 
79
- documents = loader.load()
80
- text_splitter = RecursiveCharacterTextSplitter(
81
- chunk_size=self.chunk,
82
- chunk_overlap=self.overlap,
83
- )
84
 
85
- texts = text_splitter.split_documents(documents)
86
- vectordb = FAISS.from_documents(
87
- documents=texts,
88
- embedding=embedding,
89
- )
90
- vectordb.save_local(self.czech_store)
91
 
92
- print("\n Czech vector Store Created.......\n\n")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ingest.py
2
+ """
3
+ Create / rebuild FAISS vector stores for Czech and English PDFs.
4
+
5
+ Default behaviour (matches main.py):
6
+ • English embeddings : sentence-transformers/all-MiniLM-L6-v2 (384-d)
7
+ • Czech embeddings : Seznam/retromae-small-cs (768-d)
8
+
9
+ Set use_openai=True if you really want to produce an English store
10
+ with OpenAI's 3 072-d 'text-embedding-3-large' vectors.
11
+ """
12
+
13
+ from pathlib import Path
14
+ from typing import List
15
+
16
  from langchain_community.vectorstores import FAISS
17
  from langchain.text_splitter import RecursiveCharacterTextSplitter
18
+ from langchain.document_loaders import DirectoryLoader, PyPDFLoader
 
19
  from langchain.embeddings import (
20
  OpenAIEmbeddings,
 
21
  HuggingFaceEmbeddings,
 
22
  )
23
 
24
 
25
  class Ingest:
26
+ # --------------------------------------------------------------------- #
27
  def __init__(
28
  self,
29
+ *,
30
+ # --- embeddings ----------------------------------------------------
31
+ english_hf_model: str = "sentence-transformers/all-MiniLM-L6-v2",
32
+ czech_hf_model: str = "Seznam/retromae-small-cs",
33
+ english_oa_model: str = "text-embedding-3-large",
34
+ use_openai: bool = False, # flip to keep legacy store
35
+ openai_api_key: str | None = None,
36
+ # --- chunking ------------------------------------------------------
37
+ chunk: int = 512,
38
+ overlap: int = 256,
39
+ # --- paths ---------------------------------------------------------
40
+ english_store: str = "stores/english_512",
41
+ czech_store: str = "stores/czech_512",
42
+ data_english: str = "data/english",
43
+ data_czech: str = "data/czech",
44
  ):
45
+ self.use_openai = use_openai
46
+ self.oa_key = openai_api_key
47
+ self.english_hf = english_hf_model
48
+ self.czech_hf = czech_hf_model
49
+ self.english_oa = english_oa_model
 
 
 
 
50
 
51
+ self.chunk = chunk
52
+ self.overlap = overlap
53
 
54
+ self.english_store = Path(english_store)
55
+ self.czech_store = Path(czech_store)
56
+ self.data_english = Path(data_english)
57
+ self.data_czech = Path(data_czech)
58
 
59
+ # --------------------------- helpers ---------------------------------- #
60
+ @staticmethod
61
+ def _loader(folder: Path):
62
+ return DirectoryLoader(
63
+ str(folder),
64
+ recursive=True,
65
  show_progress=True,
66
  loader_cls=PyPDFLoader,
67
+ use_multithreading=True,
68
+ ).load()
69
 
70
+ @staticmethod
71
+ def _split(docs: List, chunk: int, overlap: int):
72
+ splitter = RecursiveCharacterTextSplitter(chunk_size=chunk,
73
+ chunk_overlap=overlap)
74
+ return splitter.split_documents(docs)
 
75
 
76
+ # --------------------------- English ---------------------------------- #
77
+ def ingest_english(self):
78
+ if self.use_openai:
79
+ if not self.oa_key:
80
+ raise ValueError("OpenAI API key is required for OpenAI embeddings.")
81
+ embedding = OpenAIEmbeddings(
82
+ openai_api_key=self.oa_key,
83
+ model=self.english_oa,
84
+ )
85
+ mode = f"OpenAI ({self.english_oa}) 3072-d"
86
+ else:
87
+ embedding = HuggingFaceEmbeddings(
88
+ model_name=self.english_hf,
89
+ model_kwargs={"device": "cpu"},
90
+ encode_kwargs={"normalize_embeddings": False},
91
+ )
92
+ mode = f"HuggingFace ({self.english_hf}) " \
93
+ f"{embedding.client.get_sentence_embedding_dimension()}-d"
94
+
95
+ print(f"\n─ Ingest EN: {mode}")
96
+ docs = self._loader(self.data_english)
97
+ texts = self._split(docs, self.chunk, self.overlap)
98
 
99
+ db = FAISS.from_documents(texts, embedding)
100
+ db.save_local(str(self.english_store))
101
+ print("✓ English store written to", self.english_store, "\n")
102
 
103
+ # --------------------------- Czech ------------------------------------ #
104
  def ingest_czech(self):
 
 
 
105
  embedding = HuggingFaceEmbeddings(
106
+ model_name=self.czech_hf,
107
+ model_kwargs={"device": "cpu"},
108
+ encode_kwargs={"normalize_embeddings": False},
109
  )
110
+ dim = embedding.client.get_sentence_embedding_dimension()
111
+ print(f"\n─ Ingest CZ: HuggingFace ({self.czech_hf}) {dim}-d")
112
 
113
+ docs = self._loader(self.data_czech)
114
+ texts = self._split(docs, self.chunk, self.overlap)
 
 
115
 
116
+ db = FAISS.from_documents(texts, embedding)
117
+ db.save_local(str(self.czech_store))
118
+ print("✓ Czech store written to", self.czech_store, "\n")
 
 
119
 
 
 
 
 
 
 
120
 
121
# -------------------- quick CLI helper ------------------------------------ #
def main(argv=None):
    """Command-line entry point: build the English and/or Czech store.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Exits with status 2 (via ``parser.error``) when ``--openai`` is given,
    the English store is going to be built, and ``OPENAI_API_KEY`` is unset
    or empty.
    """
    import argparse
    import os

    # Shown at the bottom of --help instead of sitting in an unused string.
    examples = (
        "Examples:\n"
        "  # build both stores with default HF encoders (no OpenAI)\n"
        "  python ingest.py\n"
        "\n"
        "  # build English store with OpenAI encoder (keeps 3 072-d index)\n"
        "  OPENAI_API_KEY=sk-... python ingest.py --openai\n"
    )

    parser = argparse.ArgumentParser(
        epilog=examples,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--openai", action="store_true",
                        help="Use OpenAI embeddings for English.")
    parser.add_argument("--only", choices=["en", "cz"],
                        help="Ingest only that language.")
    args = parser.parse_args(argv)

    build_english = args.only in (None, "en")
    build_czech = args.only in (None, "cz")

    api_key = os.getenv("OPENAI_API_KEY")
    # The key only matters when the English store is actually built with OpenAI.
    if args.openai and build_english and not api_key:
        parser.error("--openai requires the OPENAI_API_KEY environment variable.")

    ing = Ingest(use_openai=args.openai, openai_api_key=api_key)

    if build_english:
        ing.ingest_english()
    if build_czech:
        ing.ingest_czech()


if __name__ == "__main__":
    main()