Teapack1 committed on
Commit
05055d0
Β·
verified Β·
1 Parent(s): 553e8f9

Update ingest.py

Browse files
Files changed (1) hide show
  1. ingest.py +72 -74
ingest.py CHANGED
@@ -1,13 +1,14 @@
1
  # ingest.py
2
  """
3
- Create / rebuild FAISS vector stores for Czech and English PDFs.
4
 
5
- Default behaviour (matches main.py):
6
- β€’ English embeddings : sentence-transformers/all-MiniLM-L6-v2 (384-d)
7
- β€’ Czech embeddings : Seznam/retromae-small-cs (768-d)
8
 
9
- Set use_openai=True if you really want to produce an English store
10
- with OpenAI's 3 072-d 'text-embedding-3-large' vectors.
 
11
  """
12
 
13
  from pathlib import Path
@@ -16,10 +17,10 @@ from typing import List
16
  from langchain_community.vectorstores import FAISS
17
  from langchain.text_splitter import RecursiveCharacterTextSplitter
18
  from langchain.document_loaders import DirectoryLoader, PyPDFLoader
19
- from langchain.embeddings import (
20
- OpenAIEmbeddings,
21
- HuggingFaceEmbeddings,
22
- )
23
 
24
 
25
  class Ingest:
@@ -27,26 +28,28 @@ class Ingest:
27
  def __init__(
28
  self,
29
  *,
30
- # --- embeddings ----------------------------------------------------
31
- english_hf_model: str = "sentence-transformers/all-MiniLM-L6-v2",
32
- czech_hf_model: str = "Seznam/retromae-small-cs",
33
- english_oa_model: str = "text-embedding-3-large",
34
- use_openai: bool = False, # flip to keep legacy store
35
- openai_api_key: str | None = None,
36
- # --- chunking ------------------------------------------------------
 
37
  chunk: int = 512,
38
  overlap: int = 256,
39
- # --- paths ---------------------------------------------------------
40
  english_store: str = "stores/english_512",
41
  czech_store: str = "stores/czech_512",
42
  data_english: str = "data/english",
43
  data_czech: str = "data/czech",
44
  ):
45
- self.use_openai = use_openai
46
- self.oa_key = openai_api_key
47
- self.english_hf = english_hf_model
48
- self.czech_hf = czech_hf_model
49
- self.english_oa = english_oa_model
 
50
 
51
  self.chunk = chunk
52
  self.overlap = overlap
@@ -58,89 +61,84 @@ class Ingest:
58
 
59
  # --------------------------- helpers ---------------------------------- #
60
  @staticmethod
61
- def _loader(folder: Path):
62
  return DirectoryLoader(
63
  str(folder),
64
  recursive=True,
65
- show_progress=True,
66
  loader_cls=PyPDFLoader,
 
67
  use_multithreading=True,
68
  ).load()
69
 
70
  @staticmethod
71
  def _split(docs: List, chunk: int, overlap: int):
72
- splitter = RecursiveCharacterTextSplitter(chunk_size=chunk,
73
- chunk_overlap=overlap)
74
- return splitter.split_documents(docs)
75
 
76
  # --------------------------- English ---------------------------------- #
77
  def ingest_english(self):
78
- if self.use_openai:
79
- if not self.oa_key:
80
- raise ValueError("OpenAI API key is required for OpenAI embeddings.")
81
- embedding = OpenAIEmbeddings(
82
- openai_api_key=self.oa_key,
83
- model=self.english_oa,
84
  )
85
- mode = f"OpenAI ({self.english_oa}) 3072-d"
86
  else:
87
- embedding = HuggingFaceEmbeddings(
88
- model_name=self.english_hf,
89
  model_kwargs={"device": "cpu"},
90
  encode_kwargs={"normalize_embeddings": False},
91
  )
92
- mode = f"HuggingFace ({self.english_hf}) " \
93
- f"{embedding.client.get_sentence_embedding_dimension()}-d"
94
 
95
- print(f"\n─ Ingest EN: {mode}")
96
- docs = self._loader(self.data_english)
97
- texts = self._split(docs, self.chunk, self.overlap)
98
-
99
- db = FAISS.from_documents(texts, embedding)
100
- db.save_local(str(self.english_store))
101
- print("βœ“ English store written to", self.english_store, "\n")
102
 
103
  # --------------------------- Czech ------------------------------------ #
104
  def ingest_czech(self):
105
- embedding = HuggingFaceEmbeddings(
106
- model_name=self.czech_hf,
107
  model_kwargs={"device": "cpu"},
108
  encode_kwargs={"normalize_embeddings": False},
109
  )
110
- dim = embedding.client.get_sentence_embedding_dimension()
111
- print(f"\n─ Ingest CZ: HuggingFace ({self.czech_hf}) {dim}-d")
112
-
113
- docs = self._loader(self.data_czech)
114
- texts = self._split(docs, self.chunk, self.overlap)
 
115
 
116
- db = FAISS.from_documents(texts, embedding)
117
- db.save_local(str(self.czech_store))
118
- print("βœ“ Czech store written to", self.czech_store, "\n")
119
 
120
-
121
- # -------------------- quick CLI helper ------------------------------------ #
122
  if __name__ == "__main__":
123
  """
124
- Examples:
125
- # build both stores with default HF encoders (no OpenAI)
126
- python ingest.py
127
-
128
- # build English store with OpenAI encoder (keeps 3 072-d index)
129
- OPENAI_API_KEY=sk-... python ingest.py --openai
130
  """
131
  import argparse, os
132
 
133
- parser = argparse.ArgumentParser()
134
- parser.add_argument("--openai", action="store_true",
135
- help="Use OpenAI embeddings for English.")
136
- parser.add_argument("--only", choices=["en", "cz"],
137
- help="Ingest only that language.")
138
- args = parser.parse_args()
139
 
140
- ing = Ingest(use_openai=args.openai,
141
- openai_api_key=os.getenv("OPENAI_API_KEY"))
 
 
142
 
143
- if args.only in (None, "en"):
144
  ing.ingest_english()
145
- if args.only in (None, "cz"):
146
  ing.ingest_czech()
 
1
  # ingest.py
2
  """
3
+ Create FAISS indices for Czech and English PDFs.
4
 
5
+ Default (matches backend/main.py):
6
+ β€’ English embeddings : sentence-transformers/all-MiniLM-L6-v2 (384-d)
7
+ β€’ Czech embeddings : Seznam/retromae-small-cs (768-d)
8
 
9
+ If you still need a legacy English store with OpenAI
10
+ `text-embedding-3-large` (3 072-d), instantiate with
11
+ use_openai_embeddings=True and pass OPENAI_API_KEY.
12
  """
13
 
14
  from pathlib import Path
 
17
  from langchain_community.vectorstores import FAISS
18
  from langchain.text_splitter import RecursiveCharacterTextSplitter
19
  from langchain.document_loaders import DirectoryLoader, PyPDFLoader
20
+
21
+ # ← updated import (fixes deprecation warning) ----------------------[2][3]
22
+ from langchain_huggingface.embeddings import HuggingFaceEmbeddings
23
+ from langchain.embeddings import OpenAIEmbeddings
24
 
25
 
26
  class Ingest:
 
28
  def __init__(
29
  self,
30
  *,
31
+ # names must stay exactly like in backend/main.py
32
+ english_embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2",
33
+ czech_embedding_model: str = "Seznam/retromae-small-cs",
34
+ # optional OpenAI path
35
+ use_openai_embeddings: bool = False,
36
+ openai_api_key: str | None = None,
37
+ openai_embedding_model: str = "text-embedding-3-large",
38
+ # chunking
39
  chunk: int = 512,
40
  overlap: int = 256,
41
+ # folders
42
  english_store: str = "stores/english_512",
43
  czech_store: str = "stores/czech_512",
44
  data_english: str = "data/english",
45
  data_czech: str = "data/czech",
46
  ):
47
+ self.english_embedding_model = english_embedding_model
48
+ self.czech_embedding_model = czech_embedding_model
49
+
50
+ self.use_openai_embeddings = use_openai_embeddings
51
+ self.openai_api_key = openai_api_key
52
+ self.openai_embedding_model = openai_embedding_model
53
 
54
  self.chunk = chunk
55
  self.overlap = overlap
 
61
 
62
  # --------------------------- helpers ---------------------------------- #
63
  @staticmethod
64
+ def _load(folder: Path):
65
  return DirectoryLoader(
66
  str(folder),
67
  recursive=True,
 
68
  loader_cls=PyPDFLoader,
69
+ show_progress=True,
70
  use_multithreading=True,
71
  ).load()
72
 
73
  @staticmethod
74
  def _split(docs: List, chunk: int, overlap: int):
75
+ return RecursiveCharacterTextSplitter(
76
+ chunk_size=chunk, chunk_overlap=overlap
77
+ ).split_documents(docs)
78
 
79
  # --------------------------- English ---------------------------------- #
80
  def ingest_english(self):
81
+ if self.use_openai_embeddings:
82
+ if not self.openai_api_key:
83
+ raise ValueError("OPENAI_API_KEY missing for OpenAI embeddings.")
84
+ embed = OpenAIEmbeddings(
85
+ openai_api_key=self.openai_api_key,
86
+ model=self.openai_embedding_model,
87
  )
88
+ mode = f"OpenAI ({self.openai_embedding_model}) 3 072-d"
89
  else:
90
+ embed = HuggingFaceEmbeddings(
91
+ model_name=self.english_embedding_model,
92
  model_kwargs={"device": "cpu"},
93
  encode_kwargs={"normalize_embeddings": False},
94
  )
95
+ dim = embed.client.get_sentence_embedding_dimension()
96
+ mode = f"HuggingFace ({self.english_embedding_model}) {dim}-d"
97
 
98
+ print(f"\n── Building English index with {mode}")
99
+ texts = self._split(self._load(self.data_english), self.chunk, self.overlap)
100
+ FAISS.from_documents(texts, embed).save_local(str(self.english_store))
101
+ print("βœ“ English store saved to", self.english_store, "\n")
 
 
 
102
 
103
  # --------------------------- Czech ------------------------------------ #
104
  def ingest_czech(self):
105
+ embed = HuggingFaceEmbeddings(
106
+ model_name=self.czech_embedding_model,
107
  model_kwargs={"device": "cpu"},
108
  encode_kwargs={"normalize_embeddings": False},
109
  )
110
+ dim = embed.client.get_sentence_embedding_dimension()
111
+ print(f"\n── Building Czech index with HuggingFace "
112
+ f"({self.czech_embedding_model}) {dim}-d")
113
+ texts = self._split(self._load(self.data_czech), self.chunk, self.overlap)
114
+ FAISS.from_documents(texts, embed).save_local(str(self.czech_store))
115
+ print("βœ“ Czech store saved to", self.czech_store, "\n")
116
 
 
 
 
117
 
118
# ───────────── CLI helper (optional) ───────────── #
if __name__ == "__main__":
    """
    Examples
    --------
    python ingest.py                      # builds both stores (OSS embeddings)
    OPENAI_API_KEY=sk-... \
    python ingest.py --openai en          # rebuild English with OpenAI encoder
    """
    import argparse
    import os

    cli = argparse.ArgumentParser()
    cli.add_argument("--openai", action="store_true",
                     help="Use OpenAI embeddings for English store.")
    cli.add_argument("lang", nargs="?", choices=["en", "cz"],
                     help="Only ingest this language.")
    opts = cli.parse_args()

    ingestor = Ingest(
        use_openai_embeddings=opts.openai,
        openai_api_key=os.getenv("OPENAI_API_KEY"),
    )

    # No positional language → build both stores.
    if opts.lang != "cz":
        ingestor.ingest_english()
    if opts.lang != "en":
        ingestor.ingest_czech()