Teapack1 committed on
Commit
cd7b78b
·
verified Β·
1 Parent(s): 05055d0

Update ingest.py

Browse files
Files changed (1) hide show
  1. ingest.py +19 -75
ingest.py CHANGED
@@ -1,44 +1,24 @@
1
- # ingest.py
2
- """
3
- Create FAISS indices for Czech and English PDFs.
4
-
5
- Default (matches backend/main.py):
6
- β€’ English embeddings : sentence-transformers/all-MiniLM-L6-v2 (384-d)
7
- β€’ Czech embeddings : Seznam/retromae-small-cs (768-d)
8
-
9
- If you still need a legacy English store with OpenAI
10
- `text-embedding-3-large` (3 072-d), instantiate with
11
- use_openai_embeddings=True and pass OPENAI_API_KEY.
12
- """
13
-
14
  from pathlib import Path
15
  from typing import List
16
 
17
  from langchain_community.vectorstores import FAISS
18
- from langchain.text_splitter import RecursiveCharacterTextSplitter
19
- from langchain.document_loaders import DirectoryLoader, PyPDFLoader
20
-
21
- # ← updated import (fixes deprecation warning) ----------------------[2][3]
22
  from langchain_huggingface.embeddings import HuggingFaceEmbeddings
23
- from langchain.embeddings import OpenAIEmbeddings
24
-
25
 
26
  class Ingest:
27
- # --------------------------------------------------------------------- #
28
  def __init__(
29
  self,
30
  *,
31
- # names must stay exactly like in backend/main.py
32
  english_embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2",
33
  czech_embedding_model: str = "Seznam/retromae-small-cs",
34
- # optional OpenAI path
35
  use_openai_embeddings: bool = False,
36
- openai_api_key: str | None = None,
37
  openai_embedding_model: str = "text-embedding-3-large",
38
- # chunking
39
  chunk: int = 512,
40
  overlap: int = 256,
41
- # folders
42
  english_store: str = "stores/english_512",
43
  czech_store: str = "stores/czech_512",
44
  data_english: str = "data/english",
@@ -46,37 +26,34 @@ class Ingest:
46
  ):
47
  self.english_embedding_model = english_embedding_model
48
  self.czech_embedding_model = czech_embedding_model
49
-
50
  self.use_openai_embeddings = use_openai_embeddings
51
- self.openai_api_key = openai_api_key
52
  self.openai_embedding_model = openai_embedding_model
53
-
54
  self.chunk = chunk
55
  self.overlap = overlap
56
-
57
  self.english_store = Path(english_store)
58
  self.czech_store = Path(czech_store)
59
  self.data_english = Path(data_english)
60
  self.data_czech = Path(data_czech)
61
 
62
- # --------------------------- helpers ---------------------------------- #
63
  @staticmethod
64
  def _load(folder: Path):
65
  return DirectoryLoader(
66
  str(folder),
67
  recursive=True,
68
  loader_cls=PyPDFLoader,
69
- show_progress=True,
70
  use_multithreading=True,
 
71
  ).load()
72
 
73
  @staticmethod
74
  def _split(docs: List, chunk: int, overlap: int):
75
- return RecursiveCharacterTextSplitter(
76
- chunk_size=chunk, chunk_overlap=overlap
77
- ).split_documents(docs)
78
 
79
- # --------------------------- English ---------------------------------- #
80
  def ingest_english(self):
81
  if self.use_openai_embeddings:
82
  if not self.openai_api_key:
@@ -85,60 +62,27 @@ class Ingest:
85
  openai_api_key=self.openai_api_key,
86
  model=self.openai_embedding_model,
87
  )
88
- mode = f"OpenAI ({self.openai_embedding_model}) 3 072-d"
89
  else:
90
  embed = HuggingFaceEmbeddings(
91
  model_name=self.english_embedding_model,
92
  model_kwargs={"device": "cpu"},
93
  encode_kwargs={"normalize_embeddings": False},
94
  )
95
- dim = embed.client.get_sentence_embedding_dimension()
96
- mode = f"HuggingFace ({self.english_embedding_model}) {dim}-d"
97
-
98
- print(f"\n── Building English index with {mode}")
99
  texts = self._split(self._load(self.data_english), self.chunk, self.overlap)
100
  FAISS.from_documents(texts, embed).save_local(str(self.english_store))
101
- print("βœ“ English store saved to", self.english_store, "\n")
102
 
103
- # --------------------------- Czech ------------------------------------ #
104
  def ingest_czech(self):
105
  embed = HuggingFaceEmbeddings(
106
  model_name=self.czech_embedding_model,
107
  model_kwargs={"device": "cpu"},
108
  encode_kwargs={"normalize_embeddings": False},
109
  )
110
- dim = embed.client.get_sentence_embedding_dimension()
111
- print(f"\n── Building Czech index with HuggingFace "
112
- f"({self.czech_embedding_model}) {dim}-d")
113
  texts = self._split(self._load(self.data_czech), self.chunk, self.overlap)
114
  FAISS.from_documents(texts, embed).save_local(str(self.czech_store))
115
- print("βœ“ Czech store saved to", self.czech_store, "\n")
116
-
117
-
118
- # ───────────── CLI helper (optional) ───────────── #
119
- if __name__ == "__main__":
120
- """
121
- Examples
122
- --------
123
- python ingest.py # builds both stores (OSS embeddings)
124
- OPENAI_API_KEY=sk-... \
125
- python ingest.py --openai en # rebuild English with OpenAI encoder
126
- """
127
- import argparse, os
128
-
129
- p = argparse.ArgumentParser()
130
- p.add_argument("--openai", action="store_true",
131
- help="Use OpenAI embeddings for English store.")
132
- p.add_argument("lang", nargs="?", choices=["en", "cz"],
133
- help="Only ingest this language.")
134
- args = p.parse_args()
135
-
136
- ing = Ingest(
137
- use_openai_embeddings=args.openai,
138
- openai_api_key=os.getenv("OPENAI_API_KEY"),
139
- )
140
-
141
- if args.lang in (None, "en"):
142
- ing.ingest_english()
143
- if args.lang in (None, "cz"):
144
- ing.ingest_czech()
 
1
+ # ingest.py – works with LangChain v0.2+
 
 
 
 
 
 
 
 
 
 
 
 
2
  from pathlib import Path
3
  from typing import List
4
 
5
  from langchain_community.vectorstores import FAISS
6
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
7
+ from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
 
 
8
  from langchain_huggingface.embeddings import HuggingFaceEmbeddings
9
+ from langchain_openai import OpenAIEmbeddings # optional
 
10
 
11
  class Ingest:
 
12
  def __init__(
13
  self,
14
  *,
 
15
  english_embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2",
16
  czech_embedding_model: str = "Seznam/retromae-small-cs",
 
17
  use_openai_embeddings: bool = False,
 
18
  openai_embedding_model: str = "text-embedding-3-large",
19
+ openai_api_key: str | None = None,
20
  chunk: int = 512,
21
  overlap: int = 256,
 
22
  english_store: str = "stores/english_512",
23
  czech_store: str = "stores/czech_512",
24
  data_english: str = "data/english",
 
26
  ):
27
  self.english_embedding_model = english_embedding_model
28
  self.czech_embedding_model = czech_embedding_model
 
29
  self.use_openai_embeddings = use_openai_embeddings
 
30
  self.openai_embedding_model = openai_embedding_model
31
+ self.openai_api_key = openai_api_key
32
  self.chunk = chunk
33
  self.overlap = overlap
 
34
  self.english_store = Path(english_store)
35
  self.czech_store = Path(czech_store)
36
  self.data_english = Path(data_english)
37
  self.data_czech = Path(data_czech)
38
 
39
+ # ------------------------------------------------------------------ utils
40
  @staticmethod
41
  def _load(folder: Path):
42
  return DirectoryLoader(
43
  str(folder),
44
  recursive=True,
45
  loader_cls=PyPDFLoader,
 
46
  use_multithreading=True,
47
+ show_progress=True,
48
  ).load()
49
 
50
  @staticmethod
51
  def _split(docs: List, chunk: int, overlap: int):
52
+ splitter = RecursiveCharacterTextSplitter(chunk_size=chunk,
53
+ chunk_overlap=overlap)
54
+ return splitter.split_documents(docs)
55
 
56
+ # ------------------------------------------------------------------ ENG
57
  def ingest_english(self):
58
  if self.use_openai_embeddings:
59
  if not self.openai_api_key:
 
62
  openai_api_key=self.openai_api_key,
63
  model=self.openai_embedding_model,
64
  )
65
+ mode = f"OpenAI {self.openai_embedding_model}"
66
  else:
67
  embed = HuggingFaceEmbeddings(
68
  model_name=self.english_embedding_model,
69
  model_kwargs={"device": "cpu"},
70
  encode_kwargs={"normalize_embeddings": False},
71
  )
72
+ mode = f"HuggingFace {self.english_embedding_model}"
73
+ print(f"β€’ English ingest with {mode}")
 
 
74
  texts = self._split(self._load(self.data_english), self.chunk, self.overlap)
75
  FAISS.from_documents(texts, embed).save_local(str(self.english_store))
76
+ print("βœ“ English store saved to", self.english_store)
77
 
78
+ # ------------------------------------------------------------------ CZ
79
  def ingest_czech(self):
80
  embed = HuggingFaceEmbeddings(
81
  model_name=self.czech_embedding_model,
82
  model_kwargs={"device": "cpu"},
83
  encode_kwargs={"normalize_embeddings": False},
84
  )
85
+ print(f"β€’ Czech ingest with {self.czech_embedding_model}")
 
 
86
  texts = self._split(self._load(self.data_czech), self.chunk, self.overlap)
87
  FAISS.from_documents(texts, embed).save_local(str(self.czech_store))
88
+ print("βœ“ Czech store saved to", self.czech_store)