MaryamKarimi080 committed on
Commit
32e0ef9
·
verified ·
1 Parent(s): 1458fa6

Update scripts/setup_vectorstore.py

Browse files
Files changed (1) hide show
  1. scripts/setup_vectorstore.py +21 -30
scripts/setup_vectorstore.py CHANGED
@@ -1,30 +1,21 @@
1
- from langchain_community.vectorstores import Chroma
2
- from langchain_openai import OpenAIEmbeddings
3
- import pickle
4
- import os
5
-
6
- CHUNKS_PATH = "E:/courses/LangChain Project/main root/output/chunks.pkl"
7
- DB_DIR = "E:/courses/LangChain Project/main root/db"
8
- BATCH_SIZE = 100 # You can tune this depending on average token size per chunk
9
-
10
- if not os.path.exists(CHUNKS_PATH):
11
- raise FileNotFoundError("Run chunk_and_embed.py first")
12
-
13
- with open(CHUNKS_PATH, "rb") as f:
14
- chunks = pickle.load(f)
15
-
16
- embedding = OpenAIEmbeddings(model="text-embedding-3-small")
17
-
18
- # Create or load the vectorstore
19
- vectorstore = Chroma(persist_directory=DB_DIR, embedding_function=embedding)
20
-
21
- print(f"🧠 Embedding and adding {len(chunks)} chunks in batches of {BATCH_SIZE}...")
22
-
23
- # Add documents in batches to avoid hitting token limits
24
- for i in range(0, len(chunks), BATCH_SIZE):
25
- batch = chunks[i:i + BATCH_SIZE]
26
- vectorstore.add_documents(batch)
27
- print(f"✅ Added batch {i // BATCH_SIZE + 1} of {len(chunks) // BATCH_SIZE + 1}")
28
-
29
- # vectorstore.persist()
30
- print(f"✅ Vectorstore saved to {DB_DIR}")
 
1
+ import pickle
2
+ from pathlib import Path
3
+ from langchain_community.vectorstores import Chroma
4
+ from langchain_openai import OpenAIEmbeddings
5
+
6
+ BASE_DIR = Path(__file__).resolve().parent.parent
7
+ CHUNKS_PATH = BASE_DIR / "output" / "chunks.pkl"
8
+ DB_DIR = BASE_DIR / "db"
9
+
10
+ with open(CHUNKS_PATH, "rb") as f:
11
+ chunks = pickle.load(f)
12
+
13
+ embedding = OpenAIEmbeddings(model="text-embedding-3-small")
14
+ vectorstore = Chroma(persist_directory=str(DB_DIR), embedding_function=embedding)
15
+
16
+ BATCH_SIZE = 100
17
+ print(f"🧠 Embedding and adding {len(chunks)} chunks in batches...")
18
+ for i in range(0, len(chunks), BATCH_SIZE):
19
+ batch = chunks[i:i + BATCH_SIZE]
20
+ vectorstore.add_documents(batch)
21
+ print(f" Added batch {i // BATCH_SIZE + 1}")