MaryamKarimi080 committed on
Commit
854668c
·
verified ·
1 Parent(s): a99c81d

Update scripts/chunk_and_embed.py

Browse files
Files changed (1) hide show
  1. scripts/chunk_and_embed.py +17 -14
scripts/chunk_and_embed.py CHANGED
@@ -3,18 +3,21 @@ from pathlib import Path
3
  import pickle
4
  import os
5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
- BASE_DIR = Path(__file__).resolve().parent.parent
8
- DOCS_PATH = BASE_DIR / "output" / "all_docs.pkl"
9
- CHUNKS_PATH = BASE_DIR / "output" / "chunks.pkl"
10
-
11
- with open(DOCS_PATH, "rb") as f:
12
- all_docs = pickle.load(f)
13
-
14
- splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
15
- chunks = splitter.split_documents(all_docs)
16
- print(f"✅ Split into {len(chunks)} chunks")
17
-
18
- with open(CHUNKS_PATH, "wb") as f:
19
- pickle.dump(chunks, f)
20
- print(f"📦 Chunks saved to {CHUNKS_PATH}")
 
3
  import pickle
4
  import os
5
 
6
+ def main():
7
+ BASE_DIR = Path(__file__).resolve().parent.parent
8
+ DOCS_PATH = BASE_DIR / "output" / "all_docs.pkl"
9
+ CHUNKS_PATH = BASE_DIR / "output" / "chunks.pkl"
10
+
11
+ with open(DOCS_PATH, "rb") as f:
12
+ all_docs = pickle.load(f)
13
+
14
+ splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
15
+ chunks = splitter.split_documents(all_docs)
16
+ print(f"✅ Split into {len(chunks)} chunks")
17
+
18
+ with open(CHUNKS_PATH, "wb") as f:
19
+ pickle.dump(chunks, f)
20
+ print(f"📦 Chunks saved to {CHUNKS_PATH}")
21
 
22
+ if __name__ == "__main__":
23
+ main()