MaryamKarimi080 committed on
Commit 5f096cc · verified · 1 Parent(s): dfda80f

Upload 3 files

scripts/chunk_and_embed.py ADDED
@@ -0,0 +1,20 @@
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ import pickle
+ import os
+
+ DOCS_PATH = "E:/courses/LangChain Project/main root/output/all_docs.pkl"
+ CHUNKS_PATH = "E:/courses/LangChain Project/main root/output/chunks.pkl"
+
+ if not os.path.exists(DOCS_PATH):
+     raise FileNotFoundError("Run load_documents.py first")
+
+ with open(DOCS_PATH, "rb") as f:
+     all_docs = pickle.load(f)
+
+ splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
+ chunks = splitter.split_documents(all_docs)
+ print(f"✅ Split into {len(chunks)} chunks")
+
+ with open(CHUNKS_PATH, "wb") as f:
+     pickle.dump(chunks, f)
+ print(f"📦 Chunks saved to {CHUNKS_PATH}")
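A quick way to see what chunk_size / chunk_overlap do is to run the same splitter on a toy string with smaller numbers. This is a minimal sketch, not part of the commit; the demo text and sizes are made up.

from langchain.text_splitter import RecursiveCharacterTextSplitter

# Toy settings so the overlap is easy to see; the script above uses 1000/100.
demo = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=10)
pieces = demo.split_text("word " * 40)  # ~200-character toy document

print(len(pieces))   # several pieces of at most 50 characters each
print(pieces[0])
print(pieces[1])     # begins with roughly 10 characters repeated from pieces[0]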
scripts/load_documents.py ADDED
@@ -0,0 +1,37 @@
+ from pathlib import Path
+ from langchain_community.document_loaders import (
+     PyPDFLoader,
+     TextLoader,
+     PythonLoader,
+     NotebookLoader,
+ )
+ import pickle
+
+ DATA_DIR = Path("E:/courses/LangChain Project/main root/data/")
+ OUTPUT_DIR = Path("E:/courses/LangChain Project/main root/output/")
+ OUTPUT_PATH = OUTPUT_DIR / "all_docs.pkl"
+
+ # ✅ Create output folder if it doesn't exist
+ OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
+
+ loaders = {
+     ".pdf": PyPDFLoader,
+     ".txt": lambda path: TextLoader(path, encoding="utf-8"),
+     ".py": PythonLoader,
+     ".ipynb": NotebookLoader,
+ }
+
+ documents = []
+ for file in DATA_DIR.rglob("*"):
+     loader_class = loaders.get(file.suffix.lower())
+     if loader_class:
+         try:
+             docs = loader_class(str(file)).load()
+             documents.extend(docs)
+             print(f"[✓] Loaded: {file.name}")
+         except Exception as e:
+             print(f"[!] Failed to load {file.name}: {e}")
+
+ with open(OUTPUT_PATH, "wb") as f:
+     pickle.dump(documents, f)
+ print(f"📦 Saved {len(documents)} documents to {OUTPUT_PATH}")
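Each loader returns a list of LangChain Document objects, and that list is what gets pickled above. A minimal sketch of loading one file directly, assuming a hypothetical notes.txt under the data directory (the file name is a placeholder, not something from the commit):

from langchain_community.document_loaders import TextLoader

# "notes.txt" is a placeholder path for illustration only.
docs = TextLoader("E:/courses/LangChain Project/main root/data/notes.txt", encoding="utf-8").load()

print(len(docs))                   # usually one Document per text file
print(docs[0].metadata)            # e.g. {'source': 'E:/courses/.../notes.txt'}
print(docs[0].page_content[:200])  # first 200 characters of the file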
scripts/setup_vectorstore.py ADDED
@@ -0,0 +1,30 @@
+ from langchain_community.vectorstores import Chroma
+ from langchain_openai import OpenAIEmbeddings
+ import pickle
+ import os
+
+ CHUNKS_PATH = "E:/courses/LangChain Project/main root/output/chunks.pkl"
+ DB_DIR = "E:/courses/LangChain Project/main root/db"
+ BATCH_SIZE = 100  # Tune this depending on the average token count per chunk
+
+ if not os.path.exists(CHUNKS_PATH):
+     raise FileNotFoundError("Run chunk_and_embed.py first")
+
+ with open(CHUNKS_PATH, "rb") as f:
+     chunks = pickle.load(f)
+
+ embedding = OpenAIEmbeddings(model="text-embedding-3-small")
+
+ # Create or load the vectorstore
+ vectorstore = Chroma(persist_directory=DB_DIR, embedding_function=embedding)
+
+ print(f"🧠 Embedding and adding {len(chunks)} chunks in batches of {BATCH_SIZE}...")
+
+ # Add documents in batches to avoid hitting token limits
+ for i in range(0, len(chunks), BATCH_SIZE):
+     batch = chunks[i:i + BATCH_SIZE]
+     vectorstore.add_documents(batch)
+     print(f"✅ Added batch {i // BATCH_SIZE + 1} of {(len(chunks) + BATCH_SIZE - 1) // BATCH_SIZE}")
+
+ # vectorstore.persist()  # Chroma 0.4+ persists automatically when persist_directory is set
+ print(f"✅ Vectorstore saved to {DB_DIR}")
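None of the three scripts actually queries the store, so a useful sanity check is to reopen the persisted directory and run a similarity search. A minimal sketch, assuming the same path and embedding model as setup_vectorstore.py, a placeholder query string, and that OPENAI_API_KEY is set in the environment:

from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

embedding = OpenAIEmbeddings(model="text-embedding-3-small")
vectorstore = Chroma(
    persist_directory="E:/courses/LangChain Project/main root/db",
    embedding_function=embedding,
)

# The query text is made up; k=3 returns the three closest chunks.
results = vectorstore.similarity_search("What does the course say about text splitters?", k=3)
for doc in results:
    print(doc.metadata.get("source"), "->", doc.page_content[:120])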