Upload 3 files
- scripts/chunk_and_embed.py +20 -0
- scripts/load_documents.py +37 -0
- scripts/setup_vectorstore.py +30 -0
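The three scripts form a small ingestion pipeline and are meant to be run in order: load_documents.py loads the raw files and pickles them, chunk_and_embed.py splits the pickled documents into chunks, and setup_vectorstore.py embeds the chunks into a persistent Chroma store. Each later script checks for the previous script's output and raises a FileNotFoundError if it is missing.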
scripts/chunk_and_embed.py
ADDED
@@ -0,0 +1,20 @@
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pickle
import os

DOCS_PATH = "E:/courses/LangChain Project/main root/output/all_docs.pkl"
CHUNKS_PATH = "E:/courses/LangChain Project/main root/output/chunks.pkl"

if not os.path.exists(DOCS_PATH):
    raise FileNotFoundError("Run load_documents.py first")

with open(DOCS_PATH, "rb") as f:
    all_docs = pickle.load(f)

splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks = splitter.split_documents(all_docs)
print(f"✅ Split into {len(chunks)} chunks")

with open(CHUNKS_PATH, "wb") as f:
    pickle.dump(chunks, f)
print(f"📦 Chunks saved to {CHUNKS_PATH}")
scripts/load_documents.py
ADDED
@@ -0,0 +1,37 @@
from pathlib import Path
from langchain_community.document_loaders import (
    PyPDFLoader,
    TextLoader,
    PythonLoader,
    NotebookLoader,
)
import pickle

DATA_DIR = Path("E:/courses/LangChain Project/main root/data/")
OUTPUT_DIR = Path("E:/courses/LangChain Project/main root/output/")
OUTPUT_PATH = OUTPUT_DIR / "all_docs.pkl"

# ✅ Create output folder if it doesn't exist
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

loaders = {
    ".pdf": PyPDFLoader,
    ".txt": lambda path: TextLoader(path, encoding="utf-8"),
    ".py": PythonLoader,
    ".ipynb": NotebookLoader,
}

documents = []
for file in DATA_DIR.rglob("*"):
    loader_class = loaders.get(file.suffix.lower())
    if loader_class:
        try:
            docs = loader_class(str(file)).load()
            documents.extend(docs)
            print(f"[✓] Loaded: {file.name}")
        except Exception as e:
            print(f"[!] Failed to load {file.name}: {e}")

with open(OUTPUT_PATH, "wb") as f:
    pickle.dump(documents, f)
print(f"📦 Saved {len(documents)} documents to {OUTPUT_PATH}")
scripts/setup_vectorstore.py
ADDED
@@ -0,0 +1,30 @@
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
import pickle
import os

CHUNKS_PATH = "E:/courses/LangChain Project/main root/output/chunks.pkl"
DB_DIR = "E:/courses/LangChain Project/main root/db"
BATCH_SIZE = 100  # Tune this depending on the average token size per chunk

if not os.path.exists(CHUNKS_PATH):
    raise FileNotFoundError("Run chunk_and_embed.py first")

with open(CHUNKS_PATH, "rb") as f:
    chunks = pickle.load(f)

embedding = OpenAIEmbeddings(model="text-embedding-3-small")

# Create or load the vectorstore
vectorstore = Chroma(persist_directory=DB_DIR, embedding_function=embedding)

print(f"🧠 Embedding and adding {len(chunks)} chunks in batches of {BATCH_SIZE}...")

# Add documents in batches to avoid hitting token limits
num_batches = (len(chunks) + BATCH_SIZE - 1) // BATCH_SIZE
for i in range(0, len(chunks), BATCH_SIZE):
    batch = chunks[i:i + BATCH_SIZE]
    vectorstore.add_documents(batch)
    print(f"✅ Added batch {i // BATCH_SIZE + 1} of {num_batches}")

# vectorstore.persist()  # not needed: newer Chroma versions persist automatically when persist_directory is set
print(f"✅ Vectorstore saved to {DB_DIR}")
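A minimal sketch of how the persisted store could be queried afterwards, assuming the same DB_DIR and embedding model used in setup_vectorstore.py (a retrieval script is not part of this upload, and the query string is just an illustration):

from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

DB_DIR = "E:/courses/LangChain Project/main root/db"  # same path as setup_vectorstore.py

# Re-open the persisted vectorstore with the same embedding model used to build it
embedding = OpenAIEmbeddings(model="text-embedding-3-small")
vectorstore = Chroma(persist_directory=DB_DIR, embedding_function=embedding)

# Hypothetical example query, for illustration only
results = vectorstore.similarity_search("What does the chunking script do?", k=3)
for doc in results:
    print(doc.metadata.get("source"), "->", doc.page_content[:80])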