Upload 3 files
- scripts/chunk_and_embed.py +20 -0
- scripts/load_documents.py +37 -0
- scripts/setup_vectorstore.py +30 -0
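The three scripts form a small ingestion pipeline and are meant to be run in order: load_documents.py loads the raw files and pickles them, chunk_and_embed.py splits the pickled documents into chunks, and setup_vectorstore.py embeds the chunks into a persistent Chroma store. Each later script checks for the previous script's output and raises a FileNotFoundError if it is missing.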
scripts/chunk_and_embed.py
ADDED
@@ -0,0 +1,20 @@
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pickle
import os

DOCS_PATH = "E:/courses/LangChain Project/main root/output/all_docs.pkl"
CHUNKS_PATH = "E:/courses/LangChain Project/main root/output/chunks.pkl"

if not os.path.exists(DOCS_PATH):
    raise FileNotFoundError("Run load_documents.py first")

with open(DOCS_PATH, "rb") as f:
    all_docs = pickle.load(f)

splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks = splitter.split_documents(all_docs)
print(f"✅ Split into {len(chunks)} chunks")

with open(CHUNKS_PATH, "wb") as f:
    pickle.dump(chunks, f)
print(f"📦 Chunks saved to {CHUNKS_PATH}")
scripts/load_documents.py
ADDED
@@ -0,0 +1,37 @@
from pathlib import Path
from langchain_community.document_loaders import (
    PyPDFLoader,
    TextLoader,
    PythonLoader,
    NotebookLoader,
)
import pickle

DATA_DIR = Path("E:/courses/LangChain Project/main root/data/")
OUTPUT_DIR = Path("E:/courses/LangChain Project/main root/output/")
OUTPUT_PATH = OUTPUT_DIR / "all_docs.pkl"

# ✅ Create output folder if it doesn't exist
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

loaders = {
    ".pdf": PyPDFLoader,
    ".txt": lambda path: TextLoader(path, encoding="utf-8"),
    ".py": PythonLoader,
    ".ipynb": NotebookLoader,
}

documents = []
for file in DATA_DIR.rglob("*"):
    loader_class = loaders.get(file.suffix.lower())
    if loader_class:
        try:
            docs = loader_class(str(file)).load()
            documents.extend(docs)
            print(f"[✓] Loaded: {file.name}")
        except Exception as e:
            print(f"[!] Failed to load {file.name}: {e}")

with open(OUTPUT_PATH, "wb") as f:
    pickle.dump(documents, f)
print(f"📦 Saved {len(documents)} documents to {OUTPUT_PATH}")
scripts/setup_vectorstore.py
ADDED
@@ -0,0 +1,30 @@
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
import pickle
import os

CHUNKS_PATH = "E:/courses/LangChain Project/main root/output/chunks.pkl"
DB_DIR = "E:/courses/LangChain Project/main root/db"
BATCH_SIZE = 100  # Tune this depending on the average token size per chunk

if not os.path.exists(CHUNKS_PATH):
    raise FileNotFoundError("Run chunk_and_embed.py first")

with open(CHUNKS_PATH, "rb") as f:
    chunks = pickle.load(f)

embedding = OpenAIEmbeddings(model="text-embedding-3-small")

# Create or load the vectorstore
vectorstore = Chroma(persist_directory=DB_DIR, embedding_function=embedding)

print(f"🧠 Embedding and adding {len(chunks)} chunks in batches of {BATCH_SIZE}...")

# Add documents in batches to avoid hitting token limits
num_batches = (len(chunks) + BATCH_SIZE - 1) // BATCH_SIZE
for i in range(0, len(chunks), BATCH_SIZE):
    batch = chunks[i:i + BATCH_SIZE]
    vectorstore.add_documents(batch)
    print(f"✅ Added batch {i // BATCH_SIZE + 1} of {num_batches}")

# vectorstore.persist()  # not needed: newer Chroma versions persist automatically when persist_directory is set
print(f"✅ Vectorstore saved to {DB_DIR}")
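A minimal sketch of how the persisted store could be queried afterwards, assuming the same DB_DIR and embedding model used in setup_vectorstore.py (a retrieval script is not part of this upload, and the query string is just an illustration):

from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

DB_DIR = "E:/courses/LangChain Project/main root/db"  # same path as setup_vectorstore.py

# Re-open the persisted vectorstore with the same embedding model used to build it
embedding = OpenAIEmbeddings(model="text-embedding-3-small")
vectorstore = Chroma(persist_directory=DB_DIR, embedding_function=embedding)

# Hypothetical example query, for illustration only
results = vectorstore.similarity_search("What does the chunking script do?", k=3)
for doc in results:
    print(doc.metadata.get("source"), "->", doc.page_content[:80])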