MaryamKarimi080 committed on
Commit
4e3dac9
·
verified ·
1 Parent(s): 5f096cc

Update scripts/load_documents.py

Browse files
Files changed (1) hide show
  1. scripts/load_documents.py +36 -36
scripts/load_documents.py CHANGED
@@ -1,37 +1,37 @@
1
- from pathlib import Path
2
- from langchain_community.document_loaders import (
3
- PyPDFLoader,
4
- TextLoader,
5
- PythonLoader,
6
- NotebookLoader,
7
- )
8
- import pickle
9
-
10
- DATA_DIR = Path("E:/courses/LangChain Project/main root/data/")
11
- OUTPUT_DIR = Path("E:/courses/LangChain Project/main root/output/")
12
- OUTPUT_PATH = OUTPUT_DIR / "all_docs.pkl"
13
-
14
- # ✅ Create output folder if it doesn't exist
15
- OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
16
-
17
- loaders = {
18
- ".pdf": PyPDFLoader,
19
- ".txt": lambda path: TextLoader(path, encoding="utf-8"),
20
- ".py": PythonLoader,
21
- ".ipynb": NotebookLoader,
22
- }
23
-
24
- documents = []
25
- for file in DATA_DIR.rglob("*"):
26
- loader_class = loaders.get(file.suffix.lower())
27
- if loader_class:
28
- try:
29
- docs = loader_class(str(file)).load()
30
- documents.extend(docs)
31
- print(f"[✓] Loaded: {file.name}")
32
- except Exception as e:
33
- print(f"[!] Failed to load {file.name}: {e}")
34
-
35
- with open(OUTPUT_PATH, "wb") as f:
36
- pickle.dump(documents, f)
37
  print(f"📦 Saved {len(documents)} documents to {OUTPUT_PATH}")
 
# Load every supported file found under the project's data/ directory with the
# matching LangChain document loader and pickle the combined list of documents
# to output/all_docs.pkl.
#
# SECURITY NOTE: the output is a pickle file. Only unpickle all_docs.pkl when it
# comes from a source you trust; unpickling can execute arbitrary code.
import pickle
from pathlib import Path

from langchain_community.document_loaders import (
    NotebookLoader,
    PyPDFLoader,
    PythonLoader,
    TextLoader,
)

# Paths are resolved relative to the directory two levels above this file, so
# the script does not depend on the current working directory.
BASE_DIR = Path(__file__).resolve().parent.parent
DATA_DIR = BASE_DIR / "data"
OUTPUT_DIR = BASE_DIR / "output"
OUTPUT_PATH = OUTPUT_DIR / "all_docs.pkl"

# Make sure the output folder exists before anything is written to it.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Maps a lower-cased file suffix to a callable that takes the file path (str)
# and returns a loader object exposing .load().
loaders = {
    ".pdf": PyPDFLoader,
    ".txt": lambda path: TextLoader(path, encoding="utf-8"),
    ".py": PythonLoader,
    ".ipynb": NotebookLoader,
}

documents = []
# sorted() makes the traversal order (and therefore the order of the pickled
# list) deterministic instead of depending on filesystem enumeration order.
for file in sorted(DATA_DIR.rglob("*")):
    # rglob("*") also yields directories; a directory whose name happens to end
    # in a supported suffix must not be handed to a loader.
    if not file.is_file():
        continue

    loader_class = loaders.get(file.suffix.lower())
    if loader_class is None:
        # Unsupported file type: skip silently.
        continue

    try:
        docs = loader_class(str(file)).load()
    except Exception as e:
        # Deliberate best-effort: one unreadable file must not abort the run.
        print(f"[!] Failed to load {file.name}: {e}")
    else:
        documents.extend(docs)
        print(f"[✓] Loaded: {file.name}")

with open(OUTPUT_PATH, "wb") as f:
    pickle.dump(documents, f)
print(f"📦 Saved {len(documents)} documents to {OUTPUT_PATH}")