MaryamKarimi080 committed on
Commit
a99c81d
·
verified ·
1 Parent(s): 6ca7edb

Update scripts/load_documents.py

Browse files
Files changed (1) hide show
  1. scripts/load_documents.py +31 -27
scripts/load_documents.py CHANGED
@@ -7,31 +7,35 @@ from langchain_community.document_loaders import (
7
  )
8
  import pickle
9
 
10
- BASE_DIR = Path(__file__).resolve().parent.parent
11
- DATA_DIR = BASE_DIR / "data"
12
- OUTPUT_DIR = BASE_DIR / "output"
13
- OUTPUT_PATH = OUTPUT_DIR / "all_docs.pkl"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
- OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
16
-
17
- loaders = {
18
- ".pdf": PyPDFLoader,
19
- ".txt": lambda path: TextLoader(path, encoding="utf-8"),
20
- ".py": PythonLoader,
21
- ".ipynb": NotebookLoader,
22
- }
23
-
24
- documents = []
25
- for file in DATA_DIR.rglob("*"):
26
- loader_class = loaders.get(file.suffix.lower())
27
- if loader_class:
28
- try:
29
- docs = loader_class(str(file)).load()
30
- documents.extend(docs)
31
- print(f"[✓] Loaded: {file.name}")
32
- except Exception as e:
33
- print(f"[!] Failed to load {file.name}: {e}")
34
-
35
- with open(OUTPUT_PATH, "wb") as f:
36
- pickle.dump(documents, f)
37
- print(f"📦 Saved {len(documents)} documents to {OUTPUT_PATH}")
 
7
  )
8
  import pickle
9
 
10
def main() -> None:
    """Load every supported file under ``data/`` and pickle the results.

    The project root is taken to be the parent of the directory containing
    this script. Each regular file below ``<root>/data`` whose suffix is one
    of ``.pdf``, ``.txt``, ``.py`` or ``.ipynb`` is passed to the matching
    loader, and all documents returned by the loaders are collected into a
    single list that is written to ``<root>/output/all_docs.pkl``.

    A file that fails to load is reported on stdout and skipped, so one bad
    file does not abort the whole run.
    """
    base_dir = Path(__file__).resolve().parent.parent
    data_dir = base_dir / "data"
    output_dir = base_dir / "output"
    output_path = output_dir / "all_docs.pkl"

    output_dir.mkdir(parents=True, exist_ok=True)

    # Maps a lower-cased file suffix to a callable that takes the file path
    # as a string and returns an object exposing ``.load()``.
    loaders = {
        ".pdf": PyPDFLoader,
        ".txt": lambda path: TextLoader(path, encoding="utf-8"),
        ".py": PythonLoader,
        ".ipynb": NotebookLoader,
    }

    documents = []
    # Sort the paths so the order of the pickled documents does not depend
    # on the filesystem's directory enumeration order.
    for file in sorted(data_dir.rglob("*")):
        # rglob("*") also yields directories; a directory whose name happens
        # to end in a supported suffix must not be handed to a loader.
        if not file.is_file():
            continue
        loader_factory = loaders.get(file.suffix.lower())
        if loader_factory is None:
            continue
        try:
            docs = loader_factory(str(file)).load()
            documents.extend(docs)
            print(f"[✓] Loaded: {file.name}")
        except Exception as e:
            # Deliberate best-effort: report the failure and carry on with
            # the remaining files.
            print(f"[!] Failed to load {file.name}: {e}")

    with open(output_path, "wb") as f:
        pickle.dump(documents, f)
    print(f"📦 Saved {len(documents)} documents to {output_path}")


if __name__ == "__main__":
    main()