LOUIS SANNA committed on
Commit
7f45ab4
·
1 Parent(s): bddb702

clean(load): cut code in subfunctions

Browse files
Files changed (1) hide show
  1. load.py +41 -27
load.py CHANGED
@@ -1,40 +1,54 @@
1
  from dotenv import load_dotenv
2
-
3
- # Load environment variables from .env file
4
- load_dotenv()
5
-
6
- from langchain.document_loaders import UnstructuredFileLoader # for loading the pdf
7
- from langchain.embeddings import OpenAIEmbeddings # for creating embeddings
8
- from langchain.vectorstores import Chroma # for the vectorization part
9
  from langchain.text_splitter import CharacterTextSplitter
10
  from glob import glob
11
  import os
12
 
 
 
 
13
  DOCUMENT_PATH = "data/raw/cixiidae"
14
  DB_DIR = "chroma"
15
 
16
- pdf_files = glob(os.path.join(DOCUMENT_PATH, "*.pdf"))
17
- documents = []
18
 
19
- # Iterate through the list of PDF files
20
- for file_path in pdf_files:
21
- try:
22
- loader = UnstructuredFileLoader(file_path)
23
- document = loader.load()
24
- documents.extend(document)
25
- print(f"File added: {file_path}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
- except Exception as e:
28
- print(f"An error occurred while processing the file {file_path}: {str(e)}")
29
 
30
- text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
31
- documents = text_splitter.split_documents(documents)
 
 
 
32
 
33
- # Now, all_pages contains all the pages from every document
34
- print(f"Total pages: {len(documents)}")
35
 
36
- embeddings = OpenAIEmbeddings()
37
- vectordb = Chroma.from_documents(
38
- documents, embedding=embeddings, persist_directory=DB_DIR
39
- )
40
- vectordb.persist()
 
1
  from dotenv import load_dotenv
2
+ from langchain.document_loaders import UnstructuredFileLoader
3
+ from langchain.embeddings import OpenAIEmbeddings
4
+ from langchain.vectorstores import Chroma
 
 
 
 
5
  from langchain.text_splitter import CharacterTextSplitter
6
  from glob import glob
7
  import os
8
 
9
# Load environment variables from .env file
# (must run before OpenAIEmbeddings is used, which reads OPENAI_API_KEY).
load_dotenv()

# Directory scanned (non-recursively) for *.pdf source documents.
DOCUMENT_PATH = "data/raw/cixiidae"
# On-disk location of the persisted Chroma vector store.
DB_DIR = "chroma"
14
 
 
 
15
 
16
def parse_documents(path):
    """Load every PDF found directly under *path* into langchain documents.

    Loading is best-effort: a file that fails to parse is reported on
    stdout and skipped, so one bad PDF does not abort the whole run.

    Args:
        path: directory searched (non-recursively) for ``*.pdf`` files.

    Returns:
        A flat list of loaded langchain documents.
    """
    collected = []
    for file_path in glob(os.path.join(path, "*.pdf")):
        try:
            collected.extend(UnstructuredFileLoader(file_path).load())
            print(f"File added: {file_path}")
        except Exception as e:
            print(f"An error occurred while processing the file {file_path}: {str(e)}")
    return collected
31
+
32
+
33
def split(documents):
    """Split *documents* into chunks of ~1000 characters with 20 overlap.

    Args:
        documents: langchain documents to chunk.

    Returns:
        The list of split document chunks.
    """
    splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
    chunks = splitter.split_documents(documents)
    return chunks
36
+
37
+
38
def persist(documents, db_dir=None):
    """Embed *documents* with OpenAI embeddings and persist a Chroma store.

    Generalized: the target directory is now a parameter instead of being
    hard-wired to the module constant, while staying backward-compatible
    (callers that pass nothing get the old behavior).

    Args:
        documents: split langchain documents to index.
        db_dir: directory for the on-disk Chroma store; defaults to the
            module-level ``DB_DIR`` when omitted. The default is resolved
            lazily so importing this module never touches ``DB_DIR``.
    """
    if db_dir is None:
        db_dir = DB_DIR
    embeddings = OpenAIEmbeddings()
    vectordb = Chroma.from_documents(
        documents, embedding=embeddings, persist_directory=db_dir
    )
    # Flush the store to disk so the index survives process exit.
    vectordb.persist()
44
 
 
 
45
 
46
def main():
    """Run the ingestion pipeline: load PDFs, split them, persist embeddings."""
    documents = split(parse_documents(DOCUMENT_PATH))
    print(f"Total pages: {len(documents)}")
    persist(documents)


if __name__ == "__main__":
    main()