LOUIS SANNA committed on
Commit
7f45ab4
·
1 Parent(s): bddb702

clean(load): cut code in subfunctions

Browse files
Files changed (1) hide show
  1. load.py +41 -27
load.py CHANGED
@@ -1,40 +1,54 @@
1
  from dotenv import load_dotenv
2
-
3
- # Load environment variables from .env file
4
- load_dotenv()
5
-
6
- from langchain.document_loaders import UnstructuredFileLoader # for loading the pdf
7
- from langchain.embeddings import OpenAIEmbeddings # for creating embeddings
8
- from langchain.vectorstores import Chroma # for the vectorization part
9
  from langchain.text_splitter import CharacterTextSplitter
10
  from glob import glob
11
  import os
12
 
 
 
 
13
  DOCUMENT_PATH = "data/raw/cixiidae"
14
  DB_DIR = "chroma"
15
 
16
- pdf_files = glob(os.path.join(DOCUMENT_PATH, "*.pdf"))
17
- documents = []
18
 
19
- # Iterate through the list of PDF files
20
- for file_path in pdf_files:
21
- try:
22
- loader = UnstructuredFileLoader(file_path)
23
- document = loader.load()
24
- documents.extend(document)
25
- print(f"File added: {file_path}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
- except Exception as e:
28
- print(f"An error occurred while processing the file {file_path}: {str(e)}")
29
 
30
- text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
31
- documents = text_splitter.split_documents(documents)
 
 
 
32
 
33
- # Now, all_pages contains all the pages from every document
34
- print(f"Total pages: {len(documents)}")
35
 
36
- embeddings = OpenAIEmbeddings()
37
- vectordb = Chroma.from_documents(
38
- documents, embedding=embeddings, persist_directory=DB_DIR
39
- )
40
- vectordb.persist()
 
1
  from dotenv import load_dotenv
2
+ from langchain.document_loaders import UnstructuredFileLoader
3
+ from langchain.embeddings import OpenAIEmbeddings
4
+ from langchain.vectorstores import Chroma
 
 
 
 
5
  from langchain.text_splitter import CharacterTextSplitter
6
  from glob import glob
7
  import os
8
 
9
# Load environment variables from .env file
# (must run before OpenAIEmbeddings is used, which reads OPENAI_API_KEY).
load_dotenv()

# Directory scanned (non-recursively) for *.pdf source documents.
DOCUMENT_PATH = "data/raw/cixiidae"
# On-disk location of the persisted Chroma vector store.
DB_DIR = "chroma"
14
 
 
 
15
 
16
def parse_documents(path):
    """Load every PDF found directly under *path* into langchain documents.

    Loading is best-effort: a file that fails to parse is reported on
    stdout and skipped, so one bad PDF does not abort the whole run.

    Args:
        path: directory searched (non-recursively) for ``*.pdf`` files.

    Returns:
        A flat list of loaded langchain documents.
    """
    collected = []
    for file_path in glob(os.path.join(path, "*.pdf")):
        try:
            collected.extend(UnstructuredFileLoader(file_path).load())
            print(f"File added: {file_path}")
        except Exception as e:
            print(f"An error occurred while processing the file {file_path}: {str(e)}")
    return collected
31
+
32
+
33
def split(documents):
    """Split *documents* into chunks of ~1000 characters with 20 overlap.

    Args:
        documents: langchain documents to chunk.

    Returns:
        The list of split document chunks.
    """
    splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
    chunks = splitter.split_documents(documents)
    return chunks
36
+
37
+
38
def persist(documents, db_dir=None):
    """Embed *documents* with OpenAI embeddings and persist a Chroma store.

    Generalized: the target directory is now a parameter instead of being
    hard-wired to the module constant, while staying backward-compatible
    (callers that pass nothing get the old behavior).

    Args:
        documents: split langchain documents to index.
        db_dir: directory for the on-disk Chroma store; defaults to the
            module-level ``DB_DIR`` when omitted. The default is resolved
            lazily so importing this module never touches ``DB_DIR``.
    """
    if db_dir is None:
        db_dir = DB_DIR
    embeddings = OpenAIEmbeddings()
    vectordb = Chroma.from_documents(
        documents, embedding=embeddings, persist_directory=db_dir
    )
    # Flush the store to disk so the index survives process exit.
    vectordb.persist()
44
 
 
 
45
 
46
def main():
    """Run the ingestion pipeline: load PDFs, split them, persist embeddings."""
    documents = split(parse_documents(DOCUMENT_PATH))
    print(f"Total pages: {len(documents)}")
    persist(documents)


if __name__ == "__main__":
    main()