Spaces:
Runtime error
Runtime error
File size: 1,448 Bytes
ec0786f 7f45ab4 ec0786f 7f45ab4 ec0786f 7f45ab4 6d540dc 7f45ab4 6d540dc 7f45ab4 ec0786f 7f45ab4 ec0786f 7f45ab4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 |
from dotenv import load_dotenv
from langchain.document_loaders import UnstructuredFileLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from glob import glob
import os
# Load environment variables from .env file
load_dotenv()
# Directory that main() passes to parse_documents(); it is searched for "*.pdf" files.
DOCUMENT_PATH = "data/raw/cixiidae"
# Directory handed to Chroma as persist_directory in persist().
DB_DIR = "chroma"
def parse_documents(path):
    """Parse every PDF directly inside ``path`` and return the combined results.

    Args:
        path: Directory to search; only entries matching ``*.pdf`` are used.

    Returns:
        A single flat list holding the items returned by ``parse_document``
        for each matched file, in the order ``glob`` yields the files.
    """
    pattern = os.path.join(path, "*.pdf")
    return [
        parsed
        for pdf_path in glob(pattern)
        for parsed in parse_document(pdf_path)
    ]
def parse_document(file_path):
    """Load a single file with ``UnstructuredFileLoader``.

    Parsing is best-effort: a file that cannot be loaded is reported on
    stdout and skipped rather than aborting the whole run.

    Args:
        file_path: Path of the file to load.

    Returns:
        The documents produced by the loader, or an empty list when loading
        fails. An empty list (instead of an implicit ``None``) keeps callers
        that ``extend`` or iterate the result from raising ``TypeError``.
    """
    try:
        document = UnstructuredFileLoader(file_path).load()
    except Exception as e:
        # Deliberately broad: any loader failure should only skip this file.
        print(f"An error occurred while processing the file {file_path}: {str(e)}")
        return []
    print(f"File parsed: {file_path}")
    return document
def split(documents):
    """Split ``documents`` into smaller chunks.

    Uses a ``CharacterTextSplitter`` configured with ``chunk_size=1000`` and
    ``chunk_overlap=20``.

    Args:
        documents: Documents to split, as accepted by ``split_documents``.

    Returns:
        The chunks returned by ``CharacterTextSplitter.split_documents``.
    """
    return CharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=20,
    ).split_documents(documents)
def persist(documents):
    """Embed ``documents`` with OpenAI and persist them in a Chroma store.

    The store is created with ``persist_directory=DB_DIR`` and flushed with
    ``vectordb.persist()``.

    Args:
        documents: Document chunks to embed and index. When empty, nothing is
            embedded or written and a notice is printed instead.

    Returns:
        None.
    """
    if not documents:
        # Nothing to index (no PDFs found, or every file failed to parse).
        # Skip early so no embeddings client or store is created for an
        # empty batch. NOTE(review): Chroma may reject an empty batch
        # depending on version - confirm.
        print("No documents to persist; skipping vector store creation.")
        return
    embeddings = OpenAIEmbeddings()
    vectordb = Chroma.from_documents(
        documents, embedding=embeddings, persist_directory=DB_DIR
    )
    vectordb.persist()
def main():
    """Run the ingestion pipeline end to end.

    Parses the PDFs under ``DOCUMENT_PATH``, splits them into chunks, prints
    how many chunks resulted, and persists them to the vector store.
    """
    chunks = split(parse_documents(DOCUMENT_PATH))
    # The reported count is taken after splitting, so it is the number of chunks.
    print(f"Total pages: {len(chunks)}")
    persist(chunks)
# Run the pipeline only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|