Spaces:
Runtime error
Runtime error
from dotenv import load_dotenv | |
from langchain.document_loaders import UnstructuredFileLoader | |
from langchain.embeddings import OpenAIEmbeddings | |
from langchain.vectorstores import Chroma | |
from langchain.text_splitter import CharacterTextSplitter | |
from glob import glob | |
import os | |
# Directory scanned for source PDFs, and directory handed to Chroma as its
# persist location.
DOCUMENT_PATH = "data/raw/cixiidae"
DB_DIR = "chroma"

# Load environment variables from the .env file before any client is built.
load_dotenv()
def parse_documents(path):
    """Load every PDF found directly inside ``path``.

    Args:
        path: Directory searched (non-recursively) for ``*.pdf`` files.

    Returns:
        A flat list holding the documents of all PDFs that could be parsed,
        visited in sorted path order. Files for which ``parse_document``
        yields nothing (``None`` or an empty list) are skipped.
    """
    documents = []
    # glob() gives no ordering guarantee; sort so repeated runs are stable.
    for file_path in sorted(glob(os.path.join(path, "*.pdf"))):
        parsed = parse_document(file_path)
        # A failed load may come back as None; extending with None would
        # raise TypeError and abort the whole batch, so skip empty results.
        if parsed:
            documents.extend(parsed)
    return documents
def parse_document(file_path):
    """Load a single file with ``UnstructuredFileLoader``.

    Args:
        file_path: Path of the file to load.

    Returns:
        Whatever the loader's ``load()`` returns on success (callers treat it
        as a list of documents), or an empty list when loading fails, so the
        result can always be passed to ``list.extend``.
    """
    try:
        document = UnstructuredFileLoader(file_path).load()
    except Exception as e:
        # Deliberately best-effort: one unreadable file must not stop the
        # batch. Report it and hand back an empty list instead of the
        # implicit None, which would break the caller's extend().
        print(f"An error occurred while processing the file {file_path}: {str(e)}")
        return []
    print(f"File parsed: {file_path}")
    return document
def split(documents):
    """Split ``documents`` into chunks with ``CharacterTextSplitter``.

    The splitter is configured with a chunk size of 1000 and an overlap of 20.

    Args:
        documents: Documents to split.

    Returns:
        The result of ``split_documents`` for the given documents.
    """
    splitter = CharacterTextSplitter(chunk_overlap=20, chunk_size=1000)
    chunks = splitter.split_documents(documents)
    return chunks
def persist(documents):
    """Embed ``documents`` and store them in the Chroma database at ``DB_DIR``.

    Args:
        documents: Document chunks to embed with ``OpenAIEmbeddings`` and
            write to the vector store.

    Returns:
        None. When ``documents`` is empty nothing is embedded or written.
    """
    if not documents:
        # Nothing to embed (e.g. no PDF could be parsed). Skip building the
        # embeddings client and the store rather than creating one from an
        # empty list. NOTE(review): Chroma is expected to reject empty
        # input; confirm against the installed version.
        print("No documents to persist.")
        return

    embeddings = OpenAIEmbeddings()
    vectordb = Chroma.from_documents(
        documents, embedding=embeddings, persist_directory=DB_DIR
    )
    # Flush the collection to disk under DB_DIR.
    vectordb.persist()
def main():
    """Run the ingestion pipeline: parse the PDFs, split them, persist them.

    Prints the number of items produced by ``split`` before persisting.
    """
    parsed = parse_documents(DOCUMENT_PATH)
    chunks = split(parsed)
    chunk_count = len(chunks)
    print(f"Total pages: {chunk_count}")
    persist(chunks)


# Only run the pipeline when this file is executed as a script.
if __name__ == "__main__":
    main()