datak / load.py
LOUIS SANNA
feat(summarization): add document summarization
6d540dc
from dotenv import load_dotenv
from langchain.document_loaders import UnstructuredFileLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from glob import glob
import os
# Load environment variables from .env file
# (presumably supplies the credentials OpenAIEmbeddings needs — TODO confirm)
load_dotenv()
# Directory that main() scans for "*.pdf" files to ingest.
DOCUMENT_PATH = "data/raw/cixiidae"
# Directory handed to Chroma as persist_directory for the vector store.
DB_DIR = "chroma"
def parse_documents(path):
    """Parse every PDF directly inside ``path`` into one flat list.

    Args:
        path: Directory searched (non-recursively) for ``*.pdf`` files.

    Returns:
        A list holding the documents of every file that could be parsed,
        in sorted file-path order. Empty when no PDF is found.
    """
    # Sort so the result does not depend on the file system's listing order.
    pdf_files = sorted(glob(os.path.join(path, "*.pdf")))
    documents = []
    for file_path in pdf_files:
        parsed = parse_document(file_path)
        # parse_document may hand back None (or an empty list) for a file it
        # could not read; skip it instead of letting extend(None) raise
        # TypeError and abort the whole batch.
        if parsed:
            documents.extend(parsed)
    return documents
def parse_document(file_path):
    """Load a single file with ``UnstructuredFileLoader``.

    Failures are handled on a best-effort basis: the error is printed and an
    empty list is returned, so callers can safely ``extend`` with the result
    instead of receiving ``None``.

    Args:
        file_path: Path of the file to load.

    Returns:
        Whatever ``loader.load()`` returns on success (the caller treats it
        as an iterable of documents); ``[]`` if loading raised.
    """
    try:
        loader = UnstructuredFileLoader(file_path)
        document = loader.load()
    except Exception as e:
        # Deliberately broad: one unreadable file must not stop the run.
        print(f"An error occurred while processing the file {file_path}: {str(e)}")
        return []
    print(f"File parsed: {file_path}")
    return document
def split(documents, chunk_size=1000, chunk_overlap=20):
    """Split documents into smaller chunks with ``CharacterTextSplitter``.

    Args:
        documents: Documents to split; passed unchanged to
            ``split_documents``.
        chunk_size: Forwarded to ``CharacterTextSplitter``. Defaults to
            1000, the value that used to be hard-coded.
        chunk_overlap: Forwarded to ``CharacterTextSplitter``. Defaults to
            20, the value that used to be hard-coded.

    Returns:
        The result of ``CharacterTextSplitter.split_documents``.
    """
    text_splitter = CharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return text_splitter.split_documents(documents)
def persist(documents):
    """Build a Chroma store from ``documents`` and persist it under ``DB_DIR``.

    The store is created with ``Chroma.from_documents`` using an
    ``OpenAIEmbeddings`` instance, then ``persist()`` is called on it.

    Args:
        documents: Documents handed to ``Chroma.from_documents``.
    """
    embedder = OpenAIEmbeddings()
    store = Chroma.from_documents(
        documents,
        embedding=embedder,
        persist_directory=DB_DIR,
    )
    store.persist()
def main():
    """Parse the PDFs under ``DOCUMENT_PATH``, split them, and persist them.

    When parsing and splitting yield nothing, the embedding / vector-store
    step is skipped and a short notice is printed instead.
    """
    parsed = parse_documents(DOCUMENT_PATH)
    chunks = split(parsed)
    # NOTE: the count is taken after splitting, so it is the number of chunks.
    print(f"Total pages: {len(chunks)}")
    if not chunks:
        # Nothing to embed: do not build a vector store from an empty list.
        print(f"No documents found in {DOCUMENT_PATH}; nothing to persist.")
        return
    persist(chunks)
# Run the ingestion pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()