|
|
|
from langchain_community.document_loaders import DirectoryLoader |
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
from langchain.schema import Document |
|
|
|
from langchain_huggingface import HuggingFaceEmbeddings |
|
from langchain_community.vectorstores import Chroma |
|
from dotenv import load_dotenv |
|
import os |
|
import shutil |
|
import logging |
|
|
|
# Module-level logger; handler/level configuration is left to the application.
logger = logging.getLogger(__name__)




# Load environment variables from a local .env file (no-op if the file is absent).
load_dotenv()









# Directory where the Chroma vector store is persisted on disk.
CHROMA_PATH = "chroma"

# Directory scanned for source PDF documents.
DATA_PATH = "data/"
|
|
|
|
|
def main() -> None:
    """Entry point: rebuild the vector store from the PDF corpus."""
    generate_data_store()
|
|
|
|
|
def generate_data_store() -> None:
    """Run the full ingestion pipeline: load, chunk, and persist to Chroma."""
    logger.info("Loading documents..")
    docs = load_documents()
    save_to_chroma(split_text(docs))
|
|
|
|
|
def load_documents():
    """Load every PDF directly under DATA_PATH as LangChain Documents.

    Note: the glob is non-recursive — PDFs in subdirectories are skipped.

    Returns:
        list[Document]: one Document per successfully loaded file.
    """
    loader = DirectoryLoader(DATA_PATH, glob="*.pdf")
    documents = loader.load()
    # Lazy %-style args: the message is only formatted if INFO is enabled.
    logger.info("Found %d documents..", len(documents))

    return documents
|
|
|
|
|
def split_text(documents: list[Document]) -> list[Document]:
    """Split documents into overlapping character chunks for embedding.

    Args:
        documents: the loaded source documents.

    Returns:
        list[Document]: chunk documents; each carries the start index of the
        chunk within its source document in its metadata.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1800,
        chunk_overlap=100,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)
    logger.info("Split %d documents into %d chunks.", len(documents), len(chunks))

    # Log one sample chunk as a sanity check. The original indexed chunks[10]
    # unconditionally, which raises IndexError whenever the corpus yields
    # fewer than 11 chunks — clamp the index and skip entirely when empty.
    if chunks:
        sample = chunks[min(10, len(chunks) - 1)]
        logger.debug("%s", sample.page_content)
        logger.debug("%s", sample.metadata)

    return chunks
|
|
|
|
|
def save_to_chroma(chunks: list[Document]) -> None:
    """Embed the chunks and write them to a fresh Chroma store at CHROMA_PATH.

    Any existing store is deleted first so the index always reflects the
    current corpus rather than accumulating stale vectors.

    Args:
        chunks: document chunks to embed and persist.
    """
    # Start from a clean slate; rmtree removes the previous on-disk index.
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)

    db = Chroma.from_documents(
        chunks, HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2"), persist_directory=CHROMA_PATH
    )
    # NOTE(review): chromadb >= 0.4 persists automatically when
    # persist_directory is set, and persist() is deprecated/removed there —
    # confirm the installed chromadb version before relying on this call.
    db.persist()
    # Use the module logger (not print) for consistency with the rest of the file.
    logger.info("Saved %d chunks to %s.", len(chunks), CHROMA_PATH)
|
|
|
|
|
if __name__ == "__main__": |
|
main() |