import glob
import os

from langchain_text_splitters import MarkdownHeaderTextSplitter
from langchain_community.document_loaders import (
    PyMuPDFLoader,
    UnstructuredMarkdownLoader,
)
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
    SentenceTransformersTokenTextSplitter,
)
from transformers import AutoTokenizer

# Directory scanned (non-recursively) for input documents.
path_to_data = "./data/"


def process_markdown() -> list:
    """Split every ``*.md`` file under ``path_to_data`` on its headings.

    Each file is read as UTF-8 text and passed through a
    ``MarkdownHeaderTextSplitter`` configured for heading levels 1-5.
    Files that cannot be read are reported on stdout and skipped.

    Prints the list of matched files, the number of files processed and,
    when there is at least one, the splits of the first file.

    Returns:
        One entry per successfully read file, each entry being whatever
        ``MarkdownHeaderTextSplitter.split_text`` produced for that file.
        Empty when no file matched or none could be read.
    """
    headers_to_split_on = [
        ("#", "Header 1"),
        ("##", "Header 2"),
        ("###", "Header 3"),
        ("####", "Header 4"),
        ("#####", "Header 5"),
    ]
    markdown_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=headers_to_split_on
    )

    # Sorted so that the "first" document printed below is deterministic;
    # glob itself returns files in arbitrary order.
    files = sorted(glob.glob(os.path.join(path_to_data, "*.md")))
    print(files)

    docs = []
    for file in files:
        try:
            # Explicit encoding: the platform default is not UTF-8 everywhere.
            with open(file, encoding="utf-8") as f:
                docs.append(f.read())
        except (OSError, UnicodeError) as e:
            # Best effort: report the unreadable file and carry on.
            print("Exception: ", e)

    docs_processed = [markdown_splitter.split_text(doc) for doc in docs]
    print(len(docs_processed))
    # Guard the sample print: indexing an empty list would raise IndexError.
    if docs_processed:
        print(docs_processed[0])
    return docs_processed


def process_pdf() -> list:
    """Load every ``*.pdf`` file under ``path_to_data`` and chunk it.

    Each PDF is loaded with ``PyMuPDFLoader``; the loaded documents are then
    split by a ``RecursiveCharacterTextSplitter`` whose length function comes
    from the ``BAAI/bge-small-en-v1.5`` tokenizer (chunk size 256, overlap
    10% of the chunk size). PDFs that fail to load are reported on stdout
    and skipped.

    Prints the total number of chunks and, when there is at least one, the
    first chunk.

    Returns:
        A flat list with the chunks of all successfully loaded PDFs. Empty
        when no file matched or none could be loaded.
    """
    files = sorted(glob.glob(os.path.join(path_to_data, "*.pdf")))

    docs = []
    for file in files:
        try:
            docs.append(PyMuPDFLoader(file).load())
        except Exception as e:
            # Loader failures come in many types (corrupt file, missing
            # backend, ...); keep the original best-effort behaviour.
            print("Exception: ", e)

    if not docs:
        # Nothing to split: skip loading the tokenizer altogether.
        print(0)
        return []

    chunk_size = 256
    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        AutoTokenizer.from_pretrained("BAAI/bge-small-en-v1.5"),
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size / 10),
        add_start_index=True,
        strip_whitespace=True,
        separators=["\n\n", "\n", ".", " ", ""],
    )

    # Split each PDF's documents and flatten the per-file lists into one.
    docs_processed = [
        chunk for doc in docs for chunk in text_splitter.split_documents(doc)
    ]
    print(len(docs_processed))
    # Guard the sample print: the splitter may yield no chunks at all.
    if docs_processed:
        print(docs_processed[0])
    return docs_processed