"""Load web pages and split them into markdown-aware chunks for indexing."""

import logging
import uuid
from typing import List

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Separators ordered from strongest markdown structure (headings, fenced code,
# horizontal rules) down to paragraphs, lines, words, and finally single
# characters, so the splitter prefers semantic boundaries when cutting.
MARKDOWN_SEPARATORS = [
    "\n#{1,6} ",
    "```\n",
    "\n\\*\\*\\*+\n",
    "\n---+\n",
    "\n___+\n",
    "\n\n",
    "\n",
    " ",
    "",
]


def load_and_split_docs(urls: List[str]):
    """Fetch the given URLs and split each page into overlapping chunks.

    Args:
        urls: Web page URLs to download via ``WebBaseLoader``.

    Returns:
        A list of split langchain ``Document`` objects. Each chunk's
        metadata carries a random short ``id`` and ``chunk-id`` plus the
        chunk's ``start_index`` within its source document.
    """
    logger.info("Extracting web loader...")
    loader = WebBaseLoader(urls)
    docs = loader.load()

    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=512,  # max chunk size in *tokens* (tiktoken encoder); value chosen arbitrarily
        chunk_overlap=50,  # tokens shared between consecutive chunks to preserve context
        add_start_index=True,  # record each chunk's start offset in its metadata
        strip_whitespace=True,  # trim leading/trailing whitespace from every chunk
        separators=MARKDOWN_SEPARATORS,
    )

    logger.info("Splitting documents...")
    docs_split = text_splitter.split_documents(docs)

    # NOTE(review): 4-hex-char UUID slices give only ~65k distinct values and
    # will collide on large corpora — consider full uuid4 strings if these ids
    # must be globally unique. Kept as-is since downstream consumers may
    # depend on the current format.
    for doc in docs_split:
        doc.metadata["id"] = str(uuid.uuid4())[:4]
        doc.metadata["chunk-id"] = str(uuid.uuid4())[-4:]

    return docs_split