import uuid
import logging
from typing import List
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Module-level logger configuration.
# NOTE(review): logging.basicConfig at import time configures the root logger
# for the whole application; for a reusable module, prefer configuring logging
# in the program entry point and keeping only getLogger(__name__) here.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def load_and_split_docs(urls: List[str]):
    """Fetch web pages and split them into overlapping, markdown-aware chunks.

    Args:
        urls: URLs to download with ``WebBaseLoader``.

    Returns:
        A list of langchain ``Document`` chunks. Each chunk's metadata gains
        a short random ``id`` and ``chunk-id`` field (plus ``start_index``
        from the splitter).
    """
    # Separators ordered coarse-to-fine so the splitter prefers breaking at
    # markdown headers / fences / rules before falling back to whitespace.
    MARKDOWN_SEPARATORS = [
        "\n#{1,6} ",
        "```\n",
        "\n\\*\\*\\*+\n",
        "\n---+\n",
        "\n___+\n",
        "\n\n",
        "\n",
        " ",
        "",
    ]
    logger.info("Loading documents with WebBaseLoader...")
    loader = WebBaseLoader(urls)
    docs = loader.load()

    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=512,  # max TOKENS per chunk — the tiktoken encoder counts tokens, not characters
        chunk_overlap=50,  # tokens shared between consecutive chunks
        add_start_index=True,  # record each chunk's start index in metadata
        strip_whitespace=True,  # strip leading/trailing whitespace from every chunk
        separators=MARKDOWN_SEPARATORS,
    )

    logger.info("Splitting documents...")
    docs_split = text_splitter.split_documents(docs)
    for doc in docs_split:
        # NOTE(review): 4 hex characters give only ~65k distinct values, so
        # collisions are likely on large corpora — consider using the full
        # uuid4 string if these ids must be unique. Kept as-is for backward
        # compatibility with consumers of this metadata format.
        doc.metadata['id'] = str(uuid.uuid4())[:4]
        doc.metadata['chunk-id'] = str(uuid.uuid4())[-4:]
    return docs_split