import uuid
import logging
from typing import List
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Module-level logger configuration.
# NOTE(review): logging.basicConfig at import time configures the root logger
# for the whole application; for a reusable module, prefer configuring logging
# in the program entry point and keeping only getLogger(__name__) here.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def load_and_split_docs(urls: List[str]):
    """Fetch web pages and split them into overlapping, markdown-aware chunks.

    Args:
        urls: URLs to download with ``WebBaseLoader``.

    Returns:
        A list of langchain ``Document`` chunks. Each chunk's metadata gains
        a short random ``id`` and ``chunk-id`` field (plus ``start_index``
        from the splitter).
    """
    # Separators ordered coarse-to-fine so the splitter prefers breaking at
    # markdown headers / fences / rules before falling back to whitespace.
    MARKDOWN_SEPARATORS = [
        "\n#{1,6} ",
        "```\n",
        "\n\\*\\*\\*+\n",
        "\n---+\n",
        "\n___+\n",
        "\n\n",
        "\n",
        " ",
        "",
    ]
    logger.info("Loading documents with WebBaseLoader...")
    loader = WebBaseLoader(urls)
    docs = loader.load()

    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=512,  # max TOKENS per chunk — the tiktoken encoder counts tokens, not characters
        chunk_overlap=50,  # tokens shared between consecutive chunks
        add_start_index=True,  # record each chunk's start index in metadata
        strip_whitespace=True,  # strip leading/trailing whitespace from every chunk
        separators=MARKDOWN_SEPARATORS,
    )

    logger.info("Splitting documents...")
    docs_split = text_splitter.split_documents(docs)
    for doc in docs_split:
        # NOTE(review): 4 hex characters give only ~65k distinct values, so
        # collisions are likely on large corpora — consider using the full
        # uuid4 string if these ids must be unique. Kept as-is for backward
        # compatibility with consumers of this metadata format.
        doc.metadata['id'] = str(uuid.uuid4())[:4]
        doc.metadata['chunk-id'] = str(uuid.uuid4())[-4:]
    return docs_split