|
from uuid import uuid4 |
|
|
|
from langchain.text_splitter import MarkdownTextSplitter |
|
from rag_demo.preprocessing.base import Chunk |
|
from rag_demo.preprocessing.base import Document |
|
|
|
|
|
def chunk_text(
    document: Document, chunk_size: int = 500, chunk_overlap: int = 50
) -> list[Chunk]:
    """Split a document's text into chunks using a Markdown text splitter.

    Args:
        document: Source document. Its ``text`` is split, and its
            ``document_id`` and ``metadata`` are passed to every chunk.
        chunk_size: Forwarded to ``MarkdownTextSplitter`` as ``chunk_size``.
        chunk_overlap: Forwarded to ``MarkdownTextSplitter`` as
            ``chunk_overlap``.

    Returns:
        One ``Chunk`` per piece returned by the splitter, in the order the
        splitter produced them, each with a freshly generated ``uuid4()``
        value as its ``chunk_id``.
    """
    splitter = MarkdownTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    pieces = splitter.split_text(document.text)

    # NOTE: ``document.metadata`` is passed through as-is, so every chunk
    # refers to the same metadata object rather than a per-chunk copy.
    return [
        Chunk(
            content=piece,
            document_id=document.document_id,
            chunk_id=uuid4(),
            metadata=document.metadata,
        )
        for piece in pieces
    ]
|
|