RAG / chunker.py
olenkap's picture
Upload 15 files
a73d4bf verified
raw
history blame contribute delete
808 Bytes
from typing import Generator, List
def _split_oversized(text: str, limit: int) -> Generator[str, None, None]:
    """Yield consecutive slices of *text*, each at most *limit* characters."""
    for start in range(0, len(text), limit):
        yield text[start:start + limit]


def chunk_document(
    doc: str,
    desired_chunk_size: int,
    max_chunk_size: int
) -> Generator[str, None, None]:
    """Split *doc* into line-aligned chunks without losing any text.

    Lines are accumulated (each terminated with ``'\\n'``) until the buffer
    holds at least ``desired_chunk_size`` characters, at which point the
    buffer is emitted. Whatever is left after the last line is emitted as a
    final chunk.

    A buffer longer than ``max_chunk_size`` (for example, a single very long
    line) is emitted as several consecutive pieces of at most
    ``max_chunk_size`` characters each, rather than being truncated, so
    concatenating all yielded chunks reproduces every line of the document.

    Args:
        doc: Text to split. Line boundaries are found with
            ``str.splitlines``; every line is re-terminated with ``'\\n'``.
        desired_chunk_size: Buffer length (in characters) at which a chunk
            is emitted.
        max_chunk_size: Hard upper bound on the length of any yielded chunk.

    Yields:
        Non-empty chunks of at most ``max_chunk_size`` characters.

    Raises:
        ValueError: If ``max_chunk_size`` is not positive (raised when
            iteration starts, since this is a generator).
    """
    if max_chunk_size < 1:
        raise ValueError('max_chunk_size must be a positive integer')

    # Collect lines in a list and join once per chunk instead of growing a
    # string with ``+=`` on every line.
    pending: List[str] = []
    pending_len = 0
    for line in doc.splitlines():
        pending.append(line)
        pending_len += len(line) + 1  # +1 accounts for the trailing '\n'
        if pending_len >= desired_chunk_size:
            yield from _split_oversized(
                '\n'.join(pending) + '\n', max_chunk_size
            )
            pending = []
            pending_len = 0
    if pending:
        # The remainder can still exceed the cap when
        # max_chunk_size < desired_chunk_size, so it is capped as well.
        yield from _split_oversized('\n'.join(pending) + '\n', max_chunk_size)
def chunk_documents(
    docs: List[str],
    desired_chunk_size: int = 500,
    max_chunk_size: int = 3000
) -> List[str]:
    """Chunk every document in *docs* and return all chunks as one flat list.

    Each document is split independently by ``chunk_document``; the resulting
    chunks are concatenated in the order the documents appear in *docs*.

    Args:
        docs: Documents to split.
        desired_chunk_size: Forwarded to ``chunk_document`` (default 500).
        max_chunk_size: Forwarded to ``chunk_document`` (default 3000).

    Returns:
        A single list holding the chunks of all documents, in order.
    """
    return [
        piece
        for text in docs
        for piece in chunk_document(
            doc=text,
            desired_chunk_size=desired_chunk_size,
            max_chunk_size=max_chunk_size
        )
    ]