File size: 808 Bytes
a73d4bf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
from typing import Generator, List


def chunk_document(
        doc: str,
        desired_chunk_size: int,
        max_chunk_size: int
        ) -> Generator[str, None, None]:
    """Yield line-aligned chunks of *doc*, none longer than *max_chunk_size*.

    The document is split with ``str.splitlines()`` and every line is
    re-terminated with ``'\\n'``; as a consequence the original line-ending
    style is not preserved and the final line always ends with ``'\\n'``.
    Lines are accumulated until the running length reaches
    *desired_chunk_size*, at which point the accumulated text is emitted.

    If the accumulated text is longer than *max_chunk_size* (for example a
    single very long line), it is emitted as several consecutive pieces of at
    most *max_chunk_size* characters instead of being truncated, so that
    concatenating all yielded chunks reproduces every line of the document.

    Args:
        doc: Text to split.
        desired_chunk_size: Length (in characters) at which the accumulated
            lines are flushed as a chunk.
        max_chunk_size: Hard upper bound on the length of any yielded chunk.
            Must be at least 1.

    Yields:
        Non-empty chunks of text, each at most *max_chunk_size* characters.

    Raises:
        ValueError: If *max_chunk_size* is smaller than 1. Because this is a
            generator, the error surfaces when iteration starts.
    """
    if max_chunk_size < 1:
        raise ValueError('max_chunk_size must be a positive integer')

    pending: List[str] = []
    # Running length of the pending lines, counting one '\n' per line.
    pending_size = 0

    for line in doc.splitlines():
        pending.append(line)
        pending_size += len(line) + 1
        if pending_size >= desired_chunk_size:
            text = '\n'.join(pending) + '\n'
            # Split rather than truncate: an oversize chunk must not lose
            # its tail.
            for start in range(0, len(text), max_chunk_size):
                yield text[start:start + max_chunk_size]
            pending = []
            pending_size = 0

    # Flush whatever is left; it is shorter than desired_chunk_size but may
    # still exceed max_chunk_size when desired_chunk_size > max_chunk_size.
    if pending:
        text = '\n'.join(pending) + '\n'
        for start in range(0, len(text), max_chunk_size):
            yield text[start:start + max_chunk_size]


def chunk_documents(
        docs: List[str],
        desired_chunk_size: int = 500,
        max_chunk_size: int = 3000
        ) -> List[str]:
    """Chunk every document in *docs* and return all chunks in one flat list.

    Each document is passed to ``chunk_document`` together with the given
    size settings; the resulting chunks are collected in document order.

    Args:
        docs: Documents to split.
        desired_chunk_size: Forwarded to ``chunk_document`` (default 500).
        max_chunk_size: Forwarded to ``chunk_document`` (default 3000).

    Returns:
        A list holding the chunks of the first document, followed by those
        of the second, and so on. Empty when *docs* is empty.
    """
    return [
        piece
        for document in docs
        for piece in chunk_document(
            doc=document,
            desired_chunk_size=desired_chunk_size,
            max_chunk_size=max_chunk_size,
        )
    ]