from typing import Callable, Optional, Union

import semchunk
import tiktoken
import tokenizers
import weave
from rich.progress import track
from transformers import PreTrainedTokenizer
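
# A tokenizer may be given as an encoding name, a `tiktoken` encoding, a Hugging
# Face tokenizer, a `tokenizers.Tokenizer`, or any callable mapping text to a
# token count.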
TOKENIZER_OR_TOKEN_COUNTER = Union[
    str,
    tiktoken.Encoding,
    PreTrainedTokenizer,
    tokenizers.Tokenizer,
    Callable[[str], int],
]


class SemanticChunker:
    """
    SemanticChunker chunks documents into smaller segments and publishes the
    chunks as a Weave dataset.

    This class uses the `semchunk` library to break large documents into
    smaller, manageable chunks based on a specified tokenizer or token counter.
    This is particularly useful for processing large text datasets where
    smaller segments are needed for analysis or other operations.

    !!! example "Example Usage"
        ```python
        import weave
        from dotenv import load_dotenv
        from medrag_multi_modal.semantic_chunking import SemanticChunker

        load_dotenv()
        weave.init(project_name="ml-colabs/medrag-multi-modal")
        chunker = SemanticChunker(chunk_size=256)
        chunker.chunk_and_publish(
            document_dataset_name="grays-anatomy-text:v13",
            chunk_dataset_name="grays-anatomy-chunks",
        )
        ```
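
    Besides an encoding name, `tokenizer_or_token_counter` also accepts a
    `tiktoken` encoding, a Hugging Face tokenizer, or any `Callable[[str], int]`.
    As a minimal sketch (assuming `semchunk.chunkerify` handles these inputs
    directly), the chunker could also be configured as follows:

    !!! example "Alternative tokenizer configurations"
        ```python
        import tiktoken

        from medrag_multi_modal.semantic_chunking import SemanticChunker

        # Pass an explicit tiktoken encoding instead of an encoding name
        chunker = SemanticChunker(
            tokenizer_or_token_counter=tiktoken.get_encoding("o200k_base"),
            chunk_size=256,
        )

        # Or pass any callable that maps a string to a token count,
        # e.g. a naive whitespace word counter
        chunker = SemanticChunker(
            tokenizer_or_token_counter=lambda text: len(text.split()),
            chunk_size=256,
        )
        ```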

    Args:
        tokenizer_or_token_counter (TOKENIZER_OR_TOKEN_COUNTER): The tokenizer or
            token counter to be used for chunking. Defaults to the `o200k_base`
            tiktoken encoding.
        chunk_size (Optional[int]): The maximum number of tokens per chunk. If not
            specified, the default chunk size from `semchunk` is used.
        max_token_chars (Optional[int]): The maximum number of characters per token.
            If not specified, the default value from `semchunk` is used.
        memoize (bool): Whether to memoize the chunking process for efficiency.
            Defaults to True.
    """

    def __init__(
        self,
        tokenizer_or_token_counter: TOKENIZER_OR_TOKEN_COUNTER = "o200k_base",
        chunk_size: Optional[int] = None,
        max_token_chars: Optional[int] = None,
        memoize: bool = True,
    ) -> None:
        self.chunker = semchunk.chunkerify(
            tokenizer_or_token_counter,
            chunk_size=chunk_size,
            max_token_chars=max_token_chars,
            memoize=memoize,
        )

    def chunk_and_publish(
        self, document_dataset_name: str, chunk_dataset_name: Optional[str] = None
    ) -> None:
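        """
        Chunks a published document dataset and publishes the resulting chunks as a
        new Weave dataset.

        Each row of the source dataset is expected to provide `text`,
        `document_name`, and `page_idx` fields; every chunk is stored along with
        the index of the document it came from.

        Args:
            document_dataset_name (str): Reference to the Weave dataset containing
                the documents to be chunked, e.g. `"grays-anatomy-text:v13"`.
            chunk_dataset_name (Optional[str]): Name under which the chunk dataset
                is published.
        """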
        # Fetch the rows of the published document dataset from Weave
        document_dataset = weave.ref(document_dataset_name).get().rows

        chunks = []
        for idx, document in track(
            enumerate(document_dataset), description="Chunking documents"
        ):
            # Split each document's text into token-bounded chunks
            document_chunks = self.chunker.chunk(str(document["text"]))
            for chunk in document_chunks:
                chunks.append(
                    {
                        "document_idx": idx,
                        "document_name": document["document_name"],
                        "page_idx": document["page_idx"],
                        "text": chunk,
                    }
                )

        # Publish all chunks as a single Weave dataset
        weave.publish(weave.Dataset(name=chunk_dataset_name, rows=chunks))