File size: 3,116 Bytes
49d583d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ace03e3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49d583d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
from typing import Callable, Optional, Union

import semchunk
import tiktoken
import tokenizers
import weave
from rich.progress import track
from transformers import PreTrainedTokenizer

# Anything `semchunk.chunkerify` accepts as its first argument: an encoding
# name (e.g. "o200k_base"), a tiktoken/HF/tokenizers tokenizer object, or a
# bare callable that maps a string to its token count.
TOKENIZER_OR_TOKEN_COUNTER = Union[
    str,
    tiktoken.Encoding,
    PreTrainedTokenizer,
    tokenizers.Tokenizer,
    Callable[[str], int],
]


class SemanticChunker:
    """
    SemanticChunker is a class that chunks documents into smaller segments and
    publishes them as datasets.

    This class uses the `semchunk` library to break down large documents into
    smaller, manageable chunks based on a specified tokenizer or token counter.
    This is particularly useful for processing large text datasets where
    smaller segments are needed for analysis or other operations.

    !!! example "Example Usage"
        ```python
        import weave
        from dotenv import load_dotenv

        from medrag_multi_modal.semantic_chunking import SemanticChunker

        load_dotenv()
        weave.init(project_name="ml-colabs/medrag-multi-modal")
        chunker = SemanticChunker(chunk_size=256)
        chunker.chunk_and_publish(
            document_dataset_name="grays-anatomy-text:v13",
            chunk_dataset_name="grays-anatomy-chunks",
        )
        ```

    Args:
        tokenizer_or_token_counter (TOKENIZER_OR_TOKEN_COUNTER): The tokenizer or
            token counter to be used for chunking.
        chunk_size (Optional[int]): The size of each chunk. If not specified, the
            default chunk size from `semchunk` will be used.
        max_token_chars (Optional[int]): The maximum number of characters per token.
            If not specified, the default value from `semchunk` will be used.
        memoize (bool): Whether to memoize the chunking process for efficiency.
            Default is True.
    """

    def __init__(
        self,
        tokenizer_or_token_counter: TOKENIZER_OR_TOKEN_COUNTER = "o200k_base",
        chunk_size: Optional[int] = None,
        max_token_chars: Optional[int] = None,
        memoize: bool = True,
    ) -> None:
        # `chunkerify` resolves the tokenizer/counter and returns a callable
        # chunker object; all chunking configuration lives in it.
        self.chunker = semchunk.chunkerify(
            tokenizer_or_token_counter,
            chunk_size=chunk_size,
            max_token_chars=max_token_chars,
            memoize=memoize,
        )

    def chunk_and_publish(
        self, document_dataset_name: str, chunk_dataset_name: Optional[str] = None
    ) -> None:
        """Chunk every document in a Weave dataset and publish the chunks.

        Fetches the dataset rows referenced by ``document_dataset_name``,
        splits each row's ``text`` field into semantic chunks, and publishes
        one flat dataset of chunk records under ``chunk_dataset_name``.

        Args:
            document_dataset_name (str): Weave reference of the source dataset.
                Rows are assumed to carry ``text``, ``document_name`` and
                ``page_idx`` fields (per the access pattern below — confirm
                against the loader that produced the dataset).
            chunk_dataset_name (Optional[str]): Name under which the chunk
                dataset is published.
        """
        document_dataset = weave.ref(document_dataset_name).get().rows
        chunks = []
        # `track` can only render a determinate progress bar when given a
        # sized iterable; wrapping `enumerate(...)` in `track` hides the
        # length, so track the dataset itself and enumerate the result.
        for idx, document in enumerate(
            track(document_dataset, description="Chunking documents")
        ):
            # str() guards against non-string `text` payloads (e.g. wrapped
            # Weave values) before handing off to the chunker.
            document_chunks = self.chunker.chunk(str(document["text"]))
            chunks.extend(
                {
                    "document_idx": idx,
                    "document_name": document["document_name"],
                    "page_idx": document["page_idx"],
                    "text": chunk,
                }
                for chunk in document_chunks
            )
        weave.publish(weave.Dataset(name=chunk_dataset_name, rows=chunks))