Spaces:

geekyrakshit
/

medrag

Runtime error

App Files Files Community

geekyrakshit committed on Oct 18, 2024

Commit

49d583d

1 Parent(s): 56d3953

add: SemanticChunker

Browse files

Files changed (2) hide show

medrag_multi_modal/semantic_chunker.py +52 -0
pyproject.toml +4 -0

medrag_multi_modal/semantic_chunker.py ADDED Viewed

	@@ -0,0 +1,52 @@

+from typing import Callable, Optional, Union
+import semchunk
+import tiktoken
+import tokenizers
+import weave
+from rich.progress import track
+from transformers import PreTrainedTokenizer
+# Type alias for the `tokenizer_or_token_counter` argument of `SemanticChunker`,
+# which is forwarded unchanged to `semchunk.chunkerify`: a string (the default
+# used by `SemanticChunker` is "o200k_base"), a tiktoken `Encoding`, a
+# transformers or tokenizers tokenizer object, or a callable taking a `str`
+# and returning an `int` (presumably the token count of that text).
+TOKENIZER_OR_TOKEN_COUNTER = Union[
+    str,
+    tiktoken.Encoding,
+    PreTrainedTokenizer,
+    tokenizers.Tokenizer,
+    Callable[[str], int],
+]
+class SemanticChunker:
+    def __init__(
+        self,
+        tokenizer_or_token_counter: TOKENIZER_OR_TOKEN_COUNTER = "o200k_base",
+        chunk_size: Optional[int] = None,
+        max_token_chars: Optional[int] = None,
+        memoize: bool = True,
+    ) -> None:
+        self.chunker = semchunk.chunkerify(
+            tokenizer_or_token_counter,
+            chunk_size=chunk_size,
+            max_token_chars=max_token_chars,
+            memoize=memoize,
+        )
+    def chunk_and_publish(
+        self, document_dataset_name: str, chunk_dataset_name: Optional[str] = None
+    ) -> None:
+        document_dataset = weave.ref(document_dataset_name).get().rows
+        chunks = []
+        for idx, document in track(
+            enumerate(document_dataset), description="Chunking documents"
+        ):
+            document_chunks = self.chunker.chunk(str(document["text"]))
+            for chunk in document_chunks:
+                chunks.append(
+                    {
+                        "document_idx": idx,
+                        "document_name": document["document_name"],
+                        "page_idx": document["page_idx"],
+                        "text": chunk,
+                    }
+                )
+        weave.publish(weave.Dataset(name=chunk_dataset_name, rows=chunks))

pyproject.toml CHANGED Viewed

@@ -29,6 +29,8 @@ dependencies = [
     "mkdocs-jupyter>=0.25.0",
     "jupyter>=1.1.1",
     "pdfplumber>=0.11.4",
 ]
 [project.optional-dependencies]
@@ -41,6 +43,8 @@ core = [
     "PyPDF2>=3.0.1",
     "python-dotenv>=1.0.1",
     "pymupdf4llm>=0.0.17",
     "torch>=2.4.1",
     "weave>=0.51.14",
 ]

     "mkdocs-jupyter>=0.25.0",
     "jupyter>=1.1.1",
     "pdfplumber>=0.11.4",
+    "semchunk>=2.2.0",
+    "tiktoken>=0.8.0",
 ]
 [project.optional-dependencies]
     "PyPDF2>=3.0.1",
     "python-dotenv>=1.0.1",
     "pymupdf4llm>=0.0.17",
+    "semchunk>=2.2.0",
+    "tiktoken>=0.8.0",
     "torch>=2.4.1",
     "weave>=0.51.14",
 ]