geekyrakshit committed on
Commit
49d583d
·
1 Parent(s): 56d3953

add: SemanticChunker

Browse files
medrag_multi_modal/semantic_chunker.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Callable, Optional, Union
2
+
3
+ import semchunk
4
+ import tiktoken
5
+ import tokenizers
6
+ import weave
7
+ from rich.progress import track
8
+ from transformers import PreTrainedTokenizer
9
+
10
# Type of the first argument that `SemanticChunker` forwards to
# `semchunk.chunkerify`: either a tokenizer (given as a string such as the
# default "o200k_base", or as a tokenizer object) or a plain callable that
# takes a string and returns an int.
TOKENIZER_OR_TOKEN_COUNTER = Union[
    str,  # presumably a tokenizer/encoding name resolved by semchunk -- confirm
    tiktoken.Encoding,
    PreTrainedTokenizer,
    tokenizers.Tokenizer,
    Callable[[str], int],  # custom counter: text in, int out
]
17
+
18
+
19
class SemanticChunker:
    """Chunk the documents of a Weave dataset and publish the chunks.

    The chunking itself is delegated to the chunker object returned by
    `semchunk.chunkerify`; this class only wires that chunker to Weave
    datasets.

    Args:
        tokenizer_or_token_counter: Forwarded unchanged as the first
            argument of `semchunk.chunkerify`. Defaults to "o200k_base".
        chunk_size: Forwarded to `semchunk.chunkerify` as `chunk_size`.
            NOTE(review): semchunk may reject `None` for tokenizers that do
            not expose a maximum length -- confirm against its docs.
        max_token_chars: Forwarded to `semchunk.chunkerify` as
            `max_token_chars`.
        memoize: Forwarded to `semchunk.chunkerify` as `memoize`.
    """

    def __init__(
        self,
        tokenizer_or_token_counter: TOKENIZER_OR_TOKEN_COUNTER = "o200k_base",
        chunk_size: Optional[int] = None,
        max_token_chars: Optional[int] = None,
        memoize: bool = True,
    ) -> None:
        self.chunker = semchunk.chunkerify(
            tokenizer_or_token_counter,
            chunk_size=chunk_size,
            max_token_chars=max_token_chars,
            memoize=memoize,
        )

    def chunk_and_publish(
        self, document_dataset_name: str, chunk_dataset_name: Optional[str] = None
    ) -> None:
        """Chunk every document of a Weave dataset and publish the result.

        Each row of the source dataset must provide the keys `"text"`,
        `"document_name"` and `"page_idx"`. Every chunk becomes one row of
        the published dataset with the keys `"document_idx"` (position of
        the source row), `"document_name"`, `"page_idx"` and `"text"`.

        Args:
            document_dataset_name: Reference passed to `weave.ref` to fetch
                the source dataset.
            chunk_dataset_name: Passed as `name` to the `weave.Dataset`
                that is published.
        """
        document_dataset = weave.ref(document_dataset_name).get().rows
        chunks = []
        # Wrap the dataset itself rather than the `enumerate` object:
        # `enumerate` exposes no length, so `track` could not infer a total
        # and the progress bar would show neither percentage nor ETA.
        for idx, document in enumerate(
            track(document_dataset, description="Chunking documents")
        ):
            # `str(...)` guards against non-string "text" values.
            document_chunks = self.chunker.chunk(str(document["text"]))
            chunks.extend(
                {
                    "document_idx": idx,
                    "document_name": document["document_name"],
                    "page_idx": document["page_idx"],
                    "text": chunk,
                }
                for chunk in document_chunks
            )
        weave.publish(weave.Dataset(name=chunk_dataset_name, rows=chunks))
pyproject.toml CHANGED
@@ -29,6 +29,8 @@ dependencies = [
29
  "mkdocs-jupyter>=0.25.0",
30
  "jupyter>=1.1.1",
31
  "pdfplumber>=0.11.4",
 
 
32
  ]
33
 
34
  [project.optional-dependencies]
@@ -41,6 +43,8 @@ core = [
41
  "PyPDF2>=3.0.1",
42
  "python-dotenv>=1.0.1",
43
  "pymupdf4llm>=0.0.17",
 
 
44
  "torch>=2.4.1",
45
  "weave>=0.51.14",
46
  ]
 
29
  "mkdocs-jupyter>=0.25.0",
30
  "jupyter>=1.1.1",
31
  "pdfplumber>=0.11.4",
32
+ "semchunk>=2.2.0",
33
+ "tiktoken>=0.8.0",
34
  ]
35
 
36
  [project.optional-dependencies]
 
43
  "PyPDF2>=3.0.1",
44
  "python-dotenv>=1.0.1",
45
  "pymupdf4llm>=0.0.17",
46
+ "semchunk>=2.2.0",
47
+ "tiktoken>=0.8.0",
48
  "torch>=2.4.1",
49
  "weave>=0.51.14",
50
  ]