Spaces:
Running
Running
Commit
·
49d583d
1
Parent(s):
56d3953
add: SemanticChunker
Browse files- medrag_multi_modal/semantic_chunker.py +52 -0
- pyproject.toml +4 -0
medrag_multi_modal/semantic_chunker.py
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Callable, Optional, Union
|
2 |
+
|
3 |
+
import semchunk
|
4 |
+
import tiktoken
|
5 |
+
import tokenizers
|
6 |
+
import weave
|
7 |
+
from rich.progress import track
|
8 |
+
from transformers import PreTrainedTokenizer
|
9 |
+
|
10 |
+
# Types accepted for the `tokenizer_or_token_counter` argument of
# `SemanticChunker`, which passes the value straight to `semchunk.chunkerify`:
# a string (presumably a tokenizer/encoding name such as the default
# "o200k_base" -- confirm against semchunk), a tiktoken `Encoding`, a
# transformers `PreTrainedTokenizer`, a `tokenizers.Tokenizer`, or a callable
# that takes a string and returns an int.
TOKENIZER_OR_TOKEN_COUNTER = Union[
    str,
    tiktoken.Encoding,
    PreTrainedTokenizer,
    tokenizers.Tokenizer,
    Callable[[str], int],
]
|
17 |
+
|
18 |
+
|
19 |
+
class SemanticChunker:
    """Chunk the text of documents stored in a Weave dataset using `semchunk`.

    The constructor builds a chunker with `semchunk.chunkerify` and stores it
    on `self.chunker`; `chunk_and_publish` applies it to every row of a
    document dataset and publishes the resulting chunks as a new dataset.

    Args:
        tokenizer_or_token_counter: Tokenizer or token counter forwarded
            unchanged to `semchunk.chunkerify`. Defaults to `"o200k_base"`.
        chunk_size: Forwarded to `semchunk.chunkerify` as `chunk_size`.
            NOTE(review): with the default of `None` semchunk presumably has
            to derive the size from the tokenizer -- confirm this works for
            the default `"o200k_base"` tokenizer, otherwise pass it explicitly.
        max_token_chars: Forwarded to `semchunk.chunkerify` as
            `max_token_chars`.
        memoize: Forwarded to `semchunk.chunkerify` as `memoize`.
    """

    def __init__(
        self,
        tokenizer_or_token_counter: TOKENIZER_OR_TOKEN_COUNTER = "o200k_base",
        chunk_size: Optional[int] = None,
        max_token_chars: Optional[int] = None,
        memoize: bool = True,
    ) -> None:
        self.chunker = semchunk.chunkerify(
            tokenizer_or_token_counter,
            chunk_size=chunk_size,
            max_token_chars=max_token_chars,
            memoize=memoize,
        )

    def chunk_and_publish(
        self, document_dataset_name: str, chunk_dataset_name: Optional[str] = None
    ) -> None:
        """Chunk every document of a Weave dataset and publish the chunks.

        Each row of the source dataset is read through its `"text"`,
        `"document_name"` and `"page_idx"` keys. The text is converted with
        `str()` and split by `self.chunker`; every chunk becomes one row of
        the published dataset with the keys `"document_idx"` (position of the
        source row), `"document_name"`, `"page_idx"` and `"text"`.

        Args:
            document_dataset_name: Reference passed to `weave.ref` to fetch
                the source dataset.
            chunk_dataset_name: Name given to the published `weave.Dataset`.
                `None` is passed through to `weave.Dataset` as-is.
        """
        document_dataset = weave.ref(document_dataset_name).get().rows
        chunks = []
        # Wrap the dataset itself in `track` rather than the `enumerate`
        # iterator: an `enumerate` object has no length, so the progress bar
        # could never show a total. Wrapping the rows lets rich use their
        # length when they expose one; indices and rows are unchanged.
        for idx, document in enumerate(
            track(document_dataset, description="Chunking documents")
        ):
            document_chunks = self.chunker.chunk(str(document["text"]))
            chunks.extend(
                {
                    "document_idx": idx,
                    "document_name": document["document_name"],
                    "page_idx": document["page_idx"],
                    "text": chunk,
                }
                for chunk in document_chunks
            )
        weave.publish(weave.Dataset(name=chunk_dataset_name, rows=chunks))
|
pyproject.toml
CHANGED
@@ -29,6 +29,8 @@ dependencies = [
|
|
29 |
"mkdocs-jupyter>=0.25.0",
|
30 |
"jupyter>=1.1.1",
|
31 |
"pdfplumber>=0.11.4",
|
|
|
|
|
32 |
]
|
33 |
|
34 |
[project.optional-dependencies]
|
@@ -41,6 +43,8 @@ core = [
|
|
41 |
"PyPDF2>=3.0.1",
|
42 |
"python-dotenv>=1.0.1",
|
43 |
"pymupdf4llm>=0.0.17",
|
|
|
|
|
44 |
"torch>=2.4.1",
|
45 |
"weave>=0.51.14",
|
46 |
]
|
|
|
29 |
"mkdocs-jupyter>=0.25.0",
|
30 |
"jupyter>=1.1.1",
|
31 |
"pdfplumber>=0.11.4",
|
32 |
+
"semchunk>=2.2.0",
|
33 |
+
"tiktoken>=0.8.0",
|
34 |
]
|
35 |
|
36 |
[project.optional-dependencies]
|
|
|
43 |
"PyPDF2>=3.0.1",
|
44 |
"python-dotenv>=1.0.1",
|
45 |
"pymupdf4llm>=0.0.17",
|
46 |
+
"semchunk>=2.2.0",
|
47 |
+
"tiktoken>=0.8.0",
|
48 |
"torch>=2.4.1",
|
49 |
"weave>=0.51.14",
|
50 |
]
|