leafspark committed on
Commit
a52e760
·
verified ·
1 Parent(s): 4ad3970

model: add test files and support command line arguments

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ golden_ratio_1_million_digits.txt.tokens filter=lfs diff=lfs merge=lfs -text
claude_tokenizer.py CHANGED
@@ -3,7 +3,7 @@ import json
3
  from typing import List, Dict
4
 
5
  class ClaudeTokenizer:
6
- def __init__(self, config_file: str):
7
  with open(config_file, "r") as f:
8
  config = json.load(f)
9
 
@@ -22,6 +22,10 @@ class ClaudeTokenizer:
22
  self.pat = re.compile(self.pat_str)
23
  self.vocab_trie = self._build_trie(self.vocab)
24
 
 
 
 
 
25
  def _build_trie(self, vocab: List[str]) -> Dict:
26
  trie = {}
27
  for token in vocab:
@@ -37,10 +41,13 @@ class ClaudeTokenizer:
37
  return trie
38
 
39
  def tokenize(self, text: str) -> List[str]:
40
- tokens = []
41
- for part in self.pat.findall(text):
42
- tokens.extend(self._tokenize_part(part))
43
- return tokens
 
 
 
44
 
45
  def encode(self, text: str) -> List[int]:
46
  tokens = self.tokenize(text)
@@ -52,7 +59,7 @@ class ClaudeTokenizer:
52
  def decode(self, ids: List[int]) -> str:
53
  return "".join(self.id_to_token.get(id, "") for id in ids)
54
 
55
- def _tokenize_part(self, text: str) -> List[str]:
56
  tokens = []
57
  while text:
58
  current = self.vocab_trie
@@ -65,7 +72,22 @@ class ClaudeTokenizer:
65
  longest_match = current["*"]
66
  if longest_match:
67
  tokens.append(longest_match)
68
- text = text[len(longest_match) :]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  else:
70
  tokens.append(text[0])
71
  text = text[1:]
@@ -74,14 +96,17 @@ class ClaudeTokenizer:
74
 
75
  # Usage example
76
  if __name__ == "__main__":
77
- tokenizer = ClaudeTokenizer("tokenizer_config.json")
 
 
 
78
 
79
  test_text = """Hello! It's nice to meet you. How can I assist you today? I'm here to help with any questions you might have or tasks you need help with."""
80
  tokens = tokenizer.tokenize(test_text)
81
- print("Tokens:", tokens)
82
 
83
  encoded = tokenizer.encode(test_text)
84
- print("Encoded:", encoded)
85
 
86
  decoded = tokenizer.decode(encoded)
87
- print("Decoded:", decoded)
 
3
  from typing import List, Dict
4
 
5
  class ClaudeTokenizer:
6
+ def __init__(self, config_file: str, algorithm: str = "trie"):
7
  with open(config_file, "r") as f:
8
  config = json.load(f)
9
 
 
22
  self.pat = re.compile(self.pat_str)
23
  self.vocab_trie = self._build_trie(self.vocab)
24
 
25
+ self.algorithm = algorithm
26
+ if algorithm not in ["trie", "linear"]:
27
+ raise ValueError("Invalid algorithm. Choose 'trie' or 'linear'.")
28
+
29
  def _build_trie(self, vocab: List[str]) -> Dict:
30
  trie = {}
31
  for token in vocab:
 
41
  return trie
42
 
43
  def tokenize(self, text: str) -> List[str]:
44
+ if self.algorithm == "trie":
45
+ tokens = []
46
+ for part in self.pat.findall(text):
47
+ tokens.extend(self._tokenize_part_trie(part))
48
+ return tokens
49
+ else:
50
+ return self._tokenize_part_linear(text)
51
 
52
  def encode(self, text: str) -> List[int]:
53
  tokens = self.tokenize(text)
 
59
  def decode(self, ids: List[int]) -> str:
60
  return "".join(self.id_to_token.get(id, "") for id in ids)
61
 
62
+ def _tokenize_part_trie(self, text: str) -> List[str]:
63
  tokens = []
64
  while text:
65
  current = self.vocab_trie
 
72
  longest_match = current["*"]
73
  if longest_match:
74
  tokens.append(longest_match)
75
+ text = text[len(longest_match):]
76
+ else:
77
+ tokens.append(text[0])
78
+ text = text[1:]
79
+ return tokens
80
+
81
+ def _tokenize_part_linear(self, text: str) -> List[str]:
82
+ tokens = []
83
+ while text:
84
+ longest_match = ""
85
+ for token in self.vocab:
86
+ if text.startswith(token) and len(token) > len(longest_match):
87
+ longest_match = token
88
+ if longest_match:
89
+ tokens.append(longest_match)
90
+ text = text[len(longest_match):]
91
  else:
92
  tokens.append(text[0])
93
  text = text[1:]
 
96
 
97
  # Usage example
98
  if __name__ == "__main__":
99
+ # Choose the algorithm: "trie" or "linear"
100
+ algorithm = "linear" # or "trie"
101
+
102
+ tokenizer = ClaudeTokenizer("tokenizer_config.json", algorithm=algorithm)
103
 
104
  test_text = """Hello! It's nice to meet you. How can I assist you today? I'm here to help with any questions you might have or tasks you need help with."""
105
  tokens = tokenizer.tokenize(test_text)
106
+ print(f"Tokens ({algorithm}):", tokens)
107
 
108
  encoded = tokenizer.encode(test_text)
109
+ print(f"Encoded ({algorithm}):", encoded)
110
 
111
  decoded = tokenizer.decode(encoded)
112
+ print(f"Decoded ({algorithm}):", decoded)
golden_ratio_1_million_digits.txt ADDED
The diff for this file is too large to render. See raw diff
 
golden_ratio_1_million_digits.txt.tokens ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:609595bf41569a898a1b036eb67f6c2a2213fc84de7099f3e11730073b3d6c04
3
+ size 16842482
groups_merged-enhancedV3.txt ADDED
The diff for this file is too large to render. See raw diff
 
groups_merged-enhancedV3.txt.tokens ADDED
The diff for this file is too large to render. See raw diff
 
prompt_test.txt ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2000-2009 JetBrains s.r.o.
3
+ *
4
+ * Licensed under the Apache License, Version 2.0 (the "License");
5
+ * you may not use this file except in compliance with the License.
6
+ * You may obtain a copy of the License at
7
+ *
8
+ * http://www.apache.org/licenses/LICENSE-2.0
9
+ *
10
+ * Unless required by applicable law or agreed to in writing, software
11
+ * distributed under the License is distributed on an "AS IS" BASIS,
12
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ * See the License for the specific language governing permissions and
14
+ * limitations under the License.
15
+ */
16
+ Explain this copyright license.
prompt_test.txt.tokens ADDED
@@ -0,0 +1,637 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "token": "/*",
4
+ "id": 32125
5
+ },
6
+ {
7
+ "token": "\n",
8
+ "id": 38
9
+ },
10
+ {
11
+ "token": " *",
12
+ "id": 1020
13
+ },
14
+ {
15
+ "token": " Copyright",
16
+ "id": 3800
17
+ },
18
+ {
19
+ "token": " 2000",
20
+ "id": 1453
21
+ },
22
+ {
23
+ "token": "-",
24
+ "id": 31893
25
+ },
26
+ {
27
+ "token": "2009",
28
+ "id": 32787
29
+ },
30
+ {
31
+ "token": " Jet",
32
+ "id": 6203
33
+ },
34
+ {
35
+ "token": "Brain",
36
+ "id": 35944
37
+ },
38
+ {
39
+ "token": "s",
40
+ "id": 58607
41
+ },
42
+ {
43
+ "token": " s",
44
+ "id": 25844
45
+ },
46
+ {
47
+ "token": ".",
48
+ "id": 31979
49
+ },
50
+ {
51
+ "token": "r",
52
+ "id": 57555
53
+ },
54
+ {
55
+ "token": ".",
56
+ "id": 31979
57
+ },
58
+ {
59
+ "token": "o",
60
+ "id": 55302
61
+ },
62
+ {
63
+ "token": ".",
64
+ "id": 31979
65
+ },
66
+ {
67
+ "token": "\n",
68
+ "id": 38
69
+ },
70
+ {
71
+ "token": " *",
72
+ "id": 1020
73
+ },
74
+ {
75
+ "token": "\n",
76
+ "id": 38
77
+ },
78
+ {
79
+ "token": " *",
80
+ "id": 1020
81
+ },
82
+ {
83
+ "token": " Licensed",
84
+ "id": 6684
85
+ },
86
+ {
87
+ "token": " under",
88
+ "id": 28977
89
+ },
90
+ {
91
+ "token": " the",
92
+ "id": 28194
93
+ },
94
+ {
95
+ "token": " Apache",
96
+ "id": 2347
97
+ },
98
+ {
99
+ "token": " License",
100
+ "id": 6683
101
+ },
102
+ {
103
+ "token": ",",
104
+ "id": 31833
105
+ },
106
+ {
107
+ "token": " Version",
108
+ "id": 10644
109
+ },
110
+ {
111
+ "token": " 2",
112
+ "id": 1450
113
+ },
114
+ {
115
+ "token": ".",
116
+ "id": 31979
117
+ },
118
+ {
119
+ "token": "0",
120
+ "id": 32168
121
+ },
122
+ {
123
+ "token": " ",
124
+ "id": 411
125
+ },
126
+ {
127
+ "token": "(",
128
+ "id": 31193
129
+ },
130
+ {
131
+ "token": "the",
132
+ "id": 60198
133
+ },
134
+ {
135
+ "token": " ",
136
+ "id": 411
137
+ },
138
+ {
139
+ "token": "\"",
140
+ "id": 30684
141
+ },
142
+ {
143
+ "token": "License",
144
+ "id": 39610
145
+ },
146
+ {
147
+ "token": "\");",
148
+ "id": 30727
149
+ },
150
+ {
151
+ "token": "\n",
152
+ "id": 38
153
+ },
154
+ {
155
+ "token": " *",
156
+ "id": 1020
157
+ },
158
+ {
159
+ "token": " you",
160
+ "id": 30159
161
+ },
162
+ {
163
+ "token": " may",
164
+ "id": 21448
165
+ },
166
+ {
167
+ "token": " not",
168
+ "id": 22440
169
+ },
170
+ {
171
+ "token": " use",
172
+ "id": 29209
173
+ },
174
+ {
175
+ "token": " this",
176
+ "id": 28263
177
+ },
178
+ {
179
+ "token": " file",
180
+ "id": 17775
181
+ },
182
+ {
183
+ "token": " except",
184
+ "id": 17281
185
+ },
186
+ {
187
+ "token": " in",
188
+ "id": 19621
189
+ },
190
+ {
191
+ "token": " compliance",
192
+ "id": 14422
193
+ },
194
+ {
195
+ "token": " with",
196
+ "id": 29944
197
+ },
198
+ {
199
+ "token": " the",
200
+ "id": 28194
201
+ },
202
+ {
203
+ "token": " License",
204
+ "id": 6683
205
+ },
206
+ {
207
+ "token": ".",
208
+ "id": 31979
209
+ },
210
+ {
211
+ "token": "\n",
212
+ "id": 38
213
+ },
214
+ {
215
+ "token": " *",
216
+ "id": 1020
217
+ },
218
+ {
219
+ "token": " You",
220
+ "id": 11050
221
+ },
222
+ {
223
+ "token": " may",
224
+ "id": 21448
225
+ },
226
+ {
227
+ "token": " obtain",
228
+ "id": 22612
229
+ },
230
+ {
231
+ "token": " a",
232
+ "id": 11238
233
+ },
234
+ {
235
+ "token": " copy",
236
+ "id": 14878
237
+ },
238
+ {
239
+ "token": " of",
240
+ "id": 22656
241
+ },
242
+ {
243
+ "token": " the",
244
+ "id": 28194
245
+ },
246
+ {
247
+ "token": " License",
248
+ "id": 6683
249
+ },
250
+ {
251
+ "token": " at",
252
+ "id": 12373
253
+ },
254
+ {
255
+ "token": "\n",
256
+ "id": 38
257
+ },
258
+ {
259
+ "token": " *",
260
+ "id": 1020
261
+ },
262
+ {
263
+ "token": "\n",
264
+ "id": 38
265
+ },
266
+ {
267
+ "token": " *",
268
+ "id": 1020
269
+ },
270
+ {
271
+ "token": " http",
272
+ "id": 19315
273
+ },
274
+ {
275
+ "token": "://",
276
+ "id": 33845
277
+ },
278
+ {
279
+ "token": "www",
280
+ "id": 62180
281
+ },
282
+ {
283
+ "token": ".",
284
+ "id": 31979
285
+ },
286
+ {
287
+ "token": "apache",
288
+ "id": 45677
289
+ },
290
+ {
291
+ "token": ".",
292
+ "id": 31979
293
+ },
294
+ {
295
+ "token": "org",
296
+ "id": 56034
297
+ },
298
+ {
299
+ "token": "/",
300
+ "id": 32099
301
+ },
302
+ {
303
+ "token": "licenses",
304
+ "id": 53796
305
+ },
306
+ {
307
+ "token": "/",
308
+ "id": 32099
309
+ },
310
+ {
311
+ "token": "LICENSE",
312
+ "id": 39458
313
+ },
314
+ {
315
+ "token": "-",
316
+ "id": 31893
317
+ },
318
+ {
319
+ "token": "2",
320
+ "id": 32769
321
+ },
322
+ {
323
+ "token": ".",
324
+ "id": 31979
325
+ },
326
+ {
327
+ "token": "0",
328
+ "id": 32168
329
+ },
330
+ {
331
+ "token": "\n",
332
+ "id": 38
333
+ },
334
+ {
335
+ "token": " *",
336
+ "id": 1020
337
+ },
338
+ {
339
+ "token": "\n",
340
+ "id": 38
341
+ },
342
+ {
343
+ "token": " *",
344
+ "id": 1020
345
+ },
346
+ {
347
+ "token": " Unless",
348
+ "id": 10498
349
+ },
350
+ {
351
+ "token": " required",
352
+ "id": 25350
353
+ },
354
+ {
355
+ "token": " by",
356
+ "id": 13397
357
+ },
358
+ {
359
+ "token": " applicable",
360
+ "id": 12089
361
+ },
362
+ {
363
+ "token": " law",
364
+ "id": 20697
365
+ },
366
+ {
367
+ "token": " or",
368
+ "id": 22820
369
+ },
370
+ {
371
+ "token": " agreed",
372
+ "id": 11648
373
+ },
374
+ {
375
+ "token": " to",
376
+ "id": 28411
377
+ },
378
+ {
379
+ "token": " in",
380
+ "id": 19621
381
+ },
382
+ {
383
+ "token": " writing",
384
+ "id": 30057
385
+ },
386
+ {
387
+ "token": ",",
388
+ "id": 31833
389
+ },
390
+ {
391
+ "token": " software",
392
+ "id": 26849
393
+ },
394
+ {
395
+ "token": "\n",
396
+ "id": 38
397
+ },
398
+ {
399
+ "token": " *",
400
+ "id": 1020
401
+ },
402
+ {
403
+ "token": " distributed",
404
+ "id": 16197
405
+ },
406
+ {
407
+ "token": " under",
408
+ "id": 28977
409
+ },
410
+ {
411
+ "token": " the",
412
+ "id": 28194
413
+ },
414
+ {
415
+ "token": " License",
416
+ "id": 6683
417
+ },
418
+ {
419
+ "token": " is",
420
+ "id": 20239
421
+ },
422
+ {
423
+ "token": " distributed",
424
+ "id": 16197
425
+ },
426
+ {
427
+ "token": " on",
428
+ "id": 22716
429
+ },
430
+ {
431
+ "token": " an",
432
+ "id": 11871
433
+ },
434
+ {
435
+ "token": " ",
436
+ "id": 411
437
+ },
438
+ {
439
+ "token": "\"",
440
+ "id": 30684
441
+ },
442
+ {
443
+ "token": "AS",
444
+ "id": 35173
445
+ },
446
+ {
447
+ "token": " IS",
448
+ "id": 5883
449
+ },
450
+ {
451
+ "token": "\"",
452
+ "id": 30684
453
+ },
454
+ {
455
+ "token": " BASIS",
456
+ "id": 2583
457
+ },
458
+ {
459
+ "token": ",",
460
+ "id": 31833
461
+ },
462
+ {
463
+ "token": "\n",
464
+ "id": 38
465
+ },
466
+ {
467
+ "token": " *",
468
+ "id": 1020
469
+ },
470
+ {
471
+ "token": " WITHOUT",
472
+ "id": 10757
473
+ },
474
+ {
475
+ "token": " WARRANTIES",
476
+ "id": 10735
477
+ },
478
+ {
479
+ "token": " OR",
480
+ "id": 7753
481
+ },
482
+ {
483
+ "token": " CONDITIONS",
484
+ "id": 3187
485
+ },
486
+ {
487
+ "token": " OF",
488
+ "id": 7733
489
+ },
490
+ {
491
+ "token": " ANY",
492
+ "id": 2014
493
+ },
494
+ {
495
+ "token": " KIND",
496
+ "id": 6288
497
+ },
498
+ {
499
+ "token": ",",
500
+ "id": 31833
501
+ },
502
+ {
503
+ "token": " either",
504
+ "id": 16672
505
+ },
506
+ {
507
+ "token": " express",
508
+ "id": 17456
509
+ },
510
+ {
511
+ "token": " or",
512
+ "id": 22820
513
+ },
514
+ {
515
+ "token": " implied",
516
+ "id": 19580
517
+ },
518
+ {
519
+ "token": ".",
520
+ "id": 31979
521
+ },
522
+ {
523
+ "token": "\n",
524
+ "id": 38
525
+ },
526
+ {
527
+ "token": " *",
528
+ "id": 1020
529
+ },
530
+ {
531
+ "token": " See",
532
+ "id": 9386
533
+ },
534
+ {
535
+ "token": " the",
536
+ "id": 28194
537
+ },
538
+ {
539
+ "token": " License",
540
+ "id": 6683
541
+ },
542
+ {
543
+ "token": " for",
544
+ "id": 18039
545
+ },
546
+ {
547
+ "token": " the",
548
+ "id": 28194
549
+ },
550
+ {
551
+ "token": " specific",
552
+ "id": 26985
553
+ },
554
+ {
555
+ "token": " language",
556
+ "id": 20643
557
+ },
558
+ {
559
+ "token": " governing",
560
+ "id": 18631
561
+ },
562
+ {
563
+ "token": " permissions",
564
+ "id": 23414
565
+ },
566
+ {
567
+ "token": " and",
568
+ "id": 11913
569
+ },
570
+ {
571
+ "token": "\n",
572
+ "id": 38
573
+ },
574
+ {
575
+ "token": " *",
576
+ "id": 1020
577
+ },
578
+ {
579
+ "token": " limitations",
580
+ "id": 20911
581
+ },
582
+ {
583
+ "token": " under",
584
+ "id": 28977
585
+ },
586
+ {
587
+ "token": " the",
588
+ "id": 28194
589
+ },
590
+ {
591
+ "token": " License",
592
+ "id": 6683
593
+ },
594
+ {
595
+ "token": ".",
596
+ "id": 31979
597
+ },
598
+ {
599
+ "token": "\n",
600
+ "id": 38
601
+ },
602
+ {
603
+ "token": " */",
604
+ "id": 1036
605
+ },
606
+ {
607
+ "token": "\n",
608
+ "id": 38
609
+ },
610
+ {
611
+ "token": " Expl",
612
+ "id": 4745
613
+ },
614
+ {
615
+ "token": "ain",
616
+ "id": 45149
617
+ },
618
+ {
619
+ "token": " this",
620
+ "id": 28263
621
+ },
622
+ {
623
+ "token": " copyright",
624
+ "id": 14880
625
+ },
626
+ {
627
+ "token": " license",
628
+ "id": 20861
629
+ },
630
+ {
631
+ "token": ".",
632
+ "id": 31979
633
+ },
634
+ {
635
+ "total": 158
636
+ }
637
+ ]
tokenize.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import json
3
+ import argparse
4
+ from typing import List, Dict
5
+ import bisect
6
+
7
class ClaudeTokenizer:
    """Greedy longest-match tokenizer driven by a JSON vocabulary file.

    The config file must provide the keys ``vocab`` (list of token strings),
    ``n_vocab_size``, ``pat_str`` (a regular expression) and
    ``special_tokens`` (mapping of token string to id).

    Token ids are the positions of the tokens in the *sorted* vocabulary;
    entries of ``special_tokens`` are then layered on top of that mapping.

    Two matching strategies are available:

    * ``"trie"``   -- the text is first split with ``pat_str`` and every piece
      is matched greedily against a character trie.
    * ``"linear"`` -- the whole text is matched greedily, without the regex
      pre-split, by binary-searching the sorted vocabulary.
    """

    # Key that marks "a token ends here" inside a trie node.  Real edges are
    # always single characters, so the empty string can never collide with
    # one (the previous "*" marker clashed with a literal asterisk and made
    # tokens such as "/*" unreachable).
    _TERMINAL = ""
    _ALGORITHMS = ("trie", "linear")

    def __init__(self, config_file: str, algorithm: str = "trie"):
        """Load the vocabulary from ``config_file`` and build lookup tables.

        Args:
            config_file: Path to the JSON tokenizer configuration.
            algorithm: ``"trie"`` or ``"linear"``.

        Raises:
            ValueError: If ``algorithm`` is not one of the supported names.
        """
        # Validate first so a typo does not cost a full vocabulary load.
        if algorithm not in self._ALGORITHMS:
            raise ValueError("Invalid algorithm. Choose 'trie' or 'linear'.")
        self.algorithm = algorithm

        with open(config_file, "r", encoding="utf-8") as f:
            config = json.load(f)

        self.vocab = sorted(config["vocab"])  # Sorted for binary search
        self.vocab_size = config["n_vocab_size"]
        self.pat_str = config["pat_str"]
        self.special_tokens = config["special_tokens"]

        self.token_to_id = {token: i for i, token in enumerate(self.vocab)}
        self.id_to_token = {i: token for token, i in self.token_to_id.items()}

        for token, token_id in self.special_tokens.items():
            self.token_to_id[token] = token_id
            self.id_to_token[token_id] = token

        self.pat = re.compile(self.pat_str)
        self.vocab_trie = self._build_trie(self.vocab)
        # Upper bound on how many characters a single match can consume.
        self._max_token_len = max(map(len, self.vocab), default=0)

    def _build_trie(self, vocab: List[str]) -> Dict:
        """Return a nested-dict character trie for ``vocab``.

        Each node maps a single character to its child node; a node at which
        a token ends additionally stores that token under ``_TERMINAL``.
        Empty tokens are skipped because they can never be matched.
        """
        trie: Dict = {}
        for token in vocab:
            if not token:
                continue
            node = trie
            for char in token:
                node = node.setdefault(char, {})
            node[self._TERMINAL] = token
        return trie

    def tokenize(self, text: str) -> List[str]:
        """Split ``text`` into token strings using the configured algorithm."""
        if self.algorithm == "trie":
            tokens = []
            for part in self.pat.findall(text):
                tokens.extend(self._tokenize_part_trie(part))
            return tokens
        return self._tokenize_part_linear(text)

    def encode(self, text: str) -> List[int]:
        """Tokenize ``text`` and map every token to its id.

        Tokens without an id are mapped to the id of the ``<META>`` special
        token.
        """
        tokens = self.tokenize(text)
        return [
            self.token_to_id.get(token, self.special_tokens["<META>"])
            for token in tokens
        ]

    def decode(self, ids: List[int]) -> str:
        """Concatenate the token strings for ``ids``; unknown ids are dropped."""
        return "".join(self.id_to_token.get(token_id, "") for token_id in ids)

    def _tokenize_part_trie(self, text: str) -> List[str]:
        """Greedily split ``text`` into the longest tokens found in the trie.

        A character that starts no vocabulary token is emitted on its own.
        """
        tokens = []
        terminal = self._TERMINAL
        pos, end = 0, len(text)
        # Walk with an index instead of re-slicing the remaining text, which
        # was quadratic for long inputs.
        while pos < end:
            node = self.vocab_trie
            match = ""
            for i in range(pos, end):
                node = node.get(text[i])
                if node is None:
                    break
                # Remember the deepest node at which a token ends.
                match = node.get(terminal, match)
            if not match:
                match = text[pos]
            tokens.append(match)
            pos += len(match)
        return tokens

    def _tokenize_part_linear(self, text: str) -> List[str]:
        """Greedily split ``text`` using binary search on the sorted vocab.

        A character that starts no vocabulary token is emitted on its own.
        """
        tokens = []
        pos, end = 0, len(text)
        # No match can be longer than the longest token, so only a bounded
        # window is handed to the prefix search.
        window = max(self._max_token_len, 1)
        while pos < end:
            match = self._binary_search_prefix(text[pos:pos + window])
            if not match:
                match = text[pos]
            tokens.append(match)
            pos += len(match)
        return tokens

    def _binary_search_prefix(self, text: str) -> str:
        """Return the longest vocabulary token that is a prefix of ``text``.

        Candidate prefixes are tried from longest to shortest and each one is
        looked up in the sorted vocabulary with ``bisect``.  Returns ``""``
        when no non-empty token is a prefix of ``text``.

        A single bisection over the vocabulary is not sufficient here:
        unrelated tokens may sort between a prefix and the text (for example
        "aa" and "ab" lie between "a" and "ac"), so the search could step
        past the only matching token.
        """
        vocab = self.vocab
        for length in range(min(len(text), self._max_token_len), 0, -1):
            candidate = text[:length]
            idx = bisect.bisect_left(vocab, candidate)
            if idx < len(vocab) and vocab[idx] == candidate:
                return candidate
        return ""
110
+
111
def process_file(file_path: str, tokenizer: "ClaudeTokenizer") -> List[Dict]:
    """Tokenize the contents of ``file_path`` and pair each token with its id.

    The file is decoded as UTF-16 only when it starts with a UTF-16 byte
    order mark, otherwise as UTF-8 (a leading UTF-8 BOM is stripped), and
    finally as Latin-1.  Trying UTF-16 on BOM-less data is avoided on
    purpose: many non-UTF-8 byte sequences of even length decode "successfully"
    as UTF-16 and would silently turn into unrelated characters.

    Args:
        file_path: Path of the text file to read.
        tokenizer: Object providing ``tokenize(text)`` and ``encode(text)``.

    Returns:
        A list of ``{"token": ..., "id": ...}`` dicts in input order, followed
        by a final ``{"total": <number of tokens>}`` entry.

    Raises:
        ValueError: If none of the attempted encodings can decode the file.
            Not expected in practice, since Latin-1 maps every byte value.
    """
    with open(file_path, "rb") as f:
        head = f.read(2)

    encodings = ["utf-8-sig", "latin-1"]
    if head in (b"\xff\xfe", b"\xfe\xff"):
        encodings.insert(0, "utf-16")

    for encoding in encodings:
        try:
            with open(file_path, "r", encoding=encoding) as f:
                text = f.read()
            break
        except UnicodeDecodeError:
            continue
    else:
        raise ValueError(f"Unable to decode the file {file_path} with any of the attempted encodings.")

    tokens = tokenizer.tokenize(text)
    encoded = tokenizer.encode(text)

    result = [
        {"token": token, "id": token_id}
        for token, token_id in zip(tokens, encoded)
    ]
    result.append({"total": len(tokens)})

    return result
131
+
132
def main():
    """Command-line entry point: tokenize ``--text`` or ``--file``.

    With ``--file`` the result is written as JSON to ``<file>.tokens``; with
    ``--text`` it is printed to stdout.  When both are given, ``--file``
    takes precedence.  The vocabulary is read from ``tokenizer_config.json``
    in the current working directory.

    On failure the error and its traceback are written to stderr and the
    process exits with status 1, so stdout stays clean for the JSON output
    and callers can detect the failure.
    """
    import sys
    import traceback

    parser = argparse.ArgumentParser(description="Tokenize text using Claude Tokenizer")
    parser.add_argument("--text", type=str, help="Text to tokenize")
    parser.add_argument("--file", type=str, help="File to tokenize")
    parser.add_argument("--algo", type=str, choices=["linear", "trie"], required=True, help="Tokenization algorithm")
    args = parser.parse_args()

    # Compare against None so that an explicit empty --text is still accepted.
    if args.text is None and args.file is None:
        parser.error("Either --text or --file must be specified")

    try:
        tokenizer = ClaudeTokenizer("tokenizer_config.json", algorithm=args.algo)

        if args.file is not None:
            result = process_file(args.file, tokenizer)
            output_file = args.file + ".tokens"
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(result, f, indent=2, ensure_ascii=False)
            print(f"Tokenization results saved to {output_file}")
        else:
            tokens = tokenizer.tokenize(args.text)
            encoded = tokenizer.encode(args.text)
            result = [
                {"token": token, "id": token_id}
                for token, token_id in zip(tokens, encoded)
            ]
            result.append({"total": len(tokens)})
            print(json.dumps(result, indent=2, ensure_ascii=False))
    except Exception as e:
        # Top-level boundary: report the problem and signal failure to the
        # caller instead of exiting with status 0.
        print(f"An error occurred: {str(e)}", file=sys.stderr)
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()