k-l-lambda committed
Commit e3f17eb · 1 Parent(s): 784faa6

converter indices tensors added.

Files changed (3):
  1. README.md +36 -0
  2. inv_token_indices.pt +3 -0
  3. token_indices.pt +3 -0
README.md CHANGED
@@ -21,6 +21,7 @@ This is the code example:
import torch
from transformers import pipeline

+
pipe = pipeline(
    "text-generation",
    model='k-l-lambda/Llama-3.2-1B-vocab32k',
@@ -51,3 +52,38 @@ input_ids = tokenizer.encode("Hello, ", return_tensors="pt")
output = model.generate(input_ids)
print(tokenizer.decode(output[0]))
```
+
+
+ ## Token converter
+
+ You can map an ID value in the 32k vocab to the corresponding ID value in the original 128k vocab using the tensor in `token_indices.pt`.
+
+ ```python
+ import torch
+ from huggingface_hub import hf_hub_download
+ from transformers import AutoTokenizer
+
+
+ tokenizer128k = AutoTokenizer.from_pretrained('meta-llama/Llama-3.2-1B-Instruct')
+ tokenizer32k = AutoTokenizer.from_pretrained('k-l-lambda/Llama-3.2-1B-vocab32k')
+
+ indices_path = hf_hub_download(repo_id='k-l-lambda/Llama-3.2-1B-vocab32k', filename='token_indices.pt')
+ inv_indices_path = hf_hub_download(repo_id='k-l-lambda/Llama-3.2-1B-vocab32k', filename='inv_token_indices.pt')
+ token_indices = torch.load(indices_path)
+ inv_token_indices = torch.load(inv_indices_path)
+
+ ids_32k = tokenizer32k.encode('This is an example sentence.')
+ ids_128k = [token_indices[i].item() for i in ids_32k]
+ print(f'{ids_32k=}')
+ print(f'{ids_128k=}')
+
+ print(tokenizer128k.decode(ids_128k))
+
+
+ ids_128k = tokenizer128k.encode('This is another example sentence.')
+ ids_32k = [inv_token_indices[i].item() for i in ids_128k]
+ print(f'{ids_128k=}')
+ print(f'{ids_32k=}')  # tokens that do not exist in the 32k vocab map to -1
+
+ print(tokenizer32k.decode(ids_32k))
+ ```
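
As the comment in the added example notes, 128k IDs with no counterpart in the 32k vocab come back as -1, and `tokenizer32k.decode` cannot consume that sentinel directly. Below is a minimal sketch of two ways to handle it, assuming the `inv_token_indices` tensor and tokenizers loaded in the example above; the `unk_token_id` fallback is illustrative, and Llama tokenizers may not define one:

```python
# Minimal sketch: drop or substitute the -1 sentinel before decoding.
# Assumes inv_token_indices, tokenizer128k and tokenizer32k are loaded
# as in the README example above.
ids_128k = tokenizer128k.encode('A sentence that may contain unmapped tokens.')
ids_32k = [inv_token_indices[i].item() for i in ids_128k]

# Option 1: skip unmapped tokens entirely.
ids_32k_kept = [i for i in ids_32k if i != -1]
print(tokenizer32k.decode(ids_32k_kept))

# Option 2: substitute the 32k tokenizer's unk token, if it defines one
# (illustrative choice, not part of this repo).
unk = tokenizer32k.unk_token_id
if unk is not None:
    ids_32k_unk = [i if i != -1 else unk for i in ids_32k]
    print(tokenizer32k.decode(ids_32k_unk))
```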
inv_token_indices.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a7b97585a6bdd88d65084d9b9dd6f608cd1491e4b15a02faab8eb9659448da45
+ size 1027278
token_indices.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f455f59558b410ee6eb8ef8676975157a3b6ba8e9e0cb8a05d59809e331e9c6f
+ size 259258
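
For reference, a quick sanity check of the two tensors added in this commit. This is a sketch under the assumption that both files hold 1-D integer index tensors (consistent with the LFS sizes above: roughly 32k and 128k int64 entries); the round-trip property is expected behavior for an index/inverse-index pair, not something the repo guarantees:

```python
import torch
from huggingface_hub import hf_hub_download

repo = 'k-l-lambda/Llama-3.2-1B-vocab32k'
token_indices = torch.load(hf_hub_download(repo_id=repo, filename='token_indices.pt'))
inv_token_indices = torch.load(hf_hub_download(repo_id=repo, filename='inv_token_indices.pt'))

# Expected shapes: ~32k and ~128k entries respectively.
print(token_indices.shape, inv_token_indices.shape)

# Round trip 32k -> 128k -> 32k should be the identity if the two
# tensors are exact inverses of each other.
round_trip = inv_token_indices[token_indices]
identity = torch.arange(len(token_indices), dtype=token_indices.dtype)
print('round trip ok:', torch.equal(round_trip, identity))

# Count 128k tokens with no 32k counterpart (the -1 sentinel).
print('unmapped 128k tokens:', (inv_token_indices == -1).sum().item())
```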