Commit
·
e3f17eb
1
Parent(s):
784faa6
converter indices tensors added.
Browse files- README.md +36 -0
- inv_token_indices.pt +3 -0
- token_indices.pt +3 -0
README.md
CHANGED
@@ -21,6 +21,7 @@ This is the code example:
|
|
21 |
import torch
|
22 |
from transformers import pipeline
|
23 |
|
|
|
24 |
pipe = pipeline(
|
25 |
"text-generation",
|
26 |
model='k-l-lambda/Llama-3.2-1B-vocab32k',
|
@@ -51,3 +52,38 @@ input_ids = tokenizer.encode("Hello, ", return_tensors="pt")
|
|
51 |
output = model.generate(input_ids)
|
52 |
print(tokenizer.decode(output[0]))
|
53 |
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
import torch
|
22 |
from transformers import pipeline
|
23 |
|
24 |
+
|
25 |
pipe = pipeline(
|
26 |
"text-generation",
|
27 |
model='k-l-lambda/Llama-3.2-1B-vocab32k',
|
|
|
52 |
output = model.generate(input_ids)
|
53 |
print(tokenizer.decode(output[0]))
|
54 |
```
|
55 |
+
|
56 |
+
|
57 |
+
## Token converter
|
58 |
+
|
59 |
+
You can map a ID value in 32k vocab to the ID value in original 128k vocab, by the tensor in `token_indices.pt`.
|
60 |
+
|
61 |
+
```python
|
62 |
+
import torch
|
63 |
+
from huggingface_hub import hf_hub_download
|
64 |
+
from transformers import AutoTokenizer
|
65 |
+
|
66 |
+
|
67 |
+
tokenizer128k = AutoTokenizer.from_pretrained('meta-llama/Llama-3.2-1B-Instruct')
|
68 |
+
tokenizer32k = AutoTokenizer.from_pretrained('k-l-lambda/Llama-3.2-1B-vocab32k')
|
69 |
+
|
70 |
+
indices_path = hf_hub_download(repo_id='k-l-lambda/Llama-3.2-1B-vocab32k', filename='token_indices.pt')
|
71 |
+
inv_indices_path = hf_hub_download(repo_id='k-l-lambda/Llama-3.2-1B-vocab32k', filename='inv_token_indices.pt')
|
72 |
+
token_indices = torch.load(indices_path)
|
73 |
+
inv_token_indices = torch.load(inv_indices_path)
|
74 |
+
|
75 |
+
ids_32k = tokenizer32k.encode('This is an example sentence.')
|
76 |
+
ids_128k = [token_indices[id] for id in ids_32k]
|
77 |
+
print(f'{ids_32k=}')
|
78 |
+
print(f'{ids_128k=}')
|
79 |
+
|
80 |
+
print(tokenizer128k.decode(ids_128k))
|
81 |
+
|
82 |
+
|
83 |
+
ids_128k = tokenizer128k.encode('This is another example sentence.')
|
84 |
+
ids_32k = [inv_token_indices[id] for id in ids_128k]
|
85 |
+
print(f'{ids_128k=}')
|
86 |
+
print(f'{ids_32k=}')	# non-existent tokens in the 32k vocab map to -1
|
87 |
+
|
88 |
+
print(tokenizer32k.decode(ids_32k))
|
89 |
+
```
|
inv_token_indices.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a7b97585a6bdd88d65084d9b9dd6f608cd1491e4b15a02faab8eb9659448da45
|
3 |
+
size 1027278
|
token_indices.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f455f59558b410ee6eb8ef8676975157a3b6ba8e9e0cb8a05d59809e331e9c6f
|
3 |
+
size 259258
|