tokenizer; pretrain
- merges.txt +0 -0
- scripts/requirements.in +2 -2
- scripts/train_tokenizer.py +1 -1
- special_tokens_map.json +5 -0
- tokenizer.json +3 -0
- tokenizer_config.json +0 -0
- vocab.json +0 -0
merges.txt
ADDED
The diff for this file is too large to render. See raw diff.
scripts/requirements.in
CHANGED
@@ -5,8 +5,8 @@ datasets
 jinja2
 transformers
 wandb
-litgpt[all]
-
+# litgpt[all]
+litgpt[all] @ git+https://github.com/Lightning-AI/litgpt.git
 litdata
 grokadamw
 # bitsandbytes
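For context, not part of the commit: the new pin uses PEP 508 direct-reference syntax, so the same specifier can also be installed ad hoc. A minimal Python sketch, assuming pip is available in the active environment:

import subprocess
import sys

# Install litgpt with all extras straight from the GitHub default branch,
# matching the pin added to scripts/requirements.in above.
subprocess.run(
    [sys.executable, '-m', 'pip', 'install',
     'litgpt[all] @ git+https://github.com/Lightning-AI/litgpt.git'],
    check=True,
)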
scripts/train_tokenizer.py
CHANGED
@@ -259,7 +259,7 @@ tokenizer.normalizer = normalizers.Sequence([
     normalizers.Replace(' ', '▁'),
 ])
 
-tokenizer.pre_tokenizer =
+tokenizer.pre_tokenizer = None
 
 tokenizer.post_processor = processors.TemplateProcessing(
     single='$A:0', # $A represents the token, :0 specifies the type ID for single sequences
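For context, not part of the commit: a minimal sketch of the pipeline this hunk touches, assuming the Hugging Face tokenizers library. The model choice and the rest of scripts/train_tokenizer.py are not visible in the diff, so the BPE model below is an assumption.

from tokenizers import Tokenizer, models, normalizers, processors

tokenizer = Tokenizer(models.BPE())  # assumed model; the script's actual choice is not in this hunk

# Normalizer from the visible context: map plain spaces to the metaspace character.
tokenizer.normalizer = normalizers.Sequence([
    normalizers.Replace(' ', '▁'),
])

# The changed line: no pre-tokenizer at all, leaving word splitting to the
# metaspace replacement done by the normalizer. (Mirrors the commit; very old
# tokenizers releases may not accept assigning None here.)
tokenizer.pre_tokenizer = None

# Post-processor from the visible context: single sequences get type ID 0.
tokenizer.post_processor = processors.TemplateProcessing(
    single='$A:0',  # $A represents the token, :0 specifies the type ID for single sequences
)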
special_tokens_map.json
ADDED
@@ -0,0 +1,5 @@
+{
+  "bos_token": "<|begin_of_text|>",
+  "eos_token": "<|end_of_text|>",
+  "unk_token": "<unk>"
+}
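For context, not part of the commit: once the tokenizer files above are in place, these names map onto the standard transformers attributes. A minimal sketch, assuming the files sit at the repository root and tokenizer_config.json records a tokenizer class transformers can resolve:

from transformers import AutoTokenizer

# Loads tokenizer.json, tokenizer_config.json, and special_tokens_map.json from the repo root.
tok = AutoTokenizer.from_pretrained('.')

print(tok.bos_token)  # '<|begin_of_text|>'
print(tok.eos_token)  # '<|end_of_text|>'
print(tok.unk_token)  # '<unk>'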
tokenizer.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:87f950ea35683d2d80e36a35aa88b4c80fe5bd6d53aaab0a3ff3380ed0bc823e
+size 19928939
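Since tokenizer.json is committed as a Git LFS pointer, the actual ~19 MB file only exists locally after an LFS pull. A sketch, not part of the commit, for checking a fetched copy against the pointer's oid and size:

import hashlib

EXPECTED_OID = '87f950ea35683d2d80e36a35aa88b4c80fe5bd6d53aaab0a3ff3380ed0bc823e'
EXPECTED_SIZE = 19928939

# Assumes `git lfs pull` has already replaced the pointer with the real file.
with open('tokenizer.json', 'rb') as f:
    data = f.read()

assert len(data) == EXPECTED_SIZE, 'size does not match the LFS pointer'
assert hashlib.sha256(data).hexdigest() == EXPECTED_OID, 'sha256 does not match the LFS pointer'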
tokenizer_config.json
ADDED
The diff for this file is too large to render. See raw diff.
vocab.json
ADDED
The diff for this file is too large to render. See raw diff.