mtasic85 committed
Commit cbf2cc7
1 Parent(s): bf9b120

tokenizer; pretrain

merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
scripts/requirements.in CHANGED
@@ -5,8 +5,8 @@ datasets
 jinja2
 transformers
 wandb
-litgpt[all]
-# litgpt[all] @ git+https://github.com/Lightning-AI/litgpt.git
+# litgpt[all]
+litgpt[all] @ git+https://github.com/Lightning-AI/litgpt.git
 litdata
 grokadamw
 # bitsandbytes
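
This change switches litgpt[all] from the PyPI release to the current main branch on GitHub, presumably to pick up changes that have not been released yet. A minimal way to confirm which build ends up installed (a sketch; assumes a standard pip install into the active environment):

    from importlib.metadata import version

    # Reports the installed litgpt version; a git install carries the
    # version declared in the repository's own package metadata.
    print(version('litgpt'))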
scripts/train_tokenizer.py CHANGED
@@ -259,7 +259,7 @@ tokenizer.normalizer = normalizers.Sequence([
     normalizers.Replace(' ', '▁'),
 ])
 
-tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()
+tokenizer.pre_tokenizer = None
 
 tokenizer.post_processor = processors.TemplateProcessing(
     single='$A:0', # $A represents the token, :0 specifies the type ID for single sequences
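
The WhitespaceSplit pre-tokenizer is dropped in favor of no pre-tokenization at all. The normalizer directly above already rewrites every space as the metaspace character '▁', so a later whitespace split adds nothing and the normalized text can go to the model in one piece. A minimal sketch of the resulting pipeline, assuming the tokenizers library and an illustrative BPE model (the script configures its own model elsewhere):

    from tokenizers import Tokenizer, normalizers, processors
    from tokenizers.models import BPE

    # Illustrative model; the training script builds its own.
    tokenizer = Tokenizer(BPE(unk_token='<unk>'))

    # Spaces become '▁' during normalization, so no pre-tokenizer is needed.
    tokenizer.normalizer = normalizers.Sequence([
        normalizers.Replace(' ', '▁'),
    ])
    tokenizer.pre_tokenizer = None

    # Same template as the script: no special tokens wrapped around the input.
    tokenizer.post_processor = processors.TemplateProcessing(
        single='$A:0',  # $A is the sequence, :0 its type ID
    )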
special_tokens_map.json ADDED
@@ -0,0 +1,5 @@
+{
+  "bos_token": "<|begin_of_text|>",
+  "eos_token": "<|end_of_text|>",
+  "unk_token": "<unk>"
+}
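
The begin/end markers follow the Llama 3 naming convention, joined by a conventional '<unk>' token. Once the tokenizer files are loaded through transformers, the map surfaces as the usual attributes (a sketch; the local path is an assumption, any directory holding tokenizer.json and these JSON files works):

    from transformers import PreTrainedTokenizerFast

    # Picks up tokenizer.json, tokenizer_config.json, and
    # special_tokens_map.json from the same directory.
    tok = PreTrainedTokenizerFast.from_pretrained('.')
    print(tok.bos_token)  # <|begin_of_text|>
    print(tok.eos_token)  # <|end_of_text|>
    print(tok.unk_token)  # <unk>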
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:87f950ea35683d2d80e36a35aa88b4c80fe5bd6d53aaab0a3ff3380ed0bc823e
+size 19928939
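
This is a Git LFS pointer rather than the tokenizer itself: the repository tracks only the oid and size, which identify the roughly 19.9 MB tokenizer.json payload. After fetching the object (for example with git lfs pull), the file can be checked against the pointer (a minimal sketch; assumes the working directory is the repository root):

    import hashlib
    import os

    path = 'tokenizer.json'

    # The pointer records the payload's byte size and SHA-256 digest.
    assert os.path.getsize(path) == 19928939
    with open(path, 'rb') as f:
        digest = hashlib.sha256(f.read()).hexdigest()
    assert digest == '87f950ea35683d2d80e36a35aa88b4c80fe5bd6d53aaab0a3ff3380ed0bc823e'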
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
vocab.json ADDED
The diff for this file is too large to render. See raw diff