|
|
|
|
|
|
|
|
|
词典大小为 250680(来自 https://huggingface.co/bigscience/bloom#preprocessing );注意模型配置中的 `vocab_size` 是 250880,比分词器词典多 200(推测是为对齐而填充的 embedding 位,待确认): |
|
"vocab_size": 250880 |
|
|
|
|
|
## OOV |
|
|
|
有些空格没编码进去,详见`test_oov.py` |
|
|
|
## 中文词典 |
|
|
|
一个中文字符会被编码成几个 id? |
|
|
|
|
|
## pre_tokenizer 与 post_processor 配置 |
|
|
|
``` |
|
"pre_tokenizer": { |
|
"type": "Sequence", |
|
"pretokenizers": [ |
|
{ |
|
"type": "Split", |
|
"pattern": { |
|
"Regex": " ?[^(\\s|[.,!?…。,、।۔،])]+" |
|
}, |
|
"behavior": "Isolated", |
|
"invert": false |
|
}, |
|
{ |
|
"type": "ByteLevel", |
|
"add_prefix_space": false, |
|
"trim_offsets": true, |
|
"use_regex": false |
|
} |
|
] |
|
}, |
|
"post_processor": { |
|
"type": "ByteLevel", |
|
"add_prefix_space": true, |
|
"trim_offsets": false, |
|
"use_regex": false |
|
|
|
}, |
|
``` |