# tangled-llama-e-128k-v0.1 / scripts / train_tokenizer.py
# Uploaded by mtasic85
# Commit dfc94d9: pretrain fixed bigcode/the-stack-smol-xl dataset
import gc
from datasets import load_dataset
from transformers import PreTrainedTokenizerFast
from tokenizers import Tokenizer, normalizers, pre_tokenizers, processors, decoders
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
#
# datasets
#
def batch_iterator():
    """Yield training text for the tokenizer, one corpus after another.

    Corpora are visited in a fixed order: programming-language keywords,
    source code, LaTeX formulas, multilingual instruction data and
    multilingual web text. Every corpus is fetched with ``load_dataset``;
    once it is exhausted its reference is dropped and ``gc.collect()`` is
    called before the next corpus is loaded.

    Yields:
        One item per keyword, file content, formula, instruction record or
        text sample (presumably ``str`` -- this depends on the dataset
        schemas, which are not visible here).
    """
    # code
    dataset = load_dataset('bigcode/programming-languages-keywords', split='train')

    for row in dataset:
        yield from row['keywords']

    del dataset
    gc.collect()

    # code
    # The generator expression loads each language split only when the loop
    # below reaches it, so the splits are not all fetched up front.
    dataset = (
        load_dataset('bigcode/the-stack-smol-xs', data_dir=f'data/{name}', split='train', trust_remote_code=True)
        for name in [
            # 'batchfile' - unsafe
            # 'powershell' - unsafe
            'ada', 'agda', 'alloy', 'antlr', 'applescript', 'assembly',
            'augeas', 'awk', 'bison', 'bluespec', 'c',
            'c++', 'c-sharp', 'clojure', 'cmake', 'coffeescript', 'common-lisp',
            'css', 'cuda', 'dart', 'dockerfile', 'elixir',
            'elm', 'emacs-lisp','erlang', 'f-sharp', 'fortran', 'glsl', 'go',
            'groovy', 'haskell','html', 'idris', 'isabelle', 'java',
            'java-server-pages', 'javascript', 'julia', 'kotlin', 'lean',
            'literate-agda', 'literate-coffeescript', 'literate-haskell',
            'lua', 'makefile', 'maple', 'markdown', 'mathematica', 'matlab',
            'ocaml', 'pascal', 'perl', 'php', 'prolog',
            'protocol-buffer', 'python', 'r', 'racket', 'restructuredtext',
            'rmarkdown', 'ruby', 'rust', 'sas', 'scala', 'scheme',
            'shell', 'smalltalk', 'solidity', 'sparql', 'sql', 'stan',
            'standard-ml', 'stata', 'systemverilog', 'tcl', 'tcsh', 'tex',
            'thrift', 'typescript', 'verilog', 'vhdl', 'visual-basic', 'xslt',
            'yacc', 'zig',
        ]
    )

    for d in dataset:
        yield from d['content']

    del dataset
    gc.collect()

    ## math - unsafe
    # dataset = load_dataset('gair-prox/open-web-math-pro', split='train[:1%]')
    #
    # for text in dataset['text']:
    #     yield text
    #
    # del dataset
    # gc.collect()

    # math
    dataset = load_dataset('OleehyO/latex-formulas', 'cleaned_formulas', split='train[:5%]')

    yield from dataset['latex_formula']

    del dataset
    gc.collect()

    # # text
    # dataset = load_dataset('JeanKaddour/minipile', split='train[:1%]')
    #
    # for text in dataset['text']:
    #     yield text
    #
    # del dataset
    # gc.collect()

    # text
    # NOTE(review): the directory names below (including the space before
    # '/' and the 'multilinugal' spelling) are passed through verbatim --
    # presumably they mirror the dataset repository layout; confirm before
    # "correcting" them.
    dataset = (
        load_dataset('saillab/taco-datasets', data_dir=data_dir, split='train[:5%]')
        for data_dir in [
            'multilingual-instruction-tuning-dataset /multilingual-alpaca-52k-gpt-4',
            'multilingual-instruction-tuning-dataset /multilinugal-dolly-15k',
        ]
    )

    for d in dataset:
        for row in d:
            # Exactly one sample per record. Looping over the row's keys
            # here would emit the same string once per column and skew the
            # merge statistics towards this corpus.
            yield row['instruction'] + '\n' + row['input'] + '\n' + row['output']

    del dataset
    gc.collect()

    # text
    dataset = (
        load_dataset('xu-song/cc100-samples', lang, split='train[:5%]')
        for lang in [
            'en', 'hr', 'sr', 'ru',
            'am', 'ar', 'as', 'az', 'be', 'bg', 'bn', 'bn_rom', 'br',
            'bs', 'ca', 'cs', 'cy', 'da', 'de', 'el', 'eo', 'es',
            'et', 'eu', 'fa', 'ff', 'fi', 'fr', 'fy', 'ga', 'gd', 'gl',
            'gn', 'gu', 'ha', 'he', 'hi', 'hi_rom', 'ht', 'hu',
            'hy', 'id', 'ig', 'is', 'it', 'ja', 'jv', 'ka', 'kk', 'km',
            'kn', 'ko', 'ku', 'ky', 'la', 'lg', 'li', 'ln', 'lo', 'lt',
            'lv', 'mg', 'mk', 'ml', 'mn', 'mr', 'ms', 'my', 'my_zaw',
            'ne', 'nl', 'no', 'ns', 'om', 'or', 'pa', 'pl', 'ps', 'pt',
            'qu', 'rm', 'ro', 'sa', 'si', 'sc', 'sd', 'sk', 'sl',
            'so', 'sq', 'ss', 'su', 'sv', 'sw', 'ta', 'ta_rom',
            'te', 'te_rom', 'th', 'tl', 'tn', 'tr', 'ug', 'uk', 'ur',
            'ur_rom', 'uz', 'vi', 'wo', 'xh', 'yi', 'yo',
            'zh-Hans', 'zh-Hant', 'zu',
        ]
    )

    for d in dataset:
        yield from d['text']

    del dataset
    gc.collect()
#
# special_tokens
#
# Tokens handed to the trainer as special tokens, in this exact order.
special_tokens = [
    # control tokens
    '<unk>',
    '<|begin_of_text|>',
    '<|end_of_text|>',
    '<|start_header_id|>',
    '<|end_header_id|>',
    '<|eom_id|>',
    '<|eot_id|>',
    # role names
    'system', 'user', 'assistant', 'tool', 'agent',
    'internal',  # thinking
]

# tool/function calling: an opening and a closing tag per name, followed by
# the two quoted JSON keys.
for tag in ('tools', 'tool', 'tool_call', 'tool_response'):
    special_tokens += [f'<{tag}>', f'</{tag}>']

special_tokens += ['"arguments"', '"name"']

# Remaining markup tags, again emitted as opening/closing pairs.
for tag in (
    # misc
    'input', 'output', 'query', 'key', 'value', 'text', 'code', 'image', 'file',
    # qa
    'question', 'answer',
    # thought
    'thought', 'plan', 'vote', 'passage',
    # reasoning
    'reasoning', 'acting', 'action', 'observation', 'claim',
    # reflection
    'thinking', 'reflection', 'step',
    # graph ('value' is not repeated here; it is already in the misc group)
    'graph', 'edge', 'source', 'destination', 'relation',
):
    special_tokens += [f'<{tag}>', f'</{tag}>']

# Runs of 2..24 repetitions of each whitespace unit, one unit after another.
for unit in (' ', '\t', '\n', '\r', '\r\n'):
    special_tokens.extend(unit * count for count in range(2, 25))

# One token per byte value, written as <0x00> .. <0xFF>.
special_tokens.extend(f'<0x{byte:02X}>' for byte in range(256))

# 256 reserved placeholder tokens.
special_tokens.extend(f'<|reserved_special_token_{slot}|>' for slot in range(256))
#
# train tokenizer
#
# Model: BPE with an explicit unknown token, fused unknowns and byte fallback.
bpe = BPE(unk_token='<unk>', fuse_unk=True, byte_fallback=True)
tokenizer = Tokenizer(bpe)

# Normalization: prefix the text with '▁', then turn every space into '▁'.
normalizer_steps = [
    normalizers.Prepend('▁'),
    normalizers.Replace(' ', '▁'),
]
tokenizer.normalizer = normalizers.Sequence(normalizer_steps)

# No pre-tokenizer is configured.
tokenizer.pre_tokenizer = None

# Post-processing only assigns type ids (0 for the first sequence, 1 for the
# second); the template adds no special tokens.
tokenizer.post_processor = processors.TemplateProcessing(
    single='$A:0',
    pair='$A:0 $B:1',
    special_tokens=[],
)

# Decoding: map '▁' back to a space, apply the byte-fallback and fuse
# decoders, then strip on the left (presumably the one leading space that
# Prepend introduced -- confirm against the tokenizers Strip documentation).
decoder_steps = [
    decoders.Replace('▁', ' '),
    decoders.ByteFallback(),
    decoders.Fuse(),
    decoders.Strip(' ', 1, 0),
]
tokenizer.decoder = decoders.Sequence(decoder_steps)

trainer = BpeTrainer(
    vocab_size=256 * 1024,  # 262144
    min_frequency=10,
    special_tokens=special_tokens,
    max_token_length=8,
)

# Train on the streamed corpora, then write both the full tokenizer file and
# the bare model files to the parent directory.
tokenizer.train_from_iterator(batch_iterator(), trainer)
tokenizer.save('../tokenizer.json')
tokenizer.model.save('../')
CHAT_TEMPLATE = (
"{{ bos_token }}"
"{% for message in messages %}"
"{{'<|start_header_id|>' + message['role'] + '<|end_header_id|>' + message['content'] + '<|eot_id|>'}}"
"{% endfor %}"
"{% if add_generation_prompt %}"
"{{ '<|start_header_id|>assistant<|end_header_id|>' }}"
"{% else %}"
"{{ eos_token }}"
"{% endif %}"
)
# Wrap the trained tokenizer for use with transformers, declaring which
# tokens act as BOS/EOS/UNK and attaching the chat template.
wrapper_options = {
    'tokenizer_object': tokenizer,
    'chat_template': CHAT_TEMPLATE,
    'bos_token': '<|begin_of_text|>',
    'eos_token': '<|end_of_text|>',
    'unk_token': '<unk>',
    'clean_up_tokenization_spaces': True,
}
fast_tokenizer = PreTrainedTokenizerFast(**wrapper_options)

# Write the transformers-format tokenizer files to the parent directory.
fast_tokenizer.save_pretrained('../')