NeMo
PyTorch
English
seq2seq
masked language modeling
MaximumEntropy committed on
Commit
1261159
·
1 Parent(s): c4f63b6

Upload t5_3b_nemo_config.yaml

Browse files
Files changed (1) hide show
  1. t5_3b_nemo_config.yaml +135 -0
t5_3b_nemo_config.yaml ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ micro_batch_size: 27
2
+ tensor_model_parallel_size: 2
3
+ pipeline_model_parallel_size: 1
4
+ make_vocab_size_divisible_by: 128
5
+ pre_process: true
6
+ post_process: true
7
+ megatron_amp_O2: false
8
+ seq_length: 512
9
+ max_position_embeddings: 512
10
+ num_layers: 24
11
+ hidden_size: 1024
12
+ ffn_hidden_size: 16384
13
+ num_attention_heads: 32
14
+ init_method_std: 0.015
15
+ hidden_dropout: 0.1
16
+ attention_dropout: 0.1
17
+ kv_channels: 128
18
+ apply_query_key_layer_scaling: true
19
+ layernorm_epsilon: 1.0e-05
20
+ persist_layer_norm: true
21
+ gradient_as_bucket_view: true
22
+ encoder_arch: transformer
23
+ decoder_arch: transformer
24
+ activation: gelu
25
+ tokenizer:
26
+ library: megatron
27
+ type: BertWordPieceCase
28
+ model: null
29
+ vocab_file: bert_vocab.txt
30
+ merge_file: null
31
+ num_sentinel_tokens: 100
32
+ native_amp_init_scale: 4294967296
33
+ native_amp_growth_interval: 1000
34
+ fp32_residual_connection: false
35
+ fp16_lm_cross_entropy: false
36
+ seed: 1234
37
+ use_cpu_initialization: false
38
+ onnx_safe: false
39
+ activations_checkpoint_method: null
40
+ activations_checkpoint_num_layers: 1
41
+ data:
42
+ data_prefix:
43
+ - 0.0333
44
+ - /preproc_data/my-t5_00_bert_tokenizer_text_document
45
+ - 0.0333
46
+ - /preproc_data/my-t5_01_bert_tokenizer_text_document
47
+ - 0.0333
48
+ - /preproc_data/my-t5_02_bert_tokenizer_text_document
49
+ - 0.0333
50
+ - /preproc_data/my-t5_03_bert_tokenizer_text_document
51
+ - 0.0333
52
+ - /preproc_data/my-t5_04_bert_tokenizer_text_document
53
+ - 0.0333
54
+ - /preproc_data/my-t5_05_bert_tokenizer_text_document
55
+ - 0.0333
56
+ - /preproc_data/my-t5_06_bert_tokenizer_text_document
57
+ - 0.0333
58
+ - /preproc_data/my-t5_07_bert_tokenizer_text_document
59
+ - 0.0333
60
+ - /preproc_data/my-t5_08_bert_tokenizer_text_document
61
+ - 0.0333
62
+ - /preproc_data/my-t5_09_bert_tokenizer_text_document
63
+ - 0.0333
64
+ - /preproc_data/my-t5_10_bert_tokenizer_text_document
65
+ - 0.0333
66
+ - /preproc_data/my-t5_11_bert_tokenizer_text_document
67
+ - 0.0333
68
+ - /preproc_data/my-t5_12_bert_tokenizer_text_document
69
+ - 0.0333
70
+ - /preproc_data/my-t5_13_bert_tokenizer_text_document
71
+ - 0.0333
72
+ - /preproc_data/my-t5_14_bert_tokenizer_text_document
73
+ - 0.0333
74
+ - /preproc_data/my-t5_15_bert_tokenizer_text_document
75
+ - 0.0333
76
+ - /preproc_data/my-t5_16_bert_tokenizer_text_document
77
+ - 0.0333
78
+ - /preproc_data/my-t5_17_bert_tokenizer_text_document
79
+ - 0.0333
80
+ - /preproc_data/my-t5_18_bert_tokenizer_text_document
81
+ - 0.0333
82
+ - /preproc_data/my-t5_19_bert_tokenizer_text_document
83
+ - 0.0333
84
+ - /preproc_data/my-t5_20_bert_tokenizer_text_document
85
+ - 0.0333
86
+ - /preproc_data/my-t5_21_bert_tokenizer_text_document
87
+ - 0.0333
88
+ - /preproc_data/my-t5_22_bert_tokenizer_text_document
89
+ - 0.0333
90
+ - /preproc_data/my-t5_23_bert_tokenizer_text_document
91
+ - 0.0333
92
+ - /preproc_data/my-t5_24_bert_tokenizer_text_document
93
+ - 0.0333
94
+ - /preproc_data/my-t5_25_bert_tokenizer_text_document
95
+ - 0.0333
96
+ - /preproc_data/my-t5_26_bert_tokenizer_text_document
97
+ - 0.0333
98
+ - /preproc_data/my-t5_27_bert_tokenizer_text_document
99
+ - 0.0333
100
+ - /preproc_data/my-t5_28_bert_tokenizer_text_document
101
+ - 0.0334
102
+ - /preproc_data/my-t5_29_bert_tokenizer_text_document
103
+ data_impl: mmap
104
+ splits_string: 99982,9,9
105
+ seq_length: 512
106
+ seq_length_dec: 128
107
+ skip_warmup: true
108
+ num_workers: 4
109
+ dataloader_type: single
110
+ masked_lm_prob: 0.15
111
+ dataset_type: t5
112
+ short_seq_prob: 0.0
113
+ max_ngram_size: 10
114
+ mean_ngram_size: null
115
+ geometric_dist: true
116
+ permutation: false
117
+ whole_word_masking: true
118
+ favor_longer_ngrams: false
119
+ optim:
120
+ name: fused_adam
121
+ lr: 0.0001
122
+ betas:
123
+ - 0.9
124
+ - 0.999
125
+ eps: 1.0e-08
126
+ weight_decay: 0.01
127
+ sched:
128
+ name: WarmupAnnealing
129
+ min_lr: 1.0e-05
130
+ last_epoch: -1
131
+ warmup_ratio: 0.01
132
+ precision: bf16
133
+ target: nemo.collections.nlp.models.language_modeling.megatron_t5_model.MegatronT5Model
134
+ nemo_version: 1.7.1
135
+ vocab_file: nemo:6b9a052d82a744389fbe256fea20c06f_vocab.txt