patrickvonplaten committed on
Commit
7de4b33
1 Parent(s): b58436a

Upload model

Browse files
Files changed (1) hide show
  1. operative_config.gin +377 -0
operative_config.gin ADDED
@@ -0,0 +1,377 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import mesh_tensorflow.optimize
2
+ import mesh_tensorflow.transformer.dataset
3
+ import mesh_tensorflow.transformer.learning_rate_schedules
4
+ import mesh_tensorflow.transformer.t2t_vocabulary
5
+ import mesh_tensorflow.transformer.transformer
6
+ import mesh_tensorflow.transformer.transformer_layers
7
+ import mesh_tensorflow.transformer.utils
8
+ import t5.models.mesh_transformer
9
+
10
+ # Macros:
11
+ # ==============================================================================
12
+ d_ff = 1024
13
+ d_kv = 64
14
+ d_model = 256
15
+ dropout_rate = 0.0
16
+ inputs_length = 512
17
+ mean_noise_span_length = 3.0
18
+ MIXTURE_NAME = 'c4_v220_unsupervised'
19
+ noise_density = 0.15
20
+ num_heads = 4
21
+ num_layers = 4
22
+
23
+ # Parameters for adafactor_decay_rate_pow:
24
+ # ==============================================================================
25
+ adafactor_decay_rate_pow.offset = 0
26
+
27
+ # Parameters for AdafactorOptimizer:
28
+ # ==============================================================================
29
+ AdafactorOptimizer.beta1 = 0.0
30
+ AdafactorOptimizer.clipping_threshold = 1.0
31
+ AdafactorOptimizer.decay_rate = None
32
+ AdafactorOptimizer.epsilon1 = 1e-30
33
+ AdafactorOptimizer.epsilon2 = 0.001
34
+ AdafactorOptimizer.factored = True
35
+ AdafactorOptimizer.min_dim_size_to_factor = 128
36
+ AdafactorOptimizer.multiply_by_parameter_scale = True
37
+
38
+ # Parameters for Bitransformer:
39
+ # ==============================================================================
40
+ Bitransformer.shared_embedding = True
41
+
42
+ # Parameters for denoise:
43
+ # ==============================================================================
44
+ denoise.inputs_fn = @preprocessors.noise_span_to_unique_sentinel
45
+ denoise.noise_density = %noise_density
46
+ denoise.noise_mask_fn = @preprocessors.random_spans_noise_mask
47
+ denoise.targets_fn = @preprocessors.nonnoise_span_to_unique_sentinel
48
+
49
+ # Parameters for decoder/DenseReluDense:
50
+ # ==============================================================================
51
+ decoder/DenseReluDense.activation = 'relu'
52
+ decoder/DenseReluDense.dropout_rate = %dropout_rate
53
+ decoder/DenseReluDense.hidden_size = %d_ff
54
+ decoder/DenseReluDense.use_bias = False
55
+
56
+ # Parameters for encoder/DenseReluDense:
57
+ # ==============================================================================
58
+ encoder/DenseReluDense.activation = 'relu'
59
+ encoder/DenseReluDense.dropout_rate = %dropout_rate
60
+ encoder/DenseReluDense.hidden_size = %d_ff
61
+ encoder/DenseReluDense.use_bias = False
62
+
63
+ # Parameters for enc_dec_attention:
64
+ # ==============================================================================
65
+ # None.
66
+
67
+ # Parameters for enc_dec_attention_bias:
68
+ # ==============================================================================
69
+ # None.
70
+
71
+ # Parameters for decoder/EncDecAttention:
72
+ # ==============================================================================
73
+ decoder/EncDecAttention.relative_attention_type = None
74
+
75
+ # Parameters for get_variable_dtype:
76
+ # ==============================================================================
77
+ get_variable_dtype.activation_dtype = 'bfloat16'
78
+
79
+ # Parameters for get_vocab_embedding_cls:
80
+ # ==============================================================================
81
+ # None.
82
+
83
+ # Parameters for get_vocabulary:
84
+ # ==============================================================================
85
+ get_vocabulary.mixture_or_task_name = %MIXTURE_NAME
86
+
87
+ # Parameters for decoder/LayerStack:
88
+ # ==============================================================================
89
+ decoder/LayerStack.dropout_rate = None
90
+ decoder/LayerStack.norm_epsilon = None
91
+ decoder/LayerStack.recompute_grads = False
92
+ decoder/LayerStack.sublayers_final = \
93
+ [@transformer.sublayer_rms_norm, @transformer.sublayer_dropout]
94
+ decoder/LayerStack.sublayers_initial = [@transformer.sublayer_dropout]
95
+ decoder/LayerStack.sublayers_per_layer = \
96
+ [@transformer.sublayer_rms_norm,
97
+ @transformer.sublayer_call_layer,
98
+ @transformer.sublayer_dropout,
99
+ @transformer.sublayer_residual]
100
+
101
+ # Parameters for encoder/LayerStack:
102
+ # ==============================================================================
103
+ encoder/LayerStack.dropout_rate = None
104
+ encoder/LayerStack.norm_epsilon = None
105
+ encoder/LayerStack.recompute_grads = False
106
+ encoder/LayerStack.sublayers_final = \
107
+ [@transformer.sublayer_rms_norm, @transformer.sublayer_dropout]
108
+ encoder/LayerStack.sublayers_initial = [@transformer.sublayer_dropout]
109
+ encoder/LayerStack.sublayers_per_layer = \
110
+ [@transformer.sublayer_rms_norm,
111
+ @transformer.sublayer_call_layer,
112
+ @transformer.sublayer_dropout,
113
+ @transformer.sublayer_residual]
114
+
115
+ # Parameters for learning_rate_schedule_noam:
116
+ # ==============================================================================
117
+ learning_rate_schedule_noam.linear_decay_fraction = 0.0
118
+ learning_rate_schedule_noam.multiplier = 1.0
119
+ learning_rate_schedule_noam.offset = 0
120
+ learning_rate_schedule_noam.warmup_steps = 10000
121
+
122
+ # Parameters for make_bitransformer:
123
+ # ==============================================================================
124
+ make_bitransformer.decoder_name = 'decoder'
125
+ make_bitransformer.encoder_name = 'encoder'
126
+
127
+ # Parameters for decoder/make_layer_stack:
128
+ # ==============================================================================
129
+ decoder/make_layer_stack.block_scope = True
130
+ decoder/make_layer_stack.layers = \
131
+ [('self_attention',
132
+ @mesh_tensorflow.transformer.transformer_layers.SelfAttention),
133
+ ('enc_dec_attention',
134
+ @mesh_tensorflow.transformer.transformer_layers.EncDecAttention),
135
+ ('dense_relu_dense',
136
+ @mesh_tensorflow.transformer.transformer_layers.DenseReluDense)]
137
+ decoder/make_layer_stack.num_layers = %num_layers
138
+
139
+ # Parameters for encoder/make_layer_stack:
140
+ # ==============================================================================
141
+ encoder/make_layer_stack.block_scope = True
142
+ encoder/make_layer_stack.layers = \
143
+ [('self_attention',
144
+ @mesh_tensorflow.transformer.transformer_layers.SelfAttention),
145
+ ('dense_relu_dense',
146
+ @mesh_tensorflow.transformer.transformer_layers.DenseReluDense)]
147
+ encoder/make_layer_stack.num_layers = %num_layers
148
+
149
+ # Parameters for mesh_train_dataset_fn:
150
+ # ==============================================================================
151
+ mesh_train_dataset_fn.mixture_or_task_name = %MIXTURE_NAME
152
+ mesh_train_dataset_fn.pack = True
153
+ mesh_train_dataset_fn.seed = None
154
+ mesh_train_dataset_fn.shuffle = True
155
+ mesh_train_dataset_fn.use_cached = 1
156
+
157
+ # Parameters for noise_span_to_unique_sentinel:
158
+ # ==============================================================================
159
+ # None.
160
+
161
+ # Parameters for nonnoise_span_to_unique_sentinel:
162
+ # ==============================================================================
163
+ # None.
164
+
165
+ # Parameters for pack_dataset:
166
+ # ==============================================================================
167
+ pack_dataset.use_custom_ops = True
168
+
169
+ # Parameters for pack_or_pad:
170
+ # ==============================================================================
171
+ # None.
172
+
173
+ # Parameters for random_spans_helper:
174
+ # ==============================================================================
175
+ random_spans_helper.extra_tokens_per_span_inputs = 1
176
+ random_spans_helper.extra_tokens_per_span_targets = 1
177
+ random_spans_helper.inputs_length = %inputs_length
178
+ random_spans_helper.mean_noise_span_length = %mean_noise_span_length
179
+ random_spans_helper.noise_density = %noise_density
180
+ random_spans_helper.verbose = False
181
+
182
+ # Parameters for random_spans_noise_mask:
183
+ # ==============================================================================
184
+ random_spans_noise_mask.mean_noise_span_length = %mean_noise_span_length
185
+
186
+ # Parameters for random_spans_tokens_length:
187
+ # ==============================================================================
188
+ # None.
189
+
190
+ # Parameters for reduce_concat_tokens:
191
+ # ==============================================================================
192
+ reduce_concat_tokens.batch_size = 128
193
+ reduce_concat_tokens.feature_key = 'targets'
194
+
195
+ # Parameters for rewrite_stack_variables:
196
+ # ==============================================================================
197
+ rewrite_stack_variables.max_combined_variable_size = 536870912
198
+
199
+ # Parameters for run:
200
+ # ==============================================================================
201
+ run.autostack = True
202
+ run.batch_size = ('tokens_per_batch', 65536)
203
+ run.checkpoint_input_pipeline = False
204
+ run.dataset_split = 'train'
205
+ run.ensemble_inputs = None
206
+ run.eval_checkpoint_step = None
207
+ run.eval_dataset_fn = None
208
+ run.eval_summary_dir = None
209
+ run.export_checkpoint_step = None
210
+ run.export_path = ''
211
+ run.init_checkpoint = None
212
+ run.iterations_per_loop = 100
213
+ run.keep_checkpoint_max = None
214
+ run.layout_rules = \
215
+ 'ensemble:ensemble,batch:batch,d_ff:model,heads:model,vocab:model,experts:batch'
216
+ run.learning_rate_schedule = @learning_rate_schedules.learning_rate_schedule_noam
217
+ run.mesh_devices = None
218
+ run.mesh_shape = @mesh_tensorflow.transformer.utils.tpu_mesh_shape()
219
+ run.mode = 'train'
220
+ run.model_type = 'bitransformer'
221
+ run.optimizer = @optimize.AdafactorOptimizer
222
+ run.output_eval_examples = True
223
+ run.perplexity_eval_steps = 100
224
+ run.predict_fn = None
225
+ run.save_checkpoints_steps = 5000
226
+ run.seen_data_init_step = 0
227
+ run.sequence_length = {'inputs': 512, 'targets': 128}
228
+ run.skip_seen_data = False
229
+ run.total_run_steps = None
230
+ run.train_dataset_fn = @t5.models.mesh_transformer.mesh_train_dataset_fn
231
+ run.train_steps = 524288
232
+ run.variable_filter = None
233
+
234
+ # Parameters for select_random_chunk:
235
+ # ==============================================================================
236
+ select_random_chunk.additional_feature_keys = None
237
+ select_random_chunk.additional_passthrough_keys = None
238
+ select_random_chunk.feature_key = 'targets'
239
+ select_random_chunk.max_length = 65536
240
+ select_random_chunk.uniform_random_start = False
241
+
242
+ # Parameters for decoder/SelfAttention:
243
+ # ==============================================================================
244
+ decoder/SelfAttention.attention_func = None
245
+ decoder/SelfAttention.attention_kwargs = None
246
+ decoder/SelfAttention.combine_dims = True
247
+ decoder/SelfAttention.dropout_rate = %dropout_rate
248
+ decoder/SelfAttention.fold_scaling_into_initializer = True
249
+ decoder/SelfAttention.keep_query_heads_dims = False
250
+ decoder/SelfAttention.key_value_size = %d_kv
251
+ decoder/SelfAttention.num_heads = %num_heads
252
+ decoder/SelfAttention.num_memory_heads = 0
253
+ decoder/SelfAttention.relative_attention_num_buckets = 32
254
+ decoder/SelfAttention.relative_attention_type = 'bias_shared'
255
+ decoder/SelfAttention.shared_kv = False
256
+
257
+ # Parameters for encoder/SelfAttention:
258
+ # ==============================================================================
259
+ encoder/SelfAttention.attention_func = None
260
+ encoder/SelfAttention.attention_kwargs = None
261
+ encoder/SelfAttention.combine_dims = True
262
+ encoder/SelfAttention.dropout_rate = %dropout_rate
263
+ encoder/SelfAttention.fold_scaling_into_initializer = True
264
+ encoder/SelfAttention.keep_query_heads_dims = False
265
+ encoder/SelfAttention.key_value_size = %d_kv
266
+ encoder/SelfAttention.num_heads = %num_heads
267
+ encoder/SelfAttention.num_memory_heads = 0
268
+ encoder/SelfAttention.relative_attention_num_buckets = 32
269
+ encoder/SelfAttention.relative_attention_type = 'bias_shared'
270
+ encoder/SelfAttention.shared_kv = False
271
+
272
+ # Parameters for serialize_num_microbatches:
273
+ # ==============================================================================
274
+ serialize_num_microbatches.tokens_per_microbatch_per_replica = 8192
275
+
276
+ # Parameters for SimdMeshImpl:
277
+ # ==============================================================================
278
+ SimdMeshImpl.allreduce_in_bfloat16_max_group_size = 8
279
+
280
+ # Parameters for split_tokens:
281
+ # ==============================================================================
282
+ split_tokens.additional_feature_keys = None
283
+ split_tokens.feature_key = 'targets'
284
+ split_tokens.max_tokens_per_segment = @preprocessors.random_spans_tokens_length()
285
+ split_tokens.min_tokens_per_segment = None
286
+ split_tokens.passthrough_feature_keys = None
287
+
288
+ # Parameters for sublayer_call_layer:
289
+ # ==============================================================================
290
+ # None.
291
+
292
+ # Parameters for sublayer_dropout:
293
+ # ==============================================================================
294
+ sublayer_dropout.dropout_rate = %dropout_rate
295
+
296
+ # Parameters for sublayer_mask_padding:
297
+ # ==============================================================================
298
+ # None.
299
+
300
+ # Parameters for sublayer_residual:
301
+ # ==============================================================================
302
+ # None.
303
+
304
+ # Parameters for sublayer_rms_norm:
305
+ # ==============================================================================
306
+ sublayer_rms_norm.epsilon = 1e-06
307
+ sublayer_rms_norm.name = 'rms_norm'
308
+
309
+ # Parameters for tpu_estimator_model_fn:
310
+ # ==============================================================================
311
+ tpu_estimator_model_fn.hierarchical_tiling_spec = None
312
+ tpu_estimator_model_fn.init_variable_filter = ''
313
+ tpu_estimator_model_fn.model_info_file = ''
314
+ tpu_estimator_model_fn.outer_batch_size = 1
315
+ tpu_estimator_model_fn.tpu_summaries = False
316
+
317
+ # Parameters for tpu_mesh_shape:
318
+ # ==============================================================================
319
+ tpu_mesh_shape.ensemble_parallelism = None
320
+ tpu_mesh_shape.model_parallelism = 1
321
+ tpu_mesh_shape.tpu_topology = '4x4'
322
+
323
+ # Parameters for unit_scaling_convention:
324
+ # ==============================================================================
325
+ unit_scaling_convention.value = False
326
+
327
+ # Parameters for decoder/Unitransformer:
328
+ # ==============================================================================
329
+ decoder/Unitransformer.d_model = %d_model
330
+ decoder/Unitransformer.ensemble = None
331
+ decoder/Unitransformer.input_full_attention = False
332
+ decoder/Unitransformer.label_smoothing = 0.0
333
+ decoder/Unitransformer.loss_denominator = None
334
+ decoder/Unitransformer.loss_fn = None
335
+ decoder/Unitransformer.loss_on_targets_only = False
336
+ decoder/Unitransformer.max_length = 512
337
+ decoder/Unitransformer.positional_embedding = False
338
+ decoder/Unitransformer.shared_embedding_and_softmax_weights = True
339
+ decoder/Unitransformer.sinusoid_positional_embedding = False
340
+ decoder/Unitransformer.token_dropout_rate = 0.0
341
+ decoder/Unitransformer.vocab_divisor = 128
342
+ decoder/Unitransformer.z_loss = 0.0001
343
+
344
+ # Parameters for encoder/Unitransformer:
345
+ # ==============================================================================
346
+ encoder/Unitransformer.d_model = %d_model
347
+ encoder/Unitransformer.ensemble = None
348
+ encoder/Unitransformer.input_full_attention = False
349
+ encoder/Unitransformer.label_smoothing = 0.0
350
+ encoder/Unitransformer.loss_denominator = None
351
+ encoder/Unitransformer.loss_fn = None
352
+ encoder/Unitransformer.loss_on_targets_only = False
353
+ encoder/Unitransformer.max_length = 512
354
+ encoder/Unitransformer.positional_embedding = False
355
+ encoder/Unitransformer.shared_embedding_and_softmax_weights = True
356
+ encoder/Unitransformer.sinusoid_positional_embedding = False
357
+ encoder/Unitransformer.token_dropout_rate = 0.0
358
+ encoder/Unitransformer.vocab_divisor = 128
359
+ encoder/Unitransformer.z_loss = 0.0001
360
+
361
+ # Parameters for unsupervised:
362
+ # ==============================================================================
363
+ unsupervised.preprocessors = \
364
+ [@preprocessors.select_random_chunk,
365
+ @preprocessors.reduce_concat_tokens,
366
+ @preprocessors.split_tokens,
367
+ @preprocessors.denoise]
368
+
369
+ # Parameters for VarianceScalingInitializer:
370
+ # ==============================================================================
371
+ VarianceScalingInitializer.distribution = 'normal'
372
+ VarianceScalingInitializer.mode = 'fan_in'
373
+ VarianceScalingInitializer.scale = 1.0
374
+
375
+ # Parameters for VocabEmbedding:
376
+ # ==============================================================================
377
+ VocabEmbedding.scale_variable_like_classifier_weights = False