# @package __global__
defaults:
  - _self_
  - /model/lm/model_scale: base # prefer this group to set model scale instead of transformer_lm keys directly
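# Selecting a model_scale preset is meant to override the size-related
# transformer_lm keys below (dim, num_heads, num_layers, ...). Hypothetical
# sketch of what such a preset could contain (values are illustrative only,
# see the actual model_scale group files):
#   transformer_lm:
#     dim: 1024
#     num_heads: 16
#     num_layers: 24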
lm_model: transformer_lm
codebooks_pattern:
  modeling: parallel
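  # "parallel" models all n_q codebook streams at the same time step; other
  # interleaving strategies (e.g. a delayed pattern) may be selectable here.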
transformer_lm:
  dim: 512 # transformer model dimension
  num_heads: 8 # number of attention heads
  num_layers: 8 # number of transformer layers
  hidden_scale: 4 # feed-forward hidden size is hidden_scale * dim
  n_q: 8 # number of streams to model
  card: 1024 # cardinality of each codebook (vocabulary size per stream)
  dropout: 0.
  emb_lr: null # optional separate learning rate for the embeddings
  activation: gelu
  norm_first: false # if true, use pre-norm instead of post-norm
  bias_ff: true # use bias for the feedforward
  bias_attn: true # use bias for the attention
  bias_proj: true # use bias for the output projections
  past_context: null # if set, limit attention to this many past steps
  causal: true
  custom: false # use custom MHA implementation
  memory_efficient: false # use flash attention
  attention_as_float32: false # use float32 for the attention part,
                              # recommended at the moment when memory_efficient is True.
  layer_scale: null # LayerScale initial value (null to disable)
  positional_embedding: sin # positional embedding strategy (sin, rope, or sin_rope).
  xpos: false # apply xpos decay (rope only).
  checkpointing: none # layer checkpointing method, can be none, torch, xformers_default.
                      # torch is the slowest but uses the least memory,
                      # xformers_default is somewhere in between.
  weight_init: null # weight initialization (null, gaussian or uniform)
  depthwise_init: null # perform depthwise initialization (null, current, global)
  zero_bias_init: false # initialize biases to zero if the linear layers use a bias
                        # and a weight_init method is used.
  norm: layer_norm # normalization method to use in the transformer.
  cross_attention: false # add cross-attention layers (e.g. for conditioning)
  qk_layer_norm: false # apply layer norm to queries and keys in self-attention
  qk_layer_norm_cross: false # same, but for the cross-attention layers
  attention_dropout: null # dropout for the attention weights (null to reuse `dropout`)
  kv_repeat: 1 # key/value head sharing factor (1 = standard multi-head attention)
  two_step_cfg: false # whether to do true two-step classifier-free guidance, potentially resolving some padding issues
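# Example override (sketch, following the comments above): enable the
# memory-efficient (flash) attention path together with float32 attention:
#   transformer_lm:
#     memory_efficient: true
#     attention_as_float32: true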