---
# Experiment metadata and global runtime settings.
name: "iwslt14_deenfr_prompt"
joeynmt_version: "2.3.0"
model_dir: "iwslt14_prompt"  # checkpoints, vocab files and logs live here
use_cuda: true
fp16: true  # mixed-precision training/inference
random_seed: 42
data:
#train: "iwslt14_prompt/train" # cf. https://wit3.fbk.eu/2014-01
#dev: "iwslt14_prompt/dev"
test: "iwslt14_prompt/test.ref.de-en" # ['TED.dev2010', 'TEDX.dev2012', 'TED.tst2010', 'TED.tst2011', 'TED.tst2012']
dataset_type: "tsv"
sample_dev_subset: 500
src:
lang: "src"
max_length: 512
lowercase: False
normalize: False
level: "bpe"
voc_limit: 32000
voc_min_freq: 1
voc_file: "iwslt14_prompt/src_vocab.txt"
tokenizer_type: "sentencepiece"
tokenizer_cfg:
model_file: "iwslt14_prompt/sp.model"
model_type: "unigram"
character_coverage: 1.0
trg:
lang: "trg"
max_length: 512
lowercase: False
normalize: False
level: "bpe"
voc_limit: 32000
voc_min_freq: 1
voc_file: "iwslt14_prompt/trg_vocab.txt"
tokenizer_type: "sentencepiece"
tokenizer_cfg:
model_file: "iwslt14_prompt/sp.model"
model_type: "unigram"
character_coverage: 1.0
special_symbols:
unk_token: "<unk>"
unk_id: 0
pad_token: "<pad>"
pad_id: 1
bos_token: "<s>"
bos_id: 2
eos_token: "</s>"
eos_id: 3
sep_token: "<sep>"
sep_id: 4
lang_tags: ["<de>", "<en>", "<fr>"]
testing:
  load_model: "iwslt14_prompt/avg5.ckpt"  # checkpoint averaged over best 5
  n_best: 1
  beam_size: 5
  beam_alpha: 1.0  # length penalty for beam search
  batch_size: 32
  batch_type: "sentence"
  max_output_length: 512
  eval_metrics: ["bleu"]
  sacrebleu_cfg:
    tokenize: "13a"
    lowercase: true
training:
#load_model: "iwslt14_prompt/latest.ckpt"
#reset_best_ckpt: True
#reset_scheduler: True
#reset_optimizer: True
#reset_iter_state: True
optimizer: "adamw"
normalization: "tokens"
adam_betas: [0.9, 0.98]
scheduling: "warmupinversesquareroot"
learning_rate_warmup: 10000
learning_rate: 0.0002
learning_rate_min: 0.0000001
weight_decay: 0.001
label_smoothing: 0.1
loss: "crossentropy"
batch_size: 32
batch_type: "sentence"
batch_multiplier: 4
early_stopping_metric: "bleu"
epochs: 50
validation_freq: 1000
logging_freq: 100
overwrite: False
shuffle: True
print_valid_sents: [0, 1, 2, 3]
keep_best_ckpts: 5
model:
  initializer: "xavier_uniform"
  bias_initializer: "zeros"
  init_gain: 1.0
  embed_initializer: "xavier_uniform"
  embed_init_gain: 1.0
  tied_embeddings: true  # share src/trg embeddings (joint vocab via shared SP model)
  tied_softmax: true  # tie output layer weights to trg embeddings
  encoder:
    type: "transformer"
    num_layers: 6
    num_heads: 8
    embeddings:
      embedding_dim: 1024
      scale: true
      dropout: 0.1
    # typically ff_size = 4 x hidden_size
    hidden_size: 1024
    ff_size: 4096
    dropout: 0.1
    layer_norm: "pre"
    activation: "relu"
  decoder:
    type: "transformer"
    num_layers: 6
    num_heads: 8
    embeddings:
      embedding_dim: 1024
      scale: true
      dropout: 0.1
    # typically ff_size = 4 x hidden_size
    hidden_size: 1024
    ff_size: 4096
    dropout: 0.1
    layer_norm: "pre"
    activation: "relu"