File size: 4,588 Bytes
ad16788 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 |
# Copyright 2020 Hirofumi Inaguma
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
"""Transformer common arguments."""
from distutils.util import strtobool
def add_arguments_transformer_common(group):
    """Add Transformer common arguments.

    Registers architecture, optimization, and lightweight/dynamic-convolution
    options shared by Transformer-based ASR recipes.

    Args:
        group: an ``argparse.ArgumentParser`` or argument group — anything
            supporting ``add_argument``.

    Returns:
        The same ``group``, with all Transformer options registered.

    """

    def _strtobool(value):
        # Local replacement for distutils.util.strtobool: distutils was
        # removed from the stdlib in Python 3.12 (PEP 632).  Same contract:
        # return 1 for a truthy string, 0 for a falsy one, and raise
        # ValueError otherwise (argparse reports that as a usage error).
        v = value.lower()
        if v in ("y", "yes", "t", "true", "on", "1"):
            return 1
        if v in ("n", "no", "f", "false", "off", "0"):
            return 0
        raise ValueError("invalid truth value %r" % (value,))

    group.add_argument(
        "--transformer-init",
        type=str,
        default="pytorch",
        choices=[
            "pytorch",
            "xavier_uniform",
            "xavier_normal",
            "kaiming_uniform",
            "kaiming_normal",
        ],
        help="how to initialize transformer parameters",
    )
    group.add_argument(
        "--transformer-input-layer",
        type=str,
        default="conv2d",
        choices=["conv2d", "linear", "embed"],
        help="transformer input layer type",
    )
    group.add_argument(
        "--transformer-attn-dropout-rate",
        default=None,
        type=float,
        help="dropout in transformer attention. use --dropout-rate if None is set",
    )
    group.add_argument(
        "--transformer-lr",
        default=10.0,
        type=float,
        help="Initial value of learning rate",
    )
    group.add_argument(
        "--transformer-warmup-steps",
        default=25000,
        type=int,
        help="optimizer warmup steps",
    )
    group.add_argument(
        "--transformer-length-normalized-loss",
        default=True,
        type=_strtobool,
        help="normalize loss by length",
    )
    group.add_argument(
        "--transformer-encoder-selfattn-layer-type",
        type=str,
        default="selfattn",
        choices=[
            "selfattn",
            "rel_selfattn",
            "lightconv",
            "lightconv2d",
            "dynamicconv",
            "dynamicconv2d",
            "light-dynamicconv2d",
        ],
        help="transformer encoder self-attention layer type",
    )
    group.add_argument(
        "--transformer-decoder-selfattn-layer-type",
        type=str,
        default="selfattn",
        choices=[
            "selfattn",
            "lightconv",
            "lightconv2d",
            "dynamicconv",
            "dynamicconv2d",
            "light-dynamicconv2d",
        ],
        help="transformer decoder self-attention layer type",
    )
    # Lightweight/Dynamic convolution related parameters.
    # See https://arxiv.org/abs/1912.11793v2
    # and https://arxiv.org/abs/1901.10430 for detail of the method.
    # Configurations used in the first paper are in
    # egs/{csj, librispeech}/asr1/conf/tuning/ld_conv/
    group.add_argument(
        "--wshare",
        default=4,
        type=int,
        help="Number of parameter sharing for lightweight convolution",
    )
    group.add_argument(
        "--ldconv-encoder-kernel-length",
        default="21_23_25_27_29_31_33_35_37_39_41_43",
        type=str,
        help="kernel size for lightweight/dynamic convolution: "
        'Encoder side. For example, "21_23_25" means kernel length 21 for '
        "First layer, 23 for Second layer and so on.",
    )
    group.add_argument(
        "--ldconv-decoder-kernel-length",
        default="11_13_15_17_19_21",
        type=str,
        help="kernel size for lightweight/dynamic convolution: "
        'Decoder side. For example, "21_23_25" means kernel length 21 for '
        "First layer, 23 for Second layer and so on.",
    )
    group.add_argument(
        "--ldconv-usebias",
        type=_strtobool,
        default=False,
        help="use bias term in lightweight/dynamic convolution",
    )
    group.add_argument(
        "--dropout-rate",
        default=0.0,
        type=float,
        help="Dropout rate for the encoder",
    )
    # Encoder
    group.add_argument(
        "--elayers",
        default=4,
        type=int,
        help="Number of encoder layers (for shared recognition part "
        "in multi-speaker asr mode)",
    )
    group.add_argument(
        "--eunits",
        "-u",
        default=300,
        type=int,
        help="Number of encoder hidden units",
    )
    # Attention
    group.add_argument(
        "--adim",
        default=320,
        type=int,
        help="Number of attention transformation dimensions",
    )
    group.add_argument(
        "--aheads",
        default=4,
        type=int,
        help="Number of heads for multi head attention",
    )
    # Decoder
    group.add_argument(
        "--dlayers", default=1, type=int, help="Number of decoder layers"
    )
    group.add_argument(
        "--dunits", default=320, type=int, help="Number of decoder hidden units"
    )
    return group
|