File size: 1,885 Bytes
62e9ca6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65

# ####################################
# Hubert SCT2T ED model #
# ####################################

# Positional arguments (all optional):
#   $1 world_size  - number of GPUs / distributed workers  (default: 8)
#   $2 update_freq - gradient-accumulation steps           (default: 1)
#   $3 exp_name    - experiment name; default is derived from the two above.
#
# ${N:-default} substitutes the default when the argument is unset OR empty,
# matching the original `[ -z $var ] && var=default` intent while avoiding
# the unquoted-expansion bug (the old test word-split / mis-parsed when an
# argument contained spaces or glob characters).
world_size=${1:-8}
update_freq=${2:-1}
# NOTE(review): the trailing "6666" in the default name looks like a typo,
# but it is part of the on-disk experiment path, so it is preserved as-is.
exp_name=${3:-sc2t_base_enes_${world_size}gpu_${update_freq}accum6666}


# Location of the fairseq checkout and of the Hydra config directory.
FAIRSEQ_ROOT=/mnt/output/users/v-kunwei/code/fairseq_mlstku
CONFIG_DIR=/mnt/output/users/v-kunwei/code/stpretrain_scripts/config
# Speech (audio + km labels) and binarized text data for the en-es pair.
DATA_DIR="/mnt/output/users/v-kunwei/data/s2s_data/speech_enes"
TEXT_DATA_DIR="/mnt/output/users/v-kunwei/data/s2s_data/text_enes/bin-idx"
# Per-experiment output dir (checkpoints, tensorboard, hydra run dir).
# NOTE(review): this path lacks the "users/" segment the other paths have
# ("/mnt/output/v-kunwei/..." vs "/mnt/output/users/v-kunwei/...") — confirm
# this is intentional before relying on it.
MODEL_DIR="/mnt/output/v-kunwei/data/s2s_data/exp/S2S_enes/$exp_name"

# `mkdir -p` is a no-op when the directory already exists, so the explicit
# `[ -d ... ] ||` guard is unnecessary; quoting protects paths with spaces.
mkdir -p "$MODEL_DIR"


# Launch fairseq Hydra training for the SC2T (speech+text joint pretraining)
# base model, starting from the `sc2t_base_librispeech` config and overriding
# it for the en-es data above.
#
# Hydra override conventions used below:
#   `key=value`  - override an existing config field
#   `+key=value` - add a field that is absent from the base config
# NOTE: bash line continuations (`\`) cannot carry inline comments, so the
# command lines themselves are left exactly as written; the bare `\` lines
# are blank separators between override groups (task / criterion / model /
# dataset / distributed / logging).
#
# Groups, in order:
#   - task.*       : km label setup, speech + text data locations,
#                    text max-tokens ratio
#   - criterion.*  : cross-entropy decoder loss, text loss weight
#   - model.*      : relative positional encodings, code padding,
#                    text-transformer embedding options, shared
#                    decoder input/output embeddings
#   - dataset.*    : train/valid subsets pairing speech ("train_all") with
#                    text ("en.kmu-spm"); escaped quotes keep the `+` and
#                    `-` characters intact through Hydra's parser
#   - optimization.* / distributed_training.* : accumulation, max updates,
#                    world size from the script arguments
#   - common/checkpoint/hydra.* : all run artifacts routed to $MODEL_DIR
python $FAIRSEQ_ROOT/fairseq_cli/hydra_train.py \
  --config-dir $CONFIG_DIR/pretrain \
  --config-name sc2t_base_librispeech \
  \
  +task.store_labels=true \
  task.labels='["km"]' \
  model.label_rate=50 \
  task.data=$DATA_DIR \
  task.label_dir=$DATA_DIR \
  task.text_cfg.text_data=$TEXT_DATA_DIR \
  +task.text_cfg.data_config=config.yaml \
  task.text_cfg.text_maxtokens_ratio=3.0 \
  \
  +criterion.dec_loss_type="ce" \
  \
  criterion.text_weight=1.0 \
  \
  model.use_rel_pos_enc=true \
  +model.code_use_rel_pos_enc=true \
  +model.pad_with_code=true \
  model.text_transformer.no_scale_embedding=true \
  model.text_transformer.layernorm_embedding=true \
  +model.share_decoder_input_output_embed=true \
  \
  dataset.train_subset=\"train_all+en.kmu-spm\" \
  dataset.valid_subset=\"valid+en_valid.kmu-spm\" \
  dataset.num_workers=0 \
  dataset.max_tokens=1000000 \
  optimization.update_freq=[${update_freq}] \
  optimization.max_update=400000 \
  \
  distributed_training.distributed_world_size=${world_size} \
  \
  common.tensorboard_logdir=$MODEL_DIR \
  checkpoint.save_dir=$MODEL_DIR \
  hydra.run.dir=$MODEL_DIR \
  hydra.job.name=${exp_name}


# Pause for five minutes after training exits — presumably to let the cluster
# scheduler / log collector flush before the job terminates (TODO confirm).
# 300 seconds == the original `5m`, but the plain-seconds form is POSIX-portable.
sleep 300
echo "All finished"