File size: 3,966 Bytes
e2baad4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 |
#!/usr/bin/env bash
# Environment setup for the subtitling recipe launcher.

# Export an empty PYTHONPATH (presumably so that modules from the caller's
# environment cannot shadow the ones of the conda environment -- confirm).
export PYTHONPATH=""

# Activate the 'espnet2' conda environment.
# Strict mode is only switched on further down, after the activation script
# has run, so a failed activation would otherwise go unnoticed and the recipe
# would continue with whatever `python` happens to be on PATH. Check it here.
conda_activate=/esat/spchtemp/scratch/jponcele/anaconda3/bin/activate
if ! source "${conda_activate}" espnet2; then
  printf '%s: failed to activate conda environment "espnet2" via %s\n' \
    "$0" "${conda_activate}" >&2
  exit 1
fi

# Print the interpreter version so it shows up in the job log.
python --version

# Strict mode from here on:
#   -e           exit when a command fails
#   -u           treat expansion of an undefined variable as an error
#   -o pipefail  a pipeline fails if any of its stages fails
# (Command tracing, -x, is not enabled.)
set -e
set -u
set -o pipefail
####################################################
# Stage window; forwarded to subs.sh as --stage / --stop_stage.
stage="11"
stop_stage="11"
####################################################
# Notes: no speed perturbation, no LM, no word LM, no n-gram LM.

# --- Parallelism ----------------------------------------------------------
nj="4"
ngpu="1"

# --- Experiment locations -------------------------------------------------
outdir="/esat/spchtemp/scratch/jponcele/espnet2"
expdir="${outdir}/exp/exp-subs-chained"
st_tag="train_subtitling_chained_PL_C8_new"
st_stats_dir="/esat/spchtemp/scratch/jponcele/espnet2/exp/exp-st/st_stats_fbank_pitch_vl_joint_bpe5000"

# --- Data sets --------------------------------------------------------------
# Three families of train/valid/test set names: st_*, asr_* and subs_*.
st_train_set="st_train"
st_valid_set="st_valid"
st_test_set="subs_annot"

asr_train_set="train_s"
asr_valid_set="valid_s"
asr_test_set="dev_s"

subs_train_set="subs_train"
subs_valid_set="subs_valid"
subs_test_set="subs_test"

# Semicolon-separated component lists; embedded verbatim in local_data_opts.
traincomps="a;b;c;d;f;g;h;i;j;k;l;m;n;o"
decodecomps="b;f;g;h;i;j;k;l;m;n;o"

# Option strings handed to subs.sh as single arguments
# (--local_data_opts / --local_subs_opts).
local_data_opts="--repstr false --lowercase true --outdir data --traincomps ${traincomps} --decodecomps ${decodecomps}"
subs_dir="/users/spraak/jponcele/vrt-scraper/vrtnew_subtitles_4feb"
local_subs_opts="--outdir data --subsdir ${subs_dir}"

feats_type="fbank_pitch"

# --- Language model ---------------------------------------------------------
use_lm="false"
use_word_lm="false"   # not yet supported!
use_ngram="false"
lm_config="conf/train_lm_transformer.yaml"

# --- ST model, training and inference ---------------------------------------
# utterance_mvn: recommended for pretrained models instead of globalmvn.
feats_normalize="utterance_mvn"
st_config="conf/tuning/train_subtitling_chained_C8_new.yaml"
inference_config="conf/st_decode_chained.yaml"
inference_nj="64"
# Alternative checkpoint noted by the author: valid.acc_asr.ave.pth
inference_st_model="averaged_model_81epochs.pth"
# Author's note: add "--input_size 0" to use raw audio for a w2v2 encoder.
st_args="--batch_type custom_folded --valid_batch_type custom_folded"
# Run the recipe driver ./subs.sh with the configuration defined above.
#
# The options are collected in an array rather than in one long
# backslash-continued command so that
#   * every expansion is quoted (a value containing whitespace or glob
#     characters is passed as exactly one argument), and
#   * an option can be disabled by commenting out its line without risking a
#     broken line continuation.
subs_args=(
  # Pipeline control and resources.
  --stage "${stage}"
  --stop_stage "${stop_stage}"
  --ngpu "${ngpu}"
  --nj "${nj}"
  --gpu_inference false
  --dumpdir "${outdir}/dump"
  --expdir "${expdir}"

  # Audio / feature settings.
  --feats_type "${feats_type}"
  --audio_format wav
  --min_wav_duration 0.1
  --max_wav_duration 30

  # Tokenisation (same settings for source and target side).
  --token_joint true
  --src_token_type bpe
  --src_nbpe 5000
  --src_bpemode unigram
  --src_case lc
  --tgt_token_type bpe
  --tgt_nbpe 5000
  --tgt_bpemode unigram
  --tgt_case lc
  --oov "<unk>"
  --lang "vl"
  --src_lang "verbatim"
  --tgt_lang "subtitle"

  # Data preparation option strings; each is passed as a single argument.
  --local_subs_opts "${local_subs_opts}"
  --local_data_opts "${local_data_opts}"

  # Language model switches.
  --use_lm "${use_lm}"
  --use_word_lm "${use_word_lm}"
  --lm_config "${lm_config}"
  --use_ngram "${use_ngram}"

  # ST training and inference.
  --st_config "${st_config}"
  --st_args "${st_args}"
  --st_tag "${st_tag}"
  --inference_config "${inference_config}"
  --inference_nj "${inference_nj}"
  --feats_normalize "${feats_normalize}"

  # Data set names.
  --st_train_set "${st_train_set}"
  --st_valid_set "${st_valid_set}"
  --st_test_set "${st_test_set}"
  --asr_train_set "${asr_train_set}"
  --asr_valid_set "${asr_valid_set}"
  --asr_test_set "${asr_test_set}"
  --subs_train_set "${subs_train_set}"
  --subs_valid_set "${subs_valid_set}"
  --subs_test_set "${subs_test_set}"

  --st_stats_dir "${st_stats_dir}"
  --inference_st_model "${inference_st_model}"
  # Disabled option (pretrained_asr is not defined in this script):
  # --pretrained_asr "${pretrained_asr}"
)

./subs.sh "${subs_args[@]}"
#train_set=train_si284
#valid_set=test_dev93
#test_sets="test_dev93 test_eval92"
#
#./asr.sh \
# --lang "en" \
# --use_lm true \
# --token_type char \
# --nbpe 80 \
# --nlsyms_txt data/nlsyms.txt \
# --lm_config conf/train_lm_transformer.yaml \
# --asr_config conf/train_asr_transformer.yaml \
# --inference_config conf/decode.yaml \
# --train_set "${train_set}" \
# --valid_set "${valid_set}" \
# --test_sets "${test_sets}" \
# --bpe_train_text "data/train_si284/text" \
# --lm_train_text "data/train_si284/text data/local/other_text/text" "$@"
|