wangpuupup commited on
Commit
60c3c32
1 Parent(s): 7ef953f

Upload 12 files

Browse files
RESULTS.md ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # RESULTS
2
+ ## Environments
3
+ - date: `Wed Aug 7 20:16:17 CEST 2024`
4
+ - python version: `3.10.14 (main, May 6 2024, 19:42:50) [GCC 11.2.0]`
5
+ - espnet version: `espnet 202402`
6
+ - pytorch version: `pytorch 2.1.0+cu121`
7
+
8
+ ##cgn/decode_s2t_nl_s2t_model_valid.acc.ave/cgn_test/
9
+ ### WER
10
+
11
+ #Sentences: 51615
12
+ #Words: 782520
13
+ Error_Rate: 20.79% S+I+D + 0.51% C
14
+ Details: (#S #I #D #C) 93369 41619 27685 2329+1678
15
+
16
+ ### CER
17
+
18
+ #Sentences: 51615
19
+ #Words: 12956975
20
+ Error_Rate: 9.18% S+I+D + 0.00% C
21
+ Details: (#S #I #D #C) 175585 574851 439205 0+0
resources/TODO ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ add to nl_fillers.lst:
2
+
3
+ oh,filler|<h>|
4
+ ah,filler|<h>|
5
+ pff,filler|<h>|
6
+
7
+
8
+ add to nl_nbest.lst:
9
+ George|Georges|
resources/nl_abbrev.lst ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ z'n|zijn|
2
+ d'r|er|
3
+ 't|het|
4
+ 'ns|eens|
5
+ oke|oké|
resources/nl_fillers.lst ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ggg|<g>
2
+ ggg,ggg|<g>|
3
+ xxx|<x>|
4
+ mm-hu,filler|<hm>|
5
+ mm-hu|<hm>|
6
+ uhm|<hm>|
7
+ hm|<hm>
8
+ uh,filler|<h>|
9
+ uh|<h>|
10
+ euh|<h>|
11
+ he|<he>|
12
+ hè|<he>|
13
+ hè,filler|<he>|
14
+ hé|<he>|
15
+ oh-filler|<oh>|
16
+ ..|<PUNCT>|
17
+ ...|<PUNCT>|
resources/nl_getallen100.lst ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ één-en|eenen_|
2
+ twee-en|tweeën_|
3
+ drie-en|drieën_|
4
+ vier-en|vieren_|
5
+ vijf-en|vijfen_|
6
+ zes-en|zesen_|
7
+ zeven-en|zevenen_|
8
+ acht-en|achten_|
9
+ negen-en|negenen_|
10
+ n_ twintig|ntwintig|
11
+ n_ dertig|ndertig|
12
+ n_ veertig|nveertig|
13
+ n_ vijftig|nvijftig|
14
+ n_ zestig|nzestig|
15
+ n_ zeventig|nzeventig|
16
+ n_ tachtig|ntachtig|
17
+ n_ negentig|nnegentig|
resources/nl_getallen1000.lst ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ twee honderd|tweehonderd|
2
+ drie honderd|driehonderd|
3
+ vier honderd|vierhonderd|
4
+ vijf honderd|vijfhonderd|
5
+ zes honderd|zeshonderd|
6
+ zeven honderd|zevenhonderd|
7
+ acht honderd|achthonderd|
8
+ negen honderd|negenhonderd|
9
+ elf honderd|elfhonderd|
10
+ twaalf honderd|twaalfhonderd|
11
+ dertien honderd|dertienhonderd|
12
+ veertien honderd|veertienhonderd|
13
+ vijftien honderd|vijftienhonderd|
14
+ zestien honderd|zestienhonderd|
15
+ zeventien honderd|zeventienhonderd|
16
+ achttien honderd|achttienhonderd|
17
+ negentien honderd|negentienhonderd|
18
+ honderd en |honderden|
19
+ honderd één |honderdeen |
20
+ honderd twee |honderdtwee |
21
+ honderd drie |honderddrie |
22
+ honderd vier |honderdvier |
23
+ honderd vijf|honderdvijf|
24
+ honderd zes|honderdzes|
25
+ honderd zeven|honderdzeven|
26
+ honderd acht|honderdacht|
27
+ honderd negen|honderdnegen|
28
+ honderd tien|honderdtien|
29
+ honderd elf|honderdelf|
30
+ honderd twaalf|honderdtwaalf|
31
+ honderd dertien|honderddertien|
32
+ honderd veertien|honderdveertien|
33
+ honderd eenen|honderdeenen|
34
+ honderd tweeën|honderdtweeën|
35
+ honderd drieën|honderddrieën|
36
+ honderd vieren|honderdvieren|
37
+ honderd vijfen|honderdvijfen|
38
+ honderd zesen|honderdzesen|
39
+ honderd zevenen|honderdzevenen|
40
+ honderd achten|honderdachten|
41
+ honderd negenen|honderdnegenen|
resources/nl_nbest.lst ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Rosetta-plan|Rosettaplan|
2
+ Congo|Kongo|
3
+ Bazel|Basel|
4
+ A twaalf|A12|
5
+ E veertig|E40|
6
+ E zeventien|E17|
7
+ E negentien|E19|
8
+ E driehonderd dertien|E313|
9
+ N negen|N9|
10
+ R vier|R4|
11
+ CO twee|CO2|
12
+ Microshift|Microsoft|
13
+ Jurasic|Jurassic|
14
+ stefaan declerck|stefaan de clerck|
15
+ sonckx|sonck|
16
+ cerclub|sercle|
17
+ van Sandvliet|van santvliet|
18
+ van Noppen|vannoppen|
19
+ van Der Schoot|vanderschoot|
20
+ van der schoot|vanderschoot|
21
+ Sint Joris Weert|Sint-Joris-Weert|
resources/nl_rm_fillers.lst ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ggg|
2
+ ggg,ggg|
3
+ xxx|
4
+ mm-hu,filler|
5
+ mm-hu|
6
+ uhm|
7
+ hm|
8
+ uh,filler|
9
+ uh|
10
+ euh|
11
+ he|
12
+ hè|
13
+ hè,filler|
14
+ hé|
15
+ oh-filler|
16
+ ..|
17
+ ...|
18
+ <g>|
19
+ <x>|
20
+ <hm>|
21
+ <h>|
22
+ <he>|
23
+ <PUNCT>|
24
+ uhu|
resources/nl_rm_unk.lst ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ <unk>|
2
+ <UNK>|
run.sh ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ # Set bash to 'debug' mode, it will exit on :
3
+ # -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
4
+ set -e
5
+ set -u
6
+ set -o pipefail
7
+
8
+ train_set="cgn_train"
9
+ valid_set="cgn_valid"
10
+ test_sets="cgn_test"
11
+
12
+ nbpe=20000
13
+ s2t_config=conf/train_cgn.yaml
14
+ inference_config=conf/decode_s2t_nl.yaml
15
+
16
+ ./s2t.sh \
17
+ --s2t_task s2t_wadp \
18
+ --stage 11 \
19
+ --stop_stage 13 \
20
+ --use_lm false \
21
+ --ngpu 1 \
22
+ --nj 1 \
23
+ --gpu_inference true \
24
+ --inference_nj 1 \
25
+ --feats_type raw \
26
+ --audio_format flac.ark \
27
+ --expdir /espnet/egs2/owsm_v1/s2t1/cgn \
28
+ --token_type bpe \
29
+ --nbpe ${nbpe} \
30
+ --s2t_config "${s2t_config}" \
31
+ --inference_config "${inference_config}" \
32
+ --feats_normalize global_mvn \
33
+ --s2t_args "--model_conf extract_feats_in_collect_stats=false" \
34
+ --s2t_stats_dir /exp/s2t_stats_raw_bpe20000 \
35
+ --train_set "${train_set}" \
36
+ --valid_set "${valid_set}" \
37
+ --test_sets "${test_sets}" \
38
+ --bpe_train_text "dump/raw/${train_set}/text" \
39
+ --bpe_nlsyms data/nlsyms.txt \
40
+ --lm_train_text "dump/raw/${train_set}/text" "$@"
s2t.sh ADDED
@@ -0,0 +1,1730 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+
3
+ # Set bash to 'debug' mode, it will exit on :
4
+ # -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
5
+ set -e
6
+ set -u
7
+ set -o pipefail
8
+
9
+ log() {
10
+ local fname=${BASH_SOURCE[1]##*/}
11
+ echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
12
+ }
13
+ min() {
14
+ local a b
15
+ a=$1
16
+ for b in "$@"; do
17
+ if [ "${b}" -le "${a}" ]; then
18
+ a="${b}"
19
+ fi
20
+ done
21
+ echo "${a}"
22
+ }
23
+
24
+ SECONDS=0
25
+
26
+ # General configuration
27
+ stage=1 # Processes starts from the specified stage.
28
+ stop_stage=10000 # Processes is stopped at the specified stage.
29
+ skip_stages= # Spicify the stage to be skipped
30
+ skip_data_prep=false # Skip data preparation stages.
31
+ skip_train=false # Skip training stages.
32
+ skip_eval=false # Skip decoding and evaluation stages.
33
+ skip_packing=true # Skip the packing stage.
34
+ skip_upload_hf=true # Skip uploading to huggingface stage.
35
+ eval_valid_set=false # Run decoding for the validation set
36
+ ngpu=1 # The number of gpus ("0" uses cpu, otherwise use gpu).
37
+ num_nodes=1 # The number of nodes.
38
+ nj=32 # The number of parallel jobs.
39
+ inference_nj=32 # The number of parallel jobs in decoding.
40
+ gpu_inference=false # Whether to perform gpu decoding.
41
+ dumpdir=dump # Directory to dump features.
42
+ expdir=exp # Directory to save experiments.
43
+ python=python3 # Specify python to execute espnet commands.
44
+
45
+ # Data preparation related
46
+ local_data_opts= # The options given to local/data.sh.
47
+ post_process_local_data_opts= # The options given to local/data.sh for additional processing in stage 4.
48
+
49
+ # Speed perturbation related
50
+ speed_perturb_factors= # perturbation factors, e.g. "0.9 1.0 1.1" (separated by space).
51
+
52
+ # Feature extraction related
53
+ feats_type=raw # Feature type (raw, raw_copy, fbank_pitch, or extracted).
54
+ audio_format=flac # Audio format: wav, flac, wav.ark, flac.ark (only in feats_type=raw).
55
+ multi_columns_input_wav_scp=false # Enable multi columns mode for input wav.scp for format_wav_scp.py
56
+ multi_columns_output_wav_scp=false # Enable multi columns mode for output wav.scp for format_wav_scp.py
57
+ fs=16k # Sampling rate.
58
+ min_wav_duration=0.1 # Minimum duration in second.
59
+ max_wav_duration=30.5 # Maximum duration in second.
60
+
61
+ # Tokenization related
62
+ token_type=bpe # Tokenization type (char or bpe).
63
+ nbpe=30 # The number of BPE vocabulary.
64
+ bpemode=unigram # Mode of BPE (unigram or bpe).
65
+ oov="<unk>" # Out of vocabulary symbol.
66
+ blank="<blank>" # CTC blank symbol
67
+ sos="<sos>" # Start of sentence symbol
68
+ eos="<eos>" # End of sentence symbol
69
+ sop="<sop>" # Start of prev/prompt symbol
70
+ bpe_input_sentence_size=100000000 # Size of input sentence for BPE.
71
+ bpe_nlsyms= # non-linguistic symbols list, separated by a comma or a file containing 1 symbol per line, for BPE
72
+ bpe_char_cover=1.0 # character coverage when modeling BPE
73
+ hugging_face_model_name_or_path="" # Hugging Face model or path for hugging_face tokenizer
74
+
75
+ # Ngram model related
76
+ use_ngram=false
77
+ ngram_exp=
78
+ ngram_num=3
79
+
80
+ # Language model related
81
+ use_lm=true # Use language model for decoding.
82
+ lm_tag= # Suffix to the result dir for language model training.
83
+ lm_exp= # Specify the directory path for LM experiment.
84
+ # If this option is specified, lm_tag is ignored.
85
+ lm_stats_dir= # Specify the directory path for LM statistics.
86
+ lm_config= # Config for language model training.
87
+ lm_args= # Arguments for language model training, e.g., "--max_epoch 10".
88
+ # Note that it will overwrite args in lm config.
89
+ use_word_lm=false # Whether to use word language model.
90
+ num_splits_lm=1 # Number of splitting for lm corpus.
91
+ # shellcheck disable=SC2034
92
+ word_vocab_size=10000 # Size of word vocabulary.
93
+
94
+ # S2T model related
95
+ s2t_task=s2t
96
+ s2t_tag= # Suffix to the result dir for s2t model training.
97
+ s2t_exp= # Specify the directory path for s2t experiment.
98
+ # If this option is specified, s2t_tag is ignored.
99
+ s2t_stats_dir= # Specify the directory path for s2t statistics.
100
+ s2t_config= # Config for s2t model training.
101
+ s2t_args= # Arguments for s2t model training, e.g., "--max_epoch 10".
102
+ # Note that it will overwrite args in s2t config.
103
+ feats_normalize=global_mvn # Normalizaton layer type.
104
+ num_splits_s2t=1 # Number of splitting for lm corpus.
105
+ num_ref=1 # Number of references for training.
106
+ # In supervised learning based speech enhancement / separation, it is equivalent to number of speakers.
107
+ num_inf= # Number of inferences output by the model
108
+ # Note that if it is not specified, it will be the same as num_ref. Otherwise, it will be overwritten.
109
+ # In MixIT, number of outputs is larger than that of references.
110
+
111
+ # Upload model related
112
+ hf_repo=
113
+
114
+ # Decoding related
115
+ use_streaming=false # Whether to use streaming decoding
116
+
117
+ batch_size=1
118
+ inference_tag= # Suffix to the result dir for decoding.
119
+ inference_config= # Config for decoding.
120
+ inference_args= # Arguments for decoding, e.g., "--lm_weight 0.1".
121
+ # Note that it will overwrite args in inference config.
122
+ inference_lm=valid.loss.ave.pth # Language model path for decoding.
123
+ inference_ngram=${ngram_num}gram.bin
124
+ inference_s2t_model=valid.acc.ave.pth # S2T model path for decoding.
125
+ # e.g.
126
+ # inference_s2t_model=train.loss.best.pth
127
+ # inference_s2t_model=3epoch.pth
128
+ # inference_s2t_model=valid.acc.best.pth
129
+ # inference_s2t_model=valid.loss.ave.pth
130
+ download_model= # Download a model from Model Zoo and use it for decoding.
131
+
132
+ # [Task dependent] Set the datadir name created by local/data.sh
133
+ train_set= # Name of training set.
134
+ valid_set= # Name of validation set used for monitoring/tuning network training.
135
+ test_sets= # Names of test sets. Multiple items (e.g., both dev and eval sets) can be specified.
136
+ bpe_train_text= # Text file path of bpe training set.
137
+ lm_train_text= # Text file path of language model training set.
138
+ lm_dev_text= # Text file path of language model development set.
139
+ lm_test_text= # Text file path of language model evaluation set.
140
+ nlsyms_txt=none # Non-linguistic symbol list if existing.
141
+ cleaner=none # Text cleaner.
142
+ hyp_cleaner=none # Text cleaner for hypotheses (may be used with external tokenizers)
143
+ g2p=none # g2p method (needed if token_type=phn).
144
+ lang=noinfo # The language type of corpus.
145
+ score_opts= # The options given to sclite scoring
146
+ local_score_opts= # The options given to local/score.sh.
147
+ s2t_speech_fold_length=800 # fold_length for speech data during S2T training.
148
+ s2t_text_fold_length=150 # fold_length for text data during S2T training.
149
+ lm_fold_length=150 # fold_length for LM training.
150
+
151
+ help_message=$(cat << EOF
152
+ Usage: $0 --train_set "<train_set_name>" --valid_set "<valid_set_name>" --test_sets "<test_set_names>"
153
+
154
+ Options:
155
+ # General configuration
156
+ --stage # Processes starts from the specified stage (default="${stage}").
157
+ --stop_stage # Processes is stopped at the specified stage (default="${stop_stage}").
158
+ --skip_stages # Spicify the stage to be skipped (default="${skip_stages}").
159
+ --skip_data_prep # Skip data preparation stages (default="${skip_data_prep}").
160
+ --skip_train # Skip training stages (default="${skip_train}").
161
+ --skip_eval # Skip decoding and evaluation stages (default="${skip_eval}").
162
+ --skip_packing # Skip the packing stage (default="${skip_packing}").
163
+ --skip_upload_hf # Skip uploading to huggingface stage (default="${skip_upload_hf}").
164
+ --eval_valid_set # Run decoding for the validation set (default="${eval_valid_set}").
165
+ --ngpu # The number of gpus ("0" uses cpu, otherwise use gpu, default="${ngpu}").
166
+ --num_nodes # The number of nodes (default="${num_nodes}").
167
+ --nj # The number of parallel jobs (default="${nj}").
168
+ --inference_nj # The number of parallel jobs in decoding (default="${inference_nj}").
169
+ --gpu_inference # Whether to perform gpu decoding (default="${gpu_inference}").
170
+ --dumpdir # Directory to dump features (default="${dumpdir}").
171
+ --expdir # Directory to save experiments (default="${expdir}").
172
+ --python # Specify python to execute espnet commands (default="${python}").
173
+
174
+ # Data preparation related
175
+ --local_data_opts # The options given to local/data.sh (default="${local_data_opts}").
176
+
177
+ # Speed perturbation related
178
+ --speed_perturb_factors # speed perturbation factors, e.g. "0.9 1.0 1.1" (separated by space, default="${speed_perturb_factors}").
179
+
180
+ # Feature extraction related
181
+ --feats_type # Feature type (raw, raw_copy, fbank_pitch or extracted, default="${feats_type}").
182
+ --audio_format # Audio format: wav, flac, wav.ark, flac.ark (only in feats_type=raw or raw_copy, default="${audio_format}").
183
+ --fs # Sampling rate (default="${fs}").
184
+ --min_wav_duration # Minimum duration in second (default="${min_wav_duration}").
185
+ --max_wav_duration # Maximum duration in second (default="${max_wav_duration}").
186
+
187
+ # Tokenization related
188
+ --token_type # Tokenization type (char or bpe, default="${token_type}").
189
+ --nbpe # The number of BPE vocabulary (default="${nbpe}").
190
+ --bpemode # Mode of BPE (unigram or bpe, default="${bpemode}").
191
+ --oov # Out of vocabulary symbol (default="${oov}").
192
+ --blank # CTC blank symbol (default="${blank}").
193
+ --sos # sos symbol (default="${sos}").
194
+ --eos # eos symbol (default="${eos}").
195
+ --sop # sop symbol (default="${sop}").
196
+ --bpe_input_sentence_size # Size of input sentence for BPE (default="${bpe_input_sentence_size}").
197
+ --bpe_nlsyms # Non-linguistic symbol list for sentencepiece, separated by a comma or a file containing 1 symbol per line . (default="${bpe_nlsyms}").
198
+ --bpe_char_cover # Character coverage when modeling BPE (default="${bpe_char_cover}").
199
+
200
+ # Language model related
201
+ --lm_tag # Suffix to the result dir for language model training (default="${lm_tag}").
202
+ --lm_exp # Specify the directory path for LM experiment.
203
+ # If this option is specified, lm_tag is ignored (default="${lm_exp}").
204
+ --lm_stats_dir # Specify the directory path for LM statistics (default="${lm_stats_dir}").
205
+ --lm_config # Config for language model training (default="${lm_config}").
206
+ --lm_args # Arguments for language model training (default="${lm_args}").
207
+ # e.g., --lm_args "--max_epoch 10"
208
+ # Note that it will overwrite args in lm config.
209
+ --use_word_lm # Whether to use word language model (default="${use_word_lm}").
210
+ --word_vocab_size # Size of word vocabulary (default="${word_vocab_size}").
211
+ --num_splits_lm # Number of splitting for lm corpus (default="${num_splits_lm}").
212
+
213
+ # S2T model related
214
+ --s2t_tag # Suffix to the result dir for s2t model training (default="${s2t_tag}").
215
+ --s2t_exp # Specify the directory path for S2T experiment.
216
+ # If this option is specified, s2t_tag is ignored (default="${s2t_exp}").
217
+ --s2t_stats_dir # Specify the directory path for S2T statistics (default="${s2t_stats_dir}").
218
+ --s2t_config # Config for S2T model training (default="${s2t_config}").
219
+ --s2t_args # Arguments for S2T model training (default="${s2t_args}").
220
+ # e.g., --s2t_args "--max_epoch 10"
221
+ # Note that it will overwrite args in s2t config.
222
+ --feats_normalize # Normalizaton layer type (default="${feats_normalize}").
223
+ --num_splits_s2t # Number of splitting for lm corpus (default="${num_splits_s2t}").
224
+ --num_ref # Number of references for training (default="${num_ref}").
225
+ # In supervised learning based speech recognition, it is equivalent to number of speakers.
226
+ --num_inf # Number of inference audio generated by the model (default="${num_inf}")
227
+ # Note that if it is not specified, it will be the same as num_ref. Otherwise, it will be overwritten.
228
+
229
+ # Decoding related
230
+ --inference_tag # Suffix to the result dir for decoding (default="${inference_tag}").
231
+ --inference_config # Config for decoding (default="${inference_config}").
232
+ --inference_args # Arguments for decoding (default="${inference_args}").
233
+ # e.g., --inference_args "--lm_weight 0.1"
234
+ # Note that it will overwrite args in inference config.
235
+ --inference_lm # Language model path for decoding (default="${inference_lm}").
236
+ --inference_s2t_model # S2T model path for decoding (default="${inference_s2t_model}").
237
+ --download_model # Download a model from Model Zoo and use it for decoding (default="${download_model}").
238
+ --use_streaming # Whether to use streaming decoding (default="${use_streaming}").
239
+
240
+ # [Task dependent] Set the datadir name created by local/data.sh
241
+ --train_set # Name of training set (required).
242
+ --valid_set # Name of validation set used for monitoring/tuning network training (required).
243
+ --test_sets # Names of test sets.
244
+ # Multiple items (e.g., both dev and eval sets) can be specified (required).
245
+ --bpe_train_text # Text file path of bpe training set.
246
+ --lm_train_text # Text file path of language model training set.
247
+ --lm_dev_text # Text file path of language model development set (default="${lm_dev_text}").
248
+ --lm_test_text # Text file path of language model evaluation set (default="${lm_test_text}").
249
+ --nlsyms_txt # Non-linguistic symbol list if existing (default="${nlsyms_txt}").
250
+ --cleaner # Text cleaner (default="${cleaner}").
251
+ --g2p # g2p method (default="${g2p}").
252
+ --lang # The language type of corpus (default=${lang}).
253
+ --score_opts # The options given to sclite scoring (default="{score_opts}").
254
+ --local_score_opts # The options given to local/score.sh (default="{local_score_opts}").
255
+ --s2t_speech_fold_length # fold_length for speech data during S2T training (default="${s2t_speech_fold_length}").
256
+ --s2t_text_fold_length # fold_length for text data during S2T training (default="${s2t_text_fold_length}").
257
+ --lm_fold_length # fold_length for LM training (default="${lm_fold_length}").
258
+ EOF
259
+ )
260
+
261
+ log "$0 $*"
262
+ # Save command line args for logging (they will be lost after utils/parse_options.sh)
263
+ run_args=$(scripts/utils/print_args.sh $0 "$@")
264
+ . utils/parse_options.sh
265
+
266
+ if [ $# -ne 0 ]; then
267
+ log "${help_message}"
268
+ log "Error: No positional arguments are required."
269
+ exit 2
270
+ fi
271
+
272
+ . ./path.sh
273
+ . ./cmd.sh
274
+
275
+
276
+ # Check required arguments
277
+ if ! "${skip_train}"; then
278
+ [ -z "${train_set}" ] && { log "${help_message}"; log "Error: --train_set is required"; exit 2; };
279
+ [ -z "${valid_set}" ] && { log "${help_message}"; log "Error: --valid_set is required"; exit 2; };
280
+ fi
281
+ if ! "${eval_valid_set}"; then
282
+ [ -z "${test_sets}" ] && { log "${help_message}"; log "Error: --test_sets is required"; exit 2; };
283
+ else
284
+ [ -z "${valid_set}" ] && { log "${help_message}"; log "Error: --valid_set is required"; exit 2; };
285
+ fi
286
+
287
+ if [ -n "${train_set}" ] && [ "${train_set}" = "${valid_set}" ]; then
288
+ log "Error: train_set and valid_set must be different. --train_set ${train_set} --valid_set ${valid_set}"
289
+ exit 1
290
+ fi
291
+
292
+ _test_sets=
293
+ for dset in ${test_sets}; do
294
+ if [ "${dset}" = "${train_set}" ]; then
295
+ log "Error: train_set and test_sets must be different. --train_set ${train_set} --test_sets ${test_sets}"
296
+ exit 1
297
+ fi
298
+ if [ "${dset}" = "${valid_set}" ]; then
299
+ log "Info: The valid_set '${valid_set}' is included in the test_sets. '--eval_valid_set true' is set and '${valid_set}' is removed from the test_sets"
300
+ eval_valid_set=true
301
+ elif [[ " ${_test_sets} " =~ [[:space:]]${dset}[[:space:]] ]]; then
302
+ log "Info: ${dset} is duplicated in the test_sets. One is removed"
303
+ else
304
+ _test_sets+="${dset} "
305
+ fi
306
+ done
307
+ test_sets=${_test_sets}
308
+
309
+ # Check feature type
310
+ if [ "${feats_type}" = raw ]; then
311
+ data_feats=${dumpdir}/raw
312
+ elif [ "${feats_type}" = raw_copy ]; then
313
+ # raw_copy is as same as raw except for skipping the format_wav stage
314
+ data_feats=${dumpdir}/raw_copy
315
+ elif [ "${feats_type}" = fbank_pitch ]; then
316
+ data_feats=${dumpdir}/fbank_pitch
317
+ elif [ "${feats_type}" = fbank ]; then
318
+ data_feats=${dumpdir}/fbank
319
+ elif [ "${feats_type}" == extracted ]; then
320
+ data_feats=${dumpdir}/extracted
321
+ else
322
+ log "${help_message}"
323
+ log "Error: not supported: --feats_type ${feats_type}"
324
+ exit 2
325
+ fi
326
+
327
+ # Extra files for prev/prompt and ASR CTC
328
+ utt_extra_files="text.prev text.ctc"
329
+
330
+ num_inf=${num_inf:=${num_ref}}
331
+ # Preprocessor related
332
+ if [ ${num_ref} -eq 1 ]; then
333
+ # For single speaker, text file path and name are text
334
+ ref_text_files_str="text "
335
+ ref_text_names_str="text "
336
+ else
337
+ # For multiple speakers, text file path and name are text_spk[1-N] and [text, text_spk2, ...]
338
+ #TODO(simpleoier): later to support flexibly defined text prefix
339
+ ref_text_files_str="text_spk1 "
340
+ ref_text_names_str="text "
341
+ for n in $(seq 2 ${num_ref}); do
342
+ ref_text_files_str+="text_spk${n} "
343
+ ref_text_names_str+="text_spk${n} "
344
+ done
345
+ fi
346
+ # shellcheck disable=SC2206
347
+ ref_text_files=(${ref_text_files_str// / })
348
+ # shellcheck disable=SC2206
349
+ ref_text_names=(${ref_text_names_str// / })
350
+
351
+ [ -z "${bpe_train_text}" ] && bpe_train_text="${data_feats}/org/${train_set}/${ref_text_files[0]}"
352
+ # Use the same text as S2T for lm training if not specified.
353
+ [ -z "${lm_train_text}" ] && lm_train_text="${data_feats}/org/${train_set}/${ref_text_files[0]}"
354
+ # Use the same text as S2T for lm training if not specified.
355
+ [ -z "${lm_dev_text}" ] && lm_dev_text="${data_feats}/org/${valid_set}/${ref_text_files[0]}"
356
+ if [ -z "${lm_test_text}" ]; then
357
+ if [ -z "${test_sets}" ]; then
358
+ lm_test_text="${data_feats}/org/${valid_set}/${ref_text_files[0]}"
359
+ else
360
+ # Use the text of the 1st evaldir if lm_test is not specified
361
+ lm_test_text="${data_feats}/${test_sets%% *}/${ref_text_files[0]}"
362
+ fi
363
+ fi
364
+
365
+ # Check tokenization type
366
+ if [ "${lang}" != noinfo ]; then
367
+ token_listdir=data/${lang}_token_list
368
+ else
369
+ token_listdir=data/token_list
370
+ fi
371
+ bpedir="${token_listdir}/bpe_${bpemode}${nbpe}"
372
+ bpeprefix="${bpedir}"/bpe
373
+ bpemodel="${bpeprefix}".model
374
+ bpetoken_list="${bpedir}"/tokens.txt
375
+ chartoken_list="${token_listdir}"/char/tokens.txt
376
+ hugging_face_token_list="${token_listdir}/hugging_face_"${hugging_face_model_name_or_path/\//-}/tokens.txt
377
+ # NOTE: keep for future development.
378
+ # shellcheck disable=SC2034
379
+ wordtoken_list="${token_listdir}"/word/tokens.txt
380
+
381
+ if [ "${token_type}" = bpe ]; then
382
+ token_list="${bpetoken_list}"
383
+ elif [ "${token_type}" = char ]; then
384
+ token_list="${chartoken_list}"
385
+ bpemodel=none
386
+ elif [ "${token_type}" = word ]; then
387
+ token_list="${wordtoken_list}"
388
+ bpemodel=none
389
+ elif [ "${token_type}" = whisper_en ]; then # should make token_list an output filepath here
390
+ token_list="${token_listdir}"/whisper_en/tokens.txt
391
+ bpemodel=whisper_en
392
+ hyp_cleaner=${cleaner}
393
+ elif [ "${token_type}" = whisper_multilingual ]; then
394
+ token_list="${token_listdir}"/whisper_multilingual/tokens.txt
395
+ bpemodel=whisper_multilingual
396
+ hyp_cleaner=${cleaner}
397
+ elif [ "${token_type}" = hugging_face ]; then
398
+ token_list="${hugging_face_token_list}"
399
+ bpemodel=${hugging_face_model_name_or_path}
400
+ else
401
+ log "Error: not supported --token_type '${token_type}'"
402
+ exit 2
403
+ fi
404
+ if ${use_word_lm}; then
405
+ log "Error: Word LM is not supported yet"
406
+ exit 2
407
+ else
408
+ lm_token_list="${token_list}"
409
+ lm_token_type="${token_type}"
410
+ fi
411
+
412
+
413
+ # Set tag for naming of model directory
414
+ if [ -z "${s2t_tag}" ]; then
415
+ if [ -n "${s2t_config}" ]; then
416
+ s2t_tag="$(basename "${s2t_config}" .yaml)_${feats_type}"
417
+ else
418
+ s2t_tag="train_${feats_type}"
419
+ fi
420
+ if [ "${lang}" != noinfo ]; then
421
+ s2t_tag+="_${lang}_${token_type}"
422
+ else
423
+ s2t_tag+="_${token_type}"
424
+ fi
425
+ if [ "${token_type}" = bpe ]; then
426
+ s2t_tag+="${nbpe}"
427
+ fi
428
+ if [ "${token_type}" = hugging_face ]; then
429
+ s2t_tag+="_"${hugging_face_model_name_or_path/\//-}
430
+ fi
431
+ # Add overwritten arg's info
432
+ if [ -n "${s2t_args}" ]; then
433
+ s2t_tag+="$(echo "${s2t_args}" | sed -e "s/--/\_/g" -e "s/[ |=/]//g")"
434
+ fi
435
+ if [ -n "${speed_perturb_factors}" ]; then
436
+ s2t_tag+="_sp"
437
+ fi
438
+ fi
439
+ if [ -z "${lm_tag}" ]; then
440
+ if [ -n "${lm_config}" ]; then
441
+ lm_tag="$(basename "${lm_config}" .yaml)"
442
+ else
443
+ lm_tag="train"
444
+ fi
445
+ if [ "${lang}" != noinfo ]; then
446
+ lm_tag+="_${lang}_${lm_token_type}"
447
+ else
448
+ lm_tag+="_${lm_token_type}"
449
+ fi
450
+ if [ "${lm_token_type}" = bpe ]; then
451
+ lm_tag+="${nbpe}"
452
+ fi
453
+ # Add overwritten arg's info
454
+ if [ -n "${lm_args}" ]; then
455
+ lm_tag+="$(echo "${lm_args}" | sed -e "s/--/\_/g" -e "s/[ |=/]//g")"
456
+ fi
457
+ fi
458
+
459
+ # The directory used for collect-stats mode
460
+ if [ -z "${s2t_stats_dir}" ]; then
461
+ if [ "${lang}" != noinfo ]; then
462
+ s2t_stats_dir="${expdir}/s2t_stats_${feats_type}_${lang}_${token_type}"
463
+ else
464
+ s2t_stats_dir="${expdir}/s2t_stats_${feats_type}_${token_type}"
465
+ fi
466
+ if [ "${token_type}" = bpe ]; then
467
+ s2t_stats_dir+="${nbpe}"
468
+ fi
469
+ if [ "${token_type}" = hugging_face ]; then
470
+ s2t_stats_dir+="_"${hugging_face_model_name_or_path/\//-}
471
+ fi
472
+ if [ -n "${speed_perturb_factors}" ]; then
473
+ s2t_stats_dir+="_sp"
474
+ fi
475
+ fi
476
+ if [ -z "${lm_stats_dir}" ]; then
477
+ if [ "${lang}" != noinfo ]; then
478
+ lm_stats_dir="${expdir}/lm_stats_${lang}_${lm_token_type}"
479
+ else
480
+ lm_stats_dir="${expdir}/lm_stats_${lm_token_type}"
481
+ fi
482
+ if [ "${lm_token_type}" = bpe ]; then
483
+ lm_stats_dir+="${nbpe}"
484
+ fi
485
+ fi
486
+ # The directory used for training commands
487
+ if [ -z "${s2t_exp}" ]; then
488
+ s2t_exp="${expdir}/s2t_${s2t_tag}"
489
+ fi
490
+ if [ -z "${lm_exp}" ]; then
491
+ lm_exp="${expdir}/lm_${lm_tag}"
492
+ fi
493
+ if [ -z "${ngram_exp}" ]; then
494
+ ngram_exp="${expdir}/ngram"
495
+ fi
496
+
497
+
498
+ if [ -z "${inference_tag}" ]; then
499
+ if [ -n "${inference_config}" ]; then
500
+ inference_tag="$(basename "${inference_config}" .yaml)"
501
+ else
502
+ inference_tag=inference
503
+ fi
504
+ # Add overwritten arg's info
505
+ if [ -n "${inference_args}" ]; then
506
+ inference_tag+="$(echo "${inference_args}" | sed -e "s/--/\_/g" -e "s/[ |=]//g")"
507
+ fi
508
+ if "${use_lm}"; then
509
+ inference_tag+="_lm_$(basename "${lm_exp}")_$(echo "${inference_lm}" | sed -e "s/\//_/g" -e "s/\.[^.]*$//g")"
510
+ fi
511
+ if "${use_ngram}"; then
512
+ inference_tag+="_ngram_$(basename "${ngram_exp}")_$(echo "${inference_ngram}" | sed -e "s/\//_/g" -e "s/\.[^.]*$//g")"
513
+ fi
514
+ inference_tag+="_s2t_model_$(echo "${inference_s2t_model}" | sed -e "s/\//_/g" -e "s/\.[^.]*$//g")"
515
+ fi
516
+
517
+ if "${skip_data_prep}"; then
518
+ skip_stages+="1 2 3 4 5 "
519
+ fi
520
+ if "${skip_train}"; then
521
+ skip_stages+="2 4 5 6 7 8 9 10 11 "
522
+ elif ! "${use_lm}"; then
523
+ skip_stages+="6 7 8 "
524
+ fi
525
+ if ! "${use_ngram}"; then
526
+ skip_stages+="9 "
527
+ fi
528
+ if "${skip_eval}"; then
529
+ skip_stages+="12 13 "
530
+ fi
531
+ if [ "${skip_packing}" = "true" ] || [ -n "${download_model}" ]; then
532
+ skip_stages+="14 "
533
+ fi
534
+ if "${skip_upload_hf}"; then
535
+ skip_stages+="15 "
536
+ fi
537
+ skip_stages=$(echo "${skip_stages}" | tr ' ' '\n' | sort -nu | tr '\n' ' ')
538
+ log "Skipped stages: ${skip_stages}"
539
+
540
+ # ========================== Main stages start from here. ==========================
541
+
542
+
543
+
544
+ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ] && ! [[ " ${skip_stages} " =~ [[:space:]]1[[:space:]] ]]; then
545
+ log "Stage 1: Data preparation for data/${train_set}, data/${valid_set}, etc."
546
+ # [Task dependent] Need to create data.sh for new corpus
547
+ local/data.sh ${local_data_opts}
548
+ fi
549
+
550
+
551
+ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ] && ! [[ " ${skip_stages} " =~ [[:space:]]2[[:space:]] ]]; then
552
+ if [ -n "${speed_perturb_factors}" ]; then
553
+ log "Stage 2: Speed perturbation: data/${train_set} -> data/${train_set}_sp"
554
+ for factor in ${speed_perturb_factors}; do
555
+ if python3 -c "assert ${factor} != 1.0" 2>/dev/null; then
556
+ scripts/utils/perturb_data_dir_speed.sh \
557
+ --utt_extra_files "${utt_extra_files} ${ref_text_files_str}" \
558
+ "${factor}" "data/${train_set}" "data/${train_set}_sp${factor}"
559
+ _dirs+="data/${train_set}_sp${factor} "
560
+ else
561
+ # If speed factor is 1, same as the original
562
+ _dirs+="data/${train_set} "
563
+ fi
564
+ done
565
+ utils/combine_data.sh \
566
+ --extra_files "${utt_extra_files} ${ref_text_files_str}" \
567
+ "data/${train_set}_sp" ${_dirs}
568
+ else
569
+ log "Skip stage 2: Speed perturbation"
570
+ fi
571
+ fi
572
+
573
+ if [ -n "${speed_perturb_factors}" ]; then
574
+ train_set="${train_set}_sp"
575
+ fi
576
+
577
+ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ] && ! [[ " ${skip_stages} " =~ [[:space:]]3[[:space:]] ]]; then
578
+ if "${skip_train}"; then
579
+ if "${eval_valid_set}"; then
580
+ _dsets="${valid_set} ${test_sets}"
581
+ else
582
+ _dsets="${test_sets}"
583
+ fi
584
+ else
585
+ _dsets="${train_set} ${valid_set} ${test_sets}"
586
+ fi
587
+ if [ "${feats_type}" = raw ]; then
588
+ log "Stage 3: Format wav.scp: data/ -> ${data_feats}"
589
+
590
+ # ====== Recreating "wav.scp" ======
591
+ # Kaldi-wav.scp, which can describe the file path with unix-pipe, like "cat /some/path |",
592
+ # shouldn't be used in training process.
593
+ # "format_wav_scp.sh" dumps such pipe-style-wav to real audio file
594
+ # and it can also change the audio-format and sampling rate.
595
+ # If nothing is need, then format_wav_scp.sh does nothing:
596
+ # i.e. the input file format and rate is same as the output.
597
+
598
+ for dset in ${_dsets}; do
599
+ if [ "${dset}" = "${train_set}" ] || [ "${dset}" = "${valid_set}" ]; then
600
+ _suf="/org"
601
+ else
602
+ _suf=""
603
+ fi
604
+ utils/copy_data_dir.sh --validate_opts --non-print data/"${dset}" "${data_feats}${_suf}/${dset}"
605
+ rm -f ${data_feats}${_suf}/${dset}/{segments,wav.scp,reco2file_and_channel,reco2dur}
606
+
607
+ # Copy extra text files
608
+ for extra_txt in ${utt_extra_files}; do
609
+ [ -f data/${dset}/${extra_txt} ] && cp data/${dset}/${extra_txt} ${data_feats}${_suf}/${dset}
610
+ done
611
+
612
+ # Copy reference text files if there is more than 1 reference
613
+ if [ ${#ref_text_files[@]} -gt 1 ]; then
614
+ # shellcheck disable=SC2068
615
+ for ref_txt in ${ref_text_files[@]}; do
616
+ [ -f data/${dset}/${ref_txt} ] && cp data/${dset}/${ref_txt} ${data_feats}${_suf}/${dset}
617
+ done
618
+ fi
619
+
620
+ _opts=
621
+ if [ -e data/"${dset}"/segments ]; then
622
+ # "segments" is used for splitting wav files which are written in "wav".scp
623
+ # into utterances. The file format of segments:
624
+ # <segment_id> <record_id> <start_time> <end_time>
625
+ # "e.g. call-861225-A-0050-0065 call-861225-A 5.0 6.5"
626
+ # Where the time is written in seconds.
627
+ _opts+="--segments data/${dset}/segments "
628
+ fi
629
+ # shellcheck disable=SC2086
630
+ scripts/audio/format_wav_scp.sh --nj "${nj}" --cmd "${train_cmd}" \
631
+ --audio-format "${audio_format}" --fs "${fs}" ${_opts} \
632
+ --multi-columns-input "${multi_columns_input_wav_scp}" \
633
+ --multi-columns-output "${multi_columns_output_wav_scp}" \
634
+ "data/${dset}/wav.scp" "${data_feats}${_suf}/${dset}"
635
+
636
+ echo "${feats_type}" > "${data_feats}${_suf}/${dset}/feats_type"
637
+ if "${multi_columns_output_wav_scp}"; then
638
+ echo "multi_${audio_format}" > "${data_feats}${_suf}/${dset}/audio_format"
639
+ else
640
+ echo "${audio_format}" > "${data_feats}${_suf}/${dset}/audio_format"
641
+ fi
642
+ done
643
+
644
+ elif [ "${feats_type}" = raw_copy ]; then
645
+ # If you guaranteed that the data already satisfy the raw format, you can skip format_wav_scp.py for reduce the overhead
646
+ for dset in ${_dsets}; do
647
+ if [ -e "data/${dset}/segments" ]; then
648
+ log "Error: data/${dset}/segments is existing. Please use --feats_type raw"
649
+ exit 1
650
+ fi
651
+ if [ "${dset}" = "${train_set}" ] || [ "${dset}" = "${valid_set}" ]; then
652
+ _suf="/org"
653
+ else
654
+ _suf=""
655
+ fi
656
+ utils/copy_data_dir.sh --validate_opts --non-print data/"${dset}" "${data_feats}${_suf}/${dset}"
657
+ if [ "${dset}" = "${train_set}" ] || [ "${dset}" = "${valid_set}" ]; then
658
+ _suf="/org"
659
+
660
+ if [ -e "data/${dset}/utt2dur" ]; then
661
+ _fs=$(python3 -c "import humanfriendly as h;print(h.parse_size('${fs}'))")
662
+ <data/${dset}/utt2dur awk '{ print $1, int($2*'${_fs}'); }' > "${data_feats}${_suf}/${dset}"/utt2num_samples
663
+
664
+ elif [ -e "data/${dset}/utt2num_samples" ]; then
665
+ cp "data/${dset}/utt2num_samples" "${data_feats}${_suf}/${dset}"/utt2num_samples
666
+
667
+ else
668
+ log "Error: data/${dset}/utt2dur or data/${dset}/utt2num_samples must be existing for train_set and valid_set. Please use --feats_type raw. If you'd like to perform this script for evaluation, please give --skip_train true"
669
+ exit 1
670
+ fi
671
+ fi
672
+
673
+ # Copy extra text files
674
+ for extra_txt in ${utt_extra_files}; do
675
+ [ -f data/${dset}/${extra_txt} ] && cp data/${dset}/${extra_txt} ${data_feats}${_suf}/${dset}
676
+ done
677
+
678
+ # Copy reference text files if there is more than 1 reference
679
+ if [ ${#ref_text_files[@]} -gt 1 ]; then
680
+ # shellcheck disable=SC2068
681
+ for ref_txt in ${ref_text_files[@]}; do
682
+ [ -f data/${dset}/${ref_txt} ] && cp data/${dset}/${ref_txt} ${data_feats}${_suf}/${dset}
683
+ done
684
+ fi
685
+
686
+ echo "raw" > "${data_feats}${_suf}/${dset}/feats_type"
687
+ if "${multi_columns_input_wav_scp}"; then
688
+ echo "multi_${audio_format}" > "${data_feats}${_suf}/${dset}/audio_format"
689
+ else
690
+ echo "${audio_format}" > "${data_feats}${_suf}/${dset}/audio_format"
691
+ fi
692
+ done
693
+
694
+ elif [ "${feats_type}" = fbank_pitch ]; then
695
+ log "[Require Kaldi] Stage 3: ${feats_type} extract: data/ -> ${data_feats}"
696
+
697
+ for dset in ${_dsets}; do
698
+ if [ "${dset}" = "${train_set}" ] || [ "${dset}" = "${valid_set}" ]; then
699
+ _suf="/org"
700
+ else
701
+ _suf=""
702
+ fi
703
+ # 1. Copy datadir
704
+ utils/copy_data_dir.sh --validate_opts --non-print data/"${dset}" "${data_feats}${_suf}/${dset}"
705
+
706
+ # Copy extra text files
707
+ for extra_txt in ${utt_extra_files}; do
708
+ [ -f data/${dset}/${extra_txt} ] && cp data/${dset}/${extra_txt} ${data_feats}${_suf}/${dset}
709
+ done
710
+
711
+ # Copy reference text files if there is more than 1 reference
712
+ if [ ${#ref_text_files[@]} -gt 1 ]; then
713
+ # shellcheck disable=SC2068
714
+ for ref_txt in ${ref_text_files[@]}; do
715
+ [ -f data/${dset}/${ref_txt} ] && cp data/${dset}/${ref_txt} ${data_feats}${_suf}/${dset}
716
+ done
717
+ fi
718
+
719
+ # 2. Feature extract
720
+ _nj=$(min "${nj}" "$(<"${data_feats}${_suf}/${dset}/utt2spk" wc -l)")
721
+ steps/make_fbank_pitch.sh --nj "${_nj}" --cmd "${train_cmd}" "${data_feats}${_suf}/${dset}"
722
+ utils/fix_data_dir.sh "${data_feats}${_suf}/${dset}"
723
+
724
+ # 3. Derive the the frame length and feature dimension
725
+ scripts/feats/feat_to_shape.sh --nj "${_nj}" --cmd "${train_cmd}" \
726
+ "${data_feats}${_suf}/${dset}/feats.scp" "${data_feats}${_suf}/${dset}/feats_shape"
727
+
728
+ # 4. Write feats_dim
729
+ head -n 1 "${data_feats}${_suf}/${dset}/feats_shape" | awk '{ print $2 }' \
730
+ | cut -d, -f2 > ${data_feats}${_suf}/${dset}/feats_dim
731
+
732
+ # 5. Write feats_type
733
+ echo "${feats_type}" > "${data_feats}${_suf}/${dset}/feats_type"
734
+ done
735
+
736
+ elif [ "${feats_type}" = fbank ]; then
737
+ log "Stage 3: ${feats_type} extract: data/ -> ${data_feats}"
738
+ log "${feats_type} is not supported yet."
739
+ exit 1
740
+
741
+ elif [ "${feats_type}" = extracted ]; then
742
+ log "Stage 3: ${feats_type} extract: data/ -> ${data_feats}"
743
+ # Assumming you don't have wav.scp, but feats.scp is created by local/data.sh instead.
744
+
745
+ for dset in ${_dsets}; do
746
+ if [ "${dset}" = "${train_set}" ] || [ "${dset}" = "${valid_set}" ]; then
747
+ _suf="/org"
748
+ else
749
+ _suf=""
750
+ fi
751
+ # Generate dummy wav.scp to avoid error by copy_data_dir.sh
752
+ if [ ! -f data/"${dset}"/wav.scp ]; then
753
+ if [ ! -f data/"${dset}"/segments ]; then
754
+ <data/"${dset}"/feats.scp awk ' { print($1,"<DUMMY>") }' > data/"${dset}"/wav.scp
755
+ else
756
+ <data/"${dset}"/segments awk ' { print($2,"<DUMMY>") }' > data/"${dset}"/wav.scp
757
+ fi
758
+ fi
759
+ utils/copy_data_dir.sh --validate_opts --non-print data/"${dset}" "${data_feats}${_suf}/${dset}"
760
+
761
+ # Copy extra text files
762
+ for extra_txt in ${utt_extra_files}; do
763
+ [ -f data/${dset}/${extra_txt} ] && cp data/${dset}/${extra_txt} ${data_feats}${_suf}/${dset}
764
+ done
765
+
766
+ # Copy reference text files if there is more than 1 reference
767
+ # shellcheck disable=SC2068
768
+ if [ ${#ref_text_files[@]} -gt 1 ]; then
769
+ for ref_txt in ${ref_text_files[@]}; do
770
+ [ -f data/${dset}/${ref_txt} ] && cp data/${dset}/${ref_txt} ${data_feats}${_suf}/${dset}
771
+ done
772
+ fi
773
+
774
+ # Derive the the frame length and feature dimension
775
+ _nj=$(min "${nj}" "$(<"${data_feats}${_suf}/${dset}/utt2spk" wc -l)")
776
+ scripts/feats/feat_to_shape.sh --nj "${_nj}" --cmd "${train_cmd}" \
777
+ "${data_feats}${_suf}/${dset}/feats.scp" "${data_feats}${_suf}/${dset}/feats_shape"
778
+
779
+ pyscripts/feats/feat-to-shape.py "scp:head -n 1 ${data_feats}${_suf}/${dset}/feats.scp |" - | \
780
+ awk '{ print $2 }' | cut -d, -f2 > "${data_feats}${_suf}/${dset}/feats_dim"
781
+
782
+ echo "${feats_type}" > "${data_feats}${_suf}/${dset}/feats_type"
783
+ done
784
+
785
+ else
786
+ log "Error: not supported: --feats_type ${feats_type}"
787
+ exit 2
788
+ fi
789
+ fi
790
+
791
+
792
+ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ] && ! [[ " ${skip_stages} " =~ [[:space:]]4[[:space:]] ]]; then
793
+ log "Stage 4: Remove long/short data: ${data_feats}/org -> ${data_feats}"
794
+
795
+ # NOTE(kamo): Not applying to test_sets to keep original data
796
+ for dset in "${train_set}" "${valid_set}"; do
797
+
798
+ # Copy data dir
799
+ utils/copy_data_dir.sh --validate_opts --non-print "${data_feats}/org/${dset}" "${data_feats}/${dset}"
800
+ cp "${data_feats}/org/${dset}/feats_type" "${data_feats}/${dset}/feats_type"
801
+
802
+ # Remove short utterances
803
+ _feats_type="$(<${data_feats}/${dset}/feats_type)"
804
+ if [ "${_feats_type}" = raw ]; then
805
+ _fs=$(python3 -c "import humanfriendly as h;print(h.parse_size('${fs}'))")
806
+ _min_length=$(python3 -c "print(int(${min_wav_duration} * ${_fs}))")
807
+ _max_length=$(python3 -c "print(int(${max_wav_duration} * ${_fs}))")
808
+
809
+ # utt2num_samples is created by format_wav_scp.sh
810
+ <"${data_feats}/org/${dset}/utt2num_samples" \
811
+ awk -v min_length="${_min_length}" -v max_length="${_max_length}" \
812
+ '{ if ($2 > min_length && $2 < max_length ) print $0; }' \
813
+ >"${data_feats}/${dset}/utt2num_samples"
814
+ <"${data_feats}/org/${dset}/wav.scp" \
815
+ utils/filter_scp.pl "${data_feats}/${dset}/utt2num_samples" \
816
+ >"${data_feats}/${dset}/wav.scp"
817
+ else
818
+ # Get frame shift in ms from conf/fbank.conf
819
+ _frame_shift=
820
+ if [ -f conf/fbank.conf ] && [ "$(<conf/fbank.conf grep -c frame-shift)" -gt 0 ]; then
821
+ # Assume using conf/fbank.conf for feature extraction
822
+ _frame_shift="$(<conf/fbank.conf grep frame-shift | sed -e 's/[-a-z =]*\([0-9]*\)/\1/g')"
823
+ fi
824
+ if [ -z "${_frame_shift}" ]; then
825
+ # If not existing, use the default number in Kaldi (=10ms).
826
+ # If you are using different number, you have to change the following value manually.
827
+ _frame_shift=10
828
+ fi
829
+
830
+ _min_length=$(python3 -c "print(int(${min_wav_duration} / ${_frame_shift} * 1000))")
831
+ _max_length=$(python3 -c "print(int(${max_wav_duration} / ${_frame_shift} * 1000))")
832
+
833
+ cp "${data_feats}/org/${dset}/feats_dim" "${data_feats}/${dset}/feats_dim"
834
+ <"${data_feats}/org/${dset}/feats_shape" awk -F, ' { print $1 } ' \
835
+ | awk -v min_length="${_min_length}" -v max_length="${_max_length}" \
836
+ '{ if ($2 > min_length && $2 < max_length) print $0; }' \
837
+ >"${data_feats}/${dset}/feats_shape"
838
+ <"${data_feats}/org/${dset}/feats.scp" \
839
+ utils/filter_scp.pl "${data_feats}/${dset}/feats_shape" \
840
+ >"${data_feats}/${dset}/feats.scp"
841
+ fi
842
+
843
+ # Remove empty text
844
+ # shellcheck disable=SC2068
845
+ for extra_txt in ${utt_extra_files}; do
846
+ <"${data_feats}/org/${dset}/${extra_txt}" \
847
+ awk ' { if( NF != 1 ) print $0; } ' >"${data_feats}/${dset}/${extra_txt}"
848
+ done
849
+ for ref_txt in ${ref_text_files[@]}; do
850
+ <"${data_feats}/org/${dset}/${ref_txt}" \
851
+ awk ' { if( NF != 1 ) print $0; } ' >"${data_feats}/${dset}/${ref_txt}"
852
+ done
853
+
854
+ # fix_data_dir.sh leaves only utts which exist in all files
855
+ utils/fix_data_dir.sh \
856
+ --utt_extra_files "${utt_extra_files} ${ref_text_files_str}" \
857
+ "${data_feats}/${dset}"
858
+ done
859
+
860
+ if [ -n "${post_process_local_data_opts}" ]; then
861
+ # Do any additional local data post-processing here
862
+ local/data.sh ${post_process_local_data_opts} --s2t_data_dir "${data_feats}/${train_set}"
863
+ fi
864
+
865
+ # shellcheck disable=SC2002,SC2068,SC2005
866
+ for lm_txt in ${lm_train_text[@]}; do
867
+ suffix=$(echo "$(basename ${lm_txt})" | sed 's/text//')
868
+ <${lm_txt} awk -v suffix=${suffix} ' { if( NF != 1 ) {$1=$1 suffix; print $0; }} '
869
+ done > "${data_feats}/lm_train.txt"
870
+ fi
871
+
872
+
873
+ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ] && ! [[ " ${skip_stages} " =~ [[:space:]]5[[:space:]] ]]; then
874
+ if [ "${token_type}" = bpe ]; then
875
+ log "Stage 5: Generate token_list from ${bpe_train_text} using BPE"
876
+
877
+ mkdir -p "${bpedir}"
878
+ # shellcheck disable=SC2002
879
+ cat ${bpe_train_text} | cut -f 2- -d" " > "${bpedir}"/train.txt
880
+
881
+ if [ -n "${bpe_nlsyms}" ]; then
882
+ if test -f "${bpe_nlsyms}"; then
883
+ bpe_nlsyms_list=$(awk '{print $1}' ${bpe_nlsyms} | paste -s -d, -)
884
+ _opts_spm="--user_defined_symbols=${bpe_nlsyms_list}"
885
+ else
886
+ _opts_spm="--user_defined_symbols=${bpe_nlsyms}"
887
+ fi
888
+ else
889
+ _opts_spm=""
890
+ fi
891
+
892
+ spm_train \
893
+ --input="${bpedir}"/train.txt \
894
+ --vocab_size="${nbpe}" \
895
+ --model_type="${bpemode}" \
896
+ --model_prefix="${bpeprefix}" \
897
+ --character_coverage=${bpe_char_cover} \
898
+ --input_sentence_size="${bpe_input_sentence_size}" \
899
+ ${_opts_spm}
900
+
901
+ {
902
+ echo "${blank}"
903
+ echo "${oov}"
904
+ # Remove <unk>, <s>, </s> from the vocabulary
905
+ <"${bpeprefix}".vocab awk '{ if( NR != 1 && NR != 2 && NR != 3 ){ print $1; } }'
906
+ echo "${sos}"
907
+ echo "${eos}"
908
+ echo "${sop}"
909
+ } > "${token_list}"
910
+
911
+ elif [ "${token_type}" = char ] || [ "${token_type}" = word ]; then
912
+ log "Stage 5: Generate character level token_list from ${lm_train_text}"
913
+
914
+ _opts="--non_linguistic_symbols ${nlsyms_txt}"
915
+
916
+ # The first symbol in token_list must be "<blank>" and the last must be also sos/eos:
917
+ # 0 is reserved for CTC-blank for ASR and also used as ignore-index in the other task
918
+ ${python} -m espnet2.bin.tokenize_text \
919
+ --token_type "${token_type}" \
920
+ --input "${data_feats}/lm_train.txt" --output "${token_list}" ${_opts} \
921
+ --field 2- \
922
+ --cleaner "${cleaner}" \
923
+ --g2p "${g2p}" \
924
+ --write_vocabulary true \
925
+ --add_symbol "${blank}:0" \
926
+ --add_symbol "${oov}:1" \
927
+ --add_symbol "${sop}:-1" \
928
+ --add_symbol "${eos}:-2" \
929
+ --add_symbol "${sos}:-3"
930
+
931
+ elif grep -q "whisper" <<< ${token_type}; then
932
+ log "Stage 5: Generate whisper token_list from ${token_type} tokenizer"
933
+
934
+ # The first symbol in token_list must be "<blank>" and the last must be also sos/eos:
935
+ # 0 is reserved for CTC-blank for ASR and also used as ignore-index in the other task
936
+ echo ${token_list}
937
+ ${python} -m espnet2.bin.whisper_export_vocabulary \
938
+ --whisper_model "${token_type}" \
939
+ --output "${token_list}"
940
+ elif [ "${token_type}" = hugging_face ]; then
941
+ log "Stage 5: Generate hugging_face token_list from ${hugging_face_model_name_or_path}"
942
+
943
+ # The first symbol in token_list must be "<blank>" and the last must be also sos/eos:
944
+ # 0 is reserved for CTC-blank for ASR and also used as ignore-index in the other task
945
+ ${python} -m espnet2.bin.hugging_face_export_vocabulary \
946
+ --model_name_or_path "${hugging_face_model_name_or_path}" \
947
+ --output "${token_list}"
948
+ else
949
+ log "Error: not supported --token_type '${token_type}'"
950
+ exit 2
951
+ fi
952
+
953
+ # Create word-list for word-LM training
954
+ if ${use_word_lm} && [ "${token_type}" != word ]; then
955
+ log "Generate word level token_list from ${data_feats}/lm_train.txt"
956
+ ${python} -m espnet2.bin.tokenize_text \
957
+ --token_type word \
958
+ --input "${data_feats}/lm_train.txt" --output "${lm_token_list}" \
959
+ --field 2- \
960
+ --cleaner "${cleaner}" \
961
+ --g2p "${g2p}" \
962
+ --write_vocabulary true \
963
+ --vocabulary_size "${word_vocab_size}" \
964
+ --add_symbol "${blank}:0" \
965
+ --add_symbol "${oov}:1" \
966
+ --add_symbol "${sop}:-1" \
967
+ --add_symbol "${eos}:-2" \
968
+ --add_symbol "${sos}:-3"
969
+ fi
970
+
971
+ fi
972
+
973
+
974
+ # ========================== Data preparation is done here. ==========================
975
+
976
+
977
+ if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ] && ! [[ " ${skip_stages} " =~ [[:space:]]6[[:space:]] ]]; then
978
+ log "Stage 6: LM collect stats: train_set=${data_feats}/lm_train.txt, dev_set=${lm_dev_text}"
979
+
980
+ _opts=
981
+ if [ -n "${lm_config}" ]; then
982
+ # To generate the config file: e.g.
983
+ # % python3 -m espnet2.bin.lm_train --print_config --optim adam
984
+ _opts+="--config ${lm_config} "
985
+ fi
986
+
987
+ # 1. Split the key file
988
+ _logdir="${lm_stats_dir}/logdir"
989
+ mkdir -p "${_logdir}"
990
+ # Get the minimum number among ${nj} and the number lines of input files
991
+ _nj=$(min "${nj}" "$(<${data_feats}/lm_train.txt wc -l)" "$(<${lm_dev_text} wc -l)")
992
+
993
+ key_file="${data_feats}/lm_train.txt"
994
+ split_scps=""
995
+ for n in $(seq ${_nj}); do
996
+ split_scps+=" ${_logdir}/train.${n}.scp"
997
+ done
998
+ # shellcheck disable=SC2086
999
+ utils/split_scp.pl "${key_file}" ${split_scps}
1000
+
1001
+ key_file="${lm_dev_text}"
1002
+ split_scps=""
1003
+ for n in $(seq ${_nj}); do
1004
+ split_scps+=" ${_logdir}/dev.${n}.scp"
1005
+ done
1006
+ # shellcheck disable=SC2086
1007
+ utils/split_scp.pl "${key_file}" ${split_scps}
1008
+
1009
+ # 2. Generate run.sh
1010
+ log "Generate '${lm_stats_dir}/run.sh'. You can resume the process from stage 6 using this script"
1011
+ mkdir -p "${lm_stats_dir}"; echo "${run_args} --stage 6 \"\$@\"; exit \$?" > "${lm_stats_dir}/run.sh"; chmod +x "${lm_stats_dir}/run.sh"
1012
+
1013
+ # 3. Submit jobs
1014
+ log "LM collect-stats started... log: '${_logdir}/stats.*.log'"
1015
+ # NOTE: --*_shape_file doesn't require length information if --batch_type=unsorted,
1016
+ # but it's used only for deciding the sample ids.
1017
+ # shellcheck disable=SC2046,SC2086
1018
+ ${train_cmd} JOB=1:"${_nj}" "${_logdir}"/stats.JOB.log \
1019
+ ${python} -m espnet2.bin.lm_train \
1020
+ --collect_stats true \
1021
+ --use_preprocessor true \
1022
+ --bpemodel "${bpemodel}" \
1023
+ --token_type "${lm_token_type}"\
1024
+ --token_list "${lm_token_list}" \
1025
+ --non_linguistic_symbols "${nlsyms_txt}" \
1026
+ --cleaner "${cleaner}" \
1027
+ --g2p "${g2p}" \
1028
+ --train_data_path_and_name_and_type "${data_feats}/lm_train.txt,text,text" \
1029
+ --valid_data_path_and_name_and_type "${lm_dev_text},text,text" \
1030
+ --train_shape_file "${_logdir}/train.JOB.scp" \
1031
+ --valid_shape_file "${_logdir}/dev.JOB.scp" \
1032
+ --output_dir "${_logdir}/stats.JOB" \
1033
+ ${_opts} ${lm_args} || { cat $(grep -l -i error "${_logdir}"/stats.*.log) ; exit 1; }
1034
+
1035
+ # 4. Aggregate shape files
1036
+ _opts=
1037
+ for i in $(seq "${_nj}"); do
1038
+ _opts+="--input_dir ${_logdir}/stats.${i} "
1039
+ done
1040
+ # shellcheck disable=SC2086
1041
+ ${python} -m espnet2.bin.aggregate_stats_dirs ${_opts} --output_dir "${lm_stats_dir}"
1042
+
1043
+ # Append the num-tokens at the last dimensions. This is used for batch-bins count
1044
+ <"${lm_stats_dir}/train/text_shape" \
1045
+ awk -v N="$(<${lm_token_list} wc -l)" '{ print $0 "," N }' \
1046
+ >"${lm_stats_dir}/train/text_shape.${lm_token_type}"
1047
+
1048
+ <"${lm_stats_dir}/valid/text_shape" \
1049
+ awk -v N="$(<${lm_token_list} wc -l)" '{ print $0 "," N }' \
1050
+ >"${lm_stats_dir}/valid/text_shape.${lm_token_type}"
1051
+ fi
1052
+
1053
+
1054
+ if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ] && ! [[ " ${skip_stages} " =~ [[:space:]]7[[:space:]] ]]; then
1055
+ log "Stage 7: LM Training: train_set=${data_feats}/lm_train.txt, dev_set=${lm_dev_text}"
1056
+
1057
+ _opts=
1058
+ if [ -n "${lm_config}" ]; then
1059
+ # To generate the config file: e.g.
1060
+ # % python3 -m espnet2.bin.lm_train --print_config --optim adam
1061
+ _opts+="--config ${lm_config} "
1062
+ fi
1063
+
1064
+ if [ "${num_splits_lm}" -gt 1 ]; then
1065
+ # If you met a memory error when parsing text files, this option may help you.
1066
+ # The corpus is split into subsets and each subset is used for training one by one in order,
1067
+ # so the memory footprint can be limited to the memory required for each dataset.
1068
+
1069
+ _split_dir="${lm_stats_dir}/splits${num_splits_lm}"
1070
+ if [ ! -f "${_split_dir}/.done" ]; then
1071
+ rm -f "${_split_dir}/.done"
1072
+ ${python} -m espnet2.bin.split_scps \
1073
+ --scps "${data_feats}/lm_train.txt" "${lm_stats_dir}/train/text_shape.${lm_token_type}" \
1074
+ --num_splits "${num_splits_lm}" \
1075
+ --output_dir "${_split_dir}"
1076
+ touch "${_split_dir}/.done"
1077
+ else
1078
+ log "${_split_dir}/.done exists. Spliting is skipped"
1079
+ fi
1080
+
1081
+ _opts+="--train_data_path_and_name_and_type ${_split_dir}/lm_train.txt,text,text "
1082
+ _opts+="--train_shape_file ${_split_dir}/text_shape.${lm_token_type} "
1083
+ _opts+="--multiple_iterator true "
1084
+
1085
+ else
1086
+ _opts+="--train_data_path_and_name_and_type ${data_feats}/lm_train.txt,text,text "
1087
+ _opts+="--train_shape_file ${lm_stats_dir}/train/text_shape.${lm_token_type} "
1088
+ fi
1089
+
1090
+ # NOTE(kamo): --fold_length is used only if --batch_type=folded and it's ignored in the other case
1091
+
1092
+ log "Generate '${lm_exp}/run.sh'. You can resume the process from stage 7 using this script"
1093
+ mkdir -p "${lm_exp}"; echo "${run_args} --stage 7 \"\$@\"; exit \$?" > "${lm_exp}/run.sh"; chmod +x "${lm_exp}/run.sh"
1094
+
1095
+ log "LM training started... log: '${lm_exp}/train.log'"
1096
+ if echo "${cuda_cmd}" | grep -e queue.pl -e queue-freegpu.pl &> /dev/null; then
1097
+ # SGE can't include "/" in a job name
1098
+ jobname="$(basename ${lm_exp})"
1099
+ else
1100
+ jobname="${lm_exp}/train.log"
1101
+ fi
1102
+
1103
+ # shellcheck disable=SC2086
1104
+ ${python} -m espnet2.bin.launch \
1105
+ --cmd "${cuda_cmd} --name ${jobname}" \
1106
+ --log "${lm_exp}"/train.log \
1107
+ --ngpu "${ngpu}" \
1108
+ --num_nodes "${num_nodes}" \
1109
+ --init_file_prefix "${lm_exp}"/.dist_init_ \
1110
+ --multiprocessing_distributed true -- \
1111
+ ${python} -m espnet2.bin.lm_train \
1112
+ --ngpu "${ngpu}" \
1113
+ --use_preprocessor true \
1114
+ --bpemodel "${bpemodel}" \
1115
+ --token_type "${lm_token_type}"\
1116
+ --token_list "${lm_token_list}" \
1117
+ --non_linguistic_symbols "${nlsyms_txt}" \
1118
+ --cleaner "${cleaner}" \
1119
+ --g2p "${g2p}" \
1120
+ --valid_data_path_and_name_and_type "${lm_dev_text},text,text" \
1121
+ --valid_shape_file "${lm_stats_dir}/valid/text_shape.${lm_token_type}" \
1122
+ --fold_length "${lm_fold_length}" \
1123
+ --resume true \
1124
+ --output_dir "${lm_exp}" \
1125
+ ${_opts} ${lm_args}
1126
+
1127
+ fi
1128
+
1129
+
1130
+ if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ] && ! [[ " ${skip_stages} " =~ [[:space:]]8[[:space:]] ]]; then
1131
+ log "Stage 8: Calc perplexity: ${lm_test_text}"
1132
+ _opts=
1133
+ # TODO(kamo): Parallelize?
1134
+ log "Perplexity calculation started... log: '${lm_exp}/perplexity_test/lm_calc_perplexity.log'"
1135
+ # shellcheck disable=SC2086
1136
+ ${cuda_cmd} --gpu "${ngpu}" "${lm_exp}"/perplexity_test/lm_calc_perplexity.log \
1137
+ ${python} -m espnet2.bin.lm_calc_perplexity \
1138
+ --ngpu "${ngpu}" \
1139
+ --data_path_and_name_and_type "${lm_test_text},text,text" \
1140
+ --train_config "${lm_exp}"/config.yaml \
1141
+ --model_file "${lm_exp}/${inference_lm}" \
1142
+ --output_dir "${lm_exp}/perplexity_test" \
1143
+ ${_opts}
1144
+ log "PPL: ${lm_test_text}: $(cat ${lm_exp}/perplexity_test/ppl)"
1145
+
1146
+ fi
1147
+
1148
+
1149
+ if [ ${stage} -le 9 ] && [ ${stop_stage} -ge 9 ] && ! [[ " ${skip_stages} " =~ [[:space:]]9[[:space:]] ]]; then
1150
+ log "Stage 9: Ngram Training: train_set=${data_feats}/lm_train.txt"
1151
+ mkdir -p ${ngram_exp}
1152
+ cut -f 2- -d " " ${data_feats}/lm_train.txt | lmplz -S "20%" --discount_fallback -o ${ngram_num} - >${ngram_exp}/${ngram_num}gram.arpa
1153
+ build_binary -s ${ngram_exp}/${ngram_num}gram.arpa ${ngram_exp}/${ngram_num}gram.bin
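+ # (For reference) "cut -f 2- -d ' '" drops the utterance id so only the transcripts reach KenLM;
+ # lmplz estimates a ${ngram_num}-gram ARPA model (-S "20%" caps memory use, --discount_fallback
+ # keeps estimation from failing on sparse counts), and build_binary converts the ARPA file into
+ # KenLM's binary format for faster loading at decoding time.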
1154
+ fi
1155
+
1156
+
1157
+ if [ ${stage} -le 10 ] && [ ${stop_stage} -ge 10 ] && ! [[ " ${skip_stages} " =~ [[:space:]]10[[:space:]] ]]; then
1158
+ _s2t_train_dir="${data_feats}/${train_set}"
1159
+ _s2t_valid_dir="${data_feats}/${valid_set}"
1160
+ log "Stage 10: S2T collect stats: train_set=${_s2t_train_dir}, valid_set=${_s2t_valid_dir}"
1161
+
1162
+ _opts=
1163
+ if [ -n "${s2t_config}" ]; then
1164
+ # To generate the config file: e.g.
1165
+ # % python3 -m espnet2.bin.s2t_train --print_config --optim adam
1166
+ _opts+="--config ${s2t_config} "
1167
+ fi
1168
+
1169
+ _feats_type="$(<${_s2t_train_dir}/feats_type)"
1170
+ _audio_format="$(cat ${_s2t_train_dir}/audio_format 2>/dev/null || echo ${audio_format})"
1171
+ if [ "${_feats_type}" = raw ]; then
1172
+ _scp=wav.scp
1173
+ if [[ "${_audio_format}" == *ark* ]]; then
1174
+ _type=kaldi_ark
1175
+ else
1176
+ # "sound" supports "wav", "flac", etc.
1177
+ _type=sound
1178
+ fi
1179
+ _opts+="--frontend_conf fs=${fs} "
1180
+ else
1181
+ _scp=feats.scp
1182
+ _type=kaldi_ark
1183
+ _input_size="$(<${_s2t_train_dir}/feats_dim)"
1184
+ _opts+="--input_size=${_input_size} "
1185
+ fi
1186
+
1187
+ # 1. Split the key file
1188
+ _logdir="${s2t_stats_dir}/logdir"
1189
+ mkdir -p "${_logdir}"
1190
+
1191
+ # Get the minimum of ${nj} and the number of lines in the input files
1192
+ _nj=$(min "${nj}" "$(<${_s2t_train_dir}/${_scp} wc -l)" "$(<${_s2t_valid_dir}/${_scp} wc -l)")
1193
+
1194
+ key_file="${_s2t_train_dir}/${_scp}"
1195
+ split_scps=""
1196
+ for n in $(seq "${_nj}"); do
1197
+ split_scps+=" ${_logdir}/train.${n}.scp"
1198
+ done
1199
+ # shellcheck disable=SC2086
1200
+ utils/split_scp.pl "${key_file}" ${split_scps}
1201
+
1202
+ key_file="${_s2t_valid_dir}/${_scp}"
1203
+ split_scps=""
1204
+ for n in $(seq "${_nj}"); do
1205
+ split_scps+=" ${_logdir}/valid.${n}.scp"
1206
+ done
1207
+ # shellcheck disable=SC2086
1208
+ utils/split_scp.pl "${key_file}" ${split_scps}
1209
+
1210
+ # 2. Generate run.sh
1211
+ log "Generate '${s2t_stats_dir}/run.sh'. You can resume the process from stage 10 using this script"
1212
+ mkdir -p "${s2t_stats_dir}"; echo "${run_args} --stage 10 \"\$@\"; exit \$?" > "${s2t_stats_dir}/run.sh"; chmod +x "${s2t_stats_dir}/run.sh"
1213
+
1214
+ # 3. Submit jobs
1215
+ log "S2T collect-stats started... log: '${_logdir}/stats.*.log'"
1216
+
1217
+ # NOTE: --*_shape_file doesn't require length information if --batch_type=unsorted,
1218
+ # but it's used only for deciding the sample ids.
1219
+
1220
+ _opts+="--train_data_path_and_name_and_type ${_s2t_train_dir}/${_scp},speech,${_type} "
1221
+ _opts+="--valid_data_path_and_name_and_type ${_s2t_valid_dir}/${_scp},speech,${_type} "
1222
+ # shellcheck disable=SC2068
1223
+ for extra_txt in ${utt_extra_files}; do
1224
+ _opts+="--train_data_path_and_name_and_type ${_s2t_train_dir}/${extra_txt},${extra_txt//./_},text "
1225
+ _opts+="--valid_data_path_and_name_and_type ${_s2t_valid_dir}/${extra_txt},${extra_txt//./_},text "
1226
+ done
1227
+ for i in ${!ref_text_files[@]}; do
1228
+ _opts+="--train_data_path_and_name_and_type ${_s2t_train_dir}/${ref_text_files[$i]},${ref_text_names[$i]},text "
1229
+ _opts+="--valid_data_path_and_name_and_type ${_s2t_valid_dir}/${ref_text_files[$i]},${ref_text_names[$i]},text "
1230
+ done
1231
+
1232
+ # shellcheck disable=SC2046,SC2086
1233
+ ${train_cmd} JOB=1:"${_nj}" "${_logdir}"/stats.JOB.log \
1234
+ ${python} -m espnet2.bin.s2t_train \
1235
+ --collect_stats true \
1236
+ --use_preprocessor true \
1237
+ --bpemodel "${bpemodel}" \
1238
+ --token_type "${token_type}" \
1239
+ --token_list "${token_list}" \
1240
+ --non_linguistic_symbols "${nlsyms_txt}" \
1241
+ --cleaner "${cleaner}" \
1242
+ --g2p "${g2p}" \
1243
+ --train_shape_file "${_logdir}/train.JOB.scp" \
1244
+ --valid_shape_file "${_logdir}/valid.JOB.scp" \
1245
+ --output_dir "${_logdir}/stats.JOB" \
1246
+ ${_opts} ${s2t_args} || { cat $(grep -l -i error "${_logdir}"/stats.*.log) ; exit 1; }
1247
+
1248
+ # 4. Aggregate shape files
1249
+ _opts=
1250
+ for i in $(seq "${_nj}"); do
1251
+ _opts+="--input_dir ${_logdir}/stats.${i} "
1252
+ done
1253
+ if [ "${feats_normalize}" != global_mvn ]; then
1254
+ # Skip summarizing stats if not using global MVN
1255
+ _opts+="--skip_sum_stats"
1256
+ fi
1257
+ # shellcheck disable=SC2086
1258
+ ${python} -m espnet2.bin.aggregate_stats_dirs ${_opts} --output_dir "${s2t_stats_dir}"
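+ # (For reference) aggregate_stats_dirs merges the per-job stats.* directories into
+ # ${s2t_stats_dir}; when global MVN is used, this also produces train/feats_stats.npz,
+ # which stage 11 points --normalize_conf at.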
1259
+
1260
+ # Append the vocabulary size as the last dimension of each shape. This is used for the batch-bins count
1261
+ # shellcheck disable=SC2068
1262
+ for extra_txt in ${utt_extra_files}; do
1263
+ _extra_txt=${extra_txt//./_}
1264
+ <"${s2t_stats_dir}/train/${_extra_txt}_shape" \
1265
+ awk -v N="$(<${token_list} wc -l)" '{ print $0 "," N }' \
1266
+ >"${s2t_stats_dir}/train/${_extra_txt}_shape.${token_type}"
1267
+
1268
+ <"${s2t_stats_dir}/valid/${_extra_txt}_shape" \
1269
+ awk -v N="$(<${token_list} wc -l)" '{ print $0 "," N }' \
1270
+ >"${s2t_stats_dir}/valid/${_extra_txt}_shape.${token_type}"
1271
+ done
1272
+ for ref_txt in ${ref_text_names[@]}; do
1273
+ <"${s2t_stats_dir}/train/${ref_txt}_shape" \
1274
+ awk -v N="$(<${token_list} wc -l)" '{ print $0 "," N }' \
1275
+ >"${s2t_stats_dir}/train/${ref_txt}_shape.${token_type}"
1276
+
1277
+ <"${s2t_stats_dir}/valid/${ref_txt}_shape" \
1278
+ awk -v N="$(<${token_list} wc -l)" '{ print $0 "," N }' \
1279
+ >"${s2t_stats_dir}/valid/${ref_txt}_shape.${token_type}"
1280
+ done
1281
+ fi
1282
+
1283
+
1284
+ if [ ${stage} -le 11 ] && [ ${stop_stage} -ge 11 ] && ! [[ " ${skip_stages} " =~ [[:space:]]11[[:space:]] ]]; then
1285
+ _s2t_train_dir="${data_feats}/${train_set}"
1286
+ _s2t_valid_dir="${data_feats}/${valid_set}"
1287
+ log "Stage 11: S2T Training: train_set=${_s2t_train_dir}, valid_set=${_s2t_valid_dir}"
1288
+
1289
+ _opts=
1290
+ if [ -n "${s2t_config}" ]; then
1291
+ # To generate the config file: e.g.
1292
+ # % python3 -m espnet2.bin.s2t_train --print_config --optim adam
1293
+ _opts+="--config ${s2t_config} "
1294
+ fi
1295
+
1296
+ _feats_type="$(<${_s2t_train_dir}/feats_type)"
1297
+ _audio_format="$(cat ${_s2t_train_dir}/audio_format 2>/dev/null || echo ${audio_format})"
1298
+ if [ "${_feats_type}" = raw ]; then
1299
+ _scp=wav.scp
1300
+ # "sound" supports "wav", "flac", etc.
1301
+ if [[ "${_audio_format}" == *ark* ]]; then
1302
+ _type=kaldi_ark
1303
+ elif [[ "${_audio_format}" == *multi* ]]; then
1304
+ _type=multi_columns_sound
1305
+ else
1306
+ _type=sound
1307
+ fi
1308
+ _fold_length="$((s2t_speech_fold_length * 100))"
1309
+ _opts+="--frontend_conf fs=${fs} "
1310
+ else
1311
+ _scp=feats.scp
1312
+ _type=kaldi_ark
1313
+ _fold_length="${s2t_speech_fold_length}"
1314
+ _input_size="$(<${_s2t_train_dir}/feats_dim)"
1315
+ _opts+="--input_size=${_input_size} "
1316
+
1317
+ fi
1318
+ if [ "${feats_normalize}" = global_mvn ]; then
1319
+ # The default normalization is utterance_mvn; switch to global_mvn here
1320
+ _opts+="--normalize=global_mvn --normalize_conf stats_file=${s2t_stats_dir}/train/feats_stats.npz "
1321
+ fi
1322
+
1323
+ if [ "${num_splits_s2t}" -gt 1 ]; then
1324
+ # If you run into a memory error when parsing text files, this option may help.
1325
+ # The corpus is split into subsets and each subset is used for training one by one in order,
1326
+ # so the memory footprint can be limited to the memory required for each dataset.
1327
+
1328
+ _split_dir="${s2t_stats_dir}/splits${num_splits_s2t}"
1329
+ _all_scps="${_s2t_train_dir}/${_scp} ${_s2t_train_dir}/text ${s2t_stats_dir}/train/speech_shape ${s2t_stats_dir}/train/text_shape.${token_type} "
1330
+ for extra_txt in ${utt_extra_files}; do
1331
+ _all_scps+="${_s2t_train_dir}/${extra_txt} ${s2t_stats_dir}/train/${extra_txt//./_}_shape.${token_type} "
1332
+ done
1333
+ if [ ! -f "${_split_dir}/.done" ]; then
1334
+ rm -f "${_split_dir}/.done"
1335
+ ${python} -m espnet2.bin.split_scps \
1336
+ --scps ${_all_scps} \
1337
+ --num_splits "${num_splits_s2t}" \
1338
+ --output_dir "${_split_dir}"
1339
+ touch "${_split_dir}/.done"
1340
+ else
1341
+ log "${_split_dir}/.done exists. Spliting is skipped"
1342
+ fi
1343
+
1344
+ _opts+="--train_data_path_and_name_and_type ${_split_dir}/${_scp},speech,${_type} "
1345
+ _opts+="--train_shape_file ${_split_dir}/speech_shape "
1346
+ # shellcheck disable=SC2068
1347
+ for extra_txt in ${utt_extra_files}; do
1348
+ _opts+="--fold_length ${s2t_text_fold_length} "
1349
+ _opts+="--train_data_path_and_name_and_type ${_split_dir}/${extra_txt},${extra_txt//./_},text "
1350
+ _opts+="--train_shape_file ${_split_dir}/${extra_txt//./_}_shape.${token_type} "
1351
+ done
1352
+ for i in ${!ref_text_names[@]}; do
1353
+ _opts+="--fold_length ${s2t_text_fold_length} "
1354
+ _opts+="--train_data_path_and_name_and_type ${_split_dir}/${ref_text_files[$i]},${ref_text_names[$i]},text "
1355
+ _opts+="--train_shape_file ${_split_dir}/${ref_text_names[$i]}_shape.${token_type} "
1356
+ done
1357
+ _opts+="--multiple_iterator true "
1358
+
1359
+ else
1360
+ _opts+="--train_data_path_and_name_and_type ${_s2t_train_dir}/${_scp},speech,${_type} "
1361
+ _opts+="--train_shape_file ${s2t_stats_dir}/train/speech_shape "
1362
+
1363
+ # shellcheck disable=SC2068
1364
+ for extra_txt in ${utt_extra_files}; do
1365
+ _opts+="--fold_length ${s2t_text_fold_length} "
1366
+ _opts+="--train_data_path_and_name_and_type ${_s2t_train_dir}/${extra_txt},${extra_txt//./_},text "
1367
+ _opts+="--train_shape_file ${s2t_stats_dir}/train/${extra_txt//./_}_shape.${token_type} "
1368
+ done
1369
+ for i in ${!ref_text_names[@]}; do
1370
+ _opts+="--fold_length ${s2t_text_fold_length} "
1371
+ _opts+="--train_data_path_and_name_and_type ${_s2t_train_dir}/${ref_text_files[$i]},${ref_text_names[$i]},text "
1372
+ _opts+="--train_shape_file ${s2t_stats_dir}/train/${ref_text_names[$i]}_shape.${token_type} "
1373
+ done
1374
+ fi
1375
+
1376
+ # shellcheck disable=SC2068
1377
+ for extra_txt in ${utt_extra_files}; do
1378
+ _opts+="--valid_data_path_and_name_and_type ${_s2t_valid_dir}/${extra_txt},${extra_txt//./_},text "
1379
+ _opts+="--valid_shape_file ${s2t_stats_dir}/valid/${extra_txt//./_}_shape.${token_type} "
1380
+ done
1381
+ for i in ${!ref_text_names[@]}; do
1382
+ _opts+="--valid_data_path_and_name_and_type ${_s2t_valid_dir}/${ref_text_files[$i]},${ref_text_names[$i]},text "
1383
+ _opts+="--valid_shape_file ${s2t_stats_dir}/valid/${ref_text_names[$i]}_shape.${token_type} "
1384
+ done
1385
+
1386
+ log "Generate '${s2t_exp}/run.sh'. You can resume the process from stage 11 using this script"
1387
+ mkdir -p "${s2t_exp}"; echo "${run_args} --stage 11 \"\$@\"; exit \$?" > "${s2t_exp}/run.sh"; chmod +x "${s2t_exp}/run.sh"
1388
+
1389
+ # NOTE(kamo): --fold_length is used only if --batch_type=folded and it's ignored in the other case
1390
+ log "S2T training started... log: '${s2t_exp}/train.log'"
1391
+ if echo "${cuda_cmd}" | grep -e queue.pl -e queue-freegpu.pl &> /dev/null; then
1392
+ # SGE can't include "/" in a job name
1393
+ jobname="$(basename ${s2t_exp})"
1394
+ else
1395
+ jobname="${s2t_exp}/train.log"
1396
+ fi
1397
+
1398
+ # shellcheck disable=SC2086
1399
+ ${python} -m espnet2.bin.launch \
1400
+ --cmd "${cuda_cmd} --name ${jobname}" \
1401
+ --log "${s2t_exp}"/train.log \
1402
+ --ngpu "${ngpu}" \
1403
+ --num_nodes "${num_nodes}" \
1404
+ --init_file_prefix "${s2t_exp}"/.dist_init_ \
1405
+ --multiprocessing_distributed true -- \
1406
+ ${python} -m espnet2.bin.${s2t_task}_train \
1407
+ --use_preprocessor true \
1408
+ --bpemodel "${bpemodel}" \
1409
+ --token_type "${token_type}" \
1410
+ --token_list "${token_list}" \
1411
+ --non_linguistic_symbols "${nlsyms_txt}" \
1412
+ --cleaner "${cleaner}" \
1413
+ --g2p "${g2p}" \
1414
+ --valid_data_path_and_name_and_type "${_s2t_valid_dir}/${_scp},speech,${_type}" \
1415
+ --valid_shape_file "${s2t_stats_dir}/valid/speech_shape" \
1416
+ --resume true \
1417
+ --fold_length "${_fold_length}" \
1418
+ --output_dir "${s2t_exp}" \
1419
+ ${_opts} ${s2t_args}
1420
+
1421
+ fi
1422
+
1423
+
1424
+ if [ -n "${download_model}" ]; then
1425
+ log "Use ${download_model} for decoding and evaluation"
1426
+ s2t_exp="${expdir}/${download_model}"
1427
+ mkdir -p "${s2t_exp}"
1428
+
1429
+ # If the model already exists, you can skip downloading
1430
+ espnet_model_zoo_download --unpack true "${download_model}" > "${s2t_exp}/config.txt"
1431
+
1432
+ # Get the path of each file
1433
+ _s2t_model_file=$(<"${s2t_exp}/config.txt" sed -e "s/.*'s2t_model_file': '\([^']*\)'.*$/\1/")
1434
+ _s2t_train_config=$(<"${s2t_exp}/config.txt" sed -e "s/.*'s2t_train_config': '\([^']*\)'.*$/\1/")
1435
+
1436
+ # Create symbolic links
1437
+ ln -sf "${_s2t_model_file}" "${s2t_exp}"
1438
+ ln -sf "${_s2t_train_config}" "${s2t_exp}"
1439
+ inference_s2t_model=$(basename "${_s2t_model_file}")
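+ # (For reference) espnet_model_zoo_download prints the unpacked file paths as a dict-like dump;
+ # the sed one-liners above extract the quoted 's2t_model_file' / 's2t_train_config' values from
+ # ${s2t_exp}/config.txt, and the symlinks make them visible under ${s2t_exp} for the later stages.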
1440
+
1441
+ if [ "$(<${s2t_exp}/config.txt grep -c lm_file)" -gt 0 ]; then
1442
+ _lm_file=$(<"${s2t_exp}/config.txt" sed -e "s/.*'lm_file': '\([^']*\)'.*$/\1/")
1443
+ _lm_train_config=$(<"${s2t_exp}/config.txt" sed -e "s/.*'lm_train_config': '\([^']*\)'.*$/\1/")
1444
+
1445
+ lm_exp="${expdir}/${download_model}/lm"
1446
+ mkdir -p "${lm_exp}"
1447
+
1448
+ ln -sf "${_lm_file}" "${lm_exp}"
1449
+ ln -sf "${_lm_train_config}" "${lm_exp}"
1450
+ inference_lm=$(basename "${_lm_file}")
1451
+ fi
1452
+
1453
+ fi
1454
+
1455
+
1456
+ if [ ${stage} -le 12 ] && [ ${stop_stage} -ge 12 ] && ! [[ " ${skip_stages} " =~ [[:space:]]12[[:space:]] ]]; then
1457
+ log "Stage 12: Decoding: training_dir=${s2t_exp}"
1458
+
1459
+ if ${gpu_inference}; then
1460
+ _cmd="${cuda_cmd}"
1461
+ _ngpu=1
1462
+ else
1463
+ _cmd="${decode_cmd}"
1464
+ _ngpu=0
1465
+ fi
1466
+
1467
+ _opts=
1468
+ if [ -n "${inference_config}" ]; then
1469
+ _opts+="--config ${inference_config} "
1470
+ fi
1471
+ if "${use_lm}"; then
1472
+ if "${use_word_lm}"; then
1473
+ _opts+="--word_lm_train_config ${lm_exp}/config.yaml "
1474
+ _opts+="--word_lm_file ${lm_exp}/${inference_lm} "
1475
+ else
1476
+ _opts+="--lm_train_config ${lm_exp}/config.yaml "
1477
+ _opts+="--lm_file ${lm_exp}/${inference_lm} "
1478
+ fi
1479
+ fi
1480
+ if "${use_ngram}"; then
1481
+ _opts+="--ngram_file ${ngram_exp}/${inference_ngram}"
1482
+ fi
1483
+
1484
+ # 2. Generate run.sh
1485
+ log "Generate '${s2t_exp}/${inference_tag}/run.sh'. You can resume the process from stage 12 using this script"
1486
+ mkdir -p "${s2t_exp}/${inference_tag}"; echo "${run_args} --stage 12 \"\$@\"; exit \$?" > "${s2t_exp}/${inference_tag}/run.sh"; chmod +x "${s2t_exp}/${inference_tag}/run.sh"
1487
+
1488
+ inference_bin_tag=""
1489
+ if "${use_streaming}"; then
1490
+ inference_bin_tag="_streaming"
1491
+ fi
1492
+
1493
+ if "${eval_valid_set}"; then
1494
+ _dsets="org/${valid_set} ${test_sets}"
1495
+ else
1496
+ _dsets="${test_sets}"
1497
+ fi
1498
+ for dset in ${_dsets}; do
1499
+ _data="${data_feats}/${dset}"
1500
+ _dir="${s2t_exp}/${inference_tag}/${dset}"
1501
+ _logdir="${_dir}/logdir"
1502
+ mkdir -p "${_logdir}"
1503
+
1504
+ _feats_type="$(<${_data}/feats_type)"
1505
+ _audio_format="$(cat ${_data}/audio_format 2>/dev/null || echo ${audio_format})"
1506
+ if [ "${_feats_type}" = raw ]; then
1507
+ _scp=wav.scp
1508
+ if [[ "${audio_format}" == *ark* ]]; then
1509
+ _type=kaldi_ark
1510
+ elif [[ "${_audio_format}" == *multi* ]]; then
1511
+ _type=multi_columns_sound
1512
+ else
1513
+ _type=sound
1514
+ fi
1515
+ else
1516
+ _scp=feats.scp
1517
+ _type=kaldi_ark
1518
+ fi
1519
+
1520
+ # 1. Split the key file
1521
+ key_file=${_data}/${_scp}
1522
+ split_scps=""
1523
+ _nj=$(min "${inference_nj}" "$(<${key_file} wc -l)")
1524
+
1525
+ for n in $(seq "${_nj}"); do
1526
+ split_scps+=" ${_logdir}/keys.${n}.scp"
1527
+ done
1528
+ # shellcheck disable=SC2086
1529
+ utils/split_scp.pl "${key_file}" ${split_scps}
1530
+
1531
+ # 2. Submit decoding jobs
1532
+ log "Decoding started... log: '${_logdir}/s2t_inference.*.log'"
1533
+ rm -f "${_logdir}/*.log"
1534
+ # shellcheck disable=SC2046,SC2086
1535
+ ${_cmd} --gpu "${_ngpu}" JOB=1:"${_nj}" "${_logdir}"/s2t_inference.JOB.log \
1536
+ ${python} -m espnet2.bin.${s2t_task}_inference${inference_bin_tag} \
1537
+ --batch_size ${batch_size} \
1538
+ --ngpu "${_ngpu}" \
1539
+ --data_path_and_name_and_type "${_data}/${_scp},speech,${_type}" \
1540
+ --key_file "${_logdir}"/keys.JOB.scp \
1541
+ --s2t_train_config "${s2t_exp}"/config.yaml \
1542
+ --s2t_model_file "${s2t_exp}"/"${inference_s2t_model}" \
1543
+ --output_dir "${_logdir}"/output.JOB \
1544
+ ${_opts} ${inference_args} || { cat $(grep -l -i error "${_logdir}"/s2t_inference.*.log) ; exit 1; }
1545
+
1546
+ # 3. Concatenate the output files from each job
1547
+ # shellcheck disable=SC2068
1548
+ for ref_txt in ${ref_text_files[@]}; do
1549
+ suffix=$(echo ${ref_txt} | sed 's/text//')
1550
+ for f in token token_int score text text_nospecial; do
1551
+ if [ -f "${_logdir}/output.1/1best_recog/${f}${suffix}" ]; then
1552
+ for i in $(seq "${_nj}"); do
1553
+ cat "${_logdir}/output.${i}/1best_recog/${f}${suffix}"
1554
+ done | sort -k1 >"${_dir}/${f}${suffix}"
1555
+ fi
1556
+ done
1557
+ done
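+ # (For reference) each decoding job writes its own output.${i}/1best_recog/ directory; the loops
+ # above concatenate the per-job token/score/text files and sort them by utterance id so the
+ # merged files in ${_dir} cover the whole set in a stable order.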
1558
+
1559
+ done
1560
+ fi
1561
+
1562
+
1563
+ if [ ${stage} -le 13 ] && [ ${stop_stage} -ge 13 ] && ! [[ " ${skip_stages} " =~ [[:space:]]13[[:space:]] ]]; then
1564
+ log "Stage 13: Scoring"
1565
+ if [ "${token_type}" = phn ]; then
1566
+ log "Error: Not implemented for token_type=phn"
1567
+ exit 1
1568
+ fi
1569
+
1570
+ if "${eval_valid_set}"; then
1571
+ _dsets="org/${valid_set} ${test_sets}"
1572
+ else
1573
+ _dsets="${test_sets}"
1574
+ fi
1575
+ for dset in ${_dsets}; do
1576
+ _data="${data_feats}/${dset}"
1577
+ _dir="${s2t_exp}/${inference_tag}/${dset}"
1578
+
1579
+ for _tok_type in "char" "word" "bpe"; do
1580
+ [ "${_tok_type}" = bpe ] && [ ! -f "${bpemodel}" ] && continue
1581
+
1582
+ _opts="--token_type ${_tok_type} "
1583
+ if [ "${_tok_type}" = "char" ] || [ "${_tok_type}" = "word" ]; then
1584
+ _type="${_tok_type:0:1}er"
1585
+ _opts+="--non_linguistic_symbols ${nlsyms_txt} "
1586
+ _opts+="--remove_non_linguistic_symbols true "
1587
+
1588
+ elif [ "${_tok_type}" = "bpe" ]; then
1589
+ _type="ter"
1590
+ _opts+="--bpemodel ${bpemodel} "
1591
+
1592
+ else
1593
+ log "Error: unsupported token type ${_tok_type}"
1594
+ fi
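+ # (For reference) "${_tok_type:0:1}er" maps "char"/"word" to "cer"/"wer", while the bpe branch
+ # uses "ter" (token error rate); each variant gets its own score_${_type} directory below.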
1595
+
1596
+ _scoredir="${_dir}/score_${_type}"
1597
+ mkdir -p "${_scoredir}"
1598
+
1599
+ # shellcheck disable=SC2068
1600
+ for ref_txt in ${ref_text_files[@]}; do
1601
+ # Note(simpleoier): to get the suffix after text, e.g. "text_spk1" -> "_spk1"
1602
+ suffix=$(echo ${ref_txt} | sed 's/text//')
1603
+
1604
+ # Tokenize text to ${_tok_type} level
1605
+ paste \
1606
+ <(<"${_data}/${ref_txt}" \
1607
+ ${python} -m espnet2.bin.tokenize_text \
1608
+ -f 2- --input - --output - \
1609
+ --cleaner "${cleaner}" \
1610
+ ${_opts} \
1611
+ ) \
1612
+ <(<"${_data}/utt2spk" awk '{ print "(" $2 "-" $1 ")" }') \
1613
+ >"${_scoredir}/ref${suffix:-${suffix}}.trn"
1614
+
1615
+ paste \
1616
+ <(<"${_dir}/${ref_txt}_nospecial" \
1617
+ ${python} -m espnet2.bin.tokenize_text \
1618
+ -f 2- --input - --output - \
1619
+ ${_opts} \
1620
+ --cleaner "${hyp_cleaner}" \
1621
+ ) \
1622
+ <(<"${_data}/utt2spk" awk '{ print "(" $2 "-" $1 ")" }') \
1623
+ >"${_scoredir}/hyp${suffix:-${suffix}}.trn"
1624
+
1625
+ done
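+ # (For reference) the paste commands build NIST-style "trn" files: each line is the tokenized
+ # reference or hypothesis followed by "(<speaker>-<utt_id>)", the format expected by sclite;
+ # the sclite call itself is left commented out below.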
1626
+
1627
+ #sclite \
1628
+ #${score_opts} \
1629
+ #-r "${_scoredir}/ref.trn" trn \
1630
+ #-h "${_scoredir}/hyp.trn" trn \
1631
+ #-i rm -o all stdout > "${_scoredir}/result.txt"
1632
+
1633
+ #log "Write ${_type} result in ${_scoredir}/result.txt"
1634
+ #grep -e Avg -e SPKR -m 2 "${_scoredir}/result.txt"
1635
+ done
1636
+ done
1637
+
1638
+ [ -f local/score.sh ] && local/score.sh ${local_score_opts} "${s2t_exp}"
1639
+
1640
+ # Show results in Markdown syntax
1641
+ scripts/utils/show_asr_result.sh "${s2t_exp}" > "${s2t_exp}"/RESULTS.md
1642
+ cat "${s2t_exp}"/RESULTS.md
1643
+
1644
+ fi
1645
+
1646
+
1647
+ packed_model="${s2t_exp}/${s2t_exp##*/}_${inference_s2t_model%.*}.zip"
1648
+ if [ ${stage} -le 14 ] && [ ${stop_stage} -ge 14 ] && ! [[ " ${skip_stages} " =~ [[:space:]]14[[:space:]] ]]; then
1649
+ log "Stage 14: Pack model: ${packed_model}"
1650
+
1651
+ _opts=
1652
+ if "${use_lm}"; then
1653
+ _opts+="--lm_train_config ${lm_exp}/config.yaml "
1654
+ _opts+="--lm_file ${lm_exp}/${inference_lm} "
1655
+ _opts+="--option ${lm_exp}/perplexity_test/ppl "
1656
+ _opts+="--option ${lm_exp}/images "
1657
+ fi
1658
+ if [ "${feats_normalize}" = global_mvn ]; then
1659
+ _opts+="--option ${s2t_stats_dir}/train/feats_stats.npz "
1660
+ fi
1661
+ if [ "${token_type}" = bpe ]; then
1662
+ _opts+="--option ${bpemodel} "
1663
+ fi
1664
+ if [ "${nlsyms_txt}" != none ]; then
1665
+ _opts+="--option ${nlsyms_txt} "
1666
+ fi
1667
+ # shellcheck disable=SC2086
1668
+ ${python} -m espnet2.bin.pack s2t \
1669
+ --s2t_train_config "${s2t_exp}"/config.yaml \
1670
+ --s2t_model_file "${s2t_exp}"/"${inference_s2t_model}" \
1671
+ ${_opts} \
1672
+ --option "${s2t_exp}"/RESULTS.md \
1673
+ --option "${s2t_exp}"/images \
1674
+ --outpath "${packed_model}"
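+ # (For reference) espnet2.bin.pack bundles the train config, the selected checkpoint, and every
+ # --option file (feature stats, BPE model, non-linguistic symbols, RESULTS.md, images, and the
+ # LM artifacts when used) into ${packed_model}, which stage 15 unpacks into the Hugging Face repo.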
1675
+ fi
1676
+
1677
+ if [ ${stage} -le 15 ] && [ ${stop_stage} -ge 15 ] && ! [[ " ${skip_stages} " =~ [[:space:]]15[[:space:]] ]]; then
1678
+ [ -z "${hf_repo}" ] && \
1679
+ log "ERROR: You need to setup the variable hf_repo with the name of the repository located at HuggingFace, follow the following steps described here https://github.com/espnet/espnet/blob/master/CONTRIBUTING.md#132-espnet2-recipes" && \
1680
+ exit 1
1681
+ log "Stage 15: Upload model to HuggingFace: ${hf_repo}"
1682
+
1683
+ if [ ! -f "${packed_model}" ]; then
1684
+ log "ERROR: ${packed_model} does not exist. Please run stage 14 first."
1685
+ exit 1
1686
+ fi
1687
+
1688
+ gitlfs=$(git lfs --version 2> /dev/null || true)
1689
+ [ -z "${gitlfs}" ] && \
1690
+ log "ERROR: You need to install git-lfs first" && \
1691
+ exit 1
1692
+
1693
+ dir_repo=${expdir}/hf_${hf_repo//"/"/"_"}
1694
+ [ ! -d "${dir_repo}" ] && git clone https://huggingface.co/${hf_repo} ${dir_repo}
1695
+
1696
+ if command -v git &> /dev/null; then
1697
+ _creator_name="$(git config user.name)"
1698
+ _checkout="git checkout $(git show -s --format=%H)"
1699
+ else
1700
+ _creator_name="$(whoami)"
1701
+ _checkout=""
1702
+ fi
1703
+ # /some/where/espnet/egs2/foo/s2t1/ -> foo/s2t1
1704
+ _task="$(pwd | rev | cut -d/ -f2 | rev)"
1705
+ # foo/s2t1 -> foo
1706
+ _corpus="${_task%/*}"
1707
+ _model_name="${_creator_name}/${_corpus}_$(basename ${packed_model} .zip)"
1708
+
1709
+ # copy files in ${dir_repo}
1710
+ unzip -o ${packed_model} -d ${dir_repo}
1711
+ # Generate description file
1712
+ # shellcheck disable=SC2034
1713
+ hf_task=automatic-speech-recognition
1714
+ # shellcheck disable=SC2034
1715
+ espnet_task=S2T
1716
+ # shellcheck disable=SC2034
1717
+ task_exp=${s2t_exp}
1718
+ eval "echo \"$(cat scripts/utils/TEMPLATE_HF_Readme.md)\"" > "${dir_repo}"/README.md
1719
+
1720
+ this_folder=${PWD}
1721
+ cd ${dir_repo}
1722
+ if [ -n "$(git status --porcelain)" ]; then
1723
+ git add .
1724
+ git commit -m "Update model"
1725
+ fi
1726
+ git push
1727
+ cd ${this_folder}
1728
+ fi
1729
+
1730
+ log "Successfully finished. [elapsed=${SECONDS}s]"