|
#!/bin/bash |
|
|
|
set -eu |
|
|
|
w2v_dir= |
|
lab_dir= |
|
out_dir= |
|
arpa_lm= |
|
arpa_lm_bin= |
|
|
|
label=phnc |
|
train_name="train" |
|
valid_name="valid" |
|
data_dir=${out_dir}/data |
|
|
|
mkdir -p ${out_dir}/exp |
|
local/prepare_lang.sh $w2v_dir/dict.${label}.txt $data_dir |
|
local/prepare_lm.sh $arpa_lm $data_dir |
|
|
|
for x in $train_name $valid_name; do |
|
x_gt=${x}_gt |
|
|
|
|
|
python local/prepare_data_from_w2v.py $w2v_dir $data_dir $x |
|
steps/compute_cmvn_stats.sh $data_dir/$x $out_dir/exp/make_feat/$x $out_dir/feats/$x |
|
python local/copy_aligned_text.py < $lab_dir/$x.txt > $data_dir/$x/text |
|
|
|
|
|
mkdir $data_dir/$x_gt |
|
cp $data_dir/$x/{feats.scp,cmvn.scp,utt2spk,spk2utt} $data_dir/$x_gt/ |
|
python local/copy_aligned_text.py < $w2v_dir/$x.$label > $data_dir/$x_gt/text |
|
done |
|
|
|
local/train_subset_lgbeam.sh \ |
|
--out_root ${out_dir} --out_name exp --train $train_name --valid $valid_name \ |
|
--mono_size 2000 --tri1_size 5000 --tri2b_size -1 --tri3b_size -1 \ |
|
--stage 1 --max_stage 3 $data_dir $data_dir/lang $data_dir/lang_test |
|
|
|
local/unsup_select_decode.sh \ |
|
--split $valid_name --kenlm_path $arpa_lm_bin \ |
|
--ref_txt $data_dir/${valid_name}_gt/text \ |
|
--psd_txt $data_dir/${valid_name}/text \ |
|
$out_dir/exp |
|
|