Spaces:
Runtime error
Runtime error
File size: 1,915 Bytes
62e9ca6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 |
#!/bin/bash
[ ${PWD##*/} != SpeechLM ] && echo "Error: dir not match! Switch to SpeechLM/ and run it again!" && exit 1
[ $# -lt 1 ] && echo "Usage: $0 <lang> [root=${PWD}/dataset/CommonVoice/v4]" && exit 0
cwd=${PWD}
src=${PWD}/speechlm/data_process
lang=$1
root=$2
[ -z $root ] && root="${PWD}/dataset/CommonVoice/v4"
set -e -o pipefail -u
### step1, convert mp3 to wav
cd $root/en && mkdir -p wav
cut -f2 validated.tsv | sed '1d' | sed "s|^|${root}/en/clips/|" > validated.id
for i in $(seq 0 39); do
echo extracting $i;
python $src/covost2/mp3_to_wav.py -i validated.id -n 40 -r $i &
done
wait
cd $cwd
### step2, manifest
datadir="$root/en/en-$lang" && mkdir -p $datadir && cd $datadir
python /mnt/default/v-ziqzhang/code/stpretrain_scripts/data_process/covost2/prepare_covost_data.py --data-root $root --src-lang en --tgt-lang $lang --vocab-type char
mv ../*en_${lang}.* ./
# adjust config_base_en${lang}.yaml
echo "bpe_tokenizer:" > config_base_en${lang}.yaml
echo " bpe: sentencepiece" >> config_base_en${lang}.yaml
echo " sentencepiece_model: spm_char_st_en_de.model" >> config_base_en${lang}.yaml
echo "" >> config_base_en${lang}.yaml
echo "shuffle: false" >> config_base_en${lang}.yaml
echo "use_audio_input: true" >> config_base_en${lang}.yaml
echo "use_sample_rate: 16000" >> config_base_en${lang}.yaml
echo "standardize_audio: false" >> config_base_en${lang}.yaml
echo "vocab_filename: spm_char_st_en_de.txt" >> config_base_en${lang}.yaml
echo "" >> config_base_en${lang}.yaml
echo "# required by speech_to_text task but never used" >> config_base_en${lang}.yaml
echo "input_channels: 1" >> config_base_en${lang}.yaml
echo "input_feat_per_channel: 1" >> config_base_en${lang}.yaml
echo "" >> config_base_en${lang}.yaml
# adjust config_large_en${lang}.yaml
cat config_base_en${lang}.yaml | sed "s|standardize_audio: false|standardize_audio: true|" > config_large_en${lang}.yaml
|