# Please log in to Hugging Face first
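# A minimal login sketch (assumes the huggingface_hub CLI is installed):
#   huggingface-cli login            # interactive token prompt
# or, non-interactively:
#   export HF_TOKEN=<your_hf_token>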

MLS_WAV_DIR='' # path to the downloaded MLS wavs
LIBRITTSRMIX_WAV_DIR='' # path to the downloaded LibriTTS-R-mix wavs
GIGASPEECH_WAV_DIR='' # path to the downloaded GigaSpeech wavs
COMMONVOICE_WAV_DIR='' # path to the downloaded CommonVoice wavs
EMILIA_WAV_DIR='' # path to the downloaded Emilia wavs
OUTPUT_DIR="./output_pretraining/" # output dir for checkpoints
TEMPORARY_SAVE_TO_DISK="./audio_code_pretraining/" # dir for cached DAC codec codes
SAVE_TO_DISK="./dataset_pretraining/" # dir for the saved Hugging Face dataset metadata
WANDB_KEY='' # your wandb API key for logging
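
# Optional sanity check (a sketch; assumes all five wav dirs must exist before launch):
for d in "$MLS_WAV_DIR" "$LIBRITTSRMIX_WAV_DIR" "$GIGASPEECH_WAV_DIR" "$COMMONVOICE_WAV_DIR" "$EMILIA_WAV_DIR"; do
    [ -d "$d" ] || { echo "Missing wav dir: '$d'" >&2; exit 1; }
done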

# Debugging aids: CUDA_LAUNCH_BLOCKING makes kernel launches synchronous so errors
# surface at the failing call; TORCH_USE_CUDA_DSA requests device-side assertions
# (only effective in PyTorch builds compiled with DSA support). Both slow training,
# so consider unsetting them for full-speed runs.
export CUDA_LAUNCH_BLOCKING=1
export TORCH_USE_CUDA_DSA=1
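
# Assumption: `accelerate` is already configured for this machine. If not, run
# `accelerate config` once beforehand, or pass the topology inline, e.g.:
#   accelerate launch --num_processes 8 --mixed_precision fp16 ./training/run_parler_tts_training.py ...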

accelerate launch ./training/run_parler_tts_training.py \
    --model_name_or_path "parler-tts/parler-tts-mini-v1" \
    --feature_extractor_name "parler-tts/dac_44khZ_8kbps" \
    --description_tokenizer_name "google/flan-t5-large" \
    --prompt_tokenizer_name "google/flan-t5-large" \
    --report_to "wandb" \
    --wandb_key "${WANDB_KEY}" \
    --overwrite_output_dir true \
    --train_dataset_name "OpenSound/CapSpeech" \
    --train_split_name "train_PT" \
    --eval_dataset_name "OpenSound/CapSpeech" \
    --eval_split_name "validation_PT" \
    --mls_dir "${MLS_WAV_DIR}" \
    --librittsrmix_dir "${LIBRITTSRMIX_WAV_DIR}" \
    --gigaspeech_dir "${GIGASPEECH_WAV_DIR}" \
    --commonvoice_dir "${COMMONVOICE_WAV_DIR}" \
    --emilia_dir "${EMILIA_WAV_DIR}" \
    --max_eval_samples 96 \
    --target_audio_column_name "audio_path" \
    --description_column_name "caption" \
    --source_column_name "source" \
    --prompt_column_name "text" \
    --max_duration_in_seconds 20 \
    --min_duration_in_seconds 3 \
    --max_text_length 600 \
    --preprocessing_num_workers 32 \
    --do_train true \
    --num_train_epochs 10 \
    --gradient_accumulation_steps 6 \
    --gradient_checkpointing false \
    --per_device_train_batch_size 4 \
    --learning_rate 0.001 \
    --adam_beta1 0.9 \
    --adam_beta2 0.99 \
    --weight_decay 0.01 \
    --lr_scheduler_type "constant_with_warmup" \
    --warmup_steps 5000 \
    --logging_steps 200 \
    --freeze_text_encoder false \
    --per_device_eval_batch_size 4 \
    --audio_encoder_per_device_batch_size 24 \
    --dtype "float16" \
    --seed 456 \
    --output_dir "${OUTPUT_DIR}" \
    --temporary_save_to_disk "${TEMPORARY_SAVE_TO_DISK}" \
    --save_to_disk "${SAVE_TO_DISK}" \
    --dataloader_num_workers 32 \
    --do_eval \
    --evaluation_strategy steps \
    --eval_steps 5000 \
    --save_steps 5000 \
    --group_by_length true