|
--- |
|
tags: |
|
- espnet |
|
- audio |
|
- self-supervised-learning |
|
language: en |
|
datasets: |
|
- librispeech |
|
license: cc-by-4.0 |
|
--- |
|
|
|
## ESPnet2 SSL model |
|
|
|
### `espnet/hubert_large_gs_16_librilight60k` |
|
|
|
This model was trained by wanchichen using the librispeech recipe in [espnet](https://github.com/espnet/espnet/). |
|
|
|
### Demo: How to use in ESPnet2 |
|
|
|
Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html) if you haven't done that already. |
|
|
|
```bash |
|
cd espnet |
|
|
|
pip install -e . |
|
cd egs2/librispeech/ssl1 |
|
./run.sh --skip_data_prep false --skip_train true --download_model espnet/hubert_large_gs_16_librilight60k |
|
``` |
|
|
|
|
|
|
|
## SSL config |
|
|
|
<details><summary>expand</summary> |
|
|
|
``` |
|
config: conf/tuning/train_ssl_torchaudiohubert_large_960h_pretrain_it2_bins.yaml |
|
print_config: false |
|
log_level: INFO |
|
dry_run: false |
|
iterator_type: sequence |
|
output_dir: exp/hubert_iter1_train_ssl_torchaudiohubert_large_960h_pretrain_it2_bins_raw |
|
ngpu: 1 |
|
seed: 0 |
|
num_workers: 16 |
|
num_att_plot: 3 |
|
dist_backend: nccl |
|
dist_init_method: env:// |
|
dist_world_size: 8 |
|
dist_rank: 0 |
|
local_rank: 0 |
|
dist_master_addr: localhost |
|
dist_master_port: 55415 |
|
dist_launcher: null |
|
multiprocessing_distributed: true |
|
unused_parameters: true |
|
sharded_ddp: false |
|
cudnn_enabled: true |
|
cudnn_benchmark: false |
|
cudnn_deterministic: true |
|
collect_stats: false |
|
write_collected_feats: false |
|
max_epoch: 190 |
|
patience: null |
|
val_scheduler_criterion: |
|
- valid |
|
- loss |
|
early_stopping_criterion: |
|
- valid |
|
- loss |
|
- min |
|
best_model_criterion: |
|
-   - valid |
|
    - loss |
|
    - min |
|
keep_nbest_models: 10 |
|
nbest_averaging_interval: 0 |
|
grad_clip: 5.0 |
|
grad_clip_type: 2.0 |
|
grad_noise: false |
|
accum_grad: 16 |
|
no_forward_run: false |
|
resume: true |
|
train_dtype: float32 |
|
use_amp: true |
|
log_interval: null |
|
use_matplotlib: true |
|
use_tensorboard: true |
|
create_graph_in_tensorboard: false |
|
use_wandb: false |
|
wandb_project: null |
|
wandb_id: null |
|
wandb_entity: null |
|
wandb_name: null |
|
wandb_model_log_interval: -1 |
|
detect_anomaly: false |
|
pretrain_path: null |
|
init_param: [] |
|
ignore_init_mismatch: false |
|
freeze_param: [] |
|
num_iters_per_epoch: 40000 |
|
batch_size: 20 |
|
valid_batch_size: null |
|
batch_bins: 100000 |
|
valid_batch_bins: null |
|
train_shape_file: |
|
- exp/hubert_iter1_stats_raw/splits16/speech_shape |
|
- exp/hubert_iter1_stats_raw/splits16/text_shape.word |
|
valid_shape_file: |
|
- exp/hubert_iter1_stats_raw/valid/speech_shape |
|
- exp/hubert_iter1_stats_raw/valid/text_shape.word |
|
batch_type: numel |
|
valid_batch_type: null |
|
fold_length: |
|
- 80000 |
|
- 400 |
|
sort_in_batch: descending |
|
sort_batch: descending |
|
multiple_iterator: true |
|
chunk_length: 500 |
|
chunk_shift_ratio: 0.5 |
|
num_cache_chunks: 1024 |
|
train_data_path_and_name_and_type: |
|
- - exp/hubert_iter1_stats_raw/splits16/wav.scp |
|
- speech |
|
- sound |
|
- - exp/hubert_iter1_stats_raw/splits16/text.km.kmeans_iter1_hubert_train_60k_portion0.1_gigaspeech |
|
- text |
|
- text |
|
valid_data_path_and_name_and_type: |
|
- - dump/raw/dev/wav.scp |
|
- speech |
|
- sound |
|
- - dump/raw/dev/text.km.kmeans_iter1_hubert_train_60k_portion0.1_gigaspeech |
|
- text |
|
- text |
|
allow_variable_data_keys: false |
|
max_cache_size: 0.0 |
|
max_cache_fd: 32 |
|
valid_max_cache_size: null |
|
exclude_weight_decay: false |
|
exclude_weight_decay_conf: {} |
|
optim: adam |
|
optim_conf: |
|
    lr: 0.0005 |
|
scheduler: warmuplr |
|
scheduler_conf: |
|
    warmup_steps: 32000 |
|
token_list: |
|
- '6' |
|
- '185' |
|
- '233' |
|
- '206' |
|
- '200' |
|
- '47' |
|
- '129' |
|
- '362' |
|
- '436' |
|
- '50' |
|
- '30' |
|
- '137' |
|
- '39' |
|
- '126' |
|
- '81' |
|
- '78' |
|
- '444' |
|
- '439' |
|
- '230' |
|
- '33' |
|
- '14' |
|
- '61' |
|
- '450' |
|
- '239' |
|
- '293' |
|
- '161' |
|
- '410' |
|
- '355' |
|
- '262' |
|
- '475' |
|
- '338' |
|
- '201' |
|
- '242' |
|
- '318' |
|
- '159' |
|
- '56' |
|
- '190' |
|
- '21' |
|
- '259' |
|
- '458' |
|
- '187' |
|
- '268' |
|
- '7' |
|
- '398' |
|
- '67' |
|
- '11' |
|
- '455' |
|
- '241' |
|
- '4' |
|
- '261' |
|
- '412' |
|
- '388' |
|
- '402' |
|
- '414' |
|
- '180' |
|
- '41' |
|
- '198' |
|
- '100' |
|
- '459' |
|
- '96' |
|
- '235' |
|
- '267' |
|
- '203' |
|
- '189' |
|
- '60' |
|
- '160' |
|
- '64' |
|
- '300' |
|
- '295' |
|
- '473' |
|
- '37' |
|
- '130' |
|
- '77' |
|
- '424' |
|
- '369' |
|
- '217' |
|
- '366' |
|
- '3' |
|
- '82' |
|
- '149' |
|
- '330' |
|
- '79' |
|
- '351' |
|
- '223' |
|
- '446' |
|
- '269' |
|
- '148' |
|
- '186' |
|
- '110' |
|
- '62' |
|
- '471' |
|
- '316' |
|
- '433' |
|
- '127' |
|
- '354' |
|
- '243' |
|
- '457' |
|
- '240' |
|
- '375' |
|
- '46' |
|
- '40' |
|
- '339' |
|
- '224' |
|
- '183' |
|
- '179' |
|
- '357' |
|
- '430' |
|
- '83' |
|
- '49' |
|
- '154' |
|
- '237' |
|
- '460' |
|
- '353' |
|
- '289' |
|
- '92' |
|
- '109' |
|
- '311' |
|
- '71' |
|
- '391' |
|
- '406' |
|
- '43' |
|
- '73' |
|
- '418' |
|
- '437' |
|
- '250' |
|
- '463' |
|
- '120' |
|
- '346' |
|
- '146' |
|
- '454' |
|
- '211' |
|
- '274' |
|
- '167' |
|
- '345' |
|
- '10' |
|
- '68' |
|
- '348' |
|
- '244' |
|
- '102' |
|
- '474' |
|
- '192' |
|
- '144' |
|
- '112' |
|
- '25' |
|
- '449' |
|
- '308' |
|
- '405' |
|
- '48' |
|
- '212' |
|
- '205' |
|
- '124' |
|
- '153' |
|
- '9' |
|
- '5' |
|
- '258' |
|
- '306' |
|
- '80' |
|
- '394' |
|
- '328' |
|
- '208' |
|
- '166' |
|
- '36' |
|
- '352' |
|
- '18' |
|
- '397' |
|
- '66' |
|
- '31' |
|
- '16' |
|
- '426' |
|
- '332' |
|
- '23' |
|
- '281' |
|
- '215' |
|
- '88' |
|
- '171' |
|
- '221' |
|
- '184' |
|
- '202' |
|
- '470' |
|
- '247' |
|
- '38' |
|
- '389' |
|
- '315' |
|
- '197' |
|
- '349' |
|
- '304' |
|
- '393' |
|
- '380' |
|
- '132' |
|
- '456' |
|
- '367' |
|
- '479' |
|
- '360' |
|
- '123' |
|
- '162' |
|
- '365' |
|
- '337' |
|
- '467' |
|
- '234' |
|
- '364' |
|
- '376' |
|
- '173' |
|
- '478' |
|
- '425' |
|
- '218' |
|
- '297' |
|
- '469' |
|
- '282' |
|
- '298' |
|
- '451' |
|
- '20' |
|
- '117' |
|
- '52' |
|
- '113' |
|
- '165' |
|
- '280' |
|
- '292' |
|
- '226' |
|
- '104' |
|
- '55' |
|
- '145' |
|
- '286' |
|
- '86' |
|
- '294' |
|
- '15' |
|
- '216' |
|
- '279' |
|
- '275' |
|
- '253' |
|
- '312' |
|
- '378' |
|
- '287' |
|
- '76' |
|
- '168' |
|
- '116' |
|
- '368' |
|
- '396' |
|
- '336' |
|
- '290' |
|
- '53' |
|
- '103' |
|
- '0' |
|
- '411' |
|
- '228' |
|
- '408' |
|
- '285' |
|
- '151' |
|
- '325' |
|
- '193' |
|
- '428' |
|
- '401' |
|
- '320' |
|
- '182' |
|
- '480' |
|
- '264' |
|
- '383' |
|
- '114' |
|
- '115' |
|
- '374' |
|
- '141' |
|
- '22' |
|
- '466' |
|
- '384' |
|
- '174' |
|
- '59' |
|
- '326' |
|
- '105' |
|
- '232' |
|
- '464' |
|
- '251' |
|
- '24' |
|
- '172' |
|
- '150' |
|
- '299' |
|
- '89' |
|
- '344' |
|
- '427' |
|
- '333' |
|
- '434' |
|
- '107' |
|
- '291' |
|
- '194' |
|
- '497' |
|
- '452' |
|
- '317' |
|
- '254' |
|
- '213' |
|
- '499' |
|
- '483' |
|
- '432' |
|
- '95' |
|
- '321' |
|
- '111' |
|
- '8' |
|
- '175' |
|
- '277' |
|
- '65' |
|
- '342' |
|
- '382' |
|
- '301' |
|
- '45' |
|
- '443' |
|
- '63' |
|
- '93' |
|
- '489' |
|
- '74' |
|
- '387' |
|
- '370' |
|
- '340' |
|
- '358' |
|
- '220' |
|
- '429' |
|
- '2' |
|
- '331' |
|
- '181' |
|
- '32' |
|
- '324' |
|
- '191' |
|
- '238' |
|
- '313' |
|
- '157' |
|
- '91' |
|
- '101' |
|
- '118' |
|
- '350' |
|
- '356' |
|
- '486' |
|
- '188' |
|
- '142' |
|
- '419' |
|
- '195' |
|
- '164' |
|
- '487' |
|
- '255' |
|
- '323' |
|
- '222' |
|
- '35' |
|
- '245' |
|
- '359' |
|
- '249' |
|
- '98' |
|
- '271' |
|
- '231' |
|
- '125' |
|
- '29' |
|
- '34' |
|
- '119' |
|
- '134' |
|
- '284' |
|
- '309' |
|
- '409' |
|
- '422' |
|
- '147' |
|
- '484' |
|
- '462' |
|
- '390' |
|
- '440' |
|
- '283' |
|
- '84' |
|
- '108' |
|
- '139' |
|
- '170' |
|
- '303' |
|
- '371' |
|
- '381' |
|
- '278' |
|
- '329' |
|
- '28' |
|
- '87' |
|
- '403' |
|
- '256' |
|
- '441' |
|
- '334' |
|
- '12' |
|
- '260' |
|
- '265' |
|
- '69' |
|
- '122' |
|
- '488' |
|
- '99' |
|
- '42' |
|
- '302' |
|
- '97' |
|
- '70' |
|
- '152' |
|
- '177' |
|
- '138' |
|
- '296' |
|
- '51' |
|
- '491' |
|
- '199' |
|
- '176' |
|
- '204' |
|
- '169' |
|
- '386' |
|
- '494' |
|
- '400' |
|
- '341' |
|
- '229' |
|
- '273' |
|
- '485' |
|
- '135' |
|
- '227' |
|
- '54' |
|
- '314' |
|
- '343' |
|
- '477' |
|
- '465' |
|
- '482' |
|
- '257' |
|
- '435' |
|
- '423' |
|
- '121' |
|
- '496' |
|
- '448' |
|
- '453' |
|
- '85' |
|
- '57' |
|
- '276' |
|
- '210' |
|
- '272' |
|
- '236' |
|
- '407' |
|
- '445' |
|
- '90' |
|
- '266' |
|
- '490' |
|
- '307' |
|
- '155' |
|
- '136' |
|
- '19' |
|
- '319' |
|
- '498' |
|
- '163' |
|
- '75' |
|
- '442' |
|
- '495' |
|
- '421' |
|
- '209' |
|
- '361' |
|
- '156' |
|
- '395' |
|
- '472' |
|
- '415' |
|
- '347' |
|
- '252' |
|
- '468' |
|
- '476' |
|
- '106' |
|
- '143' |
|
- '263' |
|
- '373' |
|
- '327' |
|
- '322' |
|
- '399' |
|
- '404' |
|
- '13' |
|
- '288' |
|
- '207' |
|
- '58' |
|
- '481' |
|
- '131' |
|
- '385' |
|
- '447' |
|
- '219' |
|
- '438' |
|
- '461' |
|
- '416' |
|
- '246' |
|
- '417' |
|
- '26' |
|
- '158' |
|
- '431' |
|
- '270' |
|
- '128' |
|
- '413' |
|
- '310' |
|
- '140' |
|
- '17' |
|
- '392' |
|
- '44' |
|
- '27' |
|
- '214' |
|
- '377' |
|
- '305' |
|
- '72' |
|
- '420' |
|
- '133' |
|
- '363' |
|
- '379' |
|
- '94' |
|
- '225' |
|
- '335' |
|
- '493' |
|
- '492' |
|
- '372' |
|
- '196' |
|
- '248' |
|
- '178' |
|
- '1' |
|
- <unk> |
|
- <sos/eos> |
|
init: null |
|
collate_fn_conf: |
|
    label_downsampling: 1 |
|
    pad: false |
|
    rand_crop: true |
|
input_size: 1 |
|
num_classes: 500 |
|
use_preprocessor: true |
|
token_type: word |
|
bpemodel: null |
|
non_linguistic_symbols: null |
|
cleaner: null |
|
g2p: null |
|
speech_volume_normalize: null |
|
rir_scp: null |
|
rir_apply_prob: 1.0 |
|
noise_scp: null |
|
noise_apply_prob: 1.0 |
|
noise_db_range: '13_15' |
|
pred_masked_weight: 1.0 |
|
pred_nomask_weight: 0.0 |
|
loss_weights: 0.0 |
|
frontend: null |
|
frontend_conf: {} |
|
specaug: null |
|
specaug_conf: {} |
|
normalize: null |
|
normalize_conf: {} |
|
preencoder: null |
|
preencoder_conf: {} |
|
encoder: torchaudio_hubert |
|
encoder_conf: |
|
encoder_projection_dropout: 0.0 |
|
encoder_attention_dropout: 0.0 |
|
encoder_ff_interm_dropout: 0.0 |
|
encoder_dropout: 0.0 |
|
encoder_layer_drop: 0.0 |
|
extractor_mode: layer_norm |
|
encoder_embed_dim: 1024 |
|
encoder_num_layers: 24 |
|
encoder_num_heads: 16 |
|
encoder_ff_interm_features: 4096 |
|
encoder_layer_norm_first: true |
|
normalize_feats: true |
|
final_dim: 768 |
|
model: torchaudio |
|
model_conf: {} |
|
required: |
|
- output_dir |
|
- token_list |
|
version: '202301' |
|
distributed: true |
|
``` |
|
|
|
</details> |
|
|
|
|
|
|
|
### Citing ESPnet |
|
|
|
```bibtex |
|
@inproceedings{watanabe2018espnet, |
|
author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai}, |
|
title={{ESPnet}: End-to-End Speech Processing Toolkit}, |
|
year={2018}, |
|
booktitle={Proceedings of Interspeech}, |
|
pages={2207--2211}, |
|
doi={10.21437/Interspeech.2018-1456}, |
|
url={http://dx.doi.org/10.21437/Interspeech.2018-1456} |
|
} |
|
|
|
|
|
|
|
|
|
``` |
|
|
|
or arXiv: |
|
|
|
```bibtex |
|
@misc{watanabe2018espnet, |
|
title={ESPnet: End-to-End Speech Processing Toolkit}, |
|
author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai}, |
|
year={2018}, |
|
eprint={1804.00015}, |
|
archivePrefix={arXiv}, |
|
primaryClass={cs.CL} |
|
} |
|
``` |
|
|