ssl-aasist · PyTorch · custom_code

Commit 7b2dad9 · verified · 1 parent: 66a0dab
ash56 committed:

Add files using upload-large-folder tool

This view is limited to 50 files because the commit contains too many changes. See the raw diff for the remaining files.
Files changed (50)
  1. fairseq/examples/__pycache__/__init__.cpython-310.pyc +0 -0
  2. fairseq/examples/wav2vec/unsupervised/config/timit_matched/test.uid +192 -0
  3. fairseq/examples/wav2vec/unsupervised/config/timit_unmatched/train_text.uid +1000 -0
  4. fairseq/examples/wav2vec/unsupervised/config/timit_unmatched/valid.uid +620 -0
  5. fairseq/examples/wav2vec/unsupervised/data/__init__.py +13 -0
  6. fairseq/examples/wav2vec/unsupervised/data/extracted_features_dataset.py +167 -0
  7. fairseq/examples/wav2vec/unsupervised/kaldi_self_train/st/cmd.sh +15 -0
  8. fairseq/examples/wav2vec/unsupervised/kaldi_self_train/st/local/prepare_lang.sh +37 -0
  9. fairseq/examples/wav2vec/unsupervised/kaldi_self_train/st/local/prepare_lm.sh +35 -0
  10. fairseq/examples/wav2vec/unsupervised/kaldi_self_train/st/local/score.sh +63 -0
  11. fairseq/examples/wav2vec/unsupervised/kaldi_self_train/st/local/show_wer.sh +52 -0
  12. fairseq/examples/wav2vec/unsupervised/kaldi_self_train/st/local/train_subset_lgbeam.sh +129 -0
  13. fairseq/examples/wav2vec/unsupervised/kaldi_self_train/st/local/unsup_select.py +135 -0
  14. fairseq/examples/wav2vec/unsupervised/kaldi_self_train/st/local/unsup_select_decode.sh +37 -0
  15. fairseq/examples/wav2vec/unsupervised/kaldi_self_train/st/local/unsup_select_decode_word.sh +35 -0
  16. fairseq/examples/wav2vec/unsupervised/kaldi_self_train/st/steps_gan/train_deltas.sh +175 -0
  17. fairseq/examples/wav2vec/unsupervised/kaldi_self_train/st/steps_gan/train_lda_mllt.sh +239 -0
  18. fairseq/examples/wav2vec/unsupervised/kaldi_self_train/st/steps_gan/train_sat.sh +281 -0
  19. fairseq/examples/wav2vec/unsupervised/models/__init__.py +11 -0
  20. fairseq/examples/wav2vec/unsupervised/models/wav2vec_u.py +687 -0
  21. fairseq/examples/wav2vec/unsupervised/scripts/apply_pca.py +76 -0
  22. fairseq/examples/wav2vec/unsupervised/scripts/copy_labels.py +10 -0
  23. fairseq/examples/wav2vec/unsupervised/scripts/filter_lexicon.py +40 -0
  24. fairseq/examples/wav2vec/unsupervised/scripts/filter_tsv.py +37 -0
  25. fairseq/examples/wav2vec/unsupervised/scripts/g2p_wrd_to_phn.py +45 -0
  26. fairseq/examples/wav2vec/unsupervised/scripts/ltr_to_wrd.py +16 -0
  27. fairseq/examples/wav2vec/unsupervised/scripts/mean_pool.py +99 -0
  28. fairseq/examples/wav2vec/unsupervised/scripts/merge_clusters.py +114 -0
  29. fairseq/examples/wav2vec/unsupervised/scripts/normalize_and_filter_text.py +72 -0
  30. fairseq/examples/wav2vec/unsupervised/scripts/normalize_text.py +22 -0
  31. fairseq/examples/wav2vec/unsupervised/scripts/pca.py +53 -0
  32. fairseq/examples/wav2vec/unsupervised/scripts/phonemize_with_sil.py +83 -0
  33. fairseq/examples/wav2vec/unsupervised/scripts/prepare_audio.sh +78 -0
  34. fairseq/examples/wav2vec/unsupervised/scripts/prepare_audio_v2.sh +68 -0
  35. fairseq/examples/wav2vec/unsupervised/scripts/prepare_text.sh +83 -0
  36. fairseq/examples/wav2vec/unsupervised/scripts/prepare_timit.sh +79 -0
  37. fairseq/examples/wav2vec/unsupervised/scripts/remove_silence.py +63 -0
  38. fairseq/examples/wav2vec/unsupervised/scripts/vads.py +98 -0
  39. fairseq/examples/wav2vec/unsupervised/scripts/wav2vec_apply_cluster_faiss.py +128 -0
  40. fairseq/examples/wav2vec/unsupervised/scripts/wav2vec_cluster_faiss.py +210 -0
  41. fairseq/examples/wav2vec/unsupervised/scripts/wav2vec_extract_features.py +119 -0
  42. fairseq/examples/wav2vec/unsupervised/scripts/wer.py +82 -0
  43. fairseq/examples/wav2vec/unsupervised/scripts/wrd_to_ltr.py +16 -0
  44. fairseq/examples/wav2vec/unsupervised/tasks/__init__.py +11 -0
  45. fairseq/examples/wav2vec/unsupervised/tasks/unpaired_audio_text.py +452 -0
  46. fairseq/examples/wav2vec/xlsr/README.md +95 -0
  47. fairseq/examples/wav2vec/xlsr/config/finetune.yaml +66 -0
  48. fairseq/examples/wav2vec/xlsr/scripts/eval_speaker_clf_task.py +173 -0
  49. fairseq/examples/wav2vec/xlsr/scripts/gen_audio_embedding.py +222 -0
  50. fairseq/examples/wmt19/README.md +85 -0
fairseq/examples/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (298 Bytes).
 
fairseq/examples/wav2vec/unsupervised/config/timit_matched/test.uid ADDED
@@ -0,0 +1,192 @@
1
+ FDHC0_SI1559
2
+ FDHC0_SI2189
3
+ FDHC0_SI929
4
+ FDHC0_SX119
5
+ FDHC0_SX209
6
+ FDHC0_SX29
7
+ FDHC0_SX299
8
+ FDHC0_SX389
9
+ FELC0_SI1386
10
+ FELC0_SI2016
11
+ FELC0_SI756
12
+ FELC0_SX126
13
+ FELC0_SX216
14
+ FELC0_SX306
15
+ FELC0_SX36
16
+ FELC0_SX396
17
+ FJLM0_SI1043
18
+ FJLM0_SI1673
19
+ FJLM0_SI2303
20
+ FJLM0_SX143
21
+ FJLM0_SX233
22
+ FJLM0_SX323
23
+ FJLM0_SX413
24
+ FJLM0_SX53
25
+ FMGD0_SI1564
26
+ FMGD0_SI2194
27
+ FMGD0_SI934
28
+ FMGD0_SX124
29
+ FMGD0_SX214
30
+ FMGD0_SX304
31
+ FMGD0_SX34
32
+ FMGD0_SX394
33
+ FMLD0_SI2185
34
+ FMLD0_SI822
35
+ FMLD0_SI925
36
+ FMLD0_SX115
37
+ FMLD0_SX205
38
+ FMLD0_SX25
39
+ FMLD0_SX295
40
+ FMLD0_SX385
41
+ FNLP0_SI1308
42
+ FNLP0_SI1938
43
+ FNLP0_SI678
44
+ FNLP0_SX138
45
+ FNLP0_SX228
46
+ FNLP0_SX318
47
+ FNLP0_SX408
48
+ FNLP0_SX48
49
+ FPAS0_SI1272
50
+ FPAS0_SI2204
51
+ FPAS0_SI944
52
+ FPAS0_SX134
53
+ FPAS0_SX224
54
+ FPAS0_SX314
55
+ FPAS0_SX404
56
+ FPAS0_SX44
57
+ FPKT0_SI1538
58
+ FPKT0_SI2168
59
+ FPKT0_SI908
60
+ FPKT0_SX188
61
+ FPKT0_SX278
62
+ FPKT0_SX368
63
+ FPKT0_SX8
64
+ FPKT0_SX98
65
+ MBPM0_SI1577
66
+ MBPM0_SI1584
67
+ MBPM0_SI947
68
+ MBPM0_SX137
69
+ MBPM0_SX227
70
+ MBPM0_SX317
71
+ MBPM0_SX407
72
+ MBPM0_SX47
73
+ MCMJ0_SI1094
74
+ MCMJ0_SI464
75
+ MCMJ0_SI602
76
+ MCMJ0_SX104
77
+ MCMJ0_SX14
78
+ MCMJ0_SX194
79
+ MCMJ0_SX284
80
+ MCMJ0_SX374
81
+ MDAB0_SI1039
82
+ MDAB0_SI1669
83
+ MDAB0_SI2299
84
+ MDAB0_SX139
85
+ MDAB0_SX229
86
+ MDAB0_SX319
87
+ MDAB0_SX409
88
+ MDAB0_SX49
89
+ MGRT0_SI1450
90
+ MGRT0_SI2080
91
+ MGRT0_SI820
92
+ MGRT0_SX10
93
+ MGRT0_SX100
94
+ MGRT0_SX190
95
+ MGRT0_SX280
96
+ MGRT0_SX370
97
+ MJDH0_SI1354
98
+ MJDH0_SI1984
99
+ MJDH0_SI724
100
+ MJDH0_SX184
101
+ MJDH0_SX274
102
+ MJDH0_SX364
103
+ MJDH0_SX4
104
+ MJDH0_SX94
105
+ MJLN0_SI1449
106
+ MJLN0_SI2079
107
+ MJLN0_SI819
108
+ MJLN0_SX189
109
+ MJLN0_SX279
110
+ MJLN0_SX369
111
+ MJLN0_SX9
112
+ MJLN0_SX99
113
+ MJMP0_SI1535
114
+ MJMP0_SI1791
115
+ MJMP0_SI905
116
+ MJMP0_SX185
117
+ MJMP0_SX275
118
+ MJMP0_SX365
119
+ MJMP0_SX5
120
+ MJMP0_SX95
121
+ MKLT0_SI1213
122
+ MKLT0_SI1843
123
+ MKLT0_SI583
124
+ MKLT0_SX133
125
+ MKLT0_SX223
126
+ MKLT0_SX313
127
+ MKLT0_SX403
128
+ MKLT0_SX43
129
+ MLLL0_SI1363
130
+ MLLL0_SI1993
131
+ MLLL0_SI733
132
+ MLLL0_SX103
133
+ MLLL0_SX13
134
+ MLLL0_SX193
135
+ MLLL0_SX283
136
+ MLLL0_SX373
137
+ MLNT0_SI1574
138
+ MLNT0_SI1902
139
+ MLNT0_SI642
140
+ MLNT0_SX102
141
+ MLNT0_SX12
142
+ MLNT0_SX192
143
+ MLNT0_SX282
144
+ MLNT0_SX372
145
+ MNJM0_SI1580
146
+ MNJM0_SI2210
147
+ MNJM0_SI950
148
+ MNJM0_SX140
149
+ MNJM0_SX230
150
+ MNJM0_SX320
151
+ MNJM0_SX410
152
+ MNJM0_SX50
153
+ MPAM0_SI1189
154
+ MPAM0_SI1819
155
+ MPAM0_SI1961
156
+ MPAM0_SX109
157
+ MPAM0_SX19
158
+ MPAM0_SX199
159
+ MPAM0_SX289
160
+ MPAM0_SX379
161
+ MTAS1_SI1473
162
+ MTAS1_SI2098
163
+ MTAS1_SI838
164
+ MTAS1_SX118
165
+ MTAS1_SX208
166
+ MTAS1_SX28
167
+ MTAS1_SX298
168
+ MTAS1_SX388
169
+ MTLS0_SI1370
170
+ MTLS0_SI2000
171
+ MTLS0_SI740
172
+ MTLS0_SX110
173
+ MTLS0_SX20
174
+ MTLS0_SX200
175
+ MTLS0_SX290
176
+ MTLS0_SX380
177
+ MWBT0_SI1553
178
+ MWBT0_SI2183
179
+ MWBT0_SI923
180
+ MWBT0_SX113
181
+ MWBT0_SX203
182
+ MWBT0_SX23
183
+ MWBT0_SX293
184
+ MWBT0_SX383
185
+ MWEW0_SI1361
186
+ MWEW0_SI1991
187
+ MWEW0_SI731
188
+ MWEW0_SX101
189
+ MWEW0_SX11
190
+ MWEW0_SX191
191
+ MWEW0_SX281
192
+ MWEW0_SX371
fairseq/examples/wav2vec/unsupervised/config/timit_unmatched/train_text.uid ADDED
@@ -0,0 +1,1000 @@
1
+ FAEM0_SI762
2
+ FAEM0_SX42
3
+ FAJW0_SA1
4
+ FAJW0_SX3
5
+ FAJW0_SX93
6
+ FALK0_SX186
7
+ FALK0_SX6
8
+ FALR0_SI1325
9
+ FBAS0_SA1
10
+ FBAS0_SX217
11
+ FBCG1_SA1
12
+ FBCG1_SX172
13
+ FBCG1_SX442
14
+ FBCH0_SX236
15
+ FBCH0_SX416
16
+ FBLV0_SA1
17
+ FBLV0_SI1058
18
+ FBLV0_SX338
19
+ FBLV0_SX68
20
+ FBMH0_SA1
21
+ FBMJ0_SI815
22
+ FCAG0_SA1
23
+ FCAG0_SX153
24
+ FCAG0_SX243
25
+ FCAJ0_SI1479
26
+ FCAJ0_SX309
27
+ FCDR1_SX106
28
+ FCDR1_SX196
29
+ FCEG0_SA2
30
+ FCJF0_SA1
31
+ FCJF0_SX127
32
+ FCJS0_SI1607
33
+ FCJS0_SI2237
34
+ FCJS0_SX257
35
+ FCKE0_SA2
36
+ FCKE0_SX121
37
+ FCLT0_SI2068
38
+ FCLT0_SX448
39
+ FCLT0_SX88
40
+ FCMG0_SA2
41
+ FCMG0_SI1872
42
+ FCMG0_SX72
43
+ FCMM0_SA1
44
+ FCMM0_SA2
45
+ FCMM0_SX183
46
+ FCRZ0_SI2053
47
+ FCRZ0_SX433
48
+ FCYL0_SA1
49
+ FCYL0_SX37
50
+ FDAS1_SI2091
51
+ FDAS1_SX201
52
+ FDAS1_SX381
53
+ FDAW0_SI1406
54
+ FDFB0_SA1
55
+ FDFB0_SA2
56
+ FDFB0_SI2010
57
+ FDFB0_SX58
58
+ FDJH0_SX305
59
+ FDML0_SA2
60
+ FDML0_SX159
61
+ FDML0_SX249
62
+ FDML0_SX429
63
+ FDMY0_SA2
64
+ FDMY0_SX27
65
+ FDNC0_SX198
66
+ FDNC0_SX288
67
+ FDTD0_SX211
68
+ FDXW0_SA1
69
+ FDXW0_SX251
70
+ FDXW0_SX341
71
+ FDXW0_SX71
72
+ FEAC0_SX165
73
+ FEAC0_SX75
74
+ FEAR0_SI622
75
+ FECD0_SX68
76
+ FEEH0_SA1
77
+ FEEH0_SI1742
78
+ FEEH0_SI471
79
+ FEEH0_SX122
80
+ FEME0_SA1
81
+ FEME0_SX155
82
+ FEME0_SX65
83
+ FETB0_SA1
84
+ FETB0_SI1148
85
+ FETB0_SX158
86
+ FEXM0_SI1101
87
+ FGCS0_SX136
88
+ FGCS0_SX226
89
+ FGCS0_SX316
90
+ FGCS0_SX406
91
+ FGDP0_SA1
92
+ FGMB0_SI1775
93
+ FGMB0_SX245
94
+ FHLM0_SX390
95
+ FHXS0_SA2
96
+ FHXS0_SX445
97
+ FJDM2_SA1
98
+ FJDM2_SX232
99
+ FJDM2_SX52
100
+ FJHK0_SX302
101
+ FJKL0_SX212
102
+ FJKL0_SX392
103
+ FJLG0_SI2306
104
+ FJLR0_SA1
105
+ FJRP1_SI2062
106
+ FJRP1_SX82
107
+ FJSK0_SA1
108
+ FJSP0_SX264
109
+ FJSP0_SX354
110
+ FJSP0_SX444
111
+ FJWB1_SA1
112
+ FJWB1_SX345
113
+ FJWB1_SX435
114
+ FJXM0_SA1
115
+ FJXM0_SI581
116
+ FJXM0_SX401
117
+ FJXP0_SA1
118
+ FJXP0_SI1122
119
+ FJXP0_SX132
120
+ FKAA0_SX128
121
+ FKAA0_SX398
122
+ FKDE0_SA1
123
+ FKDE0_SX151
124
+ FKDE0_SX241
125
+ FKDE0_SX421
126
+ FKDE0_SX61
127
+ FKDW0_SX397
128
+ FKFB0_SA2
129
+ FKFB0_SX348
130
+ FKFB0_SX78
131
+ FKKH0_SA1
132
+ FKKH0_SA2
133
+ FKKH0_SX120
134
+ FKKH0_SX390
135
+ FKLC0_SX355
136
+ FKLC1_SI2308
137
+ FKLC1_SX238
138
+ FKLC1_SX328
139
+ FKLC1_SX418
140
+ FKLH0_SA2
141
+ FKLH0_SX177
142
+ FKSR0_SA1
143
+ FKSR0_SA2
144
+ FKSR0_SI1747
145
+ FKSR0_SI487
146
+ FKSR0_SX217
147
+ FLAC0_SX451
148
+ FLAG0_SA2
149
+ FLAG0_SX114
150
+ FLAG0_SX204
151
+ FLAG0_SX24
152
+ FLAG0_SX384
153
+ FLEH0_SI1681
154
+ FLEH0_SI2311
155
+ FLEH0_SX331
156
+ FLET0_SA1
157
+ FLHD0_SI1827
158
+ FLHD0_SX354
159
+ FLJA0_SA1
160
+ FLJA0_SI2338
161
+ FLJD0_SI886
162
+ FLJD0_SX76
163
+ FLJG0_SA2
164
+ FLKM0_SA2
165
+ FLKM0_SI686
166
+ FLKM0_SX260
167
+ FLKM0_SX80
168
+ FLMA0_SA1
169
+ FLMA0_SI613
170
+ FLMA0_SX433
171
+ FLMA0_SX73
172
+ FLMC0_SX22
173
+ FLMK0_SI1035
174
+ FLMK0_SX315
175
+ FLMK0_SX405
176
+ FLOD0_SI1917
177
+ FLOD0_SX117
178
+ FLOD0_SX171
179
+ FLOD0_SX297
180
+ FLTM0_SA1
181
+ FLTM0_SI1070
182
+ FLTM0_SI2330
183
+ FMAH1_SA2
184
+ FMAH1_SX159
185
+ FMBG0_SA2
186
+ FMBG0_SI2264
187
+ FMEM0_SI747
188
+ FMEM0_SX387
189
+ FMJB0_SI547
190
+ FMJB0_SX97
191
+ FMJF0_SA2
192
+ FMJU0_SX309
193
+ FMJU0_SX399
194
+ FMKC0_SI1702
195
+ FMKC0_SX442
196
+ FMKC0_SX82
197
+ FMKF0_SX186
198
+ FMPG0_SA2
199
+ FNKL0_SI1522
200
+ FNTB0_SI1203
201
+ FNTB0_SI573
202
+ FNTB0_SX303
203
+ FPAB1_SI1471
204
+ FPAB1_SX211
205
+ FPAC0_SA2
206
+ FPAD0_SA2
207
+ FPAD0_SX356
208
+ FPAD0_SX86
209
+ FPAF0_SA2
210
+ FPAF0_SX154
211
+ FPAZ0_SA1
212
+ FPAZ0_SA2
213
+ FPAZ0_SX243
214
+ FPJF0_SA1
215
+ FPJF0_SX146
216
+ FPJF0_SX56
217
+ FPLS0_SI1590
218
+ FPLS0_SX330
219
+ FPMY0_SA1
220
+ FPMY0_SX343
221
+ FREH0_SA1
222
+ FREH0_SA2
223
+ FREH0_SX415
224
+ FRJB0_SX347
225
+ FRLL0_SX434
226
+ FSAG0_SA1
227
+ FSAG0_SX243
228
+ FSAH0_SA1
229
+ FSAH0_SA2
230
+ FSAH0_SX164
231
+ FSAH0_SX434
232
+ FSBK0_SA2
233
+ FSBK0_SI1069
234
+ FSBK0_SX169
235
+ FSCN0_SA2
236
+ FSCN0_SI626
237
+ FSCN0_SX266
238
+ FSCN0_SX446
239
+ FSCN0_SX86
240
+ FSDC0_SA2
241
+ FSDC0_SX142
242
+ FSDC0_SX322
243
+ FSDC0_SX52
244
+ FSDJ0_SI485
245
+ FSDJ0_SX215
246
+ FSDJ0_SX305
247
+ FSDJ0_SX395
248
+ FSGF0_SX117
249
+ FSJG0_SX130
250
+ FSJK1_SA2
251
+ FSJK1_SX125
252
+ FSJK1_SX35
253
+ FSJS0_SX181
254
+ FSJW0_SI1963
255
+ FSJW0_SX433
256
+ FSKC0_SI1416
257
+ FSKC0_SI786
258
+ FSKC0_SX246
259
+ FSKL0_SI1529
260
+ FSKL0_SX449
261
+ FSKP0_SA2
262
+ FSLS0_SX156
263
+ FSLS0_SX426
264
+ FSMA0_SA2
265
+ FSMA0_SX181
266
+ FSMM0_SX144
267
+ FSMM0_SX234
268
+ FSMS1_SX244
269
+ FSMS1_SX347
270
+ FSPM0_SA2
271
+ FSPM0_SX161
272
+ FSPM0_SX71
273
+ FSRH0_SI1931
274
+ FSRH0_SI671
275
+ FSRH0_SX221
276
+ FSRH0_SX401
277
+ FTAJ0_SI699
278
+ FTAJ0_SX159
279
+ FTAJ0_SX249
280
+ FTAJ0_SX429
281
+ FTBR0_SX21
282
+ FTBW0_SA1
283
+ FTMG0_SI1532
284
+ FTMG0_SI2162
285
+ FTMG0_SX452
286
+ FVFB0_SA2
287
+ FVFB0_SX132
288
+ FVFB0_SX42
289
+ FVKB0_SA1
290
+ FVMH0_SA2
291
+ FVMH0_SX116
292
+ FVMH0_SX26
293
+ MABC0_SI1620
294
+ MABC0_SI2041
295
+ MABC0_SI781
296
+ MADC0_SX107
297
+ MADC0_SX377
298
+ MADD0_SA2
299
+ MADD0_SI1295
300
+ MADD0_SX178
301
+ MADD0_SX268
302
+ MADD0_SX88
303
+ MAEB0_SX450
304
+ MAEO0_SA1
305
+ MAFM0_SI939
306
+ MAFM0_SX129
307
+ MAFM0_SX309
308
+ MAJP0_SA2
309
+ MAKB0_SI1646
310
+ MAKB0_SX26
311
+ MAKB0_SX386
312
+ MAKR0_SX362
313
+ MAKR0_SX92
314
+ MAPV0_SX213
315
+ MARC0_SA2
316
+ MARC0_SX108
317
+ MARC0_SX18
318
+ MARC0_SX198
319
+ MARW0_SI1906
320
+ MBAR0_SA1
321
+ MBAR0_SX419
322
+ MBAR0_SX59
323
+ MBBR0_SI2315
324
+ MBBR0_SX65
325
+ MBCG0_SA1
326
+ MBCG0_SI486
327
+ MBEF0_SI1281
328
+ MBEF0_SI1911
329
+ MBEF0_SI651
330
+ MBEF0_SX21
331
+ MBEF0_SX381
332
+ MBGT0_SA2
333
+ MBGT0_SX261
334
+ MBGT0_SX351
335
+ MBGT0_SX441
336
+ MBJV0_SA1
337
+ MBJV0_SI617
338
+ MBJV0_SX347
339
+ MBMA0_SI592
340
+ MBMA0_SX232
341
+ MBMA0_SX52
342
+ MBMA1_SI2214
343
+ MBMA1_SX54
344
+ MBML0_SA2
345
+ MBML0_SI1169
346
+ MBML0_SX89
347
+ MBOM0_SA2
348
+ MBOM0_SI2274
349
+ MBOM0_SX294
350
+ MBSB0_SA1
351
+ MBSB0_SX3
352
+ MBTH0_SA2
353
+ MBTH0_SX122
354
+ MBTH0_SX32
355
+ MCAE0_SX277
356
+ MCAL0_SA2
357
+ MCAL0_SI1768
358
+ MCDC0_SA1
359
+ MCDC0_SX212
360
+ MCDD0_SA2
361
+ MCDD0_SI883
362
+ MCDD0_SX253
363
+ MCDD0_SX433
364
+ MCDR0_SI1154
365
+ MCEF0_SX235
366
+ MCEF0_SX415
367
+ MCEW0_SA2
368
+ MCHL0_SX87
369
+ MCLK0_SX310
370
+ MCLM0_SA1
371
+ MCLM0_SI2086
372
+ MCLM0_SI826
373
+ MCPM0_SA1
374
+ MCPM0_SX114
375
+ MCPM0_SX294
376
+ MCPM0_SX384
377
+ MCSS0_SI750
378
+ MCTH0_SA1
379
+ MCTH0_SX39
380
+ MCXM0_SX91
381
+ MDAC0_SA1
382
+ MDAC0_SX181
383
+ MDAC0_SX361
384
+ MDAS0_SX6
385
+ MDBB1_SX106
386
+ MDBB1_SX16
387
+ MDBB1_SX376
388
+ MDBP0_SX168
389
+ MDCD0_SI1415
390
+ MDCD0_SX245
391
+ MDCD0_SX425
392
+ MDCM0_SX40
393
+ MDCM0_SX400
394
+ MDDC0_SI2049
395
+ MDDC0_SI789
396
+ MDDC0_SX159
397
+ MDDC0_SX69
398
+ MDED0_SA1
399
+ MDED0_SA2
400
+ MDEF0_SX123
401
+ MDEF0_SX303
402
+ MDHL0_SI1439
403
+ MDHL0_SX269
404
+ MDHL0_SX449
405
+ MDHS0_SA1
406
+ MDHS0_SA2
407
+ MDHS0_SI1530
408
+ MDHS0_SI2160
409
+ MDJM0_SX105
410
+ MDJM0_SX15
411
+ MDKS0_SX436
412
+ MDLB0_SA2
413
+ MDLC0_SX405
414
+ MDLC1_SA2
415
+ MDLC1_SI2065
416
+ MDLC1_SI2144
417
+ MDLC1_SX445
418
+ MDLC2_SI2244
419
+ MDLC2_SX354
420
+ MDLH0_SA2
421
+ MDLM0_SI1234
422
+ MDLM0_SI1864
423
+ MDLM0_SX154
424
+ MDLM0_SX424
425
+ MDLR0_SA1
426
+ MDLR0_SA2
427
+ MDLR0_SI1863
428
+ MDLR0_SI603
429
+ MDLR0_SX153
430
+ MDLR1_SA1
431
+ MDLR1_SA2
432
+ MDMA0_SI1430
433
+ MDMA0_SX260
434
+ MDMA0_SX80
435
+ MDMT0_SA1
436
+ MDMT0_SA2
437
+ MDMT0_SI1832
438
+ MDMT0_SX122
439
+ MDMT0_SX32
440
+ MDNS0_SA2
441
+ MDNS0_SI2271
442
+ MDNS0_SX201
443
+ MDNS0_SX21
444
+ MDPB0_SX416
445
+ MDPK0_SI1053
446
+ MDPK0_SX333
447
+ MDPK0_SX423
448
+ MDPS0_SI719
449
+ MDPS0_SX359
450
+ MDRD0_SA1
451
+ MDRD0_SX32
452
+ MDSJ0_SI2092
453
+ MDSS0_SA2
454
+ MDSS0_SX441
455
+ MDSS1_SA1
456
+ MDSS1_SI1327
457
+ MDSS1_SI697
458
+ MDSS1_SX157
459
+ MDSS1_SX67
460
+ MDTB0_SI1200
461
+ MDTB0_SI1830
462
+ MDTB0_SX120
463
+ MDWD0_SA2
464
+ MDWD0_SX270
465
+ MDWD0_SX90
466
+ MDWH0_SX215
467
+ MDWH0_SX305
468
+ MDWM0_SA1
469
+ MDWM0_SA2
470
+ MDWM0_SX16
471
+ MDWM0_SX286
472
+ MEAL0_SA2
473
+ MEAL0_SI2177
474
+ MEAL0_SX107
475
+ MEAL0_SX347
476
+ MEDR0_SA1
477
+ MEDR0_SA2
478
+ MEDR0_SI1374
479
+ MEFG0_SA1
480
+ MEGJ0_SA2
481
+ MEGJ0_SX257
482
+ MEGJ0_SX3
483
+ MEJL0_SA1
484
+ MEJL0_SX152
485
+ MEJL0_SX242
486
+ MEJS0_SI610
487
+ MEJS0_SX160
488
+ MEJS0_SX340
489
+ MESG0_SX432
490
+ MESJ0_SX187
491
+ MESJ0_SX97
492
+ MEWM0_SI718
493
+ MEWM0_SX178
494
+ MEWM0_SX88
495
+ MFER0_SI862
496
+ MFER0_SX142
497
+ MFRM0_SX345
498
+ MFRM0_SX435
499
+ MFWK0_SI1879
500
+ MFWK0_SX169
501
+ MFXS0_SX54
502
+ MFXV0_SA2
503
+ MFXV0_SX105
504
+ MGAF0_SA1
505
+ MGAF0_SX22
506
+ MGAF0_SX382
507
+ MGAG0_SA2
508
+ MGAK0_SX226
509
+ MGAK0_SX46
510
+ MGAR0_SX132
511
+ MGAW0_SI535
512
+ MGAW0_SX175
513
+ MGES0_SA1
514
+ MGES0_SI2111
515
+ MGES0_SI851
516
+ MGJC0_SA2
517
+ MGJC0_SX75
518
+ MGRL0_SI2127
519
+ MGRL0_SI867
520
+ MGRL0_SX147
521
+ MGRP0_SA2
522
+ MGSH0_SA2
523
+ MGSH0_SI1806
524
+ MGSH0_SX127
525
+ MGSH0_SX276
526
+ MGSH0_SX6
527
+ MGSL0_SA1
528
+ MGSL0_SI534
529
+ MGSL0_SX264
530
+ MGXP0_SX187
531
+ MGXP0_SX7
532
+ MHBS0_SX315
533
+ MHBS0_SX45
534
+ MHIT0_SA1
535
+ MHJB0_SA1
536
+ MHJB0_SI1017
537
+ MHMG0_SX195
538
+ MHMR0_SA1
539
+ MHMR0_SI489
540
+ MHRM0_SA1
541
+ MHRM0_SI958
542
+ MHRM0_SX148
543
+ MHRM0_SX58
544
+ MHXL0_SI1772
545
+ MHXL0_SX242
546
+ MILB0_SA2
547
+ MJAC0_SX307
548
+ MJAC0_SX71
549
+ MJAE0_SX174
550
+ MJAI0_SA1
551
+ MJAI0_SA2
552
+ MJBG0_SX62
553
+ MJDA0_SI1031
554
+ MJDA0_SX311
555
+ MJDE0_SI463
556
+ MJDG0_SA2
557
+ MJDG0_SI1042
558
+ MJDG0_SI1705
559
+ MJDM0_SA1
560
+ MJDM0_SI974
561
+ MJEB0_SI656
562
+ MJEB0_SX296
563
+ MJEB1_SA2
564
+ MJEB1_SX207
565
+ MJEB1_SX387
566
+ MJEE0_SA1
567
+ MJEE0_SX247
568
+ MJEE0_SX337
569
+ MJFH0_SA2
570
+ MJFH0_SI1107
571
+ MJFR0_SX75
572
+ MJHI0_SA1
573
+ MJHI0_SX158
574
+ MJJB0_SA1
575
+ MJJB0_SX239
576
+ MJJJ0_SX443
577
+ MJJM0_SA2
578
+ MJJM0_SI827
579
+ MJJM0_SX107
580
+ MJKR0_SA1
581
+ MJKR0_SI571
582
+ MJLB0_SX176
583
+ MJLG1_SX292
584
+ MJLS0_SX106
585
+ MJMA0_SA1
586
+ MJMA0_SA2
587
+ MJMD0_SA2
588
+ MJMD0_SX308
589
+ MJMD0_SX38
590
+ MJMM0_SX85
591
+ MJPG0_SI1191
592
+ MJPG0_SX111
593
+ MJPG0_SX201
594
+ MJPG0_SX21
595
+ MJPM0_SA2
596
+ MJPM0_SX378
597
+ MJPM1_SI2280
598
+ MJPM1_SX401
599
+ MJRA0_SA1
600
+ MJRA0_SA2
601
+ MJRA0_SI1236
602
+ MJRA0_SI1866
603
+ MJRA0_SX426
604
+ MJRG0_SI1366
605
+ MJRG0_SI1996
606
+ MJRG0_SX376
607
+ MJRH0_SX225
608
+ MJRH1_SA1
609
+ MJRH1_SI514
610
+ MJRH1_SX154
611
+ MJRH1_SX244
612
+ MJRH1_SX424
613
+ MJRK0_SA1
614
+ MJRK0_SA2
615
+ MJRK0_SI1662
616
+ MJRK0_SX160
617
+ MJRK0_SX250
618
+ MJRK0_SX430
619
+ MJRP0_SA1
620
+ MJRP0_SA2
621
+ MJRP0_SX225
622
+ MJSR0_SA1
623
+ MJSR0_SI1424
624
+ MJSR0_SX344
625
+ MJWG0_SA1
626
+ MJWG0_SX265
627
+ MJWS0_SI513
628
+ MJWS0_SX153
629
+ MJWS0_SX63
630
+ MJWT0_SA1
631
+ MJWT0_SX121
632
+ MJWT0_SX211
633
+ MJWT0_SX301
634
+ MJWT0_SX31
635
+ MJWT0_SX391
636
+ MJXA0_SX427
637
+ MJXL0_SI542
638
+ MKAG0_SA1
639
+ MKAG0_SX259
640
+ MKAJ0_SA2
641
+ MKAJ0_SX154
642
+ MKAM0_SA1
643
+ MKAM0_SX146
644
+ MKAM0_SX326
645
+ MKAM0_SX56
646
+ MKDB0_SA1
647
+ MKDB0_SA2
648
+ MKDB0_SX152
649
+ MKDD0_SA2
650
+ MKES0_SA1
651
+ MKES0_SI1253
652
+ MKES0_SI1883
653
+ MKES0_SX173
654
+ MKJO0_SI1517
655
+ MKJO0_SI887
656
+ MKJO0_SX437
657
+ MKLN0_SI968
658
+ MKLN0_SX248
659
+ MKLR0_SA2
660
+ MKLR0_SI1689
661
+ MKLS0_SA1
662
+ MKLS0_SX357
663
+ MKLS0_SX87
664
+ MKLS1_SA1
665
+ MKLS1_SA2
666
+ MKLS1_SX375
667
+ MKLW0_SA1
668
+ MKRG0_SX411
669
+ MKXL0_SA2
670
+ MKXL0_SX15
671
+ MKXL0_SX375
672
+ MLBC0_SA1
673
+ MLBC0_SI1869
674
+ MLBC0_SX249
675
+ MLEL0_SA1
676
+ MLEL0_SA2
677
+ MLEL0_SI1246
678
+ MLEL0_SX256
679
+ MLEL0_SX436
680
+ MLJC0_SX145
681
+ MLJC0_SX415
682
+ MLJH0_SX64
683
+ MLNS0_SI2037
684
+ MMAA0_SA1
685
+ MMAA0_SA2
686
+ MMAA0_SX35
687
+ MMAB1_SI1494
688
+ MMAB1_SX234
689
+ MMAG0_SA2
690
+ MMAG0_SI1126
691
+ MMAG0_SX316
692
+ MMAM0_SI2227
693
+ MMAM0_SX157
694
+ MMAM0_SX427
695
+ MMAR0_SX256
696
+ MMBS0_SI1781
697
+ MMCC0_SA2
698
+ MMDB0_SX177
699
+ MMDG0_SA1
700
+ MMDG0_SA2
701
+ MMDG0_SI520
702
+ MMDG0_SX160
703
+ MMDG0_SX250
704
+ MMDM0_SI1941
705
+ MMDM0_SI681
706
+ MMDM0_SX141
707
+ MMDM1_SA2
708
+ MMDM1_SI2043
709
+ MMDM1_SX423
710
+ MMDM1_SX63
711
+ MMDS0_SA1
712
+ MMEA0_SA1
713
+ MMEA0_SX128
714
+ MMEA0_SX398
715
+ MMEB0_SA2
716
+ MMEB0_SX187
717
+ MMEB0_SX367
718
+ MMGC0_SA2
719
+ MMGC0_SX135
720
+ MMGC0_SX225
721
+ MMGG0_SX269
722
+ MMGK0_SX332
723
+ MMGK0_SX62
724
+ MMJB1_SA2
725
+ MMRP0_SA2
726
+ MMRP0_SX144
727
+ MMSM0_SX116
728
+ MMSM0_SX206
729
+ MMVP0_SA1
730
+ MMVP0_SA2
731
+ MMWB0_SI989
732
+ MMWB0_SX89
733
+ MMWS0_SA2
734
+ MMWS0_SX168
735
+ MMWS0_SX348
736
+ MMWS0_SX438
737
+ MMWS1_SI1701
738
+ MMXS0_SI2136
739
+ MMXS0_SX246
740
+ MMXS0_SX426
741
+ MNET0_SI816
742
+ MNET0_SX6
743
+ MNTW0_SA2
744
+ MNTW0_SX168
745
+ MNTW0_SX78
746
+ MPAR0_SI2206
747
+ MPAR0_SI946
748
+ MPAR0_SX136
749
+ MPAR0_SX316
750
+ MPEB0_SI1034
751
+ MPEB0_SI1860
752
+ MPEB0_SX240
753
+ MPEB0_SX330
754
+ MPFU0_SI628
755
+ MPFU0_SX448
756
+ MPGH0_SX114
757
+ MPGH0_SX24
758
+ MPGR0_SX240
759
+ MPGR0_SX330
760
+ MPGR1_SX149
761
+ MPPC0_SA1
762
+ MPRD0_SA1
763
+ MPRD0_SX261
764
+ MPRD0_SX351
765
+ MPRD0_SX441
766
+ MPRD0_SX81
767
+ MPRK0_SI1727
768
+ MPRK0_SX107
769
+ MPRK0_SX377
770
+ MPRT0_SA1
771
+ MPRT0_SX310
772
+ MPSW0_SI1067
773
+ MPSW0_SX167
774
+ MPSW0_SX437
775
+ MRAB1_SX128
776
+ MRAB1_SX308
777
+ MRAI0_SA1
778
+ MRAI0_SA2
779
+ MRAI0_SX72
780
+ MRAM0_SA1
781
+ MRAM0_SA2
782
+ MRAM0_SX15
783
+ MRBC0_SI1859
784
+ MRBC0_SX329
785
+ MRBC0_SX419
786
+ MRCG0_SI798
787
+ MRCG0_SX168
788
+ MRCW0_SA1
789
+ MRCW0_SX291
790
+ MRDD0_SI1680
791
+ MRDD0_SX150
792
+ MRDD0_SX277
793
+ MRDD0_SX60
794
+ MRDM0_SI1595
795
+ MRDM0_SX65
796
+ MRDS0_SA1
797
+ MREE0_SX24
798
+ MREH1_SX249
799
+ MREH1_SX69
800
+ MREM0_SA2
801
+ MREW1_SI870
802
+ MRFK0_SX446
803
+ MRFL0_SA1
804
+ MRFL0_SX256
805
+ MRFL0_SX436
806
+ MRFL0_SX76
807
+ MRGM0_SA2
808
+ MRGM0_SX262
809
+ MRGS0_SA2
810
+ MRGS0_SX186
811
+ MRHL0_SI885
812
+ MRHL0_SX345
813
+ MRHL0_SX435
814
+ MRJB1_SA1
815
+ MRJB1_SA2
816
+ MRJB1_SX210
817
+ MRJB1_SX30
818
+ MRJB1_SX390
819
+ MRJH0_SA2
820
+ MRJH0_SX307
821
+ MRJH0_SX79
822
+ MRJM0_SX148
823
+ MRJM1_SA2
824
+ MRJM1_SI1298
825
+ MRJM1_SI1928
826
+ MRJM1_SX128
827
+ MRJT0_SA2
828
+ MRJT0_SI1498
829
+ MRJT0_SX328
830
+ MRJT0_SX418
831
+ MRKM0_SA2
832
+ MRKM0_SX367
833
+ MRLD0_SA2
834
+ MRLD0_SI2224
835
+ MRLD0_SX154
836
+ MRLD0_SX424
837
+ MRLJ0_SA1
838
+ MRLJ0_SX250
839
+ MRLJ0_SX340
840
+ MRLJ1_SA1
841
+ MRLJ1_SA2
842
+ MRLJ1_SX321
843
+ MRLK0_SI843
844
+ MRLK0_SX123
845
+ MRLK0_SX213
846
+ MRMB0_SA2
847
+ MRMB0_SI1581
848
+ MRMB0_SX411
849
+ MRMG0_SA1
850
+ MRMG0_SI1080
851
+ MRMG0_SX450
852
+ MRMH0_SI1349
853
+ MRMH0_SI2281
854
+ MRMH0_SX121
855
+ MRML0_SA2
856
+ MRML0_SX341
857
+ MRPC1_SI2112
858
+ MRRE0_SA2
859
+ MRRE0_SX164
860
+ MRRE0_SX344
861
+ MRRE0_SX74
862
+ MRSO0_SX129
863
+ MRSO0_SX39
864
+ MRSP0_SX259
865
+ MRTC0_SX378
866
+ MRVG0_SI1140
867
+ MRVG0_SX240
868
+ MRWA0_SI973
869
+ MRWA0_SX163
870
+ MRWA0_SX73
871
+ MRWS0_SI1732
872
+ MRWS0_SI472
873
+ MRWS0_SX22
874
+ MRWS0_SX382
875
+ MRXB0_SA2
876
+ MRXB0_SX415
877
+ MSAH1_SI1679
878
+ MSAS0_SX116
879
+ MSAS0_SX206
880
+ MSAS0_SX386
881
+ MSAT0_SA1
882
+ MSAT1_SX263
883
+ MSAT1_SX443
884
+ MSAT1_SX83
885
+ MSDB0_SX197
886
+ MSDB0_SX287
887
+ MSDB0_SX377
888
+ MSDH0_SI2240
889
+ MSDH0_SX440
890
+ MSDH0_SX80
891
+ MSDS0_SA1
892
+ MSEM1_SI1440
893
+ MSEM1_SX180
894
+ MSEM1_SX270
895
+ MSES0_SI1589
896
+ MSES0_SX239
897
+ MSES0_SX419
898
+ MSFH0_SX316
899
+ MSFV0_SI1892
900
+ MSFV0_SX362
901
+ MSFV0_SX92
902
+ MSMR0_SX415
903
+ MSMS0_SA1
904
+ MSMS0_SX173
905
+ MSMS0_SX83
906
+ MSRG0_SA1
907
+ MSRG0_SI1221
908
+ MSTF0_SI766
909
+ MSTF0_SX316
910
+ MSTF0_SX46
911
+ MSVS0_SA2
912
+ MSVS0_SX308
913
+ MTAS0_SX215
914
+ MTAS0_SX35
915
+ MTAS0_SX395
916
+ MTAT0_SX390
917
+ MTAT1_SX59
918
+ MTBC0_SI1803
919
+ MTCS0_SA2
920
+ MTCS0_SI2265
921
+ MTCS0_SX82
922
+ MTDP0_SA2
923
+ MTER0_SA2
924
+ MTER0_SI1787
925
+ MTJG0_SA1
926
+ MTJG0_SI2157
927
+ MTJG0_SX260
928
+ MTJM0_SI1856
929
+ MTJM0_SX146
930
+ MTJU0_SX130
931
+ MTJU0_SX400
932
+ MTKD0_SX107
933
+ MTKD0_SX287
934
+ MTKP0_SI1023
935
+ MTLB0_SA1
936
+ MTLB0_SX234
937
+ MTLC0_SA1
938
+ MTML0_SI2325
939
+ MTML0_SX165
940
+ MTMN0_SA2
941
+ MTMN0_SI1064
942
+ MTMN0_SI2324
943
+ MTMN0_SX434
944
+ MTMT0_SA2
945
+ MTMT0_SI1748
946
+ MTPF0_SX65
947
+ MTPG0_SI1383
948
+ MTPG0_SI753
949
+ MTPG0_SX303
950
+ MTPP0_SX338
951
+ MTPR0_SX340
952
+ MTQC0_SI480
953
+ MTQC0_SX91
954
+ MTRR0_SX198
955
+ MTRR0_SX288
956
+ MTRT0_SA2
957
+ MTRT0_SX254
958
+ MTRT0_SX57
959
+ MTWH1_SX72
960
+ MTXS0_SA1
961
+ MTXS0_SA2
962
+ MVJH0_SI926
963
+ MVJH0_SX206
964
+ MVJH0_SX296
965
+ MVLO0_SA1
966
+ MVRW0_SA2
967
+ MVRW0_SX135
968
+ MVRW0_SX225
969
+ MWAC0_SA2
970
+ MWAC0_SX341
971
+ MWAC0_SX431
972
+ MWAD0_SX432
973
+ MWAD0_SX72
974
+ MWAR0_SA1
975
+ MWAR0_SI1675
976
+ MWCH0_SI1895
977
+ MWCH0_SI2252
978
+ MWCH0_SX182
979
+ MWCH0_SX452
980
+ MWDK0_SA1
981
+ MWDK0_SA2
982
+ MWDK0_SI2017
983
+ MWDK0_SI806
984
+ MWDK0_SX176
985
+ MWDK0_SX86
986
+ MWEM0_SA2
987
+ MWEM0_SI1320
988
+ MWEM0_SI1393
989
+ MWEM0_SX150
990
+ MWGR0_SX346
991
+ MWRE0_SX247
992
+ MWRE0_SX337
993
+ MWRE0_SX427
994
+ MWRP0_SA1
995
+ MWRP0_SX273
996
+ MWRP0_SX363
997
+ MWSB0_SX276
998
+ MWSH0_SX256
999
+ MWSH0_SX76
1000
+ MZMB0_SA1
fairseq/examples/wav2vec/unsupervised/config/timit_unmatched/valid.uid ADDED
@@ -0,0 +1,620 @@
1
+ FAEM0_SI1392
2
+ FAJW0_SI1263
3
+ FAJW0_SI633
4
+ FALK0_SI658
5
+ FALR0_SX335
6
+ FAPB0_SI1063
7
+ FAPB0_SI2323
8
+ FAPB0_SX433
9
+ FBAS0_SI1472
10
+ FBAS0_SI2066
11
+ FBCG1_SX352
12
+ FBCH0_SI959
13
+ FBJL0_SI922
14
+ FBLV0_SI1688
15
+ FBMH0_SI1136
16
+ FBMH0_SI970
17
+ FBMJ0_SA1
18
+ FBMJ0_SI1776
19
+ FBMJ0_SI516
20
+ FBMJ0_SX336
21
+ FCDR1_SI1186
22
+ FCDR1_SI1816
23
+ FCDR1_SI556
24
+ FCDR1_SX286
25
+ FCKE0_SI1741
26
+ FCKE0_SI481
27
+ FCLT0_SI808
28
+ FCMG0_SI1142
29
+ FCMG0_SX432
30
+ FCMM0_SI1957
31
+ FCMM0_SX420
32
+ FCYL0_SI667
33
+ FCYL0_SX349
34
+ FDAS1_SI1461
35
+ FDAS1_SI831
36
+ FDAW0_SI1271
37
+ FDAW0_SI2036
38
+ FDJH0_SI935
39
+ FDKN0_SI1202
40
+ FDKN0_SX181
41
+ FDKN0_SX451
42
+ FDMY0_SA1
43
+ FDMY0_SI567
44
+ FDMY0_SI714
45
+ FDMY0_SX387
46
+ FDNC0_SI1278
47
+ FDNC0_SI1908
48
+ FDTD0_SA1
49
+ FDTD0_SX321
50
+ FEAC0_SI615
51
+ FEAR0_SX352
52
+ FECD0_SA1
53
+ FECD0_SI1418
54
+ FECD0_SI788
55
+ FEME0_SI875
56
+ FEME0_SX335
57
+ FEXM0_SA1
58
+ FEXM0_SI482
59
+ FEXM0_SX366
60
+ FGDP0_SI988
61
+ FGDP0_SX88
62
+ FGMB0_SI1145
63
+ FGMB0_SX335
64
+ FGRW0_SA1
65
+ FGRW0_SI1152
66
+ FGRW0_SX162
67
+ FGRW0_SX432
68
+ FHLM0_SX120
69
+ FHLM0_SX349
70
+ FHXS0_SA1
71
+ FHXS0_SI1075
72
+ FHXS0_SI2302
73
+ FHXS0_SX175
74
+ FJDM2_SA2
75
+ FJDM2_SX142
76
+ FJEN0_SA1
77
+ FJEN0_SX327
78
+ FJEN0_SX417
79
+ FJHK0_SI2282
80
+ FJKL0_SI932
81
+ FJLG0_SI1889
82
+ FJLR0_SI1231
83
+ FJRB0_SX402
84
+ FJRP1_SA1
85
+ FJRP1_SI1432
86
+ FJRP1_SX262
87
+ FJRP1_SX352
88
+ FJSK0_SI1052
89
+ FJSP0_SI1434
90
+ FJWB1_SI748
91
+ FJXM0_SX311
92
+ FJXM0_SX41
93
+ FJXP0_SI1752
94
+ FKAA0_SA1
95
+ FKDE0_SI1141
96
+ FKDE0_SI1771
97
+ FKDW0_SI1207
98
+ FKDW0_SI1891
99
+ FKFB0_SI1608
100
+ FKFB0_SX438
101
+ FKKH0_SI1290
102
+ FKKH0_SI1920
103
+ FKLC0_SI985
104
+ FKLC0_SX175
105
+ FKLC1_SI1048
106
+ FKLH0_SI1257
107
+ FKSR0_SX366
108
+ FLAC0_SI1339
109
+ FLAG0_SI1464
110
+ FLAG0_SI834
111
+ FLEH0_SI1051
112
+ FLET0_SI507
113
+ FLJA0_SI1078
114
+ FLJA0_SX178
115
+ FLJD0_SI1516
116
+ FLJG0_SI981
117
+ FLJG0_SX171
118
+ FLJG0_SX351
119
+ FLKM0_SA1
120
+ FLKM0_SI620
121
+ FLKM0_SX350
122
+ FLKM0_SX440
123
+ FLMC0_SI1372
124
+ FLMK0_SA1
125
+ FLMK0_SI1229
126
+ FLTM0_SX170
127
+ FLTM0_SX350
128
+ FLTM0_SX440
129
+ FMAH1_SI879
130
+ FMBG0_SI1160
131
+ FMEM0_SA1
132
+ FMEM0_SX333
133
+ FMJB0_SI1177
134
+ FMJF0_SI624
135
+ FMJF0_SX174
136
+ FMJF0_SX84
137
+ FMJU0_SI1389
138
+ FMKC0_SI1041
139
+ FMKF0_SI1018
140
+ FMPG0_SA1
141
+ FMPG0_SI972
142
+ FMPG0_SX162
143
+ FMPG0_SX342
144
+ FMPG0_SX432
145
+ FNKL0_SI892
146
+ FNTB0_SI679
147
+ FPAB1_SA1
148
+ FPAB1_SI2101
149
+ FPAB1_SI841
150
+ FPAC0_SI1921
151
+ FPAC0_SI661
152
+ FPAD0_SI716
153
+ FPAD0_SX176
154
+ FPAF0_SA1
155
+ FPAF0_SI1054
156
+ FPAZ0_SI2223
157
+ FPAZ0_SI963
158
+ FPJF0_SI1259
159
+ FPJF0_SX352
160
+ FPLS0_SI960
161
+ FPMY0_SI1153
162
+ FPMY0_SI523
163
+ FREH0_SI1945
164
+ FRLL0_SI805
165
+ FSAG0_SI1323
166
+ FSAG0_SX153
167
+ FSAG0_SX333
168
+ FSAG0_SX423
169
+ FSAH0_SI614
170
+ FSAH0_SX327
171
+ FSAK0_SI1300
172
+ FSBK0_SX349
173
+ FSCN0_SA1
174
+ FSCN0_SI705
175
+ FSCN0_SX176
176
+ FSDC0_SI1312
177
+ FSDJ0_SI1115
178
+ FSGF0_SI2187
179
+ FSGF0_SI927
180
+ FSJG0_SA1
181
+ FSJG0_SA2
182
+ FSJG0_SI940
183
+ FSJG0_SX220
184
+ FSJG0_SX40
185
+ FSJG0_SX400
186
+ FSJS0_SA1
187
+ FSJS0_SX451
188
+ FSJW0_SI1333
189
+ FSKP0_SI1098
190
+ FSMA0_SI991
191
+ FSMA0_SX451
192
+ FSMM0_SX324
193
+ FSPM0_SI1241
194
+ FSPM0_SX251
195
+ FSRH0_SX311
196
+ FSSB0_SI1712
197
+ FSSB0_SX362
198
+ FTBR0_SI1402
199
+ FTBR0_SI921
200
+ FTBW0_SI715
201
+ FTBW0_SX175
202
+ FTLG0_SI1743
203
+ FTLG0_SI483
204
+ FTMG0_SI902
205
+ FVFB0_SI1510
206
+ FVKB0_SX349
207
+ FVMH0_SI1466
208
+ FVMH0_SI836
209
+ MADC0_SI1367
210
+ MADC0_SI737
211
+ MAEB0_SI1411
212
+ MAEO0_SI1326
213
+ MAJP0_SI1704
214
+ MAJP0_SX174
215
+ MAKB0_SA2
216
+ MAKB0_SI1016
217
+ MAKB0_SI2276
218
+ MAKB0_SX116
219
+ MAPV0_SI1293
220
+ MAPV0_SI663
221
+ MARW0_SX286
222
+ MARW0_SX349
223
+ MBBR0_SI1055
224
+ MBBR0_SX335
225
+ MBCG0_SI957
226
+ MBCG0_SX327
227
+ MBGT0_SI1841
228
+ MBGT0_SX171
229
+ MBMA0_SI1222
230
+ MBMA1_SI954
231
+ MBMA1_SX324
232
+ MBTH0_SI2102
233
+ MBWP0_SX349
234
+ MCAE0_SI1447
235
+ MCAE0_SI2077
236
+ MCAE0_SI817
237
+ MCAL0_SI1138
238
+ MCDR0_SI1784
239
+ MCDR0_SI524
240
+ MCEF0_SI842
241
+ MCEW0_SA1
242
+ MCEW0_SI2072
243
+ MCEW0_SI812
244
+ MCEW0_SX362
245
+ MCEW0_SX452
246
+ MCHL0_SI1347
247
+ MCHL0_SI1404
248
+ MCLK0_SI2290
249
+ MCLK0_SI650
250
+ MCPM0_SI1824
251
+ MCSS0_SI1380
252
+ MCSS0_SI688
253
+ MCTM0_SI1350
254
+ MCTM0_SI1980
255
+ MDAC0_SI631
256
+ MDAS0_SI1896
257
+ MDAS0_SI636
258
+ MDBP0_SI528
259
+ MDBP0_SX438
260
+ MDCD0_SI785
261
+ MDCD0_SX335
262
+ MDCM0_SI1480
263
+ MDDC0_SI1419
264
+ MDED0_SI540
265
+ MDEF0_SI1123
266
+ MDEM0_SA1
267
+ MDEM0_SI608
268
+ MDEM0_SI800
269
+ MDEM0_SX428
270
+ MDHS0_SI900
271
+ MDJM0_SI1455
272
+ MDKS0_SX166
273
+ MDKS0_SX346
274
+ MDLB0_SI1306
275
+ MDLB0_SX136
276
+ MDLB0_SX406
277
+ MDLC0_SI1395
278
+ MDLC0_SI2025
279
+ MDLC1_SI1435
280
+ MDLH0_SX160
281
+ MDLH0_SX430
282
+ MDLM0_SI604
283
+ MDLR0_SX333
284
+ MDLR1_SI669
285
+ MDMA0_SX170
286
+ MDMA0_SX350
287
+ MDMA0_SX440
288
+ MDNS0_SI1011
289
+ MDNS0_SI873
290
+ MDPB0_SI1760
291
+ MDPB0_SI866
292
+ MDRD0_SI752
293
+ MDSJ0_SI1462
294
+ MDSJ0_SX438
295
+ MDWD0_SI1260
296
+ MDWH0_SA1
297
+ MDWH0_SI1168
298
+ MDWH0_SI665
299
+ MDWM0_SI916
300
+ MEDR0_SI2004
301
+ MEFG0_SI491
302
+ MEFG0_SI598
303
+ MEGJ0_SA1
304
+ MEGJ0_SI1337
305
+ MEGJ0_SI707
306
+ MEGJ0_SX167
307
+ MEJS0_SI1240
308
+ MESG0_SI702
309
+ MESJ0_SI2039
310
+ MFWK0_SX349
311
+ MFXS0_SX324
312
+ MFXV0_SI1005
313
+ MFXV0_SI1342
314
+ MGAF0_SI1282
315
+ MGAG0_SI691
316
+ MGAK0_SI1036
317
+ MGAK0_SX136
318
+ MGAR0_SX312
319
+ MGAW0_SI1165
320
+ MGES0_SX311
321
+ MGJC0_SX435
322
+ MGRL0_SX327
323
+ MGRP0_SI1317
324
+ MGRP0_SX327
325
+ MGSH0_SI1176
326
+ MGSH0_SI546
327
+ MGSL0_SI797
328
+ MGXP0_SI1087
329
+ MGXP0_SI525
330
+ MHBS0_SI945
331
+ MHIT0_SI983
332
+ MHMG0_SI735
333
+ MHMR0_SI1692
334
+ MILB0_SI903
335
+ MJAC0_SI701
336
+ MJAC0_SX251
337
+ MJAE0_SX84
338
+ MJAI0_SI682
339
+ MJAI0_SI710
340
+ MJDC0_SI531
341
+ MJDE0_SA1
342
+ MJDE0_SI1120
343
+ MJDE0_SI490
344
+ MJDE0_SX220
345
+ MJDM0_SI1340
346
+ MJDM0_SX170
347
+ MJDM0_SX350
348
+ MJEB0_SX170
349
+ MJEB1_SI1467
350
+ MJEB1_SI837
351
+ MJFR0_SA1
352
+ MJFR0_SX435
353
+ MJHI0_SI1328
354
+ MJJJ0_SI1163
355
+ MJJM0_SI1251
356
+ MJLB0_SI1616
357
+ MJLS0_SI1726
358
+ MJMA0_SI2125
359
+ MJMD0_SI2288
360
+ MJMM0_SI1255
361
+ MJMM0_SX175
362
+ MJPG0_SI1821
363
+ MJPM0_SI1368
364
+ MJPM1_SX311
365
+ MJRA0_SX336
366
+ MJRG0_SI736
367
+ MJRG0_SX352
368
+ MJRH0_SI1840
369
+ MJRH1_SI1558
370
+ MJRK0_SI880
371
+ MJRP0_SI1845
372
+ MJSR0_SI2054
373
+ MJSR0_SI794
374
+ MJWG0_SI813
375
+ MJWG0_SI895
376
+ MJWG0_SX175
377
+ MJWS0_SX333
378
+ MJWT0_SI1291
379
+ MJWT0_SI1381
380
+ MJXL0_SI1172
381
+ MKAG0_SI979
382
+ MKAH0_SX178
383
+ MKAM0_SI1250
384
+ MKAM0_SI1465
385
+ MKDD0_SI1567
386
+ MKDD0_SI2197
387
+ MKDD0_SI937
388
+ MKDT0_SI814
389
+ MKES0_SI623
390
+ MKLS0_SI1437
391
+ MKLS0_SI2067
392
+ MKLS1_SI915
393
+ MKLW0_SI1571
394
+ MKLW0_SX311
395
+ MKRG0_SI861
396
+ MKXL0_SI1815
397
+ MKXL0_SI1958
398
+ MLBC0_SI1239
399
+ MLEL0_SI616
400
+ MLEL0_SX166
401
+ MLJC0_SI1225
402
+ MLJH0_SA1
403
+ MLJH0_SA2
404
+ MLJH0_SI1422
405
+ MLJH0_SI694
406
+ MLJH0_SX244
407
+ MLSH0_SI1417
408
+ MLSH0_SX247
409
+ MMAA0_SI1588
410
+ MMAA0_SI845
411
+ MMAB1_SI864
412
+ MMAB1_SX324
413
+ MMAG0_SA1
414
+ MMAG0_SI1756
415
+ MMAG0_SX136
416
+ MMAR0_SI1966
417
+ MMAR0_SX166
418
+ MMAR0_SX346
419
+ MMBS0_SI521
420
+ MMBS0_SX161
421
+ MMCC0_SI1338
422
+ MMDB0_SI987
423
+ MMDG0_SI1780
424
+ MMDM0_SI1311
425
+ MMDM1_SX153
426
+ MMDM1_SX333
427
+ MMEB0_SX327
428
+ MMGC0_SI1305
429
+ MMGG0_SI1079
430
+ MMGG0_SX449
431
+ MMLM0_SI2150
432
+ MMPM0_SX161
433
+ MMRP0_SX324
434
+ MMSM0_SI1106
435
+ MMSM0_SI476
436
+ MMVP0_SI654
437
+ MMVP0_SX347
438
+ MMWB0_SA1
439
+ MMWB0_SI2249
440
+ MMWB0_SX359
441
+ MMWB0_SX449
442
+ MNTW0_SI1068
443
+ MNTW0_SI1698
444
+ MPEB0_SI600
445
+ MPFU0_SI1258
446
+ MPGH0_SI675
447
+ MPGR0_SI1410
448
+ MPGR1_SI1499
449
+ MPMB0_SA1
450
+ MPMB0_SA2
451
+ MPMB0_SI1501
452
+ MPMB0_SI2131
453
+ MPMB0_SI871
454
+ MPMB0_SX151
455
+ MPMB0_SX331
456
+ MPMB0_SX421
457
+ MPMB0_SX61
458
+ MPPC0_SI1412
459
+ MPRB0_SI1215
460
+ MPRB0_SI575
461
+ MPRD0_SI801
462
+ MPRD0_SX171
463
+ MPRK0_SA1
464
+ MPRK0_SI1097
465
+ MPRK0_SI467
466
+ MPRK0_SX287
467
+ MRAB0_SI1854
468
+ MRAB1_SI848
469
+ MRAI0_SI2052
470
+ MRAI0_SI792
471
+ MRAI0_SX432
472
+ MRAM0_SI1951
473
+ MRCG0_SA2
474
+ MRCG0_SI1428
475
+ MRCG0_SX348
476
+ MRCG0_SX438
477
+ MRCW0_SI741
478
+ MRDM0_SI1044
479
+ MRDM0_SX335
480
+ MREE0_SI1104
481
+ MREE0_SI1959
482
+ MREH1_SA1
483
+ MREH1_SI1599
484
+ MREH1_SI969
485
+ MREM0_SI511
486
+ MRFK0_SI1076
487
+ MRFL0_SI1156
488
+ MRFL0_SI526
489
+ MRFL0_SX166
490
+ MRGM0_SI532
491
+ MRGM0_SX172
492
+ MRGM0_SX442
493
+ MRGS0_SI1356
494
+ MRGS0_SI726
495
+ MRGS0_SX6
496
+ MRJB1_SI1413
497
+ MRJB1_SI2021
498
+ MRJB1_SX120
499
+ MRJH0_SI1519
500
+ MRJH0_SI889
501
+ MRJH0_SX169
502
+ MRJT0_SI868
503
+ MRJT0_SX58
504
+ MRKM0_SI1267
505
+ MRKM0_SI1391
506
+ MRKM0_SI637
507
+ MRLJ0_SI790
508
+ MRLJ1_SI2301
509
+ MRLK0_SI1468
510
+ MRLR0_SI1196
511
+ MRML0_SA1
512
+ MRML0_SI1421
513
+ MRML0_SX161
514
+ MRML0_SX251
515
+ MRMS0_SI2057
516
+ MRRE0_SA1
517
+ MRRE0_SI1334
518
+ MRRE0_SI952
519
+ MRSO0_SI1206
520
+ MRSP0_SI1429
521
+ MRTC0_SI1458
522
+ MRTJ0_SA1
523
+ MRTJ0_SI772
524
+ MRTJ0_SX142
525
+ MRTJ0_SX232
526
+ MRTJ0_SX52
527
+ MRWS0_SI1102
528
+ MRXB0_SI2215
529
+ MRXB0_SI955
530
+ MSAS0_SI1376
531
+ MSAS0_SI746
532
+ MSDH0_SI980
533
+ MSDH0_SX170
534
+ MSDS0_SI1077
535
+ MSDS0_SX267
536
+ MSDS0_SX357
537
+ MSEM1_SI2070
538
+ MSEM1_SI810
539
+ MSFH0_SA1
540
+ MSFH0_SI1738
541
+ MSFH0_SX136
542
+ MSFH0_SX406
543
+ MSFV0_SI632
544
+ MSJK0_SI1596
545
+ MSJK0_SX336
546
+ MSMC0_SI509
547
+ MSMR0_SI1150
548
+ MSMS0_SI1433
549
+ MSRR0_SI1761
550
+ MSRR0_SI501
551
+ MSTF0_SI852
552
+ MSVS0_SI2198
553
+ MSVS0_SI938
554
+ MSVS0_SX398
555
+ MTAB0_SI1572
556
+ MTAB0_SX312
557
+ MTAT0_SA1
558
+ MTAT0_SI1110
559
+ MTAT0_SI811
560
+ MTAT1_SI779
561
+ MTAT1_SX149
562
+ MTAT1_SX329
563
+ MTBC0_SI543
564
+ MTCS0_SI712
565
+ MTDB0_SI1401
566
+ MTDB0_SI771
567
+ MTDP0_SA1
568
+ MTDP0_SI1521
569
+ MTDP0_SX171
570
+ MTDP0_SX351
571
+ MTER0_SA1
572
+ MTER0_SI1157
573
+ MTER0_SX437
574
+ MTJG0_SX170
575
+ MTJS0_SA2
576
+ MTJS0_SI1822
577
+ MTJS0_SI562
578
+ MTJS0_SX382
579
+ MTJU0_SI2020
580
+ MTKD0_SI630
581
+ MTKP0_SI2283
582
+ MTKP0_SI454
583
+ MTLB0_SI1134
584
+ MTLB0_SX324
585
+ MTLC0_SI1313
586
+ MTLC0_SI1477
587
+ MTML0_SX435
588
+ MTMN0_SI582
589
+ MTMT0_SI488
590
+ MTPP0_SI1508
591
+ MTPR0_SI2230
592
+ MTPR0_SX160
593
+ MTPR0_SX430
594
+ MTQC0_SA1
595
+ MTQC0_SI1441
596
+ MTQC0_SX181
597
+ MTQC0_SX451
598
+ MTRC0_SI589
599
+ MTRR0_SI918
600
+ MTRT0_SI1227
601
+ MTXS0_SI1060
602
+ MTXS0_SI2320
603
+ MTXS0_SX160
604
+ MTXS0_SX430
605
+ MVJH0_SI1556
606
+ MVLO0_SI517
607
+ MWAC0_SI1601
608
+ MWAC0_SX161
609
+ MWAC0_SX251
610
+ MWAR0_SI1045
611
+ MWDK0_SI1436
612
+ MWEM0_SX420
613
+ MWRE0_SA2
614
+ MWRE0_SI1057
615
+ MWRE0_SX67
616
+ MWRP0_SI1443
617
+ MWSB0_SI996
618
+ MWSH0_SI1426
619
+ MWSH0_SI796
620
+ MWSH0_SX166
fairseq/examples/wav2vec/unsupervised/data/__init__.py ADDED
@@ -0,0 +1,13 @@
+ # Copyright (c) Facebook, Inc. and its affiliates.
+ #
+ # This source code is licensed under the MIT license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ from .extracted_features_dataset import ExtractedFeaturesDataset
+ from .random_input_dataset import RandomInputDataset
+
+
+ __all__ = [
+     "ExtractedFeaturesDataset",
+     "RandomInputDataset",
+ ]
fairseq/examples/wav2vec/unsupervised/data/extracted_features_dataset.py ADDED
@@ -0,0 +1,167 @@
+ # Copyright (c) Facebook, Inc. and its affiliates.
+ #
+ # This source code is licensed under the MIT license found in the
+ # LICENSE file in the root directory of this source tree.
+
+
+ import logging
+ import os
+ import contextlib
+
+ import numpy as np
+ import torch
+
+ from fairseq.data import FairseqDataset, data_utils
+
+
+ logger = logging.getLogger(__name__)
+
+
+ class ExtractedFeaturesDataset(FairseqDataset):
+     def __init__(
+         self,
+         path,
+         split,
+         min_length=3,
+         max_length=None,
+         labels=None,
+         label_dict=None,
+         shuffle=True,
+         sort_by_length=True,
+         aux_target_postfix=None,
+     ):
+         super().__init__()
+
+         self.min_length = min_length
+         self.max_length = max_length
+         self.shuffle = shuffle
+         self.sort_by_length = sort_by_length
+         self.label_dict = label_dict
+
+         if labels is not None:
+             assert label_dict is not None
+
+         self.sizes = []
+         self.offsets = []
+         self.labels = []
+         self.aux_tgt = None
+
+         path = os.path.join(path, split)
+         data_path = path
+         self.data = np.load(data_path + ".npy", mmap_mode="r")
+
+         offset = 0
+         skipped = 0
+
+         if not os.path.exists(path + f".{labels}"):
+             labels = None
+
+         with open(data_path + ".lengths", "r") as len_f, open(
+             path + f".{labels}", "r"
+         ) if labels is not None else contextlib.ExitStack() as lbl_f:
+             for line in len_f:
+                 length = int(line.rstrip())
+                 lbl = None if labels is None else next(lbl_f).rstrip().split()
+                 if length >= min_length and (
+                     max_length is None or length <= max_length
+                 ):
+                     self.sizes.append(length)
+                     self.offsets.append(offset)
+                     if lbl is not None:
+                         self.labels.append(lbl)
+                 offset += length
+
+         self.sizes = np.asarray(self.sizes)
+         self.offsets = np.asarray(self.offsets)
+
+         if aux_target_postfix is not None:
+             if not os.path.exists(path+f".{aux_target_postfix}"):
+                 logger.info(f"auxaliry target for {split} missing")
+             else:
+                 with open(path+f".{aux_target_postfix}", "r") as t_f:
+                     self.aux_tgt = [
+                         torch.LongTensor(list(map(int,seg.strip().split())))\
+                             for seg in t_f]
+
+         logger.info(f"loaded {len(self.offsets)}, skipped {skipped} samples")
+
+     def __getitem__(self, index):
+         offset = self.offsets[index]
+         end = self.sizes[index] + offset
+         feats = torch.from_numpy(self.data[offset:end].copy()).float()
+
+         res = {"id": index, "features": feats}
+         if len(self.labels) > 0:
+             res["target"] = self.label_dict.encode_line(
+                 self.labels[index],
+                 line_tokenizer=lambda x: x,
+                 append_eos=False,
+             )
+
+         if self.aux_tgt:
+             res["aux_target"] = self.aux_tgt[index]
+
+         return res
+
+     def __len__(self):
+         return len(self.sizes)
+
+     def collater(self, samples):
+         if len(samples) == 0:
+             return {}
+
+         features = [s["features"] for s in samples]
+         sizes = [len(s) for s in features]
+
+         target_size = max(sizes)
+
+         collated_features = features[0].new_zeros(
+             len(features), target_size, features[0].size(-1)
+         )
+         padding_mask = torch.BoolTensor(collated_features.shape[:-1]).fill_(False)
+         for i, (f, size) in enumerate(zip(features, sizes)):
+             collated_features[i, :size] = f
+             padding_mask[i, size:] = True
+
+         res = {
+             "id": torch.LongTensor([s["id"] for s in samples]),
+             "net_input": {"features": collated_features, "padding_mask": padding_mask},
+         }
+
+         if len(self.labels) > 0:
+             target = data_utils.collate_tokens(
+                 [s["target"] for s in samples],
+                 pad_idx=self.label_dict.pad(),
+                 left_pad=False,
+             )
+             res["target"] = target
+
+         if self.aux_tgt:
+             idxs = torch.nn.utils.rnn.pad_sequence(
+                 [s["aux_target"] for s in samples],
+                 batch_first=True,
+                 padding_value=-1,
+             )
+             res["net_input"]["aux_target"] = idxs
+
+         return res
+
+     def num_tokens(self, index):
+         return self.size(index)
+
+     def size(self, index):
+         return self.sizes[index]
+
+     def ordered_indices(self):
+         """Return an ordered list of indices. Batches will be constructed based
+         on this order."""
+         if self.shuffle:
+             order = [np.random.permutation(len(self))]
+         else:
+             order = [np.arange(len(self))]
+
+         if self.sort_by_length:
+             order.append(self.sizes)
+             return np.lexsort(order)[::-1]
+         else:
+             return order[0]
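Not part of the commit: a minimal usage sketch for the ExtractedFeaturesDataset added above, assuming a directory of precomputed features (train.npy and train.lengths, plus an optional train.phn label file) and a fairseq Dictionary for those labels; the directory and dictionary paths below are illustrative only.

from fairseq.data import Dictionary
from examples.wav2vec.unsupervised.data import ExtractedFeaturesDataset  # assumes the fairseq checkout is on PYTHONPATH

label_dict = Dictionary.load("prepared_audio/dict.phn.txt")  # hypothetical dictionary path
dataset = ExtractedFeaturesDataset(
    path="prepared_audio",  # directory holding train.npy / train.lengths / train.phn
    split="train",
    labels="phn",           # label-file suffix; the constructor skips labels if the file is missing
    label_dict=label_dict,
)
# Build one padded mini-batch with the dataset's own collater.
batch = dataset.collater([dataset[i] for i in range(4)])
print(batch["net_input"]["features"].shape)      # (4, longest_length, feature_dim)
print(batch["net_input"]["padding_mask"].shape)  # (4, longest_length)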
fairseq/examples/wav2vec/unsupervised/kaldi_self_train/st/cmd.sh ADDED
@@ -0,0 +1,15 @@
+ # you can change cmd.sh depending on what type of queue you are using.
+ # If you have no queueing system and want to run on a local machine, you
+ # can change all instances 'queue.pl' to run.pl (but be careful and run
+ # commands one by one: most recipes will exhaust the memory on your
+ # machine). queue.pl works with GridEngine (qsub). slurm.pl works
+ # with slurm. Different queues are configured differently, with different
+ # queue names and different ways of specifying things like memory;
+ # to account for these differences you can create and edit the file
+ # conf/queue.conf to match your queue's configuration. Search for
+ # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
+ # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
+
+ export train_cmd="run.pl --mem 2G"
+ export decode_cmd="run.pl --mem 4G"
+ export mkgraph_cmd="run.pl --mem 8G"
fairseq/examples/wav2vec/unsupervised/kaldi_self_train/st/local/prepare_lang.sh ADDED
@@ -0,0 +1,37 @@
+ #!/bin/bash
+
+ sil_prob=0.5
+ num_sil_states=3
+ num_nonsil_states=1
+
+ . ./cmd.sh
+ . ./path.sh
+ . parse_options.sh
+
+ set -eux
+
+ dict=$1
+ data_dir=$2
+
+ dict_dir=$data_dir/local/dict
+ tmplm_dir=$data_dir/local/lang_tmp
+ lm_dir=$data_dir/lang
+
+ mkdir -p $dict_dir $tmplm_dir $lm_dir
+
+ # prepare dict
+ echo "SIL" > $dict_dir/silence_phones.txt
+ echo "SIL" > $dict_dir/optional_silence.txt
+ awk '{print $1}' $dict > $dict_dir/nonsilence_phones.txt
+
+ echo "SIL SIL" > $dict_dir/lexicon.txt
+ echo "<UNK> SIL" >> $dict_dir/lexicon.txt
+ awk '{print $1" "$1}' $dict >> $dict_dir/lexicon.txt
+
+ echo "SIL" > $dict_dir/extra_questions.txt
+ awk '{printf $1" "} END {printf "\n"}' $dict >> $dict_dir/extra_questions.txt
+
+ # prepare lang
+ utils/prepare_lang.sh --sil-prob $sil_prob --position-dependent-phones false \
+   --num_sil_states $num_sil_states --num_nonsil_states $num_nonsil_states \
+   $dict_dir "<UNK>" $tmplm_dir $lm_dir
fairseq/examples/wav2vec/unsupervised/kaldi_self_train/st/local/prepare_lm.sh ADDED
@@ -0,0 +1,35 @@
+ #!/usr/bin/env bash
+
+ langdir=""
+ lmdir=""
+
+ . ./cmd.sh
+ . ./path.sh
+ . parse_options.sh
+
+ arpa_lm=$1
+ data=$2
+
+ if [ -z $langdir ]; then
+   langdir=$data/lang
+ fi
+ if [ -z $lmdir ]; then
+   lmdir=$data/lang_test
+ fi
+
+ if [ ! -d $langdir ]; then
+   echo "$langdir not found. run local/prepare_lang.sh first" && exit 1
+ fi
+
+ mkdir -p $lmdir
+ cp -r $langdir/* $lmdir
+
+ if [[ "$arpa_lm" == *.gz ]]; then
+   gunzip -c $arpa_lm | arpa2fst --disambig-symbol=#0 --read-symbol-table=$lmdir/words.txt - $lmdir/G.fst
+ else
+   arpa2fst --disambig-symbol=#0 --read-symbol-table=$lmdir/words.txt $arpa_lm $lmdir/G.fst
+ fi
+ fstisstochastic $lmdir/G.fst
+ utils/validate_lang.pl $lmdir || exit 1
+
+ echo "done preparing lm ($lmdir)"
fairseq/examples/wav2vec/unsupervised/kaldi_self_train/st/local/score.sh ADDED
@@ -0,0 +1,63 @@
+ #!/usr/bin/env bash
+ # Copyright 2012 Johns Hopkins University (Author: Daniel Povey)
+ #           2014 Guoguo Chen
+ # Apache 2.0
+
+ [ -f ./path.sh ] && . ./path.sh
+
+ # begin configuration section.
+ cmd=run.pl
+ stage=0
+ decode_mbr=true
+ word_ins_penalty=0.0,0.5,1.0
+ min_lmwt=7
+ max_lmwt=17
+ iter=final
+ #end configuration section.
+
+ [ -f ./path.sh ] && . ./path.sh
+ . parse_options.sh || exit 1;
+
+ if [ $# -ne 3 ]; then
+   echo "Usage: local/score.sh [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <decode-dir>"
+   echo " Options:"
+   echo "    --cmd (run.pl|queue.pl...)      # specify how to run the sub-processes."
+   echo "    --stage (0|1|2)                 # start scoring script from part-way through."
+   echo "    --decode_mbr (true/false)       # maximum bayes risk decoding (confusion network)."
+   echo "    --min_lmwt <int>                # minumum LM-weight for lattice rescoring "
+   echo "    --max_lmwt <int>                # maximum LM-weight for lattice rescoring "
+   exit 1;
+ fi
+
+ data=$1
+ lang_or_graph=$2
+ dir=$3
+
+ symtab=$lang_or_graph/words.txt
+
+ for f in $symtab $dir/lat.1.gz $data/text; do
+   [ ! -f $f ] && echo "score.sh: no such file $f" && exit 1;
+ done
+
+ mkdir -p $dir/scoring/log
+
+ cat $data/text | sed 's:<NOISE>::g' | sed 's:<SPOKEN_NOISE>::g' > $dir/scoring/test_filt.txt
+
+ for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
+   $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.$wip.log \
+     lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \
+     lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \
+     lattice-best-path --word-symbol-table=$symtab \
+       ark:- ark,t:$dir/scoring/LMWT.$wip.tra || exit 1;
+ done
+
+ # Note: the double level of quoting for the sed command
+ for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
+   $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.$wip.log \
+     cat $dir/scoring/LMWT.$wip.tra \| \
+     utils/int2sym.pl -f 2- $symtab \| sed 's:\<UNK\>::g' \| \
+     compute-wer --text --mode=present \
+       ark:$dir/scoring/test_filt.txt ark,p:- ">&" $dir/wer_LMWT_$wip || exit 1;
+ done
+
+ exit 0;
fairseq/examples/wav2vec/unsupervised/kaldi_self_train/st/local/show_wer.sh ADDED
@@ -0,0 +1,52 @@
+ #!/bin/bash
+
+ split="dev_other"
+ ref_data=""
+ get_best_wer=true
+ dec_name="decode"
+ graph_name="graph"
+
+ . ./cmd.sh
+ . ./path.sh
+ . parse_options.sh
+
+ exp_root=$1
+
+ set -eu
+
+ echo "==== WER w.r.t. pseudo transcript"
+ for x in $exp_root/*/${dec_name}_${split}*; do grep WER $x/wer_* 2>/dev/null | utils/best_wer.sh; done
+
+
+ if [ ! -z $ref_data ]; then
+   echo "==== WER w.r.t. real transcript (select based on pseudo WER)"
+   ref_txt=$ref_data/$split/text
+   for x in $exp_root/*/${dec_name}_${split}*; do
+     lang=$(dirname $x)/$graph_name
+
+     lmwt=$(
+       grep WER $x/wer_* 2>/dev/null | utils/best_wer.sh |
+       sed 's/.*wer_\(.*\)$/\1/g' | sed 's/_/./g'
+     )
+     tra=$x/scoring/$lmwt.tra
+     cat $tra | utils/int2sym.pl -f 2- $lang/words.txt | sed 's:<UNK>::g' | sed 's:<SIL>::g' | \
+       compute-wer --text --mode=present \
+       ark:$ref_txt ark,p:- 2> /dev/null | grep WER | xargs -I{} echo {} $tra
+   done
+ fi
+
+ if [ ! -z $ref_data ] && $get_best_wer; then
+   echo "==== WER w.r.t. real transcript (select based on true WER)"
+   ref_txt=$ref_data/$split/text
+   for x in $exp_root/*/${dec_name}_${split}*; do
+     lang=$(dirname $x)/$graph_name
+
+     for tra in $x/scoring/*.tra; do
+       cat $tra | utils/int2sym.pl -f 2- $lang/words.txt | sed 's:<UNK>::g' | sed 's:<SIL>::g' | \
+         compute-wer --text --mode=present \
+         ark:$ref_txt ark,p:- 2> /dev/null | grep WER | xargs -I{} echo {} $tra
+     done | sort -k2n | head -n1
+   done
+ fi
+
+ exit 0;
fairseq/examples/wav2vec/unsupervised/kaldi_self_train/st/local/train_subset_lgbeam.sh ADDED
@@ -0,0 +1,129 @@
1
+ #!/usr/bin/env bash
2
+
3
+ out_root=/tmp
4
+ out_name=train_${RANDOM}
5
+ num_nonsil_states=1
6
+
7
+ valid="dev_other"
8
+ train="train"
9
+ mono_size="-1" # 2000
10
+ tri1_size="-1" # 5000
11
+ tri2b_size="-1" # 10000
12
+ tri3b_size="-1" # 10000
13
+
14
+ # Acoustic model parameters
15
+ numLeavesTri1=2000
16
+ numGaussTri1=10000
17
+ numLeavesMLLT=2500
18
+ numGaussMLLT=15000
19
+ numLeavesSAT=2500
20
+ numGaussSAT=15000
21
+
22
+ stage=1
23
+ max_stage=1
24
+
25
+ . ./cmd.sh
26
+ . ./path.sh
27
+ . parse_options.sh
28
+
29
+ data=$1
30
+ lang=$2
31
+ lang_test=$3
32
+
33
+ exp_root=$out_root/$out_name
34
+
35
+ # you might not want to do this for interactive shells.
36
+ set -e
37
+
38
+
39
+ if [ $stage -le 1 ] && [ $max_stage -ge 1 ]; then
40
+ # train a monophone system
41
+ if [ ! $mono_size -eq -1 ]; then
42
+ utils/subset_data_dir.sh $data/$train $mono_size $data/${train}_${mono_size}
43
+ mono_train=${train}_${mono_size}
44
+ else
45
+ mono_train=${train}
46
+ fi
47
+
48
+ steps/train_mono.sh --boost-silence 1.25 --nj 20 --cmd "$train_cmd" \
49
+ --initial-beam 40 --regular-beam 60 --retry-beam 120 \
50
+ $data/$mono_train $lang $exp_root/mono
51
+
52
+ utils/mkgraph.sh $lang_test $exp_root/mono $exp_root/mono/graph
53
+ steps/decode.sh --nj 20 --cmd "$decode_cmd" \
54
+ $exp_root/mono/graph $data/$valid $exp_root/mono/decode_$valid &
55
+ fi
56
+
57
+
58
+ if [ $stage -le 2 ] && [ $max_stage -ge 2 ]; then
59
+ # train a first delta + delta-delta triphone system on a subset of 5000 utterances
60
+ if [ ! $tri1_size -eq -1 ]; then
61
+ utils/subset_data_dir.sh $data/$train $tri1_size $data/${train}_${tri1_size}
62
+ tri1_train=${train}_${tri1_size}
63
+ else
64
+ tri1_train=${train}
65
+ fi
66
+
67
+ steps/align_si.sh --boost-silence 1.25 --nj 10 --cmd "$train_cmd" \
68
+ $data/$tri1_train $lang \
69
+ $exp_root/mono $exp_root/mono_ali_${tri1_train}
70
+
71
+ steps_gan/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \
72
+ --num_nonsil_states $num_nonsil_states $numLeavesTri1 $numGaussTri1 \
73
+ $data/$tri1_train $lang \
74
+ $exp_root/mono_ali_${tri1_train} $exp_root/tri1
75
+
76
+ utils/mkgraph.sh $lang_test $exp_root/tri1 $exp_root/tri1/graph
77
+ steps/decode.sh --nj 20 --cmd "$decode_cmd" \
78
+ $exp_root/tri1/graph $data/$valid $exp_root/tri1/decode_$valid &
79
+ fi
80
+
81
+ if [ $stage -le 3 ] && [ $max_stage -ge 3 ]; then
82
+ # train an LDA+MLLT system.
83
+ if [ ! $tri2b_size -eq -1 ]; then
84
+ utils/subset_data_dir.sh $data/$train $tri2b_size $data/${train}_${tri2b_size}
85
+ tri2b_train=${train}_${tri2b_size}
86
+ else
87
+ tri2b_train=${train}
88
+ fi
89
+
90
+ steps/align_si.sh --nj 10 --cmd "$train_cmd" \
91
+ $data/$tri2b_train $lang \
92
+ $exp_root/tri1 $exp_root/tri1_ali_${tri2b_train}
93
+
94
+ steps_gan/train_lda_mllt.sh --cmd "$train_cmd" \
95
+ --num_nonsil_states $num_nonsil_states \
96
+ --splice-opts "--left-context=3 --right-context=3" $numLeavesMLLT $numGaussMLLT \
97
+ $data/$tri2b_train $lang \
98
+ $exp_root/tri1_ali_${tri2b_train} $exp_root/tri2b
99
+
100
+ utils/mkgraph.sh $lang_test $exp_root/tri2b $exp_root/tri2b/graph
101
+ steps/decode.sh --nj 20 --cmd "$decode_cmd" \
102
+ $exp_root/tri2b/graph $data/$valid $exp_root/tri2b/decode_$valid &
103
+ fi
104
+
105
+
106
+ if [ $stage -le 4 ] && [ $max_stage -ge 4 ]; then
107
+ # Train tri3b, which is LDA+MLLT+SAT (optionally on a subset of the training data)
108
+ if [ ! $tri3b_size -eq -1 ]; then
109
+ utils/subset_data_dir.sh $data/$train $tri3b_size $data/${train}_${tri3b_size}
110
+ tri3b_train=${train}_${tri3b_size}
111
+ else
112
+ tri3b_train=${train}
113
+ fi
114
+
115
+ steps/align_si.sh --nj 10 --cmd "$train_cmd" --use-graphs true \
116
+ $data/$tri3b_train $lang \
117
+ $exp_root/tri2b $exp_root/tri2b_ali_${tri2b_train}
118
+
119
+ steps_gan/train_sat.sh --cmd "$train_cmd" \
120
+ --num_nonsil_states $num_nonsil_states $numLeavesSAT $numGaussSAT \
121
+ $data/$tri3b_train $lang \
122
+ $exp_root/tri2b_ali_${tri2b_train} $exp_root/tri3b
123
+
124
+ utils/mkgraph.sh $lang_test $exp_root/tri3b $exp_root/tri3b/graph
125
+ steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \
126
+ $exp_root/tri3b/graph $data/$valid $exp_root/tri3b/decode_$valid &
127
+ fi
128
+
129
+ wait
fairseq/examples/wav2vec/unsupervised/kaldi_self_train/st/local/unsup_select.py ADDED
@@ -0,0 +1,135 @@
1
+ """
2
+ Implement unsupervised metric for decoding hyperparameter selection:
3
+ $$ alpha * LM_PPL + ViterbiUER(%) * 100 $$
4
+ """
5
+ import argparse
6
+ import logging
7
+ import math
8
+ import sys
9
+
10
+ import kenlm
11
+ import editdistance
12
+ from g2p_en import G2p
13
+
14
+ logging.root.setLevel(logging.INFO)
15
+ logging.basicConfig(stream=sys.stdout, level=logging.INFO)
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
+ def get_parser():
20
+ parser = argparse.ArgumentParser()
21
+ parser.add_argument("ref_tra", help="reference pseudo labels")
22
+ parser.add_argument("hyp_tra", help="decoded pseudo labels to be assess")
23
+ parser.add_argument("--kenlm_path", default="/checkpoint/abaevski/data/speech/libri/librispeech_lm_novox.phnc_o5.bin", help="")
24
+ parser.add_argument("--uppercase", action="store_true", help="")
25
+ parser.add_argument("--skipwords", default="", help="")
26
+ parser.add_argument("--gt_tra", default="", help="ground truth pseudo labels for computing oracle WER")
27
+ parser.add_argument("--min_vt_uer", default=0.0, type=float)
28
+ parser.add_argument("--phonemize", action="store_true", help="phonemize word hypotheses, used when reference is phone transcript")
29
+ parser.add_argument("--phonemize_lexicon", default="", type=str, help="use a lexicon for phonemizing")
30
+ return parser
31
+
32
+ def load_tra(tra_path):
33
+ with open(tra_path, "r") as f:
34
+ uid_to_tra = {}
35
+ for line in f:
36
+ toks = line.rstrip().split()
37
+ uid, tra = toks[0], " ".join(toks[1:])
38
+ uid_to_tra[uid] = tra
39
+ logger.debug(f"loaded {len(uid_to_tra)} utterances from {tra_path}")
40
+ return uid_to_tra
41
+
42
+ def load_lex(lex_path):
43
+ with open(lex_path, "r") as f:
44
+ w2p = {}
45
+ for line in f:
46
+ w, p = line.rstrip().split(None, 1)
47
+ w2p[w] = p.split()
48
+ return w2p
49
+
50
+ def compute_wer(ref_uid_to_tra, hyp_uid_to_tra, g2p, g2p_dict):
51
+ d_cnt = 0
52
+ w_cnt = 0
53
+ w_cnt_h = 0
54
+ for uid in hyp_uid_to_tra:
55
+ ref = ref_uid_to_tra[uid].split()
56
+ if g2p_dict is not None:
57
+ hyp = []
58
+ for word in hyp_uid_to_tra[uid].split():
59
+ if word in g2p_dict:
60
+ hyp = hyp + g2p_dict[word]
61
+ else:
62
+ logger.warning(f"{word} not in g2p_dict")
63
+ elif g2p is not None:
64
+ hyp = g2p(hyp_uid_to_tra[uid])
65
+ hyp = [p for p in hyp if p != "'" and p != " "]
66
+ hyp = [p[:-1] if p[-1].isnumeric() else p for p in hyp]
67
+ else:
68
+ hyp = hyp_uid_to_tra[uid].split()
69
+ logger.debug((
70
+ f"======================\n"
71
+ f"HYP: {' '.join(hyp)}\n"
72
+ f"REF: {' '.join(ref)}"
73
+ ))
74
+ d_cnt += editdistance.eval(ref, hyp)
75
+ w_cnt += len(ref)
76
+ w_cnt_h += len(hyp)
77
+ wer = float(d_cnt) / w_cnt
78
+ logger.debug((
79
+ f"wer = {wer*100:.2f}%; num. of ref words = {w_cnt}; "
80
+ f"num. of hyp words = {w_cnt_h}; num. of sentences = {len(ref_uid_to_tra)}"
81
+ ))
82
+ return wer
83
+
84
+ def compute_lm_ppl(hyp_uid_to_tra, score_fn):
85
+ lm_score = 0.
86
+ w_cnt = 0
87
+ for hyp in hyp_uid_to_tra.values():
88
+ cur_score = score_fn(hyp)
89
+ cur_cnt = len(hyp.split()) + 1 # plus one for </s>
90
+ lm_score += cur_score
91
+ w_cnt += cur_cnt
92
+ logger.debug((
93
+ f"======================\n"
94
+ f"score sum/avg = {cur_score:.2f}/{cur_score/cur_cnt:.2f}\n"
95
+ f"hyp = {hyp}"
96
+ ))
97
+ lm_ppl = math.pow(10, -lm_score / w_cnt)
98
+ logger.debug(f"lm ppl = {lm_ppl:.2f}; num. of words = {w_cnt}")
99
+ return lm_ppl
100
+
101
+ def main():
102
+ args = get_parser().parse_args()
103
+ logger.debug(f"Args: {args}")
104
+
105
+ ref_uid_to_tra = load_tra(args.ref_tra)
106
+ hyp_uid_to_tra = load_tra(args.hyp_tra)
107
+ assert not bool(set(hyp_uid_to_tra.keys()) - set(ref_uid_to_tra.keys()))
108
+
109
+ lm = kenlm.Model(args.kenlm_path)
110
+ skipwords = set(args.skipwords.split(","))
111
+ def compute_lm_score(s):
112
+ s = " ".join(w for w in s.split() if w not in skipwords)
113
+ s = s.upper() if args.uppercase else s
114
+ return lm.score(s)
115
+
116
+ g2p, g2p_dict = None, None
117
+ if args.phonemize:
118
+ if args.phonemize_lexicon:
119
+ g2p_dict = load_lex(args.phonemize_lexicon)
120
+ else:
121
+ g2p = G2p()
122
+
123
+ wer = compute_wer(ref_uid_to_tra, hyp_uid_to_tra, g2p, g2p_dict)
124
+ lm_ppl = compute_lm_ppl(hyp_uid_to_tra, compute_lm_score)
125
+
126
+ gt_wer = -math.inf
127
+ if args.gt_tra:
128
+ gt_uid_to_tra = load_tra(args.gt_tra)
129
+ gt_wer = compute_wer(gt_uid_to_tra, hyp_uid_to_tra, None, None)
130
+
131
+ score = math.log(lm_ppl) * max(wer, args.min_vt_uer)
132
+ logging.info(f"{args.hyp_tra}: score={score:.4f}; wer={wer*100:.2f}%; lm_ppl={lm_ppl:.4f}; gt_wer={gt_wer*100:.2f}%")
133
+
134
+ if __name__ == "__main__":
135
+ main()
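The selection score computed above is the log LM perplexity of the decoded output multiplied by its Viterbi UER against the pseudo labels (floored at --min_vt_uer); the decoding run with the lowest score wins. A minimal sketch (not part of this commit) of that ranking, with made-up perplexity/UER numbers purely for illustration:

import math

# Hypothetical candidates: one entry per decoding hyperparameter setting.
candidates = {
    "lmwt=7":  {"lm_ppl": 38.2, "uer": 0.31},
    "lmwt=10": {"lm_ppl": 24.5, "uer": 0.27},
    "lmwt=13": {"lm_ppl": 21.9, "uer": 0.33},
}
min_vt_uer = 0.0  # same default as the --min_vt_uer flag above

def score(c):
    # mirrors: score = math.log(lm_ppl) * max(wer, args.min_vt_uer)
    return math.log(c["lm_ppl"]) * max(c["uer"], min_vt_uer)

best = min(candidates, key=lambda k: score(candidates[k]))
for name, c in candidates.items():
    print(f"{name}: score={score(c):.4f}")
print("selected:", best)  # the setting with the lowest combined score is kept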
fairseq/examples/wav2vec/unsupervised/kaldi_self_train/st/local/unsup_select_decode.sh ADDED
@@ -0,0 +1,37 @@
1
+ #!/bin/bash
2
+
3
+ split="dev_other"
4
+ ref_txt="" # ground truth transcript path
5
+ psd_txt="" # pseudo transcript path
6
+ get_best_wer=true
7
+ dec_name="decode"
8
+ graph_name="graph"
9
+ kenlm_path=/checkpoint/abaevski/data/speech/libri/librispeech_lm_novox.phnc_o6.bin
10
+
11
+ . ./cmd.sh
12
+ . ./path.sh
13
+ . parse_options.sh
14
+
15
+ exp_root=$1
16
+ unsup_args=""
17
+ if [ $# -ge 2 ]; then
18
+ unsup_args=$2
19
+ fi
20
+
21
+ set -eu
22
+
23
+ if [ ! -z $ref_txt ] && $get_best_wer; then
24
+ echo "==== WER w.r.t. real transcript (select based on unsupervised metric)"
25
+ for x in $exp_root/*/${dec_name}_${split}*; do
26
+ lang=$(dirname $x)/$graph_name
27
+
28
+ (
29
+ for tra in $x/scoring/*.tra; do
30
+ cat $tra | utils/int2sym.pl -f 2- $lang/words.txt | sed 's:<UNK>::g' | sed 's:<SIL>::g' > $tra.txt
31
+ python local/unsup_select.py $psd_txt $tra.txt --kenlm_path $kenlm_path --gt_tra $ref_txt $unsup_args
32
+ done 2>/dev/null | grep "score=" | sed 's/=/ /g' | sed 's/;//g' | sort -k3n | head -n1
33
+ ) &
34
+ done
35
+ fi
36
+ wait
37
+
fairseq/examples/wav2vec/unsupervised/kaldi_self_train/st/local/unsup_select_decode_word.sh ADDED
@@ -0,0 +1,35 @@
1
+ #!/bin/bash
2
+
3
+ split="dev_other"
4
+ ref_txt="" # ground truth transcript path
5
+ psd_txt="" # pseudo transcript path
6
+ get_best_wer=true
7
+ dec_name="decode"
8
+ graph_name="graph"
9
+ kenlm_path=/checkpoint/abaevski/data/speech/libri/librispeech_lm_novox.phnc_o6.bin
10
+ phonemize_lexicon=""
11
+
12
+ . ./cmd.sh
13
+ . ./path.sh
14
+ . parse_options.sh
15
+ . /private/home/wnhsu/unsup_asr/fairseq-py-unsup/env.sh
16
+
17
+ exp_root=$1
18
+
19
+ set -eu
20
+
21
+ if [ ! -z $ref_txt ] && $get_best_wer; then
22
+ echo "==== WER w.r.t. real transcript (select based on unsupervised metric)"
23
+ for x in $exp_root/*/${dec_name}_${split}*; do
24
+ lang=$(dirname $x)/$graph_name
25
+
26
+ for tra in $x/scoring/*.tra; do
27
+ cat $tra | utils/int2sym.pl -f 2- $lang/words.txt | sed 's:\<UNK\>::g' > $tra.txt
28
+ python local/unsup_select.py $psd_txt $tra.txt \
29
+ --kenlm_path $kenlm_path --gt_tra $ref_txt --phonemize \
30
+ --phonemize_lexicon "$phonemize_lexicon"
31
+ done | grep "score=" | sed 's/=/ /g' | sed 's/;//g' | sort -k3n | head -n1
32
+ done
33
+ fi
34
+
35
+
fairseq/examples/wav2vec/unsupervised/kaldi_self_train/st/steps_gan/train_deltas.sh ADDED
@@ -0,0 +1,175 @@
1
+ #!/usr/bin/env bash
2
+
3
+ # Copyright 2012 Johns Hopkins University (Author: Daniel Povey)
4
+ # Apache 2.0
5
+
6
+ # Begin configuration.
7
+ stage=-4 # This allows restarting partway through, when something went wrong.
8
+ config=
9
+ cmd=run.pl
10
+ scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
11
+ realign_iters="10 20 30";
12
+ num_iters=35 # Number of iterations of training
13
+ max_iter_inc=25 # Last iter to increase #Gauss on.
14
+ beam=10
15
+ careful=false
16
+ retry_beam=40
17
+ boost_silence=1.0 # Factor by which to boost silence likelihoods in alignment
18
+ power=0.25 # Exponent for number of gaussians according to occurrence counts
19
+ cluster_thresh=-1 # controls the final bottom-up clustering of leaves in build-tree
20
+ norm_vars=false # deprecated. Prefer --cmvn-opts "--norm-vars=true"
21
+ # use the option --cmvn-opts "--norm-means=false"
22
+ cmvn_opts=
23
+ delta_opts=
24
+ context_opts= # use "--context-width=5 --central-position=2" for quinphone
25
+ num_nonsil_states=3
26
+ # End configuration.
27
+
28
+ echo "$0 $@" # Print the command line for logging
29
+
30
+ [ -f path.sh ] && . ./path.sh;
31
+ . parse_options.sh || exit 1;
32
+
33
+ if [ $# != 6 ]; then
34
+ echo "Usage: steps/train_deltas.sh <num-leaves> <tot-gauss> <data-dir> <lang-dir> <alignment-dir> <exp-dir>"
35
+ echo "e.g.: steps/train_deltas.sh 2000 10000 data/train_si84_half data/lang exp/mono_ali exp/tri1"
36
+ echo "main options (for others, see top of script file)"
37
+ echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
38
+ echo " --config <config-file> # config containing options"
39
+ echo " --stage <stage> # stage to do partial re-run from."
40
+ exit 1;
41
+ fi
42
+
43
+ numleaves=$1
44
+ totgauss=$2
45
+ data=$3
46
+ lang=$4
47
+ alidir=$5
48
+ dir=$6
49
+
50
+ for f in $alidir/final.mdl $alidir/ali.1.gz $data/feats.scp $lang/phones.txt; do
51
+ [ ! -f $f ] && echo "train_deltas.sh: no such file $f" && exit 1;
52
+ done
53
+
54
+ numgauss=$numleaves
55
+ incgauss=$[($totgauss-$numgauss)/$max_iter_inc] # per-iter increment for #Gauss
56
+ oov=`cat $lang/oov.int` || exit 1;
57
+ ciphonelist=`cat $lang/phones/context_indep.csl` || exit 1;
58
+ nj=`cat $alidir/num_jobs` || exit 1;
59
+ mkdir -p $dir/log
60
+ echo $nj > $dir/num_jobs
61
+
62
+ utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
63
+ cp $lang/phones.txt $dir || exit 1;
64
+
65
+ sdata=$data/split$nj;
66
+ split_data.sh $data $nj || exit 1;
67
+
68
+
69
+ [ $(cat $alidir/cmvn_opts 2>/dev/null | wc -c) -gt 1 ] && [ -z "$cmvn_opts" ] && \
70
+ echo "$0: warning: ignoring CMVN options from source directory $alidir"
71
+ $norm_vars && cmvn_opts="--norm-vars=true $cmvn_opts"
72
+ echo $cmvn_opts > $dir/cmvn_opts # keep track of options to CMVN.
73
+ [ ! -z $delta_opts ] && echo $delta_opts > $dir/delta_opts
74
+
75
+ feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |"
76
+
77
+ rm $dir/.error 2>/dev/null
78
+
79
+ if [ $stage -le -3 ]; then
80
+ echo "$0: accumulating tree stats"
81
+ $cmd JOB=1:$nj $dir/log/acc_tree.JOB.log \
82
+ acc-tree-stats $context_opts \
83
+ --ci-phones=$ciphonelist $alidir/final.mdl "$feats" \
84
+ "ark:gunzip -c $alidir/ali.JOB.gz|" $dir/JOB.treeacc || exit 1;
85
+ sum-tree-stats $dir/treeacc $dir/*.treeacc 2>$dir/log/sum_tree_acc.log || exit 1;
86
+ rm $dir/*.treeacc
87
+ fi
88
+
89
+ if [ $stage -le -2 ]; then
90
+ echo "$0: getting questions for tree-building, via clustering"
91
+ # preparing questions, roots file...
92
+ cluster-phones --pdf-class-list=$(($num_nonsil_states / 2)) $context_opts \
93
+ $dir/treeacc $lang/phones/sets.int \
94
+ $dir/questions.int 2> $dir/log/questions.log || exit 1;
95
+ cat $lang/phones/extra_questions.int >> $dir/questions.int
96
+ compile-questions $context_opts $lang/topo $dir/questions.int \
97
+ $dir/questions.qst 2>$dir/log/compile_questions.log || exit 1;
98
+
99
+ echo "$0: building the tree"
100
+ $cmd $dir/log/build_tree.log \
101
+ build-tree $context_opts --verbose=1 --max-leaves=$numleaves \
102
+ --cluster-thresh=$cluster_thresh $dir/treeacc $lang/phones/roots.int \
103
+ $dir/questions.qst $lang/topo $dir/tree || exit 1;
104
+
105
+ $cmd $dir/log/init_model.log \
106
+ gmm-init-model --write-occs=$dir/1.occs \
107
+ $dir/tree $dir/treeacc $lang/topo $dir/1.mdl || exit 1;
108
+ if grep 'no stats' $dir/log/init_model.log; then
109
+ echo "** The warnings above about 'no stats' generally mean you have phones **"
110
+ echo "** (or groups of phones) in your phone set that had no corresponding data. **"
111
+ echo "** You should probably figure out whether something went wrong, **"
112
+ echo "** or whether your data just doesn't happen to have examples of those **"
113
+ echo "** phones. **"
114
+ fi
115
+
116
+ gmm-mixup --mix-up=$numgauss $dir/1.mdl $dir/1.occs $dir/1.mdl 2>$dir/log/mixup.log || exit 1;
117
+ rm $dir/treeacc
118
+ fi
119
+
120
+ if [ $stage -le -1 ]; then
121
+ # Convert the alignments.
122
+ echo "$0: converting alignments from $alidir to use current tree"
123
+ $cmd JOB=1:$nj $dir/log/convert.JOB.log \
124
+ convert-ali $alidir/final.mdl $dir/1.mdl $dir/tree \
125
+ "ark:gunzip -c $alidir/ali.JOB.gz|" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
126
+ fi
127
+
128
+ if [ $stage -le 0 ]; then
129
+ echo "$0: compiling graphs of transcripts"
130
+ $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
131
+ compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $dir/tree $dir/1.mdl $lang/L.fst \
132
+ "ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $sdata/JOB/text |" \
133
+ "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
134
+ fi
135
+
136
+ x=1
137
+ while [ $x -lt $num_iters ]; do
138
+ echo "$0: training pass $x"
139
+ if [ $stage -le $x ]; then
140
+ if echo $realign_iters | grep -w $x >/dev/null; then
141
+ echo "$0: aligning data"
142
+ mdl="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $dir/$x.mdl - |"
143
+ $cmd JOB=1:$nj $dir/log/align.$x.JOB.log \
144
+ gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam --careful=$careful "$mdl" \
145
+ "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" \
146
+ "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
147
+ fi
148
+ $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
149
+ gmm-acc-stats-ali $dir/$x.mdl "$feats" \
150
+ "ark,s,cs:gunzip -c $dir/ali.JOB.gz|" $dir/$x.JOB.acc || exit 1;
151
+ $cmd $dir/log/update.$x.log \
152
+ gmm-est --mix-up=$numgauss --power=$power \
153
+ --write-occs=$dir/$[$x+1].occs $dir/$x.mdl \
154
+ "gmm-sum-accs - $dir/$x.*.acc |" $dir/$[$x+1].mdl || exit 1;
155
+ rm $dir/$x.mdl $dir/$x.*.acc
156
+ rm $dir/$x.occs
157
+ fi
158
+ [ $x -le $max_iter_inc ] && numgauss=$[$numgauss+$incgauss];
159
+ x=$[$x+1];
160
+ done
161
+
162
+ rm $dir/final.mdl $dir/final.occs 2>/dev/null
163
+ ln -s $x.mdl $dir/final.mdl
164
+ ln -s $x.occs $dir/final.occs
165
+
166
+ steps/diagnostic/analyze_alignments.sh --cmd "$cmd" $lang $dir
167
+
168
+ # Summarize warning messages...
169
+ utils/summarize_warnings.pl $dir/log
170
+
171
+ steps/info/gmm_dir_info.pl $dir
172
+
173
+ echo "$0: Done training system with delta+delta-delta features in $dir"
174
+
175
+ exit 0
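train_deltas.sh grows the Gaussian count linearly: it starts from the number of tree leaves, adds (tot-gauss - num-leaves) / max_iter_inc Gaussians after each of the first max_iter_inc passes, and then holds steady. A small sketch (not part of the commit) of that schedule, plugging in the tri1 values train_subset_lgbeam.sh passes (2000 leaves, 10000 Gaussians):

# Mirrors incgauss=$[($totgauss-$numgauss)/$max_iter_inc] and the per-pass update.
num_leaves, tot_gauss, max_iter_inc, num_iters = 2000, 10000, 25, 35

num_gauss = num_leaves
inc_gauss = (tot_gauss - num_gauss) // max_iter_inc  # integer division, as in $[...]
schedule = []
for x in range(1, num_iters):
    schedule.append((x, num_gauss))  # pass x trains with num_gauss Gaussians
    if x <= max_iter_inc:
        num_gauss += inc_gauss       # mixed up via gmm-est --mix-up

print(schedule[:3], "...", schedule[-3:])
# passes 1-3 use 2000, 2320, 2640 Gaussians; passes 26-34 stay at 10000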
fairseq/examples/wav2vec/unsupervised/kaldi_self_train/st/steps_gan/train_lda_mllt.sh ADDED
@@ -0,0 +1,239 @@
1
+ #!/usr/bin/env bash
2
+
3
+ # Copyright 2012 Johns Hopkins University (Author: Daniel Povey)
4
+ #
5
+ # LDA+MLLT refers to the way we transform the features after computing
6
+ # the MFCCs: we splice across several frames, reduce the dimension (to 40
7
+ # by default) using Linear Discriminant Analysis (LDA), and then later estimate,
8
+ # over multiple iterations, a diagonalizing transform known as MLLT or STC.
9
+ # See http://kaldi-asr.org/doc/transform.html for more explanation.
10
+ #
11
+ # Apache 2.0.
12
+
13
+ # Begin configuration.
14
+ cmd=run.pl
15
+ config=
16
+ stage=-5
17
+ scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
18
+ realign_iters="10 20 30";
19
+ mllt_iters="2 4 6 12";
20
+ num_iters=35 # Number of iterations of training
21
+ max_iter_inc=25 # Last iter to increase #Gauss on.
22
+ dim=40
23
+ beam=10
24
+ retry_beam=40
25
+ careful=false
26
+ boost_silence=1.0 # Factor by which to boost silence likelihoods in alignment
27
+ power=0.25 # Exponent for number of gaussians according to occurrence counts
28
+ randprune=4.0 # This is approximately the ratio by which we will speed up the
29
+ # LDA and MLLT calculations via randomized pruning.
30
+ splice_opts=
31
+ cluster_thresh=-1 # controls the final bottom-up clustering of leaves in build-tree
32
+ norm_vars=false # deprecated. Prefer --cmvn-opts "--norm-vars=false"
33
+ cmvn_opts=
34
+ context_opts= # use "--context-width=5 --central-position=2" for quinphone.
35
+ # End configuration.
36
+ train_tree=true # if false, don't actually train the tree.
37
+ use_lda_mat= # If supplied, use this LDA[+MLLT] matrix.
38
+ num_nonsil_states=3
39
+
40
+ echo "$0 $@" # Print the command line for logging
41
+
42
+ [ -f path.sh ] && . ./path.sh
43
+ . parse_options.sh || exit 1;
44
+
45
+ if [ $# != 6 ]; then
46
+ echo "Usage: steps/train_lda_mllt.sh [options] <#leaves> <#gauss> <data> <lang> <alignments> <dir>"
47
+ echo " e.g.: steps/train_lda_mllt.sh 2500 15000 data/train_si84 data/lang exp/tri1_ali_si84 exp/tri2b"
48
+ echo "Main options (for others, see top of script file)"
49
+ echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
50
+ echo " --config <config-file> # config containing options"
51
+ echo " --stage <stage> # stage to do partial re-run from."
52
+ exit 1;
53
+ fi
54
+
55
+ numleaves=$1
56
+ totgauss=$2
57
+ data=$3
58
+ lang=$4
59
+ alidir=$5
60
+ dir=$6
61
+
62
+ for f in $alidir/final.mdl $alidir/ali.1.gz $data/feats.scp $lang/phones.txt; do
63
+ [ ! -f $f ] && echo "train_lda_mllt.sh: no such file $f" && exit 1;
64
+ done
65
+
66
+ numgauss=$numleaves
67
+ incgauss=$[($totgauss-$numgauss)/$max_iter_inc] # per-iter #gauss increment
68
+ oov=`cat $lang/oov.int` || exit 1;
69
+ nj=`cat $alidir/num_jobs` || exit 1;
70
+ silphonelist=`cat $lang/phones/silence.csl` || exit 1;
71
+ ciphonelist=`cat $lang/phones/context_indep.csl` || exit 1;
72
+
73
+ mkdir -p $dir/log
74
+
75
+ utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
76
+ cp $lang/phones.txt $dir || exit 1;
77
+
78
+ echo $nj >$dir/num_jobs
79
+ echo "$splice_opts" >$dir/splice_opts # keep track of frame-splicing options
80
+ # so that later stages of system building can know what they were.
81
+
82
+
83
+ [ $(cat $alidir/cmvn_opts 2>/dev/null | wc -c) -gt 1 ] && [ -z "$cmvn_opts" ] && \
84
+ echo "$0: warning: ignoring CMVN options from source directory $alidir"
85
+ $norm_vars && cmvn_opts="--norm-vars=true $cmvn_opts"
86
+ echo $cmvn_opts > $dir/cmvn_opts # keep track of options to CMVN.
87
+
88
+ sdata=$data/split$nj;
89
+ split_data.sh $data $nj || exit 1;
90
+
91
+ splicedfeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- |"
92
+ # Note: $feats gets overwritten later in the script.
93
+ feats="$splicedfeats transform-feats $dir/0.mat ark:- ark:- |"
94
+
95
+
96
+
97
+ if [ $stage -le -5 ]; then
98
+ if [ -z "$use_lda_mat" ]; then
99
+ echo "$0: Accumulating LDA statistics."
100
+ rm $dir/lda.*.acc 2>/dev/null
101
+ $cmd JOB=1:$nj $dir/log/lda_acc.JOB.log \
102
+ ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \
103
+ weight-silence-post 0.0 $silphonelist $alidir/final.mdl ark:- ark:- \| \
104
+ acc-lda --rand-prune=$randprune $alidir/final.mdl "$splicedfeats" ark,s,cs:- \
105
+ $dir/lda.JOB.acc || exit 1;
106
+ est-lda --write-full-matrix=$dir/full.mat --dim=$dim $dir/0.mat $dir/lda.*.acc \
107
+ 2>$dir/log/lda_est.log || exit 1;
108
+ rm $dir/lda.*.acc
109
+ else
110
+ echo "$0: Using supplied LDA matrix $use_lda_mat"
111
+ cp $use_lda_mat $dir/0.mat || exit 1;
112
+ [ ! -z "$mllt_iters" ] && \
113
+ echo "$0: Warning: using supplied LDA matrix $use_lda_mat but we will do MLLT," && \
114
+ echo " which you might not want; to disable MLLT, specify --mllt-iters ''" && \
115
+ sleep 5
116
+ fi
117
+ fi
118
+
119
+ cur_lda_iter=0
120
+
121
+ if [ $stage -le -4 ] && $train_tree; then
122
+ echo "$0: Accumulating tree stats"
123
+ $cmd JOB=1:$nj $dir/log/acc_tree.JOB.log \
124
+ acc-tree-stats $context_opts \
125
+ --ci-phones=$ciphonelist $alidir/final.mdl "$feats" \
126
+ "ark:gunzip -c $alidir/ali.JOB.gz|" $dir/JOB.treeacc || exit 1;
127
+ [ `ls $dir/*.treeacc | wc -w` -ne "$nj" ] && echo "$0: Wrong #tree-accs" && exit 1;
128
+ $cmd $dir/log/sum_tree_acc.log \
129
+ sum-tree-stats $dir/treeacc $dir/*.treeacc || exit 1;
130
+ rm $dir/*.treeacc
131
+ fi
132
+
133
+
134
+ if [ $stage -le -3 ] && $train_tree; then
135
+ echo "$0: Getting questions for tree clustering."
136
+ # preparing questions, roots file...
137
+ cluster-phones --pdf-class-list=$(($num_nonsil_states / 2)) $context_opts $dir/treeacc $lang/phones/sets.int \
138
+ $dir/questions.int 2> $dir/log/questions.log || exit 1;
139
+ cat $lang/phones/extra_questions.int >> $dir/questions.int
140
+ compile-questions $context_opts $lang/topo $dir/questions.int \
141
+ $dir/questions.qst 2>$dir/log/compile_questions.log || exit 1;
142
+
143
+ echo "$0: Building the tree"
144
+ $cmd $dir/log/build_tree.log \
145
+ build-tree $context_opts --verbose=1 --max-leaves=$numleaves \
146
+ --cluster-thresh=$cluster_thresh $dir/treeacc $lang/phones/roots.int \
147
+ $dir/questions.qst $lang/topo $dir/tree || exit 1;
148
+ fi
149
+
150
+ if [ $stage -le -2 ]; then
151
+ echo "$0: Initializing the model"
152
+ if $train_tree; then
153
+ gmm-init-model --write-occs=$dir/1.occs \
154
+ $dir/tree $dir/treeacc $lang/topo $dir/1.mdl 2> $dir/log/init_model.log || exit 1;
155
+ grep 'no stats' $dir/log/init_model.log && echo "This is a bad warning.";
156
+ rm $dir/treeacc
157
+ else
158
+ cp $alidir/tree $dir/ || exit 1;
159
+ $cmd JOB=1 $dir/log/init_model.log \
160
+ gmm-init-model-flat $dir/tree $lang/topo $dir/1.mdl \
161
+ "$feats subset-feats ark:- ark:-|" || exit 1;
162
+ fi
163
+ fi
164
+
165
+
166
+ if [ $stage -le -1 ]; then
167
+ # Convert the alignments.
168
+ echo "$0: Converting alignments from $alidir to use current tree"
169
+ $cmd JOB=1:$nj $dir/log/convert.JOB.log \
170
+ convert-ali $alidir/final.mdl $dir/1.mdl $dir/tree \
171
+ "ark:gunzip -c $alidir/ali.JOB.gz|" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
172
+ fi
173
+
174
+ if [ $stage -le 0 ] && [ "$realign_iters" != "" ]; then
175
+ echo "$0: Compiling graphs of transcripts"
176
+ $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
177
+ compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $dir/tree $dir/1.mdl $lang/L.fst \
178
+ "ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $data/split$nj/JOB/text |" \
179
+ "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
180
+ fi
181
+
182
+
183
+ x=1
184
+ while [ $x -lt $num_iters ]; do
185
+ echo Training pass $x
186
+ if echo $realign_iters | grep -w $x >/dev/null && [ $stage -le $x ]; then
187
+ echo Aligning data
188
+ mdl="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $dir/$x.mdl - |"
189
+ $cmd JOB=1:$nj $dir/log/align.$x.JOB.log \
190
+ gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam --careful=$careful "$mdl" \
191
+ "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" \
192
+ "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
193
+ fi
194
+ if echo $mllt_iters | grep -w $x >/dev/null; then
195
+ if [ $stage -le $x ]; then
196
+ echo "$0: Estimating MLLT"
197
+ $cmd JOB=1:$nj $dir/log/macc.$x.JOB.log \
198
+ ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:- \| \
199
+ weight-silence-post 0.0 $silphonelist $dir/$x.mdl ark:- ark:- \| \
200
+ gmm-acc-mllt --rand-prune=$randprune $dir/$x.mdl "$feats" ark:- $dir/$x.JOB.macc \
201
+ || exit 1;
202
+ est-mllt $dir/$x.mat.new $dir/$x.*.macc 2> $dir/log/mupdate.$x.log || exit 1;
203
+ gmm-transform-means $dir/$x.mat.new $dir/$x.mdl $dir/$x.mdl \
204
+ 2> $dir/log/transform_means.$x.log || exit 1;
205
+ compose-transforms --print-args=false $dir/$x.mat.new $dir/$cur_lda_iter.mat $dir/$x.mat || exit 1;
206
+ rm $dir/$x.*.macc
207
+ fi
208
+ feats="$splicedfeats transform-feats $dir/$x.mat ark:- ark:- |"
209
+ cur_lda_iter=$x
210
+ fi
211
+
212
+ if [ $stage -le $x ]; then
213
+ $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
214
+ gmm-acc-stats-ali $dir/$x.mdl "$feats" \
215
+ "ark,s,cs:gunzip -c $dir/ali.JOB.gz|" $dir/$x.JOB.acc || exit 1;
216
+ $cmd $dir/log/update.$x.log \
217
+ gmm-est --write-occs=$dir/$[$x+1].occs --mix-up=$numgauss --power=$power \
218
+ $dir/$x.mdl "gmm-sum-accs - $dir/$x.*.acc |" $dir/$[$x+1].mdl || exit 1;
219
+ rm $dir/$x.mdl $dir/$x.*.acc $dir/$x.occs
220
+ fi
221
+ [ $x -le $max_iter_inc ] && numgauss=$[$numgauss+$incgauss];
222
+ x=$[$x+1];
223
+ done
224
+
225
+ rm $dir/final.{mdl,mat,occs} 2>/dev/null
226
+ ln -s $x.mdl $dir/final.mdl
227
+ ln -s $x.occs $dir/final.occs
228
+ ln -s $cur_lda_iter.mat $dir/final.mat
229
+
230
+ steps/diagnostic/analyze_alignments.sh --cmd "$cmd" $lang $dir
231
+
232
+ # Summarize warning messages...
233
+ utils/summarize_warnings.pl $dir/log
234
+
235
+ steps/info/gmm_dir_info.pl $dir
236
+
237
+ echo "$0: Done training system with LDA+MLLT features in $dir"
238
+
239
+ exit 0
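With the splice options passed from train_subset_lgbeam.sh ("--left-context=3 --right-context=3"), each frame is stacked with its 3 left and 3 right neighbours before the LDA projection to dim=40. A toy numpy sketch (not part of the commit) of that splice-then-project step; edge frames are simply clamped here, and the projection matrix is a random stand-in for the estimated LDA[+MLLT] matrix in 0.mat:

import numpy as np

T, D, left, right, lda_dim = 100, 13, 3, 3, 40
rng = np.random.default_rng(0)
feats = rng.normal(size=(T, D))  # e.g. 13-dim MFCC frames after CMVN

# splice-feats: frame t becomes [x_{t-3}, ..., x_t, ..., x_{t+3}], clamped at the edges
idx = np.clip(np.arange(T)[:, None] + np.arange(-left, right + 1)[None, :], 0, T - 1)
spliced = feats[idx].reshape(T, (left + right + 1) * D)  # T x 91

# transform-feats: project the spliced frames down to lda_dim dimensions
lda = rng.normal(size=(lda_dim, spliced.shape[1]))  # stand-in for $dir/0.mat
projected = spliced @ lda.T                         # T x 40
print(spliced.shape, "->", projected.shape)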
fairseq/examples/wav2vec/unsupervised/kaldi_self_train/st/steps_gan/train_sat.sh ADDED
@@ -0,0 +1,281 @@
1
+ #!/usr/bin/env bash
2
+ # Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0.
3
+
4
+
5
+ # This does Speaker Adapted Training (SAT), i.e. train on
6
+ # fMLLR-adapted features. It can be done on top of either LDA+MLLT, or
7
+ # delta and delta-delta features. If there are no transforms supplied
8
+ # in the alignment directory, it will estimate transforms itself before
9
+ # building the tree (and in any case, it estimates transforms a number
10
+ # of times during training).
11
+
12
+
13
+ # Begin configuration section.
14
+ stage=-5
15
+ exit_stage=-100 # you can use this to require it to exit at the
16
+ # beginning of a specific stage. Not all values are
17
+ # supported.
18
+ fmllr_update_type=full
19
+ cmd=run.pl
20
+ scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
21
+ beam=10
22
+ retry_beam=40
23
+ careful=false
24
+ boost_silence=1.0 # Factor by which to boost silence likelihoods in alignment
25
+ context_opts= # e.g. set this to "--context-width 5 --central-position 2" for quinphone.
26
+ realign_iters="10 20 30";
27
+ fmllr_iters="2 4 6 12";
28
+ silence_weight=0.0 # Weight on silence in fMLLR estimation.
29
+ num_iters=35 # Number of iterations of training
30
+ max_iter_inc=25 # Last iter to increase #Gauss on.
31
+ power=0.2 # Exponent for number of gaussians according to occurrence counts
32
+ cluster_thresh=-1 # controls the final bottom-up clustering of leaves in build-tree
33
+ phone_map=
34
+ train_tree=true
35
+ tree_stats_opts=
36
+ cluster_phones_opts=
37
+ compile_questions_opts=
38
+ # End configuration section.
39
+ num_nonsil_states=3
40
+
41
+ echo "$0 $@" # Print the command line for logging
42
+
43
+ [ -f path.sh ] && . ./path.sh
44
+ . parse_options.sh || exit 1;
45
+
46
+ if [ $# != 6 ]; then
47
+ echo "Usage: steps/train_sat.sh <#leaves> <#gauss> <data> <lang> <ali-dir> <exp-dir>"
48
+ echo " e.g.: steps/train_sat.sh 2500 15000 data/train_si84 data/lang exp/tri2b_ali_si84 exp/tri3b"
49
+ echo "Main options (for others, see top of script file)"
50
+ echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
51
+ echo " --config <config-file> # config containing options"
52
+ echo " --stage <stage> # stage to do partial re-run from."
53
+ exit 1;
54
+ fi
55
+
56
+ numleaves=$1
57
+ totgauss=$2
58
+ data=$3
59
+ lang=$4
60
+ alidir=$5
61
+ dir=$6
62
+
63
+ for f in $data/feats.scp $lang/phones.txt $alidir/final.mdl $alidir/ali.1.gz; do
64
+ [ ! -f $f ] && echo "train_sat.sh: no such file $f" && exit 1;
65
+ done
66
+
67
+ numgauss=$numleaves
68
+ incgauss=$[($totgauss-$numgauss)/$max_iter_inc] # per-iter #gauss increment
69
+ oov=`cat $lang/oov.int`
70
+ nj=`cat $alidir/num_jobs` || exit 1;
71
+ silphonelist=`cat $lang/phones/silence.csl`
72
+ ciphonelist=`cat $lang/phones/context_indep.csl` || exit 1;
73
+ sdata=$data/split$nj;
74
+ splice_opts=`cat $alidir/splice_opts 2>/dev/null` # frame-splicing options.
75
+ cmvn_opts=`cat $alidir/cmvn_opts 2>/dev/null`
76
+ delta_opts=`cat $alidir/delta_opts 2>/dev/null`
77
+ phone_map_opt=
78
+ [ ! -z "$phone_map" ] && phone_map_opt="--phone-map='$phone_map'"
79
+
80
+ mkdir -p $dir/log
81
+ cp $alidir/splice_opts $dir 2>/dev/null # frame-splicing options.
82
+ cp $alidir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option.
83
+ cp $alidir/delta_opts $dir 2>/dev/null # delta option.
84
+
85
+ utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
86
+ cp $lang/phones.txt $dir || exit 1;
87
+
88
+ echo $nj >$dir/num_jobs
89
+ [[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
90
+
91
+ # Set up features.
92
+
93
+ if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
94
+ echo "$0: feature type is $feat_type"
95
+
96
+ ## Set up speaker-independent features.
97
+ case $feat_type in
98
+ delta) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";;
99
+ lda) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
100
+ cp $alidir/final.mat $dir
101
+ cp $alidir/full.mat $dir 2>/dev/null
102
+ ;;
103
+ *) echo "$0: invalid feature type $feat_type" && exit 1;
104
+ esac
105
+
106
+ ## Get initial fMLLR transforms (possibly from alignment dir)
107
+ if [ -f $alidir/trans.1 ]; then
108
+ echo "$0: Using transforms from $alidir"
109
+ feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$alidir/trans.JOB ark:- ark:- |"
110
+ cur_trans_dir=$alidir
111
+ else
112
+ if [ $stage -le -5 ]; then
113
+ echo "$0: obtaining initial fMLLR transforms since not present in $alidir"
114
+ # The next line is necessary because of $silphonelist otherwise being incorrect; would require
115
+ # old $lang dir which would require another option. Not needed anyway.
116
+ [ ! -z "$phone_map" ] && \
117
+ echo "$0: error: you must provide transforms if you use the --phone-map option." && exit 1;
118
+ $cmd JOB=1:$nj $dir/log/fmllr.0.JOB.log \
119
+ ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \
120
+ weight-silence-post $silence_weight $silphonelist $alidir/final.mdl ark:- ark:- \| \
121
+ gmm-est-fmllr --fmllr-update-type=$fmllr_update_type \
122
+ --spk2utt=ark:$sdata/JOB/spk2utt $alidir/final.mdl "$sifeats" \
123
+ ark:- ark:$dir/trans.JOB || exit 1;
124
+ fi
125
+ feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$dir/trans.JOB ark:- ark:- |"
126
+ cur_trans_dir=$dir
127
+ fi
128
+
129
+ if [ $stage -le -4 ] && $train_tree; then
130
+ # Get tree stats.
131
+ echo "$0: Accumulating tree stats"
132
+ $cmd JOB=1:$nj $dir/log/acc_tree.JOB.log \
133
+ acc-tree-stats $context_opts $tree_stats_opts $phone_map_opt --ci-phones=$ciphonelist $alidir/final.mdl "$feats" \
134
+ "ark:gunzip -c $alidir/ali.JOB.gz|" $dir/JOB.treeacc || exit 1;
135
+ [ "`ls $dir/*.treeacc | wc -w`" -ne "$nj" ] && echo "$0: Wrong #tree-accs" && exit 1;
136
+ $cmd $dir/log/sum_tree_acc.log \
137
+ sum-tree-stats $dir/treeacc $dir/*.treeacc || exit 1;
138
+ rm $dir/*.treeacc
139
+ fi
140
+
141
+ if [ $stage -le -3 ] && $train_tree; then
142
+ echo "$0: Getting questions for tree clustering."
143
+ # preparing questions, roots file...
144
+ cluster-phones --pdf-class-list=$(($num_nonsil_states / 2)) \
145
+ $cluster_phones_opts $context_opts \
146
+ $dir/treeacc $lang/phones/sets.int $dir/questions.int 2>$dir/log/questions.log || exit 1;
147
+ cat $lang/phones/extra_questions.int >> $dir/questions.int
148
+ compile-questions $context_opts $compile_questions_opts $lang/topo $dir/questions.int $dir/questions.qst 2>$dir/log/compile_questions.log || exit 1;
149
+
150
+ echo "$0: Building the tree"
151
+ $cmd $dir/log/build_tree.log \
152
+ build-tree $context_opts --verbose=1 --max-leaves=$numleaves \
153
+ --cluster-thresh=$cluster_thresh $dir/treeacc $lang/phones/roots.int \
154
+ $dir/questions.qst $lang/topo $dir/tree || exit 1;
155
+ fi
156
+
157
+ if [ $stage -le -2 ]; then
158
+ echo "$0: Initializing the model"
159
+ if $train_tree; then
160
+ gmm-init-model --write-occs=$dir/1.occs \
161
+ $dir/tree $dir/treeacc $lang/topo $dir/1.mdl 2> $dir/log/init_model.log || exit 1;
162
+ grep 'no stats' $dir/log/init_model.log && echo "This is a bad warning.";
163
+ rm $dir/treeacc
164
+ else
165
+ cp $alidir/tree $dir/ || exit 1;
166
+ $cmd JOB=1 $dir/log/init_model.log \
167
+ gmm-init-model-flat $dir/tree $lang/topo $dir/1.mdl \
168
+ "$feats subset-feats ark:- ark:-|" || exit 1;
169
+ fi
170
+ fi
171
+
172
+ if [ $stage -le -1 ]; then
173
+ # Convert the alignments.
174
+ echo "$0: Converting alignments from $alidir to use current tree"
175
+ $cmd JOB=1:$nj $dir/log/convert.JOB.log \
176
+ convert-ali $phone_map_opt $alidir/final.mdl $dir/1.mdl $dir/tree \
177
+ "ark:gunzip -c $alidir/ali.JOB.gz|" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
178
+ fi
179
+
180
+ [ "$exit_stage" -eq 0 ] && echo "$0: Exiting early: --exit-stage $exit_stage" && exit 0;
181
+
182
+ if [ $stage -le 0 ] && [ "$realign_iters" != "" ]; then
183
+ echo "$0: Compiling graphs of transcripts"
184
+ $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
185
+ compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $dir/tree $dir/1.mdl $lang/L.fst \
186
+ "ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $sdata/JOB/text |" \
187
+ "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
188
+ fi
189
+
190
+ x=1
191
+ while [ $x -lt $num_iters ]; do
192
+ echo Pass $x
193
+ if echo $realign_iters | grep -w $x >/dev/null && [ $stage -le $x ]; then
194
+ echo Aligning data
195
+ mdl="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $dir/$x.mdl - |"
196
+ $cmd JOB=1:$nj $dir/log/align.$x.JOB.log \
197
+ gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam --careful=$careful "$mdl" \
198
+ "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" \
199
+ "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
200
+ fi
201
+
202
+ if echo $fmllr_iters | grep -w $x >/dev/null; then
203
+ if [ $stage -le $x ]; then
204
+ echo Estimating fMLLR transforms
205
+ # We estimate a transform that's additional to the previous transform;
206
+ # we'll compose them.
207
+ $cmd JOB=1:$nj $dir/log/fmllr.$x.JOB.log \
208
+ ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:- \| \
209
+ weight-silence-post $silence_weight $silphonelist $dir/$x.mdl ark:- ark:- \| \
210
+ gmm-est-fmllr --fmllr-update-type=$fmllr_update_type \
211
+ --spk2utt=ark:$sdata/JOB/spk2utt $dir/$x.mdl \
212
+ "$feats" ark:- ark:$dir/tmp_trans.JOB || exit 1;
213
+ for n in `seq $nj`; do
214
+ ! ( compose-transforms --b-is-affine=true \
215
+ ark:$dir/tmp_trans.$n ark:$cur_trans_dir/trans.$n ark:$dir/composed_trans.$n \
216
+ && mv $dir/composed_trans.$n $dir/trans.$n && \
217
+ rm $dir/tmp_trans.$n ) 2>$dir/log/compose_transforms.$x.log \
218
+ && echo "$0: Error composing transforms" && exit 1;
219
+ done
220
+ fi
221
+ feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans.JOB ark:- ark:- |"
222
+ cur_trans_dir=$dir
223
+ fi
224
+
225
+ if [ $stage -le $x ]; then
226
+ $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
227
+ gmm-acc-stats-ali $dir/$x.mdl "$feats" \
228
+ "ark,s,cs:gunzip -c $dir/ali.JOB.gz|" $dir/$x.JOB.acc || exit 1;
229
+ [ `ls $dir/$x.*.acc | wc -w` -ne "$nj" ] && echo "$0: Wrong #accs" && exit 1;
230
+ $cmd $dir/log/update.$x.log \
231
+ gmm-est --power=$power --write-occs=$dir/$[$x+1].occs --mix-up=$numgauss $dir/$x.mdl \
232
+ "gmm-sum-accs - $dir/$x.*.acc |" $dir/$[$x+1].mdl || exit 1;
233
+ rm $dir/$x.mdl $dir/$x.*.acc
234
+ rm $dir/$x.occs
235
+ fi
236
+ [ $x -le $max_iter_inc ] && numgauss=$[$numgauss+$incgauss];
237
+ x=$[$x+1];
238
+ done
239
+
240
+
241
+ if [ $stage -le $x ]; then
242
+ # Accumulate stats for "alignment model"-- this model is
243
+ # computed with the speaker-independent features, but matches Gaussian-for-Gaussian
244
+ # with the final speaker-adapted model.
245
+ $cmd JOB=1:$nj $dir/log/acc_alimdl.JOB.log \
246
+ ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:- \| \
247
+ gmm-acc-stats-twofeats $dir/$x.mdl "$feats" "$sifeats" \
248
+ ark,s,cs:- $dir/$x.JOB.acc || exit 1;
249
+ [ `ls $dir/$x.*.acc | wc -w` -ne "$nj" ] && echo "$0: Wrong #accs" && exit 1;
250
+ # Update model.
251
+ $cmd $dir/log/est_alimdl.log \
252
+ gmm-est --power=$power --remove-low-count-gaussians=false $dir/$x.mdl \
253
+ "gmm-sum-accs - $dir/$x.*.acc|" $dir/$x.alimdl || exit 1;
254
+ rm $dir/$x.*.acc
255
+ fi
256
+
257
+ rm $dir/final.{mdl,alimdl,occs} 2>/dev/null
258
+ ln -s $x.mdl $dir/final.mdl
259
+ ln -s $x.occs $dir/final.occs
260
+ ln -s $x.alimdl $dir/final.alimdl
261
+
262
+
263
+ steps/diagnostic/analyze_alignments.sh --cmd "$cmd" $lang $dir
264
+
265
+ utils/summarize_warnings.pl $dir/log
266
+ (
267
+ echo "$0: Likelihood evolution:"
268
+ for x in `seq $[$num_iters-1]`; do
269
+ tail -n 30 $dir/log/acc.$x.*.log | awk '/Overall avg like/{l += $(NF-3)*$(NF-1); t += $(NF-1); }
270
+ /Overall average logdet/{d += $(NF-3)*$(NF-1); t2 += $(NF-1);}
271
+ END{ d /= t2; l /= t; printf("%s ", d+l); } '
272
+ done
273
+ echo
274
+ ) | tee $dir/log/summary.log
275
+
276
+
277
+ steps/info/gmm_dir_info.pl $dir
278
+
279
+ echo "$0: done training SAT system in $dir"
280
+
281
+ exit 0
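On each fMLLR pass, train_sat.sh estimates a new per-speaker transform on top of the already-transformed features and then composes it with the previous one (compose-transforms --b-is-affine=true), so that a single affine transform per speaker can be applied to the speaker-independent features. A toy numpy sketch (not part of the commit) of that composition; real Kaldi fMLLR transforms are stored as dim x (dim+1) affine matrices, split here into A and b for clarity:

import numpy as np

dim = 3  # toy feature dimension
rng = np.random.default_rng(0)
A1, b1 = rng.normal(size=(dim, dim)), rng.normal(size=dim)  # previous transform
A2, b2 = rng.normal(size=(dim, dim)), rng.normal(size=dim)  # newly estimated on top

# composed transform: x -> A2 (A1 x + b1) + b2 = (A2 A1) x + (A2 b1 + b2)
A = A2 @ A1
b = A2 @ b1 + b2

x = rng.normal(size=dim)
stacked = A2 @ (A1 @ x + b1) + b2  # apply the old transform, then the new one
composed = A @ x + b               # apply the single composed transform
assert np.allclose(stacked, composed)
print("composed transform matches stacked application")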
fairseq/examples/wav2vec/unsupervised/models/__init__.py ADDED
@@ -0,0 +1,11 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from .wav2vec_u import Wav2vec_U
7
+
8
+
9
+ __all__ = [
10
+ "Wav2vec_U",
11
+ ]
fairseq/examples/wav2vec/unsupervised/models/wav2vec_u.py ADDED
@@ -0,0 +1,687 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from dataclasses import dataclass
7
+ from enum import Enum, auto
8
+ import math
9
+ import numpy as np
10
+ from typing import Tuple, List, Optional, Dict
11
+
12
+ import torch
13
+ import torch.nn as nn
14
+ import torch.nn.functional as F
15
+ from torch import autograd
16
+
17
+ from fairseq import checkpoint_utils, utils
18
+ from fairseq.dataclass import FairseqDataclass
19
+ from fairseq.models import BaseFairseqModel, register_model
20
+ from fairseq.modules import (
21
+ SamePad,
22
+ TransposeLast,
23
+ )
24
+
25
+
26
+ class SegmentationType(Enum):
27
+ NONE = auto()
28
+ RANDOM = auto()
29
+ UNIFORM_RANDOM = auto()
30
+ UNIFORM_RANDOM_JOIN = auto()
31
+ JOIN = auto()
32
+
33
+
34
+ @dataclass
35
+ class SegmentationConfig(FairseqDataclass):
36
+ type: SegmentationType = SegmentationType.NONE
37
+ subsample_rate: float = 0.25
38
+ mean_pool: bool = True
39
+ mean_pool_join: bool = False
40
+ remove_zeros: bool = False
41
+
42
+
43
+ @dataclass
44
+ class Wav2vec_UConfig(FairseqDataclass):
45
+ discriminator_kernel: int = 3
46
+ discriminator_dilation: int = 1
47
+ discriminator_dim: int = 256
48
+ discriminator_causal: bool = True
49
+ discriminator_linear_emb: bool = False
50
+ discriminator_depth: int = 1
51
+ discriminator_max_pool: bool = False
52
+ discriminator_act_after_linear: bool = False
53
+ discriminator_dropout: float = 0.0
54
+ discriminator_spectral_norm: bool = False
55
+ discriminator_weight_norm: bool = False
56
+
57
+ generator_kernel: int = 4
58
+ generator_dilation: int = 1
59
+ generator_stride: int = 1
60
+ generator_pad: int = -1
61
+ generator_bias: bool = False
62
+ generator_dropout: float = 0.0
63
+ generator_batch_norm: int = 0
64
+ generator_residual: bool = False
65
+
66
+ blank_weight: float = 0
67
+ blank_mode: str = "add"
68
+ blank_is_sil: bool = False
69
+ no_softmax: bool = False
70
+
71
+ smoothness_weight: float = 0.0
72
+ smoothing: float = 0.0
73
+ smoothing_one_sided: bool = False
74
+ gradient_penalty: float = 0.0
75
+ probabilistic_grad_penalty_slicing: bool = False
76
+ code_penalty: float = 0.0
77
+ mmi_weight: float = 0.0
78
+ target_dim: int = 64
79
+ target_downsample_rate: int = 2
80
+ gumbel: bool = False
81
+ hard_gumbel: bool = True
82
+ temp: Tuple[float, float, float] = (2, 0.1, 0.99995)
83
+ input_dim: int = 128
84
+
85
+ segmentation: SegmentationConfig = SegmentationConfig()
86
+
87
+
88
+ class Segmenter(nn.Module):
89
+ cfg: SegmentationConfig
90
+
91
+ def __init__(self, cfg: SegmentationConfig):
92
+ super().__init__()
93
+ self.cfg = cfg
94
+ self.subsample_rate = cfg.subsample_rate
95
+
96
+ def pre_segment(self, dense_x, dense_padding_mask):
97
+ return dense_x, dense_padding_mask
98
+
99
+ def logit_segment(self, logits, padding_mask):
100
+ return logits, padding_mask
101
+
102
+
103
+ class RandomSegmenter(Segmenter):
104
+ def pre_segment(self, dense_x, dense_padding_mask):
105
+ target_num = math.ceil(dense_x.size(1) * self.subsample_rate)
106
+ ones = torch.ones(dense_x.shape[:-1], device=dense_x.device)
107
+ indices, _ = ones.multinomial(target_num).sort(dim=-1)
108
+ indices_ld = indices.unsqueeze(-1).expand(-1, -1, dense_x.size(-1))
109
+ dense_x = dense_x.gather(1, indices_ld)
110
+ dense_padding_mask = dense_padding_mask.gather(1, index=indices)
111
+ return dense_x, dense_padding_mask
112
+
113
+
114
+ class UniformRandomSegmenter(Segmenter):
115
+ def pre_segment(self, dense_x, dense_padding_mask):
116
+ bsz, tsz, fsz = dense_x.shape
117
+
118
+ target_num = math.ceil(tsz * self.subsample_rate)
119
+
120
+ rem = tsz % target_num
121
+
122
+ if rem > 0:
123
+ dense_x = F.pad(dense_x, [0, 0, 0, target_num - rem])
124
+ dense_padding_mask = F.pad(
125
+ dense_padding_mask, [0, target_num - rem], value=True
126
+ )
127
+
128
+ dense_x = dense_x.view(bsz, target_num, -1, fsz)
129
+ dense_padding_mask = dense_padding_mask.view(bsz, target_num, -1)
130
+
131
+ if self.cfg.mean_pool:
132
+ dense_x = dense_x.mean(dim=-2)
133
+ dense_padding_mask = dense_padding_mask.all(dim=-1)
134
+ else:
135
+ ones = torch.ones((bsz, dense_x.size(2)), device=dense_x.device)
136
+ indices = ones.multinomial(1)
137
+ indices = indices.unsqueeze(-1).expand(-1, target_num, -1)
138
+ indices_ld = indices.unsqueeze(-1).expand(-1, -1, -1, fsz)
139
+ dense_x = dense_x.gather(2, indices_ld).reshape(bsz, -1, fsz)
140
+ dense_padding_mask = dense_padding_mask.gather(2, index=indices).reshape(
141
+ bsz, -1
142
+ )
143
+ return dense_x, dense_padding_mask
144
+
145
+
146
+ class JoinSegmenter(Segmenter):
147
+ def logit_segment(self, logits, padding_mask):
148
+ preds = logits.argmax(dim=-1)
149
+
150
+ if padding_mask.any():
151
+ preds[padding_mask] = -1 # mark pad
152
+ uniques = []
153
+
154
+ bsz, tsz, csz = logits.shape
155
+
156
+ for p in preds:
157
+ uniques.append(
158
+ p.cpu().unique_consecutive(return_inverse=True, return_counts=True)
159
+ )
160
+
161
+ new_tsz = max(u[0].numel() for u in uniques)
162
+ new_logits = logits.new_zeros(bsz, new_tsz, csz)
163
+ new_pad = padding_mask.new_zeros(bsz, new_tsz)
164
+
165
+ for b in range(bsz):
166
+ u, idx, c = uniques[b]
167
+ keep = u != -1
168
+
169
+ if self.cfg.remove_zeros:
170
+ keep.logical_and_(u != 0)
171
+
172
+ if self.training and not self.cfg.mean_pool_join:
173
+ u[0] = 0
174
+ u[1:] = c.cumsum(0)[:-1]
175
+ m = c > 1
176
+ r = torch.rand(m.sum())
177
+ o = (c[m] * r).long()
178
+ u[m] += o
179
+ new_logits[b, : u.numel()] = logits[b, u]
180
+ else:
181
+ new_logits[b].index_add_(
182
+ dim=0, index=idx.to(new_logits.device), source=logits[b]
183
+ )
184
+ new_logits[b, : c.numel()] /= c.unsqueeze(-1).to(new_logits.device)
185
+
186
+ new_sz = keep.sum()
187
+ if not keep.all():
188
+ kept_logits = new_logits[b, : c.numel()][keep]
189
+ new_logits[b, :new_sz] = kept_logits
190
+
191
+ if new_sz < new_tsz:
192
+ pad = new_tsz - new_sz
193
+ new_logits[b, -pad:] = 0
194
+ new_pad[b, -pad:] = True
195
+
196
+ return new_logits, new_pad
197
+
198
+
199
+ class UniformRandomJoinSegmenter(UniformRandomSegmenter, JoinSegmenter):
200
+ pass
201
+
202
+
203
+ SEGMENT_FACTORY = {
204
+ SegmentationType.NONE: Segmenter,
205
+ SegmentationType.RANDOM: RandomSegmenter,
206
+ SegmentationType.UNIFORM_RANDOM: UniformRandomSegmenter,
207
+ SegmentationType.UNIFORM_RANDOM_JOIN: UniformRandomJoinSegmenter,
208
+ SegmentationType.JOIN: JoinSegmenter,
209
+ }
210
+
211
+
212
+ class Discriminator(nn.Module):
213
+ def __init__(self, dim, cfg: Wav2vec_UConfig):
214
+ super().__init__()
215
+
216
+ inner_dim = cfg.discriminator_dim
217
+ kernel = cfg.discriminator_kernel
218
+ dilation = cfg.discriminator_dilation
219
+ self.max_pool = cfg.discriminator_max_pool
220
+
221
+ if cfg.discriminator_causal:
222
+ padding = kernel - 1
223
+ else:
224
+ padding = kernel // 2
225
+
226
+ def make_conv(in_d, out_d, k, p=0, has_dilation=True):
227
+ conv = nn.Conv1d(
228
+ in_d,
229
+ out_d,
230
+ kernel_size=k,
231
+ padding=p,
232
+ dilation=dilation if has_dilation else 1,
233
+ )
234
+ if cfg.discriminator_spectral_norm:
235
+ conv = nn.utils.spectral_norm(conv)
236
+ elif cfg.discriminator_weight_norm:
237
+ conv = nn.utils.weight_norm(conv)
238
+ return conv
239
+
240
+ inner_net = [
241
+ nn.Sequential(
242
+ make_conv(inner_dim, inner_dim, kernel, padding),
243
+ SamePad(kernel_size=kernel, causal=cfg.discriminator_causal),
244
+ nn.Dropout(cfg.discriminator_dropout),
245
+ nn.GELU(),
246
+ )
247
+ for _ in range(cfg.discriminator_depth - 1)
248
+ ] + [
249
+ make_conv(inner_dim, 1, kernel, padding, has_dilation=False),
250
+ SamePad(kernel_size=kernel, causal=cfg.discriminator_causal),
251
+ ]
252
+
253
+ if cfg.discriminator_linear_emb:
254
+ emb_net = [make_conv(dim, inner_dim, 1)]
255
+ else:
256
+ emb_net = [
257
+ make_conv(dim, inner_dim, kernel, padding),
258
+ SamePad(kernel_size=kernel, causal=cfg.discriminator_causal),
259
+ ]
260
+
261
+ if cfg.discriminator_act_after_linear:
262
+ emb_net.append(nn.GELU())
263
+
264
+ self.net = nn.Sequential(
265
+ *emb_net,
266
+ nn.Dropout(cfg.discriminator_dropout),
267
+ *inner_net,
268
+ )
269
+
270
+ def forward(self, x, padding_mask):
271
+ x = x.transpose(1, 2) # BTC -> BCT
272
+ x = self.net(x)
273
+ x = x.transpose(1, 2)
274
+ x_sz = x.size(1)
275
+ if padding_mask is not None and padding_mask.any() and padding_mask.dim() > 1:
276
+ padding_mask = padding_mask[:, : x.size(1)]
277
+ x[padding_mask] = float("-inf") if self.max_pool else 0
278
+ x_sz = x_sz - padding_mask.sum(dim=-1)
279
+ x = x.squeeze(-1)
280
+ if self.max_pool:
281
+ x, _ = x.max(dim=-1)
282
+ else:
283
+ x = x.sum(dim=-1)
284
+ x = x / x_sz
285
+ return x
286
+
287
+
288
+ class Generator(nn.Module):
289
+ def __init__(self, input_dim, output_dim, cfg: Wav2vec_UConfig):
290
+ super().__init__()
291
+
292
+ self.cfg = cfg
293
+ self.output_dim = output_dim
294
+ self.stride = cfg.generator_stride
295
+ self.dropout = nn.Dropout(cfg.generator_dropout)
296
+ self.batch_norm = cfg.generator_batch_norm != 0
297
+ self.residual = cfg.generator_residual
298
+
299
+ padding = (
300
+ cfg.generator_kernel // 2 if cfg.generator_pad < 0 else cfg.generator_pad
301
+ )
302
+ self.proj = nn.Sequential(
303
+ TransposeLast(),
304
+ nn.Conv1d(
305
+ input_dim,
306
+ output_dim,
307
+ kernel_size=cfg.generator_kernel,
308
+ stride=cfg.generator_stride,
309
+ dilation=cfg.generator_dilation,
310
+ padding=padding,
311
+ bias=cfg.generator_bias,
312
+ ),
313
+ TransposeLast(),
314
+ )
315
+
316
+ if self.batch_norm:
317
+ self.bn = nn.BatchNorm1d(input_dim)
318
+ self.bn.weight.data.fill_(cfg.generator_batch_norm)
319
+ if self.residual:
320
+ self.in_proj = nn.Linear(input_dim, input_dim)
321
+
322
+ def forward(self, dense_x, tokens, dense_padding_mask):
323
+ result = {}
324
+
325
+ if self.batch_norm:
326
+ dense_x = self.bn_padded_data(dense_x, dense_padding_mask)
327
+ if self.residual:
328
+ inter_x = self.in_proj(self.dropout(dense_x))
329
+ dense_x = dense_x + inter_x
330
+ result["inter_x"] = inter_x
331
+
332
+ dense_x = self.dropout(dense_x)
333
+
334
+ dense_x = self.proj(dense_x)
335
+ if self.stride > 1:
336
+ dense_padding_mask = dense_padding_mask[:, :: self.stride]
337
+
338
+ if dense_padding_mask.size(1) != dense_x.size(1):
339
+ new_padding = dense_padding_mask.new_zeros(dense_x.shape[:-1])
340
+ diff = new_padding.size(1) - dense_padding_mask.size(1)
341
+
342
+ if diff > 0:
343
+ new_padding[:, diff:] = dense_padding_mask
344
+ else:
345
+ assert diff < 0
346
+ new_padding = dense_padding_mask[:, :diff]
347
+
348
+ dense_padding_mask = new_padding
349
+
350
+ token_x = None
351
+ if tokens is not None:
352
+ token_x = dense_x.new_zeros(tokens.numel(), self.output_dim)
353
+ token_x.scatter_(1, tokens.view(-1, 1).long(), 1)
354
+ token_x = token_x.view(tokens.shape + (self.output_dim,))
355
+
356
+ result["dense_x"] = dense_x
357
+ result["token_x"] = token_x
358
+ result["dense_padding_mask"] = dense_padding_mask
359
+
360
+ return result
361
+
362
+ def bn_padded_data(self, feature, padding_mask):
363
+ normed_feature = feature.clone()
364
+ normed_feature[~padding_mask] = self.bn(
365
+ feature[~padding_mask].unsqueeze(-1)
366
+ ).squeeze(-1)
367
+ return normed_feature
368
+
369
+
370
+ @register_model("wav2vec_u", dataclass=Wav2vec_UConfig)
371
+ class Wav2vec_U(BaseFairseqModel):
372
+ def calc_gradient_penalty(self, real_data, fake_data):
373
+
374
+ b_size = min(real_data.size(0), fake_data.size(0))
375
+ t_size = min(real_data.size(1), fake_data.size(1))
376
+
377
+ if self.cfg.probabilistic_grad_penalty_slicing:
378
+
379
+ def get_slice(data, dim, target_size):
380
+
381
+ size = data.size(dim)
382
+ diff = size - target_size
383
+ if diff <= 0:
384
+ return data
385
+
386
+ start = np.random.randint(0, diff + 1)
387
+ return data.narrow(dim=dim, start=start, length=target_size)
388
+
389
+ real_data = get_slice(real_data, 0, b_size)
390
+ real_data = get_slice(real_data, 1, t_size)
391
+ fake_data = get_slice(fake_data, 0, b_size)
392
+ fake_data = get_slice(fake_data, 1, t_size)
393
+
394
+ else:
395
+ real_data = real_data[:b_size, :t_size]
396
+ fake_data = fake_data[:b_size, :t_size]
397
+
398
+ alpha = torch.rand(real_data.size(0), 1, 1)
399
+ alpha = alpha.expand(real_data.size())
400
+ alpha = alpha.to(real_data.device)
401
+
402
+ interpolates = alpha * real_data + ((1 - alpha) * fake_data)
403
+
404
+ disc_interpolates = self.discriminator(interpolates, None)
405
+
406
+ gradients = autograd.grad(
407
+ outputs=disc_interpolates,
408
+ inputs=interpolates,
409
+ grad_outputs=torch.ones(disc_interpolates.size(), device=real_data.device),
410
+ create_graph=True,
411
+ retain_graph=True,
412
+ only_inputs=True,
413
+ )[0]
414
+
415
+ gradient_penalty = (gradients.norm(2, dim=1) - 1) ** 2
416
+ return gradient_penalty
417
+
418
+ def set_num_updates(self, num_updates):
419
+ super().set_num_updates(num_updates)
420
+ self.update_num = num_updates
421
+ self.curr_temp = max(
422
+ self.max_temp * self.temp_decay ** num_updates, self.min_temp
423
+ )
424
+
425
+ def discrim_step(self, num_updates):
426
+ return num_updates % 2 == 1
427
+
428
+ def get_groups_for_update(self, num_updates):
429
+ return "discriminator" if self.discrim_step(num_updates) else "generator"
430
+
431
+ def __init__(self, cfg: Wav2vec_UConfig, target_dict):
432
+ super().__init__()
433
+
434
+ self.cfg = cfg
435
+ self.zero_index = target_dict.index("<SIL>") if "<SIL>" in target_dict else 0
436
+ self.smoothness_weight = cfg.smoothness_weight
437
+
438
+ output_size = len(target_dict)
439
+ self.pad = target_dict.pad()
440
+ self.eos = target_dict.eos()
441
+ self.smoothing = cfg.smoothing
442
+ self.smoothing_one_sided = cfg.smoothing_one_sided
443
+ self.no_softmax = cfg.no_softmax
444
+ self.gumbel = cfg.gumbel
445
+ self.hard_gumbel = cfg.hard_gumbel
446
+ self.last_acc = None
447
+
448
+ self.gradient_penalty = cfg.gradient_penalty
449
+ self.code_penalty = cfg.code_penalty
450
+ self.mmi_weight = cfg.mmi_weight
451
+ self.blank_weight = cfg.blank_weight
452
+ self.blank_mode = cfg.blank_mode
453
+ self.blank_index = target_dict.index("<SIL>") if cfg.blank_is_sil else 0
454
+ assert self.blank_index != target_dict.unk()
455
+
456
+ self.discriminator = Discriminator(output_size, cfg)
457
+ for p in self.discriminator.parameters():
458
+ p.param_group = "discriminator"
459
+
460
+ self.pca_A = self.pca_b = None
461
+ d = cfg.input_dim
462
+
463
+ self.segmenter = SEGMENT_FACTORY[cfg.segmentation.type](cfg.segmentation)
464
+
465
+ self.generator = Generator(d, output_size, cfg)
466
+
467
+ for p in self.generator.parameters():
468
+ p.param_group = "generator"
469
+
470
+ for p in self.segmenter.parameters():
471
+ p.param_group = "generator"
472
+
473
+ self.max_temp, self.min_temp, self.temp_decay = cfg.temp
474
+ self.curr_temp = self.max_temp
475
+ self.update_num = 0
476
+
477
+ if self.mmi_weight > 0:
478
+ self.target_downsample_rate = cfg.target_downsample_rate
479
+ self.decoder = nn.Linear(d, cfg.target_dim)
480
+ for p in self.decoder.parameters():
481
+ p.param_group = "generator"
482
+
483
+ @classmethod
484
+ def build_model(cls, cfg, task):
485
+ return cls(cfg, task.target_dictionary)
486
+
487
+ def get_logits(
488
+ self,
489
+ net_output: Optional[Dict[str, List[Optional[torch.Tensor]]]],
490
+ normalize: bool = False,
491
+ ):
492
+ logits = net_output["logits"]
493
+
494
+ if self.blank_weight != 0:
495
+ if self.blank_mode == "add":
496
+ logits[..., self.blank_index] += self.blank_weight
497
+ elif self.blank_mode == "set":
498
+ logits[..., self.blank_index] = self.blank_weight
499
+ else:
500
+ raise Exception(f"invalid blank mode {self.blank_mode}")
501
+
502
+ padding = net_output["padding_mask"]
503
+ if padding.any():
504
+ logits[padding] = float("-inf")
505
+ logits[..., self.blank_index][padding] = float("inf")
506
+
507
+ if normalize:
508
+ logits = utils.log_softmax(logits.float(), dim=-1)
509
+
510
+ return logits.transpose(0, 1)
511
+
512
+ def get_normalized_probs(
513
+ self,
514
+ net_output: Tuple[
515
+ torch.Tensor, Optional[Dict[str, List[Optional[torch.Tensor]]]]
516
+ ],
517
+ log_probs: bool,
518
+ sample: Optional[Dict[str, torch.Tensor]] = None,
519
+ ):
520
+ logits = self.get_logits(net_output)
521
+
522
+ probs = super().get_normalized_probs(logits, log_probs, sample)
523
+ # BTC -> TBC for ctc
524
+ probs = probs.transpose(0, 1)
525
+ return probs
526
+
527
+ def normalize(self, dense_x):
528
+
529
+ bsz, tsz, csz = dense_x.shape
530
+
531
+ if dense_x.numel() == 0:
532
+ raise Exception(dense_x.shape)
533
+ _, k = dense_x.max(-1)
534
+ hard_x = (
535
+ dense_x.new_zeros(bsz * tsz, csz)
536
+ .scatter_(-1, k.view(-1, 1), 1.0)
537
+ .view(-1, csz)
538
+ )
539
+ hard_probs = torch.mean(hard_x.float(), dim=0)
540
+ code_perplexity = torch.exp(
541
+ -torch.sum(hard_probs * torch.log(hard_probs + 1e-7), dim=-1)
542
+ )
543
+
544
+ avg_probs = torch.softmax(dense_x.reshape(-1, csz).float(), dim=-1).mean(dim=0)
545
+ prob_perplexity = torch.exp(
546
+ -torch.sum(avg_probs * torch.log(avg_probs + 1e-7), dim=-1)
547
+ )
548
+
549
+ if not self.no_softmax:
550
+ if self.training and self.gumbel:
551
+ dense_x = F.gumbel_softmax(
552
+ dense_x.float(), tau=self.curr_temp, hard=self.hard_gumbel
553
+ ).type_as(dense_x)
554
+ else:
555
+ dense_x = dense_x.softmax(-1)
556
+
557
+ return dense_x, code_perplexity, prob_perplexity
558
+
559
+ def forward(
560
+ self,
561
+ features,
562
+ padding_mask,
563
+ random_label=None,
564
+ dense_x_only=False,
565
+ segment=True,
566
+ aux_target=None,
567
+ ):
568
+ if segment:
569
+ features, padding_mask = self.segmenter.pre_segment(features, padding_mask)
570
+
571
+ orig_size = features.size(0) * features.size(1) - padding_mask.sum()
572
+
573
+ gen_result = self.generator(features, random_label, padding_mask)
574
+
575
+ orig_dense_x, token_x = gen_result["dense_x"], gen_result["token_x"]
576
+ orig_dense_padding_mask = gen_result["dense_padding_mask"]
577
+
578
+ if segment:
579
+ dense_x, dense_padding_mask = self.segmenter.logit_segment(
580
+ orig_dense_x, orig_dense_padding_mask
581
+ )
582
+ else:
583
+ dense_x = orig_dense_x
584
+ dense_padding_mask = orig_dense_padding_mask
585
+
586
+ dense_logits = dense_x
587
+ prob_perplexity = None
588
+ code_perplexity = None
589
+
590
+ if not (self.no_softmax and dense_x_only):
591
+ dense_x, code_perplexity, prob_perplexity = self.normalize(dense_logits)
592
+
593
+ if dense_x_only or self.discriminator is None:
594
+ return {
595
+ "logits": dense_x,
596
+ "padding_mask": dense_padding_mask,
597
+ }
598
+
599
+ token_padding_mask = random_label == self.pad
600
+
601
+ dense_y = self.discriminator(dense_x, dense_padding_mask)
602
+ token_y = self.discriminator(token_x, token_padding_mask)
603
+
604
+ sample_size = features.size(0)
605
+
606
+ d_step = self.discrim_step(self.update_num)
607
+
608
+ fake_smooth = self.smoothing
609
+ real_smooth = self.smoothing
610
+ if self.smoothing_one_sided:
611
+ fake_smooth = 0
612
+
613
+ zero_loss = None
614
+ smoothness_loss = None
615
+ code_pen = None
616
+ mmi_loss = None
617
+
618
+ if d_step:
619
+ loss_dense = F.binary_cross_entropy_with_logits(
620
+ dense_y,
621
+ dense_y.new_ones(dense_y.shape) - fake_smooth,
622
+ reduction="sum",
623
+ )
624
+ loss_token = F.binary_cross_entropy_with_logits(
625
+ token_y,
626
+ token_y.new_zeros(token_y.shape) + real_smooth,
627
+ reduction="sum",
628
+ )
629
+ if self.training and self.gradient_penalty > 0:
630
+ grad_pen = self.calc_gradient_penalty(token_x, dense_x)
631
+ grad_pen = grad_pen.sum() * self.gradient_penalty
632
+ else:
633
+ grad_pen = None
634
+ else:
635
+ grad_pen = None
636
+ loss_token = None
637
+ loss_dense = F.binary_cross_entropy_with_logits(
638
+ dense_y,
639
+ dense_y.new_zeros(dense_y.shape) + fake_smooth,
640
+ reduction="sum",
641
+ )
642
+ num_vars = dense_x.size(-1)
643
+ if prob_perplexity is not None:
644
+ code_pen = (num_vars - prob_perplexity) / num_vars
645
+ code_pen = code_pen * sample_size * self.code_penalty
646
+
647
+ if self.smoothness_weight > 0:
648
+ smoothness_loss = F.mse_loss(
649
+ dense_logits[:, :-1], dense_logits[:, 1:], reduction="none"
650
+ )
651
+ smoothness_loss[dense_padding_mask[:, 1:]] = 0
652
+ smoothness_loss = (
653
+ smoothness_loss.mean() * sample_size * self.smoothness_weight
654
+ )
655
+
656
+ if (self.mmi_weight > 0) and (aux_target is not None):
657
+ inter_x = self.decoder(gen_result["inter_x"])
658
+ if self.target_downsample_rate > 1:
659
+ aux_target = aux_target[:, :: self.target_downsample_rate]
660
+ max_t_len = min(aux_target.shape[1], inter_x.shape[1])
661
+ mmi_loss = F.cross_entropy(
662
+ inter_x[:, :max_t_len].transpose(1, 2),
663
+ aux_target[:, :max_t_len],
664
+ ignore_index=-1,
665
+ reduction="none",
666
+ )
667
+ mmi_loss = mmi_loss.mean() * mmi_loss.shape[0] * self.mmi_weight
668
+
669
+ result = {
670
+ "losses": {
671
+ "grad_pen": grad_pen,
672
+ "code_pen": code_pen,
673
+ "smoothness": smoothness_loss,
674
+ "mmi": mmi_loss,
675
+ },
676
+ "temp": self.curr_temp,
677
+ "code_ppl": code_perplexity,
678
+ "prob_ppl": prob_perplexity,
679
+ "d_steps": int(d_step),
680
+ "sample_size": sample_size,
681
+ }
682
+
683
+ suff = "_d" if d_step else "_g"
684
+ result["losses"]["dense" + suff] = loss_dense
685
+ result["losses"]["token" + suff] = loss_token
686
+
687
+ return result
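Note: a minimal inference sketch for the model above (hypothetical tensors; at decode time the generator is typically run alone by passing dense_x_only=True, so no discriminator pass or random text batch is needed):
with torch.no_grad():
    out = model(features, padding_mask, dense_x_only=True)  # generator + segmenter only
    phone_out = out["logits"]        # B x T' x num_phones (softmaxed probabilities unless cfg.no_softmax)
    pred = phone_out.argmax(-1)      # frame-level phone ids; ignore positions where out["padding_mask"] is True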
fairseq/examples/wav2vec/unsupervised/scripts/apply_pca.py ADDED
@@ -0,0 +1,76 @@
1
+ #!/usr/bin/env python3 -u
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import argparse
8
+ import os
9
+ import os.path as osp
10
+ import math
11
+ import numpy as np
12
+ import tqdm
13
+ import torch
14
+ from shutil import copyfile
15
+
16
+ from npy_append_array import NpyAppendArray
17
+
18
+
19
+ def get_parser():
20
+ parser = argparse.ArgumentParser(
21
+ description="transforms features via a given pca and stored them in target dir"
22
+ )
23
+ # fmt: off
24
+ parser.add_argument('source', help='directory with features')
25
+ parser.add_argument('--split', help='which split to read', required=True)
26
+ parser.add_argument('--save-dir', help='where to save the output', required=True)
27
+ parser.add_argument('--pca-path', type=str, help='pca location. will append _A.npy and _b.npy', required=True)
28
+ parser.add_argument('--batch-size', type=int, default=2048000, help='batch size')
29
+ parser.add_argument('--unfiltered', action='store_true', help='process the unfiltered version')
30
+ # fmt: on
31
+
32
+ return parser
33
+
34
+
35
+ def main():
36
+ parser = get_parser()
37
+ args = parser.parse_args()
38
+
39
+ source_path = osp.join(args.source, args.split)
40
+ data_path = source_path + "_unfiltered" if args.unfiltered else source_path
41
+
42
+ print(f"data path: {data_poth}")
43
+
44
+ features = np.load(data_path + ".npy", mmap_mode="r")
45
+ pca_A = torch.from_numpy(np.load(args.pca_path + "_A.npy")).cuda()
46
+ pca_b = torch.from_numpy(np.load(args.pca_path + "_b.npy")).cuda()
47
+
48
+ os.makedirs(args.save_dir, exist_ok=True)
49
+ save_path = osp.join(args.save_dir, args.split)
50
+
51
+ copyfile(source_path + ".tsv", save_path + ".tsv")
52
+ copyfile(data_path + ".lengths", save_path + ".lengths")
53
+
54
+ if osp.exists(source_path + ".phn"):
55
+ copyfile(source_path + ".phn", save_path + ".phn")
56
+
57
+ if osp.exists(source_path + ".wrd"):
58
+ copyfile(source_path + ".wrd", save_path + ".wrd")
59
+
60
+ if osp.exists(save_path + ".npy"):
61
+ os.remove(save_path + ".npy")
62
+ npaa = NpyAppendArray(save_path + ".npy")
63
+
64
+ batches = math.ceil(features.shape[0] / args.batch_size)
65
+
66
+ with torch.no_grad():
67
+ for b in tqdm.trange(batches):
68
+ start = b * args.batch_size
69
+ end = start + args.batch_size
70
+ x = torch.from_numpy(features[start:end]).cuda()
71
+ x = torch.matmul(x, pca_A) + pca_b
72
+ npaa.append(x.cpu().numpy())
73
+
74
+
75
+ if __name__ == "__main__":
76
+ main()
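Usage sketch (placeholder paths, mirroring the call in prepare_audio.sh below): python scripts/apply_pca.py $tgt_dir --split train --save-dir $tgt_dir/precompute_pca512 --pca-path $tgt_dir/pca/512_pca --batch-size 1048000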
fairseq/examples/wav2vec/unsupervised/scripts/copy_labels.py ADDED
@@ -0,0 +1,10 @@
1
+ #!/usr/bin/env python3 -u
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import sys
8
+
9
+ for idx, line in enumerate(sys.stdin):
10
+ print(f"utt{idx:010d} {line}", end="")
fairseq/examples/wav2vec/unsupervised/scripts/filter_lexicon.py ADDED
@@ -0,0 +1,40 @@
1
+ #!/usr/bin/env python3 -u
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import argparse
8
+ import sys
9
+
10
+ from fairseq.data import Dictionary
11
+
12
+
13
+ def get_parser():
14
+ parser = argparse.ArgumentParser(
15
+ description="filters a lexicon given a unit dictionary"
16
+ )
17
+ parser.add_argument("-d", "--unit-dict", help="unit dictionary", required=True)
18
+ return parser
19
+
20
+
21
+ def main():
22
+ parser = get_parser()
23
+ args = parser.parse_args()
24
+
25
+ d = Dictionary.load(args.unit_dict)
26
+ symbols = set(d.symbols)
27
+
28
+ for line in sys.stdin:
29
+ items = line.rstrip().split()
30
+ skip = len(items) < 2
31
+ for x in items[1:]:
32
+ if x not in symbols:
33
+ skip = True
34
+ break
35
+ if not skip:
36
+ print(line, end="")
37
+
38
+
39
+ if __name__ == "__main__":
40
+ main()
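Usage sketch (placeholder paths, mirroring prepare_text.sh below): python scripts/filter_lexicon.py -d $target_dir/phones/dict.txt < $target_dir/lexicon.lst > $target_dir/lexicon_filtered.lst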
fairseq/examples/wav2vec/unsupervised/scripts/filter_tsv.py ADDED
@@ -0,0 +1,37 @@
1
+ #!/usr/bin/env python3 -u
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import os
8
+ import argparse
9
+ import sys
10
+
11
+
12
+ parser = argparse.ArgumentParser()
13
+ parser.add_argument("--tsv", required=True, type=str)
14
+ parser.add_argument("--no-skip", action="store_true")
15
+ parser.add_argument("--keep", action="store_true")
16
+ params = parser.parse_args()
17
+
18
+
19
+ def get_fname(line):
20
+ p = os.path.basename(line.split("\t")[0])
21
+ p = os.path.splitext(p)[0]
22
+ return p
23
+
24
+
25
+ # filenames listed in --tsv (excluded by default, kept if --keep is set)
26
+ seen = set()
27
+ with open(params.tsv) as f:
28
+ if not params.no_skip:
29
+ root = next(f).rstrip()
30
+ for line in f:
31
+ seen.add(get_fname(line))
32
+
33
+ for i, line in enumerate(sys.stdin):
34
+ exists = get_fname(line) in seen
35
+ keep = (exists and params.keep) or (not exists and not params.keep)
36
+ if i == 0 or keep:
37
+ print(line, end="")
fairseq/examples/wav2vec/unsupervised/scripts/g2p_wrd_to_phn.py ADDED
@@ -0,0 +1,45 @@
1
+ #!/usr/bin/env python3 -u
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import argparse
8
+ import sys
9
+
10
+ from g2p_en import G2p
11
+
12
+
13
+ def main():
14
+ parser = argparse.ArgumentParser()
15
+ parser.add_argument(
16
+ "--compact",
17
+ action="store_true",
18
+ help="if set, compacts phones",
19
+ )
20
+ args = parser.parse_args()
21
+
22
+ compact = args.compact
23
+
24
+ wrd_to_phn = {}
25
+ g2p = G2p()
26
+ for line in sys.stdin:
27
+ words = line.strip().split()
28
+ phones = []
29
+ for w in words:
30
+ if w not in wrd_to_phn:
31
+ wrd_to_phn[w] = g2p(w)
32
+ if compact:
33
+ wrd_to_phn[w] = [
34
+ p[:-1] if p[-1].isnumeric() else p for p in wrd_to_phn[w]
35
+ ]
36
+ phones.extend(wrd_to_phn[w])
37
+ try:
38
+ print(" ".join(phones))
39
+ except:
40
+ print(wrd_to_phn, words, phones, file=sys.stderr)
41
+ raise
42
+
43
+
44
+ if __name__ == "__main__":
45
+ main()
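Usage sketch (placeholder paths, mirroring prepare_text.sh below; requires the g2p_en package): python scripts/g2p_wrd_to_phn.py --compact < $target_dir/words.txt > $target_dir/phones.txt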
fairseq/examples/wav2vec/unsupervised/scripts/ltr_to_wrd.py ADDED
@@ -0,0 +1,16 @@
1
+ #!/usr/bin/env python3 -u
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import sys
8
+
9
+
10
+ def main():
11
+ for line in sys.stdin:
12
+ print(line.replace(" ", "").replace("|", " ").strip())
13
+
14
+
15
+ if __name__ == "__main__":
16
+ main()
fairseq/examples/wav2vec/unsupervised/scripts/mean_pool.py ADDED
@@ -0,0 +1,99 @@
1
+ #!/usr/bin/env python3 -u
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import argparse
8
+ import os
9
+ import os.path as osp
10
+ import math
11
+ import numpy as np
12
+ import tqdm
13
+ import torch
14
+ import torch.nn.functional as F
15
+ from shutil import copyfile
16
+
17
+ from npy_append_array import NpyAppendArray
18
+
19
+
20
+ def get_parser():
21
+ parser = argparse.ArgumentParser(
22
+ description="mean pools representations by compressing uniform splits of the data"
23
+ )
24
+ # fmt: off
25
+ parser.add_argument('source', help='directory with features')
26
+ parser.add_argument('--split', help='which split to read', required=True)
27
+ parser.add_argument('--save-dir', help='where to save the output', required=True)
28
+ parser.add_argument('--subsample-rate', type=float, default=0.5, help='fraction of the original sequence length to pool down to')
29
+
30
+ parser.add_argument('--remove-extra', action='store_true', help="if set, removes extra states that can't be pooled, otherwise pads by repeating the last state")
31
+ # fmt: on
32
+
33
+ return parser
34
+
35
+
36
+ def main():
37
+ parser = get_parser()
38
+ args = parser.parse_args()
39
+
40
+ source_path = osp.join(args.source, args.split)
41
+
42
+ print(f"data path: {source_path}")
43
+
44
+ features = np.load(source_path + ".npy", mmap_mode="r")
45
+
46
+ os.makedirs(args.save_dir, exist_ok=True)
47
+ save_path = osp.join(args.save_dir, args.split)
48
+
49
+ copyfile(source_path + ".tsv", save_path + ".tsv")
50
+
51
+ if os.path.exists(source_path + ".phn"):
52
+ copyfile(source_path + ".phn", save_path + ".phn")
53
+ if os.path.exists(source_path + ".wrd"):
54
+ copyfile(source_path + ".wrd", save_path + ".wrd")
55
+
56
+ if os.path.exists(osp.join(args.source, "dict.phn.txt")):
57
+ copyfile(
58
+ osp.join(args.source, "dict.phn.txt"),
59
+ osp.join(args.save_dir, "dict.phn.txt"),
60
+ )
61
+
62
+ if osp.exists(save_path + ".npy"):
63
+ os.remove(save_path + ".npy")
64
+ npaa = NpyAppendArray(save_path + ".npy")
65
+
66
+ with open(source_path + ".lengths", "r") as lf:
67
+ lengths = lf.readlines()
68
+
69
+ fsz = features.shape[-1]
70
+ start = 0
71
+ with torch.no_grad():
72
+ with open(save_path + ".lengths", "w") as lengths_out:
73
+ for length in tqdm.tqdm(lengths):
74
+ length = int(length)
75
+ end = start + length
76
+ feats = features[start:end]
77
+ start += length
78
+ x = torch.from_numpy(feats).cuda()
79
+ target_num = math.ceil(length * args.subsample_rate)
80
+ rem = length % target_num
81
+
82
+ if rem > 0:
83
+ if args.remove_extra:
84
+ to_rem = target_num - rem
85
+ target_num -= 1
86
+ x = x[:-to_rem]
87
+ else:
88
+ to_add = target_num - rem
89
+ x = F.pad(x, [0, 0, 0, to_add])
90
+ x[-to_add:] = x[-to_add - 1]
91
+
92
+ x = x.view(target_num, -1, fsz)
93
+ x = x.mean(dim=-2)
94
+ print(target_num, file=lengths_out)
95
+ npaa.append(x.cpu().numpy())
96
+
97
+
98
+ if __name__ == "__main__":
99
+ main()
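Usage sketch (placeholder paths, mirroring prepare_audio.sh below): python scripts/mean_pool.py $tgt_dir/precompute_pca512_cls128_mean --split train --save-dir $tgt_dir/precompute_pca512_cls128_mean_pooled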
fairseq/examples/wav2vec/unsupervised/scripts/merge_clusters.py ADDED
@@ -0,0 +1,114 @@
1
+ #!/usr/bin/env python3 -u
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import argparse
8
+ import os
9
+ import os.path as osp
10
+ import numpy as np
11
+ import tqdm
12
+ import torch
13
+ import random
14
+ from shutil import copyfile
15
+
16
+ from npy_append_array import NpyAppendArray
17
+
18
+
19
+ def get_parser():
20
+ parser = argparse.ArgumentParser(
21
+ description="transforms features via a given pca and stored them in target dir"
22
+ )
23
+ # fmt: off
24
+ parser.add_argument('source', help='directory with features')
25
+ parser.add_argument('--split', help='which split to read', required=True)
26
+ parser.add_argument('--save-dir', help='where to save the output', required=True)
27
+ parser.add_argument('--cluster-dir', help='where the clusters are')
28
+ parser.add_argument('--pooling', type=str, default='mean', choices=['mean', 'sample'], help='how to pool')
29
+ # fmt: on
30
+
31
+ return parser
32
+
33
+
34
+ def main():
35
+ parser = get_parser()
36
+ args = parser.parse_args()
37
+
38
+ source_path = osp.join(args.source, args.split)
39
+ cluster_path = osp.join(args.cluster_dir, args.split + ".src")
40
+ print(f"data path: {source_path}")
41
+
42
+ features = np.load(source_path + ".npy", mmap_mode="r")
43
+ sizes = []
44
+ offsets = []
45
+ offset = 0
46
+ with open(source_path + ".lengths", "r") as len_f:
47
+ for line in len_f:
48
+ length = int(line.rstrip())
49
+ sizes.append(length)
50
+ offsets.append(offset)
51
+ offset += length
52
+
53
+ clusters = []
54
+ with open(cluster_path, "r") as cf:
55
+ for line in cf:
56
+ line = line.rstrip()
57
+ items = line.split()
58
+ items = list(map(int, items))
59
+ clusters.append(items)
60
+
61
+ os.makedirs(args.save_dir, exist_ok=True)
62
+ save_path = osp.join(args.save_dir, args.split)
63
+
64
+ copyfile(source_path + ".tsv", save_path + ".tsv")
65
+
66
+ if os.path.exists(source_path + ".phn"):
67
+ copyfile(source_path + ".phn", save_path + ".phn")
68
+ if os.path.exists(osp.join(args.source, "dict.phn.txt")):
69
+ copyfile(
70
+ osp.join(args.source, "dict.phn.txt"),
71
+ osp.join(args.save_dir, "dict.phn.txt"),
72
+ )
73
+ if os.path.exists(source_path + ".wrd"):
74
+ copyfile(source_path + ".wrd", save_path + ".wrd")
75
+
76
+ if osp.exists(save_path + ".npy"):
77
+ os.remove(save_path + ".npy")
78
+ npaa = NpyAppendArray(save_path + ".npy")
79
+
80
+ def merge(feats, clust):
81
+ feats = torch.from_numpy(feats.copy())
82
+ clust = torch.LongTensor(clust)
83
+ _, counts = clust.unique_consecutive(return_counts=True)
84
+ curr = 0
85
+
86
+ merged = []
87
+ for c in counts:
88
+ c = c.item()
89
+ start = curr
90
+ end = curr + c
91
+ curr += c
92
+ if args.pooling == "mean":
93
+ new_x = feats[start:end].mean(dim=0)
94
+ elif args.pooling == "sample":
95
+ new_x = feats[start + int(random.random() * c)]
96
+ else:
97
+ raise NotImplementedError()
98
+ merged.append(new_x)
99
+
100
+ return torch.stack(merged, dim=0).numpy()
101
+
102
+ with open(save_path + ".lengths", "w") as l_f:
103
+ for size, offset, clust in tqdm.tqdm(
104
+ zip(sizes, offsets, clusters), total=len(sizes)
105
+ ):
106
+ end = size + offset
107
+ feats = features[offset:end]
108
+ feats = merge(feats, clust)
109
+ print(len(feats), file=l_f)
110
+ npaa.append(feats)
111
+
112
+
113
+ if __name__ == "__main__":
114
+ main()
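Usage sketch (placeholder paths, mirroring prepare_audio.sh below): python scripts/merge_clusters.py $tgt_dir/precompute_pca512 --cluster-dir $tgt_dir/CLUS128 --split train --save-dir $tgt_dir/precompute_pca512_cls128_mean --pooling mean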
fairseq/examples/wav2vec/unsupervised/scripts/normalize_and_filter_text.py ADDED
@@ -0,0 +1,72 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import argparse
8
+ import fasttext as ft
9
+ import os
10
+ import regex
11
+ import sys
12
+
13
+
14
+ def get_parser():
15
+ parser = argparse.ArgumentParser(
16
+ description="reads text from stdin and outputs normalized, lid-filtered version to stdout"
17
+ )
18
+ parser.add_argument(
19
+ "--fasttext-model",
20
+ help="path to fasttext model",
21
+ default="lid.187.bin",
22
+ )
23
+ parser.add_argument("--lang", help="language id", required=True)
24
+ parser.add_argument(
25
+ "--lid-threshold",
26
+ type=float,
27
+ help="threshold for this lang id probability",
28
+ default=0.4,
29
+ )
30
+
31
+ return parser
32
+
33
+
34
+ def main():
35
+ parser = get_parser()
36
+ args = parser.parse_args()
37
+ filter_r = regex.compile(r"[^\p{L}\p{N}\p{M}\' \-]")
38
+
39
+ lg = args.lang.lower()
40
+ lg_label = f"__label__{lg}"
41
+ thresh = args.lid_threshold
42
+
43
+ if os.path.exists(args.fasttext_model):
44
+ model = ft.load_model(args.fasttext_model)
45
+ else:
46
+ print(
47
+ f"fasttext language id model {args.fasttext_model} not found. Proceeding without language filtering. "
48
+ f"To enable language filtering, please download the latest language id model "
49
+ f"from https://fasttext.cc/docs/en/language-identification.html",
50
+ file=sys.stderr,
51
+ )
52
+ model = None
53
+
54
+ for line in sys.stdin:
55
+ line = line.strip()
56
+ line = filter_r.sub(" ", line)
57
+ line = " ".join(line.split())
58
+
59
+ if model is not None:
60
+ lid, prob = model.predict(line, k=100)
61
+ try:
62
+ target_idx = lid.index(lg_label)
63
+ except ValueError:
64
+ continue
65
+ if target_idx == 0 or prob[target_idx] >= thresh:
66
+ print(line)
67
+ else:
68
+ print(line)
69
+
70
+
71
+ if __name__ == "__main__":
72
+ main()
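Usage sketch (placeholder paths, mirroring prepare_text.sh below; lid.187.bin is the fastText language-id model): python scripts/normalize_and_filter_text.py --lang en --fasttext-model lid.187.bin < corpus.txt > lm.upper.lid.txt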
fairseq/examples/wav2vec/unsupervised/scripts/normalize_text.py ADDED
@@ -0,0 +1,22 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import regex
8
+ import sys
9
+
10
+
11
+ def main():
12
+ filter_r = regex.compile(r"[^\p{L}\p{N}\p{M}\' \-]")
13
+
14
+ for line in sys.stdin:
15
+ line = line.strip()
16
+ line = filter_r.sub(" ", line)
17
+ line = " ".join(line.split())
18
+ print(line)
19
+
20
+
21
+ if __name__ == "__main__":
22
+ main()
fairseq/examples/wav2vec/unsupervised/scripts/pca.py ADDED
@@ -0,0 +1,53 @@
1
+ #!/usr/bin/env python3 -u
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import argparse
8
+ import os
9
+ import os.path as osp
10
+ import numpy as np
11
+
12
+ import faiss
13
+
14
+
15
+
16
+ def get_parser():
17
+ parser = argparse.ArgumentParser(
18
+ description="compute a pca matrix given an array of numpy features"
19
+ )
20
+ # fmt: off
21
+ parser.add_argument('data', help='numpy file containing features')
22
+ parser.add_argument('--output', help='where to save the pca matrix', required=True)
23
+ parser.add_argument('--dim', type=int, help='dim for pca reduction', required=True)
24
+ parser.add_argument('--eigen-power', type=float, default=0, help='eigen power, -0.5 for whitening')
25
+
26
+ return parser
27
+
28
+
29
+ def main():
30
+ parser = get_parser()
31
+ args = parser.parse_args()
32
+
33
+ print("Reading features")
34
+ x = np.load(args.data, mmap_mode="r")
35
+
36
+ print("Computing PCA")
37
+ pca = faiss.PCAMatrix(x.shape[-1], args.dim, args.eigen_power)
38
+ pca.train(x)
39
+ b = faiss.vector_to_array(pca.b)
40
+ A = faiss.vector_to_array(pca.A).reshape(pca.d_out, pca.d_in)
41
+
42
+ os.makedirs(args.output, exist_ok=True)
43
+
44
+ prefix = str(args.dim)
45
+ if args.eigen_power != 0:
46
+ prefix += f"_{args.eigen_power}"
47
+
48
+ np.save(osp.join(args.output, f"{prefix}_pca_A"), A.T)
49
+ np.save(osp.join(args.output, f"{prefix}_pca_b"), b)
50
+
51
+
52
+ if __name__ == "__main__":
53
+ main()
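Usage sketch (placeholder paths, mirroring prepare_audio.sh below): python scripts/pca.py $tgt_dir/train.npy --output $tgt_dir/pca --dim 512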
fairseq/examples/wav2vec/unsupervised/scripts/phonemize_with_sil.py ADDED
@@ -0,0 +1,83 @@
1
+ #!/usr/bin/env python3 -u
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import argparse
8
+ import numpy as np
9
+ import sys
10
+
11
+
12
+ def get_parser():
13
+ parser = argparse.ArgumentParser(
14
+ description="converts words to phones adding optional silences around in between words"
15
+ )
16
+ parser.add_argument(
17
+ "--sil-prob",
18
+ "-s",
19
+ type=float,
20
+ default=0,
21
+ help="probability of inserting silence between each word",
22
+ )
23
+ parser.add_argument(
24
+ "--surround",
25
+ action="store_true",
26
+ help="if set, surrounds each example with silence",
27
+ )
28
+ parser.add_argument(
29
+ "--lexicon",
30
+ help="lexicon to convert to phones",
31
+ required=True,
32
+ )
33
+
34
+ return parser
35
+
36
+
37
+ def main():
38
+ parser = get_parser()
39
+ args = parser.parse_args()
40
+
41
+ sil_prob = args.sil_prob
42
+ surround = args.surround
43
+ sil = "<SIL>"
44
+
45
+ wrd_to_phn = {}
46
+
47
+ with open(args.lexicon, "r") as lf:
48
+ for line in lf:
49
+ items = line.rstrip().split()
50
+ assert len(items) > 1, line
51
+ assert items[0] not in wrd_to_phn, items
52
+ wrd_to_phn[items[0]] = items[1:]
53
+
54
+ for line in sys.stdin:
55
+ words = line.strip().split()
56
+
57
+ if not all(w in wrd_to_phn for w in words):
58
+ continue
59
+
60
+ phones = []
61
+ if surround:
62
+ phones.append(sil)
63
+
64
+ sample_sil_probs = None
65
+ if sil_prob > 0 and len(words) > 1:
66
+ sample_sil_probs = np.random.random(len(words) - 1)
67
+
68
+ for i, w in enumerate(words):
69
+ phones.extend(wrd_to_phn[w])
70
+ if (
71
+ sample_sil_probs is not None
72
+ and i < len(sample_sil_probs)
73
+ and sample_sil_probs[i] < sil_prob
74
+ ):
75
+ phones.append(sil)
76
+
77
+ if surround:
78
+ phones.append(sil)
79
+ print(" ".join(phones))
80
+
81
+
82
+ if __name__ == "__main__":
83
+ main()
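Usage sketch (placeholder paths and an assumed silence probability of 0.25, following the pattern in prepare_text.sh below): python scripts/phonemize_with_sil.py -s 0.25 --surround --lexicon $target_dir/lexicon_filtered.lst < $target_dir/lm.upper.lid.txt > $target_dir/phones/lm.phones.filtered.txt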
fairseq/examples/wav2vec/unsupervised/scripts/prepare_audio.sh ADDED
@@ -0,0 +1,78 @@
1
+ #!/usr/bin/env zsh
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ source_dir=$1
8
+ tgt_dir=$2
9
+ model=$3
10
+
11
+ if [ -z "$4" ]
12
+ then
13
+ dim=512
14
+ else
15
+ dim=$4
16
+ fi
17
+
18
+ echo "using $dim dim for PCA"
19
+
20
+ if [ -z "$5" ]
21
+ then
22
+ layer=14
23
+ else
24
+ layer=$5
25
+ fi
26
+
27
+ echo "extracting from layer $layer"
28
+
29
+ train_split=train
30
+ valid_split=valid
31
+ test_split=test
32
+
33
+ all_splits=($train_split)
34
+
35
+ if [[ -f "$source_dir/valid.tsv" ]]; then
36
+ all_splits+=('valid')
37
+ fi
38
+
39
+ if [[ -f "$source_dir/test.tsv" ]]; then
40
+ all_splits+=('test')
41
+ fi
42
+
43
+ echo "processing splits: $all_splits"
44
+
45
+ mkdir -p $tgt_dir
46
+
47
+ cp $source_dir/*.tsv $tgt_dir
48
+ cp $source_dir/*.wrd $tgt_dir
49
+ cp $source_dir/*.ltr $tgt_dir
50
+ cp $source_dir/*.phn $tgt_dir
51
+ cp $source_dir/dict* $tgt_dir
52
+
53
+ setopt shwordsplit
54
+
55
+ for split in $all_splits; do
56
+ python $FAIRSEQ_ROOT/examples/wav2vec/unsupervised/scripts/wav2vec_extract_features.py $source_dir --split $split \
57
+ --save-dir $tgt_dir --checkpoint $model --layer $layer
58
+ done
59
+
60
+ python $FAIRSEQ_ROOT/examples/wav2vec/unsupervised/scripts/wav2vec_cluster_faiss.py $tgt_dir/${train_split}.tsv \
61
+ --checkpoint $model --save-dir $tgt_dir -f "CLUS128" --sample-pct 1.0
62
+
63
+ for split in $all_splits; do
64
+ python $FAIRSEQ_ROOT/examples/wav2vec/unsupervised/scripts/wav2vec_apply_cluster_faiss.py $tgt_dir \
65
+ --checkpoint $model --path $tgt_dir/CLUS128 --split $split
66
+ done
67
+
68
+ python $FAIRSEQ_ROOT/examples/wav2vec/unsupervised/scripts/pca.py $tgt_dir/${train_split}.npy --output $tgt_dir/pca --dim $dim
69
+
70
+ for split in $all_splits; do
71
+ python $FAIRSEQ_ROOT/examples/wav2vec/unsupervised/scripts/apply_pca.py $tgt_dir --split $split --save-dir $tgt_dir/precompute_pca$dim --pca-path $tgt_dir/pca/${dim}_pca --batch-size 1048000
72
+
73
+ python $FAIRSEQ_ROOT/examples/wav2vec/unsupervised/scripts/merge_clusters.py $tgt_dir/precompute_pca$dim --cluster-dir $tgt_dir/CLUS128 \
74
+ --split $split --save-dir $tgt_dir/precompute_pca${dim}_cls128_mean --pooling mean
75
+
76
+ python $FAIRSEQ_ROOT/examples/wav2vec/unsupervised/scripts/mean_pool.py $tgt_dir/precompute_pca${dim}_cls128_mean \
77
+ --save-dir $tgt_dir/precompute_pca${dim}_cls128_mean_pooled --split $split
78
+ done
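Usage sketch (placeholder paths; 512 and 14 are the default PCA dimension and wav2vec layer, and FAIRSEQ_ROOT must be set): zsh scripts/prepare_audio.sh /path/to/manifests /path/to/features /path/to/wav2vec_model.pt 512 14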
fairseq/examples/wav2vec/unsupervised/scripts/prepare_audio_v2.sh ADDED
@@ -0,0 +1,68 @@
1
+ #!/usr/bin/env zsh
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ source_dir=$1
8
+ tgt_dir=$2
9
+ model=$3
10
+
11
+ if [ -z "$4" ]
12
+ then
13
+ dim=64
14
+ else
15
+ dim=$4
16
+ fi
17
+
18
+ echo "using $dim clusters for auxilary target"
19
+
20
+ if [ -z "$5" ]
21
+ then
22
+ layer=14
23
+ else
24
+ layer=$5
25
+ fi
26
+
27
+ echo "extracting from layer $layer"
28
+
29
+ train_split=train
30
+ valid_split=valid
31
+ test_split=test
32
+
33
+ all_splits=($train_split)
34
+
35
+ if [[ -f "$source_dir/valid.tsv" ]]; then
36
+ all_splits+=('valid')
37
+ fi
38
+
39
+ if [[ -f "$source_dir/test.tsv" ]]; then
40
+ all_splits+=('test')
41
+ fi
42
+
43
+ echo "processing splits: $all_splits"
44
+
45
+ mkdir -p $tgt_dir
46
+
47
+ cp $source_dir/*.tsv $tgt_dir
48
+ cp $source_dir/*.wrd $tgt_dir
49
+ cp $source_dir/*.ltr $tgt_dir
50
+ cp $source_dir/*.phn $tgt_dir
51
+ cp $source_dir/dict* $tgt_dir
52
+
53
+ setopt shwordsplit
54
+
55
+ for split in $all_splits; do
56
+ python $FAIRSEQ_ROOT/examples/wav2vec/unsupervised/scripts/wav2vec_extract_features.py $source_dir --split $split \
57
+ --save-dir $tgt_dir --checkpoint $model --layer $layer
58
+ done
59
+
60
+
61
+ mkdir -p $tgt_dir/mfcc
62
+
63
+ # Consider splitting the corpus into chunks for large corpora; see the HuBERT preprocessing scripts for details
64
+ python $FAIRSEQ_ROOT/examples/hubert/simple_kmeans/dump_mfcc_feature.py \
65
+ $tgt_dir $train_split 1 0 $tgt_dir/mfcc
66
+ python $FAIRSEQ_ROOT/examples/hubert/simple_kmeans/dump_km_label.py \
67
+ $tgt_dir/mfcc $train_split $tgt_dir/mfcc/cls$dim 1 0 $tgt_dir/mfcc/cls${dim}_idx
68
+ cp $tgt_dir/mfcc/cls${dim}_idx/${train_split}_0_1.km $tgt_dir/$train_split.km
fairseq/examples/wav2vec/unsupervised/scripts/prepare_text.sh ADDED
@@ -0,0 +1,83 @@
1
+ #!/usr/bin/env zsh
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ lg=$1
8
+ text_path=$2
9
+ target_dir=$3
10
+ min_phones=$4
11
+ phonemizer=$5
12
+ lid_path=$6
13
+ sil_prob=$7
14
+
15
+ if [ -z "$lid_path" ]; then
16
+ lid_path="lid.187.bin"
17
+ fi
18
+
19
+ ph_lg=${lg:l}
20
+ if test "$lg" = 'fr'; then
21
+ ph_lg='fr-fr'
22
+ elif test "$lg" = 'en'; then
23
+ ph_lg='en-us'
24
+ elif test "$lg" = 'pt'; then
25
+ ph_lg='pt-br'
26
+ fi
27
+
28
+ ESPEAK_PATH=''
29
+ if test "$phonemizer" = 'espeak'; then
30
+ ESPEAK_PATH=$(which espeak)
31
+ elif test "$phonemizer" = 'espeak-ng'; then
32
+ ESPEAK_PATH=$(which espeak-ng)
33
+ elif test "$phonemizer" = 'G2P'; then
34
+ ESPEAK_PATH=''
35
+ else
36
+ echo "Unknown phonemizer $phonemizer. Valid options are espeak, espean-ng and G2P"
37
+ exit 1
38
+ fi
39
+
40
+ echo $lg
41
+ echo $ph_lg
42
+ echo $text_path
43
+ echo $target_dir
44
+ echo "min phone seen threshold is $min_phones"
45
+
46
+ mkdir -p $target_dir
47
+ python $FAIRSEQ_ROOT/examples/wav2vec/unsupervised/scripts/normalize_and_filter_text.py --lang $lg --fasttext-model $lid_path < $text_path | grep -v '\-\-\-' >! $target_dir/lm.upper.lid.txt
48
+ python $FAIRSEQ_ROOT/fairseq_cli/preprocess.py --dataset-impl mmap --trainpref $target_dir/lm.upper.lid.txt --only-source --destdir $target_dir --thresholdsrc 2 --padding-factor 1 --dict-only
49
+ cut -f1 -d' ' $target_dir/dict.txt | grep -v -x '[[:punct:]]*' | grep -Pv '\d\d\d\d\d+' >! $target_dir/words.txt
50
+
51
+
52
+ if [ -z "$ESPEAK_PATH" ]; then
53
+ python $FAIRSEQ_ROOT/examples/wav2vec/unsupervised/scripts/g2p_wrd_to_phn.py --compact < $target_dir/words.txt > $target_dir/phones.txt
54
+ else
55
+ # echoing 1 into corpus will prevent the mismatch lines between lexicon and phones in case the phonemizer fails
56
+ one=$(echo "1" | PHONEMIZER_ESPEAK_PATH=$ESPEAK_PATH phonemize -p ' ' -w '' -l $ph_lg --language-switch remove-flags)
57
+ sed 's/$/ 1/' $target_dir/words.txt | PHONEMIZER_ESPEAK_PATH=$ESPEAK_PATH phonemize -o $target_dir/phones.txt -p ' ' -w '' -l $ph_lg -j 70 --language-switch remove-flags
58
+ echo "one is ${one}"
59
+ sed -i "s/${one}$//" $target_dir/phones.txt
60
+ fi
61
+
62
+ paste $target_dir/words.txt $target_dir/phones.txt >! $target_dir/lexicon.lst
63
+
64
+ python $FAIRSEQ_ROOT/fairseq_cli/preprocess.py --dataset-impl mmap --trainpref $target_dir/phones.txt --only-source --destdir $target_dir/phones --thresholdsrc $min_phones --padding-factor 1 --dict-only
65
+
66
+ python $FAIRSEQ_ROOT/examples/wav2vec/unsupervised/scripts/filter_lexicon.py -d $target_dir/phones/dict.txt < $target_dir/lexicon.lst >! $target_dir/lexicon_filtered.lst
67
+ python $FAIRSEQ_ROOT/examples/wav2vec/unsupervised/scripts/phonemize_with_sil.py -s $sil_prob --surround --lexicon $target_dir/lexicon_filtered.lst < $target_dir/lm.upper.lid.txt >! $target_dir/phones/lm.phones.filtered.txt
68
+ cp $target_dir/phones/dict.txt $target_dir/phones/dict.phn.txt
69
+ echo "<SIL> 0" >> $target_dir/phones/dict.phn.txt
70
+ python $FAIRSEQ_ROOT/fairseq_cli/preprocess.py --dataset-impl mmap --trainpref $target_dir/phones/lm.phones.filtered.txt --workers 70 --only-source --destdir $target_dir/phones --srcdict $target_dir/phones/dict.phn.txt
71
+
72
+ $KENLM_ROOT/lmplz -o 4 < $target_dir/lm.upper.lid.txt --discount_fallback --prune 0 0 0 3 >! $target_dir/kenlm.wrd.o40003.arpa
73
+ $KENLM_ROOT/build_binary $target_dir/kenlm.wrd.o40003.arpa $target_dir/kenlm.wrd.o40003.bin
74
+
75
+ lg=$lg python $FAIRSEQ_ROOT/examples/speech_recognition/kaldi/kaldi_initializer.py kaldi_root=$KALDI_ROOT fst_dir=$target_dir/fst/phn_to_words_sil lm_arpa=$target_dir/kenlm.wrd.o40003.arpa wav2letter_lexicon=$target_dir/lexicon_filtered.lst data_dir=$target_dir/phones in_labels=phn "blank_symbol='<SIL>'"
76
+ lg=$lg python $FAIRSEQ_ROOT/examples/speech_recognition/kaldi/kaldi_initializer.py kaldi_root=$KALDI_ROOT fst_dir=$target_dir/fst/phn_to_words lm_arpa=$target_dir/kenlm.wrd.o40003.arpa wav2letter_lexicon=$target_dir/lexicon_filtered.lst data_dir=$target_dir/phones in_labels=phn
77
+
78
+ $KENLM_ROOT/lmplz -o 4 < $target_dir/phones/lm.phones.filtered.txt --discount_fallback >! $target_dir/phones/lm.phones.filtered.04.arpa
79
+ $KENLM_ROOT/build_binary $target_dir/phones/lm.phones.filtered.04.arpa $target_dir/phones/lm.phones.filtered.04.bin
80
+ $KENLM_ROOT/lmplz -o 6 < $target_dir/phones/lm.phones.filtered.txt --discount_fallback >! $target_dir/phones/lm.phones.filtered.06.arpa
81
+ $KENLM_ROOT/build_binary $target_dir/phones/lm.phones.filtered.06.arpa $target_dir/phones/lm.phones.filtered.06.bin
82
+
83
+ lg=$lg python $FAIRSEQ_ROOT/examples/speech_recognition/kaldi/kaldi_initializer.py kaldi_root=$KALDI_ROOT fst_dir=$target_dir/fst/phn_to_phn_sil lm_arpa=$target_dir/phones/lm.phones.filtered.06.arpa data_dir=$target_dir/phones in_labels=phn "blank_symbol='<SIL>'"
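Usage sketch (placeholder paths and example values for min_phones and sil_prob; assumes FAIRSEQ_ROOT, KENLM_ROOT and KALDI_ROOT are set): zsh scripts/prepare_text.sh en /path/to/corpus.txt /path/to/text_output 1000 espeak lid.187.bin 0.25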
fairseq/examples/wav2vec/unsupervised/scripts/prepare_timit.sh ADDED
@@ -0,0 +1,79 @@
1
+ #!/bin/bash
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ timit_root=$1 # assume it is the upper-cased version
8
+ tgt_dir=$2
9
+ model=$3
10
+
11
+ set -eu
12
+
13
+ setups="matched unmatched"
14
+ splits="test valid train train_text"
15
+
16
+ tgt_dir=$(realpath $tgt_dir)
17
+ sph2wav=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
18
+ wav_dir=$tgt_dir/wav
19
+
20
+
21
+ mkdir -p $tgt_dir $wav_dir
22
+ find $timit_root/{TRAIN,TEST} -iname "*.WAV" > $tgt_dir/all_sph.flist
23
+ cat $tgt_dir/all_sph.flist | sed -e 's#//*#/#g' -e 's#.*/\([^/]*\)/\([^/]*\).WAV#\1_\2#g' > $tgt_dir/all.uid
24
+ paste -d' ' $tgt_dir/{all_sph.flist,all.uid} | \
25
+ awk -v sph2wav=$sph2wav -v wav_dir=$wav_dir '{print sph2wav " -f wav " $1 " > " wav_dir "/" $2 ".wav"}' \
26
+ > $tgt_dir/sph2wav.sh
27
+ bash $tgt_dir/sph2wav.sh
28
+ cat $tgt_dir/all.uid | awk -v wav_dir=$(pwd)/$wav_dir '{print $1" "wav_dir"/"$1".wav"}' | sort > $tgt_dir/all_wav.scp
29
+ cut -d' ' -f2 $tgt_dir/all_wav.scp | xargs -I{} soxi -s {} > $tgt_dir/all.dur
30
+ paste -d' ' $tgt_dir/{all_wav.scp,all.dur} > $tgt_dir/all_wav_dur.scp
31
+ rm $tgt_dir/{all.uid,all_sph.flist,sph2wav.sh}
32
+
33
+ find $timit_root/{TRAIN,TEST} -iname "*.PHN" > $tgt_dir/all_phn60.flist
34
+ while read line; do
35
+ if [ ! -f $line ]; then
36
+ >&2 echo "Cannot find transcription file '$line'" && exit 1;
37
+ fi
38
+ cut -f3 -d' ' "$line" | tr '\n' ' ' | perl -ape 's: *$:\n:;'
39
+ done < $tgt_dir/all_phn60.flist > $tgt_dir/all.phn60
40
+ cat $tgt_dir/all_phn60.flist | sed -e 's#//*#/#g' -e 's#.*/\([^/]*\)/\([^/]*\).PHN#\1_\2#g' | \
41
+ paste -d' ' - $tgt_dir/all.phn60 | \
42
+ $KALDI_ROOT/egs/timit/s5/local/timit_norm_trans.pl -i - -m $KALDI_ROOT/egs/timit/s5/conf/phones.60-48-39.map -to 39 | \
43
+ sort > $tgt_dir/all.phn
44
+ echo "done preparing wav and 39-phone transcripts"
45
+
46
+
47
+ for s in $setups; do
48
+ mkdir -p $tgt_dir/$s
49
+ for x in $splits; do
50
+ uid_path=config/timit_${s}/${x}.uid
51
+ grep -w -f $uid_path $tgt_dir/all.phn | cut -d' ' -f2- > $tgt_dir/$s/$x.phn
52
+ ln -sf $(realpath $tgt_dir/$s/$x.phn) $tgt_dir/$s/$x.wrd
53
+
54
+ echo "/" > $tgt_dir/$s/$x.tsv && grep -w -f $uid_path $tgt_dir/all_wav_dur.scp | cut -d' ' -f2- | sed 's# #\t#' >> $tgt_dir/$s/$x.tsv
55
+ done
56
+
57
+ for x in $splits; do
58
+ cat $tgt_dir/$s/$x.phn
59
+ done | tr ' ' '\n' | sort -u | awk '{print $1" "1}' > $tgt_dir/$s/dict.phn.txt
60
+ ln -sf $(realpath $tgt_dir/$s/dict.phn.txt) $tgt_dir/$s/dict.wrd.txt
61
+ done
62
+ echo "done preparing unmatched and matched setups for TIMIT"
63
+
64
+
65
+ for s in $setups; do
66
+ zsh scripts/prepare_audio.sh $tgt_dir/$s $tgt_dir/$s/feat $model
67
+
68
+ lm_dir=$tgt_dir/$s/phones
69
+ fst_dir=$tgt_dir/$s/fst/phn_to_phn
70
+
71
+ python $FAIRSEQ_ROOT/fairseq_cli/preprocess.py --dataset-impl mmap --trainpref $tgt_dir/$s/train_text.phn --workers 10 --only-source --destdir $lm_dir --srcdict $tgt_dir/$s/dict.phn.txt
72
+ $KENLM_ROOT/lmplz -o 3 < $tgt_dir/$s/train_text.phn --discount_fallback >$lm_dir/train_text_phn.03.arpa
73
+ $KENLM_ROOT/build_binary $lm_dir/train_text_phn.03.arpa $lm_dir/train_text_phn.03.bin
74
+ $KENLM_ROOT/lmplz -o 4 < $tgt_dir/$s/train_text.phn --discount_fallback >$lm_dir/train_text_phn.04.arpa
75
+ $KENLM_ROOT/build_binary $lm_dir/train_text_phn.04.arpa $lm_dir/train_text_phn.04.bin
76
+
77
+ python $FAIRSEQ_ROOT/examples/speech_recognition/kaldi/kaldi_initializer.py kaldi_root=$KALDI_ROOT fst_dir=$fst_dir lm_arpa=$lm_dir/train_text_phn.03.arpa data_dir=$tgt_dir/$s in_labels=phn
78
+ done
79
+ echo "done preprocessing audio and text for wav2vec-U"
fairseq/examples/wav2vec/unsupervised/scripts/remove_silence.py ADDED
@@ -0,0 +1,63 @@
1
+ #!/usr/bin/env python3 -u
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ reads speech intervals from a .vads file, strips everything outside them (i.e. silences) from each audio file listed in the tsv, and saves the result under the --out folder
9
+ paths=shards/train.tsv
10
+ vads=shards/train.vads
11
+ python remove_silence.py --tsv $paths --vads $vads --out /path/to/output
12
+ """
13
+
14
+ import os
15
+ import argparse
16
+ import torch
17
+ import torchaudio
18
+ import tqdm
19
+
20
+
21
+ parser = argparse.ArgumentParser()
22
+ parser.add_argument("--tsv", default="", type=str)
23
+ parser.add_argument("--vads", default="", type=str)
24
+ parser.add_argument("--out", type=str)
25
+ params = parser.parse_args()
26
+
27
+ # load paths
28
+ paths = []
29
+ with open(params.tsv) as f:
30
+ root = next(f).rstrip()
31
+ for line in f:
32
+ paths.append(os.path.join(root, line.rstrip().split("\t")[0]))
33
+
34
+ # load vads
35
+ list_intervals = []
36
+ with open(params.vads) as f:
37
+ for line in f:
38
+ interval = [
39
+ [int(w.split(":")[0]), int(w.split(":")[1])] for w in line.rstrip().split()
40
+ ]
41
+ list_intervals.append(interval)
42
+
43
+
44
+ # load audio and keep only intervals (i.e. remove silences)
45
+ for i in tqdm.trange(len(paths)):
46
+ data, _ = torchaudio.load(paths[i])
47
+ if len(list_intervals[i]) > 0:
48
+ data_filtered = torch.cat(
49
+ [data[0][int(it[0]) : int(it[1])] for it in list_intervals[i]]
50
+ ).unsqueeze(0)
51
+ else:
52
+ data_filtered = data
53
+
54
+ # YOU MAY NEED TO MODIFY THIS TO GET THE RIGHT SUBPATH
55
+ # outpath = params.out + '/'.join(paths[i].split('/')[-1])
56
+ outpath = params.out + "/" + "/".join(paths[i].split("/")[-2:])
57
+
58
+ if not os.path.isdir("/".join(outpath.split("/")[:-1])):
59
+ os.makedirs("/".join(outpath.split("/")[:-1]))
60
+ if not os.path.exists(outpath):
61
+ torchaudio.save(outpath, data_filtered, sample_rate=16000)
62
+ else:
63
+ print(outpath, "exists!")
fairseq/examples/wav2vec/unsupervised/scripts/vads.py ADDED
@@ -0,0 +1,98 @@
1
+ #!/usr/bin/env python3 -u
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import argparse
8
+ import sys
9
+
10
+ from copy import deepcopy
11
+ from scipy.signal import lfilter
12
+
13
+ import numpy as np
14
+ from tqdm import tqdm
15
+ import soundfile as sf
16
+ import os.path as osp
17
+
18
+
19
+ def get_parser():
20
+ parser = argparse.ArgumentParser(description="compute vad segments")
21
+ parser.add_argument(
22
+ "--rvad-home",
23
+ "-r",
24
+ help="path to rvad home (see https://github.com/zhenghuatan/rVADfast)",
25
+ required=True,
26
+ )
27
+
28
+ return parser
29
+
30
+
31
+ def rvad(speechproc, path):
32
+ winlen, ovrlen, pre_coef, nfilter, nftt = 0.025, 0.01, 0.97, 20, 512
33
+ ftThres = 0.5
34
+ vadThres = 0.4
35
+ opts = 1
36
+
37
+ data, fs = sf.read(path)
38
+ assert fs == 16_000, "sample rate must be 16khz"
39
+ ft, flen, fsh10, nfr10 = speechproc.sflux(data, fs, winlen, ovrlen, nftt)
40
+
41
+ # --spectral flatness --
42
+ pv01 = np.zeros(ft.shape[0])
43
+ pv01[np.less_equal(ft, ftThres)] = 1
44
+ pitch = deepcopy(ft)
45
+
46
+ pvblk = speechproc.pitchblockdetect(pv01, pitch, nfr10, opts)
47
+
48
+ # --filtering--
49
+ ENERGYFLOOR = np.exp(-50)
50
+ b = np.array([0.9770, -0.9770])
51
+ a = np.array([1.0000, -0.9540])
52
+ fdata = lfilter(b, a, data, axis=0)
53
+
54
+ # --pass 1--
55
+ noise_samp, noise_seg, n_noise_samp = speechproc.snre_highenergy(
56
+ fdata, nfr10, flen, fsh10, ENERGYFLOOR, pv01, pvblk
57
+ )
58
+
59
+ # sets noisy segments to zero
60
+ for j in range(n_noise_samp):
61
+ fdata[range(int(noise_samp[j, 0]), int(noise_samp[j, 1]) + 1)] = 0
62
+
63
+ vad_seg = speechproc.snre_vad(
64
+ fdata, nfr10, flen, fsh10, ENERGYFLOOR, pv01, pvblk, vadThres
65
+ )
66
+ return vad_seg, data
67
+
68
+
69
+ def main():
70
+ parser = get_parser()
71
+ args = parser.parse_args()
72
+
73
+ sys.path.append(args.rvad_home)
74
+ import speechproc
75
+
76
+ stride = 160
77
+ lines = sys.stdin.readlines()
78
+ root = lines[0].rstrip()
79
+ for fpath in tqdm(lines[1:]):
80
+ path = osp.join(root, fpath.split()[0])
81
+ vads, wav = rvad(speechproc, path)
82
+
83
+ start = None
84
+ vad_segs = []
85
+ for i, v in enumerate(vads):
86
+ if start is None and v == 1:
87
+ start = i * stride
88
+ elif start is not None and v == 0:
89
+ vad_segs.append((start, i * stride))
90
+ start = None
91
+ if start is not None:
92
+ vad_segs.append((start, len(wav)))
93
+
94
+ print(" ".join(f"{v[0]}:{v[1]}" for v in vad_segs))
95
+
96
+
97
+ if __name__ == "__main__":
98
+ main()
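Usage sketch (placeholder paths; rVADfast must be cloned separately, see the --rvad-home help): python scripts/vads.py -r /path/to/rVADfast < /path/to/train.tsv > train.vads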
fairseq/examples/wav2vec/unsupervised/scripts/wav2vec_apply_cluster_faiss.py ADDED
@@ -0,0 +1,128 @@
1
+ #!/usr/bin/env python3 -u
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import argparse
8
+ import os
9
+ import os.path as osp
10
+ import numpy as np
11
+ import tqdm
12
+ import torch
13
+ import sys
14
+
15
+ import faiss
16
+ import torch.nn.functional as F
17
+
18
+ from wav2vec_cluster_faiss import parse_faiss_specs, Wav2VecFeatureReader
19
+
20
+
21
+ def get_parser():
22
+ parser = argparse.ArgumentParser(description="apply clusters")
23
+ # fmt: off
24
+ parser.add_argument('data', help='location of tsv files')
25
+ parser.add_argument('--split', help='split to process', required=True)
26
+ parser.add_argument('--labels', help='label file extension to read (e.g. phn)', default="phn")
27
+ parser.add_argument('--path', help='path to pca and centroids', required=True)
28
+ parser.add_argument('--checkpoint', type=str, help='checkpoint for wav2vec model (if using wav2vec features)', required=True)
29
+ parser.add_argument('--layer', '-l', type=int, help='which layer to read', default=14)
30
+ parser.add_argument('--max-tsz', type=int, help='batch kmeans up to this much', default=14)
31
+ # fmt: on
32
+
33
+ return parser
34
+
35
+
36
+ def get_iterator(args):
37
+ label_path = osp.join(args.data, f"{args.split}.{args.labels}")
38
+ if osp.exists(label_path):
39
+ lp = open(label_path, "r")
40
+ else:
41
+ lp = None
42
+
43
+ with open(osp.join(args.data, f"{args.split}.tsv"), "r") as fp:
44
+ lines = fp.read().split("\n")
45
+ root = lines.pop(0).strip()
46
+ files = [line.rstrip() for line in lines if len(line) > 0]
47
+
48
+ if lp is not None:
49
+ lbls = [line.rstrip() for line in lp]
50
+ else:
51
+ lbls = [None] * len(files)
52
+
53
+ num = len(files)
54
+ reader = Wav2VecFeatureReader(args.checkpoint, args.layer)
55
+
56
+ def iterate():
57
+ for fname, lbl in zip(files, lbls):
58
+ file = osp.join(root, fname.split("\t")[0])
59
+ feats = reader.get_feats(file)
60
+ yield feats.data, fname, lbl
61
+
62
+ return iterate, num, root
63
+
64
+
65
+ def main():
66
+ parser = get_parser()
67
+ args = parser.parse_args()
68
+
69
+ spec = osp.basename(args.path)
70
+
71
+ try:
72
+ faiss_spec = parse_faiss_specs(spec.rstrip("/"))[0]
73
+ except:
74
+ print(spec)
75
+ raise
76
+
77
+ print("Faiss Spec:", faiss_spec, file=sys.stderr)
78
+
79
+ if faiss_spec.pca:
80
+ A = torch.from_numpy(np.load(osp.join(args.path, "pca_A.npy"))).cuda()
81
+ b = torch.from_numpy(np.load(osp.join(args.path, "pca_b.npy"))).cuda()
82
+ print("Loaded PCA", file=sys.stderr)
83
+
84
+ centroids = np.load(osp.join(args.path, "centroids.npy"))
85
+ print("Loaded centroids", centroids.shape, file=sys.stderr)
86
+
87
+ res = faiss.StandardGpuResources()
88
+ index_flat = (
89
+ faiss.IndexFlatL2(centroids.shape[1])
90
+ if not faiss_spec.sphere
91
+ else faiss.IndexFlatIP(centroids.shape[1])
92
+ )
93
+ faiss_index = faiss.index_cpu_to_gpu(res, 0, index_flat)
94
+ faiss_index.add(centroids)
95
+
96
+ generator, num, root = get_iterator(args)
97
+ iterator = generator()
98
+
99
+ had_labels = False
100
+ label_path = osp.join(args.path, f"{args.split}.{args.labels}")
101
+
102
+ with torch.no_grad():
103
+ with open(osp.join(args.path, f"{args.split}.src"), "w") as fp, open(
104
+ osp.join(args.path, f"{args.split}.tsv"), "w"
105
+ ) as pp, open(label_path, "w") as lp:
106
+ print(root, file=pp)
107
+ for f, fname, lbl in tqdm.tqdm(iterator, total=num):
108
+ if faiss_spec.pca:
109
+ f = torch.mm(f, A) + b
110
+ if faiss_spec.norm:
111
+ f = F.normalize(f, p=2, dim=-1)
112
+
113
+ f = f.cpu().numpy()
114
+
115
+ _, z = faiss_index.search(f, 1)
116
+
117
+ print(" ".join(str(x.item()) for x in z), file=fp)
118
+ print(fname, file=pp)
119
+
120
+ if lbl is not None:
121
+ print(lbl, file=lp)
122
+ had_labels = True
123
+ if not had_labels:
124
+ os.remove(label_path)
125
+
126
+
127
+ if __name__ == "__main__":
128
+ main()
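Usage sketch (placeholder paths, mirroring prepare_audio.sh above): python scripts/wav2vec_apply_cluster_faiss.py $tgt_dir --checkpoint $model --path $tgt_dir/CLUS128 --split train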
fairseq/examples/wav2vec/unsupervised/scripts/wav2vec_cluster_faiss.py ADDED
@@ -0,0 +1,210 @@
1
+ #!/usr/bin/env python3 -u
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import argparse
8
+ import gc
9
+ import os
10
+ import os.path as osp
11
+ import random
12
+ import numpy as np
13
+ import tqdm
14
+ import torch
15
+
16
+ from collections import namedtuple
17
+
18
+ import faiss
19
+
20
+ import fairseq
21
+ import soundfile as sf
22
+
23
+
24
+ def get_parser():
25
+ parser = argparse.ArgumentParser(
26
+ description="compute kmeans codebook from kaldi-computed feats"
27
+ )
28
+ # fmt: off
29
+ parser.add_argument('data', help='location of tsv files')
30
+ parser.add_argument('--save-dir', help='where to save the output', required=True)
31
+ parser.add_argument('--checkpoint', type=str, help='checkpoint for wav2vec model (if using wav2vec features)', required=True)
32
+ parser.add_argument('--sample-pct', '-r', type=float, help='percentage of timesteps to sample', default=0)
33
+ parser.add_argument('--layer', '-l', type=int, help='which layer to read', default=14)
34
+ parser.add_argument('--faiss-specs', '-f', type=str,
35
+ help='faiss index specs; separated by space '
36
+ 'format is: PCAx_NORM_CLUSx_SPHERICAL -> '
37
+ 'PCAx if exists first apply PCA '
38
+ 'NORM if exists, normalize the vector by L2 norm '
39
+ 'CLUSx must exist, cluster to x clusters '
40
+ 'SPHERICAL if exists, apply spherical kmeans',
41
+ default='l2')
42
+ # fmt: on
43
+
44
+ return parser
45
+
46
+
47
+ faiss_spec = namedtuple("faiss_spec", ["pca", "norm", "n_clus", "sphere", "spec_str"])
48
+
49
+
50
+ def parse_faiss_specs(specs_str):
51
+ specs = []
52
+ for ss in specs_str.split():
53
+ comps = ss.split("_")
54
+ pca = 0
55
+ norm = False
56
+ n_clus = 0
57
+ sphere = False
58
+ for c in comps:
59
+ if c.startswith("PCA"):
60
+ pca = int(c[3:])
61
+ elif c == "NORM":
62
+ norm = True
63
+ elif c.startswith("CLUS"):
64
+ n_clus = int(c[4:])
65
+ elif c == "SPHERICAL":
66
+ sphere = True
67
+ assert n_clus > 0
68
+ specs.append(
69
+ faiss_spec(pca=pca, norm=norm, n_clus=n_clus, sphere=sphere, spec_str=ss)
70
+ )
71
+ return specs
72
+
73
+
74
+ class Wav2VecFeatureReader(object):
75
+ def __init__(self, cp_file, layer):
76
+ state = fairseq.checkpoint_utils.load_checkpoint_to_cpu(cp_file)
77
+
78
+ self.layer = layer
79
+
80
+ if "cfg" in state:
81
+ w2v_args = state["cfg"]
82
+ task = fairseq.tasks.setup_task(w2v_args.task)
83
+ model = task.build_model(w2v_args.model)
84
+ else:
85
+ w2v_args = state["args"]
86
+ task = fairseq.tasks.setup_task(w2v_args)
87
+ model = task.build_model(w2v_args)
88
+ model.load_state_dict(state["model"], strict=True)
89
+ model.eval()
90
+ model.cuda()
91
+ self.model = model
92
+
93
+ def read_audio(self, fname):
94
+ """Load an audio file and return PCM along with the sample rate"""
95
+ wav, sr = sf.read(fname)
96
+ assert sr == 16e3
97
+
98
+ return wav
99
+
100
+ def get_feats(self, loc):
101
+ x = self.read_audio(loc)
102
+ with torch.no_grad():
103
+ source = torch.from_numpy(x).view(1, -1).float().cuda()
104
+ res = self.model(
105
+ source=source, mask=False, features_only=True, layer=self.layer
106
+ )
107
+ return res["layer_results"][self.layer][0].squeeze(1)
108
+
109
+
110
+ def get_iterator(args):
111
+ with open(args.data, "r") as fp:
112
+ lines = fp.read().split("\n")
113
+ root = lines.pop(0).strip()
114
+ files = [osp.join(root, line.split("\t")[0]) for line in lines if len(line) > 0]
115
+
116
+ if getattr(args, "sample_pct", 0) > 0:
117
+ files = random.sample(files, int(args.sample_pct * len(files)))
118
+ num = len(files)
119
+ reader = Wav2VecFeatureReader(args.checkpoint, args.layer)
120
+
121
+ def iterate():
122
+ for fname in files:
123
+ feats = reader.get_feats(fname)
124
+ yield feats.cpu().numpy()
125
+
126
+ return iterate, num
127
+
128
+
129
+ def main():
130
+ parser = get_parser()
131
+ args = parser.parse_args()
132
+
133
+ faiss_specs = parse_faiss_specs(args.faiss_specs)
134
+ print("Faiss Specs:", faiss_specs)
135
+
136
+ feat_path = osp.join(args.save_dir, "features")
137
+ if osp.exists(feat_path + ".npy"):
138
+ feats = np.load(feat_path + ".npy")
139
+ else:
140
+ generator, num = get_iterator(args)
141
+ iterator = generator()
142
+
143
+ feats = []
144
+ for f in tqdm.tqdm(iterator, total=num):
145
+ feats.append(f)
146
+
147
+ del iterator
148
+ del generator
149
+
150
+ feats = np.concatenate(feats)
151
+
152
+ print(feats.shape)
153
+
154
+ os.makedirs(args.save_dir, exist_ok=True)
155
+ # np.save(feat_path, feats)
156
+
157
+ gc.collect()
158
+ torch.cuda.empty_cache()
159
+
160
+ reload = False
161
+ for spec in faiss_specs:
162
+ print("Processing spec", spec)
163
+
164
+ if reload:
165
+ print("Reloading...")
166
+ del feats
167
+ gc.collect()
168
+ feats = np.load(feat_path + ".npy")
169
+
170
+ save_path = osp.join(args.save_dir, spec.spec_str)
171
+ os.makedirs(save_path, exist_ok=True)
172
+ d = feats.shape[-1]
173
+ x = feats
174
+ if spec.pca > 0:
175
+ print("Computing PCA")
176
+ pca = faiss.PCAMatrix(d, spec.pca)
177
+ pca.train(x)
178
+ d = spec.pca
179
+ b = faiss.vector_to_array(pca.b)
180
+ A = faiss.vector_to_array(pca.A).reshape(pca.d_out, pca.d_in)
181
+ np.save(osp.join(save_path, "pca_A"), A.T)
182
+ np.save(osp.join(save_path, "pca_b"), b)
183
+ print("Applying PCA")
184
+ x = pca.apply_py(x)
185
+
186
+ if spec.norm:
187
+ reload = spec.pca <= 0
188
+ print("Normalizing")
189
+ faiss.normalize_L2(x)
190
+
191
+ print("Computing kmeans")
192
+ kmeans = faiss.Kmeans(
193
+ d,
194
+ spec.n_clus,
195
+ niter=50,
196
+ verbose=True,
197
+ spherical=spec.sphere,
198
+ max_points_per_centroid=feats.shape[0],
199
+ gpu=True,
200
+ nredo=3,
201
+ )
202
+ kmeans.train(x)
203
+ np.save(osp.join(save_path, "centroids"), kmeans.centroids)
204
+ del kmeans
205
+ del x
206
+ gc.collect()
207
+
208
+
209
+ if __name__ == "__main__":
210
+ main()
fairseq/examples/wav2vec/unsupervised/scripts/wav2vec_extract_features.py ADDED
@@ -0,0 +1,119 @@
1
+ #!/usr/bin/env python3 -u
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import argparse
8
+ import os
9
+ import os.path as osp
10
+ import tqdm
11
+ import torch
12
+ import torch.nn.functional as F
13
+ from shutil import copyfile
14
+
15
+ from npy_append_array import NpyAppendArray
16
+
17
+ import fairseq
18
+ import soundfile as sf
19
+
20
+
21
+ def get_parser():
22
+ parser = argparse.ArgumentParser(
23
+ description="compute kmeans codebook from kaldi-computed feats"
24
+ )
25
+ # fmt: off
26
+ parser.add_argument('data', help='location of tsv files')
27
+ parser.add_argument('--split', help='which split to read', required=True)
28
+ parser.add_argument('--save-dir', help='where to save the output', required=True)
29
+ parser.add_argument('--checkpoint', type=str, help='checkpoint for wav2vec ctc model', required=True)
30
+ parser.add_argument('--layer', type=int, default=14, help='which layer to use')
31
+ # fmt: on
32
+
33
+ return parser
34
+
35
+
36
+ class Wav2VecFeatureReader(object):
37
+ def __init__(self, cp_file, layer):
38
+ model, cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task(
39
+ [cp_file]
40
+ )
41
+ model = model[0]
42
+ model.eval()
43
+ model.cuda()
44
+ self.model = model
45
+ self.task = task
46
+ self.layer = layer
47
+
48
+ def read_audio(self, fname):
49
+ """Load an audio file and return PCM along with the sample rate"""
50
+ wav, sr = sf.read(fname)
51
+ assert sr == 16e3
52
+
53
+ return wav
54
+
55
+ def get_feats(self, loc):
56
+ x = self.read_audio(loc)
57
+ with torch.no_grad():
58
+ source = torch.from_numpy(x).float().cuda()
59
+ if self.task.cfg.normalize:
60
+ assert source.dim() == 1, source.dim()
61
+ with torch.no_grad():
62
+ source = F.layer_norm(source, source.shape)
63
+ source = source.view(1, -1)
64
+
65
+ m_res = self.model(source=source, mask=False, features_only=True, layer=self.layer)
66
+ return m_res["x"].squeeze(0).cpu()
67
+
68
+
69
+ def get_iterator(args):
70
+ with open(osp.join(args.data, args.split) + ".tsv", "r") as fp:
71
+ lines = fp.read().split("\n")
72
+ root = lines.pop(0).strip()
73
+ files = [osp.join(root, line.split("\t")[0]) for line in lines if len(line) > 0]
74
+
75
+ num = len(files)
76
+ reader = Wav2VecFeatureReader(args.checkpoint, args.layer)
77
+
78
+ def iterate():
79
+ for fname in files:
80
+ w2v_feats = reader.get_feats(fname)
81
+ yield w2v_feats
82
+
83
+ return iterate, num
84
+
85
+
86
+ def main():
87
+ parser = get_parser()
88
+ args = parser.parse_args()
89
+
90
+ os.makedirs(args.save_dir, exist_ok=True)
91
+
92
+ def create_files(dest):
93
+ copyfile(osp.join(args.data, args.split) + ".tsv", dest + ".tsv")
94
+ if osp.exists(osp.join(args.data, args.split) + ".wrd"):
95
+ copyfile(osp.join(args.data, args.split) + ".wrd", dest + ".wrd")
96
+ if osp.exists(osp.join(args.data, args.split) + ".phn"):
97
+ copyfile(osp.join(args.data, args.split) + ".phn", dest + ".phn")
98
+
99
+ if osp.exists(dest + ".npy"):
100
+ os.remove(dest + ".npy")
101
+ npaa = NpyAppendArray(dest + ".npy")
102
+ return npaa
103
+
104
+ save_path = osp.join(args.save_dir, args.split)
105
+ npaa = create_files(save_path)
106
+
107
+ generator, num = get_iterator(args)
108
+ iterator = generator()
109
+
110
+ with open(save_path + ".lengths", "w") as l_f:
111
+ for w2v_feats in tqdm.tqdm(iterator, total=num):
112
+ print(len(w2v_feats), file=l_f)
113
+
114
+ if len(w2v_feats) > 0:
115
+ npaa.append(w2v_feats.numpy())
116
+
117
+
118
+ if __name__ == "__main__":
119
+ main()
fairseq/examples/wav2vec/unsupervised/scripts/wer.py ADDED
@@ -0,0 +1,82 @@
1
+ #!/usr/bin/env python3 -u
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ Compute the unit error rate (UER) between a hypothesis transcription file and
9
+ a reference transcription file (one utterance per line, aligned line by line).
10
+ """
11
+ import argparse
12
+ import logging
13
+ import sys
14
+
15
+ import editdistance
16
+
17
+ logging.root.setLevel(logging.INFO)
18
+ logging.basicConfig(stream=sys.stdout, level=logging.INFO)
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
+ def get_parser():
23
+ parser = argparse.ArgumentParser()
24
+ parser.add_argument("-s", "--hypo", help="hypo transcription", required=True)
25
+ parser.add_argument(
26
+ "-r", "--reference", help="reference transcription", required=True
27
+ )
28
+ return parser
29
+
30
+
31
+ def compute_wer(ref_uid_to_tra, hyp_uid_to_tra, g2p):
32
+ d_cnt = 0
33
+ w_cnt = 0
34
+ w_cnt_h = 0
35
+ for uid in hyp_uid_to_tra:
36
+ ref = ref_uid_to_tra[uid].split()
37
+ if g2p is not None:
38
+ hyp = g2p(hyp_uid_to_tra[uid])
39
+ hyp = [p for p in hyp if p != "'" and p != " "]
40
+ hyp = [p[:-1] if p[-1].isnumeric() else p for p in hyp]
41
+ else:
42
+ hyp = hyp_uid_to_tra[uid].split()
43
+ d_cnt += editdistance.eval(ref, hyp)
44
+ w_cnt += len(ref)
45
+ w_cnt_h += len(hyp)
46
+ wer = float(d_cnt) / w_cnt
47
+ logger.debug(
48
+ (
49
+ f"wer = {wer * 100:.2f}%; num. of ref words = {w_cnt}; "
50
+ f"num. of hyp words = {w_cnt_h}; num. of sentences = {len(ref_uid_to_tra)}"
51
+ )
52
+ )
53
+ return wer
54
+
55
+
56
+ def main():
57
+ args = get_parser().parse_args()
58
+
59
+ errs = 0
60
+ count = 0
61
+ with open(args.hypo, "r") as hf, open(args.reference, "r") as rf:
62
+ for h, r in zip(hf, rf):
63
+ h = h.rstrip().split()
64
+ r = r.rstrip().split()
65
+ errs += editdistance.eval(r, h)
66
+ count += len(r)
67
+
68
+ logger.info(f"UER: {errs / count * 100:.2f}%")
69
+
70
+
71
+ if __name__ == "__main__":
72
+ main()
73
+
74
+
75
+ def load_tra(tra_path):
76
+ with open(tra_path, "r") as f:
77
+ uid_to_tra = {}
78
+ for line in f:
79
+ uid, tra = line.split(None, 1)
80
+ uid_to_tra[uid] = tra
81
+ logger.debug(f"loaded {len(uid_to_tra)} utterances from {tra_path}")
82
+ return uid_to_tra
fairseq/examples/wav2vec/unsupervised/scripts/wrd_to_ltr.py ADDED
@@ -0,0 +1,16 @@
1
+ #!/usr/bin/env python3 -u
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import sys
8
+
9
+
10
+ def main():
11
+ for line in sys.stdin:
12
+ print(" ".join(list(line.strip().replace(" ", "|"))) + " |")
13
+
14
+
15
+ if __name__ == "__main__":
16
+ main()
fairseq/examples/wav2vec/unsupervised/tasks/__init__.py ADDED
@@ -0,0 +1,11 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from .unpaired_audio_text import UnpairedAudioText
7
+
8
+
9
+ __all__ = [
10
+ "UnpairedAudioText",
11
+ ]
fairseq/examples/wav2vec/unsupervised/tasks/unpaired_audio_text.py ADDED
@@ -0,0 +1,452 @@
1
+ # Copyright (c) 2017-present, Facebook, Inc.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the LICENSE file in
5
+ # the root directory of this source tree. An additional grant of patent rights
6
+ # can be found in the PATENTS file in the same directory.
7
+
8
+ from dataclasses import dataclass, field
9
+ import logging
10
+ import math
11
+ import os
12
+ from typing import Optional
13
+ import torch
14
+
15
+ from fairseq.logging import metrics
16
+ from fairseq.tasks import FairseqTask, register_task
17
+ from ..data import ExtractedFeaturesDataset, RandomInputDataset
18
+
19
+ from fairseq.data import (
20
+ Dictionary,
21
+ data_utils,
22
+ StripTokenDataset,
23
+ )
24
+ from fairseq.dataclass import FairseqDataclass
25
+ from fairseq.distributed.utils import get_data_parallel_world_size
26
+ from omegaconf import MISSING
27
+
28
+ from examples.speech_recognition.kaldi.kaldi_decoder import (
29
+ KaldiDecoder,
30
+ KaldiDecoderConfig,
31
+ )
32
+
33
+
34
+ logger = logging.getLogger(__name__)
35
+
36
+
37
+ @dataclass
38
+ class DecodingConfig(FairseqDataclass):
39
+ kenlm_path: Optional[str] = None
40
+ lm_weight: float = 0
41
+ blank_weight: float = 0
42
+
43
+
44
+ @dataclass
45
+ class UnpairedAudioTextConfig(FairseqDataclass):
46
+ data: str = field(
47
+ default=MISSING, metadata={"help": "path to data directory containing audio"}
48
+ )
49
+ text_data: str = field(
50
+ default=MISSING, metadata={"help": "path to data directory containing text"}
51
+ )
52
+ max_length: Optional[int] = None
53
+ labels: Optional[str] = field(
54
+ default=None,
55
+ metadata={"help": "extension of the label file to load, used for fine-tuning"},
56
+ )
57
+ aux_target_postfix: Optional[str] = field(
58
+ default=None,
59
+ metadata={"help": "auxaliry target filename extension"},
60
+ )
61
+ unfiltered: bool = field(
62
+ default=False, metadata={"help": "load data with _unfiltered suffix"}
63
+ )
64
+ ctc_eval: bool = field(
65
+ default=False, metadata={"help": "eval UER as if computed by CTC"}
66
+ )
67
+ sort_by_length: bool = field(
68
+ default=True, metadata={"help": "sort examples by length of audio timesteps"}
69
+ )
70
+ shuffle: bool = field(default=True, metadata={"help": "shuffle examples"})
71
+ append_eos: bool = field(default=False, metadata={"help": "append eos"})
72
+ uppercase: Optional[bool] = field(
73
+ default=False, metadata={"help": "uppercase for LM score computation"}
74
+ )
75
+ skipwords: Optional[str] = field(
76
+ default="",
77
+ metadata={
78
+ "help": "comma-separated words to be removed for LM score computation"
79
+ },
80
+ )
81
+ kenlm_path: Optional[str] = None
82
+ vocab_usage_power: float = 2
83
+
84
+ word_decoder_config: Optional[KaldiDecoderConfig] = None
85
+ word_kenlm_path: Optional[str] = None
86
+
87
+ decoding_config: DecodingConfig = DecodingConfig()
88
+
89
+
90
+ @register_task("unpaired_audio_text", dataclass=UnpairedAudioTextConfig)
91
+ class UnpairedAudioText(FairseqTask):
92
+ """ """
93
+
94
+ cfg: UnpairedAudioTextConfig
95
+
96
+ def __init__(
97
+ self,
98
+ cfg: UnpairedAudioTextConfig,
99
+ source_dictionary=None,
100
+ target_dictionary=None,
101
+ ):
102
+ super().__init__(cfg)
103
+
104
+ self._target_dictionary = target_dictionary
105
+ self._source_dictionary = source_dictionary
106
+ self.num_symbols = (
107
+ len([s for s in target_dictionary.symbols if not s.startswith("madeup")])
108
+ - target_dictionary.nspecial
109
+ )
110
+ self.sil_id = (
111
+ target_dictionary.index("<SIL>") if "<SIL>" in target_dictionary else -1
112
+ )
113
+ self.kenlm = None
114
+ if cfg.kenlm_path is not None:
115
+ import kenlm
116
+
117
+ self.kenlm = kenlm.Model(cfg.kenlm_path)
118
+
119
+ self.word_kenlm = None
120
+ if cfg.word_kenlm_path is not None:
121
+ import kenlm
122
+
123
+ self.word_kenlm = kenlm.Model(cfg.word_kenlm_path)
124
+
125
+ self.uppercase = cfg.uppercase
126
+ self.skipwords = set(cfg.skipwords.split(","))
127
+
128
+ def str_postprocess(s):
129
+ s = " ".join(w for w in s.split() if w not in self.skipwords)
130
+ s = s.upper() if self.uppercase else s
131
+ return s
132
+
133
+ self.str_postprocess = str_postprocess
134
+ self.compute_lm_score = lambda s: self.kenlm.score(self.str_postprocess(s))
135
+
136
+ self.compute_word_score = None
137
+ if cfg.word_decoder_config is not None:
138
+ self.kaldi_decoder = KaldiDecoder(cfg.word_decoder_config, beam=10)
139
+
140
+ def compute_word_score(logits, padding):
141
+ res = self.kaldi_decoder.decode(logits, padding)
142
+ for r in res:
143
+ r = r.result()
144
+ assert len(r) == 1
145
+ r = r[0]
146
+ yield r["score"], r["words"]
147
+
148
+ self.compute_word_score = compute_word_score
149
+
150
+ @classmethod
151
+ def setup_task(cls, cfg: UnpairedAudioTextConfig, **kwargs):
152
+ """Setup the task (e.g., load dictionaries).
153
+
154
+ Args:
155
+ cfg (UnpairedAudioTextConfig): configuration of this task
156
+ """
157
+
158
+ dict_path = os.path.join(cfg.text_data, "dict.txt")
159
+ if os.path.exists(dict_path):
160
+ target_dictionary = Dictionary.load(dict_path)
161
+ else:
162
+ dict_path = os.path.join(cfg.data, f"dict.{cfg.labels}.txt")
163
+ target_dictionary = Dictionary.load(dict_path)
164
+
165
+ return cls(cfg, target_dictionary=target_dictionary)
166
+
167
+ def optimizer_step(self, optimizer, model, update_num):
168
+ if hasattr(model, "get_groups_for_update"):
169
+ groups = model.get_groups_for_update(update_num)
170
+ optimizer.step(groups={groups})
171
+ else:
172
+ optimizer.step()
173
+
174
+ def valid_step(self, sample, model, criterion):
175
+ res = model(
176
+ **sample["net_input"],
177
+ dense_x_only=True,
178
+ )
179
+
180
+ dense_x = res["logits"]
181
+ padding_mask = res["padding_mask"]
182
+
183
+ word_scores = None
184
+ if self.compute_word_score is not None:
185
+ word_scores = self.compute_word_score(dense_x.cpu(), padding_mask.cpu())
186
+
187
+ z = dense_x.argmax(-1)
188
+ z[padding_mask] = self.target_dictionary.pad()
189
+
190
+ vocab_seen = torch.zeros(self.num_symbols, dtype=torch.bool)
191
+
192
+ import editdistance
193
+
194
+ c_err = 0
195
+ c_len = 0
196
+ pred_c_len = 0
197
+ lm_score_sum = 0
198
+ for i, (x, t, id) in enumerate(
199
+ zip(
200
+ z,
201
+ sample["target"] if "target" in sample else [None] * len(z),
202
+ sample["id"],
203
+ )
204
+ ):
205
+
206
+ if t is not None:
207
+ t = t[(t >= self.target_dictionary.nspecial)]
208
+ x = x[
209
+ (x >= self.target_dictionary.nspecial)
210
+ & (x < (self.num_symbols + self.target_dictionary.nspecial))
211
+ ]
212
+ if self.sil_id >= 0:
213
+ x = x[x != self.sil_id]
214
+
215
+ vocab_seen[x - self.target_dictionary.nspecial] = True
216
+
217
+ pred_units_arr = x
218
+ if self.cfg.ctc_eval:
219
+ pred_units_arr = pred_units_arr.unique_consecutive()
220
+ pred_units_arr = pred_units_arr[pred_units_arr != 0]
221
+
222
+ if id == 0:
223
+ if t is not None:
224
+ logger.info(f"REF: {self.target_dictionary.string(t)}")
225
+ logger.info(f"HYP: {self.target_dictionary.string(pred_units_arr)}")
226
+
227
+ if self.kenlm is not None:
228
+ if t is not None:
229
+ ref_lm_s = self.compute_lm_score(
230
+ self.target_dictionary.string(t)
231
+ )
232
+ logger.info(
233
+ f"LM [REF]: {ref_lm_s}, {math.pow(10, -ref_lm_s / (len(t) + 1))}"
234
+ )
235
+
236
+ hyp_lm_s = self.compute_lm_score(
237
+ self.target_dictionary.string(pred_units_arr)
238
+ )
239
+ logger.info(
240
+ f"LM [HYP]: {hyp_lm_s}, {math.pow(10, -hyp_lm_s / (len(pred_units_arr) + 1))}"
241
+ )
242
+
243
+ pred_units_arr = pred_units_arr.tolist()
244
+
245
+ pred_c_len += len(pred_units_arr)
246
+
247
+ if t is not None:
248
+ t = t.tolist()
249
+ c_err += editdistance.eval(pred_units_arr, t)
250
+ c_len += len(t)
251
+ else:
252
+ c_len = pred_c_len
253
+
254
+ if self.kenlm is not None:
255
+ pred_str = self.target_dictionary.string(pred_units_arr)
256
+ lm_score = self.compute_lm_score(pred_str)
257
+ lm_score_sum += lm_score
258
+
259
+ kaldi_score_sum = 0
260
+ word_lm_sum = 0
261
+ num_words = 0
262
+ if word_scores is not None:
263
+ for score, words in word_scores:
264
+ kaldi_score_sum += score
265
+ num_words += len(words)
266
+ if self.word_kenlm is not None:
267
+ word_lm_sum += self.kenlm.score(" ".join(words))
268
+
269
+ try:
270
+ world_size = get_data_parallel_world_size()
271
+ except:
272
+ world_size = 1
273
+
274
+ logging_output = {
275
+ "loss": c_err,
276
+ "_num_char_errors": c_err,
277
+ "_num_chars": c_len,
278
+ "_num_pred_chars": pred_c_len,
279
+ "ntokens": c_len,
280
+ "nsentences": z.size(0),
281
+ "sample_size": c_len,
282
+ "_world_size": world_size,
283
+ "_lm_score_sum": lm_score_sum,
284
+ "_kaldi_score_sum": kaldi_score_sum,
285
+ "_word_lm_sum": word_lm_sum,
286
+ "_num_words": num_words,
287
+ "_vocab_seen": vocab_seen,
288
+ }
289
+
290
+ return c_err, c_len, logging_output
291
+
292
+ def load_dataset(self, split: str, task_cfg: FairseqDataclass = None, **kwargs):
293
+ data_path = self.cfg.data
294
+ task_cfg = task_cfg or self.cfg
295
+
296
+ has_unpaired_text = os.path.exists(
297
+ os.path.join(self.cfg.text_data, f"{split}.idx")
298
+ )
299
+
300
+ self.datasets[split] = ExtractedFeaturesDataset(
301
+ path=data_path,
302
+ split=split,
303
+ min_length=3,
304
+ max_length=task_cfg.max_length,
305
+ labels=None if has_unpaired_text else task_cfg.labels,
306
+ label_dict=self.target_dictionary,
307
+ shuffle=getattr(task_cfg, "shuffle", True),
308
+ sort_by_length=task_cfg.sort_by_length,
309
+ aux_target_postfix=task_cfg.aux_target_postfix,
310
+ )
311
+
312
+ logger.info(f"split {split} has unpaired text? {has_unpaired_text}")
313
+ if has_unpaired_text:
314
+ text_dataset = data_utils.load_indexed_dataset(
315
+ os.path.join(self.cfg.text_data, split), self.target_dictionary
316
+ )
317
+ text_dataset = StripTokenDataset(text_dataset, self.target_dictionary.eos())
318
+ self.datasets[split] = RandomInputDataset(
319
+ self.datasets[split],
320
+ text_dataset,
321
+ ["random_label"],
322
+ add_to_input=True,
323
+ pad_idx=self.target_dictionary.pad(),
324
+ )
325
+
326
+ @property
327
+ def source_dictionary(self):
328
+ return self._source_dictionary
329
+
330
+ @property
331
+ def target_dictionary(self):
332
+ """Return the :class:`~fairseq.data.Dictionary` for the language
333
+ model."""
334
+ return self._target_dictionary
335
+
336
+ def max_positions(self):
337
+ """Maximum input length supported by the encoder."""
338
+ return None
339
+
340
+ def reduce_metrics(self, logging_outputs, criterion):
341
+ super().reduce_metrics(logging_outputs, criterion)
342
+
343
+ zero = torch.scalar_tensor(0.0)
344
+ num_char_errors = sum(
345
+ log.get("_num_char_errors", zero) for log in logging_outputs
346
+ )
347
+ num_chars = sum(log.get("_num_chars", zero) for log in logging_outputs)
348
+ num_word_errors = sum(
349
+ log.get("_num_word_errors", zero) for log in logging_outputs
350
+ )
351
+ num_words = sum(log.get("_num_words", zero) for log in logging_outputs)
352
+ num_pred_chars = sum(
353
+ log.get("_num_pred_chars", zero) for log in logging_outputs
354
+ )
355
+
356
+ lm_score_sum = sum(log.get("_lm_score_sum", zero) for log in logging_outputs)
357
+ vocab_seen = (
358
+ sum(log.get("_vocab_seen", zero) for log in logging_outputs)
359
+ .bool()
360
+ .sum()
361
+ .item()
362
+ )
363
+ kaldi_score_sum = sum(
364
+ log.get("_kaldi_score_sum", zero) for log in logging_outputs
365
+ )
366
+ word_lm_sum = sum(log.get("_word_lm_sum", zero) for log in logging_outputs)
367
+
368
+ metrics.log_scalar_sum("_num_char_errors", num_char_errors)
369
+ metrics.log_scalar_sum("_num_chars", num_chars)
370
+ metrics.log_scalar_sum("_num_word_errors", num_word_errors)
371
+ metrics.log_scalar_sum("_num_words", num_words)
372
+
373
+ metrics.log_scalar_sum("lm_score_sum", lm_score_sum)
374
+ metrics.log_scalar_sum("num_pred_chars", num_pred_chars)
375
+
376
+ if self.cfg.word_kenlm_path is not None:
377
+ metrics.log_scalar_sum("kaldi_score_sum", kaldi_score_sum)
378
+ metrics.log_scalar_sum("word_lm_sum", word_lm_sum)
379
+
380
+ if num_chars > 0:
381
+ metrics.log_derived(
382
+ "uer",
383
+ lambda meters: meters["_num_char_errors"].sum
384
+ * 100.0
385
+ / meters["_num_chars"].sum
386
+ if meters["_num_chars"].sum > 0
387
+ else float("nan"),
388
+ )
389
+
390
+ if lm_score_sum < 0 and vocab_seen > 0:
391
+ metrics.log_scalar("vocab_seen_pct", vocab_seen / self.num_symbols)
392
+
393
+ metrics.log_derived(
394
+ "weighted_lm_ppl",
395
+ lambda meters: math.pow(
396
+ 10,
397
+ -meters["lm_score_sum"].sum
398
+ / (
399
+ meters["num_pred_chars"].sum + meters["nsentences"].sum
400
+ ), # account for </s>
401
+ )
402
+ / meters["vocab_seen_pct"].avg ** self.cfg.vocab_usage_power,
403
+ )
404
+
405
+ metrics.log_derived(
406
+ "lm_ppl",
407
+ lambda meters: math.pow(
408
+ 10,
409
+ -meters["lm_score_sum"].sum
410
+ / (
411
+ meters["num_pred_chars"].sum + meters["nsentences"].sum
412
+ ), # account for </s>
413
+ ),
414
+ )
415
+ else:
416
+ metrics.log_derived("weighted_lm_ppl", lambda meters: float("inf"))
417
+
418
+ if num_words > 0:
419
+ if word_lm_sum != 0:
420
+ metrics.log_derived(
421
+ "word_lm_ppl",
422
+ lambda meters: math.pow(
423
+ 10,
424
+ -meters["word_lm_sum"].sum
425
+ / (
426
+ meters["_num_words"].sum + meters["nsentences"].sum
427
+ ), # account for </s>
428
+ ),
429
+ )
430
+ metrics.log_derived(
431
+ "weighted_word_lm_ppl",
432
+ lambda meters: math.pow(
433
+ 10,
434
+ -meters["word_lm_sum"].sum
435
+ / (
436
+ meters["_num_words"].sum + meters["nsentences"].sum
437
+ ), # account for </s>
438
+ )
439
+ / meters["vocab_seen_pct"].avg ** self.cfg.vocab_usage_power,
440
+ )
441
+
442
+ if self.cfg.word_kenlm_path is not None:
443
+ metrics.log_derived(
444
+ "kaldi_score",
445
+ lambda meters: meters["kaldi_score_sum"].sum
446
+ / meters["nsentences"].sum,
447
+ )
448
+
449
+ def build_model(self, cfg: FairseqDataclass, from_checkpoint=False):
450
+ model = super().build_model(cfg)
451
+
452
+ return model
fairseq/examples/wav2vec/xlsr/README.md ADDED
@@ -0,0 +1,95 @@
1
+ # XLS-R
2
+
3
+ XLS-R is a set of large-scale models for self-supervised cross-lingual speech representation learning based on wav2vec 2.0. It was pretrained on 128 languages and approximately 436K hours of unlabeled speech data. With finetuning, these models achieve state-of-the-art performance in speech translation, speech recognition and language identification. We evaluate the model across multiple benchmarks such as CoVoST-2 for speech translation; BABEL, MLS, CommonVoice and VoxPopuli for automatic speech recognition; and VoxLingua107 for language identification, as well as VoxCeleb1 for speaker identification. More details about this work can be found in our [paper](https://arxiv.org/pdf/2111.09296.pdf) and download links can be found below.
4
+
5
+ Model | Link
6
+ |------|------
7
+ XLS-R 300M | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/xlsr2_300m.pt)
8
+ XLS-R 1B | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/xlsr2_960m_1000k.pt)
9
+ XLS-R 2B | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/xlsr2_2B_1000k.pt)
10
+
11
+ You can also download these models [here](https://huggingface.co/models?other=xls_r) and read more about them in the [blog post](https://huggingface.co/blog/fine-tune-xlsr-wav2vec2) from Hugging Face.
12
+
13
+ ## Speech Translation Finetuned Models
14
+
15
+ We multilingually finetune XLS-R models on [CoVoST 2](https://github.com/facebookresearch/covost), which has 21
16
+ into-English and 15 out-of-English directions.
17
+
18
+ Model | Directions | Link
19
+ |------|------|------
20
+ XLS-R 300M | 21 langs &#8594; En | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/xls_r_300m_21_en.pt)
21
+ XLS-R 300M | En &#8594; 15 langs | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/xls_r_300m_en_15.pt)
22
+ XLS-R 1B | 21 langs &#8594; En | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/xls_r_1b_21_en.pt)
23
+ XLS-R 1B | En &#8594; 15 langs | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/xls_r_1b_en_15.pt)
24
+ XLS-R 2B | 21 langs &#8594; En | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/xls_r_2b_21_en.pt)
25
+ XLS-R 2B | En &#8594; 15 langs | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/xls_r_2b_en_15.pt)
26
+ XLS-R 2B | 21 langs &#8594; En + En &#8594; 15 langs | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/xls_r_2b_22_16.pt)
27
+
28
+ ## ASR Finetuning
29
+
30
+ You can refer to the original wav2vec documentation for detailed instructions on how to finetune a pretrained model with CTC [here](https://github.com/pytorch/fairseq/tree/main/examples/wav2vec#fine-tune-a-pre-trained-model-with-ctc). Below is an example command; the hyperparameter values needed to reproduce the results in our paper are given in the following sections.
31
+
32
+ ```shell script
33
+ $ fairseq-hydra-train \
34
+ distributed_training.distributed_port=$PORT \
35
+ task.data=/path/to/data \
36
+ model.w2v_path=/path/to/model.pt \
37
+ --config-dir /path/to/fairseq-py/examples/wav2vec/xlsr/config \
38
+ --config-name finetune
39
+ ```
40
+
41
+ For finetuning the 300M and 1B models, we use the same hyperparameter settings defined in `finetune.yaml`. We vary `optimization.max_update` as described in the table below, and `optimization.lr` is picked from the interval [2e-5, 3e-4] based on dev word error rate; a sketch of the corresponding override command is shown after the table.
42
+
43
+ Benchmark | Total Number of Updates
44
+ |------|------
45
+ Babel | 26000
46
+ Common Voice | 13000
47
+ VoxPopuli | 50000
48
+ MLS 10h | 20000
49
+
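+ For example (a sketch, not a command from the paper: paths are placeholders and the learning rate is just one value from the interval above), a Common Voice finetuning run of the 300M model could be launched with:
+
+ ```shell script
+ $ fairseq-hydra-train \
+     distributed_training.distributed_port=$PORT \
+     task.data=/path/to/data \
+     model.w2v_path=/path/to/xlsr2_300m.pt \
+     optimization.max_update=13000 \
+     'optimization.lr=[0.0001]' \
+     --config-dir /path/to/fairseq-py/examples/wav2vec/xlsr/config \
+     --config-name finetune
+ ```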
50
+ For finetuning the 2B model, we make some additional changes to `finetune.yaml`. We use the fully_sharded `distributed_training.ddp_backend` provided by the [fairscale](https://github.com/facebookresearch/fairscale) library and set `model.activation_checkpoint` to true. We also increase `dataset.max_tokens` to 2560000 and use a total effective batch size of 2560000 * 24 tokens. We sweep for the best `optimization.lr` within the interval [3e-6, 3e-5] based on dev error rate. For the Common Voice dataset, we pick `model.mask_prob` per language from {0.30, 0.40} based on the best dev error rate. A sketch of the corresponding overrides is shown below.
51
+
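+ As a sketch of the 2B setup (again with placeholder paths and an illustrative learning rate from the interval above; the activation-checkpointing key is written here as it appears in `finetune.yaml`), the overrides could look like:
+
+ ```shell script
+ $ fairseq-hydra-train \
+     distributed_training.distributed_port=$PORT \
+     distributed_training.ddp_backend=fully_sharded \
+     task.data=/path/to/data \
+     model.w2v_path=/path/to/xlsr2_2B_1000k.pt \
+     model.checkpoint_activations=true \
+     dataset.max_tokens=2560000 \
+     'optimization.lr=[0.00001]' \
+     --config-dir /path/to/fairseq-py/examples/wav2vec/xlsr/config \
+     --config-name finetune
+ ```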
52
+ ## LID Inference
53
+
54
+ Model | Link
55
+ |------|------
56
+ XLS-R 300M + ft Voxlingua107 | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/xlsr_300m_voxlingua107_ft.pt)
57
+
58
+ How to run inference & calculate accuracy (step-by-step):
59
+ 1. Download the Voxlingua107 checkpoint from the table above.
60
+ 2. Use this python script to extract logit/embedding from the XLSR model: https://github.com/fairinternal/fairseq-py/blob/xlsr2/examples/wav2vec/gen_audio_embedding.py
61
+ ```shell command
62
+ CUDA_VISIBLE_DEVICES=0 PYTHONPATH=. python3 examples/wav2vec/gen_audio_embedding.py \
63
+ /fsx/data/VoxLingua107/manifest --path "/path/to/checkpoint.pt" \
64
+ --task audio_classification --batch-size 90 --gen-subset test \
65
+ --infer-manifest /fsx/data/VoxLingua107/manifest/test.tsv \
66
+ --infer-xtimes 10 --infer-max-sample-size 160000 --output-path /tmp/tmp_voxling_infer.npz
67
+ ```
68
+
69
+ 3. Calculate the overall accuracy, as well as accuracy for 0-5 second and 5-20 second utterances:
70
+ ```shell command
71
+ PYTHONPATH='.' python examples/wav2vec/eval_speaker_clf_task.py \
72
+ --task cls --merge mean_logit --data /tmp/tmp_voxling_infer.npz
73
+
74
+ Output:
75
+ | run classification evaluation
76
+ | acc = 94.34% -- err = 5.66% -- correct=1518 total=1609
77
+ | acc 0to5 = 90.91% -- err = 9.09% -- c_5=230.0 t_5=253
78
+ | acc 5to20 = 94.99% -- err = 5.01% -- c_20=1288.0 t_20=1356
79
+ ```
80
+
81
+ ## Citation
82
+
83
+ Please cite as:
84
+
85
+ ``` bibtex
86
+ @article{babu2021xlsr,
87
+ title={XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale},
88
+ author={Arun Babu and Changhan Wang and Andros Tjandra and Kushal Lakhotia and Qiantong Xu and Naman Goyal and Kritika Singh and Patrick von Platen and Yatharth Saraf and Juan Pino and Alexei Baevski and Alexis Conneau and Michael Auli},
89
+ year={2021},
90
+ volume={abs/2111.09296},
91
+ journal={arXiv},
92
+ }
93
+ ```
94
+
95
+
fairseq/examples/wav2vec/xlsr/config/finetune.yaml ADDED
@@ -0,0 +1,66 @@
1
+ # @package _group_
2
+
3
+ common:
4
+ fp16: true
5
+ log_format: json
6
+ log_interval: 200
7
+ tensorboard_logdir: tb
8
+
9
+ checkpoint:
10
+ save_interval: 1000
11
+ save_interval_updates: 1000
12
+ keep_interval_updates: 1
13
+ no_epoch_checkpoints: true
14
+ best_checkpoint_metric: wer
15
+
16
+ task:
17
+ _name: audio_finetuning
18
+ data: ???
19
+ normalize: true
20
+ labels: ltr
21
+
22
+ dataset:
23
+ num_workers: 6
24
+ max_tokens: 1280000
25
+ skip_invalid_size_inputs_valid_test: true
26
+ validate_after_updates: 10000
27
+ validate_interval_updates: 1000
28
+ valid_subset: valid
29
+
30
+ distributed_training:
31
+ ddp_backend: legacy_ddp
32
+ distributed_world_size: 4
33
+
34
+ criterion:
35
+ _name: ctc
36
+ zero_infinity: true
37
+
38
+ optimization:
39
+ max_update: ???
40
+ lr: [0.0003]
41
+ sentence_avg: true
42
+ update_freq: [5]
43
+
44
+ optimizer:
45
+ _name: adam
46
+ adam_betas: (0.9,0.98)
47
+ adam_eps: 1e-08
48
+
49
+ lr_scheduler:
50
+ _name: tri_stage
51
+ phase_ratio: [0.1, 0.4, 0.5]
52
+ final_lr_scale: 0.05
53
+
54
+ model:
55
+ _name: wav2vec_ctc
56
+ w2v_path: ???
57
+ apply_mask: true
58
+ mask_prob: 0.75
59
+ mask_channel_prob: 0.25
60
+ mask_channel_length: 64
61
+ layerdrop: 0.1
62
+ activation_dropout: 0.1
63
+ feature_grad_mult: 0.0
64
+ freeze_finetune_updates: 10000
65
+
66
+ checkpoint_activations: false
fairseq/examples/wav2vec/xlsr/scripts/eval_speaker_clf_task.py ADDED
@@ -0,0 +1,173 @@
1
+ """
2
+ Usage:
3
+ This script evaluates the classification accuracy / error rate from the embeddings extracted
4
+ by gen_audio_embedding.py
5
+ Example (LID classification)
6
+
7
+ PYTHONPATH='.' python examples/wav2vec/eval_speaker_clf_task.py \
8
+ --data /fsx/androstj/exps/lid_voxlingua/infer/atj_xlsr2_100pct_300M_mean_fast_upd_100k_new.npz \
9
+ --task cls --merge mean_logit
10
+ """
11
+ import numpy as np
12
+ import sklearn
13
+ from sklearn.metrics.pairwise import cosine_similarity
14
+ from sklearn.preprocessing import StandardScaler
15
+ from tqdm import tqdm
16
+ import ipdb
17
+ import logging
18
+ import argparse
19
+ from scipy.special import softmax
20
+
21
+ log=logging.getLogger(__name__)
22
+ log.setLevel(logging.INFO)
23
+
24
+ def calculate_eer(y_label, y_score):
25
+ # y_label denotes ground-truth labels,
26
+ # y_score denotes the prediction scores.
27
+ from scipy.optimize import brentq
28
+ from sklearn.metrics import roc_curve
29
+ from scipy.interpolate import interp1d
30
+
31
+ fpr, tpr, thresholds = roc_curve(y_label, y_score, pos_label=1)
32
+ eer = brentq(lambda x : 1. - x - interp1d(fpr, tpr)(x), 0., 1.)
33
+ optimal_threshold = interp1d(fpr, thresholds)(eer)
34
+ return eer, optimal_threshold
35
+
36
+ def calculate_minDCF(y_label, y_score, p_target=0.01, c_miss=1, c_fa=1):
37
+ # https://github.com/kaldi-asr/kaldi/blob/master/egs/sre08/v1/sid/compute_min_dcf.py
38
+ from sklearn.metrics import det_curve
39
+ fpr, fnr, thresholds = det_curve(y_label, y_score, pos_label=1)
40
+ min_c_det = float("inf")
41
+ min_c_det_threshold = thresholds[0]
42
+ for i in range(0, len(fpr)):
43
+ # See Equation (2). it is a weighted sum of false negative
44
+ # and false positive errors.
45
+ c_det = c_miss * fnr[i] * p_target + c_fa * fpr[i] * (1 - p_target)
46
+ if c_det < min_c_det:
47
+ min_c_det = c_det
48
+ min_c_det_threshold = thresholds[i]
49
+ # See Equations (3) and (4). Now we normalize the cost.
50
+ c_def = min(c_miss * p_target, c_fa * (1 - p_target))
51
+ min_dcf = min_c_det / c_def
52
+ return min_dcf, min_c_det_threshold
53
+
54
+
55
+ if __name__ == '__main__':
56
+ parser = argparse.ArgumentParser()
57
+ parser.add_argument('--data', help='npz contains name & latent file')
58
+ parser.add_argument('--task', choices=['cls', 'veri', 'cls_voxlingua'])
59
+ parser.add_argument('--merge', choices=['mean_logit', 'first_logit', 'mean_latent_sim', 'first_latent_sim', 'mean_logit_sim', 'first_logit_sim'])
60
+ parser.add_argument('--veri-pair', help='verification file contains 1/0 utt_x utt_y')
61
+ parser.add_argument('--scaler', type=str, choices=['mean_var'])
62
+ parser.add_argument('--compress-method', choices=['pca'])
63
+ parser.add_argument('--compress-dim', type=int)
64
+ args = parser.parse_args()
65
+
66
+ if args.task in ['cls', 'cls_voxlingua']:
67
+ print('| run classification evaluation')
68
+ data = np.load(args.data)
69
+ data_logit = data['logit']
70
+ data_target = data['target']
71
+ data_src_len = data['src_len']
72
+ assert data_logit.shape[0] == data_target.shape[0]
73
+ B = data_logit.shape[0]
74
+ correct = 0
75
+ total = 0
76
+ data_prob = softmax(data_logit, axis=2)
77
+ correct_vs_len = np.empty((B, 2))
78
+ for ii in range(B):
79
+ _target = data_target[ii]
80
+ if args.merge == 'mean_logit':
81
+ _prob = np.mean(data_prob[ii], axis=0)
82
+ top_1 = np.argmax(_prob)
83
+ elif args.merge == 'first_logit':
84
+ _prob = data_prob[ii][0]
85
+ top_1 = np.argmax(_prob)
86
+ else :
87
+ raise ValueError()
88
+ is_top_1 = (1 if top_1 == _target else 0)
89
+ correct += is_top_1
90
+ total += 1
91
+ _src_len = data_src_len[ii] / 16000
92
+ correct_vs_len[ii] = [is_top_1, _src_len]
93
+
94
+ acc = correct / total * 100
95
+ t_5 = correct_vs_len[:, 1] <= 5
96
+ t_20 = correct_vs_len[:, 1] > 5
97
+ c_5 = correct_vs_len[t_5, 0].sum()
98
+ c_20 = correct_vs_len[t_20, 0].sum()
99
+ t_5 = t_5.sum()
100
+ t_20 = t_20.sum()
101
+ acc_5 = c_5 / t_5 * 100
102
+ acc_20 = c_20 / t_20 * 100
103
+ print(f'| acc = {acc:.2f}% -- err = {100-acc:.2f}% -- {correct=} {total=}')
104
+ print(f'| acc 0to5 = {acc_5:.2f}% -- err = {100-acc_5:.2f}% -- {c_5=} {t_5=}')
105
+ print(f'| acc 5to20 = {acc_20:.2f}% -- err = {100-acc_20:.2f}% -- {c_20=} {t_20=}')
106
+
107
+
108
+
109
+ if args.task == 'veri':
110
+ print('| run verification evaluation')
111
+ veri_pairs = []
112
+ with open(args.veri_pair) as ff:
113
+ for fi in ff:
114
+ a,b,c = fi.split()
115
+ a = int(a)
116
+ veri_pairs.append([a,b,c])
117
+
118
+ data = np.load(args.data)
119
+ if 'logit' in args.merge:
120
+ data_latent = data['logit']
121
+ elif 'latent' in args.merge:
122
+ data_latent = data['latent']
123
+ else :
124
+ raise ValueError()
125
+
126
+ data_name = data['name']
127
+ assert len(data_name) == len(data_latent)
128
+ map_name_latent = {}
129
+
130
+ from sklearn.pipeline import make_pipeline
131
+ pipe = []
132
+ if args.scaler == 'mean_var':
133
+ print(f'| apply StandardScaler')
134
+ pipe.append(StandardScaler())
135
+
136
+ if args.compress_method == 'pca':
137
+ n_comp = args.compress_dim
138
+ print(f'| apply PCA with {n_comp=}')
139
+ from sklearn.decomposition import PCA
140
+ pipe.append(PCA(n_components=n_comp))
141
+ if len(pipe) > 0 :
142
+ pipe = make_pipeline(*pipe)
143
+ data_latent_2d = data_latent.reshape(-1, data_latent.shape[-1])
144
+ pipe.fit(data_latent_2d)
145
+ data_latent_2d = pipe.transform(data_latent_2d)
146
+ data_latent = data_latent_2d.reshape(data_latent.shape[0], data_latent.shape[1], -1)
147
+
148
+ for ii in range(len(data_name)):
149
+ map_name_latent[data_name[ii]] = data_latent[ii]
150
+ labels = []
151
+ scores = []
152
+ for lbl, pair_a, pair_b in tqdm(veri_pairs):
153
+ labels.append(lbl)
154
+ pair_a = map_name_latent[pair_a]
155
+ pair_b = map_name_latent[pair_b]
156
+ assert pair_a.ndim == pair_b.ndim == 2
157
+ score = cosine_similarity(pair_a, pair_b)
158
+ if args.merge.startswith('mean'):
159
+ score = np.mean(score)
160
+ elif args.merge.startswith('first'):
161
+ score = score[0, 0]
162
+ else :
163
+ raise ValueError()
164
+ scores.append(score)
165
+ labels = np.array(labels)
166
+ scores = np.array(scores)
167
+ eer, eer_threshold = calculate_eer(labels, scores)
168
+ minDCF, minDCF_threshold = calculate_minDCF(labels, scores)
169
+ print('='*40)
170
+ print(f'| EER = {eer*100:.2f}%\tthreshold = {eer_threshold:.2f}')
171
+ print(f'| minDCF = {minDCF:.2f}\tthreshold = {minDCF_threshold:.2f}')
172
+
173
+
fairseq/examples/wav2vec/xlsr/scripts/gen_audio_embedding.py ADDED
@@ -0,0 +1,222 @@
1
+ """
2
+ Usage:
3
+ This script is used to extract the embedding / logit for the speech classification task.
4
+ 1. Set fdir to your model checkpoint directory
5
+ 2. Run the following command (preferably on a GPU machine to speed up the inference process)
6
+
7
+ CUDA_VISIBLE_DEVICES=0 python3 examples/wav2vec/gen_audio_embedding.py /fsx/data/VoxLingua107/manifest --path ${fdir} \
8
+ --task audio_classification --batch-size 90 --gen-subset test \
9
+ --infer-manifest /fsx/data/VoxLingua107/manifest/test.tsv \
10
+ --infer-xtimes 10 --infer-max-sample-size 160000 --output-path $odir
11
+
12
+ Example:
13
+ Case: LID logit extraction
14
+ fdir='/fsx/androstj/exps/voxlingua_lid_train_all/ckpt_100pct_300m_voxling-act_linear-pool_mean_fast-lr_1e-4-phase_0.1_0.4_0.5-maxupd_100000-ufreq_1-mprob_0.5-fz_0-cr_softmax/0/checkpoints/checkpoint_best.pt'
15
+ python3 examples/wav2vec/gen_audio_embedding.py /fsx/data/VoxLingua107/manifest --path ${fdir} \
16
+ --task audio_classification --batch-size 90 --gen-subset test \
17
+ --infer-manifest /fsx/data/VoxLingua107/manifest/test.tsv \
18
+ --infer-xtimes 10 --infer-max-sample-size 160000 --output-path $odir
19
+
20
+ """
21
+ import torch
22
+ from fairseq import checkpoint_utils, distributed_utils, options, utils
23
+ from fairseq.dataclass.utils import convert_namespace_to_omegaconf
24
+ from fairseq.logging import metrics, progress_bar
25
+ from fairseq import checkpoint_utils, data, options, tasks
26
+ from fairseq.data import FileAudioDataset, AddTargetDataset, Dictionary
27
+ from fairseq.tasks.audio_classification import LabelEncoder
28
+ import ipdb
29
+ import copy
30
+ import sys
31
+ from tqdm import tqdm
32
+ import tempfile
33
+ import numpy as np
34
+ import sklearn
35
+
36
+ def subset_manifest(infer_manifest, veri_pair):
37
+ with open(infer_manifest) as ff, open(veri_pair) as gg, \
38
+ tempfile.NamedTemporaryFile('w', delete=False) as ww:
39
+ fnames = ff.read().strip().split("\n")
40
+ basedir = fnames[0]
41
+ needed_fname = []
42
+ for gi in gg.read().strip().split('\n'):
43
+ _, x1, x2 = gi.split()
44
+ needed_fname.append(x1)
45
+ needed_fname.append(x2)
46
+ needed_fname = set(needed_fname)
47
+
48
+ ww.write(basedir+'\n')
49
+ for ii in range(1, len(fnames)):
50
+ x1,x2 = fnames[ii].split()
51
+ if x1 in needed_fname:
52
+ ww.write(fnames[ii]+'\n')
53
+ print(f'| subset manifest for verification: {ww.name}')
54
+ return ww.name
55
+
56
+ def wrap_target_dataset(infer_manifest, dataset, task):
57
+ label_path = infer_manifest.replace(".tsv", ".label")
58
+ with open(label_path, "r") as f:
59
+ labels = f.read().strip().split("\n")
60
+ assert len(labels) == len(dataset)
61
+ process_label = LabelEncoder(task.target_dictionary)
62
+ dataset = AddTargetDataset(dataset, labels,
63
+ pad=task.target_dictionary.pad(),
64
+ eos=task.target_dictionary.eos(),
65
+ batch_targets=True,
66
+ process_label=process_label,
67
+ add_to_input=False)
68
+ return dataset
69
+
70
+ def resample_data(source, padding_mask, n_sample, max_sample_len):
71
+ # source: BxT
72
+ # padding_mask: BxT
73
+ B = source.shape[0]
74
+ T = source.shape[1]
75
+ sources = []
76
+ padding_masks = []
77
+ seq_len = (~padding_mask).sum(1)
78
+ for jj in range(n_sample):
79
+ new_source = source.new_zeros(B, max_sample_len)
80
+ new_padding_mask = padding_mask.new_zeros(B, max_sample_len)
81
+ for ii in range(B):
82
+ if seq_len[ii] > max_sample_len:
83
+ start = np.random.randint(0, seq_len[ii]-max_sample_len+1)
84
+ end = start + max_sample_len
85
+ else :
86
+ start = 0
87
+ end = seq_len[ii]
88
+ new_source[ii, 0:end-start] = source[ii, start:end]
89
+ new_padding_mask[ii, end-start+1:] = True
90
+ sources.append(new_source)
91
+ padding_masks.append(new_padding_mask)
92
+ return sources, padding_masks
93
+
94
+ def resample_sample(sample, n_sample, max_sample_len):
95
+ new_sources, new_padding_masks = resample_data(sample['net_input']['source'], sample['net_input']['padding_mask'], n_sample, max_sample_len)
96
+ new_samples = []
97
+ for ii in range(n_sample):
98
+ new_sample = copy.deepcopy(sample)
99
+ new_sample['net_input']['source'] = new_sources[ii]
100
+ new_sample['net_input']['padding_mask'] = new_padding_masks[ii]
101
+ new_samples.append(new_sample)
102
+ return new_samples
103
+
104
+ if __name__ == '__main__':
105
+ np.random.seed(123)
106
+ # Parse command-line arguments for generation
107
+ parser = options.get_generation_parser(default_task='audio_classification')
108
+ # parser.add_argument('--infer-merge', type=str, default='mean')
109
+ parser.add_argument('--infer-xtimes', type=int, default=1)
110
+ parser.add_argument('--infer-max-sample-size', type=int, default=5*16000) # 5 secs
111
+ parser.add_argument('--infer-manifest', type=str)
112
+ parser.add_argument('--verification-pair', type=str, required=False,
113
+ help='''
114
+ a file that contains pairs of utts to evaluated if they are from same speaker or not
115
+ format: (following voxceleb)
116
+ 1/0 <wav_pair_a> <wav_pair_b>
117
+ ''')
118
+ parser.add_argument('--output-path', type=str)
119
+ # parser.add_argument('--infer-xtimes', type=int, default=1)
120
+
121
+ args = options.parse_args_and_arch(parser)
122
+ # Setup task
123
+ # task = tasks.setup_task(args)
124
+ use_cuda = not args.cpu
125
+
126
+ # Load model & task
127
+ print('| loading model from {}'.format(args.path))
128
+ arg_overrides = {
129
+ 'data': args.data,
130
+ # 'mask_prob': 0
131
+ #'max_sample_size': sys.maxsize,
132
+ #'min_sample_size': 0,
133
+ }
134
+ state = checkpoint_utils.load_checkpoint_to_cpu(args.path)
135
+ # move to AWS
136
+ state['cfg']['model']['w2v_path'] = state['cfg']['model']['w2v_path'].replace('/checkpoint/arbabu/XLSR2/model_versions/', '/fsx/data/model_versions/').replace('/checkpoint/kushall/final_model_checkpoints/wav2vec2/', '/fsx/data/wav2vec_ckpt/')
137
+ state['cfg']['task']['data'] = state['cfg']['task']['data'].replace('/checkpoint/kushall/data/', '/fsx/data/')
138
+
139
+ models, _model_args, task = checkpoint_utils.load_model_ensemble_and_task([args.path],
140
+ arg_overrides=arg_overrides,
141
+ task=None,
142
+ state=state)
143
+ model = models[0]
144
+ model.eval()
145
+ if use_cuda:
146
+ model.cuda()
147
+
148
+
149
+ # Load dataset
150
+ task.load_dataset(args.gen_subset)
151
+ dataset = task.dataset(args.gen_subset)
152
+ infer_manifest = args.infer_manifest
153
+ # only decode needed utts
154
+ # infer_manifest = subset_manifest(infer_manifest,
155
+ # args.verification_pair)
156
+ infer_dataset = FileAudioDataset(infer_manifest,
157
+ sample_rate=task.cfg.sample_rate,
158
+ max_sample_size=10**10, #task.cfg.max_sample_size,
159
+ min_sample_size=1, #task.cfg.min_sample_size,
160
+ pad=True,
161
+ normalize=task.cfg.normalize)
162
+ # add target (if needed)
163
+ infer_dataset = wrap_target_dataset(infer_manifest, infer_dataset, task)
164
+ itr = task.get_batch_iterator(
165
+ dataset=infer_dataset,
166
+ max_sentences=args.batch_size,
167
+ ).next_epoch_itr(shuffle=False)
168
+
169
+
170
+ # correct = 0
171
+ # total = 0
172
+ list_uttname = []
173
+ list_latent = []
174
+ list_logit = []
175
+ list_target = []
176
+ list_src_len = []
177
+ with torch.no_grad():
178
+ for _, sample in tqdm(enumerate(itr)):
179
+ # resample if needed
180
+ samples = resample_sample(sample, args.infer_xtimes, args.infer_max_sample_size)
181
+ list_uttname.extend(sample['name'])
182
+ list_target.extend(sample['target'][:, 0].cpu().numpy())
183
+ list_src_len.extend((~sample['net_input']['padding_mask']).sum(1).cpu().numpy())
184
+ latents = []
185
+ logits = []
186
+ for sample in samples:
187
+ sample = utils.move_to_cuda(sample) if use_cuda else sample
188
+ try:
189
+ latent = model.forward_latent(**sample['net_input'])
190
+ latents.append(latent.detach().cpu().numpy())
191
+ except:
192
+ latent = None
193
+ logit = model.forward(**sample['net_input'])
194
+ logits.append(logit.detach().cpu().numpy())
195
+
196
+ if len(latents) > 0:
197
+ latents = np.stack(latents, 1) # B,X,D
198
+ logits = np.stack(logits, 1) # B,X,Cls
199
+ list_latent.extend(latents)
200
+ list_logit.extend(logits)
201
+
202
+ # create big npz
203
+ list_uttname = np.array(list_uttname)
204
+ list_latent = np.array(list_latent)
205
+ list_target = np.array(list_target)
206
+ list_logit = np.array(list_logit)
207
+ list_src_len = np.array(list_src_len)
208
+ # save to npz
209
+ output_path = args.output_path
210
+ if (output_path is None):
211
+ output_path = tempfile.NamedTemporaryFile('wb', delete=False).name
212
+
213
+ with open(output_path, 'wb') as ww:
214
+ np.savez(ww, name=list_uttname,
215
+ latent=list_latent,
216
+ target=list_target,
217
+ logit=list_logit,
218
+ src_len=list_src_len)
219
+
220
+ print("="*10 + " REPORT " + "="*10)
221
+ print(f'| latent saved in {output_path}')
222
+ print(f'| {list_uttname.shape=}, {list_latent.shape=}, {list_target.shape=}, {list_logit.shape=}, {list_src_len.shape=}')
fairseq/examples/wmt19/README.md ADDED
@@ -0,0 +1,85 @@
1
+ # WMT 19
2
+
3
+ This page provides pointers to the models of Facebook-FAIR's WMT'19 news translation task submission [(Ng et al., 2019)](https://arxiv.org/abs/1907.06616).
4
+
5
+ ## Pre-trained models
6
+
7
+ Model | Description | Download
8
+ ---|---|---
9
+ `transformer.wmt19.en-de` | En->De Ensemble | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-de.joined-dict.ensemble.tar.gz)
10
+ `transformer.wmt19.de-en` | De->En Ensemble | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt19.de-en.joined-dict.ensemble.tar.gz)
11
+ `transformer.wmt19.en-ru` | En->Ru Ensemble | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-ru.ensemble.tar.gz)
12
+ `transformer.wmt19.ru-en` | Ru->En Ensemble | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt19.ru-en.ensemble.tar.gz)
13
+ `transformer_lm.wmt19.en` | En Language Model | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.en.tar.gz)
14
+ `transformer_lm.wmt19.de` | De Language Model | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.de.tar.gz)
15
+ `transformer_lm.wmt19.ru` | Ru Language Model | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.ru.tar.gz)
16
+
17
+ ## Pre-trained single models before finetuning
18
+
19
+ Model | Description | Download
20
+ ---|---|---
21
+ `transformer.wmt19.en-de` | En->De Single, no finetuning | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-de.ffn8192.tar.gz)
22
+ `transformer.wmt19.de-en` | De->En Single, no finetuning | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt19.de-en.ffn8192.tar.gz)
23
+ `transformer.wmt19.en-ru` | En->Ru Single, no finetuning | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-ru.ffn8192.tar.gz)
24
+ `transformer.wmt19.ru-en` | Ru->En Single, no finetuning | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt19.ru-en.ffn8192.tar.gz)
25
+
26
+ ## Example usage (torch.hub)
27
+
28
+ #### Requirements
29
+
30
+ We require a few additional Python dependencies for preprocessing:
31
+ ```bash
32
+ pip install fastBPE sacremoses
33
+ ```
34
+
35
+ #### Translation
36
+
37
+ ```python
38
+ import torch
39
+
40
+ # English to German translation
41
+ en2de = torch.hub.load('pytorch/fairseq', 'transformer.wmt19.en-de', checkpoint_file='model1.pt:model2.pt:model3.pt:model4.pt',
42
+ tokenizer='moses', bpe='fastbpe')
43
+ en2de.translate("Machine learning is great!") # 'Maschinelles Lernen ist großartig!'
44
+
45
+ # German to English translation
46
+ de2en = torch.hub.load('pytorch/fairseq', 'transformer.wmt19.de-en', checkpoint_file='model1.pt:model2.pt:model3.pt:model4.pt',
47
+ tokenizer='moses', bpe='fastbpe')
48
+ de2en.translate("Maschinelles Lernen ist großartig!") # 'Machine learning is great!'
49
+
50
+ # English to Russian translation
51
+ en2ru = torch.hub.load('pytorch/fairseq', 'transformer.wmt19.en-ru', checkpoint_file='model1.pt:model2.pt:model3.pt:model4.pt',
52
+ tokenizer='moses', bpe='fastbpe')
53
+ en2ru.translate("Machine learning is great!") # 'Машинное обучение - это здорово!'
54
+
55
+ # Russian to English translation
56
+ ru2en = torch.hub.load('pytorch/fairseq', 'transformer.wmt19.ru-en', checkpoint_file='model1.pt:model2.pt:model3.pt:model4.pt',
57
+ tokenizer='moses', bpe='fastbpe')
58
+ ru2en.translate("Машинное обучение - это здорово!") # 'Machine learning is great!'
59
+ ```
60
+
61
+ #### Language Modeling
62
+
63
+ ```python
64
+ # Sample from the English LM
65
+ en_lm = torch.hub.load('pytorch/fairseq', 'transformer_lm.wmt19.en', tokenizer='moses', bpe='fastbpe')
66
+ en_lm.sample("Machine learning is") # 'Machine learning is the future of computing, says Microsoft boss Satya Nadella ...'
67
+
68
+ # Sample from the German LM
69
+ de_lm = torch.hub.load('pytorch/fairseq', 'transformer_lm.wmt19.de', tokenizer='moses', bpe='fastbpe')
70
+ de_lm.sample("Maschinelles lernen ist") # 'Maschinelles lernen ist das A und O (neues-deutschland.de) Die Arbeitsbedingungen für Lehrerinnen und Lehrer sind seit Jahren verbesserungswürdig ...'
71
+
72
+ # Sample from the Russian LM
73
+ ru_lm = torch.hub.load('pytorch/fairseq', 'transformer_lm.wmt19.ru', tokenizer='moses', bpe='fastbpe')
74
+ ru_lm.sample("машинное обучение это") # 'машинное обучение это то, что мы называем "искусственным интеллектом".'
75
+ ```
76
+
77
+ ## Citation
78
+ ```bibtex
79
+ @inproceedings{ng2019facebook,
80
+ title = {Facebook FAIR's WMT19 News Translation Task Submission},
81
+ author = {Ng, Nathan and Yee, Kyra and Baevski, Alexei and Ott, Myle and Auli, Michael and Edunov, Sergey},
82
+ booktitle = {Proc. of WMT},
83
+ year = 2019,
84
+ }
85
+ ```