ssl-aasist · PyTorch · custom_code

Commit 7b2dad9 · verified · 1 parent: 66a0dab
ash56 committed:

Add files using upload-large-folder tool

This view is limited to 50 files because the commit contains too many changes. See the raw diff for the remaining files.
Files changed (50)
  1. fairseq/examples/__pycache__/__init__.cpython-310.pyc +0 -0
  2. fairseq/examples/wav2vec/unsupervised/config/timit_matched/test.uid +192 -0
  3. fairseq/examples/wav2vec/unsupervised/config/timit_unmatched/train_text.uid +1000 -0
  4. fairseq/examples/wav2vec/unsupervised/config/timit_unmatched/valid.uid +620 -0
  5. fairseq/examples/wav2vec/unsupervised/data/__init__.py +13 -0
  6. fairseq/examples/wav2vec/unsupervised/data/extracted_features_dataset.py +167 -0
  7. fairseq/examples/wav2vec/unsupervised/kaldi_self_train/st/cmd.sh +15 -0
  8. fairseq/examples/wav2vec/unsupervised/kaldi_self_train/st/local/prepare_lang.sh +37 -0
  9. fairseq/examples/wav2vec/unsupervised/kaldi_self_train/st/local/prepare_lm.sh +35 -0
  10. fairseq/examples/wav2vec/unsupervised/kaldi_self_train/st/local/score.sh +63 -0
  11. fairseq/examples/wav2vec/unsupervised/kaldi_self_train/st/local/show_wer.sh +52 -0
  12. fairseq/examples/wav2vec/unsupervised/kaldi_self_train/st/local/train_subset_lgbeam.sh +129 -0
  13. fairseq/examples/wav2vec/unsupervised/kaldi_self_train/st/local/unsup_select.py +135 -0
  14. fairseq/examples/wav2vec/unsupervised/kaldi_self_train/st/local/unsup_select_decode.sh +37 -0
  15. fairseq/examples/wav2vec/unsupervised/kaldi_self_train/st/local/unsup_select_decode_word.sh +35 -0
  16. fairseq/examples/wav2vec/unsupervised/kaldi_self_train/st/steps_gan/train_deltas.sh +175 -0
  17. fairseq/examples/wav2vec/unsupervised/kaldi_self_train/st/steps_gan/train_lda_mllt.sh +239 -0
  18. fairseq/examples/wav2vec/unsupervised/kaldi_self_train/st/steps_gan/train_sat.sh +281 -0
  19. fairseq/examples/wav2vec/unsupervised/models/__init__.py +11 -0
  20. fairseq/examples/wav2vec/unsupervised/models/wav2vec_u.py +687 -0
  21. fairseq/examples/wav2vec/unsupervised/scripts/apply_pca.py +76 -0
  22. fairseq/examples/wav2vec/unsupervised/scripts/copy_labels.py +10 -0
  23. fairseq/examples/wav2vec/unsupervised/scripts/filter_lexicon.py +40 -0
  24. fairseq/examples/wav2vec/unsupervised/scripts/filter_tsv.py +37 -0
  25. fairseq/examples/wav2vec/unsupervised/scripts/g2p_wrd_to_phn.py +45 -0
  26. fairseq/examples/wav2vec/unsupervised/scripts/ltr_to_wrd.py +16 -0
  27. fairseq/examples/wav2vec/unsupervised/scripts/mean_pool.py +99 -0
  28. fairseq/examples/wav2vec/unsupervised/scripts/merge_clusters.py +114 -0
  29. fairseq/examples/wav2vec/unsupervised/scripts/normalize_and_filter_text.py +72 -0
  30. fairseq/examples/wav2vec/unsupervised/scripts/normalize_text.py +22 -0
  31. fairseq/examples/wav2vec/unsupervised/scripts/pca.py +53 -0
  32. fairseq/examples/wav2vec/unsupervised/scripts/phonemize_with_sil.py +83 -0
  33. fairseq/examples/wav2vec/unsupervised/scripts/prepare_audio.sh +78 -0
  34. fairseq/examples/wav2vec/unsupervised/scripts/prepare_audio_v2.sh +68 -0
  35. fairseq/examples/wav2vec/unsupervised/scripts/prepare_text.sh +83 -0
  36. fairseq/examples/wav2vec/unsupervised/scripts/prepare_timit.sh +79 -0
  37. fairseq/examples/wav2vec/unsupervised/scripts/remove_silence.py +63 -0
  38. fairseq/examples/wav2vec/unsupervised/scripts/vads.py +98 -0
  39. fairseq/examples/wav2vec/unsupervised/scripts/wav2vec_apply_cluster_faiss.py +128 -0
  40. fairseq/examples/wav2vec/unsupervised/scripts/wav2vec_cluster_faiss.py +210 -0
  41. fairseq/examples/wav2vec/unsupervised/scripts/wav2vec_extract_features.py +119 -0
  42. fairseq/examples/wav2vec/unsupervised/scripts/wer.py +82 -0
  43. fairseq/examples/wav2vec/unsupervised/scripts/wrd_to_ltr.py +16 -0
  44. fairseq/examples/wav2vec/unsupervised/tasks/__init__.py +11 -0
  45. fairseq/examples/wav2vec/unsupervised/tasks/unpaired_audio_text.py +452 -0
  46. fairseq/examples/wav2vec/xlsr/README.md +95 -0
  47. fairseq/examples/wav2vec/xlsr/config/finetune.yaml +66 -0
  48. fairseq/examples/wav2vec/xlsr/scripts/eval_speaker_clf_task.py +173 -0
  49. fairseq/examples/wav2vec/xlsr/scripts/gen_audio_embedding.py +222 -0
  50. fairseq/examples/wmt19/README.md +85 -0
fairseq/examples/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (298 Bytes).
 
fairseq/examples/wav2vec/unsupervised/config/timit_matched/test.uid ADDED
@@ -0,0 +1,192 @@
1
+ FDHC0_SI1559
2
+ FDHC0_SI2189
3
+ FDHC0_SI929
4
+ FDHC0_SX119
5
+ FDHC0_SX209
6
+ FDHC0_SX29
7
+ FDHC0_SX299
8
+ FDHC0_SX389
9
+ FELC0_SI1386
10
+ FELC0_SI2016
11
+ FELC0_SI756
12
+ FELC0_SX126
13
+ FELC0_SX216
14
+ FELC0_SX306
15
+ FELC0_SX36
16
+ FELC0_SX396
17
+ FJLM0_SI1043
18
+ FJLM0_SI1673
19
+ FJLM0_SI2303
20
+ FJLM0_SX143
21
+ FJLM0_SX233
22
+ FJLM0_SX323
23
+ FJLM0_SX413
24
+ FJLM0_SX53
25
+ FMGD0_SI1564
26
+ FMGD0_SI2194
27
+ FMGD0_SI934
28
+ FMGD0_SX124
29
+ FMGD0_SX214
30
+ FMGD0_SX304
31
+ FMGD0_SX34
32
+ FMGD0_SX394
33
+ FMLD0_SI2185
34
+ FMLD0_SI822
35
+ FMLD0_SI925
36
+ FMLD0_SX115
37
+ FMLD0_SX205
38
+ FMLD0_SX25
39
+ FMLD0_SX295
40
+ FMLD0_SX385
41
+ FNLP0_SI1308
42
+ FNLP0_SI1938
43
+ FNLP0_SI678
44
+ FNLP0_SX138
45
+ FNLP0_SX228
46
+ FNLP0_SX318
47
+ FNLP0_SX408
48
+ FNLP0_SX48
49
+ FPAS0_SI1272
50
+ FPAS0_SI2204
51
+ FPAS0_SI944
52
+ FPAS0_SX134
53
+ FPAS0_SX224
54
+ FPAS0_SX314
55
+ FPAS0_SX404
56
+ FPAS0_SX44
57
+ FPKT0_SI1538
58
+ FPKT0_SI2168
59
+ FPKT0_SI908
60
+ FPKT0_SX188
61
+ FPKT0_SX278
62
+ FPKT0_SX368
63
+ FPKT0_SX8
64
+ FPKT0_SX98
65
+ MBPM0_SI1577
66
+ MBPM0_SI1584
67
+ MBPM0_SI947
68
+ MBPM0_SX137
69
+ MBPM0_SX227
70
+ MBPM0_SX317
71
+ MBPM0_SX407
72
+ MBPM0_SX47
73
+ MCMJ0_SI1094
74
+ MCMJ0_SI464
75
+ MCMJ0_SI602
76
+ MCMJ0_SX104
77
+ MCMJ0_SX14
78
+ MCMJ0_SX194
79
+ MCMJ0_SX284
80
+ MCMJ0_SX374
81
+ MDAB0_SI1039
82
+ MDAB0_SI1669
83
+ MDAB0_SI2299
84
+ MDAB0_SX139
85
+ MDAB0_SX229
86
+ MDAB0_SX319
87
+ MDAB0_SX409
88
+ MDAB0_SX49
89
+ MGRT0_SI1450
90
+ MGRT0_SI2080
91
+ MGRT0_SI820
92
+ MGRT0_SX10
93
+ MGRT0_SX100
94
+ MGRT0_SX190
95
+ MGRT0_SX280
96
+ MGRT0_SX370
97
+ MJDH0_SI1354
98
+ MJDH0_SI1984
99
+ MJDH0_SI724
100
+ MJDH0_SX184
101
+ MJDH0_SX274
102
+ MJDH0_SX364
103
+ MJDH0_SX4
104
+ MJDH0_SX94
105
+ MJLN0_SI1449
106
+ MJLN0_SI2079
107
+ MJLN0_SI819
108
+ MJLN0_SX189
109
+ MJLN0_SX279
110
+ MJLN0_SX369
111
+ MJLN0_SX9
112
+ MJLN0_SX99
113
+ MJMP0_SI1535
114
+ MJMP0_SI1791
115
+ MJMP0_SI905
116
+ MJMP0_SX185
117
+ MJMP0_SX275
118
+ MJMP0_SX365
119
+ MJMP0_SX5
120
+ MJMP0_SX95
121
+ MKLT0_SI1213
122
+ MKLT0_SI1843
123
+ MKLT0_SI583
124
+ MKLT0_SX133
125
+ MKLT0_SX223
126
+ MKLT0_SX313
127
+ MKLT0_SX403
128
+ MKLT0_SX43
129
+ MLLL0_SI1363
130
+ MLLL0_SI1993
131
+ MLLL0_SI733
132
+ MLLL0_SX103
133
+ MLLL0_SX13
134
+ MLLL0_SX193
135
+ MLLL0_SX283
136
+ MLLL0_SX373
137
+ MLNT0_SI1574
138
+ MLNT0_SI1902
139
+ MLNT0_SI642
140
+ MLNT0_SX102
141
+ MLNT0_SX12
142
+ MLNT0_SX192
143
+ MLNT0_SX282
144
+ MLNT0_SX372
145
+ MNJM0_SI1580
146
+ MNJM0_SI2210
147
+ MNJM0_SI950
148
+ MNJM0_SX140
149
+ MNJM0_SX230
150
+ MNJM0_SX320
151
+ MNJM0_SX410
152
+ MNJM0_SX50
153
+ MPAM0_SI1189
154
+ MPAM0_SI1819
155
+ MPAM0_SI1961
156
+ MPAM0_SX109
157
+ MPAM0_SX19
158
+ MPAM0_SX199
159
+ MPAM0_SX289
160
+ MPAM0_SX379
161
+ MTAS1_SI1473
162
+ MTAS1_SI2098
163
+ MTAS1_SI838
164
+ MTAS1_SX118
165
+ MTAS1_SX208
166
+ MTAS1_SX28
167
+ MTAS1_SX298
168
+ MTAS1_SX388
169
+ MTLS0_SI1370
170
+ MTLS0_SI2000
171
+ MTLS0_SI740
172
+ MTLS0_SX110
173
+ MTLS0_SX20
174
+ MTLS0_SX200
175
+ MTLS0_SX290
176
+ MTLS0_SX380
177
+ MWBT0_SI1553
178
+ MWBT0_SI2183
179
+ MWBT0_SI923
180
+ MWBT0_SX113
181
+ MWBT0_SX203
182
+ MWBT0_SX23
183
+ MWBT0_SX293
184
+ MWBT0_SX383
185
+ MWEW0_SI1361
186
+ MWEW0_SI1991
187
+ MWEW0_SI731
188
+ MWEW0_SX101
189
+ MWEW0_SX11
190
+ MWEW0_SX191
191
+ MWEW0_SX281
192
+ MWEW0_SX371
fairseq/examples/wav2vec/unsupervised/config/timit_unmatched/train_text.uid ADDED
@@ -0,0 +1,1000 @@
1
+ FAEM0_SI762
2
+ FAEM0_SX42
3
+ FAJW0_SA1
4
+ FAJW0_SX3
5
+ FAJW0_SX93
6
+ FALK0_SX186
7
+ FALK0_SX6
8
+ FALR0_SI1325
9
+ FBAS0_SA1
10
+ FBAS0_SX217
11
+ FBCG1_SA1
12
+ FBCG1_SX172
13
+ FBCG1_SX442
14
+ FBCH0_SX236
15
+ FBCH0_SX416
16
+ FBLV0_SA1
17
+ FBLV0_SI1058
18
+ FBLV0_SX338
19
+ FBLV0_SX68
20
+ FBMH0_SA1
21
+ FBMJ0_SI815
22
+ FCAG0_SA1
23
+ FCAG0_SX153
24
+ FCAG0_SX243
25
+ FCAJ0_SI1479
26
+ FCAJ0_SX309
27
+ FCDR1_SX106
28
+ FCDR1_SX196
29
+ FCEG0_SA2
30
+ FCJF0_SA1
31
+ FCJF0_SX127
32
+ FCJS0_SI1607
33
+ FCJS0_SI2237
34
+ FCJS0_SX257
35
+ FCKE0_SA2
36
+ FCKE0_SX121
37
+ FCLT0_SI2068
38
+ FCLT0_SX448
39
+ FCLT0_SX88
40
+ FCMG0_SA2
41
+ FCMG0_SI1872
42
+ FCMG0_SX72
43
+ FCMM0_SA1
44
+ FCMM0_SA2
45
+ FCMM0_SX183
46
+ FCRZ0_SI2053
47
+ FCRZ0_SX433
48
+ FCYL0_SA1
49
+ FCYL0_SX37
50
+ FDAS1_SI2091
51
+ FDAS1_SX201
52
+ FDAS1_SX381
53
+ FDAW0_SI1406
54
+ FDFB0_SA1
55
+ FDFB0_SA2
56
+ FDFB0_SI2010
57
+ FDFB0_SX58
58
+ FDJH0_SX305
59
+ FDML0_SA2
60
+ FDML0_SX159
61
+ FDML0_SX249
62
+ FDML0_SX429
63
+ FDMY0_SA2
64
+ FDMY0_SX27
65
+ FDNC0_SX198
66
+ FDNC0_SX288
67
+ FDTD0_SX211
68
+ FDXW0_SA1
69
+ FDXW0_SX251
70
+ FDXW0_SX341
71
+ FDXW0_SX71
72
+ FEAC0_SX165
73
+ FEAC0_SX75
74
+ FEAR0_SI622
75
+ FECD0_SX68
76
+ FEEH0_SA1
77
+ FEEH0_SI1742
78
+ FEEH0_SI471
79
+ FEEH0_SX122
80
+ FEME0_SA1
81
+ FEME0_SX155
82
+ FEME0_SX65
83
+ FETB0_SA1
84
+ FETB0_SI1148
85
+ FETB0_SX158
86
+ FEXM0_SI1101
87
+ FGCS0_SX136
88
+ FGCS0_SX226
89
+ FGCS0_SX316
90
+ FGCS0_SX406
91
+ FGDP0_SA1
92
+ FGMB0_SI1775
93
+ FGMB0_SX245
94
+ FHLM0_SX390
95
+ FHXS0_SA2
96
+ FHXS0_SX445
97
+ FJDM2_SA1
98
+ FJDM2_SX232
99
+ FJDM2_SX52
100
+ FJHK0_SX302
101
+ FJKL0_SX212
102
+ FJKL0_SX392
103
+ FJLG0_SI2306
104
+ FJLR0_SA1
105
+ FJRP1_SI2062
106
+ FJRP1_SX82
107
+ FJSK0_SA1
108
+ FJSP0_SX264
109
+ FJSP0_SX354
110
+ FJSP0_SX444
111
+ FJWB1_SA1
112
+ FJWB1_SX345
113
+ FJWB1_SX435
114
+ FJXM0_SA1
115
+ FJXM0_SI581
116
+ FJXM0_SX401
117
+ FJXP0_SA1
118
+ FJXP0_SI1122
119
+ FJXP0_SX132
120
+ FKAA0_SX128
121
+ FKAA0_SX398
122
+ FKDE0_SA1
123
+ FKDE0_SX151
124
+ FKDE0_SX241
125
+ FKDE0_SX421
126
+ FKDE0_SX61
127
+ FKDW0_SX397
128
+ FKFB0_SA2
129
+ FKFB0_SX348
130
+ FKFB0_SX78
131
+ FKKH0_SA1
132
+ FKKH0_SA2
133
+ FKKH0_SX120
134
+ FKKH0_SX390
135
+ FKLC0_SX355
136
+ FKLC1_SI2308
137
+ FKLC1_SX238
138
+ FKLC1_SX328
139
+ FKLC1_SX418
140
+ FKLH0_SA2
141
+ FKLH0_SX177
142
+ FKSR0_SA1
143
+ FKSR0_SA2
144
+ FKSR0_SI1747
145
+ FKSR0_SI487
146
+ FKSR0_SX217
147
+ FLAC0_SX451
148
+ FLAG0_SA2
149
+ FLAG0_SX114
150
+ FLAG0_SX204
151
+ FLAG0_SX24
152
+ FLAG0_SX384
153
+ FLEH0_SI1681
154
+ FLEH0_SI2311
155
+ FLEH0_SX331
156
+ FLET0_SA1
157
+ FLHD0_SI1827
158
+ FLHD0_SX354
159
+ FLJA0_SA1
160
+ FLJA0_SI2338
161
+ FLJD0_SI886
162
+ FLJD0_SX76
163
+ FLJG0_SA2
164
+ FLKM0_SA2
165
+ FLKM0_SI686
166
+ FLKM0_SX260
167
+ FLKM0_SX80
168
+ FLMA0_SA1
169
+ FLMA0_SI613
170
+ FLMA0_SX433
171
+ FLMA0_SX73
172
+ FLMC0_SX22
173
+ FLMK0_SI1035
174
+ FLMK0_SX315
175
+ FLMK0_SX405
176
+ FLOD0_SI1917
177
+ FLOD0_SX117
178
+ FLOD0_SX171
179
+ FLOD0_SX297
180
+ FLTM0_SA1
181
+ FLTM0_SI1070
182
+ FLTM0_SI2330
183
+ FMAH1_SA2
184
+ FMAH1_SX159
185
+ FMBG0_SA2
186
+ FMBG0_SI2264
187
+ FMEM0_SI747
188
+ FMEM0_SX387
189
+ FMJB0_SI547
190
+ FMJB0_SX97
191
+ FMJF0_SA2
192
+ FMJU0_SX309
193
+ FMJU0_SX399
194
+ FMKC0_SI1702
195
+ FMKC0_SX442
196
+ FMKC0_SX82
197
+ FMKF0_SX186
198
+ FMPG0_SA2
199
+ FNKL0_SI1522
200
+ FNTB0_SI1203
201
+ FNTB0_SI573
202
+ FNTB0_SX303
203
+ FPAB1_SI1471
204
+ FPAB1_SX211
205
+ FPAC0_SA2
206
+ FPAD0_SA2
207
+ FPAD0_SX356
208
+ FPAD0_SX86
209
+ FPAF0_SA2
210
+ FPAF0_SX154
211
+ FPAZ0_SA1
212
+ FPAZ0_SA2
213
+ FPAZ0_SX243
214
+ FPJF0_SA1
215
+ FPJF0_SX146
216
+ FPJF0_SX56
217
+ FPLS0_SI1590
218
+ FPLS0_SX330
219
+ FPMY0_SA1
220
+ FPMY0_SX343
221
+ FREH0_SA1
222
+ FREH0_SA2
223
+ FREH0_SX415
224
+ FRJB0_SX347
225
+ FRLL0_SX434
226
+ FSAG0_SA1
227
+ FSAG0_SX243
228
+ FSAH0_SA1
229
+ FSAH0_SA2
230
+ FSAH0_SX164
231
+ FSAH0_SX434
232
+ FSBK0_SA2
233
+ FSBK0_SI1069
234
+ FSBK0_SX169
235
+ FSCN0_SA2
236
+ FSCN0_SI626
237
+ FSCN0_SX266
238
+ FSCN0_SX446
239
+ FSCN0_SX86
240
+ FSDC0_SA2
241
+ FSDC0_SX142
242
+ FSDC0_SX322
243
+ FSDC0_SX52
244
+ FSDJ0_SI485
245
+ FSDJ0_SX215
246
+ FSDJ0_SX305
247
+ FSDJ0_SX395
248
+ FSGF0_SX117
249
+ FSJG0_SX130
250
+ FSJK1_SA2
251
+ FSJK1_SX125
252
+ FSJK1_SX35
253
+ FSJS0_SX181
254
+ FSJW0_SI1963
255
+ FSJW0_SX433
256
+ FSKC0_SI1416
257
+ FSKC0_SI786
258
+ FSKC0_SX246
259
+ FSKL0_SI1529
260
+ FSKL0_SX449
261
+ FSKP0_SA2
262
+ FSLS0_SX156
263
+ FSLS0_SX426
264
+ FSMA0_SA2
265
+ FSMA0_SX181
266
+ FSMM0_SX144
267
+ FSMM0_SX234
268
+ FSMS1_SX244
269
+ FSMS1_SX347
270
+ FSPM0_SA2
271
+ FSPM0_SX161
272
+ FSPM0_SX71
273
+ FSRH0_SI1931
274
+ FSRH0_SI671
275
+ FSRH0_SX221
276
+ FSRH0_SX401
277
+ FTAJ0_SI699
278
+ FTAJ0_SX159
279
+ FTAJ0_SX249
280
+ FTAJ0_SX429
281
+ FTBR0_SX21
282
+ FTBW0_SA1
283
+ FTMG0_SI1532
284
+ FTMG0_SI2162
285
+ FTMG0_SX452
286
+ FVFB0_SA2
287
+ FVFB0_SX132
288
+ FVFB0_SX42
289
+ FVKB0_SA1
290
+ FVMH0_SA2
291
+ FVMH0_SX116
292
+ FVMH0_SX26
293
+ MABC0_SI1620
294
+ MABC0_SI2041
295
+ MABC0_SI781
296
+ MADC0_SX107
297
+ MADC0_SX377
298
+ MADD0_SA2
299
+ MADD0_SI1295
300
+ MADD0_SX178
301
+ MADD0_SX268
302
+ MADD0_SX88
303
+ MAEB0_SX450
304
+ MAEO0_SA1
305
+ MAFM0_SI939
306
+ MAFM0_SX129
307
+ MAFM0_SX309
308
+ MAJP0_SA2
309
+ MAKB0_SI1646
310
+ MAKB0_SX26
311
+ MAKB0_SX386
312
+ MAKR0_SX362
313
+ MAKR0_SX92
314
+ MAPV0_SX213
315
+ MARC0_SA2
316
+ MARC0_SX108
317
+ MARC0_SX18
318
+ MARC0_SX198
319
+ MARW0_SI1906
320
+ MBAR0_SA1
321
+ MBAR0_SX419
322
+ MBAR0_SX59
323
+ MBBR0_SI2315
324
+ MBBR0_SX65
325
+ MBCG0_SA1
326
+ MBCG0_SI486
327
+ MBEF0_SI1281
328
+ MBEF0_SI1911
329
+ MBEF0_SI651
330
+ MBEF0_SX21
331
+ MBEF0_SX381
332
+ MBGT0_SA2
333
+ MBGT0_SX261
334
+ MBGT0_SX351
335
+ MBGT0_SX441
336
+ MBJV0_SA1
337
+ MBJV0_SI617
338
+ MBJV0_SX347
339
+ MBMA0_SI592
340
+ MBMA0_SX232
341
+ MBMA0_SX52
342
+ MBMA1_SI2214
343
+ MBMA1_SX54
344
+ MBML0_SA2
345
+ MBML0_SI1169
346
+ MBML0_SX89
347
+ MBOM0_SA2
348
+ MBOM0_SI2274
349
+ MBOM0_SX294
350
+ MBSB0_SA1
351
+ MBSB0_SX3
352
+ MBTH0_SA2
353
+ MBTH0_SX122
354
+ MBTH0_SX32
355
+ MCAE0_SX277
356
+ MCAL0_SA2
357
+ MCAL0_SI1768
358
+ MCDC0_SA1
359
+ MCDC0_SX212
360
+ MCDD0_SA2
361
+ MCDD0_SI883
362
+ MCDD0_SX253
363
+ MCDD0_SX433
364
+ MCDR0_SI1154
365
+ MCEF0_SX235
366
+ MCEF0_SX415
367
+ MCEW0_SA2
368
+ MCHL0_SX87
369
+ MCLK0_SX310
370
+ MCLM0_SA1
371
+ MCLM0_SI2086
372
+ MCLM0_SI826
373
+ MCPM0_SA1
374
+ MCPM0_SX114
375
+ MCPM0_SX294
376
+ MCPM0_SX384
377
+ MCSS0_SI750
378
+ MCTH0_SA1
379
+ MCTH0_SX39
380
+ MCXM0_SX91
381
+ MDAC0_SA1
382
+ MDAC0_SX181
383
+ MDAC0_SX361
384
+ MDAS0_SX6
385
+ MDBB1_SX106
386
+ MDBB1_SX16
387
+ MDBB1_SX376
388
+ MDBP0_SX168
389
+ MDCD0_SI1415
390
+ MDCD0_SX245
391
+ MDCD0_SX425
392
+ MDCM0_SX40
393
+ MDCM0_SX400
394
+ MDDC0_SI2049
395
+ MDDC0_SI789
396
+ MDDC0_SX159
397
+ MDDC0_SX69
398
+ MDED0_SA1
399
+ MDED0_SA2
400
+ MDEF0_SX123
401
+ MDEF0_SX303
402
+ MDHL0_SI1439
403
+ MDHL0_SX269
404
+ MDHL0_SX449
405
+ MDHS0_SA1
406
+ MDHS0_SA2
407
+ MDHS0_SI1530
408
+ MDHS0_SI2160
409
+ MDJM0_SX105
410
+ MDJM0_SX15
411
+ MDKS0_SX436
412
+ MDLB0_SA2
413
+ MDLC0_SX405
414
+ MDLC1_SA2
415
+ MDLC1_SI2065
416
+ MDLC1_SI2144
417
+ MDLC1_SX445
418
+ MDLC2_SI2244
419
+ MDLC2_SX354
420
+ MDLH0_SA2
421
+ MDLM0_SI1234
422
+ MDLM0_SI1864
423
+ MDLM0_SX154
424
+ MDLM0_SX424
425
+ MDLR0_SA1
426
+ MDLR0_SA2
427
+ MDLR0_SI1863
428
+ MDLR0_SI603
429
+ MDLR0_SX153
430
+ MDLR1_SA1
431
+ MDLR1_SA2
432
+ MDMA0_SI1430
433
+ MDMA0_SX260
434
+ MDMA0_SX80
435
+ MDMT0_SA1
436
+ MDMT0_SA2
437
+ MDMT0_SI1832
438
+ MDMT0_SX122
439
+ MDMT0_SX32
440
+ MDNS0_SA2
441
+ MDNS0_SI2271
442
+ MDNS0_SX201
443
+ MDNS0_SX21
444
+ MDPB0_SX416
445
+ MDPK0_SI1053
446
+ MDPK0_SX333
447
+ MDPK0_SX423
448
+ MDPS0_SI719
449
+ MDPS0_SX359
450
+ MDRD0_SA1
451
+ MDRD0_SX32
452
+ MDSJ0_SI2092
453
+ MDSS0_SA2
454
+ MDSS0_SX441
455
+ MDSS1_SA1
456
+ MDSS1_SI1327
457
+ MDSS1_SI697
458
+ MDSS1_SX157
459
+ MDSS1_SX67
460
+ MDTB0_SI1200
461
+ MDTB0_SI1830
462
+ MDTB0_SX120
463
+ MDWD0_SA2
464
+ MDWD0_SX270
465
+ MDWD0_SX90
466
+ MDWH0_SX215
467
+ MDWH0_SX305
468
+ MDWM0_SA1
469
+ MDWM0_SA2
470
+ MDWM0_SX16
471
+ MDWM0_SX286
472
+ MEAL0_SA2
473
+ MEAL0_SI2177
474
+ MEAL0_SX107
475
+ MEAL0_SX347
476
+ MEDR0_SA1
477
+ MEDR0_SA2
478
+ MEDR0_SI1374
479
+ MEFG0_SA1
480
+ MEGJ0_SA2
481
+ MEGJ0_SX257
482
+ MEGJ0_SX3
483
+ MEJL0_SA1
484
+ MEJL0_SX152
485
+ MEJL0_SX242
486
+ MEJS0_SI610
487
+ MEJS0_SX160
488
+ MEJS0_SX340
489
+ MESG0_SX432
490
+ MESJ0_SX187
491
+ MESJ0_SX97
492
+ MEWM0_SI718
493
+ MEWM0_SX178
494
+ MEWM0_SX88
495
+ MFER0_SI862
496
+ MFER0_SX142
497
+ MFRM0_SX345
498
+ MFRM0_SX435
499
+ MFWK0_SI1879
500
+ MFWK0_SX169
501
+ MFXS0_SX54
502
+ MFXV0_SA2
503
+ MFXV0_SX105
504
+ MGAF0_SA1
505
+ MGAF0_SX22
506
+ MGAF0_SX382
507
+ MGAG0_SA2
508
+ MGAK0_SX226
509
+ MGAK0_SX46
510
+ MGAR0_SX132
511
+ MGAW0_SI535
512
+ MGAW0_SX175
513
+ MGES0_SA1
514
+ MGES0_SI2111
515
+ MGES0_SI851
516
+ MGJC0_SA2
517
+ MGJC0_SX75
518
+ MGRL0_SI2127
519
+ MGRL0_SI867
520
+ MGRL0_SX147
521
+ MGRP0_SA2
522
+ MGSH0_SA2
523
+ MGSH0_SI1806
524
+ MGSH0_SX127
525
+ MGSH0_SX276
526
+ MGSH0_SX6
527
+ MGSL0_SA1
528
+ MGSL0_SI534
529
+ MGSL0_SX264
530
+ MGXP0_SX187
531
+ MGXP0_SX7
532
+ MHBS0_SX315
533
+ MHBS0_SX45
534
+ MHIT0_SA1
535
+ MHJB0_SA1
536
+ MHJB0_SI1017
537
+ MHMG0_SX195
538
+ MHMR0_SA1
539
+ MHMR0_SI489
540
+ MHRM0_SA1
541
+ MHRM0_SI958
542
+ MHRM0_SX148
543
+ MHRM0_SX58
544
+ MHXL0_SI1772
545
+ MHXL0_SX242
546
+ MILB0_SA2
547
+ MJAC0_SX307
548
+ MJAC0_SX71
549
+ MJAE0_SX174
550
+ MJAI0_SA1
551
+ MJAI0_SA2
552
+ MJBG0_SX62
553
+ MJDA0_SI1031
554
+ MJDA0_SX311
555
+ MJDE0_SI463
556
+ MJDG0_SA2
557
+ MJDG0_SI1042
558
+ MJDG0_SI1705
559
+ MJDM0_SA1
560
+ MJDM0_SI974
561
+ MJEB0_SI656
562
+ MJEB0_SX296
563
+ MJEB1_SA2
564
+ MJEB1_SX207
565
+ MJEB1_SX387
566
+ MJEE0_SA1
567
+ MJEE0_SX247
568
+ MJEE0_SX337
569
+ MJFH0_SA2
570
+ MJFH0_SI1107
571
+ MJFR0_SX75
572
+ MJHI0_SA1
573
+ MJHI0_SX158
574
+ MJJB0_SA1
575
+ MJJB0_SX239
576
+ MJJJ0_SX443
577
+ MJJM0_SA2
578
+ MJJM0_SI827
579
+ MJJM0_SX107
580
+ MJKR0_SA1
581
+ MJKR0_SI571
582
+ MJLB0_SX176
583
+ MJLG1_SX292
584
+ MJLS0_SX106
585
+ MJMA0_SA1
586
+ MJMA0_SA2
587
+ MJMD0_SA2
588
+ MJMD0_SX308
589
+ MJMD0_SX38
590
+ MJMM0_SX85
591
+ MJPG0_SI1191
592
+ MJPG0_SX111
593
+ MJPG0_SX201
594
+ MJPG0_SX21
595
+ MJPM0_SA2
596
+ MJPM0_SX378
597
+ MJPM1_SI2280
598
+ MJPM1_SX401
599
+ MJRA0_SA1
600
+ MJRA0_SA2
601
+ MJRA0_SI1236
602
+ MJRA0_SI1866
603
+ MJRA0_SX426
604
+ MJRG0_SI1366
605
+ MJRG0_SI1996
606
+ MJRG0_SX376
607
+ MJRH0_SX225
608
+ MJRH1_SA1
609
+ MJRH1_SI514
610
+ MJRH1_SX154
611
+ MJRH1_SX244
612
+ MJRH1_SX424
613
+ MJRK0_SA1
614
+ MJRK0_SA2
615
+ MJRK0_SI1662
616
+ MJRK0_SX160
617
+ MJRK0_SX250
618
+ MJRK0_SX430
619
+ MJRP0_SA1
620
+ MJRP0_SA2
621
+ MJRP0_SX225
622
+ MJSR0_SA1
623
+ MJSR0_SI1424
624
+ MJSR0_SX344
625
+ MJWG0_SA1
626
+ MJWG0_SX265
627
+ MJWS0_SI513
628
+ MJWS0_SX153
629
+ MJWS0_SX63
630
+ MJWT0_SA1
631
+ MJWT0_SX121
632
+ MJWT0_SX211
633
+ MJWT0_SX301
634
+ MJWT0_SX31
635
+ MJWT0_SX391
636
+ MJXA0_SX427
637
+ MJXL0_SI542
638
+ MKAG0_SA1
639
+ MKAG0_SX259
640
+ MKAJ0_SA2
641
+ MKAJ0_SX154
642
+ MKAM0_SA1
643
+ MKAM0_SX146
644
+ MKAM0_SX326
645
+ MKAM0_SX56
646
+ MKDB0_SA1
647
+ MKDB0_SA2
648
+ MKDB0_SX152
649
+ MKDD0_SA2
650
+ MKES0_SA1
651
+ MKES0_SI1253
652
+ MKES0_SI1883
653
+ MKES0_SX173
654
+ MKJO0_SI1517
655
+ MKJO0_SI887
656
+ MKJO0_SX437
657
+ MKLN0_SI968
658
+ MKLN0_SX248
659
+ MKLR0_SA2
660
+ MKLR0_SI1689
661
+ MKLS0_SA1
662
+ MKLS0_SX357
663
+ MKLS0_SX87
664
+ MKLS1_SA1
665
+ MKLS1_SA2
666
+ MKLS1_SX375
667
+ MKLW0_SA1
668
+ MKRG0_SX411
669
+ MKXL0_SA2
670
+ MKXL0_SX15
671
+ MKXL0_SX375
672
+ MLBC0_SA1
673
+ MLBC0_SI1869
674
+ MLBC0_SX249
675
+ MLEL0_SA1
676
+ MLEL0_SA2
677
+ MLEL0_SI1246
678
+ MLEL0_SX256
679
+ MLEL0_SX436
680
+ MLJC0_SX145
681
+ MLJC0_SX415
682
+ MLJH0_SX64
683
+ MLNS0_SI2037
684
+ MMAA0_SA1
685
+ MMAA0_SA2
686
+ MMAA0_SX35
687
+ MMAB1_SI1494
688
+ MMAB1_SX234
689
+ MMAG0_SA2
690
+ MMAG0_SI1126
691
+ MMAG0_SX316
692
+ MMAM0_SI2227
693
+ MMAM0_SX157
694
+ MMAM0_SX427
695
+ MMAR0_SX256
696
+ MMBS0_SI1781
697
+ MMCC0_SA2
698
+ MMDB0_SX177
699
+ MMDG0_SA1
700
+ MMDG0_SA2
701
+ MMDG0_SI520
702
+ MMDG0_SX160
703
+ MMDG0_SX250
704
+ MMDM0_SI1941
705
+ MMDM0_SI681
706
+ MMDM0_SX141
707
+ MMDM1_SA2
708
+ MMDM1_SI2043
709
+ MMDM1_SX423
710
+ MMDM1_SX63
711
+ MMDS0_SA1
712
+ MMEA0_SA1
713
+ MMEA0_SX128
714
+ MMEA0_SX398
715
+ MMEB0_SA2
716
+ MMEB0_SX187
717
+ MMEB0_SX367
718
+ MMGC0_SA2
719
+ MMGC0_SX135
720
+ MMGC0_SX225
721
+ MMGG0_SX269
722
+ MMGK0_SX332
723
+ MMGK0_SX62
724
+ MMJB1_SA2
725
+ MMRP0_SA2
726
+ MMRP0_SX144
727
+ MMSM0_SX116
728
+ MMSM0_SX206
729
+ MMVP0_SA1
730
+ MMVP0_SA2
731
+ MMWB0_SI989
732
+ MMWB0_SX89
733
+ MMWS0_SA2
734
+ MMWS0_SX168
735
+ MMWS0_SX348
736
+ MMWS0_SX438
737
+ MMWS1_SI1701
738
+ MMXS0_SI2136
739
+ MMXS0_SX246
740
+ MMXS0_SX426
741
+ MNET0_SI816
742
+ MNET0_SX6
743
+ MNTW0_SA2
744
+ MNTW0_SX168
745
+ MNTW0_SX78
746
+ MPAR0_SI2206
747
+ MPAR0_SI946
748
+ MPAR0_SX136
749
+ MPAR0_SX316
750
+ MPEB0_SI1034
751
+ MPEB0_SI1860
752
+ MPEB0_SX240
753
+ MPEB0_SX330
754
+ MPFU0_SI628
755
+ MPFU0_SX448
756
+ MPGH0_SX114
757
+ MPGH0_SX24
758
+ MPGR0_SX240
759
+ MPGR0_SX330
760
+ MPGR1_SX149
761
+ MPPC0_SA1
762
+ MPRD0_SA1
763
+ MPRD0_SX261
764
+ MPRD0_SX351
765
+ MPRD0_SX441
766
+ MPRD0_SX81
767
+ MPRK0_SI1727
768
+ MPRK0_SX107
769
+ MPRK0_SX377
770
+ MPRT0_SA1
771
+ MPRT0_SX310
772
+ MPSW0_SI1067
773
+ MPSW0_SX167
774
+ MPSW0_SX437
775
+ MRAB1_SX128
776
+ MRAB1_SX308
777
+ MRAI0_SA1
778
+ MRAI0_SA2
779
+ MRAI0_SX72
780
+ MRAM0_SA1
781
+ MRAM0_SA2
782
+ MRAM0_SX15
783
+ MRBC0_SI1859
784
+ MRBC0_SX329
785
+ MRBC0_SX419
786
+ MRCG0_SI798
787
+ MRCG0_SX168
788
+ MRCW0_SA1
789
+ MRCW0_SX291
790
+ MRDD0_SI1680
791
+ MRDD0_SX150
792
+ MRDD0_SX277
793
+ MRDD0_SX60
794
+ MRDM0_SI1595
795
+ MRDM0_SX65
796
+ MRDS0_SA1
797
+ MREE0_SX24
798
+ MREH1_SX249
799
+ MREH1_SX69
800
+ MREM0_SA2
801
+ MREW1_SI870
802
+ MRFK0_SX446
803
+ MRFL0_SA1
804
+ MRFL0_SX256
805
+ MRFL0_SX436
806
+ MRFL0_SX76
807
+ MRGM0_SA2
808
+ MRGM0_SX262
809
+ MRGS0_SA2
810
+ MRGS0_SX186
811
+ MRHL0_SI885
812
+ MRHL0_SX345
813
+ MRHL0_SX435
814
+ MRJB1_SA1
815
+ MRJB1_SA2
816
+ MRJB1_SX210
817
+ MRJB1_SX30
818
+ MRJB1_SX390
819
+ MRJH0_SA2
820
+ MRJH0_SX307
821
+ MRJH0_SX79
822
+ MRJM0_SX148
823
+ MRJM1_SA2
824
+ MRJM1_SI1298
825
+ MRJM1_SI1928
826
+ MRJM1_SX128
827
+ MRJT0_SA2
828
+ MRJT0_SI1498
829
+ MRJT0_SX328
830
+ MRJT0_SX418
831
+ MRKM0_SA2
832
+ MRKM0_SX367
833
+ MRLD0_SA2
834
+ MRLD0_SI2224
835
+ MRLD0_SX154
836
+ MRLD0_SX424
837
+ MRLJ0_SA1
838
+ MRLJ0_SX250
839
+ MRLJ0_SX340
840
+ MRLJ1_SA1
841
+ MRLJ1_SA2
842
+ MRLJ1_SX321
843
+ MRLK0_SI843
844
+ MRLK0_SX123
845
+ MRLK0_SX213
846
+ MRMB0_SA2
847
+ MRMB0_SI1581
848
+ MRMB0_SX411
849
+ MRMG0_SA1
850
+ MRMG0_SI1080
851
+ MRMG0_SX450
852
+ MRMH0_SI1349
853
+ MRMH0_SI2281
854
+ MRMH0_SX121
855
+ MRML0_SA2
856
+ MRML0_SX341
857
+ MRPC1_SI2112
858
+ MRRE0_SA2
859
+ MRRE0_SX164
860
+ MRRE0_SX344
861
+ MRRE0_SX74
862
+ MRSO0_SX129
863
+ MRSO0_SX39
864
+ MRSP0_SX259
865
+ MRTC0_SX378
866
+ MRVG0_SI1140
867
+ MRVG0_SX240
868
+ MRWA0_SI973
869
+ MRWA0_SX163
870
+ MRWA0_SX73
871
+ MRWS0_SI1732
872
+ MRWS0_SI472
873
+ MRWS0_SX22
874
+ MRWS0_SX382
875
+ MRXB0_SA2
876
+ MRXB0_SX415
877
+ MSAH1_SI1679
878
+ MSAS0_SX116
879
+ MSAS0_SX206
880
+ MSAS0_SX386
881
+ MSAT0_SA1
882
+ MSAT1_SX263
883
+ MSAT1_SX443
884
+ MSAT1_SX83
885
+ MSDB0_SX197
886
+ MSDB0_SX287
887
+ MSDB0_SX377
888
+ MSDH0_SI2240
889
+ MSDH0_SX440
890
+ MSDH0_SX80
891
+ MSDS0_SA1
892
+ MSEM1_SI1440
893
+ MSEM1_SX180
894
+ MSEM1_SX270
895
+ MSES0_SI1589
896
+ MSES0_SX239
897
+ MSES0_SX419
898
+ MSFH0_SX316
899
+ MSFV0_SI1892
900
+ MSFV0_SX362
901
+ MSFV0_SX92
902
+ MSMR0_SX415
903
+ MSMS0_SA1
904
+ MSMS0_SX173
905
+ MSMS0_SX83
906
+ MSRG0_SA1
907
+ MSRG0_SI1221
908
+ MSTF0_SI766
909
+ MSTF0_SX316
910
+ MSTF0_SX46
911
+ MSVS0_SA2
912
+ MSVS0_SX308
913
+ MTAS0_SX215
914
+ MTAS0_SX35
915
+ MTAS0_SX395
916
+ MTAT0_SX390
917
+ MTAT1_SX59
918
+ MTBC0_SI1803
919
+ MTCS0_SA2
920
+ MTCS0_SI2265
921
+ MTCS0_SX82
922
+ MTDP0_SA2
923
+ MTER0_SA2
924
+ MTER0_SI1787
925
+ MTJG0_SA1
926
+ MTJG0_SI2157
927
+ MTJG0_SX260
928
+ MTJM0_SI1856
929
+ MTJM0_SX146
930
+ MTJU0_SX130
931
+ MTJU0_SX400
932
+ MTKD0_SX107
933
+ MTKD0_SX287
934
+ MTKP0_SI1023
935
+ MTLB0_SA1
936
+ MTLB0_SX234
937
+ MTLC0_SA1
938
+ MTML0_SI2325
939
+ MTML0_SX165
940
+ MTMN0_SA2
941
+ MTMN0_SI1064
942
+ MTMN0_SI2324
943
+ MTMN0_SX434
944
+ MTMT0_SA2
945
+ MTMT0_SI1748
946
+ MTPF0_SX65
947
+ MTPG0_SI1383
948
+ MTPG0_SI753
949
+ MTPG0_SX303
950
+ MTPP0_SX338
951
+ MTPR0_SX340
952
+ MTQC0_SI480
953
+ MTQC0_SX91
954
+ MTRR0_SX198
955
+ MTRR0_SX288
956
+ MTRT0_SA2
957
+ MTRT0_SX254
958
+ MTRT0_SX57
959
+ MTWH1_SX72
960
+ MTXS0_SA1
961
+ MTXS0_SA2
962
+ MVJH0_SI926
963
+ MVJH0_SX206
964
+ MVJH0_SX296
965
+ MVLO0_SA1
966
+ MVRW0_SA2
967
+ MVRW0_SX135
968
+ MVRW0_SX225
969
+ MWAC0_SA2
970
+ MWAC0_SX341
971
+ MWAC0_SX431
972
+ MWAD0_SX432
973
+ MWAD0_SX72
974
+ MWAR0_SA1
975
+ MWAR0_SI1675
976
+ MWCH0_SI1895
977
+ MWCH0_SI2252
978
+ MWCH0_SX182
979
+ MWCH0_SX452
980
+ MWDK0_SA1
981
+ MWDK0_SA2
982
+ MWDK0_SI2017
983
+ MWDK0_SI806
984
+ MWDK0_SX176
985
+ MWDK0_SX86
986
+ MWEM0_SA2
987
+ MWEM0_SI1320
988
+ MWEM0_SI1393
989
+ MWEM0_SX150
990
+ MWGR0_SX346
991
+ MWRE0_SX247
992
+ MWRE0_SX337
993
+ MWRE0_SX427
994
+ MWRP0_SA1
995
+ MWRP0_SX273
996
+ MWRP0_SX363
997
+ MWSB0_SX276
998
+ MWSH0_SX256
999
+ MWSH0_SX76
1000
+ MZMB0_SA1
fairseq/examples/wav2vec/unsupervised/config/timit_unmatched/valid.uid ADDED
@@ -0,0 +1,620 @@
1
+ FAEM0_SI1392
2
+ FAJW0_SI1263
3
+ FAJW0_SI633
4
+ FALK0_SI658
5
+ FALR0_SX335
6
+ FAPB0_SI1063
7
+ FAPB0_SI2323
8
+ FAPB0_SX433
9
+ FBAS0_SI1472
10
+ FBAS0_SI2066
11
+ FBCG1_SX352
12
+ FBCH0_SI959
13
+ FBJL0_SI922
14
+ FBLV0_SI1688
15
+ FBMH0_SI1136
16
+ FBMH0_SI970
17
+ FBMJ0_SA1
18
+ FBMJ0_SI1776
19
+ FBMJ0_SI516
20
+ FBMJ0_SX336
21
+ FCDR1_SI1186
22
+ FCDR1_SI1816
23
+ FCDR1_SI556
24
+ FCDR1_SX286
25
+ FCKE0_SI1741
26
+ FCKE0_SI481
27
+ FCLT0_SI808
28
+ FCMG0_SI1142
29
+ FCMG0_SX432
30
+ FCMM0_SI1957
31
+ FCMM0_SX420
32
+ FCYL0_SI667
33
+ FCYL0_SX349
34
+ FDAS1_SI1461
35
+ FDAS1_SI831
36
+ FDAW0_SI1271
37
+ FDAW0_SI2036
38
+ FDJH0_SI935
39
+ FDKN0_SI1202
40
+ FDKN0_SX181
41
+ FDKN0_SX451
42
+ FDMY0_SA1
43
+ FDMY0_SI567
44
+ FDMY0_SI714
45
+ FDMY0_SX387
46
+ FDNC0_SI1278
47
+ FDNC0_SI1908
48
+ FDTD0_SA1
49
+ FDTD0_SX321
50
+ FEAC0_SI615
51
+ FEAR0_SX352
52
+ FECD0_SA1
53
+ FECD0_SI1418
54
+ FECD0_SI788
55
+ FEME0_SI875
56
+ FEME0_SX335
57
+ FEXM0_SA1
58
+ FEXM0_SI482
59
+ FEXM0_SX366
60
+ FGDP0_SI988
61
+ FGDP0_SX88
62
+ FGMB0_SI1145
63
+ FGMB0_SX335
64
+ FGRW0_SA1
65
+ FGRW0_SI1152
66
+ FGRW0_SX162
67
+ FGRW0_SX432
68
+ FHLM0_SX120
69
+ FHLM0_SX349
70
+ FHXS0_SA1
71
+ FHXS0_SI1075
72
+ FHXS0_SI2302
73
+ FHXS0_SX175
74
+ FJDM2_SA2
75
+ FJDM2_SX142
76
+ FJEN0_SA1
77
+ FJEN0_SX327
78
+ FJEN0_SX417
79
+ FJHK0_SI2282
80
+ FJKL0_SI932
81
+ FJLG0_SI1889
82
+ FJLR0_SI1231
83
+ FJRB0_SX402
84
+ FJRP1_SA1
85
+ FJRP1_SI1432
86
+ FJRP1_SX262
87
+ FJRP1_SX352
88
+ FJSK0_SI1052
89
+ FJSP0_SI1434
90
+ FJWB1_SI748
91
+ FJXM0_SX311
92
+ FJXM0_SX41
93
+ FJXP0_SI1752
94
+ FKAA0_SA1
95
+ FKDE0_SI1141
96
+ FKDE0_SI1771
97
+ FKDW0_SI1207
98
+ FKDW0_SI1891
99
+ FKFB0_SI1608
100
+ FKFB0_SX438
101
+ FKKH0_SI1290
102
+ FKKH0_SI1920
103
+ FKLC0_SI985
104
+ FKLC0_SX175
105
+ FKLC1_SI1048
106
+ FKLH0_SI1257
107
+ FKSR0_SX366
108
+ FLAC0_SI1339
109
+ FLAG0_SI1464
110
+ FLAG0_SI834
111
+ FLEH0_SI1051
112
+ FLET0_SI507
113
+ FLJA0_SI1078
114
+ FLJA0_SX178
115
+ FLJD0_SI1516
116
+ FLJG0_SI981
117
+ FLJG0_SX171
118
+ FLJG0_SX351
119
+ FLKM0_SA1
120
+ FLKM0_SI620
121
+ FLKM0_SX350
122
+ FLKM0_SX440
123
+ FLMC0_SI1372
124
+ FLMK0_SA1
125
+ FLMK0_SI1229
126
+ FLTM0_SX170
127
+ FLTM0_SX350
128
+ FLTM0_SX440
129
+ FMAH1_SI879
130
+ FMBG0_SI1160
131
+ FMEM0_SA1
132
+ FMEM0_SX333
133
+ FMJB0_SI1177
134
+ FMJF0_SI624
135
+ FMJF0_SX174
136
+ FMJF0_SX84
137
+ FMJU0_SI1389
138
+ FMKC0_SI1041
139
+ FMKF0_SI1018
140
+ FMPG0_SA1
141
+ FMPG0_SI972
142
+ FMPG0_SX162
143
+ FMPG0_SX342
144
+ FMPG0_SX432
145
+ FNKL0_SI892
146
+ FNTB0_SI679
147
+ FPAB1_SA1
148
+ FPAB1_SI2101
149
+ FPAB1_SI841
150
+ FPAC0_SI1921
151
+ FPAC0_SI661
152
+ FPAD0_SI716
153
+ FPAD0_SX176
154
+ FPAF0_SA1
155
+ FPAF0_SI1054
156
+ FPAZ0_SI2223
157
+ FPAZ0_SI963
158
+ FPJF0_SI1259
159
+ FPJF0_SX352
160
+ FPLS0_SI960
161
+ FPMY0_SI1153
162
+ FPMY0_SI523
163
+ FREH0_SI1945
164
+ FRLL0_SI805
165
+ FSAG0_SI1323
166
+ FSAG0_SX153
167
+ FSAG0_SX333
168
+ FSAG0_SX423
169
+ FSAH0_SI614
170
+ FSAH0_SX327
171
+ FSAK0_SI1300
172
+ FSBK0_SX349
173
+ FSCN0_SA1
174
+ FSCN0_SI705
175
+ FSCN0_SX176
176
+ FSDC0_SI1312
177
+ FSDJ0_SI1115
178
+ FSGF0_SI2187
179
+ FSGF0_SI927
180
+ FSJG0_SA1
181
+ FSJG0_SA2
182
+ FSJG0_SI940
183
+ FSJG0_SX220
184
+ FSJG0_SX40
185
+ FSJG0_SX400
186
+ FSJS0_SA1
187
+ FSJS0_SX451
188
+ FSJW0_SI1333
189
+ FSKP0_SI1098
190
+ FSMA0_SI991
191
+ FSMA0_SX451
192
+ FSMM0_SX324
193
+ FSPM0_SI1241
194
+ FSPM0_SX251
195
+ FSRH0_SX311
196
+ FSSB0_SI1712
197
+ FSSB0_SX362
198
+ FTBR0_SI1402
199
+ FTBR0_SI921
200
+ FTBW0_SI715
201
+ FTBW0_SX175
202
+ FTLG0_SI1743
203
+ FTLG0_SI483
204
+ FTMG0_SI902
205
+ FVFB0_SI1510
206
+ FVKB0_SX349
207
+ FVMH0_SI1466
208
+ FVMH0_SI836
209
+ MADC0_SI1367
210
+ MADC0_SI737
211
+ MAEB0_SI1411
212
+ MAEO0_SI1326
213
+ MAJP0_SI1704
214
+ MAJP0_SX174
215
+ MAKB0_SA2
216
+ MAKB0_SI1016
217
+ MAKB0_SI2276
218
+ MAKB0_SX116
219
+ MAPV0_SI1293
220
+ MAPV0_SI663
221
+ MARW0_SX286
222
+ MARW0_SX349
223
+ MBBR0_SI1055
224
+ MBBR0_SX335
225
+ MBCG0_SI957
226
+ MBCG0_SX327
227
+ MBGT0_SI1841
228
+ MBGT0_SX171
229
+ MBMA0_SI1222
230
+ MBMA1_SI954
231
+ MBMA1_SX324
232
+ MBTH0_SI2102
233
+ MBWP0_SX349
234
+ MCAE0_SI1447
235
+ MCAE0_SI2077
236
+ MCAE0_SI817
237
+ MCAL0_SI1138
238
+ MCDR0_SI1784
239
+ MCDR0_SI524
240
+ MCEF0_SI842
241
+ MCEW0_SA1
242
+ MCEW0_SI2072
243
+ MCEW0_SI812
244
+ MCEW0_SX362
245
+ MCEW0_SX452
246
+ MCHL0_SI1347
247
+ MCHL0_SI1404
248
+ MCLK0_SI2290
249
+ MCLK0_SI650
250
+ MCPM0_SI1824
251
+ MCSS0_SI1380
252
+ MCSS0_SI688
253
+ MCTM0_SI1350
254
+ MCTM0_SI1980
255
+ MDAC0_SI631
256
+ MDAS0_SI1896
257
+ MDAS0_SI636
258
+ MDBP0_SI528
259
+ MDBP0_SX438
260
+ MDCD0_SI785
261
+ MDCD0_SX335
262
+ MDCM0_SI1480
263
+ MDDC0_SI1419
264
+ MDED0_SI540
265
+ MDEF0_SI1123
266
+ MDEM0_SA1
267
+ MDEM0_SI608
268
+ MDEM0_SI800
269
+ MDEM0_SX428
270
+ MDHS0_SI900
271
+ MDJM0_SI1455
272
+ MDKS0_SX166
273
+ MDKS0_SX346
274
+ MDLB0_SI1306
275
+ MDLB0_SX136
276
+ MDLB0_SX406
277
+ MDLC0_SI1395
278
+ MDLC0_SI2025
279
+ MDLC1_SI1435
280
+ MDLH0_SX160
281
+ MDLH0_SX430
282
+ MDLM0_SI604
283
+ MDLR0_SX333
284
+ MDLR1_SI669
285
+ MDMA0_SX170
286
+ MDMA0_SX350
287
+ MDMA0_SX440
288
+ MDNS0_SI1011
289
+ MDNS0_SI873
290
+ MDPB0_SI1760
291
+ MDPB0_SI866
292
+ MDRD0_SI752
293
+ MDSJ0_SI1462
294
+ MDSJ0_SX438
295
+ MDWD0_SI1260
296
+ MDWH0_SA1
297
+ MDWH0_SI1168
298
+ MDWH0_SI665
299
+ MDWM0_SI916
300
+ MEDR0_SI2004
301
+ MEFG0_SI491
302
+ MEFG0_SI598
303
+ MEGJ0_SA1
304
+ MEGJ0_SI1337
305
+ MEGJ0_SI707
306
+ MEGJ0_SX167
307
+ MEJS0_SI1240
308
+ MESG0_SI702
309
+ MESJ0_SI2039
310
+ MFWK0_SX349
311
+ MFXS0_SX324
312
+ MFXV0_SI1005
313
+ MFXV0_SI1342
314
+ MGAF0_SI1282
315
+ MGAG0_SI691
316
+ MGAK0_SI1036
317
+ MGAK0_SX136
318
+ MGAR0_SX312
319
+ MGAW0_SI1165
320
+ MGES0_SX311
321
+ MGJC0_SX435
322
+ MGRL0_SX327
323
+ MGRP0_SI1317
324
+ MGRP0_SX327
325
+ MGSH0_SI1176
326
+ MGSH0_SI546
327
+ MGSL0_SI797
328
+ MGXP0_SI1087
329
+ MGXP0_SI525
330
+ MHBS0_SI945
331
+ MHIT0_SI983
332
+ MHMG0_SI735
333
+ MHMR0_SI1692
334
+ MILB0_SI903
335
+ MJAC0_SI701
336
+ MJAC0_SX251
337
+ MJAE0_SX84
338
+ MJAI0_SI682
339
+ MJAI0_SI710
340
+ MJDC0_SI531
341
+ MJDE0_SA1
342
+ MJDE0_SI1120
343
+ MJDE0_SI490
344
+ MJDE0_SX220
345
+ MJDM0_SI1340
346
+ MJDM0_SX170
347
+ MJDM0_SX350
348
+ MJEB0_SX170
349
+ MJEB1_SI1467
350
+ MJEB1_SI837
351
+ MJFR0_SA1
352
+ MJFR0_SX435
353
+ MJHI0_SI1328
354
+ MJJJ0_SI1163
355
+ MJJM0_SI1251
356
+ MJLB0_SI1616
357
+ MJLS0_SI1726
358
+ MJMA0_SI2125
359
+ MJMD0_SI2288
360
+ MJMM0_SI1255
361
+ MJMM0_SX175
362
+ MJPG0_SI1821
363
+ MJPM0_SI1368
364
+ MJPM1_SX311
365
+ MJRA0_SX336
366
+ MJRG0_SI736
367
+ MJRG0_SX352
368
+ MJRH0_SI1840
369
+ MJRH1_SI1558
370
+ MJRK0_SI880
371
+ MJRP0_SI1845
372
+ MJSR0_SI2054
373
+ MJSR0_SI794
374
+ MJWG0_SI813
375
+ MJWG0_SI895
376
+ MJWG0_SX175
377
+ MJWS0_SX333
378
+ MJWT0_SI1291
379
+ MJWT0_SI1381
380
+ MJXL0_SI1172
381
+ MKAG0_SI979
382
+ MKAH0_SX178
383
+ MKAM0_SI1250
384
+ MKAM0_SI1465
385
+ MKDD0_SI1567
386
+ MKDD0_SI2197
387
+ MKDD0_SI937
388
+ MKDT0_SI814
389
+ MKES0_SI623
390
+ MKLS0_SI1437
391
+ MKLS0_SI2067
392
+ MKLS1_SI915
393
+ MKLW0_SI1571
394
+ MKLW0_SX311
395
+ MKRG0_SI861
396
+ MKXL0_SI1815
397
+ MKXL0_SI1958
398
+ MLBC0_SI1239
399
+ MLEL0_SI616
400
+ MLEL0_SX166
401
+ MLJC0_SI1225
402
+ MLJH0_SA1
403
+ MLJH0_SA2
404
+ MLJH0_SI1422
405
+ MLJH0_SI694
406
+ MLJH0_SX244
407
+ MLSH0_SI1417
408
+ MLSH0_SX247
409
+ MMAA0_SI1588
410
+ MMAA0_SI845
411
+ MMAB1_SI864
412
+ MMAB1_SX324
413
+ MMAG0_SA1
414
+ MMAG0_SI1756
415
+ MMAG0_SX136
416
+ MMAR0_SI1966
417
+ MMAR0_SX166
418
+ MMAR0_SX346
419
+ MMBS0_SI521
420
+ MMBS0_SX161
421
+ MMCC0_SI1338
422
+ MMDB0_SI987
423
+ MMDG0_SI1780
424
+ MMDM0_SI1311
425
+ MMDM1_SX153
426
+ MMDM1_SX333
427
+ MMEB0_SX327
428
+ MMGC0_SI1305
429
+ MMGG0_SI1079
430
+ MMGG0_SX449
431
+ MMLM0_SI2150
432
+ MMPM0_SX161
433
+ MMRP0_SX324
434
+ MMSM0_SI1106
435
+ MMSM0_SI476
436
+ MMVP0_SI654
437
+ MMVP0_SX347
438
+ MMWB0_SA1
439
+ MMWB0_SI2249
440
+ MMWB0_SX359
441
+ MMWB0_SX449
442
+ MNTW0_SI1068
443
+ MNTW0_SI1698
444
+ MPEB0_SI600
445
+ MPFU0_SI1258
446
+ MPGH0_SI675
447
+ MPGR0_SI1410
448
+ MPGR1_SI1499
449
+ MPMB0_SA1
450
+ MPMB0_SA2
451
+ MPMB0_SI1501
452
+ MPMB0_SI2131
453
+ MPMB0_SI871
454
+ MPMB0_SX151
455
+ MPMB0_SX331
456
+ MPMB0_SX421
457
+ MPMB0_SX61
458
+ MPPC0_SI1412
459
+ MPRB0_SI1215
460
+ MPRB0_SI575
461
+ MPRD0_SI801
462
+ MPRD0_SX171
463
+ MPRK0_SA1
464
+ MPRK0_SI1097
465
+ MPRK0_SI467
466
+ MPRK0_SX287
467
+ MRAB0_SI1854
468
+ MRAB1_SI848
469
+ MRAI0_SI2052
470
+ MRAI0_SI792
471
+ MRAI0_SX432
472
+ MRAM0_SI1951
473
+ MRCG0_SA2
474
+ MRCG0_SI1428
475
+ MRCG0_SX348
476
+ MRCG0_SX438
477
+ MRCW0_SI741
478
+ MRDM0_SI1044
479
+ MRDM0_SX335
480
+ MREE0_SI1104
481
+ MREE0_SI1959
482
+ MREH1_SA1
483
+ MREH1_SI1599
484
+ MREH1_SI969
485
+ MREM0_SI511
486
+ MRFK0_SI1076
487
+ MRFL0_SI1156
488
+ MRFL0_SI526
489
+ MRFL0_SX166
490
+ MRGM0_SI532
491
+ MRGM0_SX172
492
+ MRGM0_SX442
493
+ MRGS0_SI1356
494
+ MRGS0_SI726
495
+ MRGS0_SX6
496
+ MRJB1_SI1413
497
+ MRJB1_SI2021
498
+ MRJB1_SX120
499
+ MRJH0_SI1519
500
+ MRJH0_SI889
501
+ MRJH0_SX169
502
+ MRJT0_SI868
503
+ MRJT0_SX58
504
+ MRKM0_SI1267
505
+ MRKM0_SI1391
506
+ MRKM0_SI637
507
+ MRLJ0_SI790
508
+ MRLJ1_SI2301
509
+ MRLK0_SI1468
510
+ MRLR0_SI1196
511
+ MRML0_SA1
512
+ MRML0_SI1421
513
+ MRML0_SX161
514
+ MRML0_SX251
515
+ MRMS0_SI2057
516
+ MRRE0_SA1
517
+ MRRE0_SI1334
518
+ MRRE0_SI952
519
+ MRSO0_SI1206
520
+ MRSP0_SI1429
521
+ MRTC0_SI1458
522
+ MRTJ0_SA1
523
+ MRTJ0_SI772
524
+ MRTJ0_SX142
525
+ MRTJ0_SX232
526
+ MRTJ0_SX52
527
+ MRWS0_SI1102
528
+ MRXB0_SI2215
529
+ MRXB0_SI955
530
+ MSAS0_SI1376
531
+ MSAS0_SI746
532
+ MSDH0_SI980
533
+ MSDH0_SX170
534
+ MSDS0_SI1077
535
+ MSDS0_SX267
536
+ MSDS0_SX357
537
+ MSEM1_SI2070
538
+ MSEM1_SI810
539
+ MSFH0_SA1
540
+ MSFH0_SI1738
541
+ MSFH0_SX136
542
+ MSFH0_SX406
543
+ MSFV0_SI632
544
+ MSJK0_SI1596
545
+ MSJK0_SX336
546
+ MSMC0_SI509
547
+ MSMR0_SI1150
548
+ MSMS0_SI1433
549
+ MSRR0_SI1761
550
+ MSRR0_SI501
551
+ MSTF0_SI852
552
+ MSVS0_SI2198
553
+ MSVS0_SI938
554
+ MSVS0_SX398
555
+ MTAB0_SI1572
556
+ MTAB0_SX312
557
+ MTAT0_SA1
558
+ MTAT0_SI1110
559
+ MTAT0_SI811
560
+ MTAT1_SI779
561
+ MTAT1_SX149
562
+ MTAT1_SX329
563
+ MTBC0_SI543
564
+ MTCS0_SI712
565
+ MTDB0_SI1401
566
+ MTDB0_SI771
567
+ MTDP0_SA1
568
+ MTDP0_SI1521
569
+ MTDP0_SX171
570
+ MTDP0_SX351
571
+ MTER0_SA1
572
+ MTER0_SI1157
573
+ MTER0_SX437
574
+ MTJG0_SX170
575
+ MTJS0_SA2
576
+ MTJS0_SI1822
577
+ MTJS0_SI562
578
+ MTJS0_SX382
579
+ MTJU0_SI2020
580
+ MTKD0_SI630
581
+ MTKP0_SI2283
582
+ MTKP0_SI454
583
+ MTLB0_SI1134
584
+ MTLB0_SX324
585
+ MTLC0_SI1313
586
+ MTLC0_SI1477
587
+ MTML0_SX435
588
+ MTMN0_SI582
589
+ MTMT0_SI488
590
+ MTPP0_SI1508
591
+ MTPR0_SI2230
592
+ MTPR0_SX160
593
+ MTPR0_SX430
594
+ MTQC0_SA1
595
+ MTQC0_SI1441
596
+ MTQC0_SX181
597
+ MTQC0_SX451
598
+ MTRC0_SI589
599
+ MTRR0_SI918
600
+ MTRT0_SI1227
601
+ MTXS0_SI1060
602
+ MTXS0_SI2320
603
+ MTXS0_SX160
604
+ MTXS0_SX430
605
+ MVJH0_SI1556
606
+ MVLO0_SI517
607
+ MWAC0_SI1601
608
+ MWAC0_SX161
609
+ MWAC0_SX251
610
+ MWAR0_SI1045
611
+ MWDK0_SI1436
612
+ MWEM0_SX420
613
+ MWRE0_SA2
614
+ MWRE0_SI1057
615
+ MWRE0_SX67
616
+ MWRP0_SI1443
617
+ MWSB0_SI996
618
+ MWSH0_SI1426
619
+ MWSH0_SI796
620
+ MWSH0_SX166
fairseq/examples/wav2vec/unsupervised/data/__init__.py ADDED
@@ -0,0 +1,13 @@
+ # Copyright (c) Facebook, Inc. and its affiliates.
+ #
+ # This source code is licensed under the MIT license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ from .extracted_features_dataset import ExtractedFeaturesDataset
+ from .random_input_dataset import RandomInputDataset
+
+
+ __all__ = [
+     "ExtractedFeaturesDataset",
+     "RandomInputDataset",
+ ]
fairseq/examples/wav2vec/unsupervised/data/extracted_features_dataset.py ADDED
@@ -0,0 +1,167 @@
+ # Copyright (c) Facebook, Inc. and its affiliates.
+ #
+ # This source code is licensed under the MIT license found in the
+ # LICENSE file in the root directory of this source tree.
+
+
+ import logging
+ import os
+ import contextlib
+
+ import numpy as np
+ import torch
+
+ from fairseq.data import FairseqDataset, data_utils
+
+
+ logger = logging.getLogger(__name__)
+
+
+ class ExtractedFeaturesDataset(FairseqDataset):
+     def __init__(
+         self,
+         path,
+         split,
+         min_length=3,
+         max_length=None,
+         labels=None,
+         label_dict=None,
+         shuffle=True,
+         sort_by_length=True,
+         aux_target_postfix=None,
+     ):
+         super().__init__()
+
+         self.min_length = min_length
+         self.max_length = max_length
+         self.shuffle = shuffle
+         self.sort_by_length = sort_by_length
+         self.label_dict = label_dict
+
+         if labels is not None:
+             assert label_dict is not None
+
+         self.sizes = []
+         self.offsets = []
+         self.labels = []
+         self.aux_tgt = None
+
+         path = os.path.join(path, split)
+         data_path = path
+         self.data = np.load(data_path + ".npy", mmap_mode="r")
+
+         offset = 0
+         skipped = 0
+
+         if not os.path.exists(path + f".{labels}"):
+             labels = None
+
+         with open(data_path + ".lengths", "r") as len_f, open(
+             path + f".{labels}", "r"
+         ) if labels is not None else contextlib.ExitStack() as lbl_f:
+             for line in len_f:
+                 length = int(line.rstrip())
+                 lbl = None if labels is None else next(lbl_f).rstrip().split()
+                 if length >= min_length and (
+                     max_length is None or length <= max_length
+                 ):
+                     self.sizes.append(length)
+                     self.offsets.append(offset)
+                     if lbl is not None:
+                         self.labels.append(lbl)
+                 offset += length
+
+         self.sizes = np.asarray(self.sizes)
+         self.offsets = np.asarray(self.offsets)
+
+         if aux_target_postfix is not None:
+             if not os.path.exists(path+f".{aux_target_postfix}"):
+                 logger.info(f"auxaliry target for {split} missing")
+             else:
+                 with open(path+f".{aux_target_postfix}", "r") as t_f:
+                     self.aux_tgt = [
+                         torch.LongTensor(list(map(int,seg.strip().split())))\
+                             for seg in t_f]
+
+         logger.info(f"loaded {len(self.offsets)}, skipped {skipped} samples")
+
+     def __getitem__(self, index):
+         offset = self.offsets[index]
+         end = self.sizes[index] + offset
+         feats = torch.from_numpy(self.data[offset:end].copy()).float()
+
+         res = {"id": index, "features": feats}
+         if len(self.labels) > 0:
+             res["target"] = self.label_dict.encode_line(
+                 self.labels[index],
+                 line_tokenizer=lambda x: x,
+                 append_eos=False,
+             )
+
+         if self.aux_tgt:
+             res["aux_target"] = self.aux_tgt[index]
+
+         return res
+
+     def __len__(self):
+         return len(self.sizes)
+
+     def collater(self, samples):
+         if len(samples) == 0:
+             return {}
+
+         features = [s["features"] for s in samples]
+         sizes = [len(s) for s in features]
+
+         target_size = max(sizes)
+
+         collated_features = features[0].new_zeros(
+             len(features), target_size, features[0].size(-1)
+         )
+         padding_mask = torch.BoolTensor(collated_features.shape[:-1]).fill_(False)
+         for i, (f, size) in enumerate(zip(features, sizes)):
+             collated_features[i, :size] = f
+             padding_mask[i, size:] = True
+
+         res = {
+             "id": torch.LongTensor([s["id"] for s in samples]),
+             "net_input": {"features": collated_features, "padding_mask": padding_mask},
+         }
+
+         if len(self.labels) > 0:
+             target = data_utils.collate_tokens(
+                 [s["target"] for s in samples],
+                 pad_idx=self.label_dict.pad(),
+                 left_pad=False,
+             )
+             res["target"] = target
+
+         if self.aux_tgt:
+             idxs = torch.nn.utils.rnn.pad_sequence(
+                 [s["aux_target"] for s in samples],
+                 batch_first=True,
+                 padding_value=-1,
+             )
+             res["net_input"]["aux_target"] = idxs
+
+         return res
+
+     def num_tokens(self, index):
+         return self.size(index)
+
+     def size(self, index):
+         return self.sizes[index]
+
+     def ordered_indices(self):
+         """Return an ordered list of indices. Batches will be constructed based
+         on this order."""
+         if self.shuffle:
+             order = [np.random.permutation(len(self))]
+         else:
+             order = [np.arange(len(self))]
+
+         if self.sort_by_length:
+             order.append(self.sizes)
+             return np.lexsort(order)[::-1]
+         else:
+             return order[0]
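Not part of the commit: a minimal usage sketch for the ExtractedFeaturesDataset added above, assuming a directory of precomputed features (train.npy and train.lengths, plus an optional train.phn label file) and a fairseq Dictionary for those labels; the directory and dictionary paths below are illustrative only.

from fairseq.data import Dictionary
from examples.wav2vec.unsupervised.data import ExtractedFeaturesDataset  # assumes the fairseq checkout is on PYTHONPATH

label_dict = Dictionary.load("prepared_audio/dict.phn.txt")  # hypothetical dictionary path
dataset = ExtractedFeaturesDataset(
    path="prepared_audio",  # directory holding train.npy / train.lengths / train.phn
    split="train",
    labels="phn",           # label-file suffix; the constructor skips labels if the file is missing
    label_dict=label_dict,
)
# Build one padded mini-batch with the dataset's own collater.
batch = dataset.collater([dataset[i] for i in range(4)])
print(batch["net_input"]["features"].shape)      # (4, longest_length, feature_dim)
print(batch["net_input"]["padding_mask"].shape)  # (4, longest_length)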
fairseq/examples/wav2vec/unsupervised/kaldi_self_train/st/cmd.sh ADDED
@@ -0,0 +1,15 @@
+ # you can change cmd.sh depending on what type of queue you are using.
+ # If you have no queueing system and want to run on a local machine, you
+ # can change all instances 'queue.pl' to run.pl (but be careful and run
+ # commands one by one: most recipes will exhaust the memory on your
+ # machine). queue.pl works with GridEngine (qsub). slurm.pl works
+ # with slurm. Different queues are configured differently, with different
+ # queue names and different ways of specifying things like memory;
+ # to account for these differences you can create and edit the file
+ # conf/queue.conf to match your queue's configuration. Search for
+ # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
+ # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
+
+ export train_cmd="run.pl --mem 2G"
+ export decode_cmd="run.pl --mem 4G"
+ export mkgraph_cmd="run.pl --mem 8G"
fairseq/examples/wav2vec/unsupervised/kaldi_self_train/st/local/prepare_lang.sh ADDED
@@ -0,0 +1,37 @@
+ #!/bin/bash
+
+ sil_prob=0.5
+ num_sil_states=3
+ num_nonsil_states=1
+
+ . ./cmd.sh
+ . ./path.sh
+ . parse_options.sh
+
+ set -eux
+
+ dict=$1
+ data_dir=$2
+
+ dict_dir=$data_dir/local/dict
+ tmplm_dir=$data_dir/local/lang_tmp
+ lm_dir=$data_dir/lang
+
+ mkdir -p $dict_dir $tmplm_dir $lm_dir
+
+ # prepare dict
+ echo "SIL" > $dict_dir/silence_phones.txt
+ echo "SIL" > $dict_dir/optional_silence.txt
+ awk '{print $1}' $dict > $dict_dir/nonsilence_phones.txt
+
+ echo "SIL SIL" > $dict_dir/lexicon.txt
+ echo "<UNK> SIL" >> $dict_dir/lexicon.txt
+ awk '{print $1" "$1}' $dict >> $dict_dir/lexicon.txt
+
+ echo "SIL" > $dict_dir/extra_questions.txt
+ awk '{printf $1" "} END {printf "\n"}' $dict >> $dict_dir/extra_questions.txt
+
+ # prepare lang
+ utils/prepare_lang.sh --sil-prob $sil_prob --position-dependent-phones false \
+   --num_sil_states $num_sil_states --num_nonsil_states $num_nonsil_states \
+   $dict_dir "<UNK>" $tmplm_dir $lm_dir
fairseq/examples/wav2vec/unsupervised/kaldi_self_train/st/local/prepare_lm.sh ADDED
@@ -0,0 +1,35 @@
+ #!/usr/bin/env bash
+
+ langdir=""
+ lmdir=""
+
+ . ./cmd.sh
+ . ./path.sh
+ . parse_options.sh
+
+ arpa_lm=$1
+ data=$2
+
+ if [ -z $langdir ]; then
+   langdir=$data/lang
+ fi
+ if [ -z $lmdir ]; then
+   lmdir=$data/lang_test
+ fi
+
+ if [ ! -d $langdir ]; then
+   echo "$langdir not found. run local/prepare_lang.sh first" && exit 1
+ fi
+
+ mkdir -p $lmdir
+ cp -r $langdir/* $lmdir
+
+ if [[ "$arpa_lm" == *.gz ]]; then
+   gunzip -c $arpa_lm | arpa2fst --disambig-symbol=#0 --read-symbol-table=$lmdir/words.txt - $lmdir/G.fst
+ else
+   arpa2fst --disambig-symbol=#0 --read-symbol-table=$lmdir/words.txt $arpa_lm $lmdir/G.fst
+ fi
+ fstisstochastic $lmdir/G.fst
+ utils/validate_lang.pl $lmdir || exit 1
+
+ echo "done preparing lm ($lmdir)"
fairseq/examples/wav2vec/unsupervised/kaldi_self_train/st/local/score.sh ADDED
@@ -0,0 +1,63 @@
+ #!/usr/bin/env bash
+ # Copyright 2012 Johns Hopkins University (Author: Daniel Povey)
+ #           2014 Guoguo Chen
+ # Apache 2.0
+
+ [ -f ./path.sh ] && . ./path.sh
+
+ # begin configuration section.
+ cmd=run.pl
+ stage=0
+ decode_mbr=true
+ word_ins_penalty=0.0,0.5,1.0
+ min_lmwt=7
+ max_lmwt=17
+ iter=final
+ #end configuration section.
+
+ [ -f ./path.sh ] && . ./path.sh
+ . parse_options.sh || exit 1;
+
+ if [ $# -ne 3 ]; then
+   echo "Usage: local/score.sh [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <decode-dir>"
+   echo " Options:"
+   echo "    --cmd (run.pl|queue.pl...)      # specify how to run the sub-processes."
+   echo "    --stage (0|1|2)                 # start scoring script from part-way through."
+   echo "    --decode_mbr (true/false)       # maximum bayes risk decoding (confusion network)."
+   echo "    --min_lmwt <int>                # minumum LM-weight for lattice rescoring "
+   echo "    --max_lmwt <int>                # maximum LM-weight for lattice rescoring "
+   exit 1;
+ fi
+
+ data=$1
+ lang_or_graph=$2
+ dir=$3
+
+ symtab=$lang_or_graph/words.txt
+
+ for f in $symtab $dir/lat.1.gz $data/text; do
+   [ ! -f $f ] && echo "score.sh: no such file $f" && exit 1;
+ done
+
+ mkdir -p $dir/scoring/log
+
+ cat $data/text | sed 's:<NOISE>::g' | sed 's:<SPOKEN_NOISE>::g' > $dir/scoring/test_filt.txt
+
+ for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
+   $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.$wip.log \
+     lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \
+     lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \
+     lattice-best-path --word-symbol-table=$symtab \
+       ark:- ark,t:$dir/scoring/LMWT.$wip.tra || exit 1;
+ done
+
+ # Note: the double level of quoting for the sed command
+ for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
+   $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.$wip.log \
+     cat $dir/scoring/LMWT.$wip.tra \| \
+     utils/int2sym.pl -f 2- $symtab \| sed 's:\<UNK\>::g' \| \
+     compute-wer --text --mode=present \
+       ark:$dir/scoring/test_filt.txt ark,p:- ">&" $dir/wer_LMWT_$wip || exit 1;
+ done
+
+ exit 0;
fairseq/examples/wav2vec/unsupervised/kaldi_self_train/st/local/show_wer.sh ADDED
@@ -0,0 +1,52 @@
+ #!/bin/bash
+
+ split="dev_other"
+ ref_data=""
+ get_best_wer=true
+ dec_name="decode"
+ graph_name="graph"
+
+ . ./cmd.sh
+ . ./path.sh
+ . parse_options.sh
+
+ exp_root=$1
+
+ set -eu
+
+ echo "==== WER w.r.t. pseudo transcript"
+ for x in $exp_root/*/${dec_name}_${split}*; do grep WER $x/wer_* 2>/dev/null | utils/best_wer.sh; done
+
+
+ if [ ! -z $ref_data ]; then
+   echo "==== WER w.r.t. real transcript (select based on pseudo WER)"
+   ref_txt=$ref_data/$split/text
+   for x in $exp_root/*/${dec_name}_${split}*; do
+     lang=$(dirname $x)/$graph_name
+
+     lmwt=$(
+       grep WER $x/wer_* 2>/dev/null | utils/best_wer.sh |
+       sed 's/.*wer_\(.*\)$/\1/g' | sed 's/_/./g'
+     )
+     tra=$x/scoring/$lmwt.tra
+     cat $tra | utils/int2sym.pl -f 2- $lang/words.txt | sed 's:<UNK>::g' | sed 's:<SIL>::g' | \
+       compute-wer --text --mode=present \
+       ark:$ref_txt ark,p:- 2> /dev/null | grep WER | xargs -I{} echo {} $tra
+   done
+ fi
+
+ if [ ! -z $ref_data ] && $get_best_wer; then
+   echo "==== WER w.r.t. real transcript (select based on true WER)"
+   ref_txt=$ref_data/$split/text
+   for x in $exp_root/*/${dec_name}_${split}*; do
+     lang=$(dirname $x)/$graph_name
+
+     for tra in $x/scoring/*.tra; do
+       cat $tra | utils/int2sym.pl -f 2- $lang/words.txt | sed 's:<UNK>::g' | sed 's:<SIL>::g' | \
+         compute-wer --text --mode=present \
+         ark:$ref_txt ark,p:- 2> /dev/null | grep WER | xargs -I{} echo {} $tra
+     done | sort -k2n | head -n1
+   done
+ fi
+
+ exit 0;
fairseq/examples/wav2vec/unsupervised/kaldi_self_train/st/local/train_subset_lgbeam.sh ADDED
@@ -0,0 +1,129 @@
1
+ #!/usr/bin/env bash
2
+
3
+ out_root=/tmp
4
+ out_name=train_${RANDOM}
5
+ num_nonsil_states=1
6
+
7
+ valid="dev_other"
8
+ train="train"
9
+ mono_size="-1" # 2000
10
+ tri1_size="-1" # 5000
11
+ tri2b_size="-1" # 10000
12
+ tri3b_size="-1" # 10000
13
+
14
+ # Acoustic model parameters
15
+ numLeavesTri1=2000
16
+ numGaussTri1=10000
17
+ numLeavesMLLT=2500
18
+ numGaussMLLT=15000
19
+ numLeavesSAT=2500
20
+ numGaussSAT=15000
21
+
22
+ stage=1
23
+ max_stage=1
24
+
25
+ . ./cmd.sh
26
+ . ./path.sh
27
+ . parse_options.sh
28
+
29
+ data=$1
30
+ lang=$2
31
+ lang_test=$3
32
+
33
+ exp_root=$out_root/$out_name
34
+
35
+ # you might not want to do this for interactive shells.
36
+ set -e
37
+
38
+
39
+ if [ $stage -le 1 ] && [ $max_stage -ge 1 ]; then
40
+ # train a monophone system
41
+ if [ ! $mono_size -eq -1 ]; then
42
+ utils/subset_data_dir.sh $data/$train $mono_size $data/${train}_${mono_size}
43
+ mono_train=${train}_${mono_size}
44
+ else
45
+ mono_train=${train}
46
+ fi
47
+
48
+ steps/train_mono.sh --boost-silence 1.25 --nj 20 --cmd "$train_cmd" \
49
+ --initial-beam 40 --regular-beam 60 --retry-beam 120 \
50
+ $data/$mono_train $lang $exp_root/mono
51
+
52
+ utils/mkgraph.sh $lang_test $exp_root/mono $exp_root/mono/graph
53
+ steps/decode.sh --nj 20 --cmd "$decode_cmd" \
54
+ $exp_root/mono/graph $data/$valid $exp_root/mono/decode_$valid &
55
+ fi
56
+
57
+
58
+ if [ $stage -le 2 ] && [ $max_stage -ge 2 ]; then
59
+ # train a first delta + delta-delta triphone system on a subset of 5000 utterances
60
+ if [ ! $tri1_size -eq -1 ]; then
61
+ utils/subset_data_dir.sh $data/$train $tri1_size $data/${train}_${tri1_size}
62
+ tri1_train=${train}_${tri1_size}
63
+ else
64
+ tri1_train=${train}
65
+ fi
66
+
67
+ steps/align_si.sh --boost-silence 1.25 --nj 10 --cmd "$train_cmd" \
68
+ $data/$tri1_train $lang \
69
+ $exp_root/mono $exp_root/mono_ali_${tri1_train}
70
+
71
+ steps_gan/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \
72
+ --num_nonsil_states $num_nonsil_states $numLeavesTri1 $numGaussTri1 \
73
+ $data/$tri1_train $lang \
74
+ $exp_root/mono_ali_${tri1_train} $exp_root/tri1
75
+
76
+ utils/mkgraph.sh $lang_test $exp_root/tri1 $exp_root/tri1/graph
77
+ steps/decode.sh --nj 20 --cmd "$decode_cmd" \
78
+ $exp_root/tri1/graph $data/$valid $exp_root/tri1/decode_$valid &
79
+ fi
80
+
81
+ if [ $stage -le 3 ] && [ $max_stage -ge 3 ]; then
82
+ # train an LDA+MLLT system.
83
+ if [ ! $tri2b_size -eq -1 ]; then
84
+ utils/subset_data_dir.sh $data/$train $tri2b_size $data/${train}_${tri2b_size}
85
+ tri2b_train=${train}_${tri2b_size}
86
+ else
87
+ tri2b_train=${train}
88
+ fi
89
+
90
+ steps/align_si.sh --nj 10 --cmd "$train_cmd" \
91
+ $data/$tri2b_train $lang \
92
+ $exp_root/tri1 $exp_root/tri1_ali_${tri2b_train}
93
+
94
+ steps_gan/train_lda_mllt.sh --cmd "$train_cmd" \
95
+ --num_nonsil_states $num_nonsil_states \
96
+ --splice-opts "--left-context=3 --right-context=3" $numLeavesMLLT $numGaussMLLT \
97
+ $data/$tri2b_train $lang \
98
+ $exp_root/tri1_ali_${tri2b_train} $exp_root/tri2b
99
+
100
+ utils/mkgraph.sh $lang_test $exp_root/tri2b $exp_root/tri2b/graph
101
+ steps/decode.sh --nj 20 --cmd "$decode_cmd" \
102
+ $exp_root/tri2b/graph $data/$valid $exp_root/tri2b/decode_$valid &
103
+ fi
104
+
105
+
106
+ if [ $stage -le 4 ] && [ $max_stage -ge 4 ]; then
107
+ # Train tri3b, which is LDA+MLLT+SAT (optionally on a subset of the training data)
108
+ if [ ! $tri3b_size -eq -1 ]; then
109
+ utils/subset_data_dir.sh $data/$train $tri3b_size $data/${train}_${tri3b_size}
110
+ tri3b_train=${train}_${tri3b_size}
111
+ else
112
+ tri3b_train=${train}
113
+ fi
114
+
115
+ steps/align_si.sh --nj 10 --cmd "$train_cmd" --use-graphs true \
116
+ $data/$tri3b_train $lang \
117
+ $exp_root/tri2b $exp_root/tri2b_ali_${tri2b_train}
118
+
119
+ steps_gan/train_sat.sh --cmd "$train_cmd" \
120
+ --num_nonsil_states $num_nonsil_states $numLeavesSAT $numGaussSAT \
121
+ $data/$tri3b_train $lang \
122
+ $exp_root/tri2b_ali_${tri2b_train} $exp_root/tri3b
123
+
124
+ utils/mkgraph.sh $lang_test $exp_root/tri3b $exp_root/tri3b/graph
125
+ steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \
126
+ $exp_root/tri3b/graph $data/$valid $exp_root/tri3b/decode_$valid &
127
+ fi
128
+
129
+ wait
fairseq/examples/wav2vec/unsupervised/kaldi_self_train/st/local/unsup_select.py ADDED
@@ -0,0 +1,135 @@
1
+ """
2
+ Implement unsupervised metric for decoding hyperparameter selection:
3
+ $$ alpha * LM_PPL + ViterbiUER(%) * 100 $$
4
+ """
5
+ import argparse
6
+ import logging
7
+ import math
8
+ import sys
9
+
10
+ import kenlm
11
+ import editdistance
12
+ from g2p_en import G2p
13
+
14
+ logging.root.setLevel(logging.INFO)
15
+ logging.basicConfig(stream=sys.stdout, level=logging.INFO)
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
+ def get_parser():
20
+ parser = argparse.ArgumentParser()
21
+ parser.add_argument("ref_tra", help="reference pseudo labels")
22
+ parser.add_argument("hyp_tra", help="decoded pseudo labels to be assess")
23
+ parser.add_argument("--kenlm_path", default="/checkpoint/abaevski/data/speech/libri/librispeech_lm_novox.phnc_o5.bin", help="")
24
+ parser.add_argument("--uppercase", action="store_true", help="")
25
+ parser.add_argument("--skipwords", default="", help="")
26
+ parser.add_argument("--gt_tra", default="", help="ground truth pseudo labels for computing oracle WER")
27
+ parser.add_argument("--min_vt_uer", default=0.0, type=float)
28
+ parser.add_argument("--phonemize", action="store_true", help="phonemize word hypotheses, used when reference is phone transcript")
29
+ parser.add_argument("--phonemize_lexicon", default="", type=str, help="use a lexicon for phonemizing")
30
+ return parser
31
+
32
+ def load_tra(tra_path):
33
+ with open(tra_path, "r") as f:
34
+ uid_to_tra = {}
35
+ for line in f:
36
+ toks = line.rstrip().split()
37
+ uid, tra = toks[0], " ".join(toks[1:])
38
+ uid_to_tra[uid] = tra
39
+ logger.debug(f"loaded {len(uid_to_tra)} utterances from {tra_path}")
40
+ return uid_to_tra
41
+
42
+ def load_lex(lex_path):
43
+ with open(lex_path, "r") as f:
44
+ w2p = {}
45
+ for line in f:
46
+ w, p = line.rstrip().split(None, 1)
47
+ w2p[w] = p.split()
48
+ return w2p
49
+
50
+ def compute_wer(ref_uid_to_tra, hyp_uid_to_tra, g2p, g2p_dict):
51
+ d_cnt = 0
52
+ w_cnt = 0
53
+ w_cnt_h = 0
54
+ for uid in hyp_uid_to_tra:
55
+ ref = ref_uid_to_tra[uid].split()
56
+ if g2p_dict is not None:
57
+ hyp = []
58
+ for word in hyp_uid_to_tra[uid].split():
59
+ if word in g2p_dict:
60
+ hyp = hyp + g2p_dict[word]
61
+ else:
62
+ logger.warning(f"{word} not in g2p_dict")
63
+ elif g2p is not None:
64
+ hyp = g2p(hyp_uid_to_tra[uid])
65
+ hyp = [p for p in hyp if p != "'" and p != " "]
66
+ hyp = [p[:-1] if p[-1].isnumeric() else p for p in hyp]
67
+ else:
68
+ hyp = hyp_uid_to_tra[uid].split()
69
+ logger.debug((
70
+ f"======================\n"
71
+ f"HYP: {' '.join(hyp)}\n"
72
+ f"REF: {' '.join(ref)}"
73
+ ))
74
+ d_cnt += editdistance.eval(ref, hyp)
75
+ w_cnt += len(ref)
76
+ w_cnt_h += len(hyp)
77
+ wer = float(d_cnt) / w_cnt
78
+ logger.debug((
79
+ f"wer = {wer*100:.2f}%; num. of ref words = {w_cnt}; "
80
+ f"num. of hyp words = {w_cnt_h}; num. of sentences = {len(ref_uid_to_tra)}"
81
+ ))
82
+ return wer
83
+
84
+ def compute_lm_ppl(hyp_uid_to_tra, score_fn):
85
+ lm_score = 0.
86
+ w_cnt = 0
87
+ for hyp in hyp_uid_to_tra.values():
88
+ cur_score = score_fn(hyp)
89
+ cur_cnt = len(hyp.split()) + 1 # plus one for </s>
90
+ lm_score += cur_score
91
+ w_cnt += cur_cnt
92
+ logger.debug((
93
+ f"======================\n"
94
+ f"score sum/avg = {cur_score:.2f}/{cur_score/cur_cnt:.2f}\n"
95
+ f"hyp = {hyp}"
96
+ ))
97
+ lm_ppl = math.pow(10, -lm_score / w_cnt)
98
+ logger.debug(f"lm ppl = {lm_ppl:.2f}; num. of words = {w_cnt}")
99
+ return lm_ppl
100
+
101
+ def main():
102
+ args = get_parser().parse_args()
103
+ logger.debug(f"Args: {args}")
104
+
105
+ ref_uid_to_tra = load_tra(args.ref_tra)
106
+ hyp_uid_to_tra = load_tra(args.hyp_tra)
107
+ assert not bool(set(hyp_uid_to_tra.keys()) - set(ref_uid_to_tra.keys()))
108
+
109
+ lm = kenlm.Model(args.kenlm_path)
110
+ skipwords = set(args.skipwords.split(","))
111
+ def compute_lm_score(s):
112
+ s = " ".join(w for w in s.split() if w not in skipwords)
113
+ s = s.upper() if args.uppercase else s
114
+ return lm.score(s)
115
+
116
+ g2p, g2p_dict = None, None
117
+ if args.phonemize:
118
+ if args.phonemize_lexicon:
119
+ g2p_dict = load_lex(args.phonemize_lexicon)
120
+ else:
121
+ g2p = G2p()
122
+
123
+ wer = compute_wer(ref_uid_to_tra, hyp_uid_to_tra, g2p, g2p_dict)
124
+ lm_ppl = compute_lm_ppl(hyp_uid_to_tra, compute_lm_score)
125
+
126
+ gt_wer = -math.inf
127
+ if args.gt_tra:
128
+ gt_uid_to_tra = load_tra(args.gt_tra)
129
+ gt_wer = compute_wer(gt_uid_to_tra, hyp_uid_to_tra, None, None)
130
+
131
+ score = math.log(lm_ppl) * max(wer, args.min_vt_uer)
132
+ logging.info(f"{args.hyp_tra}: score={score:.4f}; wer={wer*100:.2f}%; lm_ppl={lm_ppl:.4f}; gt_wer={gt_wer*100:.2f}%")
133
+
134
+ if __name__ == "__main__":
135
+ main()
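The selection score computed above is the log LM perplexity of the decoded output multiplied by its Viterbi UER against the pseudo labels (floored at --min_vt_uer); the decoding run with the lowest score wins. A minimal sketch (not part of this commit) of that ranking, with made-up perplexity/UER numbers purely for illustration:

import math

# Hypothetical candidates: one entry per decoding hyperparameter setting.
candidates = {
    "lmwt=7":  {"lm_ppl": 38.2, "uer": 0.31},
    "lmwt=10": {"lm_ppl": 24.5, "uer": 0.27},
    "lmwt=13": {"lm_ppl": 21.9, "uer": 0.33},
}
min_vt_uer = 0.0  # same default as the --min_vt_uer flag above

def score(c):
    # mirrors: score = math.log(lm_ppl) * max(wer, args.min_vt_uer)
    return math.log(c["lm_ppl"]) * max(c["uer"], min_vt_uer)

best = min(candidates, key=lambda k: score(candidates[k]))
for name, c in candidates.items():
    print(f"{name}: score={score(c):.4f}")
print("selected:", best)  # the setting with the lowest combined score is kept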
fairseq/examples/wav2vec/unsupervised/kaldi_self_train/st/local/unsup_select_decode.sh ADDED
@@ -0,0 +1,37 @@
1
+ #!/bin/bash
2
+
3
+ split="dev_other"
4
+ ref_txt="" # ground truth transcript path
5
+ psd_txt="" # pseudo transcript path
6
+ get_best_wer=true
7
+ dec_name="decode"
8
+ graph_name="graph"
9
+ kenlm_path=/checkpoint/abaevski/data/speech/libri/librispeech_lm_novox.phnc_o6.bin
10
+
11
+ . ./cmd.sh
12
+ . ./path.sh
13
+ . parse_options.sh
14
+
15
+ exp_root=$1
16
+ unsup_args=""
17
+ if [ $# -ge 2 ]; then
18
+ unsup_args=$2
19
+ fi
20
+
21
+ set -eu
22
+
23
+ if [ ! -z $ref_txt ] && $get_best_wer; then
24
+ echo "==== WER w.r.t. real transcript (select based on unsupervised metric)"
25
+ for x in $exp_root/*/${dec_name}_${split}*; do
26
+ lang=$(dirname $x)/$graph_name
27
+
28
+ (
29
+ for tra in $x/scoring/*.tra; do
30
+ cat $tra | utils/int2sym.pl -f 2- $lang/words.txt | sed 's:<UNK>::g' | sed 's:<SIL>::g' > $tra.txt
31
+ python local/unsup_select.py $psd_txt $tra.txt --kenlm_path $kenlm_path --gt_tra $ref_txt $unsup_args
32
+ done 2>/dev/null | grep "score=" | sed 's/=/ /g' | sed 's/;//g' | sort -k3n | head -n1
33
+ ) &
34
+ done
35
+ fi
36
+ wait
37
+
fairseq/examples/wav2vec/unsupervised/kaldi_self_train/st/local/unsup_select_decode_word.sh ADDED
@@ -0,0 +1,35 @@
1
+ #!/bin/bash
2
+
3
+ split="dev_other"
4
+ ref_txt="" # ground truth transcript path
5
+ psd_txt="" # pseudo transcript path
6
+ get_best_wer=true
7
+ dec_name="decode"
8
+ graph_name="graph"
9
+ kenlm_path=/checkpoint/abaevski/data/speech/libri/librispeech_lm_novox.phnc_o6.bin
10
+ phonemize_lexicon=""
11
+
12
+ . ./cmd.sh
13
+ . ./path.sh
14
+ . parse_options.sh
15
+ . /private/home/wnhsu/unsup_asr/fairseq-py-unsup/env.sh
16
+
17
+ exp_root=$1
18
+
19
+ set -eu
20
+
21
+ if [ ! -z $ref_txt ] && $get_best_wer; then
22
+ echo "==== WER w.r.t. real transcript (select based on unsupervised metric)"
23
+ for x in $exp_root/*/${dec_name}_${split}*; do
24
+ lang=$(dirname $x)/$graph_name
25
+
26
+ for tra in $x/scoring/*.tra; do
27
+ cat $tra | utils/int2sym.pl -f 2- $lang/words.txt | sed 's:\<UNK\>::g' > $tra.txt
28
+ python local/unsup_select.py $psd_txt $tra.txt \
29
+ --kenlm_path $kenlm_path --gt_tra $ref_txt --phonemize \
30
+ --phonemize_lexicon "$phonemize_lexicon"
31
+ done | grep "score=" | sed 's/=/ /g' | sed 's/;//g' | sort -k3n | head -n1
32
+ done
33
+ fi
34
+
35
+
fairseq/examples/wav2vec/unsupervised/kaldi_self_train/st/steps_gan/train_deltas.sh ADDED
@@ -0,0 +1,175 @@
1
+ #!/usr/bin/env bash
2
+
3
+ # Copyright 2012 Johns Hopkins University (Author: Daniel Povey)
4
+ # Apache 2.0
5
+
6
+ # Begin configuration.
7
+ stage=-4 # This allows restarting partway through, when something went wrong.
8
+ config=
9
+ cmd=run.pl
10
+ scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
11
+ realign_iters="10 20 30";
12
+ num_iters=35 # Number of iterations of training
13
+ max_iter_inc=25 # Last iter to increase #Gauss on.
14
+ beam=10
15
+ careful=false
16
+ retry_beam=40
17
+ boost_silence=1.0 # Factor by which to boost silence likelihoods in alignment
18
+ power=0.25 # Exponent for number of gaussians according to occurrence counts
19
+ cluster_thresh=-1 # controls the final bottom-up clustering of leaves in build-tree
20
+ norm_vars=false # deprecated. Prefer --cmvn-opts "--norm-vars=true"
21
+ # use the option --cmvn-opts "--norm-means=false"
22
+ cmvn_opts=
23
+ delta_opts=
24
+ context_opts= # use "--context-width=5 --central-position=2" for quinphone
25
+ num_nonsil_states=3
26
+ # End configuration.
27
+
28
+ echo "$0 $@" # Print the command line for logging
29
+
30
+ [ -f path.sh ] && . ./path.sh;
31
+ . parse_options.sh || exit 1;
32
+
33
+ if [ $# != 6 ]; then
34
+ echo "Usage: steps/train_deltas.sh <num-leaves> <tot-gauss> <data-dir> <lang-dir> <alignment-dir> <exp-dir>"
35
+ echo "e.g.: steps/train_deltas.sh 2000 10000 data/train_si84_half data/lang exp/mono_ali exp/tri1"
36
+ echo "main options (for others, see top of script file)"
37
+ echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
38
+ echo " --config <config-file> # config containing options"
39
+ echo " --stage <stage> # stage to do partial re-run from."
40
+ exit 1;
41
+ fi
42
+
43
+ numleaves=$1
44
+ totgauss=$2
45
+ data=$3
46
+ lang=$4
47
+ alidir=$5
48
+ dir=$6
49
+
50
+ for f in $alidir/final.mdl $alidir/ali.1.gz $data/feats.scp $lang/phones.txt; do
51
+ [ ! -f $f ] && echo "train_deltas.sh: no such file $f" && exit 1;
52
+ done
53
+
54
+ numgauss=$numleaves
55
+ incgauss=$[($totgauss-$numgauss)/$max_iter_inc] # per-iter increment for #Gauss
56
+ oov=`cat $lang/oov.int` || exit 1;
57
+ ciphonelist=`cat $lang/phones/context_indep.csl` || exit 1;
58
+ nj=`cat $alidir/num_jobs` || exit 1;
59
+ mkdir -p $dir/log
60
+ echo $nj > $dir/num_jobs
61
+
62
+ utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
63
+ cp $lang/phones.txt $dir || exit 1;
64
+
65
+ sdata=$data/split$nj;
66
+ split_data.sh $data $nj || exit 1;
67
+
68
+
69
+ [ $(cat $alidir/cmvn_opts 2>/dev/null | wc -c) -gt 1 ] && [ -z "$cmvn_opts" ] && \
70
+ echo "$0: warning: ignoring CMVN options from source directory $alidir"
71
+ $norm_vars && cmvn_opts="--norm-vars=true $cmvn_opts"
72
+ echo $cmvn_opts > $dir/cmvn_opts # keep track of options to CMVN.
73
+ [ ! -z $delta_opts ] && echo $delta_opts > $dir/delta_opts
74
+
75
+ feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |"
76
+
77
+ rm $dir/.error 2>/dev/null
78
+
79
+ if [ $stage -le -3 ]; then
80
+ echo "$0: accumulating tree stats"
81
+ $cmd JOB=1:$nj $dir/log/acc_tree.JOB.log \
82
+ acc-tree-stats $context_opts \
83
+ --ci-phones=$ciphonelist $alidir/final.mdl "$feats" \
84
+ "ark:gunzip -c $alidir/ali.JOB.gz|" $dir/JOB.treeacc || exit 1;
85
+ sum-tree-stats $dir/treeacc $dir/*.treeacc 2>$dir/log/sum_tree_acc.log || exit 1;
86
+ rm $dir/*.treeacc
87
+ fi
88
+
89
+ if [ $stage -le -2 ]; then
90
+ echo "$0: getting questions for tree-building, via clustering"
91
+ # preparing questions, roots file...
92
+ cluster-phones --pdf-class-list=$(($num_nonsil_states / 2)) $context_opts \
93
+ $dir/treeacc $lang/phones/sets.int \
94
+ $dir/questions.int 2> $dir/log/questions.log || exit 1;
95
+ cat $lang/phones/extra_questions.int >> $dir/questions.int
96
+ compile-questions $context_opts $lang/topo $dir/questions.int \
97
+ $dir/questions.qst 2>$dir/log/compile_questions.log || exit 1;
98
+
99
+ echo "$0: building the tree"
100
+ $cmd $dir/log/build_tree.log \
101
+ build-tree $context_opts --verbose=1 --max-leaves=$numleaves \
102
+ --cluster-thresh=$cluster_thresh $dir/treeacc $lang/phones/roots.int \
103
+ $dir/questions.qst $lang/topo $dir/tree || exit 1;
104
+
105
+ $cmd $dir/log/init_model.log \
106
+ gmm-init-model --write-occs=$dir/1.occs \
107
+ $dir/tree $dir/treeacc $lang/topo $dir/1.mdl || exit 1;
108
+ if grep 'no stats' $dir/log/init_model.log; then
109
+ echo "** The warnings above about 'no stats' generally mean you have phones **"
110
+ echo "** (or groups of phones) in your phone set that had no corresponding data. **"
111
+ echo "** You should probably figure out whether something went wrong, **"
112
+ echo "** or whether your data just doesn't happen to have examples of those **"
113
+ echo "** phones. **"
114
+ fi
115
+
116
+ gmm-mixup --mix-up=$numgauss $dir/1.mdl $dir/1.occs $dir/1.mdl 2>$dir/log/mixup.log || exit 1;
117
+ rm $dir/treeacc
118
+ fi
119
+
120
+ if [ $stage -le -1 ]; then
121
+ # Convert the alignments.
122
+ echo "$0: converting alignments from $alidir to use current tree"
123
+ $cmd JOB=1:$nj $dir/log/convert.JOB.log \
124
+ convert-ali $alidir/final.mdl $dir/1.mdl $dir/tree \
125
+ "ark:gunzip -c $alidir/ali.JOB.gz|" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
126
+ fi
127
+
128
+ if [ $stage -le 0 ]; then
129
+ echo "$0: compiling graphs of transcripts"
130
+ $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
131
+ compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $dir/tree $dir/1.mdl $lang/L.fst \
132
+ "ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $sdata/JOB/text |" \
133
+ "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
134
+ fi
135
+
136
+ x=1
137
+ while [ $x -lt $num_iters ]; do
138
+ echo "$0: training pass $x"
139
+ if [ $stage -le $x ]; then
140
+ if echo $realign_iters | grep -w $x >/dev/null; then
141
+ echo "$0: aligning data"
142
+ mdl="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $dir/$x.mdl - |"
143
+ $cmd JOB=1:$nj $dir/log/align.$x.JOB.log \
144
+ gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam --careful=$careful "$mdl" \
145
+ "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" \
146
+ "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
147
+ fi
148
+ $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
149
+ gmm-acc-stats-ali $dir/$x.mdl "$feats" \
150
+ "ark,s,cs:gunzip -c $dir/ali.JOB.gz|" $dir/$x.JOB.acc || exit 1;
151
+ $cmd $dir/log/update.$x.log \
152
+ gmm-est --mix-up=$numgauss --power=$power \
153
+ --write-occs=$dir/$[$x+1].occs $dir/$x.mdl \
154
+ "gmm-sum-accs - $dir/$x.*.acc |" $dir/$[$x+1].mdl || exit 1;
155
+ rm $dir/$x.mdl $dir/$x.*.acc
156
+ rm $dir/$x.occs
157
+ fi
158
+ [ $x -le $max_iter_inc ] && numgauss=$[$numgauss+$incgauss];
159
+ x=$[$x+1];
160
+ done
161
+
162
+ rm $dir/final.mdl $dir/final.occs 2>/dev/null
163
+ ln -s $x.mdl $dir/final.mdl
164
+ ln -s $x.occs $dir/final.occs
165
+
166
+ steps/diagnostic/analyze_alignments.sh --cmd "$cmd" $lang $dir
167
+
168
+ # Summarize warning messages...
169
+ utils/summarize_warnings.pl $dir/log
170
+
171
+ steps/info/gmm_dir_info.pl $dir
172
+
173
+ echo "$0: Done training system with delta+delta-delta features in $dir"
174
+
175
+ exit 0
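train_deltas.sh grows the Gaussian count linearly: it starts from the number of tree leaves, adds (tot-gauss - num-leaves) / max_iter_inc Gaussians after each of the first max_iter_inc passes, and then holds steady. A small sketch (not part of the commit) of that schedule, plugging in the tri1 values train_subset_lgbeam.sh passes (2000 leaves, 10000 Gaussians):

# Mirrors incgauss=$[($totgauss-$numgauss)/$max_iter_inc] and the per-pass update.
num_leaves, tot_gauss, max_iter_inc, num_iters = 2000, 10000, 25, 35

num_gauss = num_leaves
inc_gauss = (tot_gauss - num_gauss) // max_iter_inc  # integer division, as in $[...]
schedule = []
for x in range(1, num_iters):
    schedule.append((x, num_gauss))  # pass x trains with num_gauss Gaussians
    if x <= max_iter_inc:
        num_gauss += inc_gauss       # mixed up via gmm-est --mix-up

print(schedule[:3], "...", schedule[-3:])
# passes 1-3 use 2000, 2320, 2640 Gaussians; passes 26-34 stay at 10000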
fairseq/examples/wav2vec/unsupervised/kaldi_self_train/st/steps_gan/train_lda_mllt.sh ADDED
@@ -0,0 +1,239 @@
1
+ #!/usr/bin/env bash
2
+
3
+ # Copyright 2012 Johns Hopkins University (Author: Daniel Povey)
4
+ #
5
+ # LDA+MLLT refers to the way we transform the features after computing
6
+ # the MFCCs: we splice across several frames, reduce the dimension (to 40
7
+ # by default) using Linear Discriminant Analysis (LDA), and then later estimate,
8
+ # over multiple iterations, a diagonalizing transform known as MLLT or STC.
9
+ # See http://kaldi-asr.org/doc/transform.html for more explanation.
10
+ #
11
+ # Apache 2.0.
12
+
13
+ # Begin configuration.
14
+ cmd=run.pl
15
+ config=
16
+ stage=-5
17
+ scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
18
+ realign_iters="10 20 30";
19
+ mllt_iters="2 4 6 12";
20
+ num_iters=35 # Number of iterations of training
21
+ max_iter_inc=25 # Last iter to increase #Gauss on.
22
+ dim=40
23
+ beam=10
24
+ retry_beam=40
25
+ careful=false
26
+ boost_silence=1.0 # Factor by which to boost silence likelihoods in alignment
27
+ power=0.25 # Exponent for number of gaussians according to occurrence counts
28
+ randprune=4.0 # This is approximately the ratio by which we will speed up the
29
+ # LDA and MLLT calculations via randomized pruning.
30
+ splice_opts=
31
+ cluster_thresh=-1 # controls the final bottom-up clustering of leaves in build-tree
32
+ norm_vars=false # deprecated. Prefer --cmvn-opts "--norm-vars=false"
33
+ cmvn_opts=
34
+ context_opts= # use "--context-width=5 --central-position=2" for quinphone.
35
+ # End configuration.
36
+ train_tree=true # if false, don't actually train the tree.
37
+ use_lda_mat= # If supplied, use this LDA[+MLLT] matrix.
38
+ num_nonsil_states=3
39
+
40
+ echo "$0 $@" # Print the command line for logging
41
+
42
+ [ -f path.sh ] && . ./path.sh
43
+ . parse_options.sh || exit 1;
44
+
45
+ if [ $# != 6 ]; then
46
+ echo "Usage: steps/train_lda_mllt.sh [options] <#leaves> <#gauss> <data> <lang> <alignments> <dir>"
47
+ echo " e.g.: steps/train_lda_mllt.sh 2500 15000 data/train_si84 data/lang exp/tri1_ali_si84 exp/tri2b"
48
+ echo "Main options (for others, see top of script file)"
49
+ echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
50
+ echo " --config <config-file> # config containing options"
51
+ echo " --stage <stage> # stage to do partial re-run from."
52
+ exit 1;
53
+ fi
54
+
55
+ numleaves=$1
56
+ totgauss=$2
57
+ data=$3
58
+ lang=$4
59
+ alidir=$5
60
+ dir=$6
61
+
62
+ for f in $alidir/final.mdl $alidir/ali.1.gz $data/feats.scp $lang/phones.txt; do
63
+ [ ! -f $f ] && echo "train_lda_mllt.sh: no such file $f" && exit 1;
64
+ done
65
+
66
+ numgauss=$numleaves
67
+ incgauss=$[($totgauss-$numgauss)/$max_iter_inc] # per-iter #gauss increment
68
+ oov=`cat $lang/oov.int` || exit 1;
69
+ nj=`cat $alidir/num_jobs` || exit 1;
70
+ silphonelist=`cat $lang/phones/silence.csl` || exit 1;
71
+ ciphonelist=`cat $lang/phones/context_indep.csl` || exit 1;
72
+
73
+ mkdir -p $dir/log
74
+
75
+ utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
76
+ cp $lang/phones.txt $dir || exit 1;
77
+
78
+ echo $nj >$dir/num_jobs
79
+ echo "$splice_opts" >$dir/splice_opts # keep track of frame-splicing options
80
+ # so that later stages of system building can know what they were.
81
+
82
+
83
+ [ $(cat $alidir/cmvn_opts 2>/dev/null | wc -c) -gt 1 ] && [ -z "$cmvn_opts" ] && \
84
+ echo "$0: warning: ignoring CMVN options from source directory $alidir"
85
+ $norm_vars && cmvn_opts="--norm-vars=true $cmvn_opts"
86
+ echo $cmvn_opts > $dir/cmvn_opts # keep track of options to CMVN.
87
+
88
+ sdata=$data/split$nj;
89
+ split_data.sh $data $nj || exit 1;
90
+
91
+ splicedfeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- |"
92
+ # Note: $feats gets overwritten later in the script.
93
+ feats="$splicedfeats transform-feats $dir/0.mat ark:- ark:- |"
94
+
95
+
96
+
97
+ if [ $stage -le -5 ]; then
98
+ if [ -z "$use_lda_mat" ]; then
99
+ echo "$0: Accumulating LDA statistics."
100
+ rm $dir/lda.*.acc 2>/dev/null
101
+ $cmd JOB=1:$nj $dir/log/lda_acc.JOB.log \
102
+ ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \
103
+ weight-silence-post 0.0 $silphonelist $alidir/final.mdl ark:- ark:- \| \
104
+ acc-lda --rand-prune=$randprune $alidir/final.mdl "$splicedfeats" ark,s,cs:- \
105
+ $dir/lda.JOB.acc || exit 1;
106
+ est-lda --write-full-matrix=$dir/full.mat --dim=$dim $dir/0.mat $dir/lda.*.acc \
107
+ 2>$dir/log/lda_est.log || exit 1;
108
+ rm $dir/lda.*.acc
109
+ else
110
+ echo "$0: Using supplied LDA matrix $use_lda_mat"
111
+ cp $use_lda_mat $dir/0.mat || exit 1;
112
+ [ ! -z "$mllt_iters" ] && \
113
+ echo "$0: Warning: using supplied LDA matrix $use_lda_mat but we will do MLLT," && \
114
+ echo " which you might not want; to disable MLLT, specify --mllt-iters ''" && \
115
+ sleep 5
116
+ fi
117
+ fi
118
+
119
+ cur_lda_iter=0
120
+
121
+ if [ $stage -le -4 ] && $train_tree; then
122
+ echo "$0: Accumulating tree stats"
123
+ $cmd JOB=1:$nj $dir/log/acc_tree.JOB.log \
124
+ acc-tree-stats $context_opts \
125
+ --ci-phones=$ciphonelist $alidir/final.mdl "$feats" \
126
+ "ark:gunzip -c $alidir/ali.JOB.gz|" $dir/JOB.treeacc || exit 1;
127
+ [ `ls $dir/*.treeacc | wc -w` -ne "$nj" ] && echo "$0: Wrong #tree-accs" && exit 1;
128
+ $cmd $dir/log/sum_tree_acc.log \
129
+ sum-tree-stats $dir/treeacc $dir/*.treeacc || exit 1;
130
+ rm $dir/*.treeacc
131
+ fi
132
+
133
+
134
+ if [ $stage -le -3 ] && $train_tree; then
135
+ echo "$0: Getting questions for tree clustering."
136
+ # preparing questions, roots file...
137
+ cluster-phones --pdf-class-list=$(($num_nonsil_states / 2)) $context_opts $dir/treeacc $lang/phones/sets.int \
138
+ $dir/questions.int 2> $dir/log/questions.log || exit 1;
139
+ cat $lang/phones/extra_questions.int >> $dir/questions.int
140
+ compile-questions $context_opts $lang/topo $dir/questions.int \
141
+ $dir/questions.qst 2>$dir/log/compile_questions.log || exit 1;
142
+
143
+ echo "$0: Building the tree"
144
+ $cmd $dir/log/build_tree.log \
145
+ build-tree $context_opts --verbose=1 --max-leaves=$numleaves \
146
+ --cluster-thresh=$cluster_thresh $dir/treeacc $lang/phones/roots.int \
147
+ $dir/questions.qst $lang/topo $dir/tree || exit 1;
148
+ fi
149
+
150
+ if [ $stage -le -2 ]; then
151
+ echo "$0: Initializing the model"
152
+ if $train_tree; then
153
+ gmm-init-model --write-occs=$dir/1.occs \
154
+ $dir/tree $dir/treeacc $lang/topo $dir/1.mdl 2> $dir/log/init_model.log || exit 1;
155
+ grep 'no stats' $dir/log/init_model.log && echo "This is a bad warning.";
156
+ rm $dir/treeacc
157
+ else
158
+ cp $alidir/tree $dir/ || exit 1;
159
+ $cmd JOB=1 $dir/log/init_model.log \
160
+ gmm-init-model-flat $dir/tree $lang/topo $dir/1.mdl \
161
+ "$feats subset-feats ark:- ark:-|" || exit 1;
162
+ fi
163
+ fi
164
+
165
+
166
+ if [ $stage -le -1 ]; then
167
+ # Convert the alignments.
168
+ echo "$0: Converting alignments from $alidir to use current tree"
169
+ $cmd JOB=1:$nj $dir/log/convert.JOB.log \
170
+ convert-ali $alidir/final.mdl $dir/1.mdl $dir/tree \
171
+ "ark:gunzip -c $alidir/ali.JOB.gz|" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
172
+ fi
173
+
174
+ if [ $stage -le 0 ] && [ "$realign_iters" != "" ]; then
175
+ echo "$0: Compiling graphs of transcripts"
176
+ $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
177
+ compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $dir/tree $dir/1.mdl $lang/L.fst \
178
+ "ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $data/split$nj/JOB/text |" \
179
+ "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
180
+ fi
181
+
182
+
183
+ x=1
184
+ while [ $x -lt $num_iters ]; do
185
+ echo Training pass $x
186
+ if echo $realign_iters | grep -w $x >/dev/null && [ $stage -le $x ]; then
187
+ echo Aligning data
188
+ mdl="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $dir/$x.mdl - |"
189
+ $cmd JOB=1:$nj $dir/log/align.$x.JOB.log \
190
+ gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam --careful=$careful "$mdl" \
191
+ "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" \
192
+ "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
193
+ fi
194
+ if echo $mllt_iters | grep -w $x >/dev/null; then
195
+ if [ $stage -le $x ]; then
196
+ echo "$0: Estimating MLLT"
197
+ $cmd JOB=1:$nj $dir/log/macc.$x.JOB.log \
198
+ ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:- \| \
199
+ weight-silence-post 0.0 $silphonelist $dir/$x.mdl ark:- ark:- \| \
200
+ gmm-acc-mllt --rand-prune=$randprune $dir/$x.mdl "$feats" ark:- $dir/$x.JOB.macc \
201
+ || exit 1;
202
+ est-mllt $dir/$x.mat.new $dir/$x.*.macc 2> $dir/log/mupdate.$x.log || exit 1;
203
+ gmm-transform-means $dir/$x.mat.new $dir/$x.mdl $dir/$x.mdl \
204
+ 2> $dir/log/transform_means.$x.log || exit 1;
205
+ compose-transforms --print-args=false $dir/$x.mat.new $dir/$cur_lda_iter.mat $dir/$x.mat || exit 1;
206
+ rm $dir/$x.*.macc
207
+ fi
208
+ feats="$splicedfeats transform-feats $dir/$x.mat ark:- ark:- |"
209
+ cur_lda_iter=$x
210
+ fi
211
+
212
+ if [ $stage -le $x ]; then
213
+ $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
214
+ gmm-acc-stats-ali $dir/$x.mdl "$feats" \
215
+ "ark,s,cs:gunzip -c $dir/ali.JOB.gz|" $dir/$x.JOB.acc || exit 1;
216
+ $cmd $dir/log/update.$x.log \
217
+ gmm-est --write-occs=$dir/$[$x+1].occs --mix-up=$numgauss --power=$power \
218
+ $dir/$x.mdl "gmm-sum-accs - $dir/$x.*.acc |" $dir/$[$x+1].mdl || exit 1;
219
+ rm $dir/$x.mdl $dir/$x.*.acc $dir/$x.occs
220
+ fi
221
+ [ $x -le $max_iter_inc ] && numgauss=$[$numgauss+$incgauss];
222
+ x=$[$x+1];
223
+ done
224
+
225
+ rm $dir/final.{mdl,mat,occs} 2>/dev/null
226
+ ln -s $x.mdl $dir/final.mdl
227
+ ln -s $x.occs $dir/final.occs
228
+ ln -s $cur_lda_iter.mat $dir/final.mat
229
+
230
+ steps/diagnostic/analyze_alignments.sh --cmd "$cmd" $lang $dir
231
+
232
+ # Summarize warning messages...
233
+ utils/summarize_warnings.pl $dir/log
234
+
235
+ steps/info/gmm_dir_info.pl $dir
236
+
237
+ echo "$0: Done training system with LDA+MLLT features in $dir"
238
+
239
+ exit 0
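With the splice options passed from train_subset_lgbeam.sh ("--left-context=3 --right-context=3"), each frame is stacked with its 3 left and 3 right neighbours before the LDA projection to dim=40. A toy numpy sketch (not part of the commit) of that splice-then-project step; edge frames are simply clamped here, and the projection matrix is a random stand-in for the estimated LDA[+MLLT] matrix in 0.mat:

import numpy as np

T, D, left, right, lda_dim = 100, 13, 3, 3, 40
rng = np.random.default_rng(0)
feats = rng.normal(size=(T, D))  # e.g. 13-dim MFCC frames after CMVN

# splice-feats: frame t becomes [x_{t-3}, ..., x_t, ..., x_{t+3}], clamped at the edges
idx = np.clip(np.arange(T)[:, None] + np.arange(-left, right + 1)[None, :], 0, T - 1)
spliced = feats[idx].reshape(T, (left + right + 1) * D)  # T x 91

# transform-feats: project the spliced frames down to lda_dim dimensions
lda = rng.normal(size=(lda_dim, spliced.shape[1]))  # stand-in for $dir/0.mat
projected = spliced @ lda.T                         # T x 40
print(spliced.shape, "->", projected.shape)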
fairseq/examples/wav2vec/unsupervised/kaldi_self_train/st/steps_gan/train_sat.sh ADDED
@@ -0,0 +1,281 @@
1
+ #!/usr/bin/env bash
2
+ # Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0.
3
+
4
+
5
+ # This does Speaker Adapted Training (SAT), i.e. train on
6
+ # fMLLR-adapted features. It can be done on top of either LDA+MLLT, or
7
+ # delta and delta-delta features. If there are no transforms supplied
8
+ # in the alignment directory, it will estimate transforms itself before
9
+ # building the tree (and in any case, it estimates transforms a number
10
+ # of times during training).
11
+
12
+
13
+ # Begin configuration section.
14
+ stage=-5
15
+ exit_stage=-100 # you can use this to require it to exit at the
16
+ # beginning of a specific stage. Not all values are
17
+ # supported.
18
+ fmllr_update_type=full
19
+ cmd=run.pl
20
+ scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
21
+ beam=10
22
+ retry_beam=40
23
+ careful=false
24
+ boost_silence=1.0 # Factor by which to boost silence likelihoods in alignment
25
+ context_opts= # e.g. set this to "--context-width 5 --central-position 2" for quinphone.
26
+ realign_iters="10 20 30";
27
+ fmllr_iters="2 4 6 12";
28
+ silence_weight=0.0 # Weight on silence in fMLLR estimation.
29
+ num_iters=35 # Number of iterations of training
30
+ max_iter_inc=25 # Last iter to increase #Gauss on.
31
+ power=0.2 # Exponent for number of gaussians according to occurrence counts
32
+ cluster_thresh=-1 # controls the final bottom-up clustering of leaves in build-tree
33
+ phone_map=
34
+ train_tree=true
35
+ tree_stats_opts=
36
+ cluster_phones_opts=
37
+ compile_questions_opts=
38
+ # End configuration section.
39
+ num_nonsil_states=3
40
+
41
+ echo "$0 $@" # Print the command line for logging
42
+
43
+ [ -f path.sh ] && . ./path.sh
44
+ . parse_options.sh || exit 1;
45
+
46
+ if [ $# != 6 ]; then
47
+ echo "Usage: steps/train_sat.sh <#leaves> <#gauss> <data> <lang> <ali-dir> <exp-dir>"
48
+ echo " e.g.: steps/train_sat.sh 2500 15000 data/train_si84 data/lang exp/tri2b_ali_si84 exp/tri3b"
49
+ echo "Main options (for others, see top of script file)"
50
+ echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
51
+ echo " --config <config-file> # config containing options"
52
+ echo " --stage <stage> # stage to do partial re-run from."
53
+ exit 1;
54
+ fi
55
+
56
+ numleaves=$1
57
+ totgauss=$2
58
+ data=$3
59
+ lang=$4
60
+ alidir=$5
61
+ dir=$6
62
+
63
+ for f in $data/feats.scp $lang/phones.txt $alidir/final.mdl $alidir/ali.1.gz; do
64
+ [ ! -f $f ] && echo "train_sat.sh: no such file $f" && exit 1;
65
+ done
66
+
67
+ numgauss=$numleaves
68
+ incgauss=$[($totgauss-$numgauss)/$max_iter_inc] # per-iter #gauss increment
69
+ oov=`cat $lang/oov.int`
70
+ nj=`cat $alidir/num_jobs` || exit 1;
71
+ silphonelist=`cat $lang/phones/silence.csl`
72
+ ciphonelist=`cat $lang/phones/context_indep.csl` || exit 1;
73
+ sdata=$data/split$nj;
74
+ splice_opts=`cat $alidir/splice_opts 2>/dev/null` # frame-splicing options.
75
+ cmvn_opts=`cat $alidir/cmvn_opts 2>/dev/null`
76
+ delta_opts=`cat $alidir/delta_opts 2>/dev/null`
77
+ phone_map_opt=
78
+ [ ! -z "$phone_map" ] && phone_map_opt="--phone-map='$phone_map'"
79
+
80
+ mkdir -p $dir/log
81
+ cp $alidir/splice_opts $dir 2>/dev/null # frame-splicing options.
82
+ cp $alidir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option.
83
+ cp $alidir/delta_opts $dir 2>/dev/null # delta option.
84
+
85
+ utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
86
+ cp $lang/phones.txt $dir || exit 1;
87
+
88
+ echo $nj >$dir/num_jobs
89
+ [[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
90
+
91
+ # Set up features.
92
+
93
+ if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
94
+ echo "$0: feature type is $feat_type"
95
+
96
+ ## Set up speaker-independent features.
97
+ case $feat_type in
98
+ delta) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";;
99
+ lda) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
100
+ cp $alidir/final.mat $dir
101
+ cp $alidir/full.mat $dir 2>/dev/null
102
+ ;;
103
+ *) echo "$0: invalid feature type $feat_type" && exit 1;
104
+ esac
105
+
106
+ ## Get initial fMLLR transforms (possibly from alignment dir)
107
+ if [ -f $alidir/trans.1 ]; then
108
+ echo "$0: Using transforms from $alidir"
109
+ feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$alidir/trans.JOB ark:- ark:- |"
110
+ cur_trans_dir=$alidir
111
+ else
112
+ if [ $stage -le -5 ]; then
113
+ echo "$0: obtaining initial fMLLR transforms since not present in $alidir"
114
+ # The next line is necessary because of $silphonelist otherwise being incorrect; would require
115
+ # old $lang dir which would require another option. Not needed anyway.
116
+ [ ! -z "$phone_map" ] && \
117
+ echo "$0: error: you must provide transforms if you use the --phone-map option." && exit 1;
118
+ $cmd JOB=1:$nj $dir/log/fmllr.0.JOB.log \
119
+ ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \
120
+ weight-silence-post $silence_weight $silphonelist $alidir/final.mdl ark:- ark:- \| \
121
+ gmm-est-fmllr --fmllr-update-type=$fmllr_update_type \
122
+ --spk2utt=ark:$sdata/JOB/spk2utt $alidir/final.mdl "$sifeats" \
123
+ ark:- ark:$dir/trans.JOB || exit 1;
124
+ fi
125
+ feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$dir/trans.JOB ark:- ark:- |"
126
+ cur_trans_dir=$dir
127
+ fi
128
+
129
+ if [ $stage -le -4 ] && $train_tree; then
130
+ # Get tree stats.
131
+ echo "$0: Accumulating tree stats"
132
+ $cmd JOB=1:$nj $dir/log/acc_tree.JOB.log \
133
+ acc-tree-stats $context_opts $tree_stats_opts $phone_map_opt --ci-phones=$ciphonelist $alidir/final.mdl "$feats" \
134
+ "ark:gunzip -c $alidir/ali.JOB.gz|" $dir/JOB.treeacc || exit 1;
135
+ [ "`ls $dir/*.treeacc | wc -w`" -ne "$nj" ] && echo "$0: Wrong #tree-accs" && exit 1;
136
+ $cmd $dir/log/sum_tree_acc.log \
137
+ sum-tree-stats $dir/treeacc $dir/*.treeacc || exit 1;
138
+ rm $dir/*.treeacc
139
+ fi
140
+
141
+ if [ $stage -le -3 ] && $train_tree; then
142
+ echo "$0: Getting questions for tree clustering."
143
+ # preparing questions, roots file...
144
+ cluster-phones --pdf-class-list=$(($num_nonsil_states / 2)) \
145
+ $cluster_phones_opts $context_opts \
146
+ $dir/treeacc $lang/phones/sets.int $dir/questions.int 2>$dir/log/questions.log || exit 1;
147
+ cat $lang/phones/extra_questions.int >> $dir/questions.int
148
+ compile-questions $context_opts $compile_questions_opts $lang/topo $dir/questions.int $dir/questions.qst 2>$dir/log/compile_questions.log || exit 1;
149
+
150
+ echo "$0: Building the tree"
151
+ $cmd $dir/log/build_tree.log \
152
+ build-tree $context_opts --verbose=1 --max-leaves=$numleaves \
153
+ --cluster-thresh=$cluster_thresh $dir/treeacc $lang/phones/roots.int \
154
+ $dir/questions.qst $lang/topo $dir/tree || exit 1;
155
+ fi
156
+
157
+ if [ $stage -le -2 ]; then
158
+ echo "$0: Initializing the model"
159
+ if $train_tree; then
160
+ gmm-init-model --write-occs=$dir/1.occs \
161
+ $dir/tree $dir/treeacc $lang/topo $dir/1.mdl 2> $dir/log/init_model.log || exit 1;
162
+ grep 'no stats' $dir/log/init_model.log && echo "This is a bad warning.";
163
+ rm $dir/treeacc
164
+ else
165
+ cp $alidir/tree $dir/ || exit 1;
166
+ $cmd JOB=1 $dir/log/init_model.log \
167
+ gmm-init-model-flat $dir/tree $lang/topo $dir/1.mdl \
168
+ "$feats subset-feats ark:- ark:-|" || exit 1;
169
+ fi
170
+ fi
171
+
172
+ if [ $stage -le -1 ]; then
173
+ # Convert the alignments.
174
+ echo "$0: Converting alignments from $alidir to use current tree"
175
+ $cmd JOB=1:$nj $dir/log/convert.JOB.log \
176
+ convert-ali $phone_map_opt $alidir/final.mdl $dir/1.mdl $dir/tree \
177
+ "ark:gunzip -c $alidir/ali.JOB.gz|" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
178
+ fi
179
+
180
+ [ "$exit_stage" -eq 0 ] && echo "$0: Exiting early: --exit-stage $exit_stage" && exit 0;
181
+
182
+ if [ $stage -le 0 ] && [ "$realign_iters" != "" ]; then
183
+ echo "$0: Compiling graphs of transcripts"
184
+ $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
185
+ compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $dir/tree $dir/1.mdl $lang/L.fst \
186
+ "ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $sdata/JOB/text |" \
187
+ "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
188
+ fi
189
+
190
+ x=1
191
+ while [ $x -lt $num_iters ]; do
192
+ echo Pass $x
193
+ if echo $realign_iters | grep -w $x >/dev/null && [ $stage -le $x ]; then
194
+ echo Aligning data
195
+ mdl="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $dir/$x.mdl - |"
196
+ $cmd JOB=1:$nj $dir/log/align.$x.JOB.log \
197
+ gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam --careful=$careful "$mdl" \
198
+ "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" \
199
+ "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
200
+ fi
201
+
202
+ if echo $fmllr_iters | grep -w $x >/dev/null; then
203
+ if [ $stage -le $x ]; then
204
+ echo Estimating fMLLR transforms
205
+ # We estimate a transform that's additional to the previous transform;
206
+ # we'll compose them.
207
+ $cmd JOB=1:$nj $dir/log/fmllr.$x.JOB.log \
208
+ ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:- \| \
209
+ weight-silence-post $silence_weight $silphonelist $dir/$x.mdl ark:- ark:- \| \
210
+ gmm-est-fmllr --fmllr-update-type=$fmllr_update_type \
211
+ --spk2utt=ark:$sdata/JOB/spk2utt $dir/$x.mdl \
212
+ "$feats" ark:- ark:$dir/tmp_trans.JOB || exit 1;
213
+ for n in `seq $nj`; do
214
+ ! ( compose-transforms --b-is-affine=true \
215
+ ark:$dir/tmp_trans.$n ark:$cur_trans_dir/trans.$n ark:$dir/composed_trans.$n \
216
+ && mv $dir/composed_trans.$n $dir/trans.$n && \
217
+ rm $dir/tmp_trans.$n ) 2>$dir/log/compose_transforms.$x.log \
218
+ && echo "$0: Error composing transforms" && exit 1;
219
+ done
220
+ fi
221
+ feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans.JOB ark:- ark:- |"
222
+ cur_trans_dir=$dir
223
+ fi
224
+
225
+ if [ $stage -le $x ]; then
226
+ $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
227
+ gmm-acc-stats-ali $dir/$x.mdl "$feats" \
228
+ "ark,s,cs:gunzip -c $dir/ali.JOB.gz|" $dir/$x.JOB.acc || exit 1;
229
+ [ `ls $dir/$x.*.acc | wc -w` -ne "$nj" ] && echo "$0: Wrong #accs" && exit 1;
230
+ $cmd $dir/log/update.$x.log \
231
+ gmm-est --power=$power --write-occs=$dir/$[$x+1].occs --mix-up=$numgauss $dir/$x.mdl \
232
+ "gmm-sum-accs - $dir/$x.*.acc |" $dir/$[$x+1].mdl || exit 1;
233
+ rm $dir/$x.mdl $dir/$x.*.acc
234
+ rm $dir/$x.occs
235
+ fi
236
+ [ $x -le $max_iter_inc ] && numgauss=$[$numgauss+$incgauss];
237
+ x=$[$x+1];
238
+ done
239
+
240
+
241
+ if [ $stage -le $x ]; then
242
+ # Accumulate stats for "alignment model"-- this model is
243
+ # computed with the speaker-independent features, but matches Gaussian-for-Gaussian
244
+ # with the final speaker-adapted model.
245
+ $cmd JOB=1:$nj $dir/log/acc_alimdl.JOB.log \
246
+ ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:- \| \
247
+ gmm-acc-stats-twofeats $dir/$x.mdl "$feats" "$sifeats" \
248
+ ark,s,cs:- $dir/$x.JOB.acc || exit 1;
249
+ [ `ls $dir/$x.*.acc | wc -w` -ne "$nj" ] && echo "$0: Wrong #accs" && exit 1;
250
+ # Update model.
251
+ $cmd $dir/log/est_alimdl.log \
252
+ gmm-est --power=$power --remove-low-count-gaussians=false $dir/$x.mdl \
253
+ "gmm-sum-accs - $dir/$x.*.acc|" $dir/$x.alimdl || exit 1;
254
+ rm $dir/$x.*.acc
255
+ fi
256
+
257
+ rm $dir/final.{mdl,alimdl,occs} 2>/dev/null
258
+ ln -s $x.mdl $dir/final.mdl
259
+ ln -s $x.occs $dir/final.occs
260
+ ln -s $x.alimdl $dir/final.alimdl
261
+
262
+
263
+ steps/diagnostic/analyze_alignments.sh --cmd "$cmd" $lang $dir
264
+
265
+ utils/summarize_warnings.pl $dir/log
266
+ (
267
+ echo "$0: Likelihood evolution:"
268
+ for x in `seq $[$num_iters-1]`; do
269
+ tail -n 30 $dir/log/acc.$x.*.log | awk '/Overall avg like/{l += $(NF-3)*$(NF-1); t += $(NF-1); }
270
+ /Overall average logdet/{d += $(NF-3)*$(NF-1); t2 += $(NF-1);}
271
+ END{ d /= t2; l /= t; printf("%s ", d+l); } '
272
+ done
273
+ echo
274
+ ) | tee $dir/log/summary.log
275
+
276
+
277
+ steps/info/gmm_dir_info.pl $dir
278
+
279
+ echo "$0: done training SAT system in $dir"
280
+
281
+ exit 0
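On each fMLLR pass, train_sat.sh estimates a new per-speaker transform on top of the already-transformed features and then composes it with the previous one (compose-transforms --b-is-affine=true), so that a single affine transform per speaker can be applied to the speaker-independent features. A toy numpy sketch (not part of the commit) of that composition; real Kaldi fMLLR transforms are stored as dim x (dim+1) affine matrices, split here into A and b for clarity:

import numpy as np

dim = 3  # toy feature dimension
rng = np.random.default_rng(0)
A1, b1 = rng.normal(size=(dim, dim)), rng.normal(size=dim)  # previous transform
A2, b2 = rng.normal(size=(dim, dim)), rng.normal(size=dim)  # newly estimated on top

# composed transform: x -> A2 (A1 x + b1) + b2 = (A2 A1) x + (A2 b1 + b2)
A = A2 @ A1
b = A2 @ b1 + b2

x = rng.normal(size=dim)
stacked = A2 @ (A1 @ x + b1) + b2  # apply the old transform, then the new one
composed = A @ x + b               # apply the single composed transform
assert np.allclose(stacked, composed)
print("composed transform matches stacked application")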
fairseq/examples/wav2vec/unsupervised/models/__init__.py ADDED
@@ -0,0 +1,11 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from .wav2vec_u import Wav2vec_U
7
+
8
+
9
+ __all__ = [
10
+ "Wav2vec_U",
11
+ ]
fairseq/examples/wav2vec/unsupervised/models/wav2vec_u.py ADDED
@@ -0,0 +1,687 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from dataclasses import dataclass
7
+ from enum import Enum, auto
8
+ import math
9
+ import numpy as np
10
+ from typing import Tuple, List, Optional, Dict
11
+
12
+ import torch
13
+ import torch.nn as nn
14
+ import torch.nn.functional as F
15
+ from torch import autograd
16
+
17
+ from fairseq import checkpoint_utils, utils
18
+ from fairseq.dataclass import FairseqDataclass
19
+ from fairseq.models import BaseFairseqModel, register_model
20
+ from fairseq.modules import (
21
+ SamePad,
22
+ TransposeLast,
23
+ )
24
+
25
+
26
+ class SegmentationType(Enum):
27
+ NONE = auto()
28
+ RANDOM = auto()
29
+ UNIFORM_RANDOM = auto()
30
+ UNIFORM_RANDOM_JOIN = auto()
31
+ JOIN = auto()
32
+
33
+
34
+ @dataclass
35
+ class SegmentationConfig(FairseqDataclass):
36
+ type: SegmentationType = SegmentationType.NONE
37
+ subsample_rate: float = 0.25
38
+ mean_pool: bool = True
39
+ mean_pool_join: bool = False
40
+ remove_zeros: bool = False
41
+
42
+
43
+ @dataclass
44
+ class Wav2vec_UConfig(FairseqDataclass):
45
+ discriminator_kernel: int = 3
46
+ discriminator_dilation: int = 1
47
+ discriminator_dim: int = 256
48
+ discriminator_causal: bool = True
49
+ discriminator_linear_emb: bool = False
50
+ discriminator_depth: int = 1
51
+ discriminator_max_pool: bool = False
52
+ discriminator_act_after_linear: bool = False
53
+ discriminator_dropout: float = 0.0
54
+ discriminator_spectral_norm: bool = False
55
+ discriminator_weight_norm: bool = False
56
+
57
+ generator_kernel: int = 4
58
+ generator_dilation: int = 1
59
+ generator_stride: int = 1
60
+ generator_pad: int = -1
61
+ generator_bias: bool = False
62
+ generator_dropout: float = 0.0
63
+ generator_batch_norm: int = 0
64
+ generator_residual: bool = False
65
+
66
+ blank_weight: float = 0
67
+ blank_mode: str = "add"
68
+ blank_is_sil: bool = False
69
+ no_softmax: bool = False
70
+
71
+ smoothness_weight: float = 0.0
72
+ smoothing: float = 0.0
73
+ smoothing_one_sided: bool = False
74
+ gradient_penalty: float = 0.0
75
+ probabilistic_grad_penalty_slicing: bool = False
76
+ code_penalty: float = 0.0
77
+ mmi_weight: float = 0.0
78
+ target_dim: int = 64
79
+ target_downsample_rate: int = 2
80
+ gumbel: bool = False
81
+ hard_gumbel: bool = True
82
+ temp: Tuple[float, float, float] = (2, 0.1, 0.99995)
83
+ input_dim: int = 128
84
+
85
+ segmentation: SegmentationConfig = SegmentationConfig()
86
+
87
+
88
+ class Segmenter(nn.Module):
89
+ cfg: SegmentationConfig
90
+
91
+ def __init__(self, cfg: SegmentationConfig):
92
+ super().__init__()
93
+ self.cfg = cfg
94
+ self.subsample_rate = cfg.subsample_rate
95
+
96
+ def pre_segment(self, dense_x, dense_padding_mask):
97
+ return dense_x, dense_padding_mask
98
+
99
+ def logit_segment(self, logits, padding_mask):
100
+ return logits, padding_mask
101
+
102
+
103
+ class RandomSegmenter(Segmenter):
104
+ def pre_segment(self, dense_x, dense_padding_mask):
105
+ target_num = math.ceil(dense_x.size(1) * self.subsample_rate)
106
+ ones = torch.ones(dense_x.shape[:-1], device=dense_x.device)
107
+ indices, _ = ones.multinomial(target_num).sort(dim=-1)
108
+ indices_ld = indices.unsqueeze(-1).expand(-1, -1, dense_x.size(-1))
109
+ dense_x = dense_x.gather(1, indices_ld)
110
+ dense_padding_mask = dense_padding_mask.gather(1, index=indices)
111
+ return dense_x, dense_padding_mask
112
+
113
+
114
+ class UniformRandomSegmenter(Segmenter):
115
+ def pre_segment(self, dense_x, dense_padding_mask):
116
+ bsz, tsz, fsz = dense_x.shape
117
+
118
+ target_num = math.ceil(tsz * self.subsample_rate)
119
+
120
+ rem = tsz % target_num
121
+
122
+ if rem > 0:
123
+ dense_x = F.pad(dense_x, [0, 0, 0, target_num - rem])
124
+ dense_padding_mask = F.pad(
125
+ dense_padding_mask, [0, target_num - rem], value=True
126
+ )
127
+
128
+ dense_x = dense_x.view(bsz, target_num, -1, fsz)
129
+ dense_padding_mask = dense_padding_mask.view(bsz, target_num, -1)
130
+
131
+ if self.cfg.mean_pool:
132
+ dense_x = dense_x.mean(dim=-2)
133
+ dense_padding_mask = dense_padding_mask.all(dim=-1)
134
+ else:
135
+ ones = torch.ones((bsz, dense_x.size(2)), device=dense_x.device)
136
+ indices = ones.multinomial(1)
137
+ indices = indices.unsqueeze(-1).expand(-1, target_num, -1)
138
+ indices_ld = indices.unsqueeze(-1).expand(-1, -1, -1, fsz)
139
+ dense_x = dense_x.gather(2, indices_ld).reshape(bsz, -1, fsz)
140
+ dense_padding_mask = dense_padding_mask.gather(2, index=indices).reshape(
141
+ bsz, -1
142
+ )
143
+ return dense_x, dense_padding_mask
144
+
145
+
146
+ class JoinSegmenter(Segmenter):
147
+ def logit_segment(self, logits, padding_mask):
148
+ preds = logits.argmax(dim=-1)
149
+
150
+ if padding_mask.any():
151
+ preds[padding_mask] = -1 # mark pad
152
+ uniques = []
153
+
154
+ bsz, tsz, csz = logits.shape
155
+
156
+ for p in preds:
157
+ uniques.append(
158
+ p.cpu().unique_consecutive(return_inverse=True, return_counts=True)
159
+ )
160
+
161
+ new_tsz = max(u[0].numel() for u in uniques)
162
+ new_logits = logits.new_zeros(bsz, new_tsz, csz)
163
+ new_pad = padding_mask.new_zeros(bsz, new_tsz)
164
+
165
+ for b in range(bsz):
166
+ u, idx, c = uniques[b]
167
+ keep = u != -1
168
+
169
+ if self.cfg.remove_zeros:
170
+ keep.logical_and_(u != 0)
171
+
172
+ if self.training and not self.cfg.mean_pool_join:
173
+ u[0] = 0
174
+ u[1:] = c.cumsum(0)[:-1]
175
+ m = c > 1
176
+ r = torch.rand(m.sum())
177
+ o = (c[m] * r).long()
178
+ u[m] += o
179
+ new_logits[b, : u.numel()] = logits[b, u]
180
+ else:
181
+ new_logits[b].index_add_(
182
+ dim=0, index=idx.to(new_logits.device), source=logits[b]
183
+ )
184
+ new_logits[b, : c.numel()] /= c.unsqueeze(-1).to(new_logits.device)
185
+
186
+ new_sz = keep.sum()
187
+ if not keep.all():
188
+ kept_logits = new_logits[b, : c.numel()][keep]
189
+ new_logits[b, :new_sz] = kept_logits
190
+
191
+ if new_sz < new_tsz:
192
+ pad = new_tsz - new_sz
193
+ new_logits[b, -pad:] = 0
194
+ new_pad[b, -pad:] = True
195
+
196
+ return new_logits, new_pad
197
+
198
+
199
+ class UniformRandomJoinSegmenter(UniformRandomSegmenter, JoinSegmenter):
200
+ pass
201
+
202
+
203
+ SEGMENT_FACTORY = {
204
+ SegmentationType.NONE: Segmenter,
205
+ SegmentationType.RANDOM: RandomSegmenter,
206
+ SegmentationType.UNIFORM_RANDOM: UniformRandomSegmenter,
207
+ SegmentationType.UNIFORM_RANDOM_JOIN: UniformRandomJoinSegmenter,
208
+ SegmentationType.JOIN: JoinSegmenter,
209
+ }
210
+
211
+
212
+ class Discriminator(nn.Module):
213
+ def __init__(self, dim, cfg: Wav2vec_UConfig):
214
+ super().__init__()
215
+
216
+ inner_dim = cfg.discriminator_dim
217
+ kernel = cfg.discriminator_kernel
218
+ dilation = cfg.discriminator_dilation
219
+ self.max_pool = cfg.discriminator_max_pool
220
+
221
+ if cfg.discriminator_causal:
222
+ padding = kernel - 1
223
+ else:
224
+ padding = kernel // 2
225
+
226
+ def make_conv(in_d, out_d, k, p=0, has_dilation=True):
227
+ conv = nn.Conv1d(
228
+ in_d,
229
+ out_d,
230
+ kernel_size=k,
231
+ padding=p,
232
+ dilation=dilation if has_dilation else 1,
233
+ )
234
+ if cfg.discriminator_spectral_norm:
235
+ conv = nn.utils.spectral_norm(conv)
236
+ elif cfg.discriminator_weight_norm:
237
+ conv = nn.utils.weight_norm(conv)
238
+ return conv
239
+
240
+ inner_net = [
241
+ nn.Sequential(
242
+ make_conv(inner_dim, inner_dim, kernel, padding),
243
+ SamePad(kernel_size=kernel, causal=cfg.discriminator_causal),
244
+ nn.Dropout(cfg.discriminator_dropout),
245
+ nn.GELU(),
246
+ )
247
+ for _ in range(cfg.discriminator_depth - 1)
248
+ ] + [
249
+ make_conv(inner_dim, 1, kernel, padding, has_dilation=False),
250
+ SamePad(kernel_size=kernel, causal=cfg.discriminator_causal),
251
+ ]
252
+
253
+ if cfg.discriminator_linear_emb:
254
+ emb_net = [make_conv(dim, inner_dim, 1)]
255
+ else:
256
+ emb_net = [
257
+ make_conv(dim, inner_dim, kernel, padding),
258
+ SamePad(kernel_size=kernel, causal=cfg.discriminator_causal),
259
+ ]
260
+
261
+ if cfg.discriminator_act_after_linear:
262
+ emb_net.append(nn.GELU())
263
+
264
+ self.net = nn.Sequential(
265
+ *emb_net,
266
+ nn.Dropout(cfg.discriminator_dropout),
267
+ *inner_net,
268
+ )
269
+
270
+ def forward(self, x, padding_mask):
271
+ x = x.transpose(1, 2) # BTC -> BCT
272
+ x = self.net(x)
273
+ x = x.transpose(1, 2)
274
+ x_sz = x.size(1)
275
+ if padding_mask is not None and padding_mask.any() and padding_mask.dim() > 1:
276
+ padding_mask = padding_mask[:, : x.size(1)]
277
+ x[padding_mask] = float("-inf") if self.max_pool else 0
278
+ x_sz = x_sz - padding_mask.sum(dim=-1)
279
+ x = x.squeeze(-1)
280
+ if self.max_pool:
281
+ x, _ = x.max(dim=-1)
282
+ else:
283
+ x = x.sum(dim=-1)
284
+ x = x / x_sz
285
+ return x
286
+
287
+
288
+ class Generator(nn.Module):
289
+ def __init__(self, input_dim, output_dim, cfg: Wav2vec_UConfig):
290
+ super().__init__()
291
+
292
+ self.cfg = cfg
293
+ self.output_dim = output_dim
294
+ self.stride = cfg.generator_stride
295
+ self.dropout = nn.Dropout(cfg.generator_dropout)
296
+ self.batch_norm = cfg.generator_batch_norm != 0
297
+ self.residual = cfg.generator_residual
298
+
299
+ padding = (
300
+ cfg.generator_kernel // 2 if cfg.generator_pad < 0 else cfg.generator_pad
301
+ )
302
+ self.proj = nn.Sequential(
303
+ TransposeLast(),
304
+ nn.Conv1d(
305
+ input_dim,
306
+ output_dim,
307
+ kernel_size=cfg.generator_kernel,
308
+ stride=cfg.generator_stride,
309
+ dilation=cfg.generator_dilation,
310
+ padding=padding,
311
+ bias=cfg.generator_bias,
312
+ ),
313
+ TransposeLast(),
314
+ )
315
+
316
+ if self.batch_norm:
317
+ self.bn = nn.BatchNorm1d(input_dim)
318
+ self.bn.weight.data.fill_(cfg.generator_batch_norm)
319
+ if self.residual:
320
+ self.in_proj = nn.Linear(input_dim, input_dim)
321
+
322
+ def forward(self, dense_x, tokens, dense_padding_mask):
323
+ result = {}
324
+
325
+ if self.batch_norm:
326
+ dense_x = self.bn_padded_data(dense_x, dense_padding_mask)
327
+ if self.residual:
328
+ inter_x = self.in_proj(self.dropout(dense_x))
329
+ dense_x = dense_x + inter_x
330
+ result["inter_x"] = inter_x
331
+
332
+ dense_x = self.dropout(dense_x)
333
+
334
+ dense_x = self.proj(dense_x)
335
+ if self.stride > 1:
336
+ dense_padding_mask = dense_padding_mask[:, :: self.stride]
337
+
338
+ if dense_padding_mask.size(1) != dense_x.size(1):
339
+ new_padding = dense_padding_mask.new_zeros(dense_x.shape[:-1])
340
+ diff = new_padding.size(1) - dense_padding_mask.size(1)
341
+
342
+ if diff > 0:
343
+ new_padding[:, diff:] = dense_padding_mask
344
+ else:
345
+ assert diff < 0
346
+ new_padding = dense_padding_mask[:, :diff]
347
+
348
+ dense_padding_mask = new_padding
349
+
350
+ token_x = None
351
+ if tokens is not None:
352
+ token_x = dense_x.new_zeros(tokens.numel(), self.output_dim)
353
+ token_x.scatter_(1, tokens.view(-1, 1).long(), 1)
354
+ token_x = token_x.view(tokens.shape + (self.output_dim,))
355
+
356
+ result["dense_x"] = dense_x
357
+ result["token_x"] = token_x
358
+ result["dense_padding_mask"] = dense_padding_mask
359
+
360
+ return result
361
+
362
+ def bn_padded_data(self, feature, padding_mask):
363
+ normed_feature = feature.clone()
364
+ normed_feature[~padding_mask] = self.bn(
365
+ feature[~padding_mask].unsqueeze(-1)
366
+ ).squeeze(-1)
367
+ return normed_feature
368
+
369
+
370
+ @register_model("wav2vec_u", dataclass=Wav2vec_UConfig)
371
+ class Wav2vec_U(BaseFairseqModel):
372
+ def calc_gradient_penalty(self, real_data, fake_data):
373
+
374
+ b_size = min(real_data.size(0), fake_data.size(0))
375
+ t_size = min(real_data.size(1), fake_data.size(1))
376
+
377
+ if self.cfg.probabilistic_grad_penalty_slicing:
378
+
379
+ def get_slice(data, dim, target_size):
380
+
381
+ size = data.size(dim)
382
+ diff = size - target_size
383
+ if diff <= 0:
384
+ return data
385
+
386
+ start = np.random.randint(0, diff + 1)
387
+ return data.narrow(dim=dim, start=start, length=target_size)
388
+
389
+ real_data = get_slice(real_data, 0, b_size)
390
+ real_data = get_slice(real_data, 1, t_size)
391
+ fake_data = get_slice(fake_data, 0, b_size)
392
+ fake_data = get_slice(fake_data, 1, t_size)
393
+
394
+ else:
395
+ real_data = real_data[:b_size, :t_size]
396
+ fake_data = fake_data[:b_size, :t_size]
397
+
398
+ alpha = torch.rand(real_data.size(0), 1, 1)
399
+ alpha = alpha.expand(real_data.size())
400
+ alpha = alpha.to(real_data.device)
401
+
402
+ interpolates = alpha * real_data + ((1 - alpha) * fake_data)
403
+
404
+ disc_interpolates = self.discriminator(interpolates, None)
405
+
406
+ gradients = autograd.grad(
407
+ outputs=disc_interpolates,
408
+ inputs=interpolates,
409
+ grad_outputs=torch.ones(disc_interpolates.size(), device=real_data.device),
410
+ create_graph=True,
411
+ retain_graph=True,
412
+ only_inputs=True,
413
+ )[0]
414
+
415
+ gradient_penalty = (gradients.norm(2, dim=1) - 1) ** 2
416
+ return gradient_penalty
417
+
418
+ def set_num_updates(self, num_updates):
419
+ super().set_num_updates(num_updates)
420
+ self.update_num = num_updates
421
+ self.curr_temp = max(
422
+ self.max_temp * self.temp_decay ** num_updates, self.min_temp
423
+ )
424
+
425
+ def discrim_step(self, num_updates):
426
+ return num_updates % 2 == 1
427
+
428
+ def get_groups_for_update(self, num_updates):
429
+ return "discriminator" if self.discrim_step(num_updates) else "generator"
430
+
431
+ def __init__(self, cfg: Wav2vec_UConfig, target_dict):
432
+ super().__init__()
433
+
434
+ self.cfg = cfg
435
+ self.zero_index = target_dict.index("<SIL>") if "<SIL>" in target_dict else 0
436
+ self.smoothness_weight = cfg.smoothness_weight
437
+
438
+ output_size = len(target_dict)
439
+ self.pad = target_dict.pad()
440
+ self.eos = target_dict.eos()
441
+ self.smoothing = cfg.smoothing
442
+ self.smoothing_one_sided = cfg.smoothing_one_sided
443
+ self.no_softmax = cfg.no_softmax
444
+ self.gumbel = cfg.gumbel
445
+ self.hard_gumbel = cfg.hard_gumbel
446
+ self.last_acc = None
447
+
448
+ self.gradient_penalty = cfg.gradient_penalty
449
+ self.code_penalty = cfg.code_penalty
450
+ self.mmi_weight = cfg.mmi_weight
451
+ self.blank_weight = cfg.blank_weight
452
+ self.blank_mode = cfg.blank_mode
453
+ self.blank_index = target_dict.index("<SIL>") if cfg.blank_is_sil else 0
454
+ assert self.blank_index != target_dict.unk()
455
+
456
+ self.discriminator = Discriminator(output_size, cfg)
457
+ for p in self.discriminator.parameters():
458
+ p.param_group = "discriminator"
459
+
460
+ self.pca_A = self.pca_b = None
461
+ d = cfg.input_dim
462
+
463
+ self.segmenter = SEGMENT_FACTORY[cfg.segmentation.type](cfg.segmentation)
464
+
465
+ self.generator = Generator(d, output_size, cfg)
466
+
467
+ for p in self.generator.parameters():
468
+ p.param_group = "generator"
469
+
470
+ for p in self.segmenter.parameters():
471
+ p.param_group = "generator"
472
+
473
+ self.max_temp, self.min_temp, self.temp_decay = cfg.temp
474
+ self.curr_temp = self.max_temp
475
+ self.update_num = 0
476
+
477
+ if self.mmi_weight > 0:
478
+ self.target_downsample_rate = cfg.target_downsample_rate
479
+ self.decoder = nn.Linear(d, cfg.target_dim)
480
+ for p in self.decoder.parameters():
481
+ p.param_group = "generator"
482
+
483
+ @classmethod
484
+ def build_model(cls, cfg, task):
485
+ return cls(cfg, task.target_dictionary)
486
+
487
+ def get_logits(
488
+ self,
489
+ net_output: Optional[Dict[str, List[Optional[torch.Tensor]]]],
490
+ normalize: bool = False,
491
+ ):
492
+ logits = net_output["logits"]
493
+
494
+ if self.blank_weight != 0:
495
+ if self.blank_mode == "add":
496
+ logits[..., self.blank_index] += self.blank_weight
497
+ elif self.blank_mode == "set":
498
+ logits[..., self.blank_index] = self.blank_weight
499
+ else:
500
+ raise Exception(f"invalid blank mode {self.blank_mode}")
501
+
502
+ padding = net_output["padding_mask"]
503
+ if padding.any():
504
+ logits[padding] = float("-inf")
505
+ logits[..., self.blank_index][padding] = float("inf")
506
+
507
+ if normalize:
508
+ logits = utils.log_softmax(logits.float(), dim=-1)
509
+
510
+ return logits.transpose(0, 1)
511
+
512
+ def get_normalized_probs(
513
+ self,
514
+ net_output: Tuple[
515
+ torch.Tensor, Optional[Dict[str, List[Optional[torch.Tensor]]]]
516
+ ],
517
+ log_probs: bool,
518
+ sample: Optional[Dict[str, torch.Tensor]] = None,
519
+ ):
520
+ logits = self.get_logits(net_output)
521
+
522
+ probs = super().get_normalized_probs(logits, log_probs, sample)
523
+ # BTC -> TBC for ctc
524
+ probs = probs.transpose(0, 1)
525
+ return probs
526
+
527
+ def normalize(self, dense_x):
528
+
529
+ bsz, tsz, csz = dense_x.shape
530
+
531
+ if dense_x.numel() == 0:
532
+ raise Exception(dense_x.shape)
533
+ _, k = dense_x.max(-1)
534
+ hard_x = (
535
+ dense_x.new_zeros(bsz * tsz, csz)
536
+ .scatter_(-1, k.view(-1, 1), 1.0)
537
+ .view(-1, csz)
538
+ )
539
+ hard_probs = torch.mean(hard_x.float(), dim=0)
540
+ code_perplexity = torch.exp(
541
+ -torch.sum(hard_probs * torch.log(hard_probs + 1e-7), dim=-1)
542
+ )
543
+
544
+ avg_probs = torch.softmax(dense_x.reshape(-1, csz).float(), dim=-1).mean(dim=0)
545
+ prob_perplexity = torch.exp(
546
+ -torch.sum(avg_probs * torch.log(avg_probs + 1e-7), dim=-1)
547
+ )
548
+
549
+ if not self.no_softmax:
550
+ if self.training and self.gumbel:
551
+ dense_x = F.gumbel_softmax(
552
+ dense_x.float(), tau=self.curr_temp, hard=self.hard_gumbel
553
+ ).type_as(dense_x)
554
+ else:
555
+ dense_x = dense_x.softmax(-1)
556
+
557
+ return dense_x, code_perplexity, prob_perplexity
558
+
559
+ def forward(
560
+ self,
561
+ features,
562
+ padding_mask,
563
+ random_label=None,
564
+ dense_x_only=False,
565
+ segment=True,
566
+ aux_target=None,
567
+ ):
568
+ if segment:
569
+ features, padding_mask = self.segmenter.pre_segment(features, padding_mask)
570
+
571
+ orig_size = features.size(0) * features.size(1) - padding_mask.sum()
572
+
573
+ gen_result = self.generator(features, random_label, padding_mask)
574
+
575
+ orig_dense_x, token_x = gen_result["dense_x"], gen_result["token_x"]
576
+ orig_dense_padding_mask = gen_result["dense_padding_mask"]
577
+
578
+ if segment:
579
+ dense_x, dense_padding_mask = self.segmenter.logit_segment(
580
+ orig_dense_x, orig_dense_padding_mask
581
+ )
582
+ else:
583
+ dense_x = orig_dense_x
584
+ dense_padding_mask = orig_dense_padding_mask
585
+
586
+ dense_logits = dense_x
587
+ prob_perplexity = None
588
+ code_perplexity = None
589
+
590
+ if not (self.no_softmax and dense_x_only):
591
+ dense_x, code_perplexity, prob_perplexity = self.normalize(dense_logits)
592
+
593
+ if dense_x_only or self.discriminator is None:
594
+ return {
595
+ "logits": dense_x,
596
+ "padding_mask": dense_padding_mask,
597
+ }
598
+
599
+ token_padding_mask = random_label == self.pad
600
+
601
+ dense_y = self.discriminator(dense_x, dense_padding_mask)
602
+ token_y = self.discriminator(token_x, token_padding_mask)
603
+
604
+ sample_size = features.size(0)
605
+
606
+ d_step = self.discrim_step(self.update_num)
607
+
608
+ fake_smooth = self.smoothing
609
+ real_smooth = self.smoothing
610
+ if self.smoothing_one_sided:
611
+ fake_smooth = 0
612
+
613
+ zero_loss = None
614
+ smoothness_loss = None
615
+ code_pen = None
616
+ mmi_loss = None
617
+
618
+ if d_step:
619
+ loss_dense = F.binary_cross_entropy_with_logits(
620
+ dense_y,
621
+ dense_y.new_ones(dense_y.shape) - fake_smooth,
622
+ reduction="sum",
623
+ )
624
+ loss_token = F.binary_cross_entropy_with_logits(
625
+ token_y,
626
+ token_y.new_zeros(token_y.shape) + real_smooth,
627
+ reduction="sum",
628
+ )
629
+ if self.training and self.gradient_penalty > 0:
630
+ grad_pen = self.calc_gradient_penalty(token_x, dense_x)
631
+ grad_pen = grad_pen.sum() * self.gradient_penalty
632
+ else:
633
+ grad_pen = None
634
+ else:
635
+ grad_pen = None
636
+ loss_token = None
637
+ loss_dense = F.binary_cross_entropy_with_logits(
638
+ dense_y,
639
+ dense_y.new_zeros(dense_y.shape) + fake_smooth,
640
+ reduction="sum",
641
+ )
642
+ num_vars = dense_x.size(-1)
643
+ if prob_perplexity is not None:
644
+ code_pen = (num_vars - prob_perplexity) / num_vars
645
+ code_pen = code_pen * sample_size * self.code_penalty
646
+
647
+ if self.smoothness_weight > 0:
648
+ smoothness_loss = F.mse_loss(
649
+ dense_logits[:, :-1], dense_logits[:, 1:], reduction="none"
650
+ )
651
+ smoothness_loss[dense_padding_mask[:, 1:]] = 0
652
+ smoothness_loss = (
653
+ smoothness_loss.mean() * sample_size * self.smoothness_weight
654
+ )
655
+
656
+ if (self.mmi_weight > 0) and (aux_target is not None):
657
+ inter_x = self.decoder(gen_result["inter_x"])
658
+ if self.target_downsample_rate > 1:
659
+ aux_target = aux_target[:, :: self.target_downsample_rate]
660
+ max_t_len = min(aux_target.shape[1], inter_x.shape[1])
661
+ mmi_loss = F.cross_entropy(
662
+ inter_x[:, :max_t_len].transpose(1, 2),
663
+ aux_target[:, :max_t_len],
664
+ ignore_index=-1,
665
+ reduction="none",
666
+ )
667
+ mmi_loss = mmi_loss.mean() * mmi_loss.shape[0] * self.mmi_weight
668
+
669
+ result = {
670
+ "losses": {
671
+ "grad_pen": grad_pen,
672
+ "code_pen": code_pen,
673
+ "smoothness": smoothness_loss,
674
+ "mmi": mmi_loss,
675
+ },
676
+ "temp": self.curr_temp,
677
+ "code_ppl": code_perplexity,
678
+ "prob_ppl": prob_perplexity,
679
+ "d_steps": int(d_step),
680
+ "sample_size": sample_size,
681
+ }
682
+
683
+ suff = "_d" if d_step else "_g"
684
+ result["losses"]["dense" + suff] = loss_dense
685
+ result["losses"]["token" + suff] = loss_token
686
+
687
+ return result
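Note: a minimal inference sketch for the model above (hypothetical tensors; at decode time the generator is typically run alone by passing dense_x_only=True, so no discriminator pass or random text batch is needed):
with torch.no_grad():
    out = model(features, padding_mask, dense_x_only=True)  # generator + segmenter only
    phone_out = out["logits"]        # B x T' x num_phones (softmaxed probabilities unless cfg.no_softmax)
    pred = phone_out.argmax(-1)      # frame-level phone ids; ignore positions where out["padding_mask"] is True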
fairseq/examples/wav2vec/unsupervised/scripts/apply_pca.py ADDED
@@ -0,0 +1,76 @@
1
+ #!/usr/bin/env python3 -u
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import argparse
8
+ import os
9
+ import os.path as osp
10
+ import math
11
+ import numpy as np
12
+ import tqdm
13
+ import torch
14
+ from shutil import copyfile
15
+
16
+ from npy_append_array import NpyAppendArray
17
+
18
+
19
+ def get_parser():
20
+ parser = argparse.ArgumentParser(
21
+ description="transforms features via a given pca and stored them in target dir"
22
+ )
23
+ # fmt: off
24
+ parser.add_argument('source', help='directory with features')
25
+ parser.add_argument('--split', help='which split to read', required=True)
26
+ parser.add_argument('--save-dir', help='where to save the output', required=True)
27
+ parser.add_argument('--pca-path', type=str, help='pca location. will append _A.npy and _b.npy', required=True)
28
+ parser.add_argument('--batch-size', type=int, default=2048000, help='batch size')
29
+ parser.add_argument('--unfiltered', action='store_true', help='process the unfiltered version')
30
+ # fmt: on
31
+
32
+ return parser
33
+
34
+
35
+ def main():
36
+ parser = get_parser()
37
+ args = parser.parse_args()
38
+
39
+ source_path = osp.join(args.source, args.split)
40
+ data_path = source_path + "_unfiltered" if args.unfiltered else source_path
41
+
42
+ print(f"data path: {data_poth}")
43
+
44
+ features = np.load(data_path + ".npy", mmap_mode="r")
45
+ pca_A = torch.from_numpy(np.load(args.pca_path + "_A.npy")).cuda()
46
+ pca_b = torch.from_numpy(np.load(args.pca_path + "_b.npy")).cuda()
47
+
48
+ os.makedirs(args.save_dir, exist_ok=True)
49
+ save_path = osp.join(args.save_dir, args.split)
50
+
51
+ copyfile(source_path + ".tsv", save_path + ".tsv")
52
+ copyfile(data_path + ".lengths", save_path + ".lengths")
53
+
54
+ if osp.exists(source_path + ".phn"):
55
+ copyfile(source_path + ".phn", save_path + ".phn")
56
+
57
+ if osp.exists(source_path + ".wrd"):
58
+ copyfile(source_path + ".wrd", save_path + ".wrd")
59
+
60
+ if osp.exists(save_path + ".npy"):
61
+ os.remove(save_path + ".npy")
62
+ npaa = NpyAppendArray(save_path + ".npy")
63
+
64
+ batches = math.ceil(features.shape[0] / args.batch_size)
65
+
66
+ with torch.no_grad():
67
+ for b in tqdm.trange(batches):
68
+ start = b * args.batch_size
69
+ end = start + args.batch_size
70
+ x = torch.from_numpy(features[start:end]).cuda()
71
+ x = torch.matmul(x, pca_A) + pca_b
72
+ npaa.append(x.cpu().numpy())
73
+
74
+
75
+ if __name__ == "__main__":
76
+ main()
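Usage sketch (placeholder paths, mirroring the call in prepare_audio.sh below): python scripts/apply_pca.py $tgt_dir --split train --save-dir $tgt_dir/precompute_pca512 --pca-path $tgt_dir/pca/512_pca --batch-size 1048000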
fairseq/examples/wav2vec/unsupervised/scripts/copy_labels.py ADDED
@@ -0,0 +1,10 @@
1
+ #!/usr/bin/env python3 -u
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import sys
8
+
9
+ for idx, line in enumerate(sys.stdin):
10
+ print(f"utt{idx:010d} {line}", end="")
fairseq/examples/wav2vec/unsupervised/scripts/filter_lexicon.py ADDED
@@ -0,0 +1,40 @@
1
+ #!/usr/bin/env python3 -u
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import argparse
8
+ import sys
9
+
10
+ from fairseq.data import Dictionary
11
+
12
+
13
+ def get_parser():
14
+ parser = argparse.ArgumentParser(
15
+ description="filters a lexicon given a unit dictionary"
16
+ )
17
+ parser.add_argument("-d", "--unit-dict", help="unit dictionary", required=True)
18
+ return parser
19
+
20
+
21
+ def main():
22
+ parser = get_parser()
23
+ args = parser.parse_args()
24
+
25
+ d = Dictionary.load(args.unit_dict)
26
+ symbols = set(d.symbols)
27
+
28
+ for line in sys.stdin:
29
+ items = line.rstrip().split()
30
+ skip = len(items) < 2
31
+ for x in items[1:]:
32
+ if x not in symbols:
33
+ skip = True
34
+ break
35
+ if not skip:
36
+ print(line, end="")
37
+
38
+
39
+ if __name__ == "__main__":
40
+ main()
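Usage sketch (placeholder paths, mirroring prepare_text.sh below): python scripts/filter_lexicon.py -d $target_dir/phones/dict.txt < $target_dir/lexicon.lst > $target_dir/lexicon_filtered.lst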
fairseq/examples/wav2vec/unsupervised/scripts/filter_tsv.py ADDED
@@ -0,0 +1,37 @@
1
+ #!/usr/bin/env python3 -u
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import os
8
+ import argparse
9
+ import sys
10
+
11
+
12
+ parser = argparse.ArgumentParser()
13
+ parser.add_argument("--tsv", required=True, type=str)
14
+ parser.add_argument("--no-skip", action="store_true")
15
+ parser.add_argument("--keep", action="store_true")
16
+ params = parser.parse_args()
17
+
18
+
19
+ def get_fname(line):
20
+ p = os.path.basename(line.split("\t")[0])
21
+ p = os.path.splitext(p)[0]
22
+ return p
23
+
24
+
25
+ # filenames listed in --tsv (excluded by default, kept if --keep is set)
26
+ seen = set()
27
+ with open(params.tsv) as f:
28
+ if not params.no_skip:
29
+ root = next(f).rstrip()
30
+ for line in f:
31
+ seen.add(get_fname(line))
32
+
33
+ for i, line in enumerate(sys.stdin):
34
+ exists = get_fname(line) in seen
35
+ keep = (exists and params.keep) or (not exists and not params.keep)
36
+ if i == 0 or keep:
37
+ print(line, end="")
fairseq/examples/wav2vec/unsupervised/scripts/g2p_wrd_to_phn.py ADDED
@@ -0,0 +1,45 @@
1
+ #!/usr/bin/env python3 -u
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import argparse
8
+ import sys
9
+
10
+ from g2p_en import G2p
11
+
12
+
13
+ def main():
14
+ parser = argparse.ArgumentParser()
15
+ parser.add_argument(
16
+ "--compact",
17
+ action="store_true",
18
+ help="if set, compacts phones",
19
+ )
20
+ args = parser.parse_args()
21
+
22
+ compact = args.compact
23
+
24
+ wrd_to_phn = {}
25
+ g2p = G2p()
26
+ for line in sys.stdin:
27
+ words = line.strip().split()
28
+ phones = []
29
+ for w in words:
30
+ if w not in wrd_to_phn:
31
+ wrd_to_phn[w] = g2p(w)
32
+ if compact:
33
+ wrd_to_phn[w] = [
34
+ p[:-1] if p[-1].isnumeric() else p for p in wrd_to_phn[w]
35
+ ]
36
+ phones.extend(wrd_to_phn[w])
37
+ try:
38
+ print(" ".join(phones))
39
+ except:
40
+ print(wrd_to_phn, words, phones, file=sys.stderr)
41
+ raise
42
+
43
+
44
+ if __name__ == "__main__":
45
+ main()
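Usage sketch (placeholder paths, mirroring prepare_text.sh below; requires the g2p_en package): python scripts/g2p_wrd_to_phn.py --compact < $target_dir/words.txt > $target_dir/phones.txt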
fairseq/examples/wav2vec/unsupervised/scripts/ltr_to_wrd.py ADDED
@@ -0,0 +1,16 @@
1
+ #!/usr/bin/env python3 -u
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import sys
8
+
9
+
10
+ def main():
11
+ for line in sys.stdin:
12
+ print(line.replace(" ", "").replace("|", " ").strip())
13
+
14
+
15
+ if __name__ == "__main__":
16
+ main()
fairseq/examples/wav2vec/unsupervised/scripts/mean_pool.py ADDED
@@ -0,0 +1,99 @@
1
+ #!/usr/bin/env python3 -u
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import argparse
8
+ import os
9
+ import os.path as osp
10
+ import math
11
+ import numpy as np
12
+ import tqdm
13
+ import torch
14
+ import torch.nn.functional as F
15
+ from shutil import copyfile
16
+
17
+ from npy_append_array import NpyAppendArray
18
+
19
+
20
+ def get_parser():
21
+ parser = argparse.ArgumentParser(
22
+ description="mean pools representations by compressing uniform splits of the data"
23
+ )
24
+ # fmt: off
25
+ parser.add_argument('source', help='directory with features')
26
+ parser.add_argument('--split', help='which split to read', required=True)
27
+ parser.add_argument('--save-dir', help='where to save the output', required=True)
28
+ parser.add_argument('--subsample-rate', type=float, default=0.5, help='fraction of the original sequence length to pool down to')
29
+
30
+ parser.add_argument('--remove-extra', action='store_true', help="if set, removes extra states that can't be pooled, otherwise pads by repeating the last state")
31
+ # fmt: on
32
+
33
+ return parser
34
+
35
+
36
+ def main():
37
+ parser = get_parser()
38
+ args = parser.parse_args()
39
+
40
+ source_path = osp.join(args.source, args.split)
41
+
42
+ print(f"data path: {source_path}")
43
+
44
+ features = np.load(source_path + ".npy", mmap_mode="r")
45
+
46
+ os.makedirs(args.save_dir, exist_ok=True)
47
+ save_path = osp.join(args.save_dir, args.split)
48
+
49
+ copyfile(source_path + ".tsv", save_path + ".tsv")
50
+
51
+ if os.path.exists(source_path + ".phn"):
52
+ copyfile(source_path + ".phn", save_path + ".phn")
53
+ if os.path.exists(source_path + ".wrd"):
54
+ copyfile(source_path + ".wrd", save_path + ".wrd")
55
+
56
+ if os.path.exists(osp.join(args.source, "dict.phn.txt")):
57
+ copyfile(
58
+ osp.join(args.source, "dict.phn.txt"),
59
+ osp.join(args.save_dir, "dict.phn.txt"),
60
+ )
61
+
62
+ if osp.exists(save_path + ".npy"):
63
+ os.remove(save_path + ".npy")
64
+ npaa = NpyAppendArray(save_path + ".npy")
65
+
66
+ with open(source_path + ".lengths", "r") as lf:
67
+ lengths = lf.readlines()
68
+
69
+ fsz = features.shape[-1]
70
+ start = 0
71
+ with torch.no_grad():
72
+ with open(save_path + ".lengths", "w") as lengths_out:
73
+ for length in tqdm.tqdm(lengths):
74
+ length = int(length)
75
+ end = start + length
76
+ feats = features[start:end]
77
+ start += length
78
+ x = torch.from_numpy(feats).cuda()
79
+ target_num = math.ceil(length * args.subsample_rate)
80
+ rem = length % target_num
81
+
82
+ if rem > 0:
83
+ if args.remove_extra:
84
+ to_rem = target_num - rem
85
+ target_num -= 1
86
+ x = x[:-to_rem]
87
+ else:
88
+ to_add = target_num - rem
89
+ x = F.pad(x, [0, 0, 0, to_add])
90
+ x[-to_add:] = x[-to_add - 1]
91
+
92
+ x = x.view(target_num, -1, fsz)
93
+ x = x.mean(dim=-2)
94
+ print(target_num, file=lengths_out)
95
+ npaa.append(x.cpu().numpy())
96
+
97
+
98
+ if __name__ == "__main__":
99
+ main()
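Usage sketch (placeholder paths, mirroring prepare_audio.sh below): python scripts/mean_pool.py $tgt_dir/precompute_pca512_cls128_mean --split train --save-dir $tgt_dir/precompute_pca512_cls128_mean_pooled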
fairseq/examples/wav2vec/unsupervised/scripts/merge_clusters.py ADDED
@@ -0,0 +1,114 @@
1
+ #!/usr/bin/env python3 -u
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import argparse
8
+ import os
9
+ import os.path as osp
10
+ import numpy as np
11
+ import tqdm
12
+ import torch
13
+ import random
14
+ from shutil import copyfile
15
+
16
+ from npy_append_array import NpyAppendArray
17
+
18
+
19
+ def get_parser():
20
+ parser = argparse.ArgumentParser(
21
+ description="transforms features via a given pca and stored them in target dir"
22
+ )
23
+ # fmt: off
24
+ parser.add_argument('source', help='directory with features')
25
+ parser.add_argument('--split', help='which split to read', required=True)
26
+ parser.add_argument('--save-dir', help='where to save the output', required=True)
27
+ parser.add_argument('--cluster-dir', help='where the clusters are')
28
+ parser.add_argument('--pooling', type=str, default='mean', choices=['mean', 'sample'], help='how to pool')
29
+ # fmt: on
30
+
31
+ return parser
32
+
33
+
34
+ def main():
35
+ parser = get_parser()
36
+ args = parser.parse_args()
37
+
38
+ source_path = osp.join(args.source, args.split)
39
+ cluster_path = osp.join(args.cluster_dir, args.split + ".src")
40
+ print(f"data path: {source_path}")
41
+
42
+ features = np.load(source_path + ".npy", mmap_mode="r")
43
+ sizes = []
44
+ offsets = []
45
+ offset = 0
46
+ with open(source_path + ".lengths", "r") as len_f:
47
+ for line in len_f:
48
+ length = int(line.rstrip())
49
+ sizes.append(length)
50
+ offsets.append(offset)
51
+ offset += length
52
+
53
+ clusters = []
54
+ with open(cluster_path, "r") as cf:
55
+ for line in cf:
56
+ line = line.rstrip()
57
+ items = line.split()
58
+ items = list(map(int, items))
59
+ clusters.append(items)
60
+
61
+ os.makedirs(args.save_dir, exist_ok=True)
62
+ save_path = osp.join(args.save_dir, args.split)
63
+
64
+ copyfile(source_path + ".tsv", save_path + ".tsv")
65
+
66
+ if os.path.exists(source_path + ".phn"):
67
+ copyfile(source_path + ".phn", save_path + ".phn")
68
+ if os.path.exists(osp.join(args.source, "dict.phn.txt")):
69
+ copyfile(
70
+ osp.join(args.source, "dict.phn.txt"),
71
+ osp.join(args.save_dir, "dict.phn.txt"),
72
+ )
73
+ if os.path.exists(source_path + ".wrd"):
74
+ copyfile(source_path + ".wrd", save_path + ".wrd")
75
+
76
+ if osp.exists(save_path + ".npy"):
77
+ os.remove(save_path + ".npy")
78
+ npaa = NpyAppendArray(save_path + ".npy")
79
+
80
+ def merge(feats, clust):
81
+ feats = torch.from_numpy(feats.copy())
82
+ clust = torch.LongTensor(clust)
83
+ _, counts = clust.unique_consecutive(return_counts=True)
84
+ curr = 0
85
+
86
+ merged = []
87
+ for c in counts:
88
+ c = c.item()
89
+ start = curr
90
+ end = curr + c
91
+ curr += c
92
+ if args.pooling == "mean":
93
+ new_x = feats[start:end].mean(dim=0)
94
+ elif args.pooling == "sample":
95
+ new_x = feats[start + int(random.random() * c)]
96
+ else:
97
+ raise NotImplementedError()
98
+ merged.append(new_x)
99
+
100
+ return torch.stack(merged, dim=0).numpy()
101
+
102
+ with open(save_path + ".lengths", "w") as l_f:
103
+ for size, offset, clust in tqdm.tqdm(
104
+ zip(sizes, offsets, clusters), total=len(sizes)
105
+ ):
106
+ end = size + offset
107
+ feats = features[offset:end]
108
+ feats = merge(feats, clust)
109
+ print(len(feats), file=l_f)
110
+ npaa.append(feats)
111
+
112
+
113
+ if __name__ == "__main__":
114
+ main()
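Usage sketch (placeholder paths, mirroring prepare_audio.sh below): python scripts/merge_clusters.py $tgt_dir/precompute_pca512 --cluster-dir $tgt_dir/CLUS128 --split train --save-dir $tgt_dir/precompute_pca512_cls128_mean --pooling mean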
fairseq/examples/wav2vec/unsupervised/scripts/normalize_and_filter_text.py ADDED
@@ -0,0 +1,72 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import argparse
8
+ import fasttext as ft
9
+ import os
10
+ import regex
11
+ import sys
12
+
13
+
14
+ def get_parser():
15
+ parser = argparse.ArgumentParser(
16
+ description="reads text from stdin and outputs normalized, lid-filtered version to stdout"
17
+ )
18
+ parser.add_argument(
19
+ "--fasttext-model",
20
+ help="path to fasttext model",
21
+ default="lid.187.bin",
22
+ )
23
+ parser.add_argument("--lang", help="language id", required=True)
24
+ parser.add_argument(
25
+ "--lid-threshold",
26
+ type=float,
27
+ help="threshold for this lang id probability",
28
+ default=0.4,
29
+ )
30
+
31
+ return parser
32
+
33
+
34
+ def main():
35
+ parser = get_parser()
36
+ args = parser.parse_args()
37
+ filter_r = regex.compile(r"[^\p{L}\p{N}\p{M}\' \-]")
38
+
39
+ lg = args.lang.lower()
40
+ lg_label = f"__label__{lg}"
41
+ thresh = args.lid_threshold
42
+
43
+ if os.path.exists(args.fasttext_model):
44
+ model = ft.load_model(args.fasttext_model)
45
+ else:
46
+ print(
47
+ f"fasttext language id model {args.fasttext_model} not found. Proceeding without language filtering. "
48
+ f"To enable language filtering, please download the latest language id model "
49
+ f"from https://fasttext.cc/docs/en/language-identification.html",
50
+ file=sys.stderr,
51
+ )
52
+ model = None
53
+
54
+ for line in sys.stdin:
55
+ line = line.strip()
56
+ line = filter_r.sub(" ", line)
57
+ line = " ".join(line.split())
58
+
59
+ if model is not None:
60
+ lid, prob = model.predict(line, k=100)
61
+ try:
62
+ target_idx = lid.index(lg_label)
63
+ except ValueError:
64
+ continue
65
+ if target_idx == 0 or prob[target_idx] >= thresh:
66
+ print(line)
67
+ else:
68
+ print(line)
69
+
70
+
71
+ if __name__ == "__main__":
72
+ main()
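Usage sketch (placeholder paths, mirroring prepare_text.sh below; lid.187.bin is the fastText language-id model): python scripts/normalize_and_filter_text.py --lang en --fasttext-model lid.187.bin < corpus.txt > lm.upper.lid.txt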
fairseq/examples/wav2vec/unsupervised/scripts/normalize_text.py ADDED
@@ -0,0 +1,22 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import regex
8
+ import sys
9
+
10
+
11
+ def main():
12
+ filter_r = regex.compile(r"[^\p{L}\p{N}\p{M}\' \-]")
13
+
14
+ for line in sys.stdin:
15
+ line = line.strip()
16
+ line = filter_r.sub(" ", line)
17
+ line = " ".join(line.split())
18
+ print(line)
19
+
20
+
21
+ if __name__ == "__main__":
22
+ main()
fairseq/examples/wav2vec/unsupervised/scripts/pca.py ADDED
@@ -0,0 +1,53 @@
1
+ #!/usr/bin/env python3 -u
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import argparse
8
+ import os
9
+ import os.path as osp
10
+ import numpy as np
11
+
12
+ import faiss
13
+
14
+
15
+
16
+ def get_parser():
17
+ parser = argparse.ArgumentParser(
18
+ description="compute a pca matrix given an array of numpy features"
19
+ )
20
+ # fmt: off
21
+ parser.add_argument('data', help='numpy file containing features')
22
+ parser.add_argument('--output', help='where to save the pca matrix', required=True)
23
+ parser.add_argument('--dim', type=int, help='dim for pca reduction', required=True)
24
+ parser.add_argument('--eigen-power', type=float, default=0, help='eigen power, -0.5 for whitening')
25
+
26
+ return parser
27
+
28
+
29
+ def main():
30
+ parser = get_parser()
31
+ args = parser.parse_args()
32
+
33
+ print("Reading features")
34
+ x = np.load(args.data, mmap_mode="r")
35
+
36
+ print("Computing PCA")
37
+ pca = faiss.PCAMatrix(x.shape[-1], args.dim, args.eigen_power)
38
+ pca.train(x)
39
+ b = faiss.vector_to_array(pca.b)
40
+ A = faiss.vector_to_array(pca.A).reshape(pca.d_out, pca.d_in)
41
+
42
+ os.makedirs(args.output, exist_ok=True)
43
+
44
+ prefix = str(args.dim)
45
+ if args.eigen_power != 0:
46
+ prefix += f"_{args.eigen_power}"
47
+
48
+ np.save(osp.join(args.output, f"{prefix}_pca_A"), A.T)
49
+ np.save(osp.join(args.output, f"{prefix}_pca_b"), b)
50
+
51
+
52
+ if __name__ == "__main__":
53
+ main()
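Usage sketch (placeholder paths, mirroring prepare_audio.sh below): python scripts/pca.py $tgt_dir/train.npy --output $tgt_dir/pca --dim 512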
fairseq/examples/wav2vec/unsupervised/scripts/phonemize_with_sil.py ADDED
@@ -0,0 +1,83 @@
1
+ #!/usr/bin/env python3 -u
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import argparse
8
+ import numpy as np
9
+ import sys
10
+
11
+
12
+ def get_parser():
13
+ parser = argparse.ArgumentParser(
14
+ description="converts words to phones adding optional silences around in between words"
15
+ )
16
+ parser.add_argument(
17
+ "--sil-prob",
18
+ "-s",
19
+ type=float,
20
+ default=0,
21
+ help="probability of inserting silence between each word",
22
+ )
23
+ parser.add_argument(
24
+ "--surround",
25
+ action="store_true",
26
+ help="if set, surrounds each example with silence",
27
+ )
28
+ parser.add_argument(
29
+ "--lexicon",
30
+ help="lexicon to convert to phones",
31
+ required=True,
32
+ )
33
+
34
+ return parser
35
+
36
+
37
+ def main():
38
+ parser = get_parser()
39
+ args = parser.parse_args()
40
+
41
+ sil_prob = args.sil_prob
42
+ surround = args.surround
43
+ sil = "<SIL>"
44
+
45
+ wrd_to_phn = {}
46
+
47
+ with open(args.lexicon, "r") as lf:
48
+ for line in lf:
49
+ items = line.rstrip().split()
50
+ assert len(items) > 1, line
51
+ assert items[0] not in wrd_to_phn, items
52
+ wrd_to_phn[items[0]] = items[1:]
53
+
54
+ for line in sys.stdin:
55
+ words = line.strip().split()
56
+
57
+ if not all(w in wrd_to_phn for w in words):
58
+ continue
59
+
60
+ phones = []
61
+ if surround:
62
+ phones.append(sil)
63
+
64
+ sample_sil_probs = None
65
+ if sil_prob > 0 and len(words) > 1:
66
+ sample_sil_probs = np.random.random(len(words) - 1)
67
+
68
+ for i, w in enumerate(words):
69
+ phones.extend(wrd_to_phn[w])
70
+ if (
71
+ sample_sil_probs is not None
72
+ and i < len(sample_sil_probs)
73
+ and sample_sil_probs[i] < sil_prob
74
+ ):
75
+ phones.append(sil)
76
+
77
+ if surround:
78
+ phones.append(sil)
79
+ print(" ".join(phones))
80
+
81
+
82
+ if __name__ == "__main__":
83
+ main()
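Usage sketch (placeholder paths and an assumed silence probability of 0.25, following the pattern in prepare_text.sh below): python scripts/phonemize_with_sil.py -s 0.25 --surround --lexicon $target_dir/lexicon_filtered.lst < $target_dir/lm.upper.lid.txt > $target_dir/phones/lm.phones.filtered.txt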
fairseq/examples/wav2vec/unsupervised/scripts/prepare_audio.sh ADDED
@@ -0,0 +1,78 @@
1
+ #!/usr/bin/env zsh
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ source_dir=$1
8
+ tgt_dir=$2
9
+ model=$3
10
+
11
+ if [ -z "$4" ]
12
+ then
13
+ dim=512
14
+ else
15
+ dim=$4
16
+ fi
17
+
18
+ echo "using $dim dim for PCA"
19
+
20
+ if [ -z "$5" ]
21
+ then
22
+ layer=14
23
+ else
24
+ layer=$5
25
+ fi
26
+
27
+ echo "extracting from layer $layer"
28
+
29
+ train_split=train
30
+ valid_split=valid
31
+ test_split=test
32
+
33
+ all_splits=($train_split)
34
+
35
+ if [[ -f "$source_dir/valid.tsv" ]]; then
36
+ all_splits+=('valid')
37
+ fi
38
+
39
+ if [[ -f "$source_dir/test.tsv" ]]; then
40
+ all_splits+=('test')
41
+ fi
42
+
43
+ echo "processing splits: $all_splits"
44
+
45
+ mkdir -p $tgt_dir
46
+
47
+ cp $source_dir/*.tsv $tgt_dir
48
+ cp $source_dir/*.wrd $tgt_dir
49
+ cp $source_dir/*.ltr $tgt_dir
50
+ cp $source_dir/*.phn $tgt_dir
51
+ cp $source_dir/dict* $tgt_dir
52
+
53
+ setopt shwordsplit
54
+
55
+ for split in $all_splits; do
56
+ python $FAIRSEQ_ROOT/examples/wav2vec/unsupervised/scripts/wav2vec_extract_features.py $source_dir --split $split \
57
+ --save-dir $tgt_dir --checkpoint $model --layer $layer
58
+ done
59
+
60
+ python $FAIRSEQ_ROOT/examples/wav2vec/unsupervised/scripts/wav2vec_cluster_faiss.py $tgt_dir/${train_split}.tsv \
61
+ --checkpoint $model --save-dir $tgt_dir -f "CLUS128" --sample-pct 1.0
62
+
63
+ for split in $all_splits; do
64
+ python $FAIRSEQ_ROOT/examples/wav2vec/unsupervised/scripts/wav2vec_apply_cluster_faiss.py $tgt_dir \
65
+ --checkpoint $model --path $tgt_dir/CLUS128 --split $split
66
+ done
67
+
68
+ python $FAIRSEQ_ROOT/examples/wav2vec/unsupervised/scripts/pca.py $tgt_dir/${train_split}.npy --output $tgt_dir/pca --dim $dim
69
+
70
+ for split in $all_splits; do
71
+ python $FAIRSEQ_ROOT/examples/wav2vec/unsupervised/scripts/apply_pca.py $tgt_dir --split $split --save-dir $tgt_dir/precompute_pca$dim --pca-path $tgt_dir/pca/${dim}_pca --batch-size 1048000
72
+
73
+ python $FAIRSEQ_ROOT/examples/wav2vec/unsupervised/scripts/merge_clusters.py $tgt_dir/precompute_pca$dim --cluster-dir $tgt_dir/CLUS128 \
74
+ --split $split --save-dir $tgt_dir/precompute_pca${dim}_cls128_mean --pooling mean
75
+
76
+ python $FAIRSEQ_ROOT/examples/wav2vec/unsupervised/scripts/mean_pool.py $tgt_dir/precompute_pca${dim}_cls128_mean \
77
+ --save-dir $tgt_dir/precompute_pca${dim}_cls128_mean_pooled --split $split
78
+ done
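Usage sketch (placeholder paths; 512 and 14 are the default PCA dimension and wav2vec layer, and FAIRSEQ_ROOT must be set): zsh scripts/prepare_audio.sh /path/to/manifests /path/to/features /path/to/wav2vec_model.pt 512 14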
fairseq/examples/wav2vec/unsupervised/scripts/prepare_audio_v2.sh ADDED
@@ -0,0 +1,68 @@
1
+ #!/usr/bin/env zsh
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ source_dir=$1
8
+ tgt_dir=$2
9
+ model=$3
10
+
11
+ if [ -z "$4" ]
12
+ then
13
+ dim=64
14
+ else
15
+ dim=$4
16
+ fi
17
+
18
+ echo "using $dim clusters for auxilary target"
19
+
20
+ if [ -z "$5" ]
21
+ then
22
+ layer=14
23
+ else
24
+ layer=$5
25
+ fi
26
+
27
+ echo "extracting from layer $layer"
28
+
29
+ train_split=train
30
+ valid_split=valid
31
+ test_split=test
32
+
33
+ all_splits=($train_split)
34
+
35
+ if [[ -f "$source_dir/valid.tsv" ]]; then
36
+ all_splits+=('valid')
37
+ fi
38
+
39
+ if [[ -f "$source_dir/test.tsv" ]]; then
40
+ all_splits+=('test')
41
+ fi
42
+
43
+ echo "processing splits: $all_splits"
44
+
45
+ mkdir -p $tgt_dir
46
+
47
+ cp $source_dir/*.tsv $tgt_dir
48
+ cp $source_dir/*.wrd $tgt_dir
49
+ cp $source_dir/*.ltr $tgt_dir
50
+ cp $source_dir/*.phn $tgt_dir
51
+ cp $source_dir/dict* $tgt_dir
52
+
53
+ setopt shwordsplit
54
+
55
+ for split in $all_splits; do
56
+ python $FAIRSEQ_ROOT/examples/wav2vec/unsupervised/scripts/wav2vec_extract_features.py $source_dir --split $split \
57
+ --save-dir $tgt_dir --checkpoint $model --layer $layer
58
+ done
59
+
60
+
61
+ mkdir -p $tgt_dir/mfcc
62
+
63
+ # Consider splitting the corpus into chunks for large corpora; see the HuBERT preprocessing scripts for details
64
+ python $FAIRSEQ_ROOT/examples/hubert/simple_kmeans/dump_mfcc_feature.py \
65
+ $tgt_dir $train_split 1 0 $tgt_dir/mfcc
66
+ python $FAIRSEQ_ROOT/examples/hubert/simple_kmeans/dump_km_label.py \
67
+ $tgt_dir/mfcc $train_split $tgt_dir/mfcc/cls$dim 1 0 $tgt_dir/mfcc/cls${dim}_idx
68
+ cp $tgt_dir/mfcc/cls${dim}_idx/${train_split}_0_1.km $tgt_dir/$train_split.km
fairseq/examples/wav2vec/unsupervised/scripts/prepare_text.sh ADDED
@@ -0,0 +1,83 @@
1
+ #!/usr/bin/env zsh
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ lg=$1
8
+ text_path=$2
9
+ target_dir=$3
10
+ min_phones=$4
11
+ phonemizer=$5
12
+ lid_path=$6
13
+ sil_prob=$7
14
+
15
+ if [ -z "$lid_path" ]; then
16
+ lid_path="lid.187.bin"
17
+ fi
18
+
19
+ ph_lg=${lg:l}
20
+ if test "$lg" = 'fr'; then
21
+ ph_lg='fr-fr'
22
+ elif test "$lg" = 'en'; then
23
+ ph_lg='en-us'
24
+ elif test "$lg" = 'pt'; then
25
+ ph_lg='pt-br'
26
+ fi
27
+
28
+ ESPEAK_PATH=''
29
+ if test "$phonemizer" = 'espeak'; then
30
+ ESPEAK_PATH=$(which espeak)
31
+ elif test "$phonemizer" = 'espeak-ng'; then
32
+ ESPEAK_PATH=$(which espeak-ng)
33
+ elif test "$phonemizer" = 'G2P'; then
34
+ ESPEAK_PATH=''
35
+ else
36
+ echo "Unknown phonemizer $phonemizer. Valid options are espeak, espean-ng and G2P"
37
+ exit 1
38
+ fi
39
+
40
+ echo $lg
41
+ echo $ph_lg
42
+ echo $text_path
43
+ echo $target_dir
44
+ echo "min phone seen threshold is $min_phones"
45
+
46
+ mkdir -p $target_dir
47
+ python $FAIRSEQ_ROOT/examples/wav2vec/unsupervised/scripts/normalize_and_filter_text.py --lang $lg --fasttext-model $lid_path < $text_path | grep -v '\-\-\-' >! $target_dir/lm.upper.lid.txt
48
+ python $FAIRSEQ_ROOT/fairseq_cli/preprocess.py --dataset-impl mmap --trainpref $target_dir/lm.upper.lid.txt --only-source --destdir $target_dir --thresholdsrc 2 --padding-factor 1 --dict-only
49
+ cut -f1 -d' ' $target_dir/dict.txt | grep -v -x '[[:punct:]]*' | grep -Pv '\d\d\d\d\d+' >! $target_dir/words.txt
50
+
51
+
52
+ if [ -z "$ESPEAK_PATH" ]; then
53
+ python $FAIRSEQ_ROOT/examples/wav2vec/unsupervised/scripts/g2p_wrd_to_phn.py --compact < $target_dir/words.txt > $target_dir/phones.txt
54
+ else
55
+ # echoing 1 into corpus will prevent the mismatch lines between lexicon and phones in case the phonemizer fails
56
+ one=$(echo "1" | PHONEMIZER_ESPEAK_PATH=$ESPEAK_PATH phonemize -p ' ' -w '' -l $ph_lg --language-switch remove-flags)
57
+ sed 's/$/ 1/' $target_dir/words.txt | PHONEMIZER_ESPEAK_PATH=$ESPEAK_PATH phonemize -o $target_dir/phones.txt -p ' ' -w '' -l $ph_lg -j 70 --language-switch remove-flags
58
+ echo "one is ${one}"
59
+ sed -i "s/${one}$//" $target_dir/phones.txt
60
+ fi
61
+
62
+ paste $target_dir/words.txt $target_dir/phones.txt >! $target_dir/lexicon.lst
63
+
64
+ python $FAIRSEQ_ROOT/fairseq_cli/preprocess.py --dataset-impl mmap --trainpref $target_dir/phones.txt --only-source --destdir $target_dir/phones --thresholdsrc $min_phones --padding-factor 1 --dict-only
65
+
66
+ python $FAIRSEQ_ROOT/examples/wav2vec/unsupervised/scripts/filter_lexicon.py -d $target_dir/phones/dict.txt < $target_dir/lexicon.lst >! $target_dir/lexicon_filtered.lst
67
+ python $FAIRSEQ_ROOT/examples/wav2vec/unsupervised/scripts/phonemize_with_sil.py -s $sil_prob --surround --lexicon $target_dir/lexicon_filtered.lst < $target_dir/lm.upper.lid.txt >! $target_dir/phones/lm.phones.filtered.txt
68
+ cp $target_dir/phones/dict.txt $target_dir/phones/dict.phn.txt
69
+ echo "<SIL> 0" >> $target_dir/phones/dict.phn.txt
70
+ python $FAIRSEQ_ROOT/fairseq_cli/preprocess.py --dataset-impl mmap --trainpref $target_dir/phones/lm.phones.filtered.txt --workers 70 --only-source --destdir $target_dir/phones --srcdict $target_dir/phones/dict.phn.txt
71
+
72
+ $KENLM_ROOT/lmplz -o 4 < $target_dir/lm.upper.lid.txt --discount_fallback --prune 0 0 0 3 >! $target_dir/kenlm.wrd.o40003.arpa
73
+ $KENLM_ROOT/build_binary $target_dir/kenlm.wrd.o40003.arpa $target_dir/kenlm.wrd.o40003.bin
74
+
75
+ lg=$lg python $FAIRSEQ_ROOT/examples/speech_recognition/kaldi/kaldi_initializer.py kaldi_root=$KALDI_ROOT fst_dir=$target_dir/fst/phn_to_words_sil lm_arpa=$target_dir/kenlm.wrd.o40003.arpa wav2letter_lexicon=$target_dir/lexicon_filtered.lst data_dir=$target_dir/phones in_labels=phn "blank_symbol='<SIL>'"
76
+ lg=$lg python $FAIRSEQ_ROOT/examples/speech_recognition/kaldi/kaldi_initializer.py kaldi_root=$KALDI_ROOT fst_dir=$target_dir/fst/phn_to_words lm_arpa=$target_dir/kenlm.wrd.o40003.arpa wav2letter_lexicon=$target_dir/lexicon_filtered.lst data_dir=$target_dir/phones in_labels=phn
77
+
78
+ $KENLM_ROOT/lmplz -o 4 < $target_dir/phones/lm.phones.filtered.txt --discount_fallback >! $target_dir/phones/lm.phones.filtered.04.arpa
79
+ $KENLM_ROOT/build_binary $target_dir/phones/lm.phones.filtered.04.arpa $target_dir/phones/lm.phones.filtered.04.bin
80
+ $KENLM_ROOT/lmplz -o 6 < $target_dir/phones/lm.phones.filtered.txt --discount_fallback >! $target_dir/phones/lm.phones.filtered.06.arpa
81
+ $KENLM_ROOT/build_binary $target_dir/phones/lm.phones.filtered.06.arpa $target_dir/phones/lm.phones.filtered.06.bin
82
+
83
+ lg=$lg python $FAIRSEQ_ROOT/examples/speech_recognition/kaldi/kaldi_initializer.py kaldi_root=$KALDI_ROOT fst_dir=$target_dir/fst/phn_to_phn_sil lm_arpa=$target_dir/phones/lm.phones.filtered.06.arpa data_dir=$target_dir/phones in_labels=phn "blank_symbol='<SIL>'"
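Usage sketch (placeholder paths and example values for min_phones and sil_prob; assumes FAIRSEQ_ROOT, KENLM_ROOT and KALDI_ROOT are set): zsh scripts/prepare_text.sh en /path/to/corpus.txt /path/to/text_output 1000 espeak lid.187.bin 0.25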
fairseq/examples/wav2vec/unsupervised/scripts/prepare_timit.sh ADDED
@@ -0,0 +1,79 @@
1
+ #!/bin/bash
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ timit_root=$1 # assume it is the upper-cased version
8
+ tgt_dir=$2
9
+ model=$3
10
+
11
+ set -eu
12
+
13
+ setups="matched unmatched"
14
+ splits="test valid train train_text"
15
+
16
+ tgt_dir=$(realpath $tgt_dir)
17
+ sph2wav=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
18
+ wav_dir=$tgt_dir/wav
19
+
20
+
21
+ mkdir -p $tgt_dir $wav_dir
22
+ find $timit_root/{TRAIN,TEST} -iname "*.WAV" > $tgt_dir/all_sph.flist
23
+ cat $tgt_dir/all_sph.flist | sed -e 's#//*#/#g' -e 's#.*/\([^/]*\)/\([^/]*\).WAV#\1_\2#g' > $tgt_dir/all.uid
24
+ paste -d' ' $tgt_dir/{all_sph.flist,all.uid} | \
25
+ awk -v sph2wav=$sph2wav -v wav_dir=$wav_dir '{print sph2wav " -f wav " $1 " > " wav_dir "/" $2 ".wav"}' \
26
+ > $tgt_dir/sph2wav.sh
27
+ bash $tgt_dir/sph2wav.sh
28
+ cat $tgt_dir/all.uid | awk -v wav_dir=$(pwd)/$wav_dir '{print $1" "wav_dir"/"$1".wav"}' | sort > $tgt_dir/all_wav.scp
29
+ cut -d' ' -f2 $tgt_dir/all_wav.scp | xargs -I{} soxi -s {} > $tgt_dir/all.dur
30
+ paste -d' ' $tgt_dir/{all_wav.scp,all.dur} > $tgt_dir/all_wav_dur.scp
31
+ rm $tgt_dir/{all.uid,all_sph.flist,sph2wav.sh}
32
+
33
+ find $timit_root/{TRAIN,TEST} -iname "*.PHN" > $tgt_dir/all_phn60.flist
34
+ while read line; do
35
+ if [ ! -f $line ]; then
36
+ >&2 echo "Cannot find transcription file '$line'" && exit 1;
37
+ fi
38
+ cut -f3 -d' ' "$line" | tr '\n' ' ' | perl -ape 's: *$:\n:;'
39
+ done < $tgt_dir/all_phn60.flist > $tgt_dir/all.phn60
40
+ cat $tgt_dir/all_phn60.flist | sed -e 's#//*#/#g' -e 's#.*/\([^/]*\)/\([^/]*\).PHN#\1_\2#g' | \
41
+ paste -d' ' - $tgt_dir/all.phn60 | \
42
+ $KALDI_ROOT/egs/timit/s5/local/timit_norm_trans.pl -i - -m $KALDI_ROOT/egs/timit/s5/conf/phones.60-48-39.map -to 39 | \
43
+ sort > $tgt_dir/all.phn
44
+ echo "done preparing wav and 39-phone transcripts"
45
+
46
+
47
+ for s in $setups; do
48
+ mkdir -p $tgt_dir/$s
49
+ for x in $splits; do
50
+ uid_path=config/timit_${s}/${x}.uid
51
+ grep -w -f $uid_path $tgt_dir/all.phn | cut -d' ' -f2- > $tgt_dir/$s/$x.phn
52
+ ln -sf $(realpath $tgt_dir/$s/$x.phn) $tgt_dir/$s/$x.wrd
53
+
54
+ echo "/" > $tgt_dir/$s/$x.tsv && grep -w -f $uid_path $tgt_dir/all_wav_dur.scp | cut -d' ' -f2- | sed 's# #\t#' >> $tgt_dir/$s/$x.tsv
55
+ done
56
+
57
+ for x in $splits; do
58
+ cat $tgt_dir/$s/$x.phn
59
+ done | tr ' ' '\n' | sort -u | awk '{print $1" "1}' > $tgt_dir/$s/dict.phn.txt
60
+ ln -sf $(realpath $tgt_dir/$s/dict.phn.txt) $tgt_dir/$s/dict.wrd.txt
61
+ done
62
+ echo "done preparing unmatched and matched setups for TIMIT"
63
+
64
+
65
+ for s in $setups; do
66
+ zsh scripts/prepare_audio.sh $tgt_dir/$s $tgt_dir/$s/feat $model
67
+
68
+ lm_dir=$tgt_dir/$s/phones
69
+ fst_dir=$tgt_dir/$s/fst/phn_to_phn
70
+
71
+ python $FAIRSEQ_ROOT/fairseq_cli/preprocess.py --dataset-impl mmap --trainpref $tgt_dir/$s/train_text.phn --workers 10 --only-source --destdir $lm_dir --srcdict $tgt_dir/$s/dict.phn.txt
72
+ $KENLM_ROOT/lmplz -o 3 < $tgt_dir/$s/train_text.phn --discount_fallback >$lm_dir/train_text_phn.03.arpa
73
+ $KENLM_ROOT/build_binary $lm_dir/train_text_phn.03.arpa $lm_dir/train_text_phn.03.bin
74
+ $KENLM_ROOT/lmplz -o 4 < $tgt_dir/$s/train_text.phn --discount_fallback >$lm_dir/train_text_phn.04.arpa
75
+ $KENLM_ROOT/build_binary $lm_dir/train_text_phn.04.arpa $lm_dir/train_text_phn.04.bin
76
+
77
+ python $FAIRSEQ_ROOT/examples/speech_recognition/kaldi/kaldi_initializer.py kaldi_root=$KALDI_ROOT fst_dir=$fst_dir lm_arpa=$lm_dir/train_text_phn.03.arpa data_dir=$tgt_dir/$s in_labels=phn
78
+ done
79
+ echo "done preprocessing audio and text for wav2vec-U"
fairseq/examples/wav2vec/unsupervised/scripts/remove_silence.py ADDED
@@ -0,0 +1,63 @@
1
+ #!/usr/bin/env python3 -u
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ reads speech intervals from a .vads file, strips everything outside them (i.e. silences) from each audio file listed in the tsv, and saves the result under the --out folder
9
+ paths=shards/train.tsv
10
+ vads=shards/train.vads
11
+ python remove_silence.py --tsv $paths --vads $vads --out /path/to/output
12
+ """
13
+
14
+ import os
15
+ import argparse
16
+ import torch
17
+ import torchaudio
18
+ import tqdm
19
+
20
+
21
+ parser = argparse.ArgumentParser()
22
+ parser.add_argument("--tsv", default="", type=str)
23
+ parser.add_argument("--vads", default="", type=str)
24
+ parser.add_argument("--out", type=str)
25
+ params = parser.parse_args()
26
+
27
+ # load paths
28
+ paths = []
29
+ with open(params.tsv) as f:
30
+ root = next(f).rstrip()
31
+ for line in f:
32
+ paths.append(os.path.join(root, line.rstrip().split("\t")[0]))
33
+
34
+ # load vads
35
+ list_intervals = []
36
+ with open(params.vads) as f:
37
+ for line in f:
38
+ interval = [
39
+ [int(w.split(":")[0]), int(w.split(":")[1])] for w in line.rstrip().split()
40
+ ]
41
+ list_intervals.append(interval)
42
+
43
+
44
+ # load audio and keep only intervals (i.e. remove silences)
45
+ for i in tqdm.trange(len(paths)):
46
+ data, _ = torchaudio.load(paths[i])
47
+ if len(list_intervals[i]) > 0:
48
+ data_filtered = torch.cat(
49
+ [data[0][int(it[0]) : int(it[1])] for it in list_intervals[i]]
50
+ ).unsqueeze(0)
51
+ else:
52
+ data_filtered = data
53
+
54
+ # YOU MAY NEED TO MODIFY THIS TO GET THE RIGHT SUBPATH
55
+ # outpath = params.out + '/'.join(paths[i].split('/')[-1])
56
+ outpath = params.out + "/" + "/".join(paths[i].split("/")[-2:])
57
+
58
+ if not os.path.isdir("/".join(outpath.split("/")[:-1])):
59
+ os.makedirs("/".join(outpath.split("/")[:-1]))
60
+ if not os.path.exists(outpath):
61
+ torchaudio.save(outpath, data_filtered, sample_rate=16000)
62
+ else:
63
+ print(outpath, "exists!")
fairseq/examples/wav2vec/unsupervised/scripts/vads.py ADDED
@@ -0,0 +1,98 @@
1
+ #!/usr/bin/env python3 -u
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import argparse
8
+ import sys
9
+
10
+ from copy import deepcopy
11
+ from scipy.signal import lfilter
12
+
13
+ import numpy as np
14
+ from tqdm import tqdm
15
+ import soundfile as sf
16
+ import os.path as osp
17
+
18
+
19
+ def get_parser():
20
+ parser = argparse.ArgumentParser(description="compute vad segments")
21
+ parser.add_argument(
22
+ "--rvad-home",
23
+ "-r",
24
+ help="path to rvad home (see https://github.com/zhenghuatan/rVADfast)",
25
+ required=True,
26
+ )
27
+
28
+ return parser
29
+
30
+
31
+ def rvad(speechproc, path):
32
+ winlen, ovrlen, pre_coef, nfilter, nftt = 0.025, 0.01, 0.97, 20, 512
33
+ ftThres = 0.5
34
+ vadThres = 0.4
35
+ opts = 1
36
+
37
+ data, fs = sf.read(path)
38
+ assert fs == 16_000, "sample rate must be 16khz"
39
+ ft, flen, fsh10, nfr10 = speechproc.sflux(data, fs, winlen, ovrlen, nftt)
40
+
41
+ # --spectral flatness --
42
+ pv01 = np.zeros(ft.shape[0])
43
+ pv01[np.less_equal(ft, ftThres)] = 1
44
+ pitch = deepcopy(ft)
45
+
46
+ pvblk = speechproc.pitchblockdetect(pv01, pitch, nfr10, opts)
47
+
48
+ # --filtering--
49
+ ENERGYFLOOR = np.exp(-50)
50
+ b = np.array([0.9770, -0.9770])
51
+ a = np.array([1.0000, -0.9540])
52
+ fdata = lfilter(b, a, data, axis=0)
53
+
54
+ # --pass 1--
55
+ noise_samp, noise_seg, n_noise_samp = speechproc.snre_highenergy(
56
+ fdata, nfr10, flen, fsh10, ENERGYFLOOR, pv01, pvblk
57
+ )
58
+
59
+ # sets noisy segments to zero
60
+ for j in range(n_noise_samp):
61
+ fdata[range(int(noise_samp[j, 0]), int(noise_samp[j, 1]) + 1)] = 0
62
+
63
+ vad_seg = speechproc.snre_vad(
64
+ fdata, nfr10, flen, fsh10, ENERGYFLOOR, pv01, pvblk, vadThres
65
+ )
66
+ return vad_seg, data
67
+
68
+
69
+ def main():
70
+ parser = get_parser()
71
+ args = parser.parse_args()
72
+
73
+ sys.path.append(args.rvad_home)
74
+ import speechproc
75
+
76
+ stride = 160
77
+ lines = sys.stdin.readlines()
78
+ root = lines[0].rstrip()
79
+ for fpath in tqdm(lines[1:]):
80
+ path = osp.join(root, fpath.split()[0])
81
+ vads, wav = rvad(speechproc, path)
82
+
83
+ start = None
84
+ vad_segs = []
85
+ for i, v in enumerate(vads):
86
+ if start is None and v == 1:
87
+ start = i * stride
88
+ elif start is not None and v == 0:
89
+ vad_segs.append((start, i * stride))
90
+ start = None
91
+ if start is not None:
92
+ vad_segs.append((start, len(wav)))
93
+
94
+ print(" ".join(f"{v[0]}:{v[1]}" for v in vad_segs))
95
+
96
+
97
+ if __name__ == "__main__":
98
+ main()
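Usage sketch (placeholder paths; rVADfast must be cloned separately, see the --rvad-home help): python scripts/vads.py -r /path/to/rVADfast < /path/to/train.tsv > train.vads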
fairseq/examples/wav2vec/unsupervised/scripts/wav2vec_apply_cluster_faiss.py ADDED
@@ -0,0 +1,128 @@
1
+ #!/usr/bin/env python3 -u
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import argparse
8
+ import os
9
+ import os.path as osp
10
+ import numpy as np
11
+ import tqdm
12
+ import torch
13
+ import sys
14
+
15
+ import faiss
16
+ import torch.nn.functional as F
17
+
18
+ from wav2vec_cluster_faiss import parse_faiss_specs, Wav2VecFeatureReader
19
+
20
+
21
+ def get_parser():
22
+ parser = argparse.ArgumentParser(description="apply clusters")
23
+ # fmt: off
24
+ parser.add_argument('data', help='location of tsv files')
25
+ parser.add_argument('--split', help='split to process', required=True)
26
+ parser.add_argument('--labels', help='label file extension to read (e.g. phn)', default="phn")
27
+ parser.add_argument('--path', help='path to pca and centroids', required=True)
28
+ parser.add_argument('--checkpoint', type=str, help='checkpoint for wav2vec model (if using wav2vec features)', required=True)
29
+ parser.add_argument('--layer', '-l', type=int, help='which layer to read', default=14)
30
+ parser.add_argument('--max-tsz', type=int, help='batch kmeans up to this much', default=14)
31
+ # fmt: on
32
+
33
+ return parser
34
+
35
+
36
+ def get_iterator(args):
37
+ label_path = osp.join(args.data, f"{args.split}.{args.labels}")
38
+ if osp.exists(label_path):
39
+ lp = open(label_path, "r")
40
+ else:
41
+ lp = None
42
+
43
+ with open(osp.join(args.data, f"{args.split}.tsv"), "r") as fp:
44
+ lines = fp.read().split("\n")
45
+ root = lines.pop(0).strip()
46
+ files = [line.rstrip() for line in lines if len(line) > 0]
47
+
48
+ if lp is not None:
49
+ lbls = [line.rstrip() for line in lp]
50
+ else:
51
+ lbls = [None] * len(files)
52
+
53
+ num = len(files)
54
+ reader = Wav2VecFeatureReader(args.checkpoint, args.layer)
55
+
56
+ def iterate():
57
+ for fname, lbl in zip(files, lbls):
58
+ file = osp.join(root, fname.split("\t")[0])
59
+ feats = reader.get_feats(file)
60
+ yield feats.data, fname, lbl
61
+
62
+ return iterate, num, root
63
+
64
+
65
+ def main():
66
+ parser = get_parser()
67
+ args = parser.parse_args()
68
+
69
+ spec = osp.basename(args.path)
70
+
71
+ try:
72
+ faiss_spec = parse_faiss_specs(spec.rstrip("/"))[0]
73
+ except:
74
+ print(spec)
75
+ raise
76
+
77
+ print("Faiss Spec:", faiss_spec, file=sys.stderr)
78
+
79
+ if faiss_spec.pca:
80
+ A = torch.from_numpy(np.load(osp.join(args.path, "pca_A.npy"))).cuda()
81
+ b = torch.from_numpy(np.load(osp.join(args.path, "pca_b.npy"))).cuda()
82
+ print("Loaded PCA", file=sys.stderr)
83
+
84
+ centroids = np.load(osp.join(args.path, "centroids.npy"))
85
+ print("Loaded centroids", centroids.shape, file=sys.stderr)
86
+
87
+ res = faiss.StandardGpuResources()
88
+ index_flat = (
89
+ faiss.IndexFlatL2(centroids.shape[1])
90
+ if not faiss_spec.sphere
91
+ else faiss.IndexFlatIP(centroids.shape[1])
92
+ )
93
+ faiss_index = faiss.index_cpu_to_gpu(res, 0, index_flat)
94
+ faiss_index.add(centroids)
95
+
96
+ generator, num, root = get_iterator(args)
97
+ iterator = generator()
98
+
99
+ had_labels = False
100
+ label_path = osp.join(args.path, f"{args.split}.{args.labels}")
101
+
102
+ with torch.no_grad():
103
+ with open(osp.join(args.path, f"{args.split}.src"), "w") as fp, open(
104
+ osp.join(args.path, f"{args.split}.tsv"), "w"
105
+ ) as pp, open(label_path, "w") as lp:
106
+ print(root, file=pp)
107
+ for f, fname, lbl in tqdm.tqdm(iterator, total=num):
108
+ if faiss_spec.pca:
109
+ f = torch.mm(f, A) + b
110
+ if faiss_spec.norm:
111
+ f = F.normalize(f, p=2, dim=-1)
112
+
113
+ f = f.cpu().numpy()
114
+
115
+ _, z = faiss_index.search(f, 1)
116
+
117
+ print(" ".join(str(x.item()) for x in z), file=fp)
118
+ print(fname, file=pp)
119
+
120
+ if lbl is not None:
121
+ print(lbl, file=lp)
122
+ had_labels = True
123
+ if not had_labels:
124
+ os.remove(label_path)
125
+
126
+
127
+ if __name__ == "__main__":
128
+ main()
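Usage sketch (placeholder paths, mirroring prepare_audio.sh above): python scripts/wav2vec_apply_cluster_faiss.py $tgt_dir --checkpoint $model --path $tgt_dir/CLUS128 --split train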
fairseq/examples/wav2vec/unsupervised/scripts/wav2vec_cluster_faiss.py ADDED
@@ -0,0 +1,210 @@
1
+ #!/usr/bin/env python3 -u
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import argparse
8
+ import gc
9
+ import os
10
+ import os.path as osp
11
+ import random
12
+ import numpy as np
13
+ import tqdm
14
+ import torch
15
+
16
+ from collections import namedtuple
17
+
18
+ import faiss
19
+
20
+ import fairseq
21
+ import soundfile as sf
22
+
23
+
24
+ def get_parser():
25
+ parser = argparse.ArgumentParser(
26
+ description="compute kmeans codebook from kaldi-computed feats"
27
+ )
28
+ # fmt: off
29
+ parser.add_argument('data', help='location of tsv files')
30
+ parser.add_argument('--save-dir', help='where to save the output', required=True)
31
+ parser.add_argument('--checkpoint', type=str, help='checkpoint for wav2vec model (if using wav2vec features)', required=True)
32
+ parser.add_argument('--sample-pct', '-r', type=float, help='percentage of timesteps to sample', default=0)
33
+ parser.add_argument('--layer', '-l', type=int, help='which layer to read', default=14)
34
+ parser.add_argument('--faiss-specs', '-f', type=str,
35
+ help='faiss index specs; separated by space '
36
+ 'format is: PCAx_NORM_CLUSx_SPHERICAL -> '
37
+ 'PCAx if exists first apply PCA '
38
+ 'NORM if exists, normalize the vector by L2 norm '
39
+ 'CLUSx must exist, cluster to x clusters '
40
+ 'SPHERICAL if exists, apply spherical kmeans',
41
+ default='l2')
42
+ # fmt: on
43
+
44
+ return parser
45
+
46
+
47
+ faiss_spec = namedtuple("faiss_spec", ["pca", "norm", "n_clus", "sphere", "spec_str"])
48
+
49
+
50
+ def parse_faiss_specs(specs_str):
51
+ specs = []
52
+ for ss in specs_str.split():
53
+ comps = ss.split("_")
54
+ pca = 0
55
+ norm = False
56
+ n_clus = 0
57
+ sphere = False
58
+ for c in comps:
59
+ if c.startswith("PCA"):
60
+ pca = int(c[3:])
61
+ elif c == "NORM":
62
+ norm = True
63
+ elif c.startswith("CLUS"):
64
+ n_clus = int(c[4:])
65
+ elif c == "SPHERICAL":
66
+ sphere = True
67
+ assert n_clus > 0
68
+ specs.append(
69
+ faiss_spec(pca=pca, norm=norm, n_clus=n_clus, sphere=sphere, spec_str=ss)
70
+ )
71
+ return specs
72
+
73
+
74
+ class Wav2VecFeatureReader(object):
75
+ def __init__(self, cp_file, layer):
76
+ state = fairseq.checkpoint_utils.load_checkpoint_to_cpu(cp_file)
77
+
78
+ self.layer = layer
79
+
80
+ if "cfg" in state:
81
+ w2v_args = state["cfg"]
82
+ task = fairseq.tasks.setup_task(w2v_args.task)
83
+ model = task.build_model(w2v_args.model)
84
+ else:
85
+ w2v_args = state["args"]
86
+ task = fairseq.tasks.setup_task(w2v_args)
87
+ model = task.build_model(w2v_args)
88
+ model.load_state_dict(state["model"], strict=True)
89
+ model.eval()
90
+ model.cuda()
91
+ self.model = model
92
+
93
+ def read_audio(self, fname):
94
+ """Load an audio file and return PCM along with the sample rate"""
95
+ wav, sr = sf.read(fname)
96
+ assert sr == 16e3
97
+
98
+ return wav
99
+
100
+ def get_feats(self, loc):
101
+ x = self.read_audio(loc)
102
+ with torch.no_grad():
103
+ source = torch.from_numpy(x).view(1, -1).float().cuda()
104
+ res = self.model(
105
+ source=source, mask=False, features_only=True, layer=self.layer
106
+ )
107
+ return res["layer_results"][self.layer][0].squeeze(1)
108
+
109
+
110
+ def get_iterator(args):
111
+ with open(args.data, "r") as fp:
112
+ lines = fp.read().split("\n")
113
+ root = lines.pop(0).strip()
114
+ files = [osp.join(root, line.split("\t")[0]) for line in lines if len(line) > 0]
115
+
116
+ if getattr(args, "sample_pct", 0) > 0:
117
+ files = random.sample(files, int(args.sample_pct * len(files)))
118
+ num = len(files)
119
+ reader = Wav2VecFeatureReader(args.checkpoint, args.layer)
120
+
121
+ def iterate():
122
+ for fname in files:
123
+ feats = reader.get_feats(fname)
124
+ yield feats.cpu().numpy()
125
+
126
+ return iterate, num
127
+
128
+
129
+ def main():
130
+ parser = get_parser()
131
+ args = parser.parse_args()
132
+
133
+ faiss_specs = parse_faiss_specs(args.faiss_specs)
134
+ print("Faiss Specs:", faiss_specs)
135
+
136
+ feat_path = osp.join(args.save_dir, "features")
137
+ if osp.exists(feat_path + ".npy"):
138
+ feats = np.load(feat_path + ".npy")
139
+ else:
140
+ generator, num = get_iterator(args)
141
+ iterator = generator()
142
+
143
+ feats = []
144
+ for f in tqdm.tqdm(iterator, total=num):
145
+ feats.append(f)
146
+
147
+ del iterator
148
+ del generator
149
+
150
+ feats = np.concatenate(feats)
151
+
152
+ print(feats.shape)
153
+
154
+ os.makedirs(args.save_dir, exist_ok=True)
155
+ # np.save(feat_path, feats)
156
+
157
+ gc.collect()
158
+ torch.cuda.empty_cache()
159
+
160
+ reload = False
161
+ for spec in faiss_specs:
162
+ print("Processing spec", spec)
163
+
164
+ if reload:
165
+ print("Reloading...")
166
+ del feats
167
+ gc.collect()
168
+ feats = np.load(feat_path + ".npy")
169
+
170
+ save_path = osp.join(args.save_dir, spec.spec_str)
171
+ os.makedirs(save_path, exist_ok=True)
172
+ d = feats.shape[-1]
173
+ x = feats
174
+ if spec.pca > 0:
175
+ print("Computing PCA")
176
+ pca = faiss.PCAMatrix(d, spec.pca)
177
+ pca.train(x)
178
+ d = spec.pca
179
+ b = faiss.vector_to_array(pca.b)
180
+ A = faiss.vector_to_array(pca.A).reshape(pca.d_out, pca.d_in)
181
+ np.save(osp.join(save_path, "pca_A"), A.T)
182
+ np.save(osp.join(save_path, "pca_b"), b)
183
+ print("Applying PCA")
184
+ x = pca.apply_py(x)
185
+
186
+ if spec.norm:
187
+ reload = spec.pca <= 0
188
+ print("Normalizing")
189
+ faiss.normalize_L2(x)
190
+
191
+ print("Computing kmeans")
192
+ kmeans = faiss.Kmeans(
193
+ d,
194
+ spec.n_clus,
195
+ niter=50,
196
+ verbose=True,
197
+ spherical=spec.sphere,
198
+ max_points_per_centroid=feats.shape[0],
199
+ gpu=True,
200
+ nredo=3,
201
+ )
202
+ kmeans.train(x)
203
+ np.save(osp.join(save_path, "centroids"), kmeans.centroids)
204
+ del kmeans
205
+ del x
206
+ gc.collect()
207
+
208
+
209
+ if __name__ == "__main__":
210
+ main()
fairseq/examples/wav2vec/unsupervised/scripts/wav2vec_extract_features.py ADDED
@@ -0,0 +1,119 @@
1
+ #!/usr/bin/env python3 -u
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import argparse
8
+ import os
9
+ import os.path as osp
10
+ import tqdm
11
+ import torch
12
+ import torch.nn.functional as F
13
+ from shutil import copyfile
14
+
15
+ from npy_append_array import NpyAppendArray
16
+
17
+ import fairseq
18
+ import soundfile as sf
19
+
20
+
21
+ def get_parser():
22
+ parser = argparse.ArgumentParser(
23
+ description="compute kmeans codebook from kaldi-computed feats"
24
+ )
25
+ # fmt: off
26
+ parser.add_argument('data', help='location of tsv files')
27
+ parser.add_argument('--split', help='which split to read', required=True)
28
+ parser.add_argument('--save-dir', help='where to save the output', required=True)
29
+ parser.add_argument('--checkpoint', type=str, help='checkpoint for wav2vec ctc model', required=True)
30
+ parser.add_argument('--layer', type=int, default=14, help='which layer to use')
31
+ # fmt: on
32
+
33
+ return parser
34
+
35
+
36
+ class Wav2VecFeatureReader(object):
37
+ def __init__(self, cp_file, layer):
38
+ model, cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task(
39
+ [cp_file]
40
+ )
41
+ model = model[0]
42
+ model.eval()
43
+ model.cuda()
44
+ self.model = model
45
+ self.task = task
46
+ self.layer = layer
47
+
48
+ def read_audio(self, fname):
49
+ """Load an audio file and return PCM along with the sample rate"""
50
+ wav, sr = sf.read(fname)
51
+ assert sr == 16e3
52
+
53
+ return wav
54
+
55
+ def get_feats(self, loc):
56
+ x = self.read_audio(loc)
57
+ with torch.no_grad():
58
+ source = torch.from_numpy(x).float().cuda()
59
+ if self.task.cfg.normalize:
60
+ assert source.dim() == 1, source.dim()
61
+ with torch.no_grad():
62
+ source = F.layer_norm(source, source.shape)
63
+ source = source.view(1, -1)
64
+
65
+ m_res = self.model(source=source, mask=False, features_only=True, layer=self.layer)
66
+ return m_res["x"].squeeze(0).cpu()
67
+
68
+
69
+ def get_iterator(args):
70
+ with open(osp.join(args.data, args.split) + ".tsv", "r") as fp:
71
+ lines = fp.read().split("\n")
72
+ root = lines.pop(0).strip()
73
+ files = [osp.join(root, line.split("\t")[0]) for line in lines if len(line) > 0]
74
+
75
+ num = len(files)
76
+ reader = Wav2VecFeatureReader(args.checkpoint, args.layer)
77
+
78
+ def iterate():
79
+ for fname in files:
80
+ w2v_feats = reader.get_feats(fname)
81
+ yield w2v_feats
82
+
83
+ return iterate, num
84
+
85
+
86
+ def main():
87
+ parser = get_parser()
88
+ args = parser.parse_args()
89
+
90
+ os.makedirs(args.save_dir, exist_ok=True)
91
+
92
+ def create_files(dest):
93
+ copyfile(osp.join(args.data, args.split) + ".tsv", dest + ".tsv")
94
+ if osp.exists(osp.join(args.data, args.split) + ".wrd"):
95
+ copyfile(osp.join(args.data, args.split) + ".wrd", dest + ".wrd")
96
+ if osp.exists(osp.join(args.data, args.split) + ".phn"):
97
+ copyfile(osp.join(args.data, args.split) + ".phn", dest + ".phn")
98
+
99
+ if osp.exists(dest + ".npy"):
100
+ os.remove(dest + ".npy")
101
+ npaa = NpyAppendArray(dest + ".npy")
102
+ return npaa
103
+
104
+ save_path = osp.join(args.save_dir, args.split)
105
+ npaa = create_files(save_path)
106
+
107
+ generator, num = get_iterator(args)
108
+ iterator = generator()
109
+
110
+ with open(save_path + ".lengths", "w") as l_f:
111
+ for w2v_feats in tqdm.tqdm(iterator, total=num):
112
+ print(len(w2v_feats), file=l_f)
113
+
114
+ if len(w2v_feats) > 0:
115
+ npaa.append(w2v_feats.numpy())
116
+
117
+
118
+ if __name__ == "__main__":
119
+ main()
fairseq/examples/wav2vec/unsupervised/scripts/wer.py ADDED
@@ -0,0 +1,82 @@
1
+ #!/usr/bin/env python3 -u
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ Compute the unit error rate (UER) between a hypothesis transcription file and
9
+ a reference transcription file (one utterance per line, aligned line by line).
10
+ """
11
+ import argparse
12
+ import logging
13
+ import sys
14
+
15
+ import editdistance
16
+
17
+ logging.root.setLevel(logging.INFO)
18
+ logging.basicConfig(stream=sys.stdout, level=logging.INFO)
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
+ def get_parser():
23
+ parser = argparse.ArgumentParser()
24
+ parser.add_argument("-s", "--hypo", help="hypo transcription", required=True)
25
+ parser.add_argument(
26
+ "-r", "--reference", help="reference transcription", required=True
27
+ )
28
+ return parser
29
+
30
+
31
+ def compute_wer(ref_uid_to_tra, hyp_uid_to_tra, g2p):
32
+ d_cnt = 0
33
+ w_cnt = 0
34
+ w_cnt_h = 0
35
+ for uid in hyp_uid_to_tra:
36
+ ref = ref_uid_to_tra[uid].split()
37
+ if g2p is not None:
38
+ hyp = g2p(hyp_uid_to_tra[uid])
39
+ hyp = [p for p in hyp if p != "'" and p != " "]
40
+ hyp = [p[:-1] if p[-1].isnumeric() else p for p in hyp]
41
+ else:
42
+ hyp = hyp_uid_to_tra[uid].split()
43
+ d_cnt += editdistance.eval(ref, hyp)
44
+ w_cnt += len(ref)
45
+ w_cnt_h += len(hyp)
46
+ wer = float(d_cnt) / w_cnt
47
+ logger.debug(
48
+ (
49
+ f"wer = {wer * 100:.2f}%; num. of ref words = {w_cnt}; "
50
+ f"num. of hyp words = {w_cnt_h}; num. of sentences = {len(ref_uid_to_tra)}"
51
+ )
52
+ )
53
+ return wer
54
+
55
+
56
+ def main():
57
+ args = get_parser().parse_args()
58
+
59
+ errs = 0
60
+ count = 0
61
+ with open(args.hypo, "r") as hf, open(args.reference, "r") as rf:
62
+ for h, r in zip(hf, rf):
63
+ h = h.rstrip().split()
64
+ r = r.rstrip().split()
65
+ errs += editdistance.eval(r, h)
66
+ count += len(r)
67
+
68
+ logger.info(f"UER: {errs / count * 100:.2f}%")
69
+
70
+
71
+ if __name__ == "__main__":
72
+ main()
73
+
74
+
75
+ def load_tra(tra_path):
76
+ with open(tra_path, "r") as f:
77
+ uid_to_tra = {}
78
+ for line in f:
79
+ uid, tra = line.split(None, 1)
80
+ uid_to_tra[uid] = tra
81
+ logger.debug(f"loaded {len(uid_to_tra)} utterances from {tra_path}")
82
+ return uid_to_tra
fairseq/examples/wav2vec/unsupervised/scripts/wrd_to_ltr.py ADDED
@@ -0,0 +1,16 @@
1
+ #!/usr/bin/env python3 -u
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import sys
8
+
9
+
10
+ def main():
11
+ for line in sys.stdin:
12
+ print(" ".join(list(line.strip().replace(" ", "|"))) + " |")
13
+
14
+
15
+ if __name__ == "__main__":
16
+ main()
fairseq/examples/wav2vec/unsupervised/tasks/__init__.py ADDED
@@ -0,0 +1,11 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from .unpaired_audio_text import UnpairedAudioText
7
+
8
+
9
+ __all__ = [
10
+ "UnpairedAudioText",
11
+ ]
fairseq/examples/wav2vec/unsupervised/tasks/unpaired_audio_text.py ADDED
@@ -0,0 +1,452 @@
1
+ # Copyright (c) 2017-present, Facebook, Inc.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the LICENSE file in
5
+ # the root directory of this source tree. An additional grant of patent rights
6
+ # can be found in the PATENTS file in the same directory.
7
+
8
+ from dataclasses import dataclass, field
9
+ import logging
10
+ import math
11
+ import os
12
+ from typing import Optional
13
+ import torch
14
+
15
+ from fairseq.logging import metrics
16
+ from fairseq.tasks import FairseqTask, register_task
17
+ from ..data import ExtractedFeaturesDataset, RandomInputDataset
18
+
19
+ from fairseq.data import (
20
+ Dictionary,
21
+ data_utils,
22
+ StripTokenDataset,
23
+ )
24
+ from fairseq.dataclass import FairseqDataclass
25
+ from fairseq.distributed.utils import get_data_parallel_world_size
26
+ from omegaconf import MISSING
27
+
28
+ from examples.speech_recognition.kaldi.kaldi_decoder import (
29
+ KaldiDecoder,
30
+ KaldiDecoderConfig,
31
+ )
32
+
33
+
34
+ logger = logging.getLogger(__name__)
35
+
36
+
37
+ @dataclass
38
+ class DecodingConfig(FairseqDataclass):
39
+ kenlm_path: Optional[str] = None
40
+ lm_weight: float = 0
41
+ blank_weight: float = 0
42
+
43
+
44
+ @dataclass
45
+ class UnpairedAudioTextConfig(FairseqDataclass):
46
+ data: str = field(
47
+ default=MISSING, metadata={"help": "path to data directory containing audio"}
48
+ )
49
+ text_data: str = field(
50
+ default=MISSING, metadata={"help": "path to data directory containing text"}
51
+ )
52
+ max_length: Optional[int] = None
53
+ labels: Optional[str] = field(
54
+ default=None,
55
+ metadata={"help": "extension of the label file to load, used for fine-tuning"},
56
+ )
57
+ aux_target_postfix: Optional[str] = field(
58
+ default=None,
59
+ metadata={"help": "auxaliry target filename extension"},
60
+ )
61
+ unfiltered: bool = field(
62
+ default=False, metadata={"help": "load data with _unfiltered suffix"}
63
+ )
64
+ ctc_eval: bool = field(
65
+ default=False, metadata={"help": "eval UER as if computed by CTC"}
66
+ )
67
+ sort_by_length: bool = field(
68
+ default=True, metadata={"help": "sort examples by length of audio timesteps"}
69
+ )
70
+ shuffle: bool = field(default=True, metadata={"help": "shuffle examples"})
71
+ append_eos: bool = field(default=False, metadata={"help": "append eos"})
72
+ uppercase: Optional[bool] = field(
73
+ default=False, metadata={"help": "uppercase for LM score computation"}
74
+ )
75
+ skipwords: Optional[str] = field(
76
+ default="",
77
+ metadata={
78
+ "help": "comma-separated words to be removed for LM score computation"
79
+ },
80
+ )
81
+ kenlm_path: Optional[str] = None
82
+ vocab_usage_power: float = 2
83
+
84
+ word_decoder_config: Optional[KaldiDecoderConfig] = None
85
+ word_kenlm_path: Optional[str] = None
86
+
87
+ decoding_config: DecodingConfig = DecodingConfig()
88
+
89
+
90
+ @register_task("unpaired_audio_text", dataclass=UnpairedAudioTextConfig)
91
+ class UnpairedAudioText(FairseqTask):
92
+ """ """
93
+
94
+ cfg: UnpairedAudioTextConfig
95
+
96
+ def __init__(
97
+ self,
98
+ cfg: UnpairedAudioTextConfig,
99
+ source_dictionary=None,
100
+ target_dictionary=None,
101
+ ):
102
+ super().__init__(cfg)
103
+
104
+ self._target_dictionary = target_dictionary
105
+ self._source_dictionary = source_dictionary
106
+ self.num_symbols = (
107
+ len([s for s in target_dictionary.symbols if not s.startswith("madeup")])
108
+ - target_dictionary.nspecial
109
+ )
110
+ self.sil_id = (
111
+ target_dictionary.index("<SIL>") if "<SIL>" in target_dictionary else -1
112
+ )
113
+ self.kenlm = None
114
+ if cfg.kenlm_path is not None:
115
+ import kenlm
116
+
117
+ self.kenlm = kenlm.Model(cfg.kenlm_path)
118
+
119
+ self.word_kenlm = None
120
+ if cfg.word_kenlm_path is not None:
121
+ import kenlm
122
+
123
+ self.word_kenlm = kenlm.Model(cfg.word_kenlm_path)
124
+
125
+ self.uppercase = cfg.uppercase
126
+ self.skipwords = set(cfg.skipwords.split(","))
127
+
128
+ def str_postprocess(s):
129
+ s = " ".join(w for w in s.split() if w not in self.skipwords)
130
+ s = s.upper() if self.uppercase else s
131
+ return s
132
+
133
+ self.str_postprocess = str_postprocess
134
+ self.compute_lm_score = lambda s: self.kenlm.score(self.str_postprocess(s))
135
+
136
+ self.compute_word_score = None
137
+ if cfg.word_decoder_config is not None:
138
+ self.kaldi_decoder = KaldiDecoder(cfg.word_decoder_config, beam=10)
139
+
140
+ def compute_word_score(logits, padding):
141
+ res = self.kaldi_decoder.decode(logits, padding)
142
+ for r in res:
143
+ r = r.result()
144
+ assert len(r) == 1
145
+ r = r[0]
146
+ yield r["score"], r["words"]
147
+
148
+ self.compute_word_score = compute_word_score
149
+
150
+ @classmethod
151
+ def setup_task(cls, cfg: UnpairedAudioTextConfig, **kwargs):
152
+ """Setup the task (e.g., load dictionaries).
153
+
154
+ Args:
155
+ cfg (UnpairedAudioTextConfig): configuration of this task
156
+ """
157
+
158
+ dict_path = os.path.join(cfg.text_data, "dict.txt")
159
+ if os.path.exists(dict_path):
160
+ target_dictionary = Dictionary.load(dict_path)
161
+ else:
162
+ dict_path = os.path.join(cfg.data, f"dict.{cfg.labels}.txt")
163
+ target_dictionary = Dictionary.load(dict_path)
164
+
165
+ return cls(cfg, target_dictionary=target_dictionary)
166
+
167
+ def optimizer_step(self, optimizer, model, update_num):
168
+ if hasattr(model, "get_groups_for_update"):
169
+ groups = model.get_groups_for_update(update_num)
170
+ optimizer.step(groups={groups})
171
+ else:
172
+ optimizer.step()
173
+
174
+ def valid_step(self, sample, model, criterion):
175
+ res = model(
176
+ **sample["net_input"],
177
+ dense_x_only=True,
178
+ )
179
+
180
+ dense_x = res["logits"]
181
+ padding_mask = res["padding_mask"]
182
+
183
+ word_scores = None
184
+ if self.compute_word_score is not None:
185
+ word_scores = self.compute_word_score(dense_x.cpu(), padding_mask.cpu())
186
+
187
+ z = dense_x.argmax(-1)
188
+ z[padding_mask] = self.target_dictionary.pad()
189
+
190
+ vocab_seen = torch.zeros(self.num_symbols, dtype=torch.bool)
191
+
192
+ import editdistance
193
+
194
+ c_err = 0
195
+ c_len = 0
196
+ pred_c_len = 0
197
+ lm_score_sum = 0
198
+ for i, (x, t, id) in enumerate(
199
+ zip(
200
+ z,
201
+ sample["target"] if "target" in sample else [None] * len(z),
202
+ sample["id"],
203
+ )
204
+ ):
205
+
206
+ if t is not None:
207
+ t = t[(t >= self.target_dictionary.nspecial)]
208
+ x = x[
209
+ (x >= self.target_dictionary.nspecial)
210
+ & (x < (self.num_symbols + self.target_dictionary.nspecial))
211
+ ]
212
+ if self.sil_id >= 0:
213
+ x = x[x != self.sil_id]
214
+
215
+ vocab_seen[x - self.target_dictionary.nspecial] = True
216
+
217
+ pred_units_arr = x
218
+ if self.cfg.ctc_eval:
219
+ pred_units_arr = pred_units_arr.unique_consecutive()
220
+ pred_units_arr = pred_units_arr[pred_units_arr != 0]
221
+
222
+ if id == 0:
223
+ if t is not None:
224
+ logger.info(f"REF: {self.target_dictionary.string(t)}")
225
+ logger.info(f"HYP: {self.target_dictionary.string(pred_units_arr)}")
226
+
227
+ if self.kenlm is not None:
228
+ if t is not None:
229
+ ref_lm_s = self.compute_lm_score(
230
+ self.target_dictionary.string(t)
231
+ )
232
+ logger.info(
233
+ f"LM [REF]: {ref_lm_s}, {math.pow(10, -ref_lm_s / (len(t) + 1))}"
234
+ )
235
+
236
+ hyp_lm_s = self.compute_lm_score(
237
+ self.target_dictionary.string(pred_units_arr)
238
+ )
239
+ logger.info(
240
+ f"LM [HYP]: {hyp_lm_s}, {math.pow(10, -hyp_lm_s / (len(pred_units_arr) + 1))}"
241
+ )
242
+
243
+ pred_units_arr = pred_units_arr.tolist()
244
+
245
+ pred_c_len += len(pred_units_arr)
246
+
247
+ if t is not None:
248
+ t = t.tolist()
249
+ c_err += editdistance.eval(pred_units_arr, t)
250
+ c_len += len(t)
251
+ else:
252
+ c_len = pred_c_len
253
+
254
+ if self.kenlm is not None:
255
+ pred_str = self.target_dictionary.string(pred_units_arr)
256
+ lm_score = self.compute_lm_score(pred_str)
257
+ lm_score_sum += lm_score
258
+
259
+ kaldi_score_sum = 0
260
+ word_lm_sum = 0
261
+ num_words = 0
262
+ if word_scores is not None:
263
+ for score, words in word_scores:
264
+ kaldi_score_sum += score
265
+ num_words += len(words)
266
+ if self.word_kenlm is not None:
267
+ word_lm_sum += self.kenlm.score(" ".join(words))
268
+
269
+ try:
270
+ world_size = get_data_parallel_world_size()
271
+ except:
272
+ world_size = 1
273
+
274
+ logging_output = {
275
+ "loss": c_err,
276
+ "_num_char_errors": c_err,
277
+ "_num_chars": c_len,
278
+ "_num_pred_chars": pred_c_len,
279
+ "ntokens": c_len,
280
+ "nsentences": z.size(0),
281
+ "sample_size": c_len,
282
+ "_world_size": world_size,
283
+ "_lm_score_sum": lm_score_sum,
284
+ "_kaldi_score_sum": kaldi_score_sum,
285
+ "_word_lm_sum": word_lm_sum,
286
+ "_num_words": num_words,
287
+ "_vocab_seen": vocab_seen,
288
+ }
289
+
290
+ return c_err, c_len, logging_output
291
+
292
+ def load_dataset(self, split: str, task_cfg: FairseqDataclass = None, **kwargs):
293
+ data_path = self.cfg.data
294
+ task_cfg = task_cfg or self.cfg
295
+
296
+ has_unpaired_text = os.path.exists(
297
+ os.path.join(self.cfg.text_data, f"{split}.idx")
298
+ )
299
+
300
+ self.datasets[split] = ExtractedFeaturesDataset(
301
+ path=data_path,
302
+ split=split,
303
+ min_length=3,
304
+ max_length=task_cfg.max_length,
305
+ labels=None if has_unpaired_text else task_cfg.labels,
306
+ label_dict=self.target_dictionary,
307
+ shuffle=getattr(task_cfg, "shuffle", True),
308
+ sort_by_length=task_cfg.sort_by_length,
309
+ aux_target_postfix=task_cfg.aux_target_postfix,
310
+ )
311
+
312
+ logger.info(f"split {split} has unpaired text? {has_unpaired_text}")
313
+ if has_unpaired_text:
314
+ text_dataset = data_utils.load_indexed_dataset(
315
+ os.path.join(self.cfg.text_data, split), self.target_dictionary
316
+ )
317
+ text_dataset = StripTokenDataset(text_dataset, self.target_dictionary.eos())
318
+ self.datasets[split] = RandomInputDataset(
319
+ self.datasets[split],
320
+ text_dataset,
321
+ ["random_label"],
322
+ add_to_input=True,
323
+ pad_idx=self.target_dictionary.pad(),
324
+ )
325
+
326
+ @property
327
+ def source_dictionary(self):
328
+ return self._source_dictionary
329
+
330
+ @property
331
+ def target_dictionary(self):
332
+ """Return the :class:`~fairseq.data.Dictionary` for the language
333
+ model."""
334
+ return self._target_dictionary
335
+
336
+ def max_positions(self):
337
+ """Maximum input length supported by the encoder."""
338
+ return None
339
+
340
+ def reduce_metrics(self, logging_outputs, criterion):
341
+ super().reduce_metrics(logging_outputs, criterion)
342
+
343
+ zero = torch.scalar_tensor(0.0)
344
+ num_char_errors = sum(
345
+ log.get("_num_char_errors", zero) for log in logging_outputs
346
+ )
347
+ num_chars = sum(log.get("_num_chars", zero) for log in logging_outputs)
348
+ num_word_errors = sum(
349
+ log.get("_num_word_errors", zero) for log in logging_outputs
350
+ )
351
+ num_words = sum(log.get("_num_words", zero) for log in logging_outputs)
352
+ num_pred_chars = sum(
353
+ log.get("_num_pred_chars", zero) for log in logging_outputs
354
+ )
355
+
356
+ lm_score_sum = sum(log.get("_lm_score_sum", zero) for log in logging_outputs)
357
+ vocab_seen = (
358
+ sum(log.get("_vocab_seen", zero) for log in logging_outputs)
359
+ .bool()
360
+ .sum()
361
+ .item()
362
+ )
363
+ kaldi_score_sum = sum(
364
+ log.get("_kaldi_score_sum", zero) for log in logging_outputs
365
+ )
366
+ word_lm_sum = sum(log.get("_word_lm_sum", zero) for log in logging_outputs)
367
+
368
+ metrics.log_scalar_sum("_num_char_errors", num_char_errors)
369
+ metrics.log_scalar_sum("_num_chars", num_chars)
370
+ metrics.log_scalar_sum("_num_word_errors", num_word_errors)
371
+ metrics.log_scalar_sum("_num_words", num_words)
372
+
373
+ metrics.log_scalar_sum("lm_score_sum", lm_score_sum)
374
+ metrics.log_scalar_sum("num_pred_chars", num_pred_chars)
375
+
376
+ if self.cfg.word_kenlm_path is not None:
377
+ metrics.log_scalar_sum("kaldi_score_sum", kaldi_score_sum)
378
+ metrics.log_scalar_sum("word_lm_sum", word_lm_sum)
379
+
380
+ if num_chars > 0:
381
+ metrics.log_derived(
382
+ "uer",
383
+ lambda meters: meters["_num_char_errors"].sum
384
+ * 100.0
385
+ / meters["_num_chars"].sum
386
+ if meters["_num_chars"].sum > 0
387
+ else float("nan"),
388
+ )
389
+
390
+ if lm_score_sum < 0 and vocab_seen > 0:
391
+ metrics.log_scalar("vocab_seen_pct", vocab_seen / self.num_symbols)
392
+
393
+ metrics.log_derived(
394
+ "weighted_lm_ppl",
395
+ lambda meters: math.pow(
396
+ 10,
397
+ -meters["lm_score_sum"].sum
398
+ / (
399
+ meters["num_pred_chars"].sum + meters["nsentences"].sum
400
+ ), # account for </s>
401
+ )
402
+ / meters["vocab_seen_pct"].avg ** self.cfg.vocab_usage_power,
403
+ )
404
+
405
+ metrics.log_derived(
406
+ "lm_ppl",
407
+ lambda meters: math.pow(
408
+ 10,
409
+ -meters["lm_score_sum"].sum
410
+ / (
411
+ meters["num_pred_chars"].sum + meters["nsentences"].sum
412
+ ), # account for </s>
413
+ ),
414
+ )
415
+ else:
416
+ metrics.log_derived("weighted_lm_ppl", lambda meters: float("inf"))
417
+
418
+ if num_words > 0:
419
+ if word_lm_sum != 0:
420
+ metrics.log_derived(
421
+ "word_lm_ppl",
422
+ lambda meters: math.pow(
423
+ 10,
424
+ -meters["word_lm_sum"].sum
425
+ / (
426
+ meters["_num_words"].sum + meters["nsentences"].sum
427
+ ), # account for </s>
428
+ ),
429
+ )
430
+ metrics.log_derived(
431
+ "weighted_word_lm_ppl",
432
+ lambda meters: math.pow(
433
+ 10,
434
+ -meters["word_lm_sum"].sum
435
+ / (
436
+ meters["_num_words"].sum + meters["nsentences"].sum
437
+ ), # account for </s>
438
+ )
439
+ / meters["vocab_seen_pct"].avg ** self.cfg.vocab_usage_power,
440
+ )
441
+
442
+ if self.cfg.word_kenlm_path is not None:
443
+ metrics.log_derived(
444
+ "kaldi_score",
445
+ lambda meters: meters["kaldi_score_sum"].sum
446
+ / meters["nsentences"].sum,
447
+ )
448
+
449
+ def build_model(self, cfg: FairseqDataclass, from_checkpoint=False):
450
+ model = super().build_model(cfg)
451
+
452
+ return model
fairseq/examples/wav2vec/xlsr/README.md ADDED
@@ -0,0 +1,95 @@
1
+ # XLS-R
2
+
3
+ XLS-R is a set of large-scale models for self-supervised cross-lingual speech representation learning based on wav2vec 2.0. It was pretrained on 128 languages and approximately 436K hours of unlabeled speech data. With finetuning, these models achieve state-of-the-art performance in speech translation, speech recognition and language identification. We evaluate the model across multiple benchmarks such as CoVoST-2 for speech translation; BABEL, MLS, CommonVoice and VoxPopuli for automatic speech recognition; and VoxLingua107 for language identification, as well as VoxCeleb1 for speaker identification. More details about this work can be found in our [paper](https://arxiv.org/pdf/2111.09296.pdf) and download links can be found below.
4
+
5
+ Model | Link
6
+ |------|------
7
+ XLS-R 300M | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/xlsr2_300m.pt)
8
+ XLS-R 1B | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/xlsr2_960m_1000k.pt)
9
+ XLS-R 2B | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/xlsr2_2B_1000k.pt)
10
+
11
+ You can also download these models [here](https://huggingface.co/models?other=xls_r) and read more about them in the [blog post](https://huggingface.co/blog/fine-tune-xlsr-wav2vec2) from Hugging Face.
12
+
13
+ ## Speech Translation Finetuned Models
14
+
15
+ We multilingually finetune XLS-R models on [CoVoST 2](https://github.com/facebookresearch/covost), which has 21
16
+ into-English and 15 out-of-English directions.
17
+
18
+ Model | Directions | Link
19
+ |------|------|------
20
+ XLS-R 300M | 21 langs &#8594; En | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/xls_r_300m_21_en.pt)
21
+ XLS-R 300M | En &#8594; 15 langs | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/xls_r_300m_en_15.pt)
22
+ XLS-R 1B | 21 langs &#8594; En | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/xls_r_1b_21_en.pt)
23
+ XLS-R 1B | En &#8594; 15 langs | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/xls_r_1b_en_15.pt)
24
+ XLS-R 2B | 21 langs &#8594; En | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/xls_r_2b_21_en.pt)
25
+ XLS-R 2B | En &#8594; 15 langs | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/xls_r_2b_en_15.pt)
26
+ XLS-R 2B | 21 langs &#8594; En + En &#8594; 15 langs | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/xls_r_2b_22_16.pt)
27
+
28
+ ## ASR Finetuning
29
+
30
+ You can refer to the original wav2vec documentation for detailed instructions on how to finetune a pretrained model with CTC [here](https://github.com/pytorch/fairseq/tree/main/examples/wav2vec#fine-tune-a-pre-trained-model-with-ctc). Below is an example command; the hyperparameter values needed to reproduce the results in our paper are given in the following sections.
31
+
32
+ ```shell script
33
+ $ fairseq-hydra-train \
34
+ distributed_training.distributed_port=$PORT \
35
+ task.data=/path/to/data \
36
+ model.w2v_path=/path/to/model.pt \
37
+ --config-dir /path/to/fairseq-py/examples/wav2vec/xlsr/config \
38
+ --config-name finetune
39
+ ```
40
+
41
+ For finetuning the 300M and 1B models, we use the same hyperparameter settings defined in `finetune.yaml`. We vary `optimization.max_update` as described in the table below, and `optimization.lr` is picked from the interval [2e-5, 3e-4] based on dev word error rate; a sketch of the corresponding override command is shown after the table.
42
+
43
+ Benchmark | Total Number of Updates
44
+ |------|------
45
+ Babel | 26000
46
+ Common Voice | 13000
47
+ VoxPopuli | 50000
48
+ MLS 10h | 20000
49
+
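+ For example (a sketch, not a command from the paper: paths are placeholders and the learning rate is just one value from the interval above), a Common Voice finetuning run of the 300M model could be launched with:
+
+ ```shell script
+ $ fairseq-hydra-train \
+     distributed_training.distributed_port=$PORT \
+     task.data=/path/to/data \
+     model.w2v_path=/path/to/xlsr2_300m.pt \
+     optimization.max_update=13000 \
+     'optimization.lr=[0.0001]' \
+     --config-dir /path/to/fairseq-py/examples/wav2vec/xlsr/config \
+     --config-name finetune
+ ```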
50
+ For finetuning the 2B model, we make some additional changes to `finetune.yaml`. We use the fully_sharded `distributed_training.ddp_backend` provided by the [fairscale](https://github.com/facebookresearch/fairscale) library and set `model.activation_checkpoint` to true. We also increase `dataset.max_tokens` to 2560000 and use a total effective batch size of 2560000 * 24 tokens. We sweep for the best `optimization.lr` within the interval [3e-6, 3e-5] based on dev error rate. For the Common Voice dataset, we pick `model.mask_prob` per language from {0.30, 0.40} based on the best dev error rate. A sketch of the corresponding overrides is shown below.
51
+
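+ As a sketch of the 2B setup (again with placeholder paths and an illustrative learning rate from the interval above; the activation-checkpointing key is written here as it appears in `finetune.yaml`), the overrides could look like:
+
+ ```shell script
+ $ fairseq-hydra-train \
+     distributed_training.distributed_port=$PORT \
+     distributed_training.ddp_backend=fully_sharded \
+     task.data=/path/to/data \
+     model.w2v_path=/path/to/xlsr2_2B_1000k.pt \
+     model.checkpoint_activations=true \
+     dataset.max_tokens=2560000 \
+     'optimization.lr=[0.00001]' \
+     --config-dir /path/to/fairseq-py/examples/wav2vec/xlsr/config \
+     --config-name finetune
+ ```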
52
+ ## LID Inference
53
+
54
+ Model | Link
55
+ |------|------
56
+ XLS-R 300M + ft Voxlingua107 | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/xlsr_300m_voxlingua107_ft.pt)
57
+
58
+ How to run inference & calculate accuracy (step-by-step):
59
+ 1. Download the Voxlingua107 checkpoint from the table above.
60
+ 2. Use this python script to extract logit/embedding from the XLSR model: https://github.com/fairinternal/fairseq-py/blob/xlsr2/examples/wav2vec/gen_audio_embedding.py
61
+ ```shell command
62
+ CUDA_VISIBLE_DEVICES=0 PYTHONPATH=. python3 examples/wav2vec/gen_audio_embedding.py \
63
+ /fsx/data/VoxLingua107/manifest --path "/path/to/checkpoint.pt" \
64
+ --task audio_classification --batch-size 90 --gen-subset test \
65
+ --infer-manifest /fsx/data/VoxLingua107/manifest/test.tsv \
66
+ --infer-xtimes 10 --infer-max-sample-size 160000 --output-path /tmp/tmp_voxling_infer.npz
67
+ ```
68
+
69
+ 3. Calculate the overall accuracy, as well as accuracy for 0-5 second and 5-20 second utterances:
70
+ ```shell command
71
+ PYTHONPATH='.' python examples/wav2vec/eval_speaker_clf_task.py \
72
+ --task cls --merge mean_logit --data /tmp/tmp_voxling_infer.npz
73
+
74
+ Output:
75
+ | run classification evaluation
76
+ | acc = 94.34% -- err = 5.66% -- correct=1518 total=1609
77
+ | acc 0to5 = 90.91% -- err = 9.09% -- c_5=230.0 t_5=253
78
+ | acc 5to20 = 94.99% -- err = 5.01% -- c_20=1288.0 t_20=1356
79
+ ```
80
+
81
+ ## Citation
82
+
83
+ Please cite as:
84
+
85
+ ``` bibtex
86
+ @article{babu2021xlsr,
87
+ title={XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale},
88
+ author={Arun Babu and Changhan Wang and Andros Tjandra and Kushal Lakhotia and Qiantong Xu and Naman Goyal and Kritika Singh and Patrick von Platen and Yatharth Saraf and Juan Pino and Alexei Baevski and Alexis Conneau and Michael Auli},
89
+ year={2021},
90
+ volume={abs/2111.09296},
91
+ journal={arXiv},
92
+ }
93
+ ```
94
+
95
+
fairseq/examples/wav2vec/xlsr/config/finetune.yaml ADDED
@@ -0,0 +1,66 @@
1
+ # @package _group_
2
+
3
+ common:
4
+ fp16: true
5
+ log_format: json
6
+ log_interval: 200
7
+ tensorboard_logdir: tb
8
+
9
+ checkpoint:
10
+ save_interval: 1000
11
+ save_interval_updates: 1000
12
+ keep_interval_updates: 1
13
+ no_epoch_checkpoints: true
14
+ best_checkpoint_metric: wer
15
+
16
+ task:
17
+ _name: audio_finetuning
18
+ data: ???
19
+ normalize: true
20
+ labels: ltr
21
+
22
+ dataset:
23
+ num_workers: 6
24
+ max_tokens: 1280000
25
+ skip_invalid_size_inputs_valid_test: true
26
+ validate_after_updates: 10000
27
+ validate_interval_updates: 1000
28
+ valid_subset: valid
29
+
30
+ distributed_training:
31
+ ddp_backend: legacy_ddp
32
+ distributed_world_size: 4
33
+
34
+ criterion:
35
+ _name: ctc
36
+ zero_infinity: true
37
+
38
+ optimization:
39
+ max_update: ???
40
+ lr: [0.0003]
41
+ sentence_avg: true
42
+ update_freq: [5]
43
+
44
+ optimizer:
45
+ _name: adam
46
+ adam_betas: (0.9,0.98)
47
+ adam_eps: 1e-08
48
+
49
+ lr_scheduler:
50
+ _name: tri_stage
51
+ phase_ratio: [0.1, 0.4, 0.5]
52
+ final_lr_scale: 0.05
53
+
54
+ model:
55
+ _name: wav2vec_ctc
56
+ w2v_path: ???
57
+ apply_mask: true
58
+ mask_prob: 0.75
59
+ mask_channel_prob: 0.25
60
+ mask_channel_length: 64
61
+ layerdrop: 0.1
62
+ activation_dropout: 0.1
63
+ feature_grad_mult: 0.0
64
+ freeze_finetune_updates: 10000
65
+
66
+ checkpoint_activations: false
fairseq/examples/wav2vec/xlsr/scripts/eval_speaker_clf_task.py ADDED
@@ -0,0 +1,173 @@
1
+ """
2
+ Usage:
3
+ This script evaluates the classification accuracy / error rate from the embeddings extracted
4
+ by gen_audio_embedding.py
5
+ Example (LID classification)
6
+
7
+ PYTHONPATH='.' python examples/wav2vec/eval_speaker_clf_task.py \
8
+ --data /fsx/androstj/exps/lid_voxlingua/infer/atj_xlsr2_100pct_300M_mean_fast_upd_100k_new.npz \
9
+ --task cls --merge mean_logit
10
+ """
11
+ import numpy as np
12
+ import sklearn
13
+ from sklearn.metrics.pairwise import cosine_similarity
14
+ from sklearn.preprocessing import StandardScaler
15
+ from tqdm import tqdm
16
+ import ipdb
17
+ import logging
18
+ import argparse
19
+ from scipy.special import softmax
20
+
21
+ log=logging.getLogger(__name__)
22
+ log.setLevel(logging.INFO)
23
+
24
+ def calculate_eer(y_label, y_score):
25
+ # y_label denotes ground-truth labels,
26
+ # y_score denotes the prediction scores.
27
+ from scipy.optimize import brentq
28
+ from sklearn.metrics import roc_curve
29
+ from scipy.interpolate import interp1d
30
+
31
+ fpr, tpr, thresholds = roc_curve(y_label, y_score, pos_label=1)
32
+ eer = brentq(lambda x : 1. - x - interp1d(fpr, tpr)(x), 0., 1.)
33
+ optimal_threshold = interp1d(fpr, thresholds)(eer)
34
+ return eer, optimal_threshold
35
+
36
+ def calculate_minDCF(y_label, y_score, p_target=0.01, c_miss=1, c_fa=1):
37
+ # https://github.com/kaldi-asr/kaldi/blob/master/egs/sre08/v1/sid/compute_min_dcf.py
38
+ from sklearn.metrics import det_curve
39
+ fpr, fnr, thresholds = det_curve(y_label, y_score, pos_label=1)
40
+ min_c_det = float("inf")
41
+ min_c_det_threshold = thresholds[0]
42
+ for i in range(0, len(fpr)):
43
+ # See Equation (2). it is a weighted sum of false negative
44
+ # and false positive errors.
45
+ c_det = c_miss * fnr[i] * p_target + c_fa * fpr[i] * (1 - p_target)
46
+ if c_det < min_c_det:
47
+ min_c_det = c_det
48
+ min_c_det_threshold = thresholds[i]
49
+ # See Equations (3) and (4). Now we normalize the cost.
50
+ c_def = min(c_miss * p_target, c_fa * (1 - p_target))
51
+ min_dcf = min_c_det / c_def
52
+ return min_dcf, min_c_det_threshold
53
+
54
+
55
+ if __name__ == '__main__':
56
+ parser = argparse.ArgumentParser()
57
+ parser.add_argument('--data', help='npz contains name & latent file')
58
+ parser.add_argument('--task', choices=['cls', 'veri', 'cls_voxlingua'])
59
+ parser.add_argument('--merge', choices=['mean_logit', 'first_logit', 'mean_latent_sim', 'first_latent_sim', 'mean_logit_sim', 'first_logit_sim'])
60
+ parser.add_argument('--veri-pair', help='verification file contains 1/0 utt_x utt_y')
61
+ parser.add_argument('--scaler', type=str, choices=['mean_var'])
62
+ parser.add_argument('--compress-method', choices=['pca'])
63
+ parser.add_argument('--compress-dim', type=int)
64
+ args = parser.parse_args()
65
+
66
+ if args.task in ['cls', 'cls_voxlingua']:
67
+ print('| run classification evaluation')
68
+ data = np.load(args.data)
69
+ data_logit = data['logit']
70
+ data_target = data['target']
71
+ data_src_len = data['src_len']
72
+ assert data_logit.shape[0] == data_target.shape[0]
73
+ B = data_logit.shape[0]
74
+ correct = 0
75
+ total = 0
76
+ data_prob = softmax(data_logit, axis=2)
77
+ correct_vs_len = np.empty((B, 2))
78
+ for ii in range(B):
79
+ _target = data_target[ii]
80
+ if args.merge == 'mean_logit':
81
+ _prob = np.mean(data_prob[ii], axis=0)
82
+ top_1 = np.argmax(_prob)
83
+ elif args.merge == 'first_logit':
84
+ _prob = data_prob[ii][0]
85
+ top_1 = np.argmax(_prob)
86
+ else :
87
+ raise ValueError()
88
+ is_top_1 = (1 if top_1 == _target else 0)
89
+ correct += is_top_1
90
+ total += 1
91
+ _src_len = data_src_len[ii] / 16000
92
+ correct_vs_len[ii] = [is_top_1, _src_len]
93
+
94
+ acc = correct / total * 100
95
+ t_5 = correct_vs_len[:, 1] <= 5
96
+ t_20 = correct_vs_len[:, 1] > 5
97
+ c_5 = correct_vs_len[t_5, 0].sum()
98
+ c_20 = correct_vs_len[t_20, 0].sum()
99
+ t_5 = t_5.sum()
100
+ t_20 = t_20.sum()
101
+ acc_5 = c_5 / t_5 * 100
102
+ acc_20 = c_20 / t_20 * 100
103
+ print(f'| acc = {acc:.2f}% -- err = {100-acc:.2f}% -- {correct=} {total=}')
104
+ print(f'| acc 0to5 = {acc_5:.2f}% -- err = {100-acc_5:.2f}% -- {c_5=} {t_5=}')
105
+ print(f'| acc 5to20 = {acc_20:.2f}% -- err = {100-acc_20:.2f}% -- {c_20=} {t_20=}')
106
+
107
+
108
+
109
+ if args.task == 'veri':
110
+ print('| run verification evaluation')
111
+ veri_pairs = []
112
+ with open(args.veri_pair) as ff:
113
+ for fi in ff:
114
+ a,b,c = fi.split()
115
+ a = int(a)
116
+ veri_pairs.append([a,b,c])
117
+
118
+ data = np.load(args.data)
119
+ if 'logit' in args.merge:
120
+ data_latent = data['logit']
121
+ elif 'latent' in args.merge:
122
+ data_latent = data['latent']
123
+ else :
124
+ raise ValueError()
125
+
126
+ data_name = data['name']
127
+ assert len(data_name) == len(data_latent)
128
+ map_name_latent = {}
129
+
130
+ from sklearn.pipeline import make_pipeline
131
+ pipe = []
132
+ if args.scaler == 'mean_var':
133
+ print(f'| apply StandardScaler')
134
+ pipe.append(StandardScaler())
135
+
136
+ if args.compress_method == 'pca':
137
+ n_comp = args.compress_dim
138
+ print(f'| apply PCA with {n_comp=}')
139
+ from sklearn.decomposition import PCA
140
+ pipe.append(PCA(n_components=n_comp))
141
+ if len(pipe) > 0 :
142
+ pipe = make_pipeline(*pipe)
143
+ data_latent_2d = data_latent.reshape(-1, data_latent.shape[-1])
144
+ pipe.fit(data_latent_2d)
145
+ data_latent_2d = pipe.transform(data_latent_2d)
146
+ data_latent = data_latent_2d.reshape(data_latent.shape[0], data_latent.shape[1], -1)
147
+
148
+ for ii in range(len(data_name)):
149
+ map_name_latent[data_name[ii]] = data_latent[ii]
150
+ labels = []
151
+ scores = []
152
+ for lbl, pair_a, pair_b in tqdm(veri_pairs):
153
+ labels.append(lbl)
154
+ pair_a = map_name_latent[pair_a]
155
+ pair_b = map_name_latent[pair_b]
156
+ assert pair_a.ndim == pair_b.ndim == 2
157
+ score = cosine_similarity(pair_a, pair_b)
158
+ if args.merge.startswith('mean'):
159
+ score = np.mean(score)
160
+ elif args.merge.startswith('first'):
161
+ score = score[0, 0]
162
+ else :
163
+ raise ValueError()
164
+ scores.append(score)
165
+ labels = np.array(labels)
166
+ scores = np.array(scores)
167
+ eer, eer_threshold = calculate_eer(labels, scores)
168
+ minDCF, minDCF_threshold = calculate_minDCF(labels, scores)
169
+ print('='*40)
170
+ print(f'| EER = {eer*100:.2f}%\tthreshold = {eer_threshold:.2f}')
171
+ print(f'| minDCF = {minDCF:.2f}\tthreshold = {minDCF_threshold:.2f}')
172
+
173
+
fairseq/examples/wav2vec/xlsr/scripts/gen_audio_embedding.py ADDED
@@ -0,0 +1,222 @@
1
+ """
2
+ Usage:
3
+ This script is used to extract the embedding / logit for the speech classification task.
4
+ 1. Set fdir to your model checkpoint directory
5
+ 2. Run the following command (preferably on a GPU machine to speed up the inference process)
6
+
7
+ CUDA_VISIBLE_DEVICES=0 python3 examples/wav2vec/gen_audio_embedding.py /fsx/data/VoxLingua107/manifest --path ${fdir} \
8
+ --task audio_classification --batch-size 90 --gen-subset test \
9
+ --infer-manifest /fsx/data/VoxLingua107/manifest/test.tsv \
10
+ --infer-xtimes 10 --infer-max-sample-size 160000 --output-path $odir
11
+
12
+ Example:
13
+ Case: LID logit extraction
14
+ fdir='/fsx/androstj/exps/voxlingua_lid_train_all/ckpt_100pct_300m_voxling-act_linear-pool_mean_fast-lr_1e-4-phase_0.1_0.4_0.5-maxupd_100000-ufreq_1-mprob_0.5-fz_0-cr_softmax/0/checkpoints/checkpoint_best.pt'
15
+ python3 examples/wav2vec/gen_audio_embedding.py /fsx/data/VoxLingua107/manifest --path ${fdir} \
16
+ --task audio_classification --batch-size 90 --gen-subset test \
17
+ --infer-manifest /fsx/data/VoxLingua107/manifest/test.tsv \
18
+ --infer-xtimes 10 --infer-max-sample-size 160000 --output-path $odir
19
+
20
+ """
21
+ import torch
22
+ from fairseq import checkpoint_utils, distributed_utils, options, utils
23
+ from fairseq.dataclass.utils import convert_namespace_to_omegaconf
24
+ from fairseq.logging import metrics, progress_bar
25
+ from fairseq import checkpoint_utils, data, options, tasks
26
+ from fairseq.data import FileAudioDataset, AddTargetDataset, Dictionary
27
+ from fairseq.tasks.audio_classification import LabelEncoder
28
+ import ipdb
29
+ import copy
30
+ import sys
31
+ from tqdm import tqdm
32
+ import tempfile
33
+ import numpy as np
34
+ import sklearn
35
+
36
+ def subset_manifest(infer_manifest, veri_pair):
37
+ with open(infer_manifest) as ff, open(veri_pair) as gg, \
38
+ tempfile.NamedTemporaryFile('w', delete=False) as ww:
39
+ fnames = ff.read().strip().split("\n")
40
+ basedir = fnames[0]
41
+ needed_fname = []
42
+ for gi in gg.read().strip().split('\n'):
43
+ _, x1, x2 = gi.split()
44
+ needed_fname.append(x1)
45
+ needed_fname.append(x2)
46
+ needed_fname = set(needed_fname)
47
+
48
+ ww.write(basedir+'\n')
49
+ for ii in range(1, len(fnames)):
50
+ x1,x2 = fnames[ii].split()
51
+ if x1 in needed_fname:
52
+ ww.write(fnames[ii]+'\n')
53
+ print(f'| subset manifest for verification: {ww.name}')
54
+ return ww.name
55
+
56
+ def wrap_target_dataset(infer_manifest, dataset, task):
57
+ label_path = infer_manifest.replace(".tsv", ".label")
58
+ with open(label_path, "r") as f:
59
+ labels = f.read().strip().split("\n")
60
+ assert len(labels) == len(dataset)
61
+ process_label = LabelEncoder(task.target_dictionary)
62
+ dataset = AddTargetDataset(dataset, labels,
63
+ pad=task.target_dictionary.pad(),
64
+ eos=task.target_dictionary.eos(),
65
+ batch_targets=True,
66
+ process_label=process_label,
67
+ add_to_input=False)
68
+ return dataset
69
+
70
+ def resample_data(source, padding_mask, n_sample, max_sample_len):
71
+ # source: BxT
72
+ # padding_mask: BxT
73
+ B = source.shape[0]
74
+ T = source.shape[1]
75
+ sources = []
76
+ padding_masks = []
77
+ seq_len = (~padding_mask).sum(1)
78
+ for jj in range(n_sample):
79
+ new_source = source.new_zeros(B, max_sample_len)
80
+ new_padding_mask = padding_mask.new_zeros(B, max_sample_len)
81
+ for ii in range(B):
82
+ if seq_len[ii] > max_sample_len:
83
+ start = np.random.randint(0, seq_len[ii]-max_sample_len+1)
84
+ end = start + max_sample_len
85
+ else :
86
+ start = 0
87
+ end = seq_len[ii]
88
+ new_source[ii, 0:end-start] = source[ii, start:end]
89
+ new_padding_mask[ii, end-start+1:] = True
90
+ sources.append(new_source)
91
+ padding_masks.append(new_padding_mask)
92
+ return sources, padding_masks
93
+
94
+ def resample_sample(sample, n_sample, max_sample_len):
95
+ new_sources, new_padding_masks = resample_data(sample['net_input']['source'], sample['net_input']['padding_mask'], n_sample, max_sample_len)
96
+ new_samples = []
97
+ for ii in range(n_sample):
98
+ new_sample = copy.deepcopy(sample)
99
+ new_sample['net_input']['source'] = new_sources[ii]
100
+ new_sample['net_input']['padding_mask'] = new_padding_masks[ii]
101
+ new_samples.append(new_sample)
102
+ return new_samples
103
+
104
+ if __name__ == '__main__':
105
+ np.random.seed(123)
106
+ # Parse command-line arguments for generation
107
+ parser = options.get_generation_parser(default_task='audio_classification')
108
+ # parser.add_argument('--infer-merge', type=str, default='mean')
109
+ parser.add_argument('--infer-xtimes', type=int, default=1)
110
+ parser.add_argument('--infer-max-sample-size', type=int, default=5*16000) # 5 secs
111
+ parser.add_argument('--infer-manifest', type=str)
112
+ parser.add_argument('--verification-pair', type=str, required=False,
113
+ help='''
114
+ a file that contains pairs of utts to evaluated if they are from same speaker or not
115
+ format: (following voxceleb)
116
+ 1/0 <wav_pair_a> <wav_pair_b>
117
+ ''')
118
+ parser.add_argument('--output-path', type=str)
119
+ # parser.add_argument('--infer-xtimes', type=int, default=1)
120
+
121
+ args = options.parse_args_and_arch(parser)
122
+ # Setup task
123
+ # task = tasks.setup_task(args)
124
+ use_cuda = not args.cpu
125
+
126
+ # Load model & task
127
+ print('| loading model from {}'.format(args.path))
128
+ arg_overrides = {
129
+ 'data': args.data,
130
+ # 'mask_prob': 0
131
+ #'max_sample_size': sys.maxsize,
132
+ #'min_sample_size': 0,
133
+ }
134
+ state = checkpoint_utils.load_checkpoint_to_cpu(args.path)
135
+ # move to AWS
136
+ state['cfg']['model']['w2v_path'] = state['cfg']['model']['w2v_path'].replace('/checkpoint/arbabu/XLSR2/model_versions/', '/fsx/data/model_versions/').replace('/checkpoint/kushall/final_model_checkpoints/wav2vec2/', '/fsx/data/wav2vec_ckpt/')
137
+ state['cfg']['task']['data'] = state['cfg']['task']['data'].replace('/checkpoint/kushall/data/', '/fsx/data/')
138
+
139
+ models, _model_args, task = checkpoint_utils.load_model_ensemble_and_task([args.path],
140
+ arg_overrides=arg_overrides,
141
+ task=None,
142
+ state=state)
143
+ model = models[0]
144
+ model.eval()
145
+ if use_cuda:
146
+ model.cuda()
147
+
148
+
149
+ # Load dataset
150
+ task.load_dataset(args.gen_subset)
151
+ dataset = task.dataset(args.gen_subset)
152
+ infer_manifest = args.infer_manifest
153
+ # only decode needed utts
154
+ # infer_manifest = subset_manifest(infer_manifest,
155
+ # args.verification_pair)
156
+ infer_dataset = FileAudioDataset(infer_manifest,
157
+ sample_rate=task.cfg.sample_rate,
158
+ max_sample_size=10**10, #task.cfg.max_sample_size,
159
+ min_sample_size=1, #task.cfg.min_sample_size,
160
+ pad=True,
161
+ normalize=task.cfg.normalize)
162
+ # add target (if needed)
163
+ infer_dataset = wrap_target_dataset(infer_manifest, infer_dataset, task)
164
+ itr = task.get_batch_iterator(
165
+ dataset=infer_dataset,
166
+ max_sentences=args.batch_size,
167
+ ).next_epoch_itr(shuffle=False)
168
+
169
+
170
+ # correct = 0
171
+ # total = 0
172
+ list_uttname = []
173
+ list_latent = []
174
+ list_logit = []
175
+ list_target = []
176
+ list_src_len = []
177
+ with torch.no_grad():
178
+ for _, sample in tqdm(enumerate(itr)):
179
+ # resample if needed
180
+ samples = resample_sample(sample, args.infer_xtimes, args.infer_max_sample_size)
181
+ list_uttname.extend(sample['name'])
182
+ list_target.extend(sample['target'][:, 0].cpu().numpy())
183
+ list_src_len.extend((~sample['net_input']['padding_mask']).sum(1).cpu().numpy())
184
+ latents = []
185
+ logits = []
186
+ for sample in samples:
187
+ sample = utils.move_to_cuda(sample) if use_cuda else sample
188
+ try:
189
+ latent = model.forward_latent(**sample['net_input'])
190
+ latents.append(latent.detach().cpu().numpy())
191
+ except:
192
+ latent = None
193
+ logit = model.forward(**sample['net_input'])
194
+ logits.append(logit.detach().cpu().numpy())
195
+
196
+ if len(latents) > 0:
197
+ latents = np.stack(latents, 1) # B,X,D
198
+ logits = np.stack(logits, 1) # B,X,Cls
199
+ list_latent.extend(latents)
200
+ list_logit.extend(logits)
201
+
202
+ # create big npz
203
+ list_uttname = np.array(list_uttname)
204
+ list_latent = np.array(list_latent)
205
+ list_target = np.array(list_target)
206
+ list_logit = np.array(list_logit)
207
+ list_src_len = np.array(list_src_len)
208
+ # save to npz
209
+ output_path = args.output_path
210
+ if (output_path is None):
211
+ output_path = tempfile.NamedTemporaryFile('wb', delete=False).name
212
+
213
+ with open(output_path, 'wb') as ww:
214
+ np.savez(ww, name=list_uttname,
215
+ latent=list_latent,
216
+ target=list_target,
217
+ logit=list_logit,
218
+ src_len=list_src_len)
219
+
220
+ print("="*10 + " REPORT " + "="*10)
221
+ print(f'| latent saved in {output_path}')
222
+ print(f'| {list_uttname.shape=}, {list_latent.shape=}, {list_target.shape=}, {list_logit.shape=}, {list_src_len.shape=}')
fairseq/examples/wmt19/README.md ADDED
@@ -0,0 +1,85 @@
1
+ # WMT 19
2
+
3
+ This page provides pointers to the models of Facebook-FAIR's WMT'19 news translation task submission [(Ng et al., 2019)](https://arxiv.org/abs/1907.06616).
4
+
5
+ ## Pre-trained models
6
+
7
+ Model | Description | Download
8
+ ---|---|---
9
+ `transformer.wmt19.en-de` | En->De Ensemble | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-de.joined-dict.ensemble.tar.gz)
10
+ `transformer.wmt19.de-en` | De->En Ensemble | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt19.de-en.joined-dict.ensemble.tar.gz)
11
+ `transformer.wmt19.en-ru` | En->Ru Ensemble | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-ru.ensemble.tar.gz)
12
+ `transformer.wmt19.ru-en` | Ru->En Ensemble | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt19.ru-en.ensemble.tar.gz)
13
+ `transformer_lm.wmt19.en` | En Language Model | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.en.tar.gz)
14
+ `transformer_lm.wmt19.de` | De Language Model | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.de.tar.gz)
15
+ `transformer_lm.wmt19.ru` | Ru Language Model | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.ru.tar.gz)
16
+
17
+ ## Pre-trained single models before finetuning
18
+
19
+ Model | Description | Download
20
+ ---|---|---
21
+ `transformer.wmt19.en-de` | En->De Single, no finetuning | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-de.ffn8192.tar.gz)
22
+ `transformer.wmt19.de-en` | De->En Single, no finetuning | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt19.de-en.ffn8192.tar.gz)
23
+ `transformer.wmt19.en-ru` | En->Ru Single, no finetuning | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-ru.ffn8192.tar.gz)
24
+ `transformer.wmt19.ru-en` | Ru->En Single, no finetuning | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt19.ru-en.ffn8192.tar.gz)
25
+
26
+ ## Example usage (torch.hub)
27
+
28
+ #### Requirements
29
+
30
+ We require a few additional Python dependencies for preprocessing:
31
+ ```bash
32
+ pip install fastBPE sacremoses
33
+ ```
34
+
35
+ #### Translation
36
+
37
+ ```python
38
+ import torch
39
+
40
+ # English to German translation
41
+ en2de = torch.hub.load('pytorch/fairseq', 'transformer.wmt19.en-de', checkpoint_file='model1.pt:model2.pt:model3.pt:model4.pt',
42
+ tokenizer='moses', bpe='fastbpe')
43
+ en2de.translate("Machine learning is great!") # 'Maschinelles Lernen ist großartig!'
44
+
45
+ # German to English translation
46
+ de2en = torch.hub.load('pytorch/fairseq', 'transformer.wmt19.de-en', checkpoint_file='model1.pt:model2.pt:model3.pt:model4.pt',
47
+ tokenizer='moses', bpe='fastbpe')
48
+ de2en.translate("Maschinelles Lernen ist großartig!") # 'Machine learning is great!'
49
+
50
+ # English to Russian translation
51
+ en2ru = torch.hub.load('pytorch/fairseq', 'transformer.wmt19.en-ru', checkpoint_file='model1.pt:model2.pt:model3.pt:model4.pt',
52
+ tokenizer='moses', bpe='fastbpe')
53
+ en2ru.translate("Machine learning is great!") # 'Машинное обучение - это здорово!'
54
+
55
+ # Russian to English translation
56
+ ru2en = torch.hub.load('pytorch/fairseq', 'transformer.wmt19.ru-en', checkpoint_file='model1.pt:model2.pt:model3.pt:model4.pt',
57
+ tokenizer='moses', bpe='fastbpe')
58
+ ru2en.translate("Машинное обучение - это здорово!") # 'Machine learning is great!'
59
+ ```
60
+
61
+ #### Language Modeling
62
+
63
+ ```python
64
+ # Sample from the English LM
65
+ en_lm = torch.hub.load('pytorch/fairseq', 'transformer_lm.wmt19.en', tokenizer='moses', bpe='fastbpe')
66
+ en_lm.sample("Machine learning is") # 'Machine learning is the future of computing, says Microsoft boss Satya Nadella ...'
67
+
68
+ # Sample from the German LM
69
+ de_lm = torch.hub.load('pytorch/fairseq', 'transformer_lm.wmt19.de', tokenizer='moses', bpe='fastbpe')
70
+ de_lm.sample("Maschinelles lernen ist") # 'Maschinelles lernen ist das A und O (neues-deutschland.de) Die Arbeitsbedingungen für Lehrerinnen und Lehrer sind seit Jahren verbesserungswürdig ...'
71
+
72
+ # Sample from the Russian LM
73
+ ru_lm = torch.hub.load('pytorch/fairseq', 'transformer_lm.wmt19.ru', tokenizer='moses', bpe='fastbpe')
74
+ ru_lm.sample("машинное обучение это") # 'машинное обучение это то, что мы называем "искусственным интеллектом".'
75
+ ```
76
+
77
+ ## Citation
78
+ ```bibtex
79
+ @inproceedings{ng2019facebook,
80
+ title = {Facebook FAIR's WMT19 News Translation Task Submission},
81
+ author = {Ng, Nathan and Yee, Kyra and Baevski, Alexei and Ott, Myle and Auli, Michael and Edunov, Sergey},
82
+ booktitle = {Proc. of WMT},
83
+ year = 2019,
84
+ }
85
+ ```