RedbeardNZ reach-vb HF Staff committed on
Commit
6977c7a
·
verified ·
0 Parent(s):

Duplicate from facebook/w2v-bert-2.0

Browse files

Co-authored-by: Vaibhav Srivastav <[email protected]>

Files changed (6) hide show
  1. .gitattributes +35 -0
  2. README.md +184 -0
  3. config.json +81 -0
  4. conformer_shaw.pt +3 -0
  5. model.safetensors +3 -0
  6. preprocessor_config.json +11 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ language:
4
+ - af
5
+ - am
6
+ - ar
7
+ - as
8
+ - az
9
+ - be
10
+ - bn
11
+ - bs
12
+ - bg
13
+ - ca
14
+ - cs
15
+ - zh
16
+ - cy
17
+ - da
18
+ - de
19
+ - el
20
+ - en
21
+ - et
22
+ - fi
23
+ - fr
24
+ - or
25
+ - om
26
+ - ga
27
+ - gl
28
+ - gu
29
+ - ha
30
+ - he
31
+ - hi
32
+ - hr
33
+ - hu
34
+ - hy
35
+ - ig
36
+ - id
37
+ - is
38
+ - it
39
+ - jv
40
+ - ja
41
+ - kn
42
+ - ka
43
+ - kk
44
+ - mn
45
+ - km
46
+ - ky
47
+ - ko
48
+ - lo
49
+ - ln
50
+ - lt
51
+ - lb
52
+ - lg
53
+ - lv
54
+ - ml
55
+ - mr
56
+ - mk
57
+ - mt
58
+ - mi
59
+ - my
60
+ - nl
61
+ - nb
62
+ - ne
63
+ - ny
64
+ - oc
65
+ - pa
66
+ - ps
67
+ - fa
68
+ - pl
69
+ - pt
70
+ - ro
71
+ - ru
72
+ - sk
73
+ - sl
74
+ - sn
75
+ - sd
76
+ - so
77
+ - es
78
+ - sr
79
+ - sv
80
+ - sw
81
+ - ta
82
+ - te
83
+ - tg
84
+ - tl
85
+ - th
86
+ - tr
87
+ - uk
88
+ - ur
89
+ - uz
90
+ - vi
91
+ - wo
92
+ - xh
93
+ - yo
94
+ - ms
95
+ - zu
96
+ - ary
97
+ - arz
98
+ - yue
99
+ - kea
100
+ inference: false
101
+ ---
102
+ # W2v-BERT 2.0 speech encoder
103
+
104
+ We are open-sourcing our Conformer-based [W2v-BERT 2.0 speech encoder](#w2v-bert-20-speech-encoder) as described in Section 3.2.1 of the [paper](https://arxiv.org/pdf/2312.05187.pdf), which is at the core of our Seamless models.
105
+
106
+ This model was pre-trained on 4.5M hours of unlabeled audio data covering more than 143 languages. It requires finetuning to be used for downstream tasks such as Automatic Speech Recognition (ASR), or Audio Classification.
107
+
108
+ | Model Name | #params | checkpoint |
109
+ | ----------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
110
+ | W2v-BERT 2.0 | 600M | [checkpoint](https://huggingface.co/reach-vb/conformer-shaw/resolve/main/conformer_shaw.pt) |
111
+
112
+ **This model and its training are supported by 🤗 Transformers, more on it in the [docs](https://huggingface.co/docs/transformers/main/en/model_doc/wav2vec2-bert).**
113
+
114
+
115
+ # 🤗 Transformers usage
116
+
117
+ This is a bare checkpoint without any modeling head, and thus requires finetuning to be used for downstream tasks such as ASR. You can however use it to extract audio embeddings from the top layer with this code snippet:
118
+
119
+ ```python
120
+ from transformers import AutoFeatureExtractor, Wav2Vec2BertModel
121
+ import torch
122
+ from datasets import load_dataset
123
+
124
+ dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
125
+ dataset = dataset.sort("id")
126
+ sampling_rate = dataset.features["audio"].sampling_rate
127
+
128
+ processor = AutoFeatureExtractor.from_pretrained("facebook/w2v-bert-2.0")
129
+ model = Wav2Vec2BertModel.from_pretrained("facebook/w2v-bert-2.0")
130
+
131
+ # audio file is decoded on the fly
132
+ inputs = processor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")
133
+ with torch.no_grad():
134
+     outputs = model(**inputs)
135
+ ```
136
+
137
+ To learn more about the model use, refer to the following resources:
138
+ - [its docs](https://huggingface.co/docs/transformers/main/en/model_doc/wav2vec2-bert)
139
+ - [a blog post showing how to fine-tune it on Mongolian ASR](https://huggingface.co/blog/fine-tune-w2v2-bert)
140
+ - [a training script example](https://github.com/huggingface/transformers/blob/main/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py)
141
+
142
+
143
+ # Seamless Communication usage
144
+
145
+ This model can be used in [Seamless Communication](https://github.com/facebookresearch/seamless_communication), where it was released.
146
+
147
+ Here's how to make a forward pass through the voice encoder, after having completed the [installation steps](https://github.com/facebookresearch/seamless_communication?tab=readme-ov-file#installation):
148
+
149
+ ```python
150
+ import torch
151
+
152
+ from fairseq2.data import Collater
+ from fairseq2.data.audio import AudioDecoder, WaveformToFbankConverter
153
+ from fairseq2.memory import MemoryBlock
154
+ from fairseq2.nn.padding import get_seqs_and_padding_mask
155
+ from pathlib import Path
156
+ from seamless_communication.models.conformer_shaw import load_conformer_shaw_model
157
+
158
+
159
+ audio_wav_path, device, dtype = ...
160
+ audio_decoder = AudioDecoder(dtype=torch.float32, device=device)
161
+ fbank_converter = WaveformToFbankConverter(
162
+ num_mel_bins=80,
163
+ waveform_scale=2**15,
164
+ channel_last=True,
165
+ standardize=True,
166
+ device=device,
167
+ dtype=dtype,
168
+ )
169
+ collater = Collater(pad_value=1)
170
+
171
+ model = load_conformer_shaw_model("conformer_shaw", device=device, dtype=dtype)
172
+ model.eval()
173
+
174
+ with Path(audio_wav_path).open("rb") as fb:
175
+     block = MemoryBlock(fb.read())
176
+
177
+ decoded_audio = audio_decoder(block)
178
+ src = collater(fbank_converter(decoded_audio))["fbank"]
179
+ seqs, padding_mask = get_seqs_and_padding_mask(src)
180
+
181
+ with torch.inference_mode():
182
+     seqs, padding_mask = model.encoder_frontend(seqs, padding_mask)
183
+     seqs, padding_mask = model.encoder(seqs, padding_mask)
184
+ ```
config.json ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.0,
3
+ "adapter_act": "relu",
4
+ "adapter_kernel_size": 3,
5
+ "adapter_stride": 2,
6
+ "add_adapter": false,
7
+ "apply_spec_augment": false,
8
+ "architectures": [
9
+ "Wav2Vec2BertModel"
10
+ ],
11
+ "attention_dropout": 0.0,
12
+ "bos_token_id": 1,
13
+ "classifier_proj_size": 768,
14
+ "codevector_dim": 768,
15
+ "conformer_conv_dropout": 0.1,
16
+ "contrastive_logits_temperature": 0.1,
17
+ "conv_depthwise_kernel_size": 31,
18
+ "ctc_loss_reduction": "sum",
19
+ "ctc_zero_infinity": false,
20
+ "diversity_loss_weight": 0.1,
21
+ "eos_token_id": 2,
22
+ "feat_proj_dropout": 0.0,
23
+ "feat_quantizer_dropout": 0.0,
24
+ "feature_projection_input_dim": 160,
25
+ "final_dropout": 0.1,
26
+ "hidden_act": "swish",
27
+ "hidden_dropout": 0.0,
28
+ "hidden_size": 1024,
29
+ "initializer_range": 0.02,
30
+ "intermediate_size": 4096,
31
+ "layer_norm_eps": 1e-05,
32
+ "layerdrop": 0.1,
33
+ "left_max_position_embeddings": 64,
34
+ "mask_feature_length": 10,
35
+ "mask_feature_min_masks": 0,
36
+ "mask_feature_prob": 0.0,
37
+ "mask_time_length": 10,
38
+ "mask_time_min_masks": 2,
39
+ "mask_time_prob": 0.05,
40
+ "max_source_positions": 5000,
41
+ "model_type": "wav2vec2-bert",
42
+ "num_adapter_layers": 1,
43
+ "num_attention_heads": 16,
44
+ "num_codevector_groups": 2,
45
+ "num_codevectors_per_group": 320,
46
+ "num_hidden_layers": 24,
47
+ "num_negatives": 100,
48
+ "output_hidden_size": 1024,
49
+ "pad_token_id": 0,
50
+ "position_embeddings_type": "relative_key",
51
+ "proj_codevector_dim": 768,
52
+ "right_max_position_embeddings": 8,
53
+ "rotary_embedding_base": 10000,
54
+ "tdnn_dilation": [
55
+ 1,
56
+ 2,
57
+ 3,
58
+ 1,
59
+ 1
60
+ ],
61
+ "tdnn_dim": [
62
+ 512,
63
+ 512,
64
+ 512,
65
+ 512,
66
+ 1500
67
+ ],
68
+ "tdnn_kernel": [
69
+ 5,
70
+ 3,
71
+ 3,
72
+ 1,
73
+ 1
74
+ ],
75
+ "torch_dtype": "float32",
76
+ "transformers_version": "4.37.0.dev0",
77
+ "use_intermediate_ffn_before_adapter": false,
78
+ "use_weighted_layer_sum": false,
79
+ "vocab_size": null,
80
+ "xvector_output_dim": 512
81
+ }
conformer_shaw.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8310b4270a5b499e92e20c859892dbf7429619347debb5f8feba79eb88f99b4f
3
+ size 2329131983
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb890c9660ed6e3414b6812e27257b8ce5454365d5490d3ad581ea60b93be043
3
+ size 2322063736
preprocessor_config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "feature_extractor_type": "SeamlessM4TFeatureExtractor",
3
+ "feature_size": 80,
4
+ "num_mel_bins": 80,
5
+ "padding_side": "right",
6
+ "padding_value": 1,
7
+ "processor_class": "Wav2Vec2BertProcessor",
8
+ "return_attention_mask": true,
9
+ "sampling_rate": 16000,
10
+ "stride": 2
11
+ }