Commit
·
6977c7a
verified
·
0
Parent(s):
Duplicate from facebook/w2v-bert-2.0
Browse filesCo-authored-by: Vaibhav Srivastav <[email protected]>
- .gitattributes +35 -0
- README.md +184 -0
- config.json +81 -0
- conformer_shaw.pt +3 -0
- model.safetensors +3 -0
- preprocessor_config.json +11 -0
.gitattributes
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
@@ -0,0 +1,184 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
license: mit
|
3 |
+
language:
|
4 |
+
- af
|
5 |
+
- am
|
6 |
+
- ar
|
7 |
+
- as
|
8 |
+
- az
|
9 |
+
- be
|
10 |
+
- bn
|
11 |
+
- bs
|
12 |
+
- bg
|
13 |
+
- ca
|
14 |
+
- cs
|
15 |
+
- zh
|
16 |
+
- cy
|
17 |
+
- da
|
18 |
+
- de
|
19 |
+
- el
|
20 |
+
- en
|
21 |
+
- et
|
22 |
+
- fi
|
23 |
+
- fr
|
24 |
+
- or
|
25 |
+
- om
|
26 |
+
- ga
|
27 |
+
- gl
|
28 |
+
- gu
|
29 |
+
- ha
|
30 |
+
- he
|
31 |
+
- hi
|
32 |
+
- hr
|
33 |
+
- hu
|
34 |
+
- hy
|
35 |
+
- ig
|
36 |
+
- id
|
37 |
+
- is
|
38 |
+
- it
|
39 |
+
- jv
|
40 |
+
- ja
|
41 |
+
- kn
|
42 |
+
- ka
|
43 |
+
- kk
|
44 |
+
- mn
|
45 |
+
- km
|
46 |
+
- ky
|
47 |
+
- ko
|
48 |
+
- lo
|
49 |
+
- ln
|
50 |
+
- lt
|
51 |
+
- lb
|
52 |
+
- lg
|
53 |
+
- lv
|
54 |
+
- ml
|
55 |
+
- mr
|
56 |
+
- mk
|
57 |
+
- mt
|
58 |
+
- mi
|
59 |
+
- my
|
60 |
+
- nl
|
61 |
+
- nb
|
62 |
+
- ne
|
63 |
+
- ny
|
64 |
+
- oc
|
65 |
+
- pa
|
66 |
+
- ps
|
67 |
+
- fa
|
68 |
+
- pl
|
69 |
+
- pt
|
70 |
+
- ro
|
71 |
+
- ru
|
72 |
+
- sk
|
73 |
+
- sl
|
74 |
+
- sn
|
75 |
+
- sd
|
76 |
+
- so
|
77 |
+
- es
|
78 |
+
- sr
|
79 |
+
- sv
|
80 |
+
- sw
|
81 |
+
- ta
|
82 |
+
- te
|
83 |
+
- tg
|
84 |
+
- tl
|
85 |
+
- th
|
86 |
+
- tr
|
87 |
+
- uk
|
88 |
+
- ur
|
89 |
+
- uz
|
90 |
+
- vi
|
91 |
+
- wo
|
92 |
+
- xh
|
93 |
+
- yo
|
94 |
+
- ms
|
95 |
+
- zu
|
96 |
+
- ary
|
97 |
+
- arz
|
98 |
+
- yue
|
99 |
+
- kea
|
100 |
+
inference: false
|
101 |
+
---
|
102 |
+
# W2v-BERT 2.0 speech encoder
|
103 |
+
|
104 |
+
We are open-sourcing our Conformer-based [W2v-BERT 2.0 speech encoder](#w2v-bert-20-speech-encoder) as described in Section 3.2.1 of the [paper](https://arxiv.org/pdf/2312.05187.pdf), which is at the core of our Seamless models.
|
105 |
+
|
106 |
+
This model was pre-trained on 4.5M hours of unlabeled audio data covering more than 143 languages. It requires finetuning to be used for downstream tasks such as Automatic Speech Recognition (ASR), or Audio Classification.
|
107 |
+
|
108 |
+
| Model Name | #params | checkpoint |
|
109 |
+
| ----------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
110 |
+
| W2v-BERT 2.0 | 600M | [checkpoint](https://huggingface.co/reach-vb/conformer-shaw/resolve/main/conformer_shaw.pt) |
|
111 |
+
|
112 |
+
**This model and its training are supported by 🤗 Transformers, more on it in the [docs](https://huggingface.co/docs/transformers/main/en/model_doc/wav2vec2-bert).**
|
113 |
+
|
114 |
+
|
115 |
+
# 🤗 Transformers usage
|
116 |
+
|
117 |
+
This is a bare checkpoint without any modeling head, and thus requires finetuning to be used for downstream tasks such as ASR. You can however use it to extract audio embeddings from the top layer with this code snippet:
|
118 |
+
|
119 |
+
```python
|
120 |
+
from transformers import AutoFeatureExtractor, Wav2Vec2BertModel
|
121 |
+
import torch
|
122 |
+
from datasets import load_dataset
|
123 |
+
|
124 |
+
dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
|
125 |
+
dataset = dataset.sort("id")
|
126 |
+
sampling_rate = dataset.features["audio"].sampling_rate
|
127 |
+
|
128 |
+
processor = AutoFeatureExtractor.from_pretrained("facebook/w2v-bert-2.0")
|
129 |
+
model = Wav2Vec2BertModel.from_pretrained("facebook/w2v-bert-2.0")
|
130 |
+
|
131 |
+
# audio file is decoded on the fly
|
132 |
+
inputs = processor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")
|
133 |
+
with torch.no_grad():
|
134 |
+
outputs = model(**inputs)
|
135 |
+
```
|
136 |
+
|
137 |
+
To learn more about the model use, refer to the following resources:
|
138 |
+
- [its docs](https://huggingface.co/docs/transformers/main/en/model_doc/wav2vec2-bert)
|
139 |
+
- [a blog post showing how to fine-tune it on Mongolian ASR](https://huggingface.co/blog/fine-tune-w2v2-bert)
|
140 |
+
- [a training script example](https://github.com/huggingface/transformers/blob/main/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py)
|
141 |
+
|
142 |
+
|
143 |
+
# Seamless Communication usage
|
144 |
+
|
145 |
+
This model can be used in [Seamless Communication](https://github.com/facebookresearch/seamless_communication), where it was released.
|
146 |
+
|
147 |
+
Here's how to make a forward pass through the voice encoder, after having completed the [installation steps](https://github.com/facebookresearch/seamless_communication?tab=readme-ov-file#installation):
|
148 |
+
|
149 |
+
```python
|
150 |
+
import torch
|
151 |
+
|
152 |
+
from fairseq2.data import Collater
from fairseq2.data.audio import AudioDecoder, WaveformToFbankConverter
|
153 |
+
from fairseq2.memory import MemoryBlock
|
154 |
+
from fairseq2.nn.padding import get_seqs_and_padding_mask
|
155 |
+
from pathlib import Path
|
156 |
+
from seamless_communication.models.conformer_shaw import load_conformer_shaw_model
|
157 |
+
|
158 |
+
|
159 |
+
audio_wav_path, device, dtype = ...
|
160 |
+
audio_decoder = AudioDecoder(dtype=torch.float32, device=device)
|
161 |
+
fbank_converter = WaveformToFbankConverter(
|
162 |
+
num_mel_bins=80,
|
163 |
+
waveform_scale=2**15,
|
164 |
+
channel_last=True,
|
165 |
+
standardize=True,
|
166 |
+
device=device,
|
167 |
+
dtype=dtype,
|
168 |
+
)
|
169 |
+
collater = Collater(pad_value=1)
|
170 |
+
|
171 |
+
model = load_conformer_shaw_model("conformer_shaw", device=device, dtype=dtype)
|
172 |
+
model.eval()
|
173 |
+
|
174 |
+
with Path(audio_wav_path).open("rb") as fb:
|
175 |
+
block = MemoryBlock(fb.read())
|
176 |
+
|
177 |
+
decoded_audio = audio_decoder(block)
|
178 |
+
src = collater(fbank_converter(decoded_audio))["fbank"]
|
179 |
+
seqs, padding_mask = get_seqs_and_padding_mask(src)
|
180 |
+
|
181 |
+
with torch.inference_mode():
|
182 |
+
seqs, padding_mask = model.encoder_frontend(seqs, padding_mask)
|
183 |
+
seqs, padding_mask = model.encoder(seqs, padding_mask)
|
184 |
+
```
|
config.json
ADDED
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"activation_dropout": 0.0,
|
3 |
+
"adapter_act": "relu",
|
4 |
+
"adapter_kernel_size": 3,
|
5 |
+
"adapter_stride": 2,
|
6 |
+
"add_adapter": false,
|
7 |
+
"apply_spec_augment": false,
|
8 |
+
"architectures": [
|
9 |
+
"Wav2Vec2BertModel"
|
10 |
+
],
|
11 |
+
"attention_dropout": 0.0,
|
12 |
+
"bos_token_id": 1,
|
13 |
+
"classifier_proj_size": 768,
|
14 |
+
"codevector_dim": 768,
|
15 |
+
"conformer_conv_dropout": 0.1,
|
16 |
+
"contrastive_logits_temperature": 0.1,
|
17 |
+
"conv_depthwise_kernel_size": 31,
|
18 |
+
"ctc_loss_reduction": "sum",
|
19 |
+
"ctc_zero_infinity": false,
|
20 |
+
"diversity_loss_weight": 0.1,
|
21 |
+
"eos_token_id": 2,
|
22 |
+
"feat_proj_dropout": 0.0,
|
23 |
+
"feat_quantizer_dropout": 0.0,
|
24 |
+
"feature_projection_input_dim": 160,
|
25 |
+
"final_dropout": 0.1,
|
26 |
+
"hidden_act": "swish",
|
27 |
+
"hidden_dropout": 0.0,
|
28 |
+
"hidden_size": 1024,
|
29 |
+
"initializer_range": 0.02,
|
30 |
+
"intermediate_size": 4096,
|
31 |
+
"layer_norm_eps": 1e-05,
|
32 |
+
"layerdrop": 0.1,
|
33 |
+
"left_max_position_embeddings": 64,
|
34 |
+
"mask_feature_length": 10,
|
35 |
+
"mask_feature_min_masks": 0,
|
36 |
+
"mask_feature_prob": 0.0,
|
37 |
+
"mask_time_length": 10,
|
38 |
+
"mask_time_min_masks": 2,
|
39 |
+
"mask_time_prob": 0.05,
|
40 |
+
"max_source_positions": 5000,
|
41 |
+
"model_type": "wav2vec2-bert",
|
42 |
+
"num_adapter_layers": 1,
|
43 |
+
"num_attention_heads": 16,
|
44 |
+
"num_codevector_groups": 2,
|
45 |
+
"num_codevectors_per_group": 320,
|
46 |
+
"num_hidden_layers": 24,
|
47 |
+
"num_negatives": 100,
|
48 |
+
"output_hidden_size": 1024,
|
49 |
+
"pad_token_id": 0,
|
50 |
+
"position_embeddings_type": "relative_key",
|
51 |
+
"proj_codevector_dim": 768,
|
52 |
+
"right_max_position_embeddings": 8,
|
53 |
+
"rotary_embedding_base": 10000,
|
54 |
+
"tdnn_dilation": [
|
55 |
+
1,
|
56 |
+
2,
|
57 |
+
3,
|
58 |
+
1,
|
59 |
+
1
|
60 |
+
],
|
61 |
+
"tdnn_dim": [
|
62 |
+
512,
|
63 |
+
512,
|
64 |
+
512,
|
65 |
+
512,
|
66 |
+
1500
|
67 |
+
],
|
68 |
+
"tdnn_kernel": [
|
69 |
+
5,
|
70 |
+
3,
|
71 |
+
3,
|
72 |
+
1,
|
73 |
+
1
|
74 |
+
],
|
75 |
+
"torch_dtype": "float32",
|
76 |
+
"transformers_version": "4.37.0.dev0",
|
77 |
+
"use_intermediate_ffn_before_adapter": false,
|
78 |
+
"use_weighted_layer_sum": false,
|
79 |
+
"vocab_size": null,
|
80 |
+
"xvector_output_dim": 512
|
81 |
+
}
|
conformer_shaw.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8310b4270a5b499e92e20c859892dbf7429619347debb5f8feba79eb88f99b4f
|
3 |
+
size 2329131983
|
model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:eb890c9660ed6e3414b6812e27257b8ce5454365d5490d3ad581ea60b93be043
|
3 |
+
size 2322063736
|
preprocessor_config.json
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"feature_extractor_type": "SeamlessM4TFeatureExtractor",
|
3 |
+
"feature_size": 80,
|
4 |
+
"num_mel_bins": 80,
|
5 |
+
"padding_side": "right",
|
6 |
+
"padding_value": 1,
|
7 |
+
"processor_class": "Wav2Vec2BertProcessor",
|
8 |
+
"return_attention_mask": true,
|
9 |
+
"sampling_rate": 16000,
|
10 |
+
"stride": 2
|
11 |
+
}
|