yangwang825 committed on
Commit
d505d4f
·
verified ·
1 Parent(s): 0a23095

Upload feature extractor

Browse files
feature_extraction_wav2vec2_spkreg.py ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Feature extractor class for Wav2Vec2
3
+ """
4
+
5
+ from typing import List, Optional, Union
6
+
7
+ import numpy as np
8
+
9
+ from transformers.feature_extraction_sequence_utils import SequenceFeatureExtractor
10
+ from transformers.feature_extraction_utils import BatchFeature
11
+ from transformers.utils import PaddingStrategy, TensorType, logging
12
+
13
+ logger = logging.get_logger(__name__)
14
+
15
+
16
class Wav2Vec2SpkRegFeatureExtractor(SequenceFeatureExtractor):
    r"""
    Constructs a Wav2Vec2 feature extractor.

    This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
    most of the main methods. Users should refer to this superclass for more information regarding those methods.

    Args:
        feature_size (`int`, *optional*, defaults to 1):
            The feature dimension of the extracted features.
        sampling_rate (`int`, *optional*, defaults to 16000):
            The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
        padding_value (`float`, *optional*, defaults to 0.0):
            The value that is used to fill the padding values.
        return_attention_mask (`bool`, *optional*, defaults to `False`):
            Whether or not [`~Wav2Vec2SpkRegFeatureExtractor.__call__`] should return `attention_mask`.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether or not to zero-mean unit-variance normalize the input. Normalizing can help to significantly
            improve the performance for some models, *e.g.*,
            [wav2vec2-lv60](https://huggingface.co/models?search=lv60).

    <Tip>

    Wav2Vec2 models that have set `config.feat_extract_norm == "group"`, such as
    [wav2vec2-base](https://huggingface.co/facebook/wav2vec2-base-960h), have **not** been trained using
    `attention_mask`. For such models, `input_values` should simply be padded with 0 and no `attention_mask`
    should be passed.

    For Wav2Vec2 models that have set `config.feat_extract_norm == "layer"`, such as
    [wav2vec2-lv60](https://huggingface.co/facebook/wav2vec2-large-960h-lv60-self), `attention_mask` should be
    passed for batched inference.

    </Tip>"""

    model_input_names = ["input_values", "attention_mask"]

    def __init__(
        self,
        feature_size=1,
        sampling_rate=16000,
        padding_value=0.0,
        return_attention_mask=False,
        do_normalize=True,
        **kwargs,
    ):
        super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
        self.return_attention_mask = return_attention_mask
        self.do_normalize = do_normalize

    @staticmethod
    def zero_mean_unit_var_norm(
        input_values: List[np.ndarray],
        attention_mask: Optional[List[np.ndarray]],
        padding_value: float = 0.0,
    ) -> List[np.ndarray]:
        """
        Normalize every array in the list to zero mean and unit variance.

        Args:
            input_values (`List[np.ndarray]`):
                The sequences to normalize.
            attention_mask (`List[np.ndarray]`, *optional*):
                One mask per sequence. When given, the sum of each mask is taken as the number of leading
                valid samples: statistics are computed over that prefix only and everything after it is
                overwritten with `padding_value`. When `None`, statistics are computed over each whole array.
            padding_value (`float`, *optional*, defaults to 0.0):
                The value written into the padded tail of each normalized sequence.

        Returns:
            `List[np.ndarray]`: The normalized sequences, in input order.
        """
        if attention_mask is None:
            return [(x - x.mean()) / np.sqrt(x.var() + 1e-7) for x in input_values]

        # Summing each mask row gives the count of valid (non-padded) leading samples.
        valid_lengths = np.array(attention_mask, np.int32).sum(-1)

        normed_input_values = []
        for vector, length in zip(input_values, valid_lengths):
            valid = vector[:length]
            # The 1e-7 term keeps the division finite for constant (zero-variance) signals.
            normed_slice = (vector - valid.mean()) / np.sqrt(valid.var() + 1e-7)
            if length < normed_slice.shape[0]:
                # Normalization shifted the padded tail away from the padding value; restore it.
                normed_slice[length:] = padding_value
            normed_input_values.append(normed_slice)

        return normed_input_values

    def __call__(
        self,
        raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
        padding: Union[bool, str, PaddingStrategy] = False,
        max_length: Optional[int] = None,
        truncation: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_attention_mask: Optional[bool] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        sampling_rate: Optional[int] = None,
        **kwargs,
    ) -> BatchFeature:
        """
        Main method to featurize and prepare for the model one or several sequence(s).

        Args:
            raw_speech (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`):
                The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
                values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not
                stereo, i.e. single float per timestep.
            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
                Select a strategy to pad the returned sequences (according to the model's padding side and padding
                index) among:

                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
                  sequence if provided).
                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
                  acceptable input length for the model if that argument is not provided.
                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
                  lengths).
            max_length (`int`, *optional*):
                Maximum length of the returned list and optionally padding length (see above).
            truncation (`bool`):
                Activates truncation to cut input sequences longer than *max_length* to *max_length*.
            pad_to_multiple_of (`int`, *optional*):
                If set will pad the sequence to a multiple of the provided value.

                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
            return_attention_mask (`bool`, *optional*):
                Whether to return the attention mask. If left to the default, will return the attention mask according
                to the specific feature_extractor's default.

                [What are attention masks?](../glossary#attention-mask)

                <Tip>

                Wav2Vec2 models that have set `config.feat_extract_norm == "group"`, such as
                [wav2vec2-base](https://huggingface.co/facebook/wav2vec2-base-960h), have **not** been trained using
                `attention_mask`. For such models, `input_values` should simply be padded with 0 and no
                `attention_mask` should be passed.

                For Wav2Vec2 models that have set `config.feat_extract_norm == "layer"`, such as
                [wav2vec2-lv60](https://huggingface.co/facebook/wav2vec2-large-960h-lv60-self), `attention_mask` should
                be passed for batched inference.

                </Tip>

            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors instead of list of python integers. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return Numpy `np.ndarray` objects.
            sampling_rate (`int`, *optional*):
                The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
                `sampling_rate` at the forward call to prevent silent errors.
            kwargs:
                Accepted for call compatibility; not used by this method.

        Returns:
            [`BatchFeature`]: A batch holding `input_values` and, when the padding step produced one,
            `attention_mask`.

        Raises:
            ValueError: If `sampling_rate` is given and differs from the extractor's `sampling_rate`, or if
                `raw_speech` is a numpy array with more than two dimensions.
        """

        if sampling_rate is not None:
            if sampling_rate != self.sampling_rate:
                raise ValueError(
                    f"The model corresponding to this feature extractor: {self} was trained using a sampling rate of"
                    f" {self.sampling_rate}. Please make sure that the provided `raw_speech` input was sampled with"
                    f" {self.sampling_rate} and not {sampling_rate}."
                )
        else:
            logger.warning(
                "It is strongly recommended to pass the ``sampling_rate`` argument to this function. "
                "Failing to do so can result in silent errors that might be hard to debug."
            )

        is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1
        if is_batched_numpy and len(raw_speech.shape) > 2:
            raise ValueError(f"Only mono-channel audio is supported for input to {self}")
        is_batched = is_batched_numpy or (
            isinstance(raw_speech, (list, tuple)) and (isinstance(raw_speech[0], (np.ndarray, tuple, list)))
        )

        # always return batch
        if not is_batched:
            raw_speech = [raw_speech]

        # convert into correct format for padding
        encoded_inputs = BatchFeature({"input_values": raw_speech})

        padded_inputs = self.pad(
            encoded_inputs,
            padding=padding,
            max_length=max_length,
            truncation=truncation,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=return_attention_mask,
        )

        # convert input values to correct format (float32 numpy arrays)
        input_values = padded_inputs["input_values"]
        # Compare dtypes by equality: identity is an implementation detail of NumPy's dtype singletons
        # and misses equivalent float64 dtype objects.
        float64 = np.dtype(np.float64)
        if not isinstance(input_values[0], np.ndarray):
            padded_inputs["input_values"] = [np.asarray(array, dtype=np.float32) for array in input_values]
        elif not isinstance(input_values, np.ndarray) and input_values[0].dtype == float64:
            padded_inputs["input_values"] = [array.astype(np.float32) for array in input_values]
        elif isinstance(input_values, np.ndarray) and input_values.dtype == float64:
            padded_inputs["input_values"] = input_values.astype(np.float32)

        # convert attention_mask to correct format
        attention_mask = padded_inputs.get("attention_mask")
        if attention_mask is not None:
            padded_inputs["attention_mask"] = [np.asarray(array, dtype=np.int32) for array in attention_mask]

        # zero-mean and unit-variance normalization
        if self.do_normalize:
            # Without padding there is no padded tail to exclude, so normalize over each full sequence.
            attention_mask = (
                attention_mask
                if self._get_padding_strategies(padding, max_length=max_length) is not PaddingStrategy.DO_NOT_PAD
                else None
            )
            padded_inputs["input_values"] = self.zero_mean_unit_var_norm(
                padded_inputs["input_values"], attention_mask=attention_mask, padding_value=self.padding_value
            )

        if return_tensors is not None:
            padded_inputs = padded_inputs.convert_to_tensors(return_tensors)

        return padded_inputs
preprocessor_config.json CHANGED
@@ -1,4 +1,7 @@
1
  {
 
 
 
2
  "do_normalize": true,
3
  "feature_extractor_type": "Wav2Vec2SpkRegFeatureExtractor",
4
  "feature_size": 1,
 
1
  {
2
+ "auto_map": {
3
+ "AutoFeatureExtractor": "feature_extraction_wav2vec2_spkreg.Wav2Vec2SpkRegFeatureExtractor"
4
+ },
5
  "do_normalize": true,
6
  "feature_extractor_type": "Wav2Vec2SpkRegFeatureExtractor",
7
  "feature_size": 1,