yangwang825 committed on
Commit
be28212
·
verified ·
1 Parent(s): 09285a2

Upload feature extractor

Browse files
feature_extraction_hubert_spkreg.py ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Feature extractor class for HuBERT
3
+ """
4
+
5
+ from typing import List, Optional, Union
6
+
7
+ import numpy as np
8
+
9
+ from transformers.feature_extraction_utils import BatchFeature
10
+ from transformers.utils import PaddingStrategy, TensorType, logging
11
+ from transformers.feature_extraction_sequence_utils import SequenceFeatureExtractor
12
+
13
+ logger = logging.get_logger(__name__)
14
+
15
+
16
class HubertSpkRegFeatureExtractor(SequenceFeatureExtractor):
    """
    Feature extractor that turns raw mono-channel waveforms into (optionally padded and normalized)
    `input_values`, plus an optional `attention_mask`.

    Args:
        feature_size (`int`, *optional*, defaults to 1):
            Forwarded unchanged to `SequenceFeatureExtractor.__init__`.
        sampling_rate (`int`, *optional*, defaults to 16000):
            Sampling rate the inputs are expected to have. `__call__` raises if a different rate is passed to it.
        padding_value (`float`, *optional*, defaults to 0.0):
            Forwarded to `SequenceFeatureExtractor.__init__`; also written into the padded positions after
            normalization.
        return_attention_mask (`bool`, *optional*, defaults to `False`):
            Stored on the instance as the default for returning an attention mask.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether `__call__` applies zero-mean / unit-variance normalization to every sequence.
    """

    model_input_names = ["input_values", "attention_mask"]

    def __init__(
        self,
        feature_size=1,
        sampling_rate=16000,
        padding_value=0.0,
        return_attention_mask=False,
        do_normalize=True,
        **kwargs,
    ):
        super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
        self.return_attention_mask = return_attention_mask
        self.do_normalize = do_normalize

    @staticmethod
    def zero_mean_unit_var_norm(
        input_values: List[np.ndarray], attention_mask: Optional[List[np.ndarray]], padding_value: float = 0.0
    ) -> List[np.ndarray]:
        """
        Normalize every array in `input_values` to zero mean and unit variance.

        Args:
            input_values (`List[np.ndarray]`):
                One array per sequence.
            attention_mask (`List[np.ndarray]`, *optional*):
                One mask per sequence. When given, the sum of each mask is taken as the number of leading valid
                samples: statistics are computed on that prefix only and the remainder is reset to `padding_value`.
                When `None`, statistics are computed over each whole array.
            padding_value (`float`, *optional*, defaults to 0.0):
                Value written into the positions past the valid prefix.

        Returns:
            `List[np.ndarray]`: newly allocated normalized arrays, in input order.
        """
        if attention_mask is not None:
            attention_mask = np.array(attention_mask, np.int32)
            normed_input_values = []

            for vector, length in zip(input_values, attention_mask.sum(-1)):
                # Statistics come from the valid prefix only, so padding does not bias mean/variance.
                # The 1e-7 term keeps the division finite for constant (zero-variance) signals.
                normed_slice = (vector - vector[:length].mean()) / np.sqrt(vector[:length].var() + 1e-7)
                if length < normed_slice.shape[0]:
                    # Normalization shifted the padded tail away from `padding_value`; restore it.
                    normed_slice[length:] = padding_value

                normed_input_values.append(normed_slice)
        else:
            normed_input_values = [(x - x.mean()) / np.sqrt(x.var() + 1e-7) for x in input_values]

        return normed_input_values

    def __call__(
        self,
        raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
        padding: Union[bool, str, PaddingStrategy] = False,
        max_length: Optional[int] = None,
        truncation: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_attention_mask: Optional[bool] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        sampling_rate: Optional[int] = None,
        **kwargs,
    ) -> BatchFeature:
        """
        Main method to featurize and prepare for the model one or several sequence(s).

        Args:
            raw_speech (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`):
                The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
                values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not
                stereo, i.e. single float per timestep. Every sequence is cast to `float32` before padding.
            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
                Select a strategy to pad the returned sequences (according to the model's padding side and padding
                index) among:

                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
                  sequence is provided).
                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
                  acceptable input length for the model if that argument is not provided.
                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
                  lengths).
            max_length (`int`, *optional*):
                Maximum length of the returned list and optionally padding length (see above).
            truncation (`bool`):
                Activates truncation to cut input sequences longer than *max_length* to *max_length*.
            pad_to_multiple_of (`int`, *optional*):
                If set will pad the sequence to a multiple of the provided value.

                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
            return_attention_mask (`bool`, *optional*):
                Whether to return the attention mask. Forwarded to `self.pad`; if left to the default, will return the
                attention mask according to the specific feature_extractor's default.

                [What are attention masks?](../glossary#attention-mask)

                <Tip>

                NOTE: this tip is inherited from the Wav2Vec2 feature extractor and refers to Wav2Vec2 checkpoints;
                confirm against the configuration of the model this extractor is paired with.

                Wav2Vec2 models that have set `config.feat_extract_norm == "group"`, such as
                [wav2vec2-base](https://huggingface.co/facebook/wav2vec2-base-960h), have **not** been trained using
                `attention_mask`. For such models, `input_values` should simply be padded with 0 and no
                `attention_mask` should be passed.

                For Wav2Vec2 models that have set `config.feat_extract_norm == "layer"`, such as
                [wav2vec2-lv60](https://huggingface.co/facebook/wav2vec2-large-960h-lv60-self), `attention_mask` should
                be passed for batched inference.

                </Tip>

            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors instead of list of python integers. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return Numpy `np.ndarray` objects.
            sampling_rate (`int`, *optional*):
                The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
                `sampling_rate` at the forward call to prevent silent errors.

        Returns:
            [`BatchFeature`]: holds `input_values` and, when `self.pad` produced one, `attention_mask`.

        Raises:
            ValueError: if `sampling_rate` is given and differs from `self.sampling_rate`, or if `raw_speech` is a
                numpy array with more than two dimensions.
        """

        if sampling_rate is None:
            logger.warning(
                "It is strongly recommended to pass the ``sampling_rate`` argument to this function. "
                "Failing to do so can result in silent errors that might be hard to debug."
            )
        elif sampling_rate != self.sampling_rate:
            raise ValueError(
                f"The model corresponding to this feature extractor: {self} was trained using a sampling rate of"
                f" {self.sampling_rate}. Please make sure that the provided `raw_speech` input was sampled with"
                f" {self.sampling_rate} and not {sampling_rate}."
            )

        is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1
        if is_batched_numpy and len(raw_speech.shape) > 2:
            raise ValueError(f"Only mono-channel audio is supported for input to {self}")
        is_batched = is_batched_numpy or (
            isinstance(raw_speech, (list, tuple)) and (isinstance(raw_speech[0], (np.ndarray, tuple, list)))
        )

        # always return batch
        if not is_batched:
            raw_speech = [raw_speech]

        # Cast every waveform to float32 up front. Previously only float64 arrays and plain Python
        # sequences were converted (after padding), so integer PCM input (e.g. int16) slipped through and
        # came out as float64 after normalization, or as raw integers when `do_normalize` is False.
        raw_speech = [np.asarray(speech, dtype=np.float32) for speech in raw_speech]

        # convert into correct format for padding
        encoded_inputs = BatchFeature({"input_values": raw_speech})

        padded_inputs = self.pad(
            encoded_inputs,
            padding=padding,
            max_length=max_length,
            truncation=truncation,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=return_attention_mask,
        )

        # convert input values to correct format (kept as a safety net in case `pad` hands back
        # non-array elements or float64 data); dtypes are compared by equality, not identity
        input_values = padded_inputs["input_values"]
        if not isinstance(input_values[0], np.ndarray):
            padded_inputs["input_values"] = [np.asarray(array, dtype=np.float32) for array in input_values]
        elif isinstance(input_values, np.ndarray):
            if input_values.dtype == np.float64:
                padded_inputs["input_values"] = input_values.astype(np.float32)
        elif input_values[0].dtype == np.float64:
            padded_inputs["input_values"] = [array.astype(np.float32) for array in input_values]

        # convert attention_mask to correct format
        attention_mask = padded_inputs.get("attention_mask")
        if attention_mask is not None:
            padded_inputs["attention_mask"] = [np.asarray(array, dtype=np.int32) for array in attention_mask]

        # zero-mean and unit-variance normalization
        if self.do_normalize:
            # Without padding every sample is valid, so the mask is ignored and whole-array statistics are used.
            attention_mask = (
                attention_mask
                if self._get_padding_strategies(padding, max_length=max_length) is not PaddingStrategy.DO_NOT_PAD
                else None
            )
            padded_inputs["input_values"] = self.zero_mean_unit_var_norm(
                padded_inputs["input_values"], attention_mask=attention_mask, padding_value=self.padding_value
            )

        if return_tensors is not None:
            padded_inputs = padded_inputs.convert_to_tensors(return_tensors)

        return padded_inputs
preprocessor_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoFeatureExtractor": "feature_extraction_hubert_spkreg.HubertSpkRegFeatureExtractor"
4
+ },
5
+ "do_normalize": true,
6
+ "feature_extractor_type": "HubertSpkRegFeatureExtractor",
7
+ "feature_size": 1,
8
+ "padding_side": "right",
9
+ "padding_value": 0,
10
+ "return_attention_mask": true,
11
+ "sampling_rate": 16000
12
+ }