import math

import numpy as np
from transformers import WhisperProcessor


class WhisperPrePostProcessor(WhisperProcessor):
    def chunk_iter_with_batch(self, inputs, chunk_len, stride_left, stride_right, batch_size):
        inputs_len = inputs.shape[0]
        step = chunk_len - stride_left - stride_right

        all_chunk_start_idx = np.arange(0, inputs_len, step)
        num_samples = len(all_chunk_start_idx)

        num_batches = math.ceil(num_samples / batch_size)
        batch_idx = np.array_split(np.arange(num_samples), num_batches)

        for idx in batch_idx:
            chunk_start_idx = all_chunk_start_idx[idx]
            chunk_end_idx = chunk_start_idx + chunk_len

            chunks = [inputs[chunk_start:chunk_end] for chunk_start, chunk_end in zip(chunk_start_idx, chunk_end_idx)]
            processed = self.feature_extractor(
                chunks, sampling_rate=self.feature_extractor.sampling_rate, return_tensors="np"
            )

            _stride_left = np.where(chunk_start_idx == 0, 0, stride_left)
            is_last = np.where(stride_right > 0, chunk_end_idx > inputs_len, chunk_end_idx >= inputs_len)
            _stride_right = np.where(is_last, 0, stride_right)

            chunk_lens = [chunk.shape[0] for chunk in chunks]
            strides = [
                (int(chunk_l), int(_stride_l), int(_stride_r))
                for chunk_l, _stride_l, _stride_r in zip(chunk_lens, _stride_left, _stride_right)
            ]

            yield {"stride": strides, **processed}

    def preprocess_batch(self, inputs, chunk_length_s=0, stride_length_s=None, batch_size=None):
        stride = None
        if isinstance(inputs, dict):
            stride = inputs.pop("stride", None)
            # Accepting `"array"` which is the key defined in `datasets` for better integration
            if not ("sampling_rate" in inputs and ("raw" in inputs or "array" in inputs)):
                raise ValueError(
                    "When passing a dictionary to FlaxWhisperPipline, the dict needs to contain a "
                    '"raw" or "array" key containing the numpy array representing the audio, and a "sampling_rate" key '
                    "containing the sampling rate associated with the audio array."
                )

            _inputs = inputs.pop("raw", None)
            if _inputs is None:
                # Remove path which will not be used from `datasets`.
                inputs.pop("path", None)
                _inputs = inputs.pop("array", None)

            in_sampling_rate = inputs.pop("sampling_rate")
            inputs = _inputs

            if in_sampling_rate != self.feature_extractor.sampling_rate:
                try:
                    import librosa
                except ImportError as err:
                    raise ImportError(
                        "To support resampling audio files, please install 'librosa' and 'soundfile'."
                    ) from err

                inputs = librosa.resample(
                    inputs, orig_sr=in_sampling_rate, target_sr=self.feature_extractor.sampling_rate
                )
                ratio = self.feature_extractor.sampling_rate / in_sampling_rate
            else:
                ratio = 1

        if not isinstance(inputs, np.ndarray):
            raise ValueError(f"We expect a numpy ndarray as input, got `{type(inputs)}`")
        if len(inputs.shape) != 1:
            raise ValueError("We expect a single channel audio input for AutomaticSpeechRecognitionPipeline")

        if stride is not None:
            if stride[0] + stride[1] > inputs.shape[0]:
                raise ValueError("Stride is too large for input")

            # Stride needs to get the chunk length here, it's going to get
            # swallowed by the `feature_extractor` later, and then batching
            # can add extra data in the inputs, so we need to keep track
            # of the original length in the stride so we can cut properly.
            stride = (inputs.shape[0], int(round(stride[0] * ratio)), int(round(stride[1] * ratio)))

        if chunk_length_s:
            if stride_length_s is None:
                stride_length_s = chunk_length_s / 6

            if isinstance(stride_length_s, (int, float)):
                stride_length_s = [stride_length_s, stride_length_s]

            chunk_len = round(chunk_length_s * self.feature_extractor.sampling_rate)
            stride_left = round(stride_length_s[0] * self.feature_extractor.sampling_rate)
            stride_right = round(stride_length_s[1] * self.feature_extractor.sampling_rate)

            if chunk_len < stride_left + stride_right:
                raise ValueError("Chunk length must be superior to stride length")

            for item in self.chunk_iter_with_batch(
                inputs,
                chunk_len,
                stride_left,
                stride_right,
                batch_size,
            ):
                yield item
        else:
            processed = self.feature_extractor(
                inputs, sampling_rate=self.feature_extractor.sampling_rate, return_tensors="np"
            )
            if stride is not None:
                processed["stride"] = stride
            yield processed

    def postprocess(self, model_outputs, return_timestamps=None, return_language=None):
        # unpack the outputs from list(dict(list)) to list(dict)
        model_outputs = [dict(zip(output, t)) for output in model_outputs for t in zip(*output.values())]

        time_precision = self.feature_extractor.chunk_length / 1500  # max source positions = 1500
        # Send the chunking back to seconds, it's easier to handle in whisper
        sampling_rate = self.feature_extractor.sampling_rate
        for output in model_outputs:
            if "stride" in output:
                chunk_len, stride_left, stride_right = output["stride"]
                # Go back in seconds
                chunk_len /= sampling_rate
                stride_left /= sampling_rate
                stride_right /= sampling_rate
                output["stride"] = chunk_len, stride_left, stride_right

        text, optional = self.tokenizer._decode_asr(
            model_outputs,
            return_timestamps=return_timestamps,
            return_language=return_language,
            time_precision=time_precision,
        )

        return {"text": text, **optional}
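

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the processor). It assumes the
# "openai/whisper-tiny.en" checkpoint and a synthetic 16 kHz audio array; a
# real pipeline would feed the yielded features to a Whisper model and pass
# the generated token ids (with their strides) back into `postprocess`.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    processor = WhisperPrePostProcessor.from_pretrained("openai/whisper-tiny.en")

    # One minute of silence at the feature extractor's sampling rate (16 kHz).
    sampling_rate = processor.feature_extractor.sampling_rate
    audio = {"array": np.zeros(60 * sampling_rate, dtype=np.float32), "sampling_rate": sampling_rate}

    # Chunk the audio into 30 s windows with 5 s strides, batching 4 chunks at a time.
    for batch in processor.preprocess_batch(audio, chunk_length_s=30.0, stride_length_s=5.0, batch_size=4):
        print(batch["input_features"].shape, batch["stride"])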