# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import jax
import jax.numpy as jnp
import numpy as np
import requests
from flax import jax_utils
from flax.core.frozen_dict import freeze
from flax.training.common_utils import shard
from jax.sharding import PartitionSpec as P
from transformers import WhisperProcessor, is_tokenizers_available, WhisperFeatureExtractor, WhisperTokenizerFast
from transformers.models.whisper.tokenization_whisper import TO_LANGUAGE_CODE, WhisperTokenizer
from transformers.pipelines.audio_utils import ffmpeg_read
from transformers.utils import logging
from .modeling_flax_whisper import FlaxWhisperForConditionalGeneration
from .partitioner import PjitPartitioner
from .train_state import InferenceState
logger = logging.get_logger(__name__)
# 2D parameter and activation partitioning for DP
logical_axis_rules_dp = (
("batch", "data"),
("mlp", None),
("heads", None),
("vocab", None),
("embed", None),
("embed", None),
("joined_kv", None),
("kv", None),
("length", None),
("num_mel", None),
("channels", None),
)
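# note: these rules map only the "batch" axis onto the "data" mesh axis and leave all model
# dimensions unpartitioned, i.e. pure data parallelism across devices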
class FlaxWhisperPipline:
def __init__(
self,
checkpoint="openai/whisper-large-v2",
dtype=jnp.float32,
batch_size=None,
max_length=None,
):
"""
Args:
checkpoint (`str`, *optional*, defaults to `"openai/whisper-large-v2"`):
The Whisper checkpoint to use with the pipeline. Must be an available checkpoint on the Hugging Face Hub
with Flax weights.
dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
`jax.numpy.bfloat16` (on TPUs). This can be used to enable half-precision inference on GPUs or TPUs.
If specified all the computation will be performed with the given `dtype`. **Note that this only
specifies the dtype of the computation and does not influence the dtype of model parameters.**
batch_size (`int`, *optional*, defaults to the minimum per-device batch size, i.e. `jax.local_device_count()`):
The batch size to be used in chunking transcription. Beneficial for transcribing long audio files. A batch
size passed to the `__init__` method will be superseded by any batch size passed to the `__call__` method.
max_length (`int`, *optional*):
The maximum number of tokens to generate. Defaults to `model.generation_config.max_length`.
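
Example (a minimal usage sketch; assumes the package is importable as `whisper_jax` and that
`audio` is a dict of the form `{"array": np.ndarray, "sampling_rate": int}`, e.g. a sample
from a `datasets` audio dataset):

>>> from whisper_jax import FlaxWhisperPipline
>>> import jax.numpy as jnp
>>> pipeline = FlaxWhisperPipline("openai/whisper-large-v2", dtype=jnp.bfloat16)
>>> outputs = pipeline(audio)
>>> outputs["text"]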
"""
self.checkpoint = checkpoint
self.dtype = dtype
self.processor = WhisperProcessor.from_pretrained(self.checkpoint)
self.feature_extractor = self.processor.feature_extractor
# potentially load fast tokenizer if available
tokenizer_cls = WhisperTokenizerFast if is_tokenizers_available() else WhisperTokenizer
self.tokenizer = tokenizer_cls.from_pretrained(checkpoint)
self.model, self.params = FlaxWhisperForConditionalGeneration.from_pretrained(
self.checkpoint,
_do_init=False,
dtype=self.dtype,
)
self.max_length = max_length if max_length is not None else self.model.generation_config.max_length
self.min_batch_size = jax.local_device_count()
self.batch_size = (
batch_size if batch_size is not None else self.min_batch_size
) # we need a minimum of 1 batch per-device
def generate(params, input_features, forced_decoder_ids, return_timestamps):
output_ids = self.model.pipeline_generate(
input_features,
params=params,
forced_decoder_ids=forced_decoder_ids,
return_timestamps=return_timestamps,
max_length=self.max_length,
)
return output_ids
# use pmap for DP by default - this is compatible with a Colab TPU v2
self.params = jax_utils.replicate(self.params)
self.p_generate = jax.pmap(
generate, "input_features", in_axes=(0, 0, None), out_axes=0, static_broadcasted_argnums=(3,)
)
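# note: return_timestamps is a static argument (argnum 3), so changing its value between
# calls triggers a re-trace and recompilation of the pmapped generate function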
self.is_sharded = False
def shard_params(self, num_mp_partitions=1, logical_axis_rules=logical_axis_rules_dp):
def init_fn():
input_shape = (1, self.model.config.num_mel_bins, 2 * self.model.config.max_source_positions)
input_features = jnp.zeros(input_shape, dtype="f4")
input_features = input_features.at[(..., -1)].set(self.model.config.eos_token_id)
decoder_input_ids = jnp.zeros((input_shape[0], 1), dtype="i4")
decoder_attention_mask = jnp.ones_like(decoder_input_ids)
batch_size, sequence_length = decoder_input_ids.shape
decoder_position_ids = jnp.broadcast_to(
jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)
)
rng = jax.random.PRNGKey(0)
init_params = self.model.module.init(
rng,
input_features=input_features,
decoder_input_ids=decoder_input_ids,
decoder_attention_mask=decoder_attention_mask,
decoder_position_ids=decoder_position_ids,
return_dict=False,
)
return init_params
# Axis names metadata
param_axes = jax.eval_shape(init_fn)["params_axes"]
# Create InferenceState, since the partitioner expects it
state = InferenceState(
step=jnp.array(0),
params=freeze(self.model.params_shape_tree),
params_axes=freeze(param_axes),
flax_mutables=None,
flax_mutables_axes=param_axes,
)
partitioner = PjitPartitioner(num_partitions=num_mp_partitions, logical_axis_rules=logical_axis_rules)
mesh_axes = partitioner.get_mesh_axes(state)
params_spec = mesh_axes.params
p_shard_params = partitioner.partition(self.model.to_bf16, (params_spec,), params_spec)
# This will auto-magically run in mesh context
self.params = p_shard_params(freeze(jax_utils.unreplicate(self.params)))
self.is_sharded = True
def generate(params, input_features, forced_decoder_ids, return_timestamps):
output_ids = self.model.pipeline_generate(
input_features,
params=params,
forced_decoder_ids=forced_decoder_ids,
return_timestamps=return_timestamps,
max_length=self.max_length,
)
return output_ids
# Use pjit for generate only once we've sharded the params
self.p_generate = partitioner.partition(
generate,
in_axis_resources=(params_spec, P("data"), None),
out_axis_resources=P("data"),
static_argnums=(3,),
)
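# usage sketch (illustrative, not from the original docs): calling e.g.
#   pipeline.shard_params(num_mp_partitions=2)
# re-shards the params across devices and swaps p_generate for the pjit-partitioned
# version above, so subsequent calls to generate / __call__ take the sharded path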
def generate(self, input_features, language=None, task=None, return_timestamps=False):
forced_decoder_ids = self.get_forced_decoder_ids(
language=language, task=task, return_timestamps=return_timestamps
)
if not self.is_sharded:
# if we're using pmap we need to manually replicate the input data across devices and gather the output tokens
output_ids = self.p_generate(
freeze(self.params), shard(input_features), forced_decoder_ids, return_timestamps
).sequences
output_ids = jax.device_get(output_ids.reshape(-1, self.max_length))
else:
# pjit handles replication / gathering for us auto-magically
output_ids = self.p_generate(
freeze(self.params), input_features, forced_decoder_ids, return_timestamps
).sequences
return output_ids
def get_forced_decoder_ids(self, generation_config=None, task=None, language=None, return_timestamps=False):
if generation_config is None:
generation_config = self.model.generation_config
if hasattr(generation_config, "is_multilingual"):
is_multilingual = generation_config.is_multilingual
else:
is_multilingual = None
forced_decoder_ids = []
if is_multilingual:
if language is not None:
language = language.lower()
if language in generation_config.lang_to_id.keys():
language_token = language
elif language in TO_LANGUAGE_CODE.values():
language_token = f"<|{language}|>"
elif language in TO_LANGUAGE_CODE.keys():
language_token = f"<|{TO_LANGUAGE_CODE[language]}|>"
else:
if len(language) == 2:
# ISO 639-1 language code
acceptable_languages = list(TO_LANGUAGE_CODE.values())
elif "<" in language or "|" in language or ">" in language:
# generation config language code
acceptable_languages = list(generation_config.lang_to_id.keys())
else:
# language passed as a string
acceptable_languages = list(TO_LANGUAGE_CODE.keys())
raise ValueError(
f"Unsupported language: {language}. Language should be one of:" f" {acceptable_languages}."
)
forced_decoder_ids.append((1, generation_config.lang_to_id[language_token]))
if task is not None:
forced_decoder_ids.append((2, generation_config.task_to_id[task]))
else:
forced_decoder_ids.append((2, generation_config.task_to_id["transcribe"]))
if not return_timestamps:
# only force the no-timestamps token if it has not already been appended
if forced_decoder_ids and forced_decoder_ids[-1][1] != generation_config.no_timestamps_token_id:
idx = forced_decoder_ids[-1][0] + 1 if forced_decoder_ids else 1
forced_decoder_ids.append((idx, generation_config.no_timestamps_token_id))
return forced_decoder_ids
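# e.g. for a multilingual checkpoint with language="fr", task="transcribe" and
# return_timestamps=False, this returns
# [(1, lang_to_id["<|fr|>"]), (2, task_to_id["transcribe"]), (3, no_timestamps_token_id)]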
def chunk_iter_with_batch(self, inputs, chunk_len, stride_left, stride_right, batch_size):
inputs_len = inputs.shape[0]
step = chunk_len - stride_left - stride_right
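# worked example with the __call__ defaults (chunk_length_s=30.0, stride_length_s=30/6=5.0)
# at the 16 kHz feature extractor sampling rate: chunk_len=480_000 and
# stride_left=stride_right=80_000, so step=320_000 samples (20 s) and consecutive
# chunks overlap by 10 s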
all_chunk_start_idx = np.arange(0, inputs_len, step)
num_samples = len(all_chunk_start_idx)
num_batches = math.ceil(num_samples / batch_size)
batch_idx = np.array_split(np.arange(num_samples), num_batches)
for idx in batch_idx:
chunk_start_idx = all_chunk_start_idx[idx]
chunk_end_idx = chunk_start_idx + chunk_len
chunks = [inputs[chunk_start:chunk_end] for chunk_start, chunk_end in zip(chunk_start_idx, chunk_end_idx)]
processed = self.feature_extractor(
chunks, sampling_rate=self.feature_extractor.sampling_rate, return_tensors="np"
)
_stride_left = np.where(chunk_start_idx == 0, 0, stride_left)
is_last = np.where(stride_right > 0, chunk_end_idx > inputs_len, chunk_end_idx >= inputs_len)
_stride_right = np.where(is_last, 0, stride_right)
chunk_lens = [chunk.shape[0] for chunk in chunks]
strides = [
(chunk_l, _stride_l, _stride_r)
for chunk_l, _stride_l, _stride_r in zip(chunk_lens, _stride_left, _stride_right)
]
yield {"stride": strides, **processed}
def preprocess_batch(self, inputs, chunk_length_s=30.0, stride_length_s=None, batch_size=None):
if isinstance(inputs, np.ndarray):
logger.warning(
"Numpy array passed as input - no sampling rate checks will be performed."
"It is strongly recommended to pass the input as a dictionary with an 'array' key "
"containing the numpy array representing the audio, and a 'sampling_rate' key "
"containing the sampling rate associated with the audio array."
"Failing to do so can result in silent errors that might be hard to debug."
)
if isinstance(inputs, str):
if inputs.startswith("http://") or inputs.startswith("https://"):
# We need to actually check for a real protocol, otherwise it's impossible to use a local file
# like http_huggingface_co.png
inputs = requests.get(inputs).content
else:
with open(inputs, "rb") as f:
inputs = f.read()
if isinstance(inputs, bytes):
inputs = ffmpeg_read(inputs, self.feature_extractor.sampling_rate)
stride = None
if isinstance(inputs, dict):
stride = inputs.get("stride", None)
# Accepting `"array"` which is the key defined in `datasets` for
# better integration
if not ("sampling_rate" in inputs and "array" in inputs):
raise ValueError(
"When passing a dictionary to FlaxWhisperPipline, the dict needs to contain an 'array' key "
"containing the numpy array representing the audio, and a 'sampling_rate' key "
"containing the sampling rate associated with the audio array."
)
in_sampling_rate = inputs.get("sampling_rate")
inputs = inputs.get("array", None)
if in_sampling_rate != self.feature_extractor.sampling_rate:
try:
import librosa
except ImportError as err:
raise ImportError(
"To support resampling audio files, please install 'librosa' and 'soundfile'."
) from err
inputs = librosa.resample(
inputs, orig_sr=in_sampling_rate, target_sr=self.feature_extractor.sampling_rate
)
ratio = self.feature_extractor.sampling_rate / in_sampling_rate
else:
ratio = 1
if not isinstance(inputs, np.ndarray):
raise ValueError(f"We expect a numpy ndarray as input, got `{type(inputs)}`")
if len(inputs.shape) != 1:
raise ValueError("We expect a single channel audio input for AutomaticSpeechRecognitionPipeline")
if stride is not None:
if stride[0] + stride[1] > inputs.shape[0]:
raise ValueError("Stride is too large for input")
# The stride needs to include the chunk length here: it will get
# swallowed by the `feature_extractor` later, and batching can add
# extra data to the inputs, so we keep track of the original length
# in the stride so we can cut properly.
stride = (inputs.shape[0], int(round(stride[0] * ratio)), int(round(stride[1] * ratio)))
if chunk_length_s:
if stride_length_s is None:
stride_length_s = chunk_length_s / 6
if isinstance(stride_length_s, (int, float)):
stride_length_s = [stride_length_s, stride_length_s]
chunk_len = round(chunk_length_s * self.feature_extractor.sampling_rate)
stride_left = round(stride_length_s[0] * self.feature_extractor.sampling_rate)
stride_right = round(stride_length_s[1] * self.feature_extractor.sampling_rate)
if chunk_len < stride_left + stride_right:
raise ValueError("Chunk length must be superior to stride length")
for item in self.chunk_iter_with_batch(
inputs,
chunk_len,
stride_left,
stride_right,
batch_size,
):
yield item
else:
processed = self.feature_extractor(
inputs, sampling_rate=self.feature_extractor.sampling_rate, return_tensors="np"
)
if stride is not None:
processed["stride"] = stride
yield processed
def postprocess(self, model_outputs, return_timestamps=None, return_language=None):
# unpack the outputs from list(dict(list)) to list(dict)
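# e.g. [{"tokens": [t0, t1], "stride": [s0, s1]}] -> [{"tokens": t0, "stride": s0}, {"tokens": t1, "stride": s1}]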
model_outputs = [dict(zip(output, t)) for output in model_outputs for t in zip(*output.values())]
time_precision = self.feature_extractor.chunk_length / self.model.config.max_source_positions
# Send the chunking back to seconds, it's easier to handle in whisper
sampling_rate = self.feature_extractor.sampling_rate
for output in model_outputs:
if "stride" in output:
chunk_len, stride_left, stride_right = output["stride"]
# Go back in seconds
chunk_len /= sampling_rate
stride_left /= sampling_rate
stride_right /= sampling_rate
output["stride"] = chunk_len, stride_left, stride_right
text, optional = self.tokenizer._decode_asr(
model_outputs,
return_timestamps=return_timestamps,
return_language=return_language,
time_precision=time_precision,
)
return {"text": text, **optional}
def forward(self, model_inputs, batch_size=None, language=None, task=None, return_timestamps=False):
# We need to keep track of some additional input arguments for post-processing, so we forward these on after running generation
input_features = model_inputs.pop("input_features")
input_batch_size = input_features.shape[0]
if input_batch_size != batch_size:
padding = np.zeros([batch_size - input_batch_size, *input_features.shape[1:]], input_features.dtype)
input_features = np.concatenate([input_features, padding])
pred_ids = self.generate(input_features, language=language, task=task, return_timestamps=return_timestamps)[
:input_batch_size
]
# tokenizer's decode method expects an extra dim - we insert it here for convenience
out = {"tokens": pred_ids[:, None, :]}
stride = model_inputs.pop("stride", None)
if stride is not None:
out["stride"] = stride
return out
def __call__(
self,
inputs,
chunk_length_s=30.0,
stride_length_s=None,
batch_size=None,
language=None,
task=None,
return_timestamps=None,
generate_kwargs=None,
):
"""
Transcribe an audio input sequence to a text transcription, optionally with timestamps.
Args:
inputs (`np.ndarray` or `bytes` or `str` or `dict`):
The inputs is either:
- `str` that is the filename of the audio file, the file will be read at the correct sampling rate
to get the waveform using *ffmpeg*. This requires *ffmpeg* to be installed on the system.
- `bytes` is the byte content of an audio file and is interpreted by *ffmpeg* in the
same way.
- (`np.ndarray` of shape (n, ) of type `np.float32` or `np.float64`)
Raw audio assumed to be at the correct sampling rate (16kHz). Note that no further sampling
rate check will be done.
- `dict` form can be used to pass raw audio sampled at arbitrary `sampling_rate` and let this
pipeline do the resampling. The dict must be in the format `{"sampling_rate": int, "array":
np.array}`. Optionally an additional argument `"stride": (left: int, right: int)` can be used to
ask the pipeline to treat the first `left` samples and last `right` samples to be ignored in
decoding (but used at inference to provide more context to the model). In general, this additional
stride argument is not required.
chunk_length_s (`float`, *optional*, defaults to 30.0):
The input length for each chunk. If `chunk_length_s = 0` then chunking is disabled. By default, the chunk
length is set to 30.0s, equal to Whisper's context window.
stride_length_s (`float`, *optional*, defaults to `chunk_length_s / 6`):
The length of stride on the left and right of each chunk. Used only with `chunk_length_s > 0`. This enables
the model to *see* more context and infer letters better than it could without this context, but the pipeline
discards the stride portions at the end to make the final reconstruction as faithful as possible.
<Tip>
For more information on how to effectively use `stride_length_s`, refer to the [ASR chunking
blog post](https://huggingface.co/blog/asr-chunking).
</Tip>
batch_size (`int`, *optional*, defaults to the minimum per-device batch size, i.e. `jax.local_device_count()`):
The batch size to be used in chunking transcription. Beneficial for transcribing long audio files. A batch
size passed to the `__call__` method will supersede any batch size passed to the `__init__` method.
task (`str`, *optional*):
Task to use for generation, either `"transcribe"` or `"translate"`. Defaults to `"transcribe"`.
language (`str`, *optional*):
Language token to use for generation, can be either in the form of `"<|en|>"`, `"en"` or `"english"`.
Defaults to `None`, meaning the language is automatically inferred from the audio input.
return_timestamps (`bool`, *optional*, defaults to `False`):
Whether to return timestamps in the prediction. If set to `True`, the pipeline will return two keys in
the output dictionary: `"text"` containing the text transcription, and `"chunks"` containing the
transcription segments chunked by their utterance-level timestamps.
Return:
`Dict`: A dictionary with the following keys:
- **text** (`str`) -- The recognised text.
- **chunks** (*optional*, `List[Dict]`) --
When using `return_timestamps`, `chunks` will become a list containing all the various text
chunks identified by the model, *e.g.* `[{"text": "hi ", "timestamp": (0.5, 0.9)}, {"text":
"there", "timestamp": (1.0, 1.5)}]`. The original full text can roughly be recovered by doing
`"".join(chunk["text"] for chunk in output["chunks"])`.
"""
batch_size = batch_size if batch_size is not None else self.batch_size
if batch_size % self.min_batch_size != 0:
raise ValueError(
f"Batch size must be a multiple of the number of JAX devices, but got batch size {batch_size} and num devices {self.min_batch_size}."
)
dataloader = self.preprocess_batch(
inputs, chunk_length_s=chunk_length_s, stride_length_s=stride_length_s, batch_size=batch_size
)
model_outputs = []
# iterate over our chunked audio samples
for batch in dataloader:
model_outputs.append(
self.forward(
batch, batch_size=batch_size, language=language, task=task, return_timestamps=return_timestamps
)
)
post_processed = self.postprocess(model_outputs, return_timestamps=return_timestamps)
return post_processed