---
library_name: keras-hub
---

# Model Overview

## Model Summary

The Moonshine models are trained for speech recognition: they transcribe English speech audio into English text. Useful Sensors developed the models to support its work on real-time speech transcription products that run on low-cost hardware. Two models of different sizes and capabilities are available; they are summarized in the presets table below.

Weights are released under the [MIT License](https://www.mit.edu/~amini/LICENSE.md). Keras model code is released under the [Apache 2.0 License](https://github.com/keras-team/keras-hub/blob/master/LICENSE).
|
## Links

* [Moonshine Quickstart Notebook](https://www.kaggle.com/code/laxmareddypatlolla/moonshine-quickstart-notebook)
* [Moonshine API Documentation](https://keras.io/keras_hub/api/models/moonshine/)
* [Moonshine Paper](https://arxiv.org/abs/2410.15608)
* [KerasHub Beginner Guide](https://keras.io/guides/keras_hub/getting_started/)
* [KerasHub Model Publishing Guide](https://keras.io/guides/keras_hub/upload/)
|
## Installation

Keras and KerasHub can be installed with:

```
pip install -U -q keras-hub
pip install -U -q keras
```

JAX, TensorFlow, and PyTorch come preinstalled in Kaggle Notebooks. For instructions on installing them in another environment, see the [Keras Getting Started](https://keras.io/getting_started/) page.
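You can choose which of these backends Keras runs on by setting the `KERAS_BACKEND` environment variable before Keras is first imported. A minimal sketch (the JAX backend here is only an example choice):

```python
import os

# Must be set before keras / keras_hub are imported.
# Valid values are "jax", "tensorflow", and "torch".
os.environ["KERAS_BACKEND"] = "jax"

import keras
import keras_hub
```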
|
|
## Presets

The following model checkpoints are provided by the Keras team. A minimal loading snippet follows the table, and full code examples are available below.

| Preset name       | Parameters | Description                                                                                                    |
|-------------------|------------|----------------------------------------------------------------------------------------------------------------|
| moonshine_base_en | 61.5M      | Moonshine base model for English speech recognition. Developed by Useful Sensors for real-time transcription. |
| moonshine_tiny_en | 27.1M      | Moonshine tiny model for English speech recognition. Developed by Useful Sensors for real-time transcription. |
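Either preset can be loaded by name with `from_preset()`. A minimal sketch, using the same task-class import as the full examples below:

```python
from keras_hub.src.models.moonshine.moonshine_audio_to_text import (
    MoonshineAudioToText,
)

# Downloads the checkpoint on first use and builds the task model.
audio_to_text = MoonshineAudioToText.from_preset("moonshine_base_en")
audio_to_text.summary()
```

Swap in `"moonshine_tiny_en"` to load the smaller preset.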
|
|
## Example Usage

```python
import os

import keras
import keras_hub
import numpy as np
import librosa
import tensorflow as tf

from keras_hub.src.models.moonshine.moonshine_audio_to_text import (
    MoonshineAudioToText,
)

# Custom backbone.
backbone = keras_hub.models.MoonshineBackbone(
    vocabulary_size=10000,
    filter_dim=256,
    encoder_num_layers=6,
    decoder_num_layers=6,
    hidden_dim=256,
    intermediate_dim=512,
    encoder_num_heads=8,
    decoder_num_heads=8,
    feedforward_expansion_factor=4,
    decoder_use_swiglu_activation=True,
    encoder_use_swiglu_activation=False,
)
# Audio features as input (e.g., from MoonshineAudioConverter).
outputs = backbone(
    {
        "encoder_input_values": np.zeros((1, 16000, 1)),
        "encoder_padding_mask": np.ones((1, 16000), dtype=bool),
        "decoder_token_ids": np.zeros((1, 20), dtype=np.int32),
        "decoder_padding_mask": np.ones((1, 20), dtype=bool),
    }
)

# Config for test.
BATCH_SIZE = 2
AUDIO_PATH = "path/to/audio_file.wav"

# Load and prepare audio data.
audio, sr = librosa.load(AUDIO_PATH, sr=16000, mono=True)
audio_tensor = tf.expand_dims(audio, axis=-1)
audio_tensor = tf.convert_to_tensor(audio_tensor, dtype=tf.float32)
single_audio_input_batched = tf.expand_dims(audio_tensor, axis=0)
audio_batch = tf.repeat(single_audio_input_batched, BATCH_SIZE, axis=0)
dummy_texts = ["Sample transcription.", "Another sample transcription."]

# Create tf.data.Dataset.
audio_ds = tf.data.Dataset.from_tensor_slices(audio_batch)
text_ds = tf.data.Dataset.from_tensor_slices(dummy_texts)
audio_dataset = (
    tf.data.Dataset.zip((audio_ds, text_ds))
    .map(lambda audio, txt: {"audio": audio, "text": txt})
    .batch(BATCH_SIZE)
)
print("Audio dataset created.")

# Load pretrained Moonshine model.
audio_to_text = MoonshineAudioToText.from_preset("moonshine_tiny_en")

# Generation examples.
generated_text_single = audio_to_text.generate(
    {"audio": single_audio_input_batched}
)
print(f"Generated text (single audio): {generated_text_single}")

generated_text_batch = audio_to_text.generate({"audio": audio_batch})
print(f"Generated text (batch audio): {generated_text_batch}")

# Compile the generate() function with a custom sampler.
audio_to_text.compile(sampler="top_k")
generated_text_top_k = audio_to_text.generate(
    {"audio": single_audio_input_batched}
)
print(f"Generated text (top_k sampler): {generated_text_top_k}")

audio_to_text.compile(sampler="greedy")
generated_text_greedy = audio_to_text.generate(
    {"audio": single_audio_input_batched}
)
print(f"Generated text (greedy sampler): {generated_text_greedy}")

# Fine-tuning example.
audio_to_text.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-5),
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    weighted_metrics=[keras.metrics.SparseCategoricalAccuracy()],
)
history = audio_to_text.fit(audio_dataset, steps_per_epoch=1, epochs=1)
print(f"Fine-tuning completed. Training history: {history.history}")

# Detached preprocessing.
original_preprocessor = audio_to_text.preprocessor
audio_to_text.preprocessor = None
preprocessed_batch = original_preprocessor.generate_preprocess(
    {"audio": audio_batch}
)
print(f"Preprocessed batch keys: {preprocessed_batch.keys()}")
stop_ids = (original_preprocessor.tokenizer.end_token_id,)
generated_batch_tokens = audio_to_text.generate(
    preprocessed_batch, stop_token_ids=stop_ids
)
print(f"Generated tokens keys: {generated_batch_tokens.keys()}")
final_strings = original_preprocessor.generate_postprocess(
    generated_batch_tokens
)
print(f"Final generated strings (detached): {final_strings}")
audio_to_text.preprocessor = original_preprocessor
print("Preprocessor reattached.")
```
|
|
## Example Usage with Hugging Face URI

```python
import os

import keras
import keras_hub
import numpy as np
import librosa
import tensorflow as tf

from keras_hub.src.models.moonshine.moonshine_audio_to_text import (
    MoonshineAudioToText,
)

# Custom backbone.
backbone = keras_hub.models.MoonshineBackbone(
    vocabulary_size=10000,
    filter_dim=256,
    encoder_num_layers=6,
    decoder_num_layers=6,
    hidden_dim=256,
    intermediate_dim=512,
    encoder_num_heads=8,
    decoder_num_heads=8,
    feedforward_expansion_factor=4,
    decoder_use_swiglu_activation=True,
    encoder_use_swiglu_activation=False,
)
# Audio features as input (e.g., from MoonshineAudioConverter).
outputs = backbone(
    {
        "encoder_input_values": np.zeros((1, 16000, 1)),
        "encoder_padding_mask": np.ones((1, 16000), dtype=bool),
        "decoder_token_ids": np.zeros((1, 20), dtype=np.int32),
        "decoder_padding_mask": np.ones((1, 20), dtype=bool),
    }
)

# Config for test.
BATCH_SIZE = 2
AUDIO_PATH = "path/to/audio_file.wav"

# Load and prepare audio data.
audio, sr = librosa.load(AUDIO_PATH, sr=16000, mono=True)
audio_tensor = tf.expand_dims(audio, axis=-1)
audio_tensor = tf.convert_to_tensor(audio_tensor, dtype=tf.float32)
single_audio_input_batched = tf.expand_dims(audio_tensor, axis=0)
audio_batch = tf.repeat(single_audio_input_batched, BATCH_SIZE, axis=0)
dummy_texts = ["Sample transcription.", "Another sample transcription."]

# Create tf.data.Dataset.
audio_ds = tf.data.Dataset.from_tensor_slices(audio_batch)
text_ds = tf.data.Dataset.from_tensor_slices(dummy_texts)
audio_dataset = (
    tf.data.Dataset.zip((audio_ds, text_ds))
    .map(lambda audio, txt: {"audio": audio, "text": txt})
    .batch(BATCH_SIZE)
)
print("Audio dataset created.")

# Load pretrained Moonshine model.
audio_to_text = MoonshineAudioToText.from_preset("hf://keras/moonshine_tiny_en")

# Generation examples.
generated_text_single = audio_to_text.generate(
    {"audio": single_audio_input_batched}
)
print(f"Generated text (single audio): {generated_text_single}")

generated_text_batch = audio_to_text.generate({"audio": audio_batch})
print(f"Generated text (batch audio): {generated_text_batch}")

# Compile the generate() function with a custom sampler.
audio_to_text.compile(sampler="top_k")
generated_text_top_k = audio_to_text.generate(
    {"audio": single_audio_input_batched}
)
print(f"Generated text (top_k sampler): {generated_text_top_k}")

audio_to_text.compile(sampler="greedy")
generated_text_greedy = audio_to_text.generate(
    {"audio": single_audio_input_batched}
)
print(f"Generated text (greedy sampler): {generated_text_greedy}")

# Fine-tuning example.
audio_to_text.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-5),
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    weighted_metrics=[keras.metrics.SparseCategoricalAccuracy()],
)
history = audio_to_text.fit(audio_dataset, steps_per_epoch=1, epochs=1)
print(f"Fine-tuning completed. Training history: {history.history}")

# Detached preprocessing.
original_preprocessor = audio_to_text.preprocessor
audio_to_text.preprocessor = None
preprocessed_batch = original_preprocessor.generate_preprocess(
    {"audio": audio_batch}
)
print(f"Preprocessed batch keys: {preprocessed_batch.keys()}")
stop_ids = (original_preprocessor.tokenizer.end_token_id,)
generated_batch_tokens = audio_to_text.generate(
    preprocessed_batch, stop_token_ids=stop_ids
)
print(f"Generated tokens keys: {generated_batch_tokens.keys()}")
final_strings = original_preprocessor.generate_postprocess(
    generated_batch_tokens
)
print(f"Final generated strings (detached): {final_strings}")
audio_to_text.preprocessor = original_preprocessor
print("Preprocessor reattached.")
```
|