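# Gradio Space: speech-to-text demo built on the kobrasoft/kobraspeech-rnn-cs Keras model.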
import gradio as gr
import tensorflow as tf
import librosa
import numpy as np
import pickle as pkl
from huggingface_hub import hf_hub_download, from_pretrained_keras
# Mel Spectrogram parameters
n_fft = 512 # FFT window length
hop_length = 160 # number of samples between successive frames
n_mels = 80 # Number of Mel bands
fmin = 0.0 # Minimum frequency
fmax = 8000.0 # Maximum frequency
sampling_rate = 16000
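# With these settings, one second of 16 kHz audio yields roughly
# sampling_rate / hop_length = 100 feature frames of n_mels = 80 mel bins each.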
def extract_mel_spectrogram(audio) -> np.ndarray:
    spectrogram = librosa.feature.melspectrogram(
        y=audio, sr=sampling_rate, hop_length=hop_length,
        n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax, power=2.0,
    )
    # Convert the power spectrogram to decibels, referenced to the clip's peak
    spectrogram = librosa.power_to_db(spectrogram, ref=np.max)
    # spectrogram = np.expand_dims(spectrogram, axis=-1)  # Adding channel dimension for the model
    return spectrogram
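# Illustrative check (not executed): a one-second 440 Hz tone
#   tone = np.sin(2 * np.pi * 440 * np.arange(sampling_rate) / sampling_rate)
#   extract_mel_spectrogram(tone).shape  # -> (80, 101): 80 mel bins, 1 + 16000 // 160 frames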
def CTCLoss(y_true, y_pred):
    # Training-time CTC loss; needed as a custom object when loading the model manually
    input_length = tf.math.reduce_sum(
        tf.cast(tf.not_equal(tf.reduce_max(y_pred, axis=2), 0), dtype="int64"),
        axis=1, keepdims=True,
    )
    label_length = tf.math.reduce_sum(
        tf.cast(tf.not_equal(y_true, -1), dtype="int64"), axis=1, keepdims=True
    )
    return tf.keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length)
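# Note: input_length is a heuristic that counts the time steps whose maximum logit
# is non-zero (treating all-zero frames as padding), and label_length counts labels
# that are not the -1 padding value; ctc_batch_cost expects both as (batch, 1) tensors.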
# Download model from Hugging Face Hub
# model_path = hf_hub_download(repo_id="kobrasoft/kobraspeech-rnn-cs", filename="saved_model.pb")
# with tf.keras.utils.custom_object_scope({'CTCLoss': CTCLoss}):
# model = tf.keras.models.load_model(model_path)
model = from_pretrained_keras("kobrasoft/kobraspeech-rnn-cs")
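# from_pretrained_keras pulls the saved Keras model straight from the repo; the
# commented hf_hub_download route above is the manual alternative, which needs
# CTCLoss registered as a custom object.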
# Vocabulary mapping token ids back to characters (the file is pickled despite its .json extension)
num_to_char_path = hf_hub_download(repo_id="kobrasoft/kobraspeech-rnn-cs", filename="num_to_char.json")
with open(num_to_char_path, "rb") as f:
    num_to_char = tf.keras.layers.StringLookup(vocabulary=pkl.load(f), oov_token="", invert=True)
def label_to_string(label):
    return tf.strings.reduce_join(num_to_char(label)).numpy().decode()
def decode_batch_predictions(pred):
    input_len = np.ones(pred.shape[0]) * pred.shape[1]
    # Use greedy search; for harder tasks, beam search may give better results
    results = tf.keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)[0][0]
    # Map each decoded label sequence back to text
    output_text = []
    for result in results:
        output_text.append(label_to_string(result))
    return output_text
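# ctc_decode collapses consecutive repeated tokens and strips the CTC blank, so a
# per-frame output like "h h e e l l <blank> l o" decodes to "hello" (the blank
# keeps the two l groups distinct).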
def transcribe(audio_path):
    # Load audio at the model's sampling rate
    audio, _ = librosa.load(audio_path, sr=sampling_rate)
    # Extract log-mel features and add the batch dimension the model expects
    features = extract_mel_spectrogram(audio)
    features = np.expand_dims(features, axis=0)
    # Predict and decode the CTC output into text
    prediction = model.predict(features)
    transcription = decode_batch_predictions(prediction)
    return transcription[0]
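# Quick local check (assumes an audio file named sample.wav on disk;
# librosa resamples it to 16 kHz):
#   print(transcribe("sample.wav"))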
demo = gr.Blocks()

mic_transcribe = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs=gr.Textbox(),
)

file_transcribe = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources=["upload"], type="filepath"),
    outputs=gr.Textbox(),
)
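# Both interfaces wrap the same transcribe function; type="filepath" means Gradio
# hands transcribe a path to the recorded or uploaded audio on disk.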
with demo:
    gr.TabbedInterface(
        [mic_transcribe, file_transcribe],
        ["Transcribe Microphone", "Transcribe Audio File"],
    )

if __name__ == "__main__":
    demo.launch(debug=True)