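# Gradio Space: speech-to-text demo built on the kobrasoft/kobraspeech-rnn-cs Keras model.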
import gradio as gr
import tensorflow as tf
import librosa
import numpy as np
import pickle as pkl
from huggingface_hub import hf_hub_download, from_pretrained_keras
# Mel Spectrogram parameters
n_fft = 512 # FFT window length
hop_length = 160 # number of samples between successive frames
n_mels = 80 # Number of Mel bands
fmin = 0.0 # Minimum frequency
fmax = 8000.0 # Maximum frequency
sampling_rate = 16000
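# With these settings, one second of 16 kHz audio yields roughly
# sampling_rate / hop_length = 100 feature frames of n_mels = 80 mel bins each.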
def extract_mel_spectrogram(audio) -> np.ndarray:
    spectrogram = librosa.feature.melspectrogram(
        y=audio, sr=sampling_rate, hop_length=hop_length,
        n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax, power=2.0,
    )
    # Convert the power spectrogram to decibels, referenced to the clip's peak
    spectrogram = librosa.power_to_db(spectrogram, ref=np.max)
    # spectrogram = np.expand_dims(spectrogram, axis=-1)  # Adding channel dimension for the model
    return spectrogram
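# Illustrative check (not executed): a one-second 440 Hz tone
#   tone = np.sin(2 * np.pi * 440 * np.arange(sampling_rate) / sampling_rate)
#   extract_mel_spectrogram(tone).shape  # -> (80, 101): 80 mel bins, 1 + 16000 // 160 frames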
def CTCLoss(y_true, y_pred):
    # Training-time CTC loss; needed as a custom object when loading the model manually
    input_length = tf.math.reduce_sum(
        tf.cast(tf.not_equal(tf.reduce_max(y_pred, axis=2), 0), dtype="int64"),
        axis=1, keepdims=True,
    )
    label_length = tf.math.reduce_sum(
        tf.cast(tf.not_equal(y_true, -1), dtype="int64"), axis=1, keepdims=True
    )
    return tf.keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length)
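# Note: input_length is a heuristic that counts the time steps whose maximum logit
# is non-zero (treating all-zero frames as padding), and label_length counts labels
# that are not the -1 padding value; ctc_batch_cost expects both as (batch, 1) tensors.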
# Download model from Hugging Face Hub
# model_path = hf_hub_download(repo_id="kobrasoft/kobraspeech-rnn-cs", filename="saved_model.pb")
# with tf.keras.utils.custom_object_scope({'CTCLoss': CTCLoss}):
# model = tf.keras.models.load_model(model_path)
model = from_pretrained_keras("kobrasoft/kobraspeech-rnn-cs")
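# from_pretrained_keras pulls the saved Keras model straight from the repo; the
# commented hf_hub_download route above is the manual alternative, which needs
# CTCLoss registered as a custom object.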
# Vocabulary mapping token ids back to characters (the file is pickled despite its .json extension)
num_to_char_path = hf_hub_download(repo_id="kobrasoft/kobraspeech-rnn-cs", filename="num_to_char.json")
with open(num_to_char_path, "rb") as f:
    num_to_char = tf.keras.layers.StringLookup(vocabulary=pkl.load(f), oov_token="", invert=True)
def label_to_string(label):
    return tf.strings.reduce_join(num_to_char(label)).numpy().decode()
def decode_batch_predictions(pred):
    input_len = np.ones(pred.shape[0]) * pred.shape[1]
    # Use greedy search; for harder tasks, beam search may give better results
    results = tf.keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)[0][0]
    # Map each decoded label sequence back to text
    output_text = []
    for result in results:
        output_text.append(label_to_string(result))
    return output_text
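# ctc_decode collapses consecutive repeated tokens and strips the CTC blank, so a
# per-frame output like "h h e e l l <blank> l o" decodes to "hello" (the blank
# keeps the two l groups distinct).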
def transcribe(audio_path):
    # Load audio at the model's sampling rate
    audio, _ = librosa.load(audio_path, sr=sampling_rate)
    # Extract log-mel features and add the batch dimension the model expects
    features = extract_mel_spectrogram(audio)
    features = np.expand_dims(features, axis=0)
    # Predict and decode the CTC output into text
    prediction = model.predict(features)
    transcription = decode_batch_predictions(prediction)
    return transcription[0]
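# Quick local check (assumes an audio file named sample.wav on disk;
# librosa resamples it to 16 kHz):
#   print(transcribe("sample.wav"))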
demo = gr.Blocks()

mic_transcribe = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs=gr.Textbox(),
)

file_transcribe = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources=["upload"], type="filepath"),
    outputs=gr.Textbox(),
)
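# Both interfaces wrap the same transcribe function; type="filepath" means Gradio
# hands transcribe a path to the recorded or uploaded audio on disk.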
with demo:
    gr.TabbedInterface(
        [mic_transcribe, file_transcribe],
        ["Transcribe Microphone", "Transcribe Audio File"],
    )

if __name__ == "__main__":
    demo.launch(debug=True)