Spaces:

kobrasoft
/

kobraspeech-rnn-cs

Sleeping

App Files Files Community

kozak-vaclav committed on Jun 23, 2024

Commit

8e20a50

verified ·

1 Parent(s): f9bfdc6

Update app.py

Browse files

Files changed (1) hide show

app.py +46 -14

app.py CHANGED Viewed

@@ -1,31 +1,63 @@
 import gradio as gr
 import tensorflow as tf
-from transformers import Wav2Vec2Processor, TFWav2Vec2Model
 import librosa
-# Load the model and processor
-processor = Wav2Vec2Processor.from_pretrained("openai/whisper-tiny")
-model = TFWav2Vec2Model.from_pretrained("kobrasoft/kobraspeech-rnn-cs")
-def transcribe(audio):
     # Load audio
-    audio, rate = librosa.load(audio, sr=16000)
-    # Process audio
-    inputs = processor(audio, sampling_rate=rate, return_tensors="tf", padding="longest")
-    logits = model(inputs.input_values).logits
-    # Decode the logits
-    predicted_ids = tf.argmax(logits, axis=-1)
-    transcription = processor.batch_decode(predicted_ids)[0]
-    return transcription
 # Create Gradio interface
 iface = gr.Interface(
     fn=transcribe,
     inputs=gr.inputs.Audio(source="microphone", type="filepath"),
     outputs="text",
-    title="ASR Model Demo",
     description="Upload an audio file or record your voice to get the transcription."
 )

 import gradio as gr
 import tensorflow as tf
 import librosa
+import numpy as np
+from huggingface_hub import hf_hub_download
+# Mel spectrogram parameters.
+# At the 16 kHz sampling rate set below, n_fft=512 is a 32 ms analysis window,
+# hop_length=160 is a 10 ms frame step, and fmax=8000 Hz equals the Nyquist
+# frequency (sampling_rate / 2).
+n_fft = 512        # FFT window length
+hop_length = 160   # number of samples between successive frames
+n_mels = 80        # Number of Mel bands
+fmin = 0.0         # Minimum frequency
+fmax = 8000.0      # Maximum frequency
+sampling_rate = 16000  # Hz; also passed to librosa.load when reading input audio
+def extract_mel_spectrogram(audio) -> np.ndarray:
+    """Return a dB-scaled mel spectrogram of ``audio``.
+
+    Uses the module-level ``n_fft``, ``hop_length``, ``n_mels``, ``fmin``,
+    ``fmax`` and ``sampling_rate`` settings.
+    """
+    # power=2.0 requests a power (squared-magnitude) spectrogram, which is the
+    # input power_to_db expects.
+    spectrogram = librosa.feature.melspectrogram(y=audio, sr=sampling_rate, hop_length=hop_length,
+                                                 n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax, power=2.0)
+    # ref=np.max makes the scale relative to each clip's own peak, so the
+    # loudest bin is 0 dB and all other bins are negative.
+    spectrogram = librosa.power_to_db(spectrogram, ref=np.max)
+    # NOTE(review): librosa documents the result as (n_mels, frames) and no
+    # transpose is applied here; confirm this is the orientation the model expects.
+    #spectrogram = np.expand_dims(spectrogram, axis=-1)  # Adding channel dimension for the model
+    return spectrogram
+# Download model from Hugging Face Hub
+# Both statements run at module import time, so the app cannot start if the
+# download or the model load fails.
+model_path = hf_hub_download(repo_id="kobrasoft/kobraspeech-rnn-cs", filename="kobraspeech.17-40.19.keras")
+model = tf.keras.models.load_model(model_path)
+def decode_batch_predictions(pred):
+    """Greedy-decode a batch of CTC model outputs into text.
+
+    Args:
+        pred: Model output for a batch, indexed as ``pred.shape[0]`` items
+            with ``pred.shape[1]`` time steps each.
+
+    Returns:
+        A list with one entry per batch item, each produced by
+        ``label_to_string``.
+    """
+    # Every item is treated as spanning the full time axis (no per-item lengths).
+    input_len = np.ones(pred.shape[0]) * pred.shape[1]
+    # Use greedy search. For complex tasks, you can use beam search.
+    # Reached through `tf.keras`: the file imports `tensorflow as tf` but never
+    # binds a bare `keras` name, so `keras.backend...` raised NameError here.
+    results = tf.keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)[0][0]
+    # NOTE(review): `label_to_string` is not defined in the visible part of the
+    # file; confirm it exists, otherwise this still fails at call time.
+    return [label_to_string(result) for result in results]
+def transcribe(audio_path):
+    """Transcribe the audio file at ``audio_path`` and return the decoded text.
+
+    Pipeline: load the audio at ``sampling_rate``, extract mel-spectrogram
+    features, run the model on a batch of one, and decode the prediction.
+    """
     # Load audio
+    audio, _ = librosa.load(audio_path, sr=sampling_rate)
+    # Extract features
+    features = extract_mel_spectrogram(audio)
+    # Model expects batch dimension
+    features = np.expand_dims(features, axis=0)
+    # Predict
+    prediction = model.predict(features)
+    # Decode the prediction into text
+    transcription = decode_batch_predictions(prediction)
+    # The batch holds a single clip, so return its only transcription.
+    return transcription[0]
 # Create Gradio interface
+# NOTE(review): the description mentions uploading a file, but the input is
+# configured with source="microphone" only; confirm which is intended.
+# NOTE(review): `gr.inputs.Audio` is the legacy Gradio namespace; confirm it
+# exists in the Gradio version this Space pins. No `iface.launch()` call is
+# visible in this view either.
 iface = gr.Interface(
     fn=transcribe,
     inputs=gr.inputs.Audio(source="microphone", type="filepath"),
     outputs="text",
+    title="Kobraspeech RNN ASR demo (cs)",
     description="Upload an audio file or record your voice to get the transcription."
 )