axxam committed
Commit 715584d · verified · 1 Parent(s): 741f964

Update app.py

Files changed (1)
  1. app.py +38 -44
app.py CHANGED
@@ -1,45 +1,39 @@
- import torch
  import gradio as gr
- from transformers import pipeline
-
- MODEL_NAME = "BlueRaccoon/whisper-small-kab"
- lang = "uz" # Used uz instead of kab
-
- device = 0 if torch.cuda.is_available() else "cpu"
- pipe = pipeline(
-     task="automatic-speech-recognition",
-     model=MODEL_NAME,
-     chunk_length_s=30,
-     device=device,
- )
-
- pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(language=lang, task="transcribe")
-
-
- def transcribe(microphone):
-     if microphone is None:
-         return "ERROR: You need to record or upload an audio file."
-
-     text = pipe(microphone)["text"]
-     return text
-
-
- with gr.Blocks() as demo:
-     with gr.Tab("Transcribe Kabyle Audio"):
-         gr.Markdown(
-             f"""
-             # Kabyle Whisper Demo: Transcribe Audio
-             Transcribe Kabyle audio recorded from the microphone or uploaded as a file. This demo uses the fine-tuned
-             checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe Kabyle audio
-             files of arbitrary length.
-             """
-         )
-         # Input for microphone recording only
-         microphone_input = gr.Audio(type="filepath", label="Record or Upload Kabyle Audio")
-         gr.Interface(
-             fn=transcribe,
-             inputs=[microphone_input],
-             outputs=gr.Textbox(label="Transcription"),
-         )
-
- demo.launch()
+ import nemo.collections.asr as nemo_asr
+ import numpy as np
+ import torch
+
+ # Load the pre-trained Kabyle ASR model
+ asr_model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained("nvidia/stt_kab_conformer_transducer_large")
+
+ # Function to transcribe the audio input
+ def transcribe(audio):
+     # Print the raw audio input
+     print(f"Raw audio input: {audio}")
+
+     # Audio in Gradio is returned as a tuple (sample_rate, audio_data)
+     sample_rate, audio_data = audio
+
+     # Print to check the types
+     print(f"Audio data type: {type(audio_data)}")
+     print(f"Sample rate type: {type(sample_rate)}")
+
+     # Ensure the audio data is in numpy array format
+     if isinstance(audio_data, np.ndarray):
+         # If it's already numpy, we pass it directly
+         audio_data = np.array(audio_data)
+     elif isinstance(audio_data, torch.Tensor):
+         # If it's a tensor, convert to numpy array
+         audio_data = audio_data.numpy()
+     else:
+         print("Error: Audio data is neither a numpy array nor a tensor.")
+         return "Invalid audio format"
+
+     # Now transcribe the audio
+     return asr_model.transcribe([audio_data])
+
+ # Create the Gradio interface with audio input and text output
+ iface = gr.Interface(fn=transcribe, inputs="audio", outputs="text")
+
+ # Launch the Gradio interface
+ iface.launch()
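
A note on the new `transcribe()`: Gradio's default `"audio"` input returns `(sample_rate, audio_data)` with `audio_data` as an int16 array at the microphone's native rate, while NeMo Conformer checkpoints are typically trained on 16 kHz mono float audio, and NeMo's `transcribe()` is most reliably called with a list of audio file paths. Below is a minimal preprocessing sketch, not part of the commit: the helper name `to_model_audio` is hypothetical, the 16 kHz target rate is an assumption about this checkpoint, and `soundfile` and `scipy` are assumed to be installed (neither is imported by the committed app.py).

```python
# Hypothetical helper (not in the commit): prepare Gradio audio for NeMo.
import tempfile
from math import gcd

import numpy as np
import soundfile as sf
from scipy.signal import resample_poly

TARGET_SR = 16_000  # assumed sample rate for the Conformer checkpoint


def to_model_audio(sample_rate: int, audio_data: np.ndarray) -> str:
    """Convert a Gradio (sample_rate, int16 array) pair into a 16 kHz mono WAV path."""
    # int16 PCM -> float32 in [-1, 1]
    if audio_data.dtype == np.int16:
        audio_data = audio_data.astype(np.float32) / 32768.0
    # Stereo (samples, channels) -> mono by averaging channels
    if audio_data.ndim == 2:
        audio_data = audio_data.mean(axis=1)
    # Resample to the assumed model rate
    if sample_rate != TARGET_SR:
        g = gcd(TARGET_SR, sample_rate)
        audio_data = resample_poly(audio_data, TARGET_SR // g, sample_rate // g)
    # Write to a temporary WAV so transcribe() can take a file path
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    sf.write(tmp.name, audio_data, TARGET_SR)
    return tmp.name


# Possible usage inside transcribe(), mirroring the commit's own call:
#     path = to_model_audio(sample_rate, audio_data)
#     return asr_model.transcribe([path])
```

The return format of `transcribe()` varies across NeMo versions (plain strings vs. hypothesis objects), so the result may need unpacking before it is handed back to Gradio.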