camanalo1 commited on
Commit
13268f4
·
1 Parent(s): e2721c4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -19
app.py CHANGED
@@ -1,24 +1,23 @@
1
  import gradio as gr
2
- import torchaudio
3
  import torch
4
- import transformers
5
 
6
- transformer = transformers.Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
7
- processor = transformers.Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
8
 
9
- def speech_to_text(audio):
10
- # Convert audio to torch tensor
11
- waveform, _ = torchaudio.load(audio.name)
12
- input_values = processor(waveform, return_tensors="pt").input_values
 
 
 
13
 
14
- # Perform inference
15
- logits = transformer(input_values).logits
16
- predicted_ids = torch.argmax(logits, dim=-1)
17
- transcription = processor.batch_decode(predicted_ids)[0]
18
-
19
- return transcription
20
-
21
- audio_input = gr.inputs.Audio(source="microphone", type="file", label="Record your voice:")
22
- text_output = gr.outputs.Text(label="Transcription")
23
-
24
- gr.Interface(fn=speech_to_text, inputs=audio_input, outputs=text_output, title="Speech-to-Text").launch(inline=True)
 
1
  import gradio as gr
 
2
  import torch
3
+ from transformers import pipeline
4
 
5
# Load NVIDIA's Canary ASR model as a Hugging Face pipeline.
# NOTE: the registered task name is "automatic-speech-recognition";
# "speech-recognition" is not a valid pipeline task and raises a KeyError
# at startup.
asr_pipeline = pipeline("automatic-speech-recognition", model="nvidia/canary-1b")
7
 
8
def transcribe_audio(audio):
    """Transcribe a recorded audio clip to text.

    Parameters
    ----------
    audio : str | None
        Filepath of the recorded clip. Gradio passes the input component's
        value as the function argument — the function must accept it rather
        than trying to read the microphone itself (``gr.audio_input()`` is
        not a real Gradio API). ``None`` when the user submits without
        recording.

    Returns
    -------
    str
        The transcribed text (empty string when no audio was provided).
    """
    # Guard: submitting with no recording yields None.
    if audio is None:
        return ""
    # An ASR pipeline returns a dict like {"text": "..."} — not a list of
    # {"transcription": ...} records, so index the "text" key directly.
    result = asr_pipeline(audio)
    return result["text"]
15
 
16
# Build and launch the Gradio UI.
# NOTE(review): ``gr.inputs.Microphone`` does not exist — the legacy
# ``gr.inputs`` namespace is deprecated/removed, and ``Microphone`` was never
# a component. A microphone recorder is ``gr.Audio(sources=["microphone"])``;
# ``type="filepath"`` makes Gradio hand the callback a path string, which is
# what the ASR pipeline expects.
gr.Interface(
    fn=transcribe_audio,
    inputs=gr.Audio(
        sources=["microphone"],
        type="filepath",
        label="Speak into the microphone",
    ),
    outputs="text",
    title="Speech-to-Text with NeMo Canary Model",
    description="Speak into the microphone and see the text transcription.",
).launch()