camanalo1 committed on
Commit 841bb81 · 1 Parent(s): 63033c7

Update app.py

Files changed (1)
  1. app.py +18 -15
app.py CHANGED
@@ -1,21 +1,24 @@
 import gradio as gr
-from nemo.collections.asr.models import ASRModel
+import torchaudio
+import torch
+import transformers
 
-# Load the ASR model
-model = ASRModel.from_pretrained("nvidia/canary-1b")
+transformer = transformers.Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
+processor = transformers.Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
 
-# Define a function to transcribe audio from the microphone
-def transcribe_audio(audio):
-    # Perform transcription
-    transcription = model.transcribe([audio])[0]
+def speech_to_text(audio):
+    # Convert audio to torch tensor
+    waveform, _ = torchaudio.load(audio.name)
+    input_values = processor(waveform, return_tensors="pt").input_values
+
+    # Perform inference
+    logits = transformer(input_values).logits
+    predicted_ids = torch.argmax(logits, dim=-1)
+    transcription = processor.batch_decode(predicted_ids)[0]
+
     return transcription
 
-# Interface with microphone input and text output
-inputs = gr.inputs.Microphone(label="Speak into the microphone")
-outputs = gr.outputs.Textbox(label="Transcription")
-title = "Speech-to-Text Transcription"
-description = "Transcribe speech from the microphone using the NeMo Canary ASR model."
-interface = gr.Interface(transcribe_audio, inputs, outputs, title=title, description=description)
+audio_input = gr.inputs.Audio(source="microphone", type="file", label="Record your voice:")
+text_output = gr.outputs.Text(label="Transcription")
 
-# Launch the interface
-interface.launch()
+gr.Interface(fn=speech_to_text, inputs=audio_input, outputs=text_output, title="Speech-to-Text").launch(inline=True)
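
Note on the new pipeline: facebook/wav2vec2-base-960h is trained on 16 kHz, single-channel speech, while microphone recordings are often captured at 44.1 or 48 kHz in stereo, so a mixdown and resampling step before the processor call is a natural companion to the code above. A minimal sketch under that assumption; the prepare_waveform helper and its name are illustrative and not part of this commit:

import torch
import torchaudio

def prepare_waveform(path, target_rate=16000):
    # Load audio from disk: waveform has shape (channels, samples).
    waveform, sample_rate = torchaudio.load(path)
    # Mix down to mono if the recording has more than one channel.
    if waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    # Resample to the rate the Wav2Vec2 checkpoint was trained on.
    if sample_rate != target_rate:
        waveform = torchaudio.functional.resample(waveform, sample_rate, target_rate)
    # Return a 1-D tensor suitable for the feature extractor.
    return waveform.squeeze(0)

The returned tensor can then be passed as processor(waveform, sampling_rate=16000, return_tensors="pt") so the feature extractor is told the sample rate explicitly.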