camanalo1 committed
Commit c9adb84 · 1 Parent(s): 97fde10

Update app.py

Files changed (1):
  1. app.py +17 -35
app.py CHANGED
@@ -1,42 +1,24 @@
 import gradio as gr
-import torch
-from transformers import pipeline
-from nemo.collections.asr.models import EncDecMultiTaskModel # Add this import statement
+from nemo.collections.asr.models import EncDecMultiTaskModel
 
-# Load Canary ASR model
+# Load the model
 canary_model = EncDecMultiTaskModel.from_pretrained('nvidia/canary-1b')
-decode_cfg = canary_model.cfg.decoding
-decode_cfg.beam.beam_size = 1
-canary_model.change_decoding_strategy(decode_cfg)
-
-# Load Phi-3 Mini-4K-Instruct LLM model
-phi_3_model_id = "microsoft/Phi-3-mini-4k-instruct"
-phi_3_pipeline = pipeline("text-generation", model=phi_3_model_id, trust_remote_code=True)
-
-# Load VITS TTS model
-vits_tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
-vits_model = VitsModel.from_pretrained("facebook/mms-tts-eng")
 
+# Define ASR function
 def transcribe_audio(audio):
-    transcribed_text = canary_model.transcribe(audio, batch_size=16)
-    return transcribed_text
-
-def generate_response(prompt):
-    response = phi_3_pipeline(prompt, max_length=50, num_return_sequences=1)[0]['generated_text']
-    return response
+    # Perform transcription
+    predicted_text = canary_model.transcribe(
+        paths2audio_files=[audio.name],
+        batch_size=16  # Batch size for inference
+    )
+    return predicted_text[0]
 
-def synthesize_speech(text):
-    inputs = vits_tokenizer(text=text, return_tensors="pt")
-    with torch.no_grad():
-        outputs = vits_model(**inputs)
-    waveform = outputs.waveform[0]
-    return waveform
+# Interface
+inputs = gr.inputs.Audio(source="microphone", label="Speak into the microphone", type="microphone")
+outputs = gr.outputs.Textbox(label="Transcription")
+title = "Canary ASR"
+description = "Transcribe speech from the microphone using the NeMo Canary ASR model."
+interface = gr.Interface(transcribe_audio, inputs, outputs, title=title, description=description)
 
-# Define Gradio interface
-gr.Interface(
-    fn=[transcribe_audio, generate_response, synthesize_speech],
-    inputs=["audio", "text", "text"],
-    outputs=[gr.outputs.Textbox(label="Transcribed Text"),
-             gr.outputs.Textbox(label="Generated Response"),
-             gr.outputs.Audio(label="Synthesized Speech")]
-).launch()
+# Launch interface
+interface.launch()
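
Note: the committed code targets the Gradio 3.x-era gr.inputs / gr.outputs modules, which were removed in Gradio 4, and audio.name assumes Gradio hands the function a file-like object. Below is a minimal sketch of the same app against the current Gradio API; it keeps the paths2audio_files= NeMo signature used in the commit (newer NeMo releases accept audio= instead), so treat the exact keyword as version-dependent.

import gradio as gr
from nemo.collections.asr.models import EncDecMultiTaskModel

# Load the model once at startup
canary_model = EncDecMultiTaskModel.from_pretrained('nvidia/canary-1b')

def transcribe_audio(audio_path):
    # With type="filepath", Gradio passes a plain path string,
    # so no .name attribute access is needed.
    predicted_text = canary_model.transcribe(
        paths2audio_files=[audio_path],  # keyword as in the commit; newer NeMo uses audio=
        batch_size=16,
    )
    return predicted_text[0]

interface = gr.Interface(
    fn=transcribe_audio,
    inputs=gr.Audio(sources=["microphone"], type="filepath", label="Speak into the microphone"),
    outputs=gr.Textbox(label="Transcription"),
    title="Canary ASR",
    description="Transcribe speech from the microphone using the NeMo Canary ASR model.",
)

interface.launch()

Passing type="filepath" keeps the handler decoupled from Gradio's file objects: the same transcribe_audio function would also work if the input were switched to an uploaded file.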