Update app.py
Browse files
app.py
CHANGED
@@ -1,42 +1,24 @@
|
|
1 |
import gradio as gr
|
2 |
-
import
|
3 |
-
from transformers import pipeline
|
4 |
-
from nemo.collections.asr.models import EncDecMultiTaskModel # Add this import statement
|
5 |
|
6 |
-
# Load
|
7 |
canary_model = EncDecMultiTaskModel.from_pretrained('nvidia/canary-1b')
|
8 |
-
decode_cfg = canary_model.cfg.decoding
|
9 |
-
decode_cfg.beam.beam_size = 1
|
10 |
-
canary_model.change_decoding_strategy(decode_cfg)
|
11 |
-
|
12 |
-
# Load Phi-3 Mini-4K-Instruct LLM model
|
13 |
-
phi_3_model_id = "microsoft/Phi-3-mini-4k-instruct"
|
14 |
-
phi_3_pipeline = pipeline("text-generation", model=phi_3_model_id, trust_remote_code=True)
|
15 |
-
|
16 |
-
# Load VITS TTS model
|
17 |
-
vits_tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
|
18 |
-
vits_model = VitsModel.from_pretrained("facebook/mms-tts-eng")
|
19 |
|
|
|
20 |
def transcribe_audio(audio):
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
return
|
27 |
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
|
35 |
-
#
|
36 |
-
|
37 |
-
fn=[transcribe_audio, generate_response, synthesize_speech],
|
38 |
-
inputs=["audio", "text", "text"],
|
39 |
-
outputs=[gr.outputs.Textbox(label="Transcribed Text"),
|
40 |
-
gr.outputs.Textbox(label="Generated Response"),
|
41 |
-
gr.outputs.Audio(label="Synthesized Speech")]
|
42 |
-
).launch()
|
|
|
1 |
import gradio as gr
|
2 |
+
from nemo.collections.asr.models import EncDecMultiTaskModel
|
|
|
|
|
# Load the model
# Fetches NVIDIA's Canary-1B multi-task ASR model via NeMo's
# from_pretrained hook.  NOTE(review): this downloads the checkpoint on
# first run and happens at import time (module top level) — presumably
# intentional for a single-purpose Gradio app, but confirm.
canary_model = EncDecMultiTaskModel.from_pretrained('nvidia/canary-1b')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
|
# Define ASR function
def transcribe_audio(audio):
    """Transcribe one recorded audio clip with the Canary ASR model.

    Parameters
    ----------
    audio : str, file-like, or None
        Path to the recorded audio file, or an object exposing a
        ``.name`` attribute pointing at it.  Gradio passes either form
        depending on the Audio component's ``type=`` setting, and passes
        ``None`` when the user submits without recording.

    Returns
    -------
    str
        The transcription of the clip, or an empty string when no audio
        was provided.
    """
    # Guard: gradio hands the callback None if nothing was recorded.
    if audio is None:
        return ""
    # Accept both a plain filepath (Audio type="filepath") and a
    # tempfile-like object carrying the path in .name (type="file") —
    # the original `audio.name` raised AttributeError on a plain str.
    audio_path = audio if isinstance(audio, str) else audio.name
    # Perform transcription
    predicted_text = canary_model.transcribe(
        paths2audio_files=[audio_path],
        batch_size=16,  # Batch size for inference
    )
    # transcribe() returns a list of hypotheses, one per input file.
    return predicted_text[0]
|
15 |
|
16 |
+
# Interface
|
17 |
+
inputs = gr.inputs.Audio(source="microphone", label="Speak into the microphone", type="microphone")
|
18 |
+
outputs = gr.outputs.Textbox(label="Transcription")
|
19 |
+
title = "Canary ASR"
|
20 |
+
description = "Transcribe speech from the microphone using the NeMo Canary ASR model."
|
21 |
+
interface = gr.Interface(transcribe_audio, inputs, outputs, title=title, description=description)
|
22 |
|
23 |
+
# Launch interface
|
24 |
+
interface.launch()
|
|
|
|
|
|
|
|
|
|
|
|