gdnartea committed on
Commit d94780b · verified · 1 Parent(s): 8c46445

Update app.py

Files changed (1)
  1. app.py +150 -17
app.py CHANGED
@@ -1,34 +1,167 @@
- from transformers import AutoTokenizer, VitsModel, set_seed
  import gradio as gr
- import torch
  import soundfile as sf


- # Initialize the VITS model, tokenizer, and seed

- vits_model = VitsModel.from_pretrained("facebook/mms-tts-eng")
- vits_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")
- set_seed(2020)


- def text_to_speech(text_response):
-     inputs = vits_tokenizer(text=text_response, return_tensors="pt")
-     with torch.no_grad():
-         outputs = vits_model(**inputs)
-     waveform = outputs.waveform[0]
-     sf.write('output.wav', waveform.numpy(), vits_model.config.sampling_rate)

-     return 'output.wav'


  # Create a Gradio interface
  iface = gr.Interface(
-     fn=text_to_speech,
-     inputs=gr.Textbox(lines=5, placeholder="Enter your text here..."),
-     outputs=gr.Audio("techno.wav")
  )

  # Launch the interface
- iface.launch()
 
+ # imports
  import gradio as gr
+ import json
+ import librosa
+ import os
  import soundfile as sf
+ import tempfile
+ import uuid
+ import torch
+ from transformers import AutoTokenizer, VitsModel, set_seed, AutoModelForCausalLM, pipeline
+
+ from nemo.collections.asr.models import ASRModel
+ from nemo.collections.asr.parts.utils.streaming_utils import FrameBatchMultiTaskAED
+ from nemo.collections.asr.parts.utils.transcribe_utils import get_buffered_pred_feat_multitaskAED
+
+
+ SAMPLE_RATE = 16000  # Hz
+ MAX_AUDIO_MINUTES = 10  # won't try to transcribe if longer than this
+
+ model = ASRModel.from_pretrained("nvidia/canary-1b")
+ model.eval()
+
+ # make sure beam size is always 1 for consistency
+ model.change_decoding_strategy(None)
+ decoding_cfg = model.cfg.decoding
+ decoding_cfg.beam.beam_size = 1
+ model.change_decoding_strategy(decoding_cfg)
+
+ # setup for buffered inference
+ model.cfg.preprocessor.dither = 0.0
+ model.cfg.preprocessor.pad_to = 0
+
+ feature_stride = model.cfg.preprocessor['window_stride']
+ model_stride_in_secs = feature_stride * 8  # 8 = model stride for FastConformer
+
+ frame_asr = FrameBatchMultiTaskAED(
+     asr_model=model,
+     frame_len=40.0,
+     total_buffer=40.0,
+     batch_size=16,
+ )
+
+ amp_dtype = torch.float16
+
+
+ def convert_audio(audio_filepath, tmpdir, utt_id):
+     """
+     Convert the input file to a mono-channel 16 kHz wav file.
+     Raise an error (without converting) if the audio is too long.
+     Returns the output filename and duration.
+     """
+
+     data, sr = librosa.load(audio_filepath, sr=None, mono=True)
+
+     duration = librosa.get_duration(y=data, sr=sr)

+     if duration / 60.0 > MAX_AUDIO_MINUTES:
+         raise gr.Error(
+             f"This demo can transcribe up to {MAX_AUDIO_MINUTES} minutes of audio. "
+             "If you wish, you may trim the audio using the Audio viewer in Step 1 "
+             "(click on the scissors icon to start trimming audio)."
+         )

+     if sr != SAMPLE_RATE:
+         data = librosa.resample(data, orig_sr=sr, target_sr=SAMPLE_RATE)

+     out_filename = os.path.join(tmpdir, utt_id + '.wav')

+     # save output audio
+     sf.write(out_filename, data, SAMPLE_RATE)

+     return out_filename, duration
+ def transcribe(audio_filepath):

+     if audio_filepath is None:
+         raise gr.Error("Please provide some input audio: either upload an audio file or use the microphone")
+
+     utt_id = uuid.uuid4()
+     with tempfile.TemporaryDirectory() as tmpdir:
+         converted_audio_filepath, duration = convert_audio(audio_filepath, tmpdir, str(utt_id))
+
+         # make manifest file and save
+         manifest_data = {
+             "audio_filepath": converted_audio_filepath,
+             "source_lang": "en",
+             "target_lang": "en",
+             "taskname": "asr",
+             "pnc": "no",
+             "answer": "predict",
+             "duration": str(duration),
+         }
+
+         manifest_filepath = os.path.join(tmpdir, f'{utt_id}.json')
+
+         with open(manifest_filepath, 'w') as fout:
+             line = json.dumps(manifest_data)
+             fout.write(line + '\n')
+
+         # call transcribe, passing in manifest filepath
+         if duration < 40:
+             output_text = model.transcribe(manifest_filepath)[0]
+         else:  # do buffered inference
+             with torch.cuda.amp.autocast(dtype=amp_dtype):  # TODO: make it work if no cuda
+                 with torch.no_grad():
+                     hyps = get_buffered_pred_feat_multitaskAED(
+                         frame_asr,
+                         model.cfg.preprocessor,
+                         model_stride_in_secs,
+                         model.device,
+                         manifest=manifest_filepath,
+                         filepaths=None,
+                     )
+
+             output_text = hyps[0].text
+
+     return output_text
+
+ torch.random.manual_seed(0)
+
+ proc_model = AutoModelForCausalLM.from_pretrained(
+     "microsoft/Phi-3-mini-4k-instruct",
+     trust_remote_code=True,
+ )
+
+ proc_model.to("cpu")
+ proc_model.eval()
+ proc_tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
+
+
+ start = {"role": "system", "content": "You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user."}
+
+
+ def generate_response(user_input):
+     messages = [start, {"role": "user", "content": user_input}]
+     inputs = proc_tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
+
+     outputs = proc_model.generate(
+         inputs,
+         max_new_tokens=48,
+     )
+
+     response = proc_tokenizer.batch_decode(
+         outputs,
+         skip_special_tokens=True,
+         clean_up_tokenization_spaces=False,
+     )[0]
+
+     return response

+ def CanaryPhi(audio_filepath):
+     user_input = transcribe(audio_filepath)
+     response = generate_response(user_input)
+     return response


  # Create a Gradio interface
  iface = gr.Interface(
+     fn=CanaryPhi,
+     inputs=gr.Audio(sources="microphone", type="filepath"),
+     outputs=gr.Textbox(),
  )

  # Launch the interface
+ iface.launch()
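
For reference, the Gradio callback above simply chains the two new stages (Canary-1b ASR, then Phi-3 generation). A minimal sketch for sanity-checking them in the same Python session, before iface.launch() is called; the file name sample.wav is an assumed placeholder for a short local recording, not part of this commit:

    # hypothetical smoke test, run before iface.launch(); sample.wav is a placeholder
    transcript = transcribe("sample.wav")    # Canary-1b ASR; audio is resampled to 16 kHz mono internally
    reply = generate_response(transcript)    # Phi-3-mini chat completion on the transcript (up to 48 new tokens)
    print(transcript)
    print(reply)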