Kevin676 committed on
Commit 43986c9 · 1 Parent(s): ec78133

Update app.py

Files changed (1)
  1. app.py +145 -41
app.py CHANGED
@@ -39,6 +39,39 @@ from scipy.io.wavfile import write, read
 
 import subprocess
 
+
+from TTS.api import TTS
+tts = TTS(model_name="tts_models/zh-CN/baker/tacotron2-DDC-GST", progress_bar=False, gpu=True)
+import whisper
+model = whisper.load_model("small")
+os.system('pip install voicefixer --upgrade')
+from voicefixer import VoiceFixer
+voicefixer = VoiceFixer()
+import openai
+import torchaudio
+from speechbrain.pretrained import SpectralMaskEnhancement
+
+enhance_model = SpectralMaskEnhancement.from_hparams(
+    source="speechbrain/metricgan-plus-voicebank",
+    savedir="pretrained_models/metricgan-plus-voicebank",
+    run_opts={"device":"cuda"},
+)
+
+mes1 = [
+    {"role": "system", "content": "You are a TOEFL examiner. Help me improve my oral English and give me feedback."}
+]
+
+mes2 = [
+    {"role": "system", "content": "You are a mental health therapist. Your name is Tina."}
+]
+
+mes3 = [
+    {"role": "system", "content": "You are my personal assistant. Your name is Alice."}
+]
+
+res = []
+
+
 '''
 from google.colab import drive
 drive.mount('/content/drive')
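
The hunk above loads Coqui TTS, Whisper, VoiceFixer, and a SpeechBrain MetricGAN+ enhancer at import time, plus the three system prompts (mes1/mes2/mes3) and the res transcript list that the rewritten voice_conversion() in the next hunk relies on. For reference, the standalone Whisper decoding pattern the new function follows looks like this (a minimal sketch; the path sample.wav is only a placeholder):

import whisper

# load the same checkpoint the app uses
model = whisper.load_model("small")

# load audio and pad/trim it to the 30-second window Whisper decodes at once
audio = whisper.load_audio("sample.wav")   # placeholder input file
audio = whisper.pad_or_trim(audio)

# log-Mel spectrogram on the model's device
mel = whisper.log_mel_spectrogram(audio).to(model.device)

# detect the spoken language, then decode to text
_, probs = model.detect_language(mel)
print(f"Detected language: {max(probs, key=probs.get)}")
result = whisper.decode(model, mel, whisper.DecodingOptions())
print(result.text)

When language detection output is not needed, model.transcribe("sample.wav")["text"] wraps the same steps in one call.
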
@@ -118,63 +151,134 @@ def compute_spec(ref_file):
     return spec
 
 
-def voice_conversion(ta, ra, da):
-
-    target_audio = 'target.wav'
-    reference_audio = 'reference.wav'
-    driving_audio = 'driving.wav'
-
-    write(target_audio, ta[0], ta[1])
-    write(reference_audio, ra[0], ra[1])
-    write(driving_audio, da[0], da[1])
+def voice_conversion(apikey, ta, audio, choice1):
+
+    openai.api_key = apikey
+
+    # load audio and pad/trim it to fit 30 seconds
+    audio = whisper.load_audio(audio)
+    audio = whisper.pad_or_trim(audio)
+
+    # make log-Mel spectrogram and move to the same device as the model
+    mel = whisper.log_mel_spectrogram(audio).to(model.device)
+
+    # detect the spoken language
+    _, probs = model.detect_language(mel)
+    print(f"Detected language: {max(probs, key=probs.get)}")
+
+    # decode the audio
+    options = whisper.DecodingOptions()
+    result = whisper.decode(model, mel, options)
+    res.append(result.text)
+
+    if choice1 == "TOEFL":
+        messages = mes1
+    elif choice1 == "Therapist":
+        messages = mes2
+    elif choice1 == "Alice":
+        messages = mes3
+
+    # chatgpt
+    n = len(res)
+    content = res[n-1]
+    messages.append({"role": "user", "content": content})
+
+    completion = openai.ChatCompletion.create(
+        model = "gpt-3.5-turbo",
+        messages = messages
+    )
+
+    chat_response = completion.choices[0].message.content
+
+    messages.append({"role": "assistant", "content": chat_response})
+
+    tts.tts_to_file(chat_response, file_path="output.wav")
+
+    target_audio = "target.wav"
+    reference_audio = "output.wav"
+    driving_audio = "output.wav"
+
+    ra = "output.wav"
+    da = "output.wav"
+
+    write(target_audio, ta[0], ta[1])
+    write(reference_audio, ra[0], ra[1])
+    write(driving_audio, da[0], da[1])
 
     # !ffmpeg-normalize $target_audio -nt rms -t=-27 -o $target_audio -ar 16000 -f
     # !ffmpeg-normalize $reference_audio -nt rms -t=-27 -o $reference_audio -ar 16000 -f
     # !ffmpeg-normalize $driving_audio -nt rms -t=-27 -o $driving_audio -ar 16000 -f
 
-    files = [target_audio, reference_audio, driving_audio]
+    files = [target_audio, reference_audio, driving_audio]
 
-    for file in files:
-        subprocess.run(["ffmpeg-normalize", file, "-nt", "rms", "-t=-27", "-o", file, "-ar", "16000", "-f"])
+    for file in files:
+        subprocess.run(["ffmpeg-normalize", file, "-nt", "rms", "-t=-27", "-o", file, "-ar", "16000", "-f"])
 
     # ta_ = read(target_audio)
 
-    target_emb = SE_speaker_manager.compute_d_vector_from_clip([target_audio])
-    target_emb = torch.FloatTensor(target_emb).unsqueeze(0)
+    target_emb = SE_speaker_manager.compute_d_vector_from_clip([target_audio])
+    target_emb = torch.FloatTensor(target_emb).unsqueeze(0)
 
-    driving_emb = SE_speaker_manager.compute_d_vector_from_clip([reference_audio])
-    driving_emb = torch.FloatTensor(driving_emb).unsqueeze(0)
+    driving_emb = SE_speaker_manager.compute_d_vector_from_clip([reference_audio])
+    driving_emb = torch.FloatTensor(driving_emb).unsqueeze(0)
 
     # Convert the voice
 
-    driving_spec = compute_spec(driving_audio)
-    y_lengths = torch.tensor([driving_spec.size(-1)])
-    if USE_CUDA:
-        ref_wav_voc, _, _ = model.voice_conversion(driving_spec.cuda(), y_lengths.cuda(), driving_emb.cuda(), target_emb.cuda())
-        ref_wav_voc = ref_wav_voc.squeeze().cpu().detach().numpy()
-    else:
-        ref_wav_voc, _, _ = model.voice_conversion(driving_spec, y_lengths, driving_emb, target_emb)
-        ref_wav_voc = ref_wav_voc.squeeze().detach().numpy()
+    driving_spec = compute_spec(driving_audio)
+    y_lengths = torch.tensor([driving_spec.size(-1)])
+    if USE_CUDA:
+        ref_wav_voc, _, _ = model.voice_conversion(driving_spec.cuda(), y_lengths.cuda(), driving_emb.cuda(), target_emb.cuda())
+        ref_wav_voc = ref_wav_voc.squeeze().cpu().detach().numpy()
+    else:
+        ref_wav_voc, _, _ = model.voice_conversion(driving_spec, y_lengths, driving_emb, target_emb)
+        ref_wav_voc = ref_wav_voc.squeeze().detach().numpy()
 
     # print("Reference Audio after decoder:")
     # IPython.display.display(Audio(ref_wav_voc, rate=ap.sample_rate))
 
-    return (ap.sample_rate, ref_wav_voc)
-
-c3 = gr.Interface(
-    fn=voice_conversion,
-    inputs=[gr.Audio(label='Target Speaker - Reference Clip'), gr.Audio(label='Input Speaker - Reference Clip'), gr.Audio(label='Input Speaker - Clip To Convert')],
-    outputs=gr.Audio(label='Target Speaker - Converted Clip'),
-    examples=[['ntr.wav', 'timcast1.wav', 'timcast1.wav']],
-    description="Use this cool tool to convert your voice to another person's! \nThe first audio input requires an audio file of the target speaker. The second and third audio inputs require audio files from the person whose voice you want to convert."
-)
-
-c1_m2 = gr.Interface(
-    fn=voice_conversion,
-    inputs=[gr.Audio(label='Target Speaker - Reference Clip'), gr.Audio(label='Input Speaker - Reference Clip', source='microphone'), gr.Audio(label='Input Speaker - Clip To Convert', source='microphone')],
-    outputs=gr.Audio(label='Target Speaker - Converted Clip'),
-    description="Use this cool tool to convert your voice to another person's! \nThe first audio input requires an audio file of the target speaker. The second and third audio inputs require live recordings from the person whose voice you want to convert."
-)
-
-demo = gr.TabbedInterface([c3, c1_m2], ["Pre-Recorded", "Microphone"], title="Voice Conversion")
-demo.launch(debug='True')
+    voicefixer.restore(input=ref_wav_voc,    # input wav file path
+                       output="audio1.wav",  # output wav file path
+                       cuda=True,            # whether to use gpu acceleration
+                       mode = 0)             # You can try out mode 0, 1, or 2 to find out the best result
+
+    noisy = enhance_model.load_audio(
+        "audio1.wav"
+    ).unsqueeze(0)
+
+    enhanced = enhance_model.enhance_batch(noisy, lengths=torch.tensor([1.]))
+    torchaudio.save("enhanced.wav", enhanced.cpu(), 16000)
+
+    return [result.text, chat_response, "enhanced.wav"]
+
+c1 = gr.Interface(
+    fn=voice_conversion,
+    inputs=[
+        gr.Textbox(lines=1, label = "请填写您的OpenAI-API-key"),
+        gr.Audio(source="upload", label = "请上传您喜欢的声音(wav文件)", type="filepath"),
+        gr.Audio(source="microphone", label = "和您的专属AI聊天吧!", type="filepath"),
+        gr.Radio(["TOEFL", "Therapist", "Alice"], label="TOEFL Examiner, Therapist Tina, or Assistant Alice?"),
+    ],
+    outputs=[
+        gr.Textbox(label="Speech to Text"), gr.Textbox(label="ChatGPT Output"), gr.Audio(label="Audio with Custom Voice"),
+    ],
+    #theme="huggingface",
+    description = "🤖 - 让有人文关怀的AI造福每一个人!AI向善,文明璀璨!TalktoAI - Enable the future!",
+)
+
+c2 = gr.Interface(
+    fn=voice_conversion,
+    inputs=[
+        gr.Textbox(lines=1, label = "请填写您的OpenAI-API-key"),
+        gr.Audio(source="microphone", label = "请上传您喜欢的声音,并尽量避免噪音", type="filepath"),
+        gr.Audio(source="microphone", label = "和您的专属AI聊天吧!", type="filepath"),
+        gr.Radio(["TOEFL", "Therapist", "Alice"], label="TOEFL Examiner, Therapist Tina, or Assistant Alice?"),
+    ],
+    outputs=[
+        gr.Textbox(label="Speech to Text"), gr.Textbox(label="ChatGPT Output"), gr.Audio(label="Audio with Custom Voice"),
+    ],
+    #theme="huggingface",
+    description = "🤖 - 让有人文关怀的AI造福每一个人!AI向善,文明璀璨!TalktoAI - Enable the future!",
+)
+
+demo = gr.TabbedInterface([c1, c2], ["wav文件上传", "麦克风上传"], title = '🥳💬💕 - TalktoAI,随时随地,谈天说地!')
+demo.launch()
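
After the d-vector voice conversion, the new code runs two clean-up passes, VoiceFixer restoration followed by SpeechBrain MetricGAN+ enhancement, and returns the enhanced file to Gradio. Below is a minimal standalone sketch of that chain. Assumptions: the helper name clean_up and the intermediate file names converted.wav / audio1.wav are illustrative only, and cuda=False is used so the sketch runs without a GPU; note also that voicefixer.restore takes a wav file path as input (as the inline comment in the diff itself says), whereas the committed code passes it the raw ref_wav_voc array, so the sketch writes the waveform to disk first.

import torch
import torchaudio
from scipy.io.wavfile import write
from voicefixer import VoiceFixer
from speechbrain.pretrained import SpectralMaskEnhancement

voicefixer = VoiceFixer()
enhance_model = SpectralMaskEnhancement.from_hparams(
    source="speechbrain/metricgan-plus-voicebank",
    savedir="pretrained_models/metricgan-plus-voicebank",
)

def clean_up(wav, sample_rate, out_path="enhanced.wav"):   # hypothetical helper
    # VoiceFixer restores from a file, so write the converted waveform to disk first
    write("converted.wav", sample_rate, wav)               # illustrative intermediate file
    voicefixer.restore(input="converted.wav", output="audio1.wav", cuda=False, mode=0)

    # MetricGAN+ enhancement expects a (batch, time) tensor plus relative lengths
    noisy = enhance_model.load_audio("audio1.wav").unsqueeze(0)
    enhanced = enhance_model.enhance_batch(noisy, lengths=torch.tensor([1.]))
    torchaudio.save(out_path, enhanced.cpu(), 16000)
    return out_path

In the app itself, the enhanced file path "enhanced.wav" is what voice_conversion() hands back to the gr.Audio output component.
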