Kevin676 committed on
Commit
b92854c
·
1 Parent(s): 351a696

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -52
app.py CHANGED
@@ -48,9 +48,6 @@ from scipy.io.wavfile import write, read
48
 
49
  import subprocess
50
 
51
- import whisper
52
- model1 = whisper.load_model("small")
53
-
54
  import openai
55
 
56
  mes = [
@@ -131,29 +128,14 @@ SE_speaker_manager = SpeakerManager(encoder_model_path=CHECKPOINT_SE_PATH, encod
131
  # Define helper function
132
 
133
 
134
- def chatgpt(apikey, audio):
135
 
136
  openai.api_key = apikey
137
 
138
- # load audio and pad/trim it to fit 30 seconds
139
- audio = whisper.load_audio(audio)
140
- audio = whisper.pad_or_trim(audio)
141
-
142
- # make log-Mel spectrogram and move to the same device as the model1
143
- mel = whisper.log_mel_spectrogram(audio).to(model1.device)
144
-
145
- # detect the spoken language
146
- _, probs = model1.detect_language(mel)
147
- print(f"Detected language: {max(probs, key=probs.get)}")
148
-
149
- # decode the audio
150
- options = whisper.DecodingOptions()
151
- result = whisper.decode(model1, mel, options)
152
-
153
  messages = mes
154
 
155
  # chatgpt
156
- content = result.text
157
  messages.append({"role": "user", "content": content})
158
 
159
  completion = openai.ChatCompletion.create(
@@ -175,57 +157,58 @@ def chatgpt(apikey, audio):
175
 
176
  write(audio_out, a1, b1)
177
 
178
- return [result.text, chat_response, audio_out]
179
 
180
  def compute_spec(ref_file):
181
- y, sr = librosa.load(ref_file, sr=ap.sample_rate)
182
- spec = ap.spectrogram(y)
183
- spec = torch.FloatTensor(spec).unsqueeze(0)
184
- return spec
185
 
186
 
187
  def voice_conversion(ta, ra, da):
188
 
189
- target_audio = 'target.wav'
190
- reference_audio = 'reference.wav'
191
- driving_audio = 'driving.wav'
 
 
 
 
192
 
193
- write(target_audio, ta[0], ta[1])
194
- write(reference_audio, ra[0], ra[1])
195
- write(driving_audio, da[0], da[1])
196
-
197
  # !ffmpeg-normalize $target_audio -nt rms -t=-27 -o $target_audio -ar 16000 -f
198
  # !ffmpeg-normalize $reference_audio -nt rms -t=-27 -o $reference_audio -ar 16000 -f
199
  # !ffmpeg-normalize $driving_audio -nt rms -t=-27 -o $driving_audio -ar 16000 -f
200
 
201
- files = [target_audio, reference_audio, driving_audio]
202
 
203
- for file in files:
204
- subprocess.run(["ffmpeg-normalize", file, "-nt", "rms", "-t=-27", "-o", file, "-ar", "16000", "-f"])
205
 
206
  # ta_ = read(target_audio)
207
 
208
- target_emb = SE_speaker_manager.compute_d_vector_from_clip([target_audio])
209
- target_emb = torch.FloatTensor(target_emb).unsqueeze(0)
210
 
211
- driving_emb = SE_speaker_manager.compute_d_vector_from_clip([reference_audio])
212
- driving_emb = torch.FloatTensor(driving_emb).unsqueeze(0)
213
 
214
  # Convert the voice
215
 
216
- driving_spec = compute_spec(driving_audio)
217
- y_lengths = torch.tensor([driving_spec.size(-1)])
218
- if USE_CUDA:
219
- ref_wav_voc, _, _ = model.voice_conversion(driving_spec.cuda(), y_lengths.cuda(), driving_emb.cuda(), target_emb.cuda())
220
- ref_wav_voc = ref_wav_voc.squeeze().cpu().detach().numpy()
221
- else:
222
- ref_wav_voc, _, _ = model.voice_conversion(driving_spec, y_lengths, driving_emb, target_emb)
223
- ref_wav_voc = ref_wav_voc.squeeze().detach().numpy()
224
 
225
  # print("Reference Audio after decoder:")
226
  # IPython.display.display(Audio(ref_wav_voc, rate=ap.sample_rate))
227
 
228
- return (ap.sample_rate, ref_wav_voc)
 
229
 
230
  block = gr.Blocks()
231
 
@@ -243,15 +226,14 @@ with block:
243
  with gr.Row().style(mobile_collapse=False, equal_height=True):
244
 
245
  inp1 = gr.components.Textbox(lines=2, label="请填写您的OpenAI-API-key")
246
- inp2 = gr.Audio(source="microphone", type="filepath",label="说些什么吧")
247
 
248
  btn = gr.Button("开始对话吧")
249
-
250
- yousay = gr.Textbox(lines=1, label="您的提问")
251
  texts = gr.Textbox(lines=2, label="ChatGPT的回答")
252
  audio_tts = gr.Audio(label="自动合成的声音")
253
 
254
- btn.click(chatgpt, [inp1, inp2], [yousay, texts, audio_tts])
255
 
256
  with gr.Box():
257
  with gr.Row().style(mobile_collapse=False, equal_height=True):
 
48
 
49
  import subprocess
50
 
 
 
 
51
  import openai
52
 
53
  mes = [
 
128
  # Define helper function
129
 
130
 
131
+ def chatgpt(apikey, result):
132
 
133
  openai.api_key = apikey
134
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
  messages = mes
136
 
137
  # chatgpt
138
+ content = result
139
  messages.append({"role": "user", "content": content})
140
 
141
  completion = openai.ChatCompletion.create(
 
157
 
158
  write(audio_out, a1, b1)
159
 
160
+ return [chat_response, audio_out]
161
 
162
  def compute_spec(ref_file):
163
+ y, sr = librosa.load(ref_file, sr=ap.sample_rate)
164
+ spec = ap.spectrogram(y)
165
+ spec = torch.FloatTensor(spec).unsqueeze(0)
166
+ return spec
167
 
168
 
169
  def voice_conversion(ta, ra, da):
170
 
171
+ target_audio = 'target.wav'
172
+ reference_audio = 'reference.wav'
173
+ driving_audio = 'driving.wav'
174
+
175
+ write(target_audio, ta[0], ta[1])
176
+ write(reference_audio, ra[0], ra[1])
177
+ write(driving_audio, da[0], da[1])
178
 
 
 
 
 
179
  # !ffmpeg-normalize $target_audio -nt rms -t=-27 -o $target_audio -ar 16000 -f
180
  # !ffmpeg-normalize $reference_audio -nt rms -t=-27 -o $reference_audio -ar 16000 -f
181
  # !ffmpeg-normalize $driving_audio -nt rms -t=-27 -o $driving_audio -ar 16000 -f
182
 
183
+ files = [target_audio, reference_audio, driving_audio]
184
 
185
+ for file in files:
186
+ subprocess.run(["ffmpeg-normalize", file, "-nt", "rms", "-t=-27", "-o", file, "-ar", "16000", "-f"])
187
 
188
  # ta_ = read(target_audio)
189
 
190
+ target_emb = SE_speaker_manager.compute_d_vector_from_clip([target_audio])
191
+ target_emb = torch.FloatTensor(target_emb).unsqueeze(0)
192
 
193
+ driving_emb = SE_speaker_manager.compute_d_vector_from_clip([reference_audio])
194
+ driving_emb = torch.FloatTensor(driving_emb).unsqueeze(0)
195
 
196
  # Convert the voice
197
 
198
+ driving_spec = compute_spec(driving_audio)
199
+ y_lengths = torch.tensor([driving_spec.size(-1)])
200
+ if USE_CUDA:
201
+ ref_wav_voc, _, _ = model.voice_conversion(driving_spec.cuda(), y_lengths.cuda(), driving_emb.cuda(), target_emb.cuda())
202
+ ref_wav_voc = ref_wav_voc.squeeze().cpu().detach().numpy()
203
+ else:
204
+ ref_wav_voc, _, _ = model.voice_conversion(driving_spec, y_lengths, driving_emb, target_emb)
205
+ ref_wav_voc = ref_wav_voc.squeeze().detach().numpy()
206
 
207
  # print("Reference Audio after decoder:")
208
  # IPython.display.display(Audio(ref_wav_voc, rate=ap.sample_rate))
209
 
210
+ return (ap.sample_rate, ref_wav_voc)
211
+
212
 
213
  block = gr.Blocks()
214
 
 
226
  with gr.Row().style(mobile_collapse=False, equal_height=True):
227
 
228
  inp1 = gr.components.Textbox(lines=2, label="请填写您的OpenAI-API-key")
229
+ inp2 = gr.components.Textbox(lines=2, label="说些什么吧")
230
 
231
  btn = gr.Button("开始对话吧")
232
+
 
233
  texts = gr.Textbox(lines=2, label="ChatGPT的回答")
234
  audio_tts = gr.Audio(label="自动合成的声音")
235
 
236
+ btn.click(chatgpt, [inp1, inp2], [texts, audio_tts])
237
 
238
  with gr.Box():
239
  with gr.Row().style(mobile_collapse=False, equal_height=True):