Kevin676 committed on
Commit
b92854c
·
1 Parent(s): 351a696

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -52
app.py CHANGED
@@ -48,9 +48,6 @@ from scipy.io.wavfile import write, read
48
 
49
  import subprocess
50
 
51
- import whisper
52
- model1 = whisper.load_model("small")
53
-
54
  import openai
55
 
56
  mes = [
@@ -131,29 +128,14 @@ SE_speaker_manager = SpeakerManager(encoder_model_path=CHECKPOINT_SE_PATH, encod
131
  # Define helper function
132
 
133
 
134
- def chatgpt(apikey, audio):
135
 
136
  openai.api_key = apikey
137
 
138
- # load audio and pad/trim it to fit 30 seconds
139
- audio = whisper.load_audio(audio)
140
- audio = whisper.pad_or_trim(audio)
141
-
142
- # make log-Mel spectrogram and move to the same device as the model1
143
- mel = whisper.log_mel_spectrogram(audio).to(model1.device)
144
-
145
- # detect the spoken language
146
- _, probs = model1.detect_language(mel)
147
- print(f"Detected language: {max(probs, key=probs.get)}")
148
-
149
- # decode the audio
150
- options = whisper.DecodingOptions()
151
- result = whisper.decode(model1, mel, options)
152
-
153
  messages = mes
154
 
155
  # chatgpt
156
- content = result.text
157
  messages.append({"role": "user", "content": content})
158
 
159
  completion = openai.ChatCompletion.create(
@@ -175,57 +157,58 @@ def chatgpt(apikey, audio):
175
 
176
  write(audio_out, a1, b1)
177
 
178
- return [result.text, chat_response, audio_out]
179
 
180
  def compute_spec(ref_file):
181
- y, sr = librosa.load(ref_file, sr=ap.sample_rate)
182
- spec = ap.spectrogram(y)
183
- spec = torch.FloatTensor(spec).unsqueeze(0)
184
- return spec
185
 
186
 
187
  def voice_conversion(ta, ra, da):
188
 
189
- target_audio = 'target.wav'
190
- reference_audio = 'reference.wav'
191
- driving_audio = 'driving.wav'
 
 
 
 
192
 
193
- write(target_audio, ta[0], ta[1])
194
- write(reference_audio, ra[0], ra[1])
195
- write(driving_audio, da[0], da[1])
196
-
197
  # !ffmpeg-normalize $target_audio -nt rms -t=-27 -o $target_audio -ar 16000 -f
198
  # !ffmpeg-normalize $reference_audio -nt rms -t=-27 -o $reference_audio -ar 16000 -f
199
  # !ffmpeg-normalize $driving_audio -nt rms -t=-27 -o $driving_audio -ar 16000 -f
200
 
201
- files = [target_audio, reference_audio, driving_audio]
202
 
203
- for file in files:
204
- subprocess.run(["ffmpeg-normalize", file, "-nt", "rms", "-t=-27", "-o", file, "-ar", "16000", "-f"])
205
 
206
  # ta_ = read(target_audio)
207
 
208
- target_emb = SE_speaker_manager.compute_d_vector_from_clip([target_audio])
209
- target_emb = torch.FloatTensor(target_emb).unsqueeze(0)
210
 
211
- driving_emb = SE_speaker_manager.compute_d_vector_from_clip([reference_audio])
212
- driving_emb = torch.FloatTensor(driving_emb).unsqueeze(0)
213
 
214
  # Convert the voice
215
 
216
- driving_spec = compute_spec(driving_audio)
217
- y_lengths = torch.tensor([driving_spec.size(-1)])
218
- if USE_CUDA:
219
- ref_wav_voc, _, _ = model.voice_conversion(driving_spec.cuda(), y_lengths.cuda(), driving_emb.cuda(), target_emb.cuda())
220
- ref_wav_voc = ref_wav_voc.squeeze().cpu().detach().numpy()
221
- else:
222
- ref_wav_voc, _, _ = model.voice_conversion(driving_spec, y_lengths, driving_emb, target_emb)
223
- ref_wav_voc = ref_wav_voc.squeeze().detach().numpy()
224
 
225
  # print("Reference Audio after decoder:")
226
  # IPython.display.display(Audio(ref_wav_voc, rate=ap.sample_rate))
227
 
228
- return (ap.sample_rate, ref_wav_voc)
 
229
 
230
  block = gr.Blocks()
231
 
@@ -243,15 +226,14 @@ with block:
243
  with gr.Row().style(mobile_collapse=False, equal_height=True):
244
 
245
  inp1 = gr.components.Textbox(lines=2, label="请填写您的OpenAI-API-key")
246
- inp2 = gr.Audio(source="microphone", type="filepath",label="说些什么吧")
247
 
248
  btn = gr.Button("开始对话吧")
249
-
250
- yousay = gr.Textbox(lines=1, label="您的提问")
251
  texts = gr.Textbox(lines=2, label="ChatGPT的回答")
252
  audio_tts = gr.Audio(label="自动合成的声音")
253
 
254
- btn.click(chatgpt, [inp1, inp2], [yousay, texts, audio_tts])
255
 
256
  with gr.Box():
257
  with gr.Row().style(mobile_collapse=False, equal_height=True):
 
48
 
49
  import subprocess
50
 
 
 
 
51
  import openai
52
 
53
  mes = [
 
128
  # Define helper function
129
 
130
 
131
+ def chatgpt(apikey, result):
132
 
133
  openai.api_key = apikey
134
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
  messages = mes
136
 
137
  # chatgpt
138
+ content = result
139
  messages.append({"role": "user", "content": content})
140
 
141
  completion = openai.ChatCompletion.create(
 
157
 
158
  write(audio_out, a1, b1)
159
 
160
+ return [chat_response, audio_out]
161
 
162
  def compute_spec(ref_file):
163
+ y, sr = librosa.load(ref_file, sr=ap.sample_rate)
164
+ spec = ap.spectrogram(y)
165
+ spec = torch.FloatTensor(spec).unsqueeze(0)
166
+ return spec
167
 
168
 
169
  def voice_conversion(ta, ra, da):
170
 
171
+ target_audio = 'target.wav'
172
+ reference_audio = 'reference.wav'
173
+ driving_audio = 'driving.wav'
174
+
175
+ write(target_audio, ta[0], ta[1])
176
+ write(reference_audio, ra[0], ra[1])
177
+ write(driving_audio, da[0], da[1])
178
 
 
 
 
 
179
  # !ffmpeg-normalize $target_audio -nt rms -t=-27 -o $target_audio -ar 16000 -f
180
  # !ffmpeg-normalize $reference_audio -nt rms -t=-27 -o $reference_audio -ar 16000 -f
181
  # !ffmpeg-normalize $driving_audio -nt rms -t=-27 -o $driving_audio -ar 16000 -f
182
 
183
+ files = [target_audio, reference_audio, driving_audio]
184
 
185
+ for file in files:
186
+ subprocess.run(["ffmpeg-normalize", file, "-nt", "rms", "-t=-27", "-o", file, "-ar", "16000", "-f"])
187
 
188
  # ta_ = read(target_audio)
189
 
190
+ target_emb = SE_speaker_manager.compute_d_vector_from_clip([target_audio])
191
+ target_emb = torch.FloatTensor(target_emb).unsqueeze(0)
192
 
193
+ driving_emb = SE_speaker_manager.compute_d_vector_from_clip([reference_audio])
194
+ driving_emb = torch.FloatTensor(driving_emb).unsqueeze(0)
195
 
196
  # Convert the voice
197
 
198
+ driving_spec = compute_spec(driving_audio)
199
+ y_lengths = torch.tensor([driving_spec.size(-1)])
200
+ if USE_CUDA:
201
+ ref_wav_voc, _, _ = model.voice_conversion(driving_spec.cuda(), y_lengths.cuda(), driving_emb.cuda(), target_emb.cuda())
202
+ ref_wav_voc = ref_wav_voc.squeeze().cpu().detach().numpy()
203
+ else:
204
+ ref_wav_voc, _, _ = model.voice_conversion(driving_spec, y_lengths, driving_emb, target_emb)
205
+ ref_wav_voc = ref_wav_voc.squeeze().detach().numpy()
206
 
207
  # print("Reference Audio after decoder:")
208
  # IPython.display.display(Audio(ref_wav_voc, rate=ap.sample_rate))
209
 
210
+ return (ap.sample_rate, ref_wav_voc)
211
+
212
 
213
  block = gr.Blocks()
214
 
 
226
  with gr.Row().style(mobile_collapse=False, equal_height=True):
227
 
228
  inp1 = gr.components.Textbox(lines=2, label="请填写您的OpenAI-API-key")
229
+ inp2 = gr.components.Textbox(lines=2, label="说些什么吧")
230
 
231
  btn = gr.Button("开始对话吧")
232
+
 
233
  texts = gr.Textbox(lines=2, label="ChatGPT的回答")
234
  audio_tts = gr.Audio(label="自动合成的声音")
235
 
236
+ btn.click(chatgpt, [inp1, inp2], [texts, audio_tts])
237
 
238
  with gr.Box():
239
  with gr.Row().style(mobile_collapse=False, equal_height=True):