Kevin676 committed on
Commit 43986c9 · 1 Parent(s): ec78133

Update app.py

Files changed (1)
  1. app.py +145 -41
app.py CHANGED
@@ -39,6 +39,39 @@ from scipy.io.wavfile import write, read
 
 import subprocess
 
+
+from TTS.api import TTS
+tts = TTS(model_name="tts_models/zh-CN/baker/tacotron2-DDC-GST", progress_bar=False, gpu=True)
+import whisper
+model = whisper.load_model("small")
+os.system('pip install voicefixer --upgrade')
+from voicefixer import VoiceFixer
+voicefixer = VoiceFixer()
+import openai
+import torchaudio
+from speechbrain.pretrained import SpectralMaskEnhancement
+
+enhance_model = SpectralMaskEnhancement.from_hparams(
+    source="speechbrain/metricgan-plus-voicebank",
+    savedir="pretrained_models/metricgan-plus-voicebank",
+    run_opts={"device":"cuda"},
+)
+
+mes1 = [
+    {"role": "system", "content": "You are a TOEFL examiner. Help me improve my oral English and give me feedback."}
+]
+
+mes2 = [
+    {"role": "system", "content": "You are a mental health therapist. Your name is Tina."}
+]
+
+mes3 = [
+    {"role": "system", "content": "You are my personal assistant. Your name is Alice."}
+]
+
+res = []
+
+
 '''
 from google.colab import drive
 drive.mount('/content/drive')
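
The hunk above loads Coqui TTS, Whisper, VoiceFixer, and a SpeechBrain MetricGAN+ enhancer at import time, plus the three system prompts (mes1/mes2/mes3) and the res transcript list that the rewritten voice_conversion() in the next hunk relies on. For reference, the standalone Whisper decoding pattern the new function follows looks like this (a minimal sketch; the path sample.wav is only a placeholder):

import whisper

# load the same checkpoint the app uses
model = whisper.load_model("small")

# load audio and pad/trim it to the 30-second window Whisper decodes at once
audio = whisper.load_audio("sample.wav")   # placeholder input file
audio = whisper.pad_or_trim(audio)

# log-Mel spectrogram on the model's device
mel = whisper.log_mel_spectrogram(audio).to(model.device)

# detect the spoken language, then decode to text
_, probs = model.detect_language(mel)
print(f"Detected language: {max(probs, key=probs.get)}")
result = whisper.decode(model, mel, whisper.DecodingOptions())
print(result.text)

When language detection output is not needed, model.transcribe("sample.wav")["text"] wraps the same steps in one call.
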
@@ -118,63 +151,134 @@ def compute_spec(ref_file):
     return spec
 
 
-def voice_conversion(ta, ra, da):
-
-    target_audio = 'target.wav'
-    reference_audio = 'reference.wav'
-    driving_audio = 'driving.wav'
-
-    write(target_audio, ta[0], ta[1])
-    write(reference_audio, ra[0], ra[1])
-    write(driving_audio, da[0], da[1])
+def voice_conversion(apikey, ta, audio, choice1):
+
+    openai.api_key = apikey
+
+    # load audio and pad/trim it to fit 30 seconds
+    audio = whisper.load_audio(audio)
+    audio = whisper.pad_or_trim(audio)
+
+    # make log-Mel spectrogram and move to the same device as the model
+    mel = whisper.log_mel_spectrogram(audio).to(model.device)
+
+    # detect the spoken language
+    _, probs = model.detect_language(mel)
+    print(f"Detected language: {max(probs, key=probs.get)}")
+
+    # decode the audio
+    options = whisper.DecodingOptions()
+    result = whisper.decode(model, mel, options)
+    res.append(result.text)
+
+    if choice1 == "TOEFL":
+        messages = mes1
+    elif choice1 == "Therapist":
+        messages = mes2
+    elif choice1 == "Alice":
+        messages = mes3
+
+    # chatgpt
+    n = len(res)
+    content = res[n-1]
+    messages.append({"role": "user", "content": content})
+
+    completion = openai.ChatCompletion.create(
+        model = "gpt-3.5-turbo",
+        messages = messages
+    )
+
+    chat_response = completion.choices[0].message.content
+
+    messages.append({"role": "assistant", "content": chat_response})
+
+    tts.tts_to_file(chat_response, file_path="output.wav")
+
+    target_audio = "target.wav"
+    reference_audio = "output.wav"
+    driving_audio = "output.wav"
+
+    ra = "output.wav"
+    da = "output.wav"
+
+    write(target_audio, ta[0], ta[1])
+    write(reference_audio, ra[0], ra[1])
+    write(driving_audio, da[0], da[1])
 
     # !ffmpeg-normalize $target_audio -nt rms -t=-27 -o $target_audio -ar 16000 -f
     # !ffmpeg-normalize $reference_audio -nt rms -t=-27 -o $reference_audio -ar 16000 -f
     # !ffmpeg-normalize $driving_audio -nt rms -t=-27 -o $driving_audio -ar 16000 -f
 
-    files = [target_audio, reference_audio, driving_audio]
+    files = [target_audio, reference_audio, driving_audio]
 
-    for file in files:
-        subprocess.run(["ffmpeg-normalize", file, "-nt", "rms", "-t=-27", "-o", file, "-ar", "16000", "-f"])
+    for file in files:
+        subprocess.run(["ffmpeg-normalize", file, "-nt", "rms", "-t=-27", "-o", file, "-ar", "16000", "-f"])
 
     # ta_ = read(target_audio)
 
-    target_emb = SE_speaker_manager.compute_d_vector_from_clip([target_audio])
-    target_emb = torch.FloatTensor(target_emb).unsqueeze(0)
+    target_emb = SE_speaker_manager.compute_d_vector_from_clip([target_audio])
+    target_emb = torch.FloatTensor(target_emb).unsqueeze(0)
 
-    driving_emb = SE_speaker_manager.compute_d_vector_from_clip([reference_audio])
-    driving_emb = torch.FloatTensor(driving_emb).unsqueeze(0)
+    driving_emb = SE_speaker_manager.compute_d_vector_from_clip([reference_audio])
+    driving_emb = torch.FloatTensor(driving_emb).unsqueeze(0)
 
     # Convert the voice
 
-    driving_spec = compute_spec(driving_audio)
-    y_lengths = torch.tensor([driving_spec.size(-1)])
-    if USE_CUDA:
-        ref_wav_voc, _, _ = model.voice_conversion(driving_spec.cuda(), y_lengths.cuda(), driving_emb.cuda(), target_emb.cuda())
-        ref_wav_voc = ref_wav_voc.squeeze().cpu().detach().numpy()
-    else:
-        ref_wav_voc, _, _ = model.voice_conversion(driving_spec, y_lengths, driving_emb, target_emb)
-        ref_wav_voc = ref_wav_voc.squeeze().detach().numpy()
+    driving_spec = compute_spec(driving_audio)
+    y_lengths = torch.tensor([driving_spec.size(-1)])
+    if USE_CUDA:
+        ref_wav_voc, _, _ = model.voice_conversion(driving_spec.cuda(), y_lengths.cuda(), driving_emb.cuda(), target_emb.cuda())
+        ref_wav_voc = ref_wav_voc.squeeze().cpu().detach().numpy()
+    else:
+        ref_wav_voc, _, _ = model.voice_conversion(driving_spec, y_lengths, driving_emb, target_emb)
+        ref_wav_voc = ref_wav_voc.squeeze().detach().numpy()
 
     # print("Reference Audio after decoder:")
     # IPython.display.display(Audio(ref_wav_voc, rate=ap.sample_rate))
 
-    return (ap.sample_rate, ref_wav_voc)
-
-c3 = gr.Interface(
-    fn=voice_conversion,
-    inputs=[gr.Audio(label='Target Speaker - Reference Clip'), gr.Audio(label='Input Speaker - Reference Clip'), gr.Audio(label='Input Speaker - Clip To Convert')],
-    outputs=gr.Audio(label='Target Speaker - Converted Clip'),
-    examples=[['ntr.wav', 'timcast1.wav', 'timcast1.wav']],
-    description="Use this cool tool to convert your voice to another person's! \nThe first audio input requires an audio file of the target speaker. The second and third audio inputs require audio files from the person whose voice you want to convert."
-)
-
-c1_m2 = gr.Interface(
-    fn=voice_conversion,
-    inputs=[gr.Audio(label='Target Speaker - Reference Clip'), gr.Audio(label='Input Speaker - Reference Clip', source='microphone'), gr.Audio(label='Input Speaker - Clip To Convert', source='microphone')],
-    outputs=gr.Audio(label='Target Speaker - Converted Clip'),
-    description="Use this cool tool to convert your voice to another person's! \nThe first audio input requires an audio file of the target speaker. The second and third audio inputs require live recordings from the person whose voice you want to convert."
-)
-
-demo = gr.TabbedInterface([c3, c1_m2], ["Pre-Recorded", "Microphone"], title="Voice Conversion")
-demo.launch(debug='True')
+    voicefixer.restore(input=ref_wav_voc,    # input wav file path
+                       output="audio1.wav",  # output wav file path
+                       cuda=True,            # whether to use gpu acceleration
+                       mode = 0)             # You can try out mode 0, 1, or 2 to find out the best result
+
+    noisy = enhance_model.load_audio(
+        "audio1.wav"
+    ).unsqueeze(0)
+
+    enhanced = enhance_model.enhance_batch(noisy, lengths=torch.tensor([1.]))
+    torchaudio.save("enhanced.wav", enhanced.cpu(), 16000)
+
+    return [result.text, chat_response, "enhanced.wav"]
+
+c1 = gr.Interface(
+    fn=voice_conversion,
+    inputs=[
+        gr.Textbox(lines=1, label = "请填写您的OpenAI-API-key"),
+        gr.Audio(source="upload", label = "请上传您喜欢的声音(wav文件)", type="filepath"),
+        gr.Audio(source="microphone", label = "和您的专属AI聊天吧!", type="filepath"),
+        gr.Radio(["TOEFL", "Therapist", "Alice"], label="TOEFL Examiner, Therapist Tina, or Assistant Alice?"),
+    ],
+    outputs=[
+        gr.Textbox(label="Speech to Text"), gr.Textbox(label="ChatGPT Output"), gr.Audio(label="Audio with Custom Voice"),
+    ],
+    #theme="huggingface",
+    description = "🤖 - 让有人文关怀的AI造福每一个人!AI向善,文明璀璨!TalktoAI - Enable the future!",
+)
+
+c2 = gr.Interface(
+    fn=voice_conversion,
+    inputs=[
+        gr.Textbox(lines=1, label = "请填写您的OpenAI-API-key"),
+        gr.Audio(source="microphone", label = "请上传您喜欢的声音,并尽量避免噪音", type="filepath"),
+        gr.Audio(source="microphone", label = "和您的专属AI聊天吧!", type="filepath"),
+        gr.Radio(["TOEFL", "Therapist", "Alice"], label="TOEFL Examiner, Therapist Tina, or Assistant Alice?"),
+    ],
+    outputs=[
+        gr.Textbox(label="Speech to Text"), gr.Textbox(label="ChatGPT Output"), gr.Audio(label="Audio with Custom Voice"),
+    ],
+    #theme="huggingface",
+    description = "🤖 - 让有人文关怀的AI造福每一个人!AI向善,文明璀璨!TalktoAI - Enable the future!",
+)
+
+demo = gr.TabbedInterface([c1, c2], ["wav文件上传", "麦克风上传"], title = '🥳💬💕 - TalktoAI,随时随地,谈天说地!')
+demo.launch()
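
After the d-vector voice conversion, the new code runs two clean-up passes, VoiceFixer restoration followed by SpeechBrain MetricGAN+ enhancement, and returns the enhanced file to Gradio. Below is a minimal standalone sketch of that chain. Assumptions: the helper name clean_up and the intermediate file names converted.wav / audio1.wav are illustrative only, and cuda=False is used so the sketch runs without a GPU; note also that voicefixer.restore takes a wav file path as input (as the inline comment in the diff itself says), whereas the committed code passes it the raw ref_wav_voc array, so the sketch writes the waveform to disk first.

import torch
import torchaudio
from scipy.io.wavfile import write
from voicefixer import VoiceFixer
from speechbrain.pretrained import SpectralMaskEnhancement

voicefixer = VoiceFixer()
enhance_model = SpectralMaskEnhancement.from_hparams(
    source="speechbrain/metricgan-plus-voicebank",
    savedir="pretrained_models/metricgan-plus-voicebank",
)

def clean_up(wav, sample_rate, out_path="enhanced.wav"):   # hypothetical helper
    # VoiceFixer restores from a file, so write the converted waveform to disk first
    write("converted.wav", sample_rate, wav)               # illustrative intermediate file
    voicefixer.restore(input="converted.wav", output="audio1.wav", cuda=False, mode=0)

    # MetricGAN+ enhancement expects a (batch, time) tensor plus relative lengths
    noisy = enhance_model.load_audio("audio1.wav").unsqueeze(0)
    enhanced = enhance_model.enhance_batch(noisy, lengths=torch.tensor([1.]))
    torchaudio.save(out_path, enhanced.cpu(), 16000)
    return out_path

In the app itself, the enhanced file path "enhanced.wav" is what voice_conversion() hands back to the gr.Audio output component.
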