Spaces:
Runtime error
Runtime error
File size: 3,706 Bytes
5be928f 0ecdad8 eeeccc0 ca27f63 0ecdad8 5be928f eeeccc0 5be928f eeeccc0 5be928f eeeccc0 5be928f 8c3abca 3ebd6b3 8929203 8c3abca 5be928f 0ecdad8 04f2289 0ecdad8 8c3abca 0ecdad8 5be928f 0ecdad8 26e0b3b eeeccc0 8c3abca 5be928f 5eedbf3 5be928f 6d8b482 5be928f 5eedbf3 6d8b482 5be928f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 |
# Third-party imports are grouped ahead of the model loading below so that no
# import is hidden between executable statements.
import gradio as gr
import noisereduce as nr
import openai
import torch
import torchaudio
import whisper
from scipy.io import wavfile
from speechbrain.pretrained import SpectralMaskEnhancement
from TTS.api import TTS

# Text-to-speech model, loaded once at import time. The GPU is requested only
# when CUDA is actually available so the script can still start on a CPU-only
# host instead of failing during model construction.
tts = TTS(
    model_name="tts_models/multilingual/multi-dataset/your_tts",
    progress_bar=False,
    gpu=torch.cuda.is_available(),
)

# Whisper "small" checkpoint used for speech recognition, loaded once at
# import time.
model = whisper.load_model("small")
# Conversation histories, one per persona. Each list starts with the system
# prompt that defines the persona; `transcribe` appends every user and
# assistant turn to the selected list, so the history is module-level state
# shared by all callers for the lifetime of the process.

# TOEFL speaking examiner.
mes1 = [
    {
        "role": "system",
        "content": "You are a TOEFL examiner. Help me improve my oral English and give me feedback.",
    },
]

# Mental health therapist "Tina".
mes2 = [
    {
        "role": "system",
        "content": "You are a mental health therapist. Your name is Tina.",
    },
]

# Personal assistant "Alice".
mes3 = [
    {
        "role": "system",
        "content": "You are my personal assistant. Your name is Alice.",
    },
]

# Every transcript produced by `transcribe`, in call order.
res = []
# Speech-enhancement models keyed by device string, so the pretrained weights
# are loaded once per process instead of once per request.
_ENHANCER_CACHE = {}

# Persona labels accepted as ``choice1``.
_PERSONAS = ("TOEFL", "Therapist", "Alice")


def _get_enhancer():
    """Return the MetricGAN+ enhancement model, loading it on first use."""
    # Fall back to the CPU when CUDA is unavailable rather than crashing.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    if device not in _ENHANCER_CACHE:
        _ENHANCER_CACHE[device] = SpectralMaskEnhancement.from_hparams(
            source="speechbrain/metricgan-plus-voicebank",
            savedir="pretrained_models/metricgan-plus-voicebank",
            run_opts={"device": device},
        )
    return _ENHANCER_CACHE[device]


def transcribe(apikey, upload, audio, choice1):
    """Transcribe speech, get a ChatGPT reply and speak it in a custom voice.

    Args:
        apikey: OpenAI API key; assigned to ``openai.api_key``.
        upload: Path of the reference voice recording passed to the TTS model
            as ``speaker_wav``.
        audio: Path of the recorded speech to transcribe.
        choice1: Persona to talk to: "TOEFL", "Therapist" or "Alice".

    Returns:
        A list ``[transcript, chat_response, "enhanced.wav"]``: the recognised
        text, the assistant's reply, and the path of the synthesised,
        denoised and enhanced reply audio.

    Raises:
        ValueError: If ``upload`` or ``audio`` is missing, or ``choice1`` is
            not one of the known personas.

    Side effects:
        Appends the transcript to ``res`` and the user/assistant turns to the
        selected persona history (``mes1``/``mes2``/``mes3``), and overwrites
        "output.wav", "audio1.wav" and "enhanced.wav" in the working
        directory.
    """
    # Validate up front: otherwise a missing input only surfaces as an
    # obscure failure deep inside Whisper/TTS, and an unknown persona as an
    # UnboundLocalError after the expensive recognition step has already run.
    if audio is None:
        raise ValueError("No speech recording was provided.")
    if upload is None:
        raise ValueError("No reference voice file was uploaded.")
    if choice1 not in _PERSONAS:
        raise ValueError(
            f"Unknown persona {choice1!r}; expected one of {', '.join(_PERSONAS)}."
        )

    openai.api_key = apikey

    # Load the audio and pad/trim it to fit 30 seconds.
    samples = whisper.pad_or_trim(whisper.load_audio(audio))

    # Make the log-Mel spectrogram and move it to the same device as the model.
    mel = whisper.log_mel_spectrogram(samples).to(model.device)

    # Detect the spoken language (logged only).
    _, probs = model.detect_language(mel)
    print(f"Detected language: {max(probs, key=probs.get)}")

    # Decode the audio. Half precision is only used on CUDA, where it is the
    # library default; on other devices full precision is requested.
    options = whisper.DecodingOptions(fp16=model.device.type == "cuda")
    result = whisper.decode(model, mel, options)
    res.append(result.text)

    # The persona histories are module-level lists, so the conversation is
    # carried over between calls (and shared by every caller).
    histories = {"TOEFL": mes1, "Therapist": mes2, "Alice": mes3}
    messages = histories[choice1]

    # Ask ChatGPT for the next turn of the conversation.
    messages.append({"role": "user", "content": result.text})
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages,
    )
    chat_response = completion.choices[0].message.content
    messages.append({"role": "assistant", "content": chat_response})

    # Synthesise the reply using the uploaded recording as the speaker.
    # NOTE(review): the output file names are fixed, so concurrent requests
    # would overwrite each other's files.
    tts.tts_to_file(
        chat_response,
        speaker_wav=upload,
        language="en",
        file_path="output.wav",
    )

    # First pass: non-stationary spectral noise reduction.
    rate, data = wavfile.read("output.wav")
    reduced_noise = nr.reduce_noise(
        y=data,
        sr=rate,
        prop_decrease=0.8,
        thresh_n_mult_nonstationary=2,
        stationary=False,
    )
    wavfile.write("audio1.wav", rate, reduced_noise)

    # Second pass: speech enhancement with the cached MetricGAN+ model.
    enhance_model = _get_enhancer()
    noisy = enhance_model.load_audio("audio1.wav").unsqueeze(0)
    enhanced = enhance_model.enhance_batch(noisy, lengths=torch.tensor([1.]))
    # NOTE(review): 16000 presumably matches the enhancement model's sample
    # rate - confirm against the model card.
    torchaudio.save("enhanced.wav", enhanced.cpu(), 16000)

    return [result.text, chat_response, "enhanced.wav"]
# Output components, in the order `transcribe` returns its values:
# transcript, chat reply, path of the synthesised audio.
output_1 = gr.Textbox(label="Speech to Text")
output_2 = gr.Textbox(label="ChatGPT Output")
output_3 = gr.Audio(label="Audio with Custom Voice")

# Build and launch the web UI. The inputs are listed in the same order as the
# parameters of `transcribe` (apikey, upload, audio, choice1). The audio
# inputs use the top-level `gr.Audio` component, like every other component
# here, instead of the deprecated `gr.inputs` namespace.
gr.Interface(
    title='🥳💬💕 - TalktoAI,随时随地,谈天说地!',
    theme="huggingface",
    description="🤖 - 让有人文关怀的AI造福每一个人!AI向善,文明璀璨!TalktoAI - Enable the future!",
    fn=transcribe,
    inputs=[
        gr.Textbox(lines=1, label="请填写您的OpenAI-API-key"),
        gr.Audio(source="upload", label="请上传您喜欢的声音(wav文件)", type="filepath"),
        gr.Audio(source="microphone", type="filepath"),
        gr.Radio(
            ["TOEFL", "Therapist", "Alice"],
            label="TOEFL Examiner, Therapist Tina, or Assistant Alice?",
        ),
    ],
    outputs=[output_1, output_2, output_3],
).launch()