Update app.py

app.py CHANGED
@@ -39,6 +39,39 @@ from scipy.io.wavfile import write, read
 
 import subprocess
 
+
+from TTS.api import TTS
+tts = TTS(model_name="tts_models/zh-CN/baker/tacotron2-DDC-GST", progress_bar=False, gpu=True)
+import whisper
+model = whisper.load_model("small")
+os.system('pip install voicefixer --upgrade')
+from voicefixer import VoiceFixer
+voicefixer = VoiceFixer()
+import openai
+import torchaudio
+from speechbrain.pretrained import SpectralMaskEnhancement
+
+enhance_model = SpectralMaskEnhancement.from_hparams(
+    source="speechbrain/metricgan-plus-voicebank",
+    savedir="pretrained_models/metricgan-plus-voicebank",
+    run_opts={"device":"cuda"},
+)
+
+mes1 = [
+    {"role": "system", "content": "You are a TOEFL examiner. Help me improve my oral English and give me feedback."}
+]
+
+mes2 = [
+    {"role": "system", "content": "You are a mental health therapist. Your name is Tina."}
+]
+
+mes3 = [
+    {"role": "system", "content": "You are my personal assistant. Your name is Alice."}
+]
+
+res = []
+
+
 '''
 from google.colab import drive
 drive.mount('/content/drive')
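Note on the setup block above: `model = whisper.load_model("small")` rebinds the module-level name `model`, which the second hunk still uses for the voice-conversion call (`model.voice_conversion(...)`). Assuming the VITS conversion model elsewhere in app.py is also bound to `model`, one of the two assignments silently clobbers the other. A minimal sketch of the disambiguation, using the hypothetical name `asr_model` for the Whisper side:

import whisper

# Hypothetical rename so Whisper does not shadow the voice-conversion
# model that app.py binds to `model` elsewhere.
asr_model = whisper.load_model("small")

def transcribe(path: str) -> str:
    # Same steps this commit adds inside voice_conversion().
    audio = whisper.load_audio(path)    # load and resample to 16 kHz
    audio = whisper.pad_or_trim(audio)  # pad/trim to 30 seconds
    mel = whisper.log_mel_spectrogram(audio).to(asr_model.device)
    result = whisper.decode(asr_model, mel, whisper.DecodingOptions())
    return result.text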
@@ -118,63 +151,134 @@ def compute_spec(ref_file):
     return spec
 
 
-def voice_conversion(ta,
+def voice_conversion(apikey, ta, audio, choice1):
+
+    openai.api_key = apikey
+
+    # load audio and pad/trim it to fit 30 seconds
+    audio = whisper.load_audio(audio)
+    audio = whisper.pad_or_trim(audio)
+
+    # make log-Mel spectrogram and move to the same device as the model
+    mel = whisper.log_mel_spectrogram(audio).to(model.device)
+
+    # detect the spoken language
+    _, probs = model.detect_language(mel)
+    print(f"Detected language: {max(probs, key=probs.get)}")
 
-
-
-
+    # decode the audio
+    options = whisper.DecodingOptions()
+    result = whisper.decode(model, mel, options)
+    res.append(result.text)
 
-
-
-
+    if choice1 == "TOEFL":
+        messages = mes1
+    elif choice1 == "Therapist":
+        messages = mes2
+    elif choice1 == "Alice":
+        messages = mes3
 
+    # chatgpt
+    n = len(res)
+    content = res[n-1]
+    messages.append({"role": "user", "content": content})
+
+    completion = openai.ChatCompletion.create(
+        model = "gpt-3.5-turbo",
+        messages = messages
+    )
+
+    chat_response = completion.choices[0].message.content
+
+    messages.append({"role": "assistant", "content": chat_response})
+
+    tts.tts_to_file(chat_response, file_path="output.wav")
+
+    target_audio = "target.wav"
+    reference_audio = "output.wav"
+    driving_audio = "output.wav"
+
+    ra = "output.wav"
+    da = "output.wav"
+
+    write(target_audio, ta[0], ta[1])
+    write(reference_audio, ra[0], ra[1])
+    write(driving_audio, da[0], da[1])
+
 # !ffmpeg-normalize $target_audio -nt rms -t=-27 -o $target_audio -ar 16000 -f
 # !ffmpeg-normalize $reference_audio -nt rms -t=-27 -o $reference_audio -ar 16000 -f
 # !ffmpeg-normalize $driving_audio -nt rms -t=-27 -o $driving_audio -ar 16000 -f
 
-
+    files = [target_audio, reference_audio, driving_audio]
 
-
-
+    for file in files:
+        subprocess.run(["ffmpeg-normalize", file, "-nt", "rms", "-t=-27", "-o", file, "-ar", "16000", "-f"])
 
 # ta_ = read(target_audio)
 
-
-
+    target_emb = SE_speaker_manager.compute_d_vector_from_clip([target_audio])
+    target_emb = torch.FloatTensor(target_emb).unsqueeze(0)
 
-
-
+    driving_emb = SE_speaker_manager.compute_d_vector_from_clip([reference_audio])
+    driving_emb = torch.FloatTensor(driving_emb).unsqueeze(0)
 
 # Convert the voice
 
-
-
-
-
-
-
-
-
+    driving_spec = compute_spec(driving_audio)
+    y_lengths = torch.tensor([driving_spec.size(-1)])
+    if USE_CUDA:
+        ref_wav_voc, _, _ = model.voice_conversion(driving_spec.cuda(), y_lengths.cuda(), driving_emb.cuda(), target_emb.cuda())
+        ref_wav_voc = ref_wav_voc.squeeze().cpu().detach().numpy()
+    else:
+        ref_wav_voc, _, _ = model.voice_conversion(driving_spec, y_lengths, driving_emb, target_emb)
+        ref_wav_voc = ref_wav_voc.squeeze().detach().numpy()
 
 # print("Reference Audio after decoder:")
 # IPython.display.display(Audio(ref_wav_voc, rate=ap.sample_rate))
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    voicefixer.restore(input=ref_wav_voc, # input wav file path
+                       output="audio1.wav", # output wav file path
+                       cuda=True, # whether to use gpu acceleration
+                       mode = 0) # You can try out mode 0, 1, or 2 to find out the best result
+
+    noisy = enhance_model.load_audio(
+        "audio1.wav"
+    ).unsqueeze(0)
+
+    enhanced = enhance_model.enhance_batch(noisy, lengths=torch.tensor([1.]))
+    torchaudio.save("enhanced.wav", enhanced.cpu(), 16000)
+
+    return [result.text, chat_response, "enhanced.wav"]
+
+c1=gr.Interface(
+    fn=voice_conversion,
+    inputs=[
+        gr.Textbox(lines=1, label = "请填写您的OpenAI-API-key"),
+        gr.Audio(source="upload", label = "请上传您喜欢的声音(wav文件)", type="filepath"),
+        gr.Audio(source="microphone", label = "和您的专属AI聊天吧!", type="filepath"),
+        gr.Radio(["TOEFL", "Therapist", "Alice"], label="TOEFL Examiner, Therapist Tina, or Assistant Alice?"),
+    ],
+    outputs=[
+        gr.Textbox(label="Speech to Text"), gr.Textbox(label="ChatGPT Output"), gr.Audio(label="Audio with Custom Voice"),
+    ],
+    #theme="huggingface",
+    description = "🤖 - 让有人文关怀的AI造福每一个人!AI向善,文明璀璨!TalktoAI - Enable the future!",
+)
+
+c2=gr.Interface(
+    fn=voice_conversion,
+    inputs=[
+        gr.Textbox(lines=1, label = "请填写您的OpenAI-API-key"),
+        gr.Audio(source="microphone", label = "请上传您喜欢的声音,并尽量避免噪音", type="filepath"),
+        gr.Audio(source="microphone", label = "和您的专属AI聊天吧!", type="filepath"),
+        gr.Radio(["TOEFL", "Therapist", "Alice"], label="TOEFL Examiner, Therapist Tina, or Assistant Alice?"),
+    ],
+    outputs=[
+        gr.Textbox(label="Speech to Text"), gr.Textbox(label="ChatGPT Output"), gr.Audio(label="Audio with Custom Voice"),
+    ],
+    #theme="huggingface",
+    description = "🤖 - 让有人文关怀的AI造福每一个人!AI向善,文明璀璨!TalktoAI - Enable the future!",
+)
+
+demo = gr.TabbedInterface([c1, c2], ["wav文件上传", "麦克风上传"], title = '🥳💬💕 - TalktoAI,随时随地,谈天说地!')
+demo.launch()
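A runtime hazard in the new voice_conversion(): every gr.Audio input is declared with type="filepath", so `ta` arrives as a path string, and `ra`/`da` are themselves the literal string "output.wav". Indexing any of them with [0]/[1] yields single characters rather than the (sample_rate, data) pair scipy's write() expects, so all three write() calls fail at runtime; "output.wav" moreover already exists, since tts.tts_to_file() just produced it. A sketch of the repair under those assumptions:

import shutil

# `ta` is a filepath (gr.Audio(..., type="filepath")), so copy the uploaded
# clip into place instead of unpacking it like a (rate, data) tuple.
shutil.copy(ta, target_audio)

# reference_audio and driving_audio both point at "output.wav", which
# tts.tts_to_file() has already written; no further writes are needed.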
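Replacing the commented-out `!ffmpeg-normalize` notebook magics with subprocess.run is the right move, since `!` shell escapes only work inside IPython/Colab. As committed, though, a missing binary or a non-zero exit passes silently; the same loop with check=True surfaces failures instead of continuing with un-normalized audio:

import subprocess

for file in [target_audio, reference_audio, driving_audio]:
    # check=True raises CalledProcessError on failure rather than
    # letting the pipeline continue with un-normalized audio.
    subprocess.run(
        ["ffmpeg-normalize", file, "-nt", "rms", "-t=-27",
         "-o", file, "-ar", "16000", "-f"],
        check=True,
    )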
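A subtler state issue: `messages = mes1` aliases the module-level seed list instead of copying it, so each request appends its user and assistant turns to state shared by every visitor of the Space (`res` grows without bound for the same reason). A sketch of a per-request copy, assuming the mes1/mes2/mes3 seeds from the first hunk:

def seed_messages(choice1: str) -> list:
    # Return a fresh copy of the system prompt so conversations
    # do not leak between requests.
    seeds = {"TOEFL": mes1, "Therapist": mes2, "Alice": mes3}
    return list(seeds[choice1])  # shallow copy; the seed dicts are never mutated

messages = seed_messages(choice1)
messages.append({"role": "user", "content": content})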