Update app.py

app.py CHANGED
```diff
@@ -48,9 +48,6 @@ from scipy.io.wavfile import write, read
 
 import subprocess
 
-import whisper
-model1 = whisper.load_model("small")
-
 import openai
 
 mes = [
```
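This hunk removes the in-app Whisper dependency: the `small` model is no longer downloaded at startup, and the app stops transcribing audio itself (the UI hunk at the bottom wires a plain textbox in its place). For reference, the removed code followed the standard openai-whisper decoding recipe; below is a minimal standalone sketch of that recipe, assuming the `openai-whisper` package and a hypothetical local file `speech.wav`. One caveat the old code glossed over: `whisper.decode` returns a `DecodingResult` object, so the transcript itself is `result.text`.

```python
import whisper

model1 = whisper.load_model("small")

# load audio and pad/trim it to fit 30 seconds
audio = whisper.load_audio("speech.wav")
audio = whisper.pad_or_trim(audio)

# make a log-Mel spectrogram and move it to the same device as the model
mel = whisper.log_mel_spectrogram(audio).to(model1.device)

# detect the spoken language
_, probs = model1.detect_language(mel)
print(f"Detected language: {max(probs, key=probs.get)}")

# decode the audio; the transcript is result.text, not result itself
result = whisper.decode(model1, mel, whisper.DecodingOptions())
print(result.text)
```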
```diff
@@ -131,29 +128,14 @@ SE_speaker_manager = SpeakerManager(encoder_model_path=CHECKPOINT_SE_PATH, encod
 # Define helper function
 
 
-def chatgpt(apikey, audio):
+def chatgpt(apikey, result):
 
     openai.api_key = apikey
 
-    # load audio and pad/trim it to fit 30 seconds
-    audio = whisper.load_audio(audio)
-    audio = whisper.pad_or_trim(audio)
-
-    # make log-Mel spectrogram and move to the same device as the model1
-    mel = whisper.log_mel_spectrogram(audio).to(model1.device)
-
-    # detect the spoken language
-    _, probs = model1.detect_language(mel)
-    print(f"Detected language: {max(probs, key=probs.get)}")
-
-    # decode the audio
-    options = whisper.DecodingOptions()
-    result = whisper.decode(model1, mel, options)
-
     messages = mes
 
     # chatgpt
-    content = result
+    content = result
     messages.append({"role": "user", "content": content})
 
     completion = openai.ChatCompletion.create(
```
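The `chatgpt` helper now receives the user's text directly as `result` instead of an audio clip, so the whole Whisper preprocessing block goes away; `content = result` survives with the same wording but a new meaning (a plain string rather than a `DecodingResult`). The arguments of `openai.ChatCompletion.create(` fall outside the diff context; with the pre-1.0 `openai` client this call typically looks like the sketch below, where the model name is an assumption and `chat_response` is the name the function later returns in `return [chat_response, audio_out]`.

```python
completion = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",  # assumed; the actual arguments are outside the diff context
    messages=messages,
)

# the assistant's reply that the function passes on to the UI
chat_response = completion.choices[0].message.content
```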
```diff
@@ -175,12 +157,12 @@ def chatgpt(apikey, audio):
 
     write(audio_out, a1, b1)
 
-    return [
+    return [chat_response, audio_out]
 
 def compute_spec(ref_file):
-
-
-
-
+    y, sr = librosa.load(ref_file, sr=ap.sample_rate)
+    spec = ap.spectrogram(y)
+    spec = torch.FloatTensor(spec).unsqueeze(0)
+    return spec
 
 
```
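The old file's truncated `return [` is completed to return the reply text and the synthesized audio, and `compute_spec` gets a working body: it loads the clip at the model's sample rate with `librosa`, runs the Coqui-TTS `AudioProcessor` spectrogram, and adds a batch axis for inference. A usage sketch under those assumptions (`ap` is the `AudioProcessor` instance already constructed earlier in app.py; the file name is hypothetical):

```python
spec = compute_spec("reference.wav")
# ap.spectrogram(y) yields (num_freq_bins, num_frames); unsqueeze(0) adds the
# batch axis, so the model sees a [1, num_freq_bins, num_frames] tensor
print(spec.shape)
```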
```diff
@@ -187,19 +169,19 @@ def compute_spec(ref_file):
 def voice_conversion(ta, ra, da):
 
-
-
-
+    target_audio = 'target.wav'
+    reference_audio = 'reference.wav'
+    driving_audio = 'driving.wav'
 
-    write(target_audio, ta[0], ta[1])
-    write(reference_audio, ra[0], ra[1])
-    write(driving_audio, da[0], da[1])
+    write(target_audio, ta[0], ta[1])
+    write(reference_audio, ra[0], ra[1])
+    write(driving_audio, da[0], da[1])
 
     # !ffmpeg-normalize $target_audio -nt rms -t=-27 -o $target_audio -ar 16000 -f
     # !ffmpeg-normalize $reference_audio -nt rms -t=-27 -o $reference_audio -ar 16000 -f
     # !ffmpeg-normalize $driving_audio -nt rms -t=-27 -o $driving_audio -ar 16000 -f
 
-
+    files = [target_audio, reference_audio, driving_audio]
 
-
-
+    for file in files:
+        subprocess.run(["ffmpeg-normalize", file, "-nt", "rms", "-t=-27", "-o", file, "-ar", "16000", "-f"])
 
```
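The three `!ffmpeg-normalize` lines are Colab/IPython cell magics, which are not valid Python in a Space, so the commit keeps them as comments and reproduces the same normalization (RMS target of -27 dB, resampled to 16 kHz, overwriting in place) with `subprocess.run`. Below is a slightly defensive variant of the same loop; `check=True` is an addition here, not part of the commit, and makes a failed normalization raise instead of passing silently:

```python
for file in files:
    subprocess.run(
        ["ffmpeg-normalize", file, "-nt", "rms", "-t=-27",
         "-o", file, "-ar", "16000", "-f"],
        check=True,  # not in the commit: fail loudly if ffmpeg-normalize errors
    )
```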
```diff
@@ -206,26 +188,27 @@ def voice_conversion(ta, ra, da):
     # ta_ = read(target_audio)
 
-
-
+    target_emb = SE_speaker_manager.compute_d_vector_from_clip([target_audio])
+    target_emb = torch.FloatTensor(target_emb).unsqueeze(0)
 
-
-
+    driving_emb = SE_speaker_manager.compute_d_vector_from_clip([reference_audio])
+    driving_emb = torch.FloatTensor(driving_emb).unsqueeze(0)
 
     # Convert the voice
 
-
-
-
-
-
-
-
-
+    driving_spec = compute_spec(driving_audio)
+    y_lengths = torch.tensor([driving_spec.size(-1)])
+    if USE_CUDA:
+        ref_wav_voc, _, _ = model.voice_conversion(driving_spec.cuda(), y_lengths.cuda(), driving_emb.cuda(), target_emb.cuda())
+        ref_wav_voc = ref_wav_voc.squeeze().cpu().detach().numpy()
+    else:
+        ref_wav_voc, _, _ = model.voice_conversion(driving_spec, y_lengths, driving_emb, target_emb)
+        ref_wav_voc = ref_wav_voc.squeeze().detach().numpy()
 
     # print("Reference Audio after decoder:")
     # IPython.display.display(Audio(ref_wav_voc, rate=ap.sample_rate))
 
-
+    return (ap.sample_rate, ref_wav_voc)
+
 
 block = gr.Blocks()
 
```
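The filled-in conversion follows the familiar Coqui-TTS recipe: d-vector speaker embeddings are computed for the target and driving speakers via the `SpeakerManager`, the driving clip's spectrogram and its frame length are fed to `model.voice_conversion`, and the waveform comes back as a NumPy array. Two details worth noting, both exactly as committed: `driving_emb` is computed from `reference_audio` rather than `driving_audio`, and the return value `(ap.sample_rate, ref_wav_voc)` is the `(sample_rate, numpy_array)` pair that a `gr.Audio` output component renders directly. A minimal self-contained illustration of that output convention:

```python
import numpy as np

# a Gradio Audio output accepts a (sample_rate, waveform) pair,
# the same shape of value that voice_conversion now returns
sample_rate = 16000
waveform = np.zeros(sample_rate, dtype=np.float32)  # one second of silence
demo_value = (sample_rate, waveform)
```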
```diff
@@ -243,15 +226,14 @@ with block:
         with gr.Row().style(mobile_collapse=False, equal_height=True):
 
             inp1 = gr.components.Textbox(lines=2, label="请填写您的OpenAI-API-key")
-            inp2 = gr.
+            inp2 = gr.components.Textbox(lines=2, label="说些什么吧")
 
             btn = gr.Button("开始对话吧")
-
-            yousay = gr.Textbox(lines=1, label="您的提问")
+
             texts = gr.Textbox(lines=2, label="ChatGPT的回答")
             audio_tts = gr.Audio(label="自动合成的声音")
 
-            btn.click(chatgpt, [inp1, inp2], [
+            btn.click(chatgpt, [inp1, inp2], [texts, audio_tts])
 
     with gr.Box():
         with gr.Row().style(mobile_collapse=False, equal_height=True):
```
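On the UI side, the truncated `inp2 = gr.` assignment (as it appears in the old side of the diff) becomes a plain textbox, the separate `yousay` question box is dropped, and the click handler is completed. The labels, kept verbatim from the commit, translate as: `inp1` "please fill in your OpenAI API key", `inp2` "say something", `btn` "start the conversation", `yousay` "your question", `texts` "ChatGPT's reply", `audio_tts` "auto-synthesized voice". In Gradio, `btn.click(fn, inputs, outputs)` maps the input components positionally onto `fn`'s parameters and `fn`'s returned list positionally onto the outputs, so `chatgpt(apikey, result)` returning `[chat_response, audio_out]` fills `texts` and `audio_tts` in order. A self-contained toy sketch of that contract (all names hypothetical):

```python
import gradio as gr

def reply(key, text):
    # two inputs -> two outputs, positionally matched like chatgpt's
    return [f"got key of length {len(key)}", text.upper()]

with gr.Blocks() as demo:
    a = gr.Textbox(label="API key")    # plays the role of inp1
    b = gr.Textbox(label="prompt")     # plays the role of inp2
    out1 = gr.Textbox(label="answer")  # plays the role of texts
    out2 = gr.Textbox(label="echo")    # stand-in for the audio_tts output
    gr.Button("go").click(reply, [a, b], [out1, out2])

demo.launch()
```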