# KNN-VC voice-changer demo (Hugging Face Space).
# NOTE: the original file carried page residue ("Spaces: Paused") from a web
# extraction; it has been converted to this comment header.
import os
import tempfile

import anyio
import edge_tts
import gradio as gr
import numpy as np
import torch
import torchaudio
from openai import OpenAI
from scipy.io import wavfile
from scipy.io.wavfile import write

from tts_voice import tts_order_voice
# Load the KNN-VC voice-conversion model from torch.hub (CPU inference).
# prematched=True selects the WavLM checkpoint trained with prematched features.
knn_vc = torch.hub.load('bshall/knn-vc', 'knn_vc', prematched=True, trust_repo=True, pretrained=True, device='cpu')

# Mapping used by text_to_speech_edge to resolve a display label to an
# edge-tts voice id.
# NOTE(review): assumes tts_order_voice is a dict-like {label: voice_id} —
# confirm against tts_voice.py.
language_dict = tts_order_voice
# Asynchronous text-to-speech helper backed by Microsoft Edge TTS.
async def text_to_speech_edge(text, language_code):
    """Synthesize `text` to an MP3 file using the voice mapped to `language_code`.

    Args:
        text: The text to synthesize.
        language_code: Key into `language_dict`; raises KeyError if unknown.

    Returns:
        A `(status_message, mp3_path)` tuple. The file is created with
        delete=False, so the caller (Gradio) is responsible for it.
    """
    voice = language_dict[language_code]
    communicate = edge_tts.Communicate(text, voice)
    # Only reserve a filename here; edge-tts writes the file itself after
    # the handle is closed by the `with` block.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name
    await communicate.save(tmp_path)
    return "语音合成完成:{}".format(text), tmp_path
# Audio padding helper.
def pad_audio(data, target_length):
    """Zero-pad `data` along the time axis (axis 0) up to `target_length` samples.

    Args:
        data: 1-D mono or 2-D (samples, channels) numpy array.
        target_length: Desired sample count; if `data` is already this long
            or longer it is returned unchanged (never truncated).

    Returns:
        The (possibly padded) array.
    """
    if len(data) < target_length:
        pad_length = target_length - len(data)
        # Pad only axis 0; the original `(0, pad_length)` form would also
        # pad the channel axis of a 2-D (stereo) array.
        pad_width = [(0, pad_length)] + [(0, 0)] * (data.ndim - 1)
        data = np.pad(data, pad_width, mode='constant')
    return data
# Voice-conversion function.
def voice_change(audio_in, audio_ref):
    """Convert the voice in `audio_in` to the speaker of `audio_ref` via KNN-VC.

    Args:
        audio_in: Path to the source WAV (content to keep).
        audio_ref: Path to the reference WAV (target speaker).

    Returns:
        Path to 'output.wav' containing the converted audio.
    """
    samplerate1, data1 = wavfile.read(audio_in)
    samplerate2, data2 = wavfile.read(audio_ref)

    # Pad the shorter clip so both waveforms have the same sample count.
    target_length = max(len(data1), len(data2))
    data1 = pad_audio(data1, target_length)
    data2 = pad_audio(data2, target_length)

    tmp_audio_in = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    tmp_audio_ref = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    audio_in_path = tmp_audio_in.name
    audio_ref_path = tmp_audio_ref.name
    try:
        # Close the handles before scipy writes to the paths (required on
        # Windows, harmless elsewhere).
        tmp_audio_in.close()
        tmp_audio_ref.close()
        write(audio_in_path, samplerate1, data1)
        write(audio_ref_path, samplerate2, data2)

        query_seq = knn_vc.get_features(audio_in_path)
        matching_set = knn_vc.get_matching_set([audio_ref_path])
        print("query_seq shape:", query_seq.shape)
        print("matching_set shape:", matching_set.shape)

        # Trim so query_seq and matching_set cover the same number of frames.
        # NOTE(review): assumes query_seq is (frames, dim) and matching_set is
        # (1, frames, dim) — confirm against the knn-vc API.
        if query_seq.shape[0] > matching_set.shape[1]:
            query_seq = query_seq[:matching_set.shape[1]]
        elif query_seq.shape[0] < matching_set.shape[1]:
            matching_set = matching_set[:, :query_seq.shape[0], :]

        out_wav = knn_vc.match(query_seq, matching_set, topk=4)

        # torchaudio.save expects a 2-D (channels, samples) tensor.
        if len(out_wav.shape) == 1:
            out_wav = out_wav.unsqueeze(0)

        output_path = 'output.wav'
        # KNN-VC vocoder output is saved at 16 kHz.
        torchaudio.save(output_path, out_wav, 16000)
        return output_path
    finally:
        # Fix: the original left the delete=False temp files behind forever.
        for path in (audio_in_path, audio_ref_path):
            try:
                os.remove(path)
            except OSError:
                pass
# Thin adapter so Gradio's fn signature stays decoupled from voice_change.
def gradio_interface(audio_in, audio_ref):
    """Forward the two uploaded audio file paths to `voice_change`.

    Returns the path of the converted 'output.wav'.
    """
    return voice_change(audio_in, audio_ref)
# Build the Gradio interface: two audio inputs (source and reference
# speaker), one audio output (the converted file).
iface = gr.Interface(fn=gradio_interface,
                     inputs=["audio", "audio"],
                     outputs="audio",
                     title="KNN-VC Voice Changer")

if __name__ == "__main__":
    iface.launch()