# NOTE(review): removed non-Python page residue (Space status banner, file size,
# commit hashes, and a line-number gutter) left over from a web export of this file.
import os
import tempfile
from openai import OpenAI
from tts_voice import tts_order_voice
import edge_tts
import numpy as np
import anyio
import torch
import torchaudio
import gradio as gr
from scipy.io import wavfile
from scipy.io.wavfile import write
# Load the pretrained KNN-VC voice-conversion model from torch.hub.
# prematched=True selects the prematched vocoder checkpoint; inference runs on CPU.
# NOTE: this downloads weights on first run (module-level side effect at import time).
knn_vc = torch.hub.load('bshall/knn-vc', 'knn_vc', prematched=True, trust_repo=True, pretrained=True, device='cpu')
# Mapping of display language names -> Edge TTS voice identifiers
# (imported from tts_voice.tts_order_voice).
language_dict = tts_order_voice
# 异步文字转语音函数
async def text_to_speech_edge(text, language_code):
    """Synthesize *text* with Microsoft Edge TTS.

    Args:
        text: The text to speak.
        language_code: Key into ``language_dict`` selecting the Edge TTS voice.

    Returns:
        A ``(status_message, mp3_path)`` tuple, where ``mp3_path`` points to a
        temporary MP3 file containing the synthesized speech.
    """
    selected_voice = language_dict[language_code]
    synthesis_job = edge_tts.Communicate(text, selected_voice)
    # Reserve a temp file name; delete=False keeps it alive for Gradio to serve.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as out_file:
        mp3_path = out_file.name
    await synthesis_job.save(mp3_path)
    return "语音合成完成:{}".format(text), mp3_path
# 声音更改函数
#def voice_change(audio_in, audio_ref):
#samplerate1, data1 = wavfile.read(audio_in)
#samplerate2, data2 = wavfile.read(audio_ref)
#write("./audio_in.wav", samplerate1, data1)
#write("./audio_ref.wav", samplerate2, data2)
#query_seq = knn_vc.get_features("./audio_in.wav")
#matching_set = knn_vc.get_matching_set(["./audio_ref.wav"])
#out_wav = knn_vc.match(query_seq, matching_set, topk=4)
#torchaudio.save('output.wav', out_wav[None], 16000)
#return 'output.wav'
def voice_change(audio_in, audio_ref):
    """Convert the voice in ``audio_in`` to the timbre of ``audio_ref`` using KNN-VC.

    Args:
        audio_in: Path to the source speech WAV file (content to convert).
        audio_ref: Path to the reference WAV file (target voice/timbre).

    Returns:
        Path to the converted audio file (``'output.wav'`` in the working directory).
    """
    samplerate1, data1 = wavfile.read(audio_in)
    samplerate2, data2 = wavfile.read(audio_ref)
    # Re-write both inputs as plain WAV files, since knn_vc consumes file paths.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_audio_in, \
            tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_audio_ref:
        audio_in_path = tmp_audio_in.name
        audio_ref_path = tmp_audio_ref.name
    try:
        write(audio_in_path, samplerate1, data1)
        write(audio_ref_path, samplerate2, data2)
        query_seq = knn_vc.get_features(audio_in_path)
        matching_set = knn_vc.get_matching_set([audio_ref_path])
        out_wav = knn_vc.match(query_seq, matching_set, topk=4)
        output_path = 'output.wav'
        # out_wav[None] adds a channel dimension; KNN-VC outputs 16 kHz audio.
        torchaudio.save(output_path, out_wav[None], 16000)
        return output_path
    finally:
        # BUGFIX: delete=False temp files were previously leaked on every call.
        for stale_path in (audio_in_path, audio_ref_path):
            try:
                os.remove(stale_path)
            except OSError:
                pass  # best-effort cleanup; never mask the real result/error
# def voice_change(audio_in, audio_ref):
# samplerate1, data1 = wavfile.read(audio_in)
# samplerate2, data2 = wavfile.read(audio_ref)
# # 强制匹配音频文件的长度
# max_length = max(data1.shape[0], data2.shape[0])
# if data1.shape[0] < max_length:
# data1 = np.pad(data1, (0, max_length - data1.shape[0]), mode='constant')
# if data2.shape[0] < max_length:
# data2 = np.pad(data2, (0, max_length - data2.shape[0]), mode='constant')
# with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_audio_in, \
# tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_audio_ref:
# audio_in_path = tmp_audio_in.name
# audio_ref_path = tmp_audio_ref.name
# wavfile.write(audio_in_path, samplerate1, data1)
# wavfile.write(audio_ref_path, samplerate2, data2)
# query_seq = knn_vc.get_features(audio_in_path)
# matching_set = knn_vc.get_matching_set([audio_ref_path])
# out_wav = knn_vc.match(query_seq, matching_set, topk=4)
# output_path = 'output.wav'
# torchaudio.save(output_path, torch.tensor(out_wav)[None], 16000)
# return output_path
# 文字转语音(OpenAI)
def tts(text, model, voice, api_key):
    """Generate speech from ``text`` via the OpenAI-compatible TTS endpoint.

    Args:
        text: Text to synthesize (at most 300 characters).
        model: TTS model name, e.g. ``'tts-1'`` or ``'tts-1-hd'``.
        voice: Speaker name accepted by the endpoint (e.g. ``'alloy'``).
        api_key: API key for the relay endpoint.

    Returns:
        Path to a temporary MP3 file with the synthesized speech.

    Raises:
        gr.Error: If the text is too long, the key is missing, or the request fails.
    """
    if len(text) > 300:
        raise gr.Error('您输入的文本字符多于300个,请缩短您的文本')
    # BUGFIX: was `api_key == ''`, which let None through and crashed the client.
    if not api_key:
        raise gr.Error('请填写您的 中转API Key')
    try:
        client = OpenAI(api_key=api_key, base_url='https://lmzh.top/v1')
        response = client.audio.speech.create(
            model=model,
            voice=voice,
            input=text,
        )
    except Exception as error:
        # Chain the cause so the original traceback is preserved in logs.
        raise gr.Error(f"生成语音时出错:{error}") from error
    # delete=False keeps the file alive so Gradio can serve it to the browser.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_file:
        temp_file.write(response.content)
    return temp_file.name
# ---- Gradio front-end layout ----
# NOTE(review): the original paste had its indentation stripped; the nesting below
# is reconstructed from the gradio context-manager structure — verify against the
# deployed Space before relying on exact widget placement.
app = gr.Blocks()
with app:
    gr.Markdown("# <center>OpenAI TTS + 3秒实时AI变声+需要使用中转key</center>")
    gr.Markdown("### <center>中转key购买地址https://buy.sipola.cn</center>")
    # Tab 1: OpenAI TTS, then optional KNN-VC voice conversion of the result.
    with gr.Tab("TTS"):
        with gr.Row(variant='panel'):
            api_key = gr.Textbox(type='password', label='API Key', placeholder='请在此填写您的API Key')
            model = gr.Dropdown(choices=['tts-1','tts-1-hd'], label='请选择模型(tts-1推理更快,tts-1-hd音质更好)', value='tts-1')
            voice = gr.Dropdown(choices=['alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer'], label='请选择一个说话人', value='alloy')
        with gr.Row():
            with gr.Column():
                inp_text = gr.Textbox(label="请填写您想生成的文本中英文皆可", placeholder="请输入ai生成的文案,不要超过300字,最好200字左右", lines=5)
                btn_text = gr.Button("一键开启真实拟声吧", variant="primary")
            with gr.Column():
                inp1 = gr.Audio(type="filepath", label="OpenAI TTS真实拟声", interactive=False)
                inp2 = gr.Audio(type="filepath", label="请上传AI变声的参照音频(决定变声后的语音音色)")
                btn1 = gr.Button("一键开启AI变声吧", variant="primary")
            with gr.Column():
                out1 = gr.Audio(type="filepath", label="AI变声后的专属音频")
        # Wire-up: text -> TTS audio, then (TTS audio, reference) -> converted audio.
        btn_text.click(tts, [inp_text, model, voice, api_key], inp1)
        btn1.click(voice_change, [inp1, inp2], out1)
    # Earlier single-column Edge TTS layout, kept commented for reference:
    # with gr.Tab("⚡ Edge TTS"):
    #     with gr.Row():
    #         input_text = gr.Textbox(lines=5, placeholder="请输入ai生成的文案,不要超过300字,最好200字左右", label="请填写您想生成的文本中英文皆可")
    #         default_language = list(language_dict.keys())[15]
    #         language = gr.Dropdown(choices=list(language_dict.keys()), value=default_language, label="请选择文本对应的语言")
    #         btn_edge = gr.Button("一键开启真实拟声吧", variant="primary")
    #         output_text = gr.Textbox(label="输出文本", visible=False)
    #         output_audio = gr.Audio(type="filepath", label="Edge TTS真实拟声")
    #     with gr.Row():
    #         inp_vc = gr.Audio(type="filepath", label="请上传AI变声的参照音频决定变声后的语音音色")
    #         btn_vc = gr.Button("一键开启AI变声吧", variant="primary")
    #         out_vc = gr.Audio(type="filepath", label="AI变声后的专属音频")
    #     btn_edge.click(lambda text, lang: anyio.run(text_to_speech_edge, text, lang), [input_text, language], [output_text, output_audio])
    #     btn_vc.click(voice_change, [output_audio, inp_vc], out_vc)
    # Tab 2: Edge TTS (free, no API key) plus the same KNN-VC conversion step.
    with gr.Tab("Edge TTS"):
        with gr.Row():
            with gr.Column():
                input_text = gr.Textbox(label="请填写您想生成的文本中英文皆可",placeholder="请输入ai生成的文案,不要超过300字,最好200字左右",lines=5)
                btn_edge = gr.Button("一键开启真实拟声吧", variant="primary")
            with gr.Column():
                # Index 15 picks the default language entry — assumes language_dict
                # has at least 16 entries; TODO confirm against tts_voice.
                default_language = list(language_dict.keys())[15]
                language = gr.Dropdown(choices=list(language_dict.keys()), value=default_language, label="请选择文本对应的语言")
                # presumably these outputs belong in this column — verify layout
                output_audio = gr.Audio(type="filepath", label="TTS真实拟声")
                output_text = gr.Textbox(label="输出文本", visible=False)
        with gr.Row():
            with gr.Column():
                inp_vc = gr.Audio(type="filepath", label="请上传AI变声的参照音频决定变声后的语音音色")
                btn_vc = gr.Button("一键开启AI变声吧", variant="primary")
            with gr.Column():
                out_vc = gr.Audio(type="filepath", label="AI变声后的专属音频")
        # anyio.run bridges the async Edge TTS coroutine into Gradio's sync callback.
        btn_edge.click(lambda text, lang: anyio.run(text_to_speech_edge, text, lang), [input_text, language], [output_text, output_audio])
        btn_vc.click(voice_change, [output_audio, inp_vc], out_vc)
    gr.Markdown("### <center>注意获取中转API Key [here](https://buy.sipola.cn).</center>")
    gr.Markdown("### <center>ai文案生成可使用中转key,请访问 [here](https://ai.sipola.cn).</center>")
    gr.HTML('''
    <div class="footer">
    <center><p>Power by sipola </p></center>
    </div>
    ''')
# show_error surfaces Python exceptions (incl. gr.Error) in the browser UI.
app.launch(show_error=True)