import os
import tempfile
from openai import OpenAI
from tts_voice import tts_order_voice
import edge_tts
import anyio
import torch
import torchaudio
import gradio as gr
import numpy as np
from scipy.io import wavfile

# Create the KNN-VC model
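# The hub pipeline exposes get_features(), get_matching_set(), and match(), and works
# on 16 kHz audio, which is why the converted output below is saved at 16000 Hz.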
knn_vc = torch.hub.load('bshall/knn-vc', 'knn_vc', prematched=True, trust_repo=True, pretrained=True, device='cpu')

# Initialize language_dict
language_dict = tts_order_voice

# Async text-to-speech (Edge TTS)
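# edge_tts.Communicate synthesizes speech with Microsoft Edge's online TTS voices;
# communicate.save() writes the result as MP3, hence the .mp3 temp file below.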
async def text_to_speech_edge(text, language_code):
    voice = language_dict[language_code]
    communicate = edge_tts.Communicate(text, voice)
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name

    await communicate.save(tmp_path)

    return "语音合成完成:{}".format(text), tmp_path

# Voice conversion (KNN-VC)
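# Flow: extract features from the source audio, build a matching set from the
# reference audio, then resynthesize the waveform from the k nearest reference frames.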
def voice_change(audio_in, audio_ref):
    samplerate1, data1 = wavfile.read(audio_in)
    samplerate2, data2 = wavfile.read(audio_ref)

    # Pad the shorter clip with zeros so both waveforms have the same length
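    # Note: the (0, n) pad width assumes 1-D (mono) data; stereo input from
    # wavfile.read would get padded along the channel axis as well.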
    max_length = max(data1.shape[0], data2.shape[0])

    if data1.shape[0] < max_length:
        data1 = np.pad(data1, (0, max_length - data1.shape[0]), mode='constant')
    if data2.shape[0] < max_length:
        data2 = np.pad(data2, (0, max_length - data2.shape[0]), mode='constant')

    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_audio_in, \
         tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_audio_ref:
        audio_in_path = tmp_audio_in.name
        audio_ref_path = tmp_audio_ref.name
        wavfile.write(audio_in_path, samplerate1, data1)
        wavfile.write(audio_ref_path, samplerate2, data2)

    query_seq = knn_vc.get_features(audio_in_path)
    matching_set = knn_vc.get_matching_set([audio_ref_path])
    out_wav = knn_vc.match(query_seq, matching_set, topk=4)
    output_path = 'output.wav'
    # knn_vc.match returns a 1-D waveform tensor at 16 kHz; add a channel dimension for torchaudio
    torchaudio.save(output_path, out_wav[None], 16000)
    return output_path

# Text-to-speech (OpenAI)
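# Calls the OpenAI Python SDK against the relay endpoint configured via base_url;
# response.content holds the generated MP3 bytes.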
def tts(text, model, voice, api_key):
    if len(text) > 300:
        raise gr.Error('您输入的文本字符多于300个,请缩短您的文本')
    if api_key == '':
        raise gr.Error('请填写您的 中转API Key')

    try:
        client = OpenAI(api_key=api_key, base_url='https://lmzh.top/v1')
        response = client.audio.speech.create(
            model=model,
            voice=voice,
            input=text,
        )
    except Exception as error:
        raise gr.Error(f"生成语音时出错:{error}")

    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_file:
        temp_file.write(response.content)

    return temp_file.name

# Gradio front-end
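# Two tabs: "TTS" (OpenAI TTS, then KNN-VC voice conversion) and "Edge TTS"
# (free Edge TTS, then the same voice conversion).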
app = gr.Blocks()

with app:
    gr.Markdown("# <center>OpenAI TTS + 3秒实时AI变声+需要使用中转key</center>")
    gr.Markdown("### <center>中转key购买地址https://buy.sipola.cn</center>")
    with gr.Tab("TTS"):
        with gr.Row(variant='panel'):
            api_key = gr.Textbox(type='password', label='API Key', placeholder='请在此填写您的API Key')
            model = gr.Dropdown(choices=['tts-1','tts-1-hd'], label='请选择模型(tts-1推理更快,tts-1-hd音质更好)', value='tts-1')
            voice = gr.Dropdown(choices=['alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer'], label='请选择一个说话人', value='alloy')
        with gr.Row():
            with gr.Column():
                inp_text = gr.Textbox(label="请填写您想生成的文本(中英文皆可)", placeholder="想说却还没说的 还很多 攒着是因为想写成歌", lines=5)
                btn_text = gr.Button("一键开启真实拟声吧", variant="primary")
            with gr.Column():
                inp1 = gr.Audio(type="filepath", label="OpenAI TTS真实拟声", interactive=False)
                inp2 = gr.Audio(type="filepath", label="请上传AI变声的参照音频(决定变声后的语音音色)")
                btn1 = gr.Button("一键开启AI变声吧", variant="primary")
            with gr.Column():
                out1 = gr.Audio(type="filepath", label="AI变声后的专属音频")
            btn_text.click(tts, [inp_text, model, voice, api_key], inp1)
            btn1.click(voice_change, [inp1, inp2], out1)
    with gr.Tab("⚡ Edge TTS"):
        with gr.Row():
            input_text = gr.Textbox(lines=5, placeholder="想说却还没说的 还很多 攒着是因为想写成歌", label="请填写您想生成的文本(中英文皆可)")
            default_language = list(language_dict.keys())[15]
            language = gr.Dropdown(choices=list(language_dict.keys()), value=default_language, label="请选择文本对应的语言")
            btn_edge = gr.Button("一键开启真实拟声吧", variant="primary")
            output_text = gr.Textbox(label="输出文本", visible=False)
            output_audio = gr.Audio(type="filepath", label="Edge TTS真实拟声")

        with gr.Row():
            inp_vc = gr.Audio(type="filepath", label="请上传AI变声的参照音频(决定变声后的语音音色)")
            btn_vc = gr.Button("一键开启AI变声吧", variant="primary")
            out_vc = gr.Audio(type="filepath", label="AI变声后的专属音频")

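        # anyio.run bridges the async Edge TTS coroutine into this synchronous click
        # callback; newer Gradio versions can also accept async callbacks directly.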
        btn_edge.click(lambda text, lang: anyio.run(text_to_speech_edge, text, lang), [input_text, language], [output_text, output_audio])
        btn_vc.click(voice_change, [output_audio, inp_vc], out_vc)

    gr.Markdown("### <center>注意获取中转API Key [here](https://buy.sipola.cn).</center>")
    gr.HTML('''
        <div class="footer">
         <center><p>Power by sipola </p></center>
        </div>
    ''')

app.launch(show_error=True)