# Author: Mahiruoshi
# Commit message: Update main.py
# Commit: 6c297e8
import logging
logging.getLogger('numba').setLevel(logging.WARNING)
logging.getLogger('matplotlib').setLevel(logging.WARNING)
logging.getLogger('urllib3').setLevel(logging.WARNING)
import json
import re
import numpy as np
import IPython.display as ipd
import torch
import commons
import utils
from models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence
import gradio as gr
import time
import datetime
import os
import pickle
import openai
from scipy.io.wavfile import write
import librosa
import romajitable
from mel_processing import spectrogram_torch
def is_japanese(string):
    """Report whether *string* contains at least one kana character.

    A character counts when its code point lies strictly between U+3040 and
    U+30FF (the Hiragana and Katakana blocks). Kanji are not considered.
    """
    return any(0x3040 < ord(char) < 0x30FF for char in string)
def is_english(string):
    """Return True when *string* is non-empty and consists only of ASCII
    letters, digits, spaces and the punctuation ``.,:;!?()_*"'``."""
    ascii_text = re.compile('^[A-Za-z0-9.,:;!?()_*"\' ]+$')
    return ascii_text.fullmatch(string) is not None
def extrac(text):
    """Split raw text into short sentences ready for synthesis.

    Processing:
      1. Every ``<...>`` span is removed from the whole text.
      2. The text is split on newlines. A line accepted by ``is_english`` is
         transliterated to katakana via ``romajitable``.
      3. Spaces are removed; a line of length <= 1 is dropped.
      4. A line longer than 50 characters is split on ``。`` / ``!``; each piece
         longer than one character is kept with a trailing ``。``. Shorter
         lines are kept unchanged.

    Returns:
        The list of sentences, which is also printed for debugging.
    """
    text = re.sub("<[^>]*>", "", text)
    sentences = []
    for line in re.split(r'\n', text):
        if is_english(line):
            line = romajitable.to_kana(line).katakana
        line = line.replace('\n', '').replace(' ', '')
        if len(line) <= 1:
            continue
        if len(line) > 50:
            # Long line: break it at sentence-ending punctuation.
            sentences.extend(
                piece + '。' for piece in re.split(r'。|!', line) if len(piece) > 1
            )
        else:
            sentences.append(line)
    print(sentences)
    return sentences
def to_numpy(tensor: torch.Tensor):
    """Return *tensor* as a NumPy array, detached from autograd and on the CPU.

    Args:
        tensor: any ``torch.Tensor``.

    Returns:
        ``numpy.ndarray`` holding the tensor's values.
    """
    # `.cpu()` is applied unconditionally: `Tensor.numpy()` raises for a tensor
    # that lives on an accelerator whether or not it requires grad, and it is
    # a no-op for tensors that are already on the CPU.
    return tensor.detach().cpu().numpy()
def chatgpt(text):
    """Send *text* to ``gpt-3.5-turbo`` and return the assistant's reply.

    The conversation is persisted between calls in ``log.pickle`` (current
    working directory) as a list of ``{"role": ..., "content": ...}`` dicts.
    A missing or unreadable log simply starts a fresh conversation.

    When the history reaches exactly 12 messages, the two at indices 6 and 7
    are dropped, so the first six messages are always retained.

    Args:
        text: the user's message. The literal ``'exist'`` skips loading the
            saved history.

    Returns:
        The reply text.

    Raises:
        Whatever ``openai.ChatCompletion.create`` raises; API failures are no
        longer retried with a duplicated user message.
    """
    messages = []
    # NOTE(review): read as "'exist' skips loading the saved history" - the
    # only reading under which the function still returns a reply. Confirm.
    if text != 'exist':
        # SECURITY: pickle.load can execute arbitrary code. log.pickle must
        # only ever be a file written by this program.
        try:
            with open('log.pickle', 'rb') as f:
                messages = pickle.load(f)
        except (OSError, EOFError, pickle.UnpicklingError,
                AttributeError, ImportError, IndexError):
            # First run or corrupted log: start with an empty history.
            messages = []

    messages.append({"role": "user", "content": text})
    chat = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=messages)
    reply = chat.choices[0].message.content
    messages.append({"role": "assistant", "content": reply})
    print(messages[-1])

    if len(messages) == 12:
        # Same result as the former slice shuffle
        # (messages[6:10] = messages[8:]; del messages[-2:]).
        del messages[6:8]

    with open('log.pickle', 'wb') as f:
        pickle.dump(messages, f)
    return reply
def get_symbols_from_json(path):
    """Load the JSON document at *path* and return its ``"symbols"`` entry.

    Asserts that *path* names an existing regular file before opening it.
    """
    assert os.path.isfile(path)
    with open(path, 'r') as handle:
        return json.load(handle)['symbols']
def sle(language,text):
    """Strip whitespace from *text* and wrap it in the tag for *language*.

    Newlines, carriage returns and spaces are removed first. Then:
      * ``"手动"``: the cleaned text is returned untagged;
      * ``"自动"``: ``[JA]`` when ``is_japanese`` accepts the text, else ``[ZH]``;
      * ``"中文"`` / ``"日文"`` / ``"英文"``: ``[ZH]`` / ``[JA]`` / ``[EN]``.

    Any other *language* value yields ``None``.
    """
    cleaned = text.replace('\n', '').replace('\r', '').replace(" ", "")

    if language == "手动":
        return cleaned
    if language == "自动":
        marker = "[JA]" if is_japanese(cleaned) else "[ZH]"
        return f"{marker}{cleaned}{marker}"

    fixed_markers = (("中文", "[ZH]"), ("日文", "[JA]"), ("英文", "[EN]"))
    for label, marker in fixed_markers:
        if language == label:
            return marker + cleaned + marker
    return None
def get_text(text,hps_ms):
    """Turn *text* into a ``torch.LongTensor`` of symbol ids.

    The text is passed through ``text_to_sequence`` with the symbols and
    cleaners from *hps_ms*. When ``hps_ms.data.add_blank`` is truthy, a ``0``
    is interspersed through the sequence via ``commons.intersperse``.
    """
    sequence = text_to_sequence(text, hps_ms.symbols, hps_ms.data.text_cleaners)
    if hps_ms.data.add_blank:
        sequence = commons.intersperse(sequence, 0)
    return torch.LongTensor(sequence)
def _srt_timestamp(delta):
    """Render a ``datetime.timedelta`` as an SRT timestamp (``HH:MM:SS,mmm``)."""
    total_ms = delta // datetime.timedelta(milliseconds=1)
    hours, remainder = divmod(total_ms, 3600000)
    minutes, remainder = divmod(remainder, 60000)
    seconds, millis = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{millis:03d}"


def create_tts_fn(net_g,hps,speaker_id):
    """Build the text-to-speech Gradio callback bound to one model and speaker.

    Args:
        net_g: synthesiser exposing ``infer`` and ``voice_conversion``.
        hps: hyper-parameters; ``hps.data`` supplies the sampling rate and the
            spectrogram settings.
        speaker_id: speaker index used for single-utterance synthesis
            (coerced with ``int``).

    Returns:
        The ``tts_fn`` callback.

    The callback reads the module-level ``dev`` device, which must be set
    before it is invoked.
    """
    # Local import: only the optional ffmpeg resampling step needs it.
    import subprocess

    speaker_id = int(speaker_id)

    def synthesize(sentence, language, sid_value, n_scale, n_scale_w, l_scale):
        """Run TTS inference on one piece of text and return the waveform."""
        stn_tst = get_text(sle(language, sentence), hps)
        with torch.no_grad():
            x_tst = stn_tst.unsqueeze(0).to(dev)
            x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(dev)
            sid = torch.LongTensor([sid_value]).to(dev)
            return net_g.infer(
                x_tst, x_tst_lengths, sid=sid, noise_scale=n_scale,
                noise_scale_w=n_scale_w, length_scale=l_scale,
            )[0][0, 0].data.cpu().float().numpy()

    def transfer(audio, source_id, target_id):
        """Convert *audio* from speaker *source_id* to speaker *target_id*."""
        with torch.no_grad():
            y = torch.FloatTensor(audio)
            y = y / max(-y.min(), y.max()) / 0.99
            y = y.to(dev).unsqueeze(0)
            spec = spectrogram_torch(
                y, hps.data.filter_length, hps.data.sampling_rate,
                hps.data.hop_length, hps.data.win_length, center=False,
            ).to(dev)
            spec_lengths = torch.LongTensor([spec.size(-1)]).to(dev)
            sid_src = torch.LongTensor([source_id]).to(dev)
            sid_tgt = torch.LongTensor([target_id]).to(dev)
            return net_g.voice_conversion(
                spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt,
            )[0][0, 0].data.cpu().float().numpy()

    def tts_fn(is_transfer,original_speaker, target_speaker,history,is_gpt,api_key,is_audio,audiopath,repeat_time,text, language, extract, n_scale= 0.667,n_scale_w = 0.8, l_scale = 1 ):
        """Synthesise *text* and return ``(history, subtitle_path, (rate, audio))``.

        Args:
            is_transfer: in long-text mode, voice-convert every sentence from
                *original_speaker* to *target_speaker*.
            original_speaker, target_speaker: speaker names resolved through
                ``selection``.
            history: chat history; its last entry receives the ChatGPT reply.
            is_gpt: replace *text* with the ChatGPT reply to it.
            api_key: OpenAI key, used only when *is_gpt* is set.
            is_audio: in single-utterance mode, also produce *repeat_time*
                44.1 kHz copies with ffmpeg.
            audiopath: base path for the files written to disk.
            repeat_time: number of ffmpeg copies (coerced with ``int``).
            text: a string, or a file-like object accepted by ``check_text``.
            language: language mode understood by ``sle``.
            extract: enable long-text mode (split into sentences, write
                ``.srt`` subtitles alongside the audio).
            n_scale, n_scale_w, l_scale: forwarded to ``net_g.infer`` as
                ``noise_scale``, ``noise_scale_w`` and ``length_scale``.

        Raises:
            ValueError: long-text mode produced no audio at all.
        """
        text = check_text(text)
        repeat_time = int(repeat_time)
        original_speaker_id = selection(original_speaker)
        target_speaker_id = selection(target_speaker)
        if is_gpt:
            openai.api_key = api_key
            text = chatgpt(text)
            # NOTE(review): assumed to belong to the is_gpt branch - confirm.
            history[-1][1] = text

        if not extract:
            print(text)
            t1 = time.time()
            audio = synthesize(text, language, speaker_id, n_scale, n_scale_w, l_scale)
            t2 = time.time()
            print("推理时间为:"+str(t2-t1)+"s")
            file_path = "subtitles.srt"
            # Saving to disk is best-effort: a bad path must not prevent the
            # audio from being returned to the UI.
            try:
                write(audiopath + '.wav', 22050, audio)
                if is_audio:
                    for i in range(repeat_time):
                        # Argument list instead of a shell string: audiopath
                        # comes from a text box and must not reach a shell.
                        subprocess.run([
                            'ffmpeg', '-y', '-i', audiopath + '.wav',
                            '-ar', '44100',
                            audiopath.replace('temp', 'temp' + str(i)),
                        ])
            except Exception as err:
                print("Saving audio to " + str(audiopath) + " failed: " + repr(err))
            return history, file_path, (hps.data.sampling_rate, audio)

        # Long-text mode: bracketed asides become <...> so extrac() drops them.
        for opener in ('【', '[', '('):
            text = text.replace(opener, '<')
        for closer in ('】', ']', ')'):
            text = text.replace(closer, '>')
        sentences = extrac(text.replace('“','').replace('”',''))
        # At most 500 sentences go into each output file pair.
        chunks = [sentences[k:k + 500] for k in range(0, len(sentences), 500)]

        file_path = None
        merged = None
        for chunk_index, chunk in enumerate(chunks):
            pieces = []
            elapsed = datetime.timedelta(seconds=0)
            srt_path = audiopath.replace('.wav', str(chunk_index) + ".srt")
            # The context manager closes (and flushes) the subtitle file
            # before its path is handed back to the UI.
            with open(srt_path, 'w', encoding='utf-8') as srt:
                for number, sentence in enumerate(chunk, start=1):
                    try:
                        t1 = time.time()
                        audio = synthesize(sentence, language, original_speaker_id,
                                           n_scale, n_scale_w, l_scale)
                        t2 = time.time()
                        print("第"+str(number)+"句的推理时间为:"+str(t2-t1)+"s")
                        if is_transfer:
                            audio = transfer(audio, original_speaker_id, target_speaker_id)
                        # Timing uses the audio that is actually kept, and the
                        # cue is written only once the sentence has succeeded,
                        # so subtitles stay aligned with the output.
                        time_start = _srt_timestamp(elapsed)
                        elapsed += datetime.timedelta(seconds=len(audio)/float(22050))
                        time_end = _srt_timestamp(elapsed)
                        print(time_end)
                        srt.write(str(number-1)+'\n'+time_start+' --> '+time_end+'\n'+sentence+'\n\n')
                        pieces.append(audio)
                    except Exception as err:
                        # One bad sentence must not abort the whole text.
                        print("Skipping sentence " + str(number) + ": " + repr(err))
            if not pieces:
                continue
            merged = np.concatenate(pieces)
            write(audiopath.replace('.wav', str(chunk_index)+'.wav'), 22050, merged)
            # Report the subtitle file that was just written.
            file_path = srt_path

        if merged is None:
            raise ValueError("no sentence could be synthesised from the given text")
        return history, file_path, (hps.data.sampling_rate, merged)
    return tts_fn
def create_vc_fn(net_g,hps):
    """Build the voice-conversion Gradio callback bound to one model.

    Args:
        net_g: synthesiser exposing ``infer`` and ``voice_conversion``.
        hps: hyper-parameters; ``hps.data`` supplies the sampling rate and the
            spectrogram settings.

    Returns:
        The ``vc_fn`` callback. It reads the module-level ``dev`` device.
    """
    def vc_fn(text,language,n_scale,n_scale_w,l_scale,original_speaker, target_speaker, record_audio, upload_audio):
        """Convert speech from *original_speaker* to *target_speaker*.

        The source audio is *record_audio* if given, else *upload_audio*
        (each a ``(sampling_rate, samples)`` pair). When neither is given,
        *text* is first synthesised with the original speaker's voice.

        Returns:
            ``("Success", (hps.data.sampling_rate, converted_audio))``.
        """
        input_audio = record_audio if record_audio is not None else upload_audio
        original_speaker_id = selection(original_speaker)
        target_speaker_id = selection(target_speaker)

        if input_audio is None:
            stn_tst = get_text(sle(language,text),hps)
            with torch.no_grad():
                x_tst = stn_tst.unsqueeze(0).to(dev)
                x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(dev)
                sid = torch.LongTensor([original_speaker_id]).to(dev)
                audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=n_scale, noise_scale_w=n_scale_w, length_scale=l_scale)[0][0,0].data.cpu().float().numpy()
            sampling_rate = hps.data.sampling_rate
        else:
            sampling_rate, audio = input_audio
            # np.iinfo only accepts integer dtypes; float samples are taken
            # as they are (peak normalisation below removes any scale).
            if np.issubdtype(audio.dtype, np.integer):
                audio = audio / np.iinfo(audio.dtype).max
            audio = audio.astype(np.float32)

        if len(audio.shape) > 1:
            audio = librosa.to_mono(audio.transpose(1, 0))
        if sampling_rate != hps.data.sampling_rate:
            audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=hps.data.sampling_rate)

        with torch.no_grad():
            y = torch.FloatTensor(audio)
            peak = max(-y.min(), y.max())
            # A silent clip has a zero peak; skip normalisation rather than
            # divide by zero and fill the signal with NaN.
            if peak > 0:
                y = y / peak / 0.99
            y = y.to(dev).unsqueeze(0)
            spec = spectrogram_torch(y, hps.data.filter_length,
                                     hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
                                     center=False).to(dev)
            spec_lengths = torch.LongTensor([spec.size(-1)]).to(dev)
            sid_src = torch.LongTensor([original_speaker_id]).to(dev)
            sid_tgt = torch.LongTensor([target_speaker_id]).to(dev)
            audio = net_g.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[0][
                0, 0].data.cpu().float().numpy()
        return "Success", (hps.data.sampling_rate, audio)
    return vc_fn
def bot(history,user_message):
    """Return *history* extended by one pending turn for *user_message*.

    The new turn is ``[text, None]``, where ``text`` is *user_message*
    resolved through ``check_text``. *history* itself is not modified.
    """
    pending_turn = [check_text(user_message), None]
    return history + [pending_turn]
# Speaker display name -> speaker id understood by the model.
_SPEAKER_IDS = {
    "高咲侑": 0,
    "歩夢": 1,
    "かすみ": 2,
    "しずく": 3,
    "果林": 4,
    "愛": 5,
    "彼方": 6,
    "せつ菜": 7,
    "エマ": 8,
    "璃奈": 9,
    "栞子": 10,
    "ランジュ": 11,
    "ミア": 12,
    "派蒙": 16,
    "c1": 18,
    "c2": 19,
    "華恋": 21,
    "まひる": 22,
    "なな": 23,
    "クロディーヌ": 24,
    "ひかり": 25,
    "純那": 26,
    "香子": 27,
    "真矢": 28,
    "双葉": 29,
    "ミチル": 30,
    "メイファン": 31,
    "やちよ": 32,
    "晶": 33,
    "いちえ": 34,
    "ゆゆ子": 35,
    "塁": 36,
    "珠緒": 37,
    "あるる": 38,
    "ララフィン": 39,
    "美空": 40,
    "静羽": 41,
}


def selection(speaker):
    """Map a speaker display name to its numeric speaker id.

    Any value that is not a known name (including non-string values) maps
    to ``0``.
    """
    if not isinstance(speaker, str):
        return 0
    return _SPEAKER_IDS.get(speaker, 0)
def check_text(input):
    """Return the text carried by *input*.

    A string is returned unchanged. Anything else is treated as an object
    with a ``name`` attribute holding a file path; that file is read as
    UTF-8 and its contents returned.
    """
    if not isinstance(input, str):
        with open(input.name, "r", encoding="utf-8") as handle:
            return handle.read()
    return input
if __name__ == '__main__':
    # Script entry point: load every checkpoint listed in
    # checkpoints/info.json, then build and launch the Gradio app.
    # NOTE(review): the nesting of the UI containers below was reconstructed
    # from a copy that had lost its indentation - confirm the layout against
    # the running app.
    hps = utils.get_hparams_from_file('checkpoints/tmp/config.json')
    # `dev` is read as a module-level global by the callbacks built in
    # create_tts_fn / create_vc_fn.
    dev = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    models = []    # one list of speaker tuples per entry of info.json
    # Tab names; also used below to look up a school's position in `models`.
    schools_list = ["ShojoKageki-Nijigasaki","ShojoKageki","Nijigasaki"]
    schools = []   # (info.json key, voice-conversion callback) pairs
    # Language modes offered in the dropdowns. sle() also handles "英文",
    # which is not offered here.
    lan = ["中文","日文","自动","手动"]
    with open("checkpoints/info.json", "r", encoding="utf-8") as f:
        models_info = json.load(f)
    for i in models_info:
        checkpoint = models_info[i]["checkpoint"]
        # Symbol -> index table. It is not referenced again in this script.
        phone_dict = {
            symbol: i for i, symbol in enumerate(symbols)
        }
        # One synthesiser per info.json entry, all built from the same hps.
        net_g = SynthesizerTrn(
            len(symbols),
            hps.data.filter_length // 2 + 1,
            hps.train.segment_size // hps.data.hop_length,
            n_speakers=hps.data.n_speakers,
            **hps.model).to(dev)
        _ = net_g.eval()
        _ = utils.load_checkpoint(checkpoint, net_g)
        school = models_info[i]
        speakers = school["speakers"]
        content = []
        for j in speakers:
            sid = int(speakers[j]['sid'])
            # `title` holds the whole school entry (a dict), not a string.
            title = school
            example = speakers[j]['speech']
            name = speakers[j]["name"]
            content.append((sid, name, title, example, create_tts_fn(net_g,hps,sid)))
        models.append(content)
        schools.append((i,create_vc_fn(net_g,hps)))
    with gr.Blocks() as app:
        with gr.Tabs():
            for (i,vc_fn) in schools:
                with gr.TabItem(i):
                    # Speaker names offered in this school's dropdowns;
                    # "派蒙" is always listed first.
                    idols = ["派蒙"]
                    # NOTE(review): `models` is filled in info.json order, so
                    # this lookup is only right if the keys of info.json
                    # appear in the same order as schools_list - confirm.
                    for (sid, name, title, example, tts_fn) in models[schools_list.index(i)]:
                        idols.append(name)
                        # One tab per speaker: portrait, chat box, text
                        # input, audio output and settings.
                        with gr.TabItem(name):
                            with gr.Column():
                                with gr.Row():
                                    with gr.Row():
                                        gr.Markdown(
                                            '<div align="center">'
                                            f'<img style="width:auto;height:400px;" src="file/image/{name}.png">'
                                            '</div>'
                                        )
                                    chatbot = gr.Chatbot()
                                with gr.Row():
                                    with gr.Column(scale=0.85):
                                        input1 = gr.TextArea(label="Text", value=example,lines = 1)
                                    with gr.Column(scale=0.15, min_width=0):
                                        btnVC = gr.Button("Send")
                                output1 = gr.Audio(label="采样率22050")
                                with gr.Accordion(label="Setting", open=False):
                                    input2 = gr.Dropdown(label="Language", choices=lan, value="自动", interactive=True)
                                    input4 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声比例(noise scale),以控制情感", value=0.6)
                                    input5 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声偏差(noise scale w),以控制音素长短", value=0.668)
                                    input6 = gr.Slider(minimum=0.1, maximum=10, label="duration", value=1)
                                with gr.Accordion(label="Advanced Setting", open=False):
                                    # Choices are ints while the default is
                                    # the string '0'; tts_fn applies int().
                                    audio_input3 = gr.Dropdown(label="重复次数", choices=list(range(101)), value='0', interactive=True)
                                    api_input1 = gr.Checkbox(value=False, label="接入chatgpt")
                                    api_input2 = gr.TextArea(label="api-key",lines=1,value = '懂得都懂')
                                with gr.Accordion(label="Advanced Setting", open=False):
                                    output2 = gr.outputs.File(label="字幕文件:subtitles.srt")
                                    audio_input1 = gr.Checkbox(value=False, label="保存路径")
                                    audio_input2 = gr.TextArea(label="音频路径",lines=1,value = 'D:/path/to/live2d/sounds/temp.wav')
                                    input3 = gr.Checkbox(value=False, label="长句切割(小说合成)")
                                    inputxt = gr.File(label="Text")
                                    is_transfer = gr.Checkbox(value=False, label="是否声线转化")
                                    source_speaker = gr.Dropdown(choices=idols, value=name, label="source speaker")
                                    target_speaker = gr.Dropdown(choices=idols, value=name, label="target speaker")
                                    btnbook = gr.Button("小说合成")
                            # Both buttons first append the message to the
                            # chat via bot(), then run this speaker's tts_fn.
                            # They differ only in the text source: the text
                            # box (input1) or the uploaded file (inputxt).
                            btnVC.click(bot, inputs = [chatbot,input1], outputs = [chatbot]).then(
                                tts_fn, inputs=[is_transfer,source_speaker,target_speaker,chatbot,api_input1,api_input2,audio_input1,audio_input2,audio_input3,input1,input2,input3,input4,input5,input6], outputs=[chatbot,output2,output1]
                            )
                            btnbook.click(bot, inputs = [chatbot,inputxt], outputs = [chatbot]).then(
                                tts_fn, inputs=[is_transfer,source_speaker,target_speaker,chatbot,api_input1,api_input2,audio_input1,audio_input2,audio_input3,inputxt,input2,input3,input4,input5,input6], outputs=[chatbot,output2,output1]
                            )
                    # One voice-conversion tab per school, driven by that
                    # school's vc_fn.
                    with gr.Tab("Voice Conversion(类似sovits)"):
                        gr.Markdown("""
声线转化,使用模型中的说话人作为音源时效果更佳
""")
                        with gr.Column():
                            with gr.Accordion(label="方法1:录制或上传声音,可进行歌声合成", open=False):
                                record_audio = gr.Audio(label="record your voice", source="microphone")
                                upload_audio = gr.Audio(label="or upload audio here", source="upload")
                            with gr.Accordion(label="方法2:由原说话人先进行tts后套娃,适用于合成中文等特殊场景", open=True):
                                text = gr.TextArea(label="Text", value='由源说话人进行语音转化',lines = 1)
                                language = gr.Dropdown(label="Language", choices=lan, value="自动", interactive=True)
                                n_scale = gr.Slider(minimum=0, maximum=1.0, label="更改噪声比例(noise scale),以控制情感", value=0.6)
                                n_scale_w = gr.Slider(minimum=0, maximum=1.0, label="更改噪声偏差(noise scale w),以控制音素长短", value=0.668)
                                l_scale = gr.Slider(minimum=0.1, maximum=10, label="duration", value=1.1)
                            # Defaults index idols from the end, so at least
                            # three entries (two speakers) are required.
                            source_speaker = gr.Dropdown(choices=idols, value=idols[-2], label="source speaker")
                            target_speaker = gr.Dropdown(choices=idols, value=idols[-3], label="target speaker")
                        with gr.Column():
                            message_box = gr.Textbox(label="Message")
                            converted_audio = gr.Audio(label='converted audio')
                        btn = gr.Button("Convert!")
                        btn.click(vc_fn, inputs=[text,language,n_scale,n_scale_w,l_scale,source_speaker, target_speaker, record_audio, upload_audio],
                                  outputs=[message_box, converted_audio])
    app.launch()