english / app.py
ATForest's picture
Update app.py
4f9c46f
raw
history blame
8.86 kB
import gradio as gr
from textwrap import dedent
import edge_tts
import tempfile
from tts_voice import tts_order_voice
from english.translate import Translate
from english.split_text import sentence_split
from english.generator import generatorArticle
import random
import codecs
import torch
import librosa
from models import SynthesizerTrn
from scipy.io.wavfile import write
import utils
from mel_processing import mel_spectrogram_torch
from speaker_encoder.voice_encoder import SpeakerEncoder
from transformers import WavLMModel
# Voice table for edge-tts: its keys are offered in the speaker dropdown and
# the selected key's value is passed as the voice to edge_tts.Communicate.
language_dict = tts_order_voice
def parse_text(input):
    """Generate an English article for the given words and return it as text.

    Args:
        input: The word list entered by the user; it is passed unchanged to
            ``generatorArticle``.

    Returns:
        The string returned by ``generatorArticle`` with leading and trailing
        whitespace stripped.
    """
    # The result is displayed in plain Textbox components, so the text is
    # returned as-is: no Markdown/HTML escaping is applied to it.
    return generatorArticle(input).strip()
def predict(input):
    """Gradio handler that produces the article for the entered words.

    Args:
        input: The text of the word input box.

    Yields:
        A pair holding the same generated article twice, one value for each
        of the two output components the handler is wired to.
    """
    generated = parse_text(input)
    yield (generated, generated)
async def text_to_speech_edge(text, language_code):
    """Synthesize ``text`` with edge-tts and return the path of the MP3 file.

    Args:
        text: The text to speak.
        language_code: A key of ``language_dict`` selecting the voice.

    Returns:
        Path of a temporary ``.mp3`` file. The file is created with
        ``delete=False``, so it is not removed automatically.

    Raises:
        KeyError: If ``language_code`` is not a key of ``language_dict``.
    """
    communicate = edge_tts.Communicate(text, language_dict[language_code])

    # Only the unique file name is needed here; edge-tts writes the audio to
    # that path itself once the placeholder handle has been closed.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as handle:
        mp3_path = handle.name

    await communicate.save(mp3_path)
    return mp3_path
def tran_2_chianese(text):
    """Translate ``text`` sentence by sentence and interleave the results.

    Args:
        text: The text to translate.

    Returns:
        A string in which every sentence produced by ``sentence_split`` is
        followed by a newline, its translation from ``translateToZh`` and
        another newline.
    """
    translator = Translate()
    pieces = []
    for sentence in sentence_split(text):
        # Echo each sentence to the console before translating it.
        print('\n' + sentence)
        translated = translator.translateToZh(sentence)
        pieces.append(sentence + '\n' + translated + '\n')
    return ''.join(pieces)
def readWorldsFile(file_path, encoding='gb2312'):
    """Read a vocabulary file with one ``word|paraphrase`` entry per line.

    Args:
        file_path: Path of the word file.
        encoding: Text encoding of the file. Defaults to ``'gb2312'``.

    Returns:
        A tuple ``(worlds, paraphrase)`` of two equally long lists: the text
        before the first ``|`` of every entry and the text between the first
        and the second ``|`` (or the end of the line), both stripped of
        surrounding whitespace.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content does not match ``encoding``.
    """
    worlds, paraphrase = [], []
    # The context manager closes the file even if decoding fails midway.
    with codecs.open(file_path, 'r', encoding=encoding) as fp:
        for line in fp.readlines():
            tmp = line.split('|')
            if len(tmp) < 2:
                # Blank lines and lines without a separator carry no entry;
                # skip them instead of failing with an IndexError.
                continue
            worlds.append(tmp[0].strip())
            paraphrase.append(tmp[1].strip())
    return worlds, paraphrase
def generatorWorlds(file_path, count=15):
    """Draw random vocabulary entries from a word file and format them.

    Args:
        file_path: Path of the word file, read with ``readWorldsFile``.
        count: Number of entries to draw. Defaults to 15. Entries are drawn
            independently, so the same word can appear more than once.

    Returns:
        One line per drawn entry in the form ``word,【paraphrase】``, each
        terminated by a newline. An empty string is returned when the file
        contains no entries.
    """
    worlds, paraphrase = readWorldsFile(file_path)
    length = len(worlds)
    if length == 0:
        # Nothing to choose from; indexing an empty list would fail.
        return ''

    picked = []
    for _ in range(count):
        # randrange excludes the upper bound, so ``num`` is always a valid
        # index; randint(0, length) could return ``length`` itself.
        num = random.randrange(length)
        picked.append(f'{worlds[num]},【{paraphrase[num]}】\n')

    worlds_text = ''.join(picked)
    print('\n' + worlds_text)
    return worlds_text
def choose_word_from_file(input):
    """Gradio handler that returns random words taken from the given file.

    Args:
        input: The value of the file component; its ``orig_name`` attribute
            is used as the path handed to ``generatorWorlds``.

    Returns:
        The formatted word list produced by ``generatorWorlds``.
    """
    # NOTE(review): ``orig_name`` is presumably the name of the file as it was
    # uploaded rather than the path of the stored copy - confirm that this
    # path is readable from the working directory.
    source_name = input.orig_name
    return generatorWorlds(source_name)
# Module-level model setup: everything below runs once at import time and is
# shared by all calls to convert().

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Speaker encoder; convert() calls its embed_utterance() on the target voice.
smodel = SpeakerEncoder('speaker_encoder/ckpt/pretrained_bak_5805000.pt')
print("Loading FreeVC(24k)...")
# Hyper-parameters read from the FreeVC-24 config; convert() also takes the
# sampling rate and the mel-spectrogram settings from here.
hps = utils.get_hparams_from_file("configs/freevc-24.json")
# Build the synthesizer from the config values and move it to the device.
freevc_24 = SynthesizerTrn(
hps.data.filter_length // 2 + 1,
hps.train.segment_size // hps.data.hop_length,
**hps.model).to(device)
# Inference only: switch to eval mode (the return value is not needed).
_ = freevc_24.eval()
# Presumably restores the trained weights into freevc_24; the third argument
# looks like an optimizer slot that is not needed here - TODO confirm.
_ = utils.load_checkpoint("checkpoints/freevc-24.pth", freevc_24, None)
print("Loading WavLM for content...")
# WavLM model; convert() feeds the source audio through it and uses the last
# hidden state as input to the synthesizer.
cmodel = WavLMModel.from_pretrained("microsoft/wavlm-large").to(device)
def convert(model, src, tgt):
    """Re-voice the speech in ``src`` with the voice of the speaker in ``tgt``.

    Args:
        model: Model label. ``"FreeVC"`` and ``"FreeVC (24kHz)"`` condition
            the synthesizer on a speaker embedding of the target audio;
            ``"FreeVC-s"`` conditions it on the target's mel spectrogram.
        src: Path of the audio that provides the spoken content.
        tgt: Path of the audio that provides the target voice.

    Returns:
        Path of a newly created temporary ``.wav`` file holding the converted
        audio. Each call gets its own file, so queued or concurrent requests
        do not overwrite each other's result. The file is not removed
        automatically.
    """
    with torch.no_grad():
        # Target voice: load at the configured sampling rate and trim the
        # leading/trailing silence.
        wav_tgt, _ = librosa.load(tgt, sr=hps.data.sampling_rate)
        wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20)
        if model == "FreeVC" or model == "FreeVC (24kHz)":
            g_tgt = smodel.embed_utterance(wav_tgt)
            g_tgt = torch.from_numpy(g_tgt).unsqueeze(0).to(device)
        else:
            wav_tgt = torch.from_numpy(wav_tgt).unsqueeze(0).to(device)
            mel_tgt = mel_spectrogram_torch(
                wav_tgt,
                hps.data.filter_length,
                hps.data.n_mel_channels,
                hps.data.sampling_rate,
                hps.data.hop_length,
                hps.data.win_length,
                hps.data.mel_fmin,
                hps.data.mel_fmax
            )

        # Source speech: run it through WavLM and use the last hidden state
        # as the synthesizer input.
        wav_src, _ = librosa.load(src, sr=hps.data.sampling_rate)
        wav_src = torch.from_numpy(wav_src).unsqueeze(0).to(device)
        c = cmodel(wav_src).last_hidden_state.transpose(1, 2).to(device)

        # NOTE(review): only ``freevc_24`` is created in the visible
        # module-level code; ``freevc`` and ``freevc_s`` must be defined
        # elsewhere for the first two branches to work - confirm. The UI
        # dropdown is hidden and defaults to "FreeVC (24kHz)".
        if model == "FreeVC":
            audio = freevc.infer(c, g=g_tgt)
        elif model == "FreeVC-s":
            audio = freevc_s.infer(c, mel=mel_tgt)
        else:
            audio = freevc_24.infer(c, g=g_tgt)
        audio = audio[0][0].data.cpu().float().numpy()

    if model == "FreeVC" or model == "FreeVC-s":
        sample_rate = hps.data.sampling_rate
    else:
        sample_rate = 24000

    # Reserve a unique output path instead of a shared, fixed "out.wav".
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
        out = tmp_file.name
    write(out, sample_rate, audio)
    return out
# Gradio UI: layout plus event wiring for article generation, translation,
# text-to-speech and voice cloning.
with gr.Blocks(title="Learn English By AI", theme=gr.themes.Soft(text_size="sm")) as demo:
gr.HTML("<center>"
"<h1>OpenAI + 声音克隆:根据单词生成短文,帮助理解单词使用的语境!!</h1>"
"</center>")
with gr.Accordion("📒 相关信息", open=True):
# Usage notes rendered as Markdown below.
# NOTE(review): this is an f-string, so {'words'} is evaluated and shows up
# as the plain text words - confirm that this is the intended display.
_ = f"""OpenAI Prompt 的可选参数信息:
* 输入 10-15 个单词为宜
* prompt = '你是一个非常厉害的英语助手,请将'{'words'}'组成一篇英语文章,字数限制在100 字以内'
* Open AI 用的是限制账号,每分钟请求 3 次
* 单词文件:每个单词及解释单独成行,单词与注释同行,用 “|” 分割
"""
gr.Markdown(dedent(_))
with gr.Row():
# Word input: either upload a word file or type the words directly.
file = gr.File()
chooseBtn = gr.Button("从文件提取或输入 -》", variant="secondary")
user_input = gr.Textbox(
max_lines=5,
lines=3,
label="单词用逗号分割:",
placeholder="10-15 words will be better",
)
with gr.Column(scale=1):
submitBtn = gr.Button("开始生成英语短文", variant="primary")
# Holds the generated article; it is also the input of translation and TTS.
chatbot = gr.Textbox(label="英语短文:", lines = 5, max_lines=8)
# Fill the word box with random entries taken from the uploaded word file.
chooseBtn.click(
choose_word_from_file,
inputs=[file],
outputs=[user_input],
show_progress="full",
api_name="choose_word_from_file"
)
with gr.Column(scale=3):
with gr.Row():
tran_result = gr.Textbox(label="翻译结果", lines = 5,max_lines=8,scale=2)
tran_btn = gr.Button("翻译", variant="primary")
# Translate the generated article and show the result in tran_result.
tran_btn.click(
tran_2_chianese,
inputs=[chatbot],
outputs=[tran_result],
show_progress="full",
api_name="tran_2_chianese"
)
with gr.Column(min_width=32, scale=2):
with gr.Row():
with gr.Column():
language = gr.Dropdown(choices=list(language_dict.keys()), value="普通话 (中国大陆)-Xiaoxiao-女", label="请选择文本对应的语言及您喜欢的说话人")
tts_btn = gr.Button("生成对应的音频吧", variant="primary")
output_audio = gr.Audio(type="filepath", label="为您生成的音频", interactive=False)
# Speak the generated article with the selected edge-tts voice.
tts_btn.click(text_to_speech_edge, inputs=[chatbot, language], outputs=[output_audio])
with gr.Row():
# Hidden model selector; convert() receives its value as the model label.
model_choice = gr.Dropdown(choices=["FreeVC", "FreeVC-s", "FreeVC (24kHz)"], value="FreeVC (24kHz)", label="Model", visible=False)
# The TTS output is reused as the source audio for voice cloning.
audio1 = output_audio
audio2 = gr.Audio(label="请上传您喜欢的声音进行声音克隆", type='filepath')
clone_btn = gr.Button("开始AI声音克隆吧", variant="primary")
audio_cloned = gr.Audio(label="为您生成的专属声音克隆音频", type='filepath')
clone_btn.click(convert, inputs=[model_choice, audio1, audio2], outputs=[audio_cloned])
# Submitting the text box and clicking the button both run predict(), which
# writes the article into both chatbot and tran_result.
user_input.submit(
predict,
[user_input],
[chatbot,tran_result],
show_progress="full",
)
submitBtn.click(
predict,
[user_input],
[chatbot,tran_result],
show_progress="full",
api_name="predict",
)
# submitBtn.click(reset_user_input, [], [user_input])
# Enable the request queue and start the app; this runs at import time.
demo.queue().launch(show_error=True, debug=True)