Spaces:

HKUST-Audio
/

Llasa-1B-multi-speakers-genshin-zh-en-ja-ko

Running on Zero

App Files Files Community

Llasa-1B-multi-speakers-genshin-zh-en-ja-ko / app.py

HKUST-Audio

Update app.py

c6c1f65 verified 14 days ago

raw

history blame

6.4 kB

	import spaces
	import gradio as gr
	import torch
	import soundfile as sf
	from transformers import AutoTokenizer, AutoModelForCausalLM
	from xcodec2.modeling_xcodec2 import XCodec2Model
	import tempfile

	# Prefer the GPU whenever torch reports one; otherwise fall back to CPU.
	if torch.cuda.is_available():
	    device = "cuda"
	else:
	    device = "cpu"

	####################
	# Load the models once, at module import time
	####################
	# NOTE: the variable is called ``llasa_3b`` but the checkpoint it names is the 1B model.
	llasa_3b = "HKUSTAudio/Llasa-1B-multi-speakers-genshin-zh-en-ja-ko"
	print("Loading tokenizer & model ...")
	tokenizer = AutoTokenizer.from_pretrained(llasa_3b)
	model = AutoModelForCausalLM.from_pretrained(llasa_3b)
	# Switch to inference mode, then move the weights to the selected device.
	model.eval()
	model.to(device)

	print("Loading XCodec2Model ...")
	codec_model_path = "HKUSTAudio/xcodec2"
	Codec_model = XCodec2Model.from_pretrained(codec_model_path)
	Codec_model.eval()
	Codec_model.to(device)

	print("Models loaded.")

	####################
	# Inference helper functions
	####################
	def extract_speech_ids(speech_tokens_str):
	    """Convert speech-token strings such as ``<|s_23456|>`` into ints (``23456``).

	    Args:
	        speech_tokens_str: Iterable of token strings, one token per item.

	    Returns:
	        A list with the integer id of every well-formed speech token, in
	        input order. Items that are not of the form ``<|s_<digits>|>`` are
	        reported on stdout as "Unexpected token" and skipped.
	    """
	    # Plain "<|s_" / "|>" markers: the previous literals carried stray
	    # backslashes ("\|"), which never match real tokens and did not agree
	    # with the 4-character / 2-character slice offsets.
	    prefix = "<|s_"
	    suffix = "|>"

	    speech_ids = []
	    for token_str in speech_tokens_str:
	        is_wrapped = token_str.startswith(prefix) and token_str.endswith(suffix)
	        num_str = token_str[len(prefix):-len(suffix)] if is_wrapped else ""
	        # Require plain ASCII digits so a malformed payload (empty, letters,
	        # signs) is reported instead of making int() raise.
	        if num_str.isascii() and num_str.isdigit():
	            speech_ids.append(int(num_str))
	        else:
	            print(f"Unexpected token: {token_str}")
	    return speech_ids
	@spaces.GPU
	def text2speech(input_text, speaker_choice):
	    """Synthesize ``input_text`` as speech and return the path of a WAV file.

	    Args:
	        input_text: Text to convert to speech.
	        speaker_choice: Speaker name; it is inserted into the assistant
	            prompt prefix as ``Speaker <name>``.

	    Returns:
	        Path of a temporary ``.wav`` file containing the generated audio.
	        The file is created with ``delete=False`` and is not removed here.

	    Raises:
	        gr.Error: If the text is blank, or if generation produced no usable
	            speech tokens (so there is nothing to decode).
	    """
	    if not input_text or not input_text.strip():
	        raise gr.Error("Please enter some text to synthesize.")

	    with torch.no_grad():
	        # Wrap the text in the text-understanding marker tokens. The markers
	        # are plain "<|...|>" strings; the earlier "\|" escapes were not
	        # part of the token names.
	        formatted_text = f"<|TEXT_UNDERSTANDING_START|>{input_text}<|TEXT_UNDERSTANDING_END|>"
	        chat = [
	            {"role": "user", "content": "Convert the text to speech:" + formatted_text},
	            {"role": "assistant", "content": f"Speaker {speaker_choice} <|SPEECH_GENERATION_START|>"},
	        ]

	        # continue_final_message=True keeps the assistant turn open so that
	        # generation continues right after the speech-start marker.
	        input_ids = tokenizer.apply_chat_template(
	            chat,
	            tokenize=True,
	            return_tensors="pt",
	            continue_final_message=True,
	        ).to(device)

	        # Token id that terminates speech generation.
	        speech_end_id = tokenizer.convert_tokens_to_ids("<|SPEECH_GENERATION_END|>")

	        outputs = model.generate(
	            input_ids,
	            max_length=2048,  # We trained our model with a max length of 2048
	            eos_token_id=speech_end_id,
	            do_sample=True,
	            top_p=0.95,  # Adjusts the diversity of generated content
	            temperature=0.9,  # Controls randomness in output
	            repetition_penalty=1.2,
	        )

	        # Keep only the newly generated tokens (drop the prompt part).
	        generated_ids = outputs[0][input_ids.shape[1]:]
	        # Strip the end marker only when it is really there: if generation
	        # stopped at max_length, the last token is a genuine speech token.
	        if generated_ids.numel() > 0 and generated_ids[-1].item() == speech_end_id:
	            generated_ids = generated_ids[:-1]
	        speech_tokens_str = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

	        # Turn "<|s_23456|>" strings into [23456, ...].
	        speech_tokens_int = extract_speech_ids(speech_tokens_str)
	        if not speech_tokens_int:
	            # An empty list would become an empty float tensor and fail
	            # inside the codec with an unhelpful message.
	            raise gr.Error("No speech tokens were generated. Please try again.")
	        speech_codes = torch.tensor(speech_tokens_int, dtype=torch.long, device=device)
	        speech_codes = speech_codes.unsqueeze(0).unsqueeze(0)

	        # Decode the codes into a waveform with XCodec2Model.
	        gen_wav = Codec_model.decode_code(speech_codes)  # [batch, channels, samples]

	        # First batch item, first channel, as a NumPy array on the CPU.
	        audio = gen_wav[0, 0, :].cpu().numpy()

	    # Sample rate used for the output file (presumably the codec's native
	    # rate -- value kept from the original code).
	    sample_rate = 16000

	    # Reserve a temp-file name, close the handle, then let soundfile write
	    # to the path so the file is not written while another handle is open.
	    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmpfile:
	        audio_path = tmpfile.name
	    sf.write(audio_path, audio, sample_rate)

	    return audio_path

	####################
	# Gradio interface
	####################
	# Speaker names offered in the dropdown; the selected name is passed to
	# text2speech as ``speaker_choice``.
	speaker_choices = [
	    "Paimon", "Traveler", "Nahida", "Navia",
	    "Furina", "Lyney", "Layla", "Neuvillette",
	    "Kaveh", "Tighnari", "Alhaitham", "Kaeya",
	    "Dehya", "Zhongli", "Cyno", "Yoimiya",
	    "Ningguang", "Nilou", "Faruzan", "Wriothesley",
	    "Collei", "Thoma", "Noelle", "Venti",
	    "Lynette", "Charlotte", "Diona", "Yelan",
	    "Clorinde", "Sigewinne", "Beidou", "Gorou",
	    "Lisa", "Yanfei", "Sucrose", "Sayu",
	    "Ganyu", "Chiori", "Chongyun", "Freminet",
	    "Barbara", "Baizhu", "Kirara", "Dainsleif",
	    "Klee", "Albedo", "Dori", "Eula",
	    "Xiao", "Mona", "Bennett", "Amber",
	    "Xingqiu", "Shenhe", "Childe", "Xiangling",
	    "Jean", "Diluc", "Katheryne", "Mika",
	    "Keqing", "Candace",
	]
	#["puck", "kore"]

	# Gradio UI: a text box plus a speaker dropdown feed text2speech, and the
	# returned file path is played back by the audio component.
	demo = gr.Interface(
	    fn=text2speech,
	    inputs=[
	        gr.Textbox(label="Enter text", lines=5),
	        gr.Dropdown(choices=speaker_choices, label="Select Speaker", value="Paimon"),
	    ],
	    outputs=gr.Audio(label="Generated Audio", type="filepath"),
	    title="Llasa-1B TTS finetuned using simon3000/genshin-voice",
	    # Speaker legend shown under the title. Lyney and Collei previously
	    # repeated the Chinese names of Layla and Klee; each speaker now has
	    # its own name.
	    description=(
	        "Input a piece of text (Chinese, English, Japanese, Korean), select a speaker, "
	        "and click to generate speech. If fail, try a few more times\n"
	        "Speakers (角色选择):\n"
	        "'Paimon（派蒙）', 'Traveler（旅行者）', 'Nahida（纳西妲）', 'Navia（纳维亚）', 'Furina（芙宁娜）', "
	        "'Lyney（林尼）', 'Layla（莱依拉）', 'Neuvillette（诺维利特）', 'Kaveh（卡维赫）', 'Tighnari（提纳里）', "
	        "'Alhaitham（艾尔海森）', 'Kaeya（凯亚）', 'Dehya（迪希雅）', 'Zhongli（钟离）', 'Cyno（赛诺）', 'Yoimiya（宵宫）', "
	        "'Ningguang（凝光）', 'Nilou（妮露）', 'Faruzan（法露珊）', 'Wriothesley（维欧塞利）', 'Collei（柯莱）', 'Thoma（托马）', "
	        "'Noelle（诺艾尔）', 'Venti（温迪）', 'Lynette（莉妮特）', 'Charlotte（夏洛特）', 'Diona（迪奥娜）', 'Yelan（夜兰）', "
	        "'Clorinde（克洛琳德）', 'Sigewinne（希格温）', 'Beidou（北斗）', 'Gorou（五郎）', 'Lisa（丽莎）', 'Yanfei（烟绯）', "
	        "'Sucrose（砂糖）', 'Sayu（早柚）', 'Ganyu（甘雨）', 'Chiori（千里）', 'Chongyun（重云）', 'Freminet（弗雷明内）', "
	        "'Barbara（芭芭拉）', 'Baizhu（白术）', 'Kirara（切尔拉）', 'Dainsleif（戴因斯雷布）', 'Klee（可莉）', 'Albedo（阿贝多）', "
	        "'Dori（多莉）', 'Eula（优菈）', 'Xiao（魈）', 'Mona（莫娜）', 'Bennett（班尼特）', 'Amber（安柏）', 'Xingqiu（行秋）', "
	        "'Shenhe（申鹤）', 'Childe（公子）', 'Xiangling（香菱）', 'Jean（琴）', 'Diluc（迪卢克）', 'Katheryne（凯瑟琳）', "
	        "'Mika（米卡）', 'Keqing（刻晴）', 'Candace（坎蒂丝）'"
	    ),
	)

	if __name__ == "__main__":
	    # Start the Gradio app only when run as a script; share=True is passed
	    # through to launch() unchanged.
	    demo.launch(share=True)