# coding=utf-8
import os
import librosa
import base64
import io
import gradio as gr
import re
import numpy as np
import torch
import torchaudio

from modelscope import HubApi

api = HubApi()
key = os.environ["apikey"] if "apikey" in os.environ else ""
try:
    api.login(key)
except Exception:
    pass

from funasr import AutoModel

model = "iic/SenseVoiceSmall"
model = AutoModel(
    model=model,
    vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
    vad_kwargs={"max_single_segment_time": 30000},
    trust_remote_code=True,
)

# Mappings from SenseVoice special tokens to display emojis.
emo_dict = {
    "<|HAPPY|>": "๐Ÿ˜Š",
    "<|SAD|>": "๐Ÿ˜”",
    "<|ANGRY|>": "๐Ÿ˜ก",
    "<|NEUTRAL|>": "",
    "<|FEARFUL|>": "๐Ÿ˜ฐ",
    "<|DISGUSTED|>": "๐Ÿคข",
    "<|SURPRISED|>": "๐Ÿ˜ฎ",
}

event_dict = {
    "<|BGM|>": "๐ŸŽผ",
    "<|Speech|>": "",
    "<|Applause|>": "๐Ÿ‘",
    "<|Laughter|>": "๐Ÿ˜€",
    "<|Cry|>": "๐Ÿ˜ญ",
    "<|Sneeze|>": "๐Ÿคง",
    "<|Breath|>": "",
    "<|Cough|>": "๐Ÿคง",
}

emoji_dict = {
    "<|nospeech|><|Event_UNK|>": "โ“",
    "<|zh|>": "",
    "<|en|>": "",
    "<|yue|>": "",
    "<|ja|>": "",
    "<|ko|>": "",
    "<|nospeech|>": "",
    "<|HAPPY|>": "๐Ÿ˜Š",
    "<|SAD|>": "๐Ÿ˜”",
    "<|ANGRY|>": "๐Ÿ˜ก",
    "<|NEUTRAL|>": "",
    "<|BGM|>": "๐ŸŽผ",
    "<|Speech|>": "",
    "<|Applause|>": "๐Ÿ‘",
    "<|Laughter|>": "๐Ÿ˜€",
    "<|FEARFUL|>": "๐Ÿ˜ฐ",
    "<|DISGUSTED|>": "๐Ÿคข",
    "<|SURPRISED|>": "๐Ÿ˜ฎ",
    "<|Cry|>": "๐Ÿ˜ญ",
    "<|EMO_UNKNOWN|>": "",
    "<|Sneeze|>": "๐Ÿคง",
    "<|Breath|>": "",
    "<|Cough|>": "๐Ÿ˜ท",
    "<|Sing|>": "",
    "<|Speech_Noise|>": "",
    "<|withitn|>": "",
    "<|woitn|>": "",
    "<|GBG|>": "",
    "<|Event_UNK|>": "",
}

lang_dict = {
    "<|zh|>": "<|lang|>",
    "<|en|>": "<|lang|>",
    "<|yue|>": "<|lang|>",
    "<|ja|>": "<|lang|>",
    "<|ko|>": "<|lang|>",
    "<|nospeech|>": "<|lang|>",
}

emo_set = {"๐Ÿ˜Š", "๐Ÿ˜”", "๐Ÿ˜ก", "๐Ÿ˜ฐ", "๐Ÿคข", "๐Ÿ˜ฎ"}
event_set = {"๐ŸŽผ", "๐Ÿ‘", "๐Ÿ˜€", "๐Ÿ˜ญ", "๐Ÿคง", "๐Ÿ˜ท"}

notes = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]


def format_str(s):
    # Replace every special token with its emoji (or drop it if it maps to "").
    for sptk in emoji_dict:
        s = s.replace(sptk, emoji_dict[sptk])
    return s


def format_str_v2(s):
    # Count and strip all special tokens, then re-attach detected audio events as prefixes
    # and the dominant emotion as an emoji suffix.
    sptk_dict = {}
    for sptk in emoji_dict:
        sptk_dict[sptk] = s.count(sptk)
        s = s.replace(sptk, "")
    emo = "<|NEUTRAL|>"
    for e in emo_dict:
        if sptk_dict[e] > sptk_dict[emo]:
            emo = e
    for e in event_dict:
        if sptk_dict[e] > 0:
            s = event_dict[e] + s
    s = s + emo_dict[emo]
    for emoji in emo_set.union(event_set):
        s = s.replace(" " + emoji, emoji)
        s = s.replace(emoji + " ", emoji)
    return s.strip()


def format_str_v3(s):
    # Format each language segment separately, then merge adjacent segments that share the
    # same event prefix or emotion suffix.
    def get_emo(s):
        return s[-1] if s[-1] in emo_set else None

    def get_event(s):
        return s[0] if s[0] in event_set else None

    s = s.replace("<|nospeech|><|Event_UNK|>", "โ“")
    for lang in lang_dict:
        s = s.replace(lang, "<|lang|>")
    s_list = [format_str_v2(s_i).strip(" ") for s_i in s.split("<|lang|>")]
    new_s = " " + s_list[0]
    cur_ent_event = get_event(new_s)
    for i in range(1, len(s_list)):
        if len(s_list[i]) == 0:
            continue
        if get_event(s_list[i]) == cur_ent_event and get_event(s_list[i]) is not None:
            s_list[i] = s_list[i][1:]
        # else:
        cur_ent_event = get_event(s_list[i])
        if get_emo(s_list[i]) is not None and get_emo(s_list[i]) == get_emo(new_s):
            new_s = new_s[:-1]
        new_s += s_list[i].strip().lstrip()
    new_s = new_s.replace("The.", " ")
    return new_s.strip()
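
# Illustrative sketch (not part of the original script): the tagged string below is a
# hypothetical example of SenseVoice's rich-transcription output. format_str_v3 drops the
# language tag, maps event tags to emoji prefixes, and appends an emoji for the dominant
# emotion tag.
def _format_demo():
    sample = "<|en|><|HAPPY|><|Speech|><|withitn|>Hello there."  # hypothetical model output
    return format_str_v3(sample)  # -> "Hello there.๐Ÿ˜Š"
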
def model_inference(input_wav, language, fs=16000):
    # task_abbr = {"Speech Recognition": "ASR", "Rich Text Transcription": ("ASR", "AED", "SER")}
    language_abbr = {"auto": "auto", "zh": "zh", "en": "en", "yue": "yue", "ja": "ja",
                     "ko": "ko", "nospeech": "nospeech"}

    # task = "Speech Recognition" if task is None else task
    language = "auto" if len(language) < 1 else language
    selected_language = language_abbr[language]
    # selected_task = task_abbr.get(task)

    # print(f"input_wav: {type(input_wav)}, {input_wav[1].shape}, {input_wav}")

    if isinstance(input_wav, tuple):
        # Gradio microphone/upload input arrives as (sample_rate, int16 numpy array).
        fs, input_wav = input_wav
        input_wav = input_wav.astype(np.float32) / np.iinfo(np.int16).max
        if len(input_wav.shape) > 1:
            input_wav = input_wav.mean(-1)
        if fs != 16000:
            print(f"audio_fs: {fs}")
            resampler = torchaudio.transforms.Resample(fs, 16000)
            input_wav_t = torch.from_numpy(input_wav).to(torch.float32)
            input_wav = resampler(input_wav_t[None, :])[0, :].numpy()

    merge_vad = True
    print(f"language: {language}, merge_vad: {merge_vad}")
    text = model.generate(
        input=input_wav,
        cache={},
        language=language,
        use_itn=True,
        batch_size_s=300,
        merge_vad=merge_vad,
    )
    print(text)
    text = text[0]["text"]
    text = format_str_v3(text)
    print(text)
    return text


audio_examples = [
    ["example/inspiremusic/inspiremusic_01.wav", "text-to-music"],
    ["example/inspiremusic/inspiremusic_noflow_01.wav", "text-to-music"],
    ["example/inspiremusic/inspiremusic_w_cfm_intro.wav", "text-to-music"],
    ["example/inspiremusic/inspiremusic_w_cfm_verse.wav", "text-to-music"],
    ["example/inspiremusic/inspiremusic_w_cfm_chorus.wav", "text-to-music"],
    ["example/inspiremusic/inspiremusic_w_cfm_outro.wav", "text-to-music"],
    ["example/inspiremusic/inspiremusic_w_cfm_verse_ras.wav", "text-to-music"],
    ["example/inspiremusic/inspiremusic_wo_cfm_verse_topk.wav", "text-to-music"],
    ["example/ras/chorus/chorus_01.wav", "music-continuation"],
    ["example/ras/chorus/chorus_02.wav", "music-continuation"],
    ["example/ras/chorus/chorus_03.wav", "music-continuation"],
    ["example/ras/chorus/chorus_04.wav", "music-continuation"],
    ["example/ras/chorus/chorus_05.wav", "music-continuation"],
]

description = """
# InspireMusic: a music generation model with text-to-music and music continuation capabilities.
## Usage
### Enter a text description of the music and click Submit to generate music.

*Example Texts*
- `Experience soothing and sensual instrumental jazz with a touch of Bossa Nova, perfect for a relaxing restaurant or spa ambiance.`
- `The instrumental rock piece features a prominent bass guitar, delivering a pure and energetic sound.`
- `A serene blend of instrumental and light pop, featuring soothing melodies and a gentle, soulful keyboard performance.`

The recommended audio duration is under 30 seconds. For audio longer than 30 seconds, local deployment via the GitHub repo is recommended.
"""

html_content = """

<h1>Music Generation Model: InspireMusic</h1>

<p>InspireMusic is a unified framework for music, song, and audio generation that couples audio tokenization and detokenization with an autoregressive transformer. The toolkit provides both inference and training code for music generation. The framework combines an autoregressive Transformer with conditional flow matching (CFM) and neural audio tokenizers, enabling controllable generation of music, songs, and audio from both textual and structural music conditioning. The toolkit currently supports text-to-music generation, with text-to-song and text-to-audio generation planned for the future.</p>

<h2>Usage</h2>

<p>Input a text description of the music, or provide input through the microphone, then select the chorus and duration. The music is generated based on the input text; chorus labels are placed at the front of the text.</p>

<p>The recommended audio duration is under 30 seconds. For audio longer than 30 seconds, local deployment is recommended.</p>

<h2>Repo &amp; Demo</h2>

<p>Code</p>

<p>Demo</p>

<h2>Models</h2>

<p>ModelScope Model</p>

<p>Hugging Face Model</p>
""" # ่‡ชๅฎšไน‰่กจๆ ผ็š„ HTML ๅ’Œ CSS ไปฃ็  centered_table_html = """
<table style="margin: 0 auto; text-align: center;">
  <tr>
    <th>Samples</th>
    <th>InspireMusic Text-to-Music</th>
  </tr>
  <tr>
    <td>normal mode</td>
    <td>Experience soothing and sensual instrumental jazz with a touch of Bossa Nova, perfect for a relaxing restaurant or spa ambiance.</td>
  </tr>
  <tr>
    <td>fast mode</td>
    <td>The instrumental piece exudes a playful and whimsical atmosphere, likely featuring lively and rhythmic elements. The music seems to be inspired by nature and animals, creating an engaging and light-hearted experience.</td>
  </tr>
</table>
""" def launch(): with gr.Blocks(theme=gr.themes.Soft()) as demo: # gr.Markdown(description) gr.HTML(html_content) with gr.Column(): with gr.Row(): with gr.Column(): text_inputs = gr.Textbox( label="Input Text", placeholder="Enter the text you want to generate music, e.g., Experience soothing and sensual instrumental jazz with a touch of Bossa Nova, perfect for a relaxing restaurant or spa ambiance.", lines=3 ) fn_button = gr.Button("Start", variant="primary") audio_inputs = gr.Audio( label="Upload prompt audio", ) with gr.Column(): with gr.Accordion("Configuration"): # task_inputs = gr.Radio(choices=["Speech Recognition", "Rich Text Transcription"], # value="Speech Recognition", label="Task") task_inputs = gr.Dropdown(choices=["text-to-music", "music-continuation"], value="text-to-music", label="Task") inference_mode_inputs = gr.Dropdown(choices=["normal", "fast"], value="normal", label="Inference Mode") cfg_input = gr.Slider(3, 10, step=1, label="CFG value") audio_length = gr.Textbox(value="30", label="Duration in seconds") gr.Examples(examples=audio_examples, inputs=[text_inputs, audio_inputs, task_inputs], examples_per_page=5) audio_output = gr.Audio(label="Audio Output") fn_button.click(model_inference, inputs=[text_inputs, audio_inputs, task_inputs], outputs=audio_output) # with gr.Accordion("More examples"): # gr.HTML(centered_table_html) demo.launch() if __name__ == "__main__": # iface.launch() launch()