# coding=utf-8
import os
import librosa
import base64
import io
import gradio as gr
import re
import numpy as np
import torch
import torchaudio

from modelscope import HubApi

api = HubApi()
key = os.environ["apikey"] if "apikey" in os.environ else ""
try:
    api.login(key)
except Exception:
    pass

from funasr import AutoModel

model = "iic/SenseVoiceSmall"
model = AutoModel(
    model=model,
    vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
    vad_kwargs={"max_single_segment_time": 30000},
    trust_remote_code=True,
)

# Mappings from SenseVoice special tokens to display emojis.
emo_dict = {
    "<|HAPPY|>": "๐Ÿ˜Š",
    "<|SAD|>": "๐Ÿ˜”",
    "<|ANGRY|>": "๐Ÿ˜ก",
    "<|NEUTRAL|>": "",
    "<|FEARFUL|>": "๐Ÿ˜ฐ",
    "<|DISGUSTED|>": "๐Ÿคข",
    "<|SURPRISED|>": "๐Ÿ˜ฎ",
}

event_dict = {
    "<|BGM|>": "๐ŸŽผ",
    "<|Speech|>": "",
    "<|Applause|>": "๐Ÿ‘",
    "<|Laughter|>": "๐Ÿ˜€",
    "<|Cry|>": "๐Ÿ˜ญ",
    "<|Sneeze|>": "๐Ÿคง",
    "<|Breath|>": "",
    "<|Cough|>": "๐Ÿคง",
}

emoji_dict = {
    "<|nospeech|><|Event_UNK|>": "โ“",
    "<|zh|>": "",
    "<|en|>": "",
    "<|yue|>": "",
    "<|ja|>": "",
    "<|ko|>": "",
    "<|nospeech|>": "",
    "<|HAPPY|>": "๐Ÿ˜Š",
    "<|SAD|>": "๐Ÿ˜”",
    "<|ANGRY|>": "๐Ÿ˜ก",
    "<|NEUTRAL|>": "",
    "<|BGM|>": "๐ŸŽผ",
    "<|Speech|>": "",
    "<|Applause|>": "๐Ÿ‘",
    "<|Laughter|>": "๐Ÿ˜€",
    "<|FEARFUL|>": "๐Ÿ˜ฐ",
    "<|DISGUSTED|>": "๐Ÿคข",
    "<|SURPRISED|>": "๐Ÿ˜ฎ",
    "<|Cry|>": "๐Ÿ˜ญ",
    "<|EMO_UNKNOWN|>": "",
    "<|Sneeze|>": "๐Ÿคง",
    "<|Breath|>": "",
    "<|Cough|>": "๐Ÿ˜ท",
    "<|Sing|>": "",
    "<|Speech_Noise|>": "",
    "<|withitn|>": "",
    "<|woitn|>": "",
    "<|GBG|>": "",
    "<|Event_UNK|>": "",
}

lang_dict = {
    "<|zh|>": "<|lang|>",
    "<|en|>": "<|lang|>",
    "<|yue|>": "<|lang|>",
    "<|ja|>": "<|lang|>",
    "<|ko|>": "<|lang|>",
    "<|nospeech|>": "<|lang|>",
}

emo_set = {"๐Ÿ˜Š", "๐Ÿ˜”", "๐Ÿ˜ก", "๐Ÿ˜ฐ", "๐Ÿคข", "๐Ÿ˜ฎ"}
event_set = {"๐ŸŽผ", "๐Ÿ‘", "๐Ÿ˜€", "๐Ÿ˜ญ", "๐Ÿคง", "๐Ÿ˜ท"}

notes = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]


def format_str(s):
    # Replace every special token with its emoji (or drop it if it maps to "").
    for sptk in emoji_dict:
        s = s.replace(sptk, emoji_dict[sptk])
    return s


def format_str_v2(s):
    # Count and strip all special tokens, then re-attach detected audio events as prefixes
    # and the dominant emotion as an emoji suffix.
    sptk_dict = {}
    for sptk in emoji_dict:
        sptk_dict[sptk] = s.count(sptk)
        s = s.replace(sptk, "")
    emo = "<|NEUTRAL|>"
    for e in emo_dict:
        if sptk_dict[e] > sptk_dict[emo]:
            emo = e
    for e in event_dict:
        if sptk_dict[e] > 0:
            s = event_dict[e] + s
    s = s + emo_dict[emo]
    for emoji in emo_set.union(event_set):
        s = s.replace(" " + emoji, emoji)
        s = s.replace(emoji + " ", emoji)
    return s.strip()


def format_str_v3(s):
    # Format each language segment separately, then merge adjacent segments that share the
    # same event prefix or emotion suffix.
    def get_emo(s):
        return s[-1] if s[-1] in emo_set else None

    def get_event(s):
        return s[0] if s[0] in event_set else None

    s = s.replace("<|nospeech|><|Event_UNK|>", "โ“")
    for lang in lang_dict:
        s = s.replace(lang, "<|lang|>")
    s_list = [format_str_v2(s_i).strip(" ") for s_i in s.split("<|lang|>")]
    new_s = " " + s_list[0]
    cur_ent_event = get_event(new_s)
    for i in range(1, len(s_list)):
        if len(s_list[i]) == 0:
            continue
        if get_event(s_list[i]) == cur_ent_event and get_event(s_list[i]) is not None:
            s_list[i] = s_list[i][1:]
        # else:
        cur_ent_event = get_event(s_list[i])
        if get_emo(s_list[i]) is not None and get_emo(s_list[i]) == get_emo(new_s):
            new_s = new_s[:-1]
        new_s += s_list[i].strip().lstrip()
    new_s = new_s.replace("The.", " ")
    return new_s.strip()
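
# Illustrative sketch (not part of the original script): the tagged string below is a
# hypothetical example of SenseVoice's rich-transcription output. format_str_v3 drops the
# language tag, maps event tags to emoji prefixes, and appends an emoji for the dominant
# emotion tag.
def _format_demo():
    sample = "<|en|><|HAPPY|><|Speech|><|withitn|>Hello there."  # hypothetical model output
    return format_str_v3(sample)  # -> "Hello there.๐Ÿ˜Š"
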
def model_inference(input_wav, language, fs=16000):
    # task_abbr = {"Speech Recognition": "ASR", "Rich Text Transcription": ("ASR", "AED", "SER")}
    language_abbr = {"auto": "auto", "zh": "zh", "en": "en", "yue": "yue", "ja": "ja",
                     "ko": "ko", "nospeech": "nospeech"}

    # task = "Speech Recognition" if task is None else task
    language = "auto" if len(language) < 1 else language
    selected_language = language_abbr[language]
    # selected_task = task_abbr.get(task)

    # print(f"input_wav: {type(input_wav)}, {input_wav[1].shape}, {input_wav}")

    if isinstance(input_wav, tuple):
        # Gradio microphone/upload input arrives as (sample_rate, int16 numpy array).
        fs, input_wav = input_wav
        input_wav = input_wav.astype(np.float32) / np.iinfo(np.int16).max
        if len(input_wav.shape) > 1:
            input_wav = input_wav.mean(-1)
        if fs != 16000:
            print(f"audio_fs: {fs}")
            resampler = torchaudio.transforms.Resample(fs, 16000)
            input_wav_t = torch.from_numpy(input_wav).to(torch.float32)
            input_wav = resampler(input_wav_t[None, :])[0, :].numpy()

    merge_vad = True
    print(f"language: {language}, merge_vad: {merge_vad}")
    text = model.generate(
        input=input_wav,
        cache={},
        language=language,
        use_itn=True,
        batch_size_s=300,
        merge_vad=merge_vad,
    )
    print(text)
    text = text[0]["text"]
    text = format_str_v3(text)
    print(text)
    return text


audio_examples = [
    ["example/inspiremusic/inspiremusic_01.wav", "text-to-music"],
    ["example/inspiremusic/inspiremusic_noflow_01.wav", "text-to-music"],
    ["example/inspiremusic/inspiremusic_w_cfm_intro.wav", "text-to-music"],
    ["example/inspiremusic/inspiremusic_w_cfm_verse.wav", "text-to-music"],
    ["example/inspiremusic/inspiremusic_w_cfm_chorus.wav", "text-to-music"],
    ["example/inspiremusic/inspiremusic_w_cfm_outro.wav", "text-to-music"],
    ["example/inspiremusic/inspiremusic_w_cfm_verse_ras.wav", "text-to-music"],
    ["example/inspiremusic/inspiremusic_wo_cfm_verse_topk.wav", "text-to-music"],
    ["example/ras/chorus/chorus_01.wav", "music-continuation"],
    ["example/ras/chorus/chorus_02.wav", "music-continuation"],
    ["example/ras/chorus/chorus_03.wav", "music-continuation"],
    ["example/ras/chorus/chorus_04.wav", "music-continuation"],
    ["example/ras/chorus/chorus_05.wav", "music-continuation"],
]

description = """
# InspireMusic: a music generation model with text-to-music and music continuation capabilities.
## Usage
### Enter a text description of the music and click Submit to generate music.

*Example Texts*
- `Experience soothing and sensual instrumental jazz with a touch of Bossa Nova, perfect for a relaxing restaurant or spa ambiance.`
- `The instrumental rock piece features a prominent bass guitar, delivering a pure and energetic sound.`
- `A serene blend of instrumental and light pop, featuring soothing melodies and a gentle, soulful keyboard performance.`

The recommended audio duration is under 30 seconds. For audio longer than 30 seconds, local deployment via the GitHub repo is recommended.
"""

html_content = """

<h1>Music Generation Model: InspireMusic</h1>

<p>InspireMusic is a unified framework for music, song, and audio generation that couples audio tokenization and detokenization with an autoregressive transformer. The toolkit provides both inference and training code for music generation. The framework combines an autoregressive Transformer with conditional flow matching (CFM) and neural audio tokenizers, enabling controllable generation of music, songs, and audio from both textual and structural music conditioning. The toolkit currently supports text-to-music generation, with text-to-song and text-to-audio generation planned for the future.</p>

<h2>Usage</h2>

<p>Input a text description of the music, or provide input through the microphone, then select the chorus and duration. The music is generated based on the input text; chorus labels are placed at the front of the text.</p>

<p>The recommended audio duration is under 30 seconds. For audio longer than 30 seconds, local deployment is recommended.</p>

<h2>Repo &amp; Demo</h2>

<p>Code</p>

<p>Demo</p>

<h2>Models</h2>

<p>ModelScope Model</p>

<p>Hugging Face Model</p>
""" # ่‡ชๅฎšไน‰่กจๆ ผ็š„ HTML ๅ’Œ CSS ไปฃ็  centered_table_html = """
<table style="margin: 0 auto; text-align: center;">
  <tr>
    <th>Samples</th>
    <th>InspireMusic Text-to-Music</th>
  </tr>
  <tr>
    <td>normal mode</td>
    <td>Experience soothing and sensual instrumental jazz with a touch of Bossa Nova, perfect for a relaxing restaurant or spa ambiance.</td>
  </tr>
  <tr>
    <td>fast mode</td>
    <td>The instrumental piece exudes a playful and whimsical atmosphere, likely featuring lively and rhythmic elements. The music seems to be inspired by nature and animals, creating an engaging and light-hearted experience.</td>
  </tr>
</table>
""" def launch(): with gr.Blocks(theme=gr.themes.Soft()) as demo: # gr.Markdown(description) gr.HTML(html_content) with gr.Column(): with gr.Row(): with gr.Column(): text_inputs = gr.Textbox( label="Input Text", placeholder="Enter the text you want to generate music, e.g., Experience soothing and sensual instrumental jazz with a touch of Bossa Nova, perfect for a relaxing restaurant or spa ambiance.", lines=3 ) fn_button = gr.Button("Start", variant="primary") audio_inputs = gr.Audio( label="Upload prompt audio", ) with gr.Column(): with gr.Accordion("Configuration"): # task_inputs = gr.Radio(choices=["Speech Recognition", "Rich Text Transcription"], # value="Speech Recognition", label="Task") task_inputs = gr.Dropdown(choices=["text-to-music", "music-continuation"], value="text-to-music", label="Task") inference_mode_inputs = gr.Dropdown(choices=["normal", "fast"], value="normal", label="Inference Mode") cfg_input = gr.Slider(3, 10, step=1, label="CFG value") audio_length = gr.Textbox(value="30", label="Duration in seconds") gr.Examples(examples=audio_examples, inputs=[text_inputs, audio_inputs, task_inputs], examples_per_page=5) audio_output = gr.Audio(label="Audio Output") fn_button.click(model_inference, inputs=[text_inputs, audio_inputs, task_inputs], outputs=audio_output) # with gr.Accordion("More examples"): # gr.HTML(centered_table_html) demo.launch() if __name__ == "__main__": # iface.launch() launch()