chong.zhang committed on
Commit 2c50d95 · 1 Parent(s): b6363bb
inspiremusic/.DS_Store DELETED
Binary file (8.2 kB)
 
inspiremusic/bin/inference.py CHANGED
@@ -28,7 +28,6 @@ from inspiremusic.cli.model import InspireMusicModel
28
  from inspiremusic.dataset.dataset import Dataset
29
  import time
30
  from inspiremusic.utils.audio_utils import trim_audio, fade_out, process_audio
31
- from inspiremusic.utils.common import MUSIC_STRUCTURE_LABELS
32
 
33
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
34
 
@@ -42,6 +41,7 @@ def get_args():
42
  parser.add_argument('--wavtokenizer', required=True, help='wavtokenizer model file')
43
  parser.add_argument('--chorus', default="random",required=False, help='chorus tag generation mode, eg. random, verse, chorus, intro.')
44
  parser.add_argument('--fast', action='store_true', required=False, help='True: fast inference mode, without flow matching for fast inference. False: normal inference mode, with flow matching for high quality.')
 
45
  parser.add_argument('--fp16', default=True, type=bool, required=False, help='inference with fp16 model')
46
  parser.add_argument('--fade_out', default=True, type=bool, required=False, help='add fade out effect to generated audio')
47
  parser.add_argument('--fade_out_duration', default=1.0, type=float, required=False, help='fade out duration in seconds')
@@ -53,7 +53,7 @@ def get_args():
53
  help='sampling rate of input audio')
54
  parser.add_argument('--output_sample_rate', type=int, default=48000, required=False, choices=[24000, 48000],
55
  help='sampling rate of generated output audio')
56
- parser.add_argument('--min_generate_audio_seconds', type=float, default=10.0, required=False,
57
  help='the minimum generated audio length in seconds')
58
  parser.add_argument('--max_generate_audio_seconds', type=float, default=30.0, required=False,
59
  help='the maximum generated audio length in seconds')
@@ -70,9 +70,9 @@ def get_args():
70
  print(args)
71
  return args
72
 
73
-
74
  def main():
75
  args = get_args()
 
76
  logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(levelname)s %(message)s')
77
  os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)
78
 
@@ -85,11 +85,20 @@ def main():
85
 
86
  # Init inspiremusic models from configs
87
  use_cuda = args.gpu >= 0 and torch.cuda.is_available()
88
- device = torch.device('cuda' if use_cuda else 'cpu')
89
  with open(args.config, 'r') as f:
90
  configs = load_hyperpyyaml(f)
91
 
92
- model = InspireMusicModel(configs['llm'], configs['flow'], configs['hift'], configs['wavtokenizer'], args.fast, args.fp16)
93
 
94
  model.load(args.llm_model, args.flow_model, args.music_tokenizer, args.wavtokenizer)
95
 
@@ -153,7 +162,7 @@ def main():
153
  time_end = batch["time_end"].to(device)
154
  chorus = batch["chorus"].to(torch.int)
155
 
156
- text_prompt = f"<|{batch['time_start'].numpy()[0]}|><|{MUSIC_STRUCTURE_LABELS[chorus.numpy()[0]]}|><|{batch['text'][0]}|><|{batch['time_end'].numpy()[0]}|>"
157
  chorus = chorus.to(device)
158
 
159
  if batch["acoustic_token"] is None:
 
28
  from inspiremusic.dataset.dataset import Dataset
29
  import time
30
  from inspiremusic.utils.audio_utils import trim_audio, fade_out, process_audio
 
31
 
32
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
33
 
 
41
  parser.add_argument('--wavtokenizer', required=True, help='wavtokenizer model file')
42
  parser.add_argument('--chorus', default="random",required=False, help='chorus tag generation mode, eg. random, verse, chorus, intro.')
43
  parser.add_argument('--fast', action='store_true', required=False, help='True: fast inference mode, without flow matching for fast inference. False: normal inference mode, with flow matching for high quality.')
44
+ parser.add_argument('--dtype', type=str, default="fp16", required=False, choices=["fp16", "bf16", "fp32"], help='data type')
45
  parser.add_argument('--fp16', default=True, type=bool, required=False, help='inference with fp16 model')
46
  parser.add_argument('--fade_out', default=True, type=bool, required=False, help='add fade out effect to generated audio')
47
  parser.add_argument('--fade_out_duration', default=1.0, type=float, required=False, help='fade out duration in seconds')
 
53
  help='sampling rate of input audio')
54
  parser.add_argument('--output_sample_rate', type=int, default=48000, required=False, choices=[24000, 48000],
55
  help='sampling rate of generated output audio')
56
+ parser.add_argument('--min_generate_audio_seconds', type=float, default=0.0, required=False,
57
  help='the minimum generated audio length in seconds')
58
  parser.add_argument('--max_generate_audio_seconds', type=float, default=30.0, required=False,
59
  help='the maximum generated audio length in seconds')
 
70
  print(args)
71
  return args
72
 
 
73
  def main():
74
  args = get_args()
75
+ chorus_labels = ["intro", "verse1", "chorus", "verse2", "outro"]
76
  logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(levelname)s %(message)s')
77
  os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)
78
 
 
85
 
86
  # Init inspiremusic models from configs
87
  use_cuda = args.gpu >= 0 and torch.cuda.is_available()
88
+ if args.gpu >=0:
89
+ if torch.cuda.is_available():
90
+ device = torch.device('cuda')
91
+ elif torch.backends.mps.is_available():
92
+ device = torch.device('mps')
93
+ elif torch.xpu.is_available():
94
+ device = torch.device('xpu')
95
+ else:
96
+ device = torch.device('cpu')
97
+
98
  with open(args.config, 'r') as f:
99
  configs = load_hyperpyyaml(f)
100
 
101
+ model = InspireMusicModel(configs['llm'], configs['flow'], configs['hift'], configs['wavtokenizer'], args.dtype, args.fast, args.fp16)
102
 
103
  model.load(args.llm_model, args.flow_model, args.music_tokenizer, args.wavtokenizer)
104
 
 
162
  time_end = batch["time_end"].to(device)
163
  chorus = batch["chorus"].to(torch.int)
164
 
165
+ text_prompt = f"<|{batch['time_start'].numpy()[0]}|><|{chorus_labels[chorus.numpy()[0]]}|><|{batch['text'][0]}|><|{batch['time_end'].numpy()[0]}|>"
166
  chorus = chorus.to(device)
167
 
168
  if batch["acoustic_token"] is None:
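The hunks above swap the fixed cuda/cpu choice for a probe over several accelerator backends (CUDA, Apple MPS, Intel XPU). A minimal standalone sketch of that selection logic follows; the hasattr guard for PyTorch builds that ship without torch.xpu and the explicit CPU fallback are my additions, not part of the diff.

import torch

def select_device(gpu: int = 0) -> torch.device:
    """Pick an accelerator when one is requested and available, else CPU."""
    if gpu >= 0:
        if torch.cuda.is_available():
            return torch.device("cuda")
        if torch.backends.mps.is_available():
            return torch.device("mps")
        # torch.xpu only exists on builds compiled with Intel XPU support
        if hasattr(torch, "xpu") and torch.xpu.is_available():
            return torch.device("xpu")
    return torch.device("cpu")

Depending on how the final else in the new block is indented, one branch can leave device unassigned (for example --gpu -1 with no fallback); returning an explicit CPU default closes that gap.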
inspiremusic/cli/frontend.py CHANGED
@@ -29,6 +29,7 @@ class InspireMusicFrontEnd:
29
  music_tokenizer_dir: str,
30
  audio_tokenizer_dir: str,
31
  instruct: bool = False,
 
32
  fast: bool = False,
33
  fp16: bool = True,
34
  allowed_special: str = 'all'):
@@ -39,7 +40,7 @@ class InspireMusicFrontEnd:
39
  self.bandwidth_id = torch.tensor([0]).to(self.device)
40
  self.wavtokenizer = WavTokenizer.from_pretrained_feat(f"{audio_tokenizer_dir}/config.yaml", f"{audio_tokenizer_dir}/model.pt").to(self.device)
41
 
42
- self.model = InspireMusicModel(configs['llm'], configs['flow'], configs['hift'], configs['wavtokenizer'], fast, fp16)
43
  self.model = self.model.load(llm_model, flow_model, music_tokenizer_dir, audio_tokenizer_dir)
44
 
45
  self.instruct = instruct
@@ -69,12 +70,10 @@ class InspireMusicFrontEnd:
69
  text = text.replace(" - ", ",")
70
  text = remove_bracket(text)
71
  text = re.sub(r'[,,]+$', '。', text)
72
- texts = list(split_paragraph(text, partial(self.tokenizer.encode, allowed_special=self.allowed_special), "zh", token_max_n=80,
73
- token_min_n=60, merge_len=20, comma_split=False))
74
  else:
75
  text = spell_out_number(text, self.inflect_parser)
76
- texts = list(split_paragraph(text, partial(self.tokenizer.encode, allowed_special=self.allowed_special), "en", token_max_n=80,
77
- token_min_n=60, merge_len=20, comma_split=False))
78
  if split is False:
79
  return text
80
  return texts
 
29
  music_tokenizer_dir: str,
30
  audio_tokenizer_dir: str,
31
  instruct: bool = False,
32
+ dtype: str = "fp16",
33
  fast: bool = False,
34
  fp16: bool = True,
35
  allowed_special: str = 'all'):
 
40
  self.bandwidth_id = torch.tensor([0]).to(self.device)
41
  self.wavtokenizer = WavTokenizer.from_pretrained_feat(f"{audio_tokenizer_dir}/config.yaml", f"{audio_tokenizer_dir}/model.pt").to(self.device)
42
 
43
+ self.model = InspireMusicModel(configs['llm'], configs['flow'], configs['hift'], configs['wavtokenizer'], dtype, fast, fp16)
44
  self.model = self.model.load(llm_model, flow_model, music_tokenizer_dir, audio_tokenizer_dir)
45
 
46
  self.instruct = instruct
 
70
  text = text.replace(" - ", ",")
71
  text = remove_bracket(text)
72
  text = re.sub(r'[,,]+$', '。', text)
73
+ texts = list(split_paragraph(text, partial(self.tokenizer.encode, allowed_special=self.allowed_special), "zh", token_max_n=80, token_min_n=60, merge_len=20, comma_split=False))
 
74
  else:
75
  text = spell_out_number(text, self.inflect_parser)
76
+ texts = list(split_paragraph(text, partial(self.tokenizer.encode, allowed_special=self.allowed_special), "en", token_max_n=80, token_min_n=60, merge_len=20, comma_split=False))
 
77
  if split is False:
78
  return text
79
  return texts
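The frontend now accepts a dtype string ("fp16", "bf16", "fp32") and passes it straight through to InspireMusicModel. The mapping from that string to a torch.dtype is repeated as an if/elif chain in several files below; a small helper capturing the same fallback behaviour (the function name is mine) would look like this:

import torch

_DTYPE_MAP = {"fp16": torch.float16, "bf16": torch.bfloat16}

def resolve_dtype(name: str) -> torch.dtype:
    # Anything other than fp16/bf16 falls back to full precision,
    # matching the else branches in cli/model.py, llm/llm.py and qwen_encoder.py.
    return _DTYPE_MAP.get(name, torch.float32)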
inspiremusic/cli/inference.py CHANGED
@@ -23,53 +23,60 @@ from inspiremusic.utils.file_utils import logging
23
  import torch
24
  from inspiremusic.utils.audio_utils import trim_audio, fade_out, process_audio
25
 
26
- def set_env_variables():
27
  os.environ['PYTHONIOENCODING'] = 'UTF-8'
28
  os.environ['TOKENIZERS_PARALLELISM'] = 'False'
29
- main_root = os.getcwd()
 
30
  bin_dir = os.path.join(main_root, 'inspiremusic')
31
  third_party_matcha_tts_path = os.path.join(main_root, 'third_party', 'Matcha-TTS')
32
  python_path = f"{main_root}:{bin_dir}:{third_party_matcha_tts_path}:{os.environ.get('PYTHONPATH', '')}"
33
- os.environ['PATH'] = python_path
34
  sys.path.extend([main_root, third_party_matcha_tts_path])
35
 
36
- class InspireMusicUnified:
37
  def __init__(self,
38
- model_name: str = "InspireMusic-1.5B-Long",
39
  model_dir: str = None,
40
- min_generate_audio_seconds: float = 10.0,
41
  max_generate_audio_seconds: float = 30.0,
42
  sample_rate: int = 24000,
43
  output_sample_rate: int = 48000,
44
  load_jit: bool = True,
45
  load_onnx: bool = False,
 
46
  fast: bool = False,
47
  fp16: bool = True,
48
- gpu: int = 0,
49
  result_dir: str = None,
50
- hub="modelscope"):
 
 
51
  os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu)
52
 
53
  # Set model_dir or default to downloading if it doesn't exist
54
  if model_dir is None:
55
- model_dir = f"pretrained_models/{model_name}"
56
- else:
57
- model_dir = model_dir.replace("../../", "./")
 
58
 
59
- if not os.path.isfile(f"{model_dir}/llm.pt"):
60
  if hub == "modelscope":
61
  from modelscope import snapshot_download
62
  if model_name == "InspireMusic-Base":
63
  snapshot_download(f"iic/InspireMusic", local_dir=model_dir)
64
  else:
65
  snapshot_download(f"iic/{model_name}", local_dir=model_dir)
 
 
 
66
 
67
  self.model_dir = model_dir
68
- print(self.model_dir)
69
 
70
  self.sample_rate = sample_rate
71
  self.output_sample_rate = 24000 if fast else output_sample_rate
72
- self.result_dir = result_dir or f"exp/{model_name}"
73
  os.makedirs(self.result_dir, exist_ok=True)
74
 
75
  self.min_generate_audio_seconds = min_generate_audio_seconds
@@ -79,9 +86,17 @@ class InspireMusicUnified:
79
  assert self.min_generate_audio_seconds <= self.max_generate_audio_seconds, "Min audio seconds must be less than or equal to max audio seconds"
80
 
81
  use_cuda = gpu >= 0 and torch.cuda.is_available()
82
- self.device = torch.device('cuda' if use_cuda else 'cpu')
83
- self.model = InspireMusic(self.model_dir, load_jit=load_jit, load_onnx=load_onnx, fast=fast, fp16=fp16)
84
- self.model.model.llm = self.model.model.llm.to(torch.float16)
85
 
86
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
87
 
@@ -90,6 +105,7 @@ class InspireMusicUnified:
90
  task: str = 'text-to-music',
91
  text: str = None,
92
  audio_prompt: str = None, # audio prompt file path
 
93
  chorus: str = "verse",
94
  time_start: float = 0.0,
95
  time_end: float = 30.0,
@@ -205,84 +221,61 @@ class InspireMusicUnified:
205
 
206
  def get_args():
207
  parser = argparse.ArgumentParser(description='Run inference with your model')
208
- parser.add_argument('-m', '--model_name', default="InspireMusic-1.5B-Long",
209
- help='Model name')
210
 
211
- parser.add_argument('-d', '--model_dir',
212
- help='Model folder path')
213
 
214
- parser.add_argument('-t', '--text', default="Experience soothing and sensual instrumental jazz with a touch of Bossa Nova, perfect for a relaxing restaurant or spa ambiance.",
215
- help='Prompt text')
216
 
217
- parser.add_argument('-a', '--audio_prompt', default=None,
218
- help='Prompt audio')
219
 
220
- parser.add_argument('-c', '--chorus', default="intro",
221
- help='Chorus tag generation mode (e.g., random, verse, chorus, intro, outro)')
222
 
223
- parser.add_argument('-f', '--fast', type=bool, default=False,
224
- help='Enable fast inference mode (without flow matching)')
225
 
226
- parser.add_argument('-g', '--gpu', type=int, default=0,
227
- help='GPU ID for this rank, -1 for CPU')
228
 
229
- parser.add_argument('--task', default='text-to-music', choices=['text-to-music', 'continuation', 'reconstruct', 'super_resolution'],
230
- help='Inference task type: text-to-music, continuation, reconstruct, super_resolution')
231
 
232
- parser.add_argument('-r', '--result_dir', default="exp/inspiremusic",
233
- help='Directory to save generated audio')
234
 
235
- parser.add_argument('-o', '--output_fn', default="output_audio",
236
- help='Output file name')
237
 
238
- parser.add_argument('--format', type=str, default="wav", choices=["wav", "mp3", "m4a", "flac"],
239
- help='Format of output audio')
240
 
241
- parser.add_argument('--sample_rate', type=int, default=24000,
242
- help='Sampling rate of input audio')
243
 
244
- parser.add_argument('--output_sample_rate', type=int, default=48000, choices=[24000, 48000],
245
- help='Sampling rate of generated output audio')
246
 
247
- parser.add_argument('-s', '--time_start', type=float, default=0.0,
248
- help='Start time in seconds')
249
 
250
- parser.add_argument('-e', '--time_end', type=float, default=30.0,
251
- help='End time in seconds')
252
 
253
- parser.add_argument('--max_audio_prompt_length', type=float, default=5.0,
254
- help='Maximum audio prompt length in seconds')
255
 
256
- parser.add_argument('--min_generate_audio_seconds', type=float, default=10.0,
257
- help='Minimum generated audio length in seconds')
258
 
259
- parser.add_argument('--max_generate_audio_seconds', type=float, default=300.0,
260
- help='Maximum generated audio length in seconds')
261
 
262
- parser.add_argument('--fp16', type=bool, default=True,
263
- help='Inference with fp16 model')
264
 
265
- parser.add_argument('--fade_out', type=bool, default=True,
266
- help='Apply fade out effect to generated audio')
267
 
268
- parser.add_argument('--fade_out_duration', type=float, default=1.0,
269
- help='Fade out duration in seconds')
270
 
271
- parser.add_argument('--trim', type=bool, default=False,
272
- help='Trim the silence ending of generated audio')
273
 
274
  args = parser.parse_args()
275
 
276
  if not args.model_dir:
277
- args.model_dir = os.path.join("pretrained_models", args.model_name)
278
 
279
  print(args)
280
  return args
281
-
282
  def main():
283
- set_env_variables()
284
  args = get_args()
285
- model = InspireMusicUnified(model_name = args.model_name,
286
  model_dir = args.model_dir,
287
  min_generate_audio_seconds = args.min_generate_audio_seconds,
288
  max_generate_audio_seconds = args.max_generate_audio_seconds,
@@ -290,6 +283,7 @@ def main():
290
  output_sample_rate = args.output_sample_rate,
291
  load_jit = True,
292
  load_onnx = False,
 
293
  fast = args.fast,
294
  fp16 = args.fp16,
295
  gpu = args.gpu,
 
23
  import torch
24
  from inspiremusic.utils.audio_utils import trim_audio, fade_out, process_audio
25
 
26
+ def env_variables():
27
  os.environ['PYTHONIOENCODING'] = 'UTF-8'
28
  os.environ['TOKENIZERS_PARALLELISM'] = 'False'
29
+ current_working_dir = os.getcwd()
30
+ main_root = os.path.realpath(os.path.join(current_working_dir, '../../'))
31
  bin_dir = os.path.join(main_root, 'inspiremusic')
32
  third_party_matcha_tts_path = os.path.join(main_root, 'third_party', 'Matcha-TTS')
33
  python_path = f"{main_root}:{bin_dir}:{third_party_matcha_tts_path}:{os.environ.get('PYTHONPATH', '')}"
34
+ os.environ['PYTHONPATH'] = python_path
35
  sys.path.extend([main_root, third_party_matcha_tts_path])
36
 
37
+ class InspireMusicModel:
38
  def __init__(self,
39
+ model_name: str,
40
  model_dir: str = None,
41
+ min_generate_audio_seconds: float = 0.0,
42
  max_generate_audio_seconds: float = 30.0,
43
  sample_rate: int = 24000,
44
  output_sample_rate: int = 48000,
45
  load_jit: bool = True,
46
  load_onnx: bool = False,
47
+ dtype: str = "fp16",
48
  fast: bool = False,
49
  fp16: bool = True,
50
+ gpu: int = 1,
51
  result_dir: str = None,
52
+ hub="modelscope",
53
+ repo_url=None,
54
+ token=None):
55
  os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu)
56
 
57
  # Set model_dir or default to downloading if it doesn't exist
58
  if model_dir is None:
59
+ if sys.platform == "win32":
60
+ model_dir = f"..\..\pretrained_models\{model_name}"
61
+ else:
62
+ model_dir = f"../../pretrained_models/{model_name}"
63
 
64
+ if not os.path.isfile(os.path.join(model_dir, "llm.pt")):
65
  if hub == "modelscope":
66
  from modelscope import snapshot_download
67
  if model_name == "InspireMusic-Base":
68
  snapshot_download(f"iic/InspireMusic", local_dir=model_dir)
69
  else:
70
  snapshot_download(f"iic/{model_name}", local_dir=model_dir)
71
+ elif hub == "huggingface":
72
+ from huggingface_hub import snapshot_download
73
+ snapshot_download(repo_id=f"FunAudioLLM/{model_name}", local_dir=model_dir)
74
 
75
  self.model_dir = model_dir
 
76
 
77
  self.sample_rate = sample_rate
78
  self.output_sample_rate = 24000 if fast else output_sample_rate
79
+ self.result_dir = result_dir or os.path.join("exp", model_name)
80
  os.makedirs(self.result_dir, exist_ok=True)
81
 
82
  self.min_generate_audio_seconds = min_generate_audio_seconds
 
86
  assert self.min_generate_audio_seconds <= self.max_generate_audio_seconds, "Min audio seconds must be less than or equal to max audio seconds"
87
 
88
  use_cuda = gpu >= 0 and torch.cuda.is_available()
89
+ if gpu >=0:
90
+ if torch.cuda.is_available():
91
+ self.device = torch.device('cuda')
92
+ elif torch.backends.mps.is_available():
93
+ self.device = torch.device('mps')
94
+ elif torch.xpu.is_available():
95
+ self.device = torch.device('xpu')
96
+ else:
97
+ self.device = torch.device('cpu')
98
+
99
+ self.model = InspireMusic(self.model_dir, load_jit=load_jit, load_onnx=load_onnx, dtype=dtype, fast=fast, fp16=fp16)
100
 
101
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
102
 
 
105
  task: str = 'text-to-music',
106
  text: str = None,
107
  audio_prompt: str = None, # audio prompt file path
108
+ instruct: str = None,
109
  chorus: str = "verse",
110
  time_start: float = 0.0,
111
  time_end: float = 30.0,
 
221
 
222
  def get_args():
223
  parser = argparse.ArgumentParser(description='Run inference with your model')
224
+ parser.add_argument('-m', '--model_name', default="InspireMusic-1.5B-Long", help='Model name')
 
225
 
226
+ parser.add_argument('-d', '--model_dir', help='Model folder path')
 
227
 
228
+ parser.add_argument('-t', '--text', default="Experience soothing and sensual instrumental jazz with a touch of Bossa Nova, perfect for a relaxing restaurant or spa ambiance.", help='Prompt text')
 
229
 
230
+ parser.add_argument('-a', '--audio_prompt', default=None, help='Prompt audio')
 
231
 
232
+ parser.add_argument('-c', '--chorus', default="intro", help='Chorus tag generation mode (e.g., random, verse, chorus, intro, outro)')
 
233
 
234
+ parser.add_argument('-f', '--fast', type=bool, default=False, help='Enable fast inference mode (without flow matching)')
 
235
 
236
+ parser.add_argument('-g', '--gpu', type=int, default=1, help='GPU ID for this rank, -1 for CPU')
 
237
 
238
+ parser.add_argument('--task', default='text-to-music', choices=['text-to-music', 'continuation', 'reconstruct', 'super_resolution'], help='Inference task type: text-to-music, continuation, reconstruct, super_resolution')
 
239
 
240
+ parser.add_argument('-r', '--result_dir', default="exp/inspiremusic", help='Directory to save generated audio')
 
241
 
242
+ parser.add_argument('-o', '--output_fn', default="output_audio", help='Output file name')
 
243
 
244
+ parser.add_argument('--format', type=str, default="wav", choices=["wav", "mp3", "m4a", "flac"], help='Format of output audio')
 
245
 
246
+ parser.add_argument('--sample_rate', type=int, default=24000, help='Sampling rate of input audio')
 
247
 
248
+ parser.add_argument('--output_sample_rate', type=int, default=48000, choices=[24000, 48000], help='Sampling rate of generated output audio')
 
249
 
250
+ parser.add_argument('-s', '--time_start', type=float, default=0.0, help='Start time in seconds')
 
251
 
252
+ parser.add_argument('-e', '--time_end', type=float, default=30.0, help='End time in seconds')
 
253
 
254
+ parser.add_argument('--max_audio_prompt_length', type=float, default=5.0, help='Maximum audio prompt length in seconds')
 
255
 
256
+ parser.add_argument('--min_generate_audio_seconds', type=float, default=10.0, help='Minimum generated audio length in seconds')
 
257
 
258
+ parser.add_argument('--max_generate_audio_seconds', type=float, default=30.0, help='Maximum generated audio length in seconds')
 
259
 
260
+ parser.add_argument('--fp16', type=bool, default=True, help='Inference with fp16 model')
 
261
 
262
+ parser.add_argument('--fade_out', type=bool, default=True, help='Apply fade out effect to generated audio')
 
263
 
264
+ parser.add_argument('--fade_out_duration', type=float, default=1.0, help='Fade out duration in seconds')
 
265
 
266
+ parser.add_argument('--trim', type=bool, default=False, help='Trim the silence ending of generated audio')
 
267
 
268
  args = parser.parse_args()
269
 
270
  if not args.model_dir:
271
+ args.model_dir = os.path.join("../../pretrained_models", args.model_name)
272
 
273
  print(args)
274
  return args
 
275
  def main():
276
+ env_variables()
277
  args = get_args()
278
+ model = InspireMusicModel(model_name = args.model_name,
279
  model_dir = args.model_dir,
280
  min_generate_audio_seconds = args.min_generate_audio_seconds,
281
  max_generate_audio_seconds = args.max_generate_audio_seconds,
 
283
  output_sample_rate = args.output_sample_rate,
284
  load_jit = True,
285
  load_onnx = False,
286
+ dtype="fp16",
287
  fast = args.fast,
288
  fp16 = args.fp16,
289
  gpu = args.gpu,
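When no model_dir is given, the new constructor hard-codes the separators for each platform ("..\..\pretrained_models\{model_name}" on Windows, "../../pretrained_models/{model_name}" elsewhere). As a sketch of an alternative, not what the diff does, os.path can build the same relative path portably and also avoids the unescaped backslashes in the Windows f-string:

import os

def default_model_dir(model_name: str) -> str:
    # Resolve ../../pretrained_models/<model_name> relative to the current
    # working directory, letting os.path pick the platform's separator.
    root = os.path.realpath(os.path.join(os.getcwd(), "..", ".."))
    return os.path.join(root, "pretrained_models", model_name)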
inspiremusic/cli/inspiremusic.py CHANGED
@@ -12,32 +12,41 @@
12
  # See the License for the specific language governing permissions and
13
  # limitations under the License.
14
  import os
 
15
  import time
16
  from tqdm import tqdm
17
  from hyperpyyaml import load_hyperpyyaml
18
  from inspiremusic.cli.frontend import InspireMusicFrontEnd
19
  from inspiremusic.cli.model import InspireMusicModel
20
  from inspiremusic.utils.file_utils import logging
 
21
  import torch
22
 
23
  class InspireMusic:
24
- def __init__(self, model_dir, load_jit=True, load_onnx=False, fast = False, fp16=True, hub="modelscope"):
25
  instruct = True if '-Instruct' in model_dir else False
26
 
27
  if model_dir is None:
28
- model_dir = f"pretrained_models/InspireMusic-1.5B-Long"
 
 
 
29
 
30
- if not os.path.isfile(f"{model_dir}/llm.pt"):
31
  model_name = model_dir.split("/")[-1]
32
  if hub == "modelscope":
33
  from modelscope import snapshot_download
34
  if model_name == "InspireMusic-Base":
35
  snapshot_download(f"iic/InspireMusic", local_dir=model_dir)
36
  else:
37
- snapshot_download(f"iic/{model_name}", local_dir=model_dir)
38
 
39
- assert os.path.exists(f'{model_dir}/inspiremusic.yaml')
40
- with open('{}/inspiremusic.yaml'.format(model_dir), 'r') as f:
41
  configs = load_hyperpyyaml(f)
42
 
43
  self.frontend = InspireMusicFrontEnd(configs,
@@ -47,15 +56,17 @@ class InspireMusic:
47
  '{}/music_tokenizer/'.format(model_dir),
48
  '{}/wavtokenizer/'.format(model_dir),
49
  instruct,
 
50
  fast,
51
  fp16,
52
  configs['allowed_special'])
53
 
54
- self.model = InspireMusicModel(configs['llm'], configs['flow'], configs['hift'], configs['wavtokenizer'], fast, fp16)
55
- self.model.load('{}/llm.pt'.format(model_dir),
56
- '{}/flow.pt'.format(model_dir),
57
- '{}/music_tokenizer/'.format(model_dir),
58
- '{}/wavtokenizer/model.pt'.format(model_dir))
 
59
  del configs
60
 
61
  @torch.inference_mode()
 
12
  # See the License for the specific language governing permissions and
13
  # limitations under the License.
14
  import os
15
+ import sys
16
  import time
17
  from tqdm import tqdm
18
  from hyperpyyaml import load_hyperpyyaml
19
  from inspiremusic.cli.frontend import InspireMusicFrontEnd
20
  from inspiremusic.cli.model import InspireMusicModel
21
  from inspiremusic.utils.file_utils import logging
22
+ from inspiremusic.utils.utils import download_model
23
  import torch
24
 
25
  class InspireMusic:
26
+ def __init__(self, model_dir, load_jit=True, load_onnx=False, dtype = "fp16", fast = False, fp16=True, hub="modelscope", repo_url=None, token=None):
27
  instruct = True if '-Instruct' in model_dir else False
28
 
29
  if model_dir is None:
30
+ if sys.platform == "win32":
31
+ model_dir = f"..\..\pretrained_models\{model_name}"
32
+ else:
33
+ model_dir = f"../../pretrained_models/{model_name}"
34
 
35
+ if not os.path.isfile(os.path.join(model_dir, "llm.pt")):
36
  model_name = model_dir.split("/")[-1]
37
  if hub == "modelscope":
38
  from modelscope import snapshot_download
39
  if model_name == "InspireMusic-Base":
40
  snapshot_download(f"iic/InspireMusic", local_dir=model_dir)
41
  else:
42
+ snapshot_download(f"iic/InspireMusic", local_dir=model_dir)
43
+ elif hub == "huggingface":
44
+ from huggingface_hub import snapshot_download
45
+ snapshot_download(repo_id=f"FunAudioLLM/{model_name}", local_dir=model_dir)
46
+ else:
47
+ download_model(repo_url, model_dir, token)
48
 
49
+ with open(os.path.join(model_dir, 'inspiremusic.yaml'), 'r') as f:
 
50
  configs = load_hyperpyyaml(f)
51
 
52
  self.frontend = InspireMusicFrontEnd(configs,
 
56
  '{}/music_tokenizer/'.format(model_dir),
57
  '{}/wavtokenizer/'.format(model_dir),
58
  instruct,
59
+ dtype,
60
  fast,
61
  fp16,
62
  configs['allowed_special'])
63
 
64
+ self.model = InspireMusicModel(configs['llm'], configs['flow'], configs['hift'], configs['wavtokenizer'], dtype, fast, fp16)
65
+ self.model.load(os.path.join(model_dir, 'llm.pt'),
66
+ os.path.join(model_dir, 'flow.pt'),
67
+ os.path.join(model_dir, 'music_tokenizer'),
68
+ os.path.join(model_dir, 'wavtokenizer', "model.pt"),
69
+ )
70
  del configs
71
 
72
  @torch.inference_mode()
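When llm.pt is missing, the rewritten constructor falls back through three download paths: ModelScope, Hugging Face, or a raw git clone via the new download_model helper. A condensed sketch of that dispatch, assuming the hub names and repo naming used elsewhere in this commit (cli/inference.py spells the ModelScope repo as iic/<model_name> for non-Base models):

def fetch_checkpoint(model_name: str, model_dir: str, hub: str = "modelscope",
                     repo_url: str = None, token: str = None) -> None:
    # Download model weights into model_dir from the selected hub.
    if hub == "modelscope":
        from modelscope import snapshot_download
        repo = "iic/InspireMusic" if model_name == "InspireMusic-Base" else f"iic/{model_name}"
        snapshot_download(repo, local_dir=model_dir)
    elif hub == "huggingface":
        from huggingface_hub import snapshot_download
        snapshot_download(repo_id=f"FunAudioLLM/{model_name}", local_dir=model_dir)
    else:
        from inspiremusic.utils.utils import download_model
        download_model(repo_url, model_dir, token)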
inspiremusic/cli/model.py CHANGED
@@ -11,6 +11,8 @@
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
  # See the License for the specific language governing permissions and
13
  # limitations under the License.
 
 
14
  import numpy as np
15
  import threading
16
  import time
@@ -21,23 +23,37 @@ from inspiremusic.wavtokenizer.decoder.pretrained import WavTokenizer
21
  from torch.cuda.amp import autocast
22
  import logging
23
  import torch
24
- import os
25
-
26
 
27
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
28
 
29
  class InspireMusicModel:
30
-
31
  def __init__(self,
32
  llm: torch.nn.Module,
33
  flow: torch.nn.Module,
34
  music_tokenizer: torch.nn.Module,
35
  wavtokenizer: torch.nn.Module,
 
36
  fast: bool = False,
37
  fp16: bool = True,
38
  ):
39
- self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
40
- self.llm = llm
41
  self.flow = flow
42
  self.music_tokenizer = music_tokenizer
43
  self.wavtokenizer = wavtokenizer
@@ -66,7 +82,7 @@ class InspireMusicModel:
66
  def load(self, llm_model, flow_model, hift_model, wavtokenizer_model):
67
  if llm_model is not None:
68
  self.llm.load_state_dict(torch.load(llm_model, map_location=self.device))
69
- self.llm.to(self.device).eval()
70
  else:
71
  self.llm = None
72
  if flow_model is not None:
@@ -74,19 +90,15 @@ class InspireMusicModel:
74
  self.flow.to(self.device).eval()
75
  if hift_model is not None:
76
  if ".pt" not in hift_model:
77
- self.music_tokenizer = VQVAE( hift_model + '/config.json',
78
- hift_model + '/model.pt', with_encoder=True)
79
  else:
80
- self.music_tokenizer = VQVAE(os.path.dirname(hift_model) + '/config.json',
81
- hift_model, with_encoder=True)
82
  self.music_tokenizer.to(self.device).eval()
83
  if wavtokenizer_model is not None:
84
  if ".pt" not in wavtokenizer_model:
85
- self.wavtokenizer = WavTokenizer.from_pretrained_feat( wavtokenizer_model + '/config.yaml',
86
- wavtokenizer_model + '/model.pt')
87
  else:
88
- self.wavtokenizer = WavTokenizer.from_pretrained_feat( os.path.dirname(wavtokenizer_model) + '/config.yaml',
89
- wavtokenizer_model )
90
  self.wavtokenizer.to(self.device)
91
 
92
  def load_jit(self, llm_text_encoder_model, llm_llm_model, flow_encoder_model):
@@ -110,7 +122,7 @@ class InspireMusicModel:
110
  def llm_job(self, text, audio_token, audio_token_len, prompt_text, llm_prompt_audio_token, embeddings, uuid, duration_to_gen, task):
111
  with self.llm_context:
112
  local_res = []
113
- with autocast(enabled=self.fp16):
114
  inference_kwargs = {
115
  'text': text.to(self.device),
116
  'text_len': torch.tensor([text.shape[1]], dtype=torch.int32).to(self.device),
 
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
  # See the License for the specific language governing permissions and
13
  # limitations under the License.
14
+ import os
15
+ import sys
16
  import numpy as np
17
  import threading
18
  import time
 
23
  from torch.cuda.amp import autocast
24
  import logging
25
  import torch
 
 
26
 
27
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
28
 
29
  class InspireMusicModel:
 
30
  def __init__(self,
31
  llm: torch.nn.Module,
32
  flow: torch.nn.Module,
33
  music_tokenizer: torch.nn.Module,
34
  wavtokenizer: torch.nn.Module,
35
+ dtype: str = "fp16",
36
  fast: bool = False,
37
  fp16: bool = True,
38
  ):
39
+
40
+ if torch.cuda.is_available():
41
+ self.device = torch.device('cuda')
42
+ elif torch.backends.mps.is_available():
43
+ self.device = torch.device('mps')
44
+ elif torch.xpu.is_available():
45
+ self.device = torch.device('xpu')
46
+ else:
47
+ self.device = torch.device('cpu')
48
+
49
+ if dtype == "fp16":
50
+ self.dtype = torch.float16
51
+ elif dtype == "bf16":
52
+ self.dtype = torch.bfloat16
53
+ else:
54
+ self.dtype = torch.float32
55
+
56
+ self.llm = llm.to(self.dtype)
57
  self.flow = flow
58
  self.music_tokenizer = music_tokenizer
59
  self.wavtokenizer = wavtokenizer
 
82
  def load(self, llm_model, flow_model, hift_model, wavtokenizer_model):
83
  if llm_model is not None:
84
  self.llm.load_state_dict(torch.load(llm_model, map_location=self.device))
85
+ self.llm.to(self.device).to(self.dtype).eval()
86
  else:
87
  self.llm = None
88
  if flow_model is not None:
 
90
  self.flow.to(self.device).eval()
91
  if hift_model is not None:
92
  if ".pt" not in hift_model:
93
+ self.music_tokenizer = VQVAE(os.path.join(hift_model, 'config.json'), os.path.join(hift_model, 'model.pt'), with_encoder=True)
 
94
  else:
95
+ self.music_tokenizer = VQVAE(os.path.join(os.path.dirname(hift_model), 'config.json'), hift_model, with_encoder=True)
 
96
  self.music_tokenizer.to(self.device).eval()
97
  if wavtokenizer_model is not None:
98
  if ".pt" not in wavtokenizer_model:
99
+ self.wavtokenizer = WavTokenizer.from_pretrained_feat(os.path.join(wavtokenizer_model, 'config.yaml'), os.path.join(wavtokenizer_model, 'model.pt'))
 
100
  else:
101
+ self.wavtokenizer = WavTokenizer.from_pretrained_feat(os.path.join(os.path.dirname(wavtokenizer_model), 'config.yaml'), wavtokenizer_model)
 
102
  self.wavtokenizer.to(self.device)
103
 
104
  def load_jit(self, llm_text_encoder_model, llm_llm_model, flow_encoder_model):
 
122
  def llm_job(self, text, audio_token, audio_token_len, prompt_text, llm_prompt_audio_token, embeddings, uuid, duration_to_gen, task):
123
  with self.llm_context:
124
  local_res = []
125
+ with autocast(enabled=self.fp16, dtype=self.dtype, cache_enabled=True):
126
  inference_kwargs = {
127
  'text': text.to(self.device),
128
  'text_len': torch.tensor([text.shape[1]], dtype=torch.int32).to(self.device),
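InspireMusicModel now records a target dtype next to the device, casts the LLM weights to it at load time, and wraps llm_job in autocast(enabled=self.fp16, dtype=self.dtype, cache_enabled=True). The sketch below shows the same pattern with the device-agnostic torch.autocast API instead of the torch.cuda.amp.autocast used in the diff; autocast coverage for cpu/mps/xpu and for fp16 on those backends depends on the installed PyTorch version.

import torch

def mixed_precision_forward(model: torch.nn.Module, x: torch.Tensor,
                            device: torch.device, dtype: torch.dtype = torch.float16):
    # Cast weights once, then run the forward pass under autocast so that
    # matmul-heavy ops execute in reduced precision.
    model = model.to(device).to(dtype).eval()
    with torch.inference_mode():
        with torch.autocast(device_type=device.type, dtype=dtype,
                            enabled=dtype != torch.float32):
            return model(x.to(device))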
inspiremusic/flow/flow.py CHANGED
@@ -39,7 +39,7 @@ class MaskedDiff(torch.nn.Module):
39
  'n_blocks': 4, 'num_mid_blocks': 12, 'num_heads': 8, 'act_fn': 'gelu'}},
40
  mel_feat_conf: Dict = {'n_fft': 1024, 'num_mels': 128, 'sampling_rate': 48000,
41
  'hop_size': 256, 'win_size': 1024, 'fmin': 0, 'fmax': 48000},
42
- generator_model_dir: str = "pretrained_models/InspireMusic-Base/music_tokenizer",
43
  num_codebooks: int = 4
44
  ):
45
  super().__init__()
 
39
  'n_blocks': 4, 'num_mid_blocks': 12, 'num_heads': 8, 'act_fn': 'gelu'}},
40
  mel_feat_conf: Dict = {'n_fft': 1024, 'num_mels': 128, 'sampling_rate': 48000,
41
  'hop_size': 256, 'win_size': 1024, 'fmin': 0, 'fmax': 48000},
42
+ generator_model_dir: str = "../../pretrained_models/InspireMusic-Base/music_tokenizer",
43
  num_codebooks: int = 4
44
  ):
45
  super().__init__()
inspiremusic/llm/llm.py CHANGED
@@ -50,9 +50,19 @@ class LLM(torch.nn.Module):
50
  length_normalized_loss: bool = True,
51
  lsm_weight: float = 0.0,
52
  frozen_input_embed: bool = False,
 
 
53
  **kwargs,
54
  ):
55
  super().__init__()
56
  self.llm_input_size = llm_input_size
57
  self.audio_token_size = audio_token_size
58
  # 1. build text token inputs related modules
@@ -115,34 +125,9 @@ class LLM(torch.nn.Module):
115
 
116
  encoder_name = encoder_conf.pop("name", "transformer")
117
  model = None
118
- if encoder_name == "transformer":
119
- from inspiremusic.transformer.encoder.conformer_encoder import ConformerEncoder
120
- model = ConformerEncoder(
121
- **encoder_conf,
122
- input_size=self.input_size,
123
- use_cnn_module=False,
124
- macaron_style=False,
125
- )
126
- elif encoder_name == "conformer":
127
- from inspiremusic.transformer.encoder.conformer_encoder import ConformerEncoder
128
- model = ConformerEncoder(
129
- **encoder_conf,
130
- input_size=self.input_size,
131
- )
132
- elif encoder_name == "llama_encoder":
133
- from inspiremusic.transformer.encoder.llama_encoder import LlamaEncoder
134
- model = LlamaEncoder(
135
- **encoder_conf,
136
- input_size=self.input_size,
137
- )
138
- elif encoder_name == "qwen2":
139
- from inspiremusic.transformer.encoder.qwen_encoder import QwenEncoder
140
- model = QwenEncoder(
141
- **encoder_conf,
142
- input_size=self.input_size,
143
- )
144
- elif encoder_name == "qwen2.5":
145
- from inspiremusic.transformer.encoder.qwen_encoder import QwenEncoder
146
  model = QwenEncoder(
147
  **encoder_conf,
148
  input_size=self.input_size,
@@ -237,8 +222,7 @@ class LLM(torch.nn.Module):
237
  time_end_embed = self.time_embedding(time_end).to(text_token.dtype)
238
  chorus_embed = self.chorus_embedding(chorus)
239
 
240
- lm_target = [torch.tensor(
241
- [IGNORE_ID] * (4 + text_token_len[i]) + audio_token[i,:audio_token_len[i]].tolist() + [self.audio_token_size]) for i in range(text_token.size(0))]
242
 
243
  lm_target = pad_sequence(lm_target, batch_first=True, padding_value=IGNORE_ID).to(device)
244
 
@@ -250,18 +234,9 @@ class LLM(torch.nn.Module):
250
  audio_token = self.speech_embedding(audio_token)
251
 
252
  # 5. unpad and pad
253
- lm_input, lm_input_len = self.pad_unpad_sequence(sos_eos_emb,
254
- [time_start_embed,
255
- time_end_embed,
256
- chorus_embed],
257
- text_token,
258
- text_token_len,
259
- task_id_emb,
260
- audio_token,
261
- audio_token_len,
262
- seg_len)
263
  # 6. run lm forward
264
- lm_output, lm_output_mask = self.llm(lm_input, lm_input_len.to(device))
265
  logits = self.llm_decoder(lm_output)
266
  loss = self.criterion_ce(logits, lm_target)
267
 
@@ -290,7 +265,7 @@ class LLM(torch.nn.Module):
290
  prompt_audio_token: torch.Tensor,
291
  prompt_audio_token_len: torch.Tensor,
292
  embeddings: List,
293
- duration_to_gen: float = 300,
294
  task: str = "continuation",
295
  token_rate: int = 75,
296
  limit_audio_prompt_len: int = 5,
@@ -317,8 +292,7 @@ class LLM(torch.nn.Module):
317
  time_end_embed = self.time_embedding(time_end).reshape(1, 1, -1) # .half()
318
  chorus_embed = self.chorus_embedding(chorus).reshape(1, 1, -1) # .half()
319
  else:
320
- time_start_embed = self.time_embedding(
321
- time_start.view(-1)).reshape(1, chorus.size(1), -1) # .half()
322
  time_end_embed = self.time_embedding(time_end.view(-1)).reshape(1, chorus.size(1), -1) # .half()
323
  chorus_embed = self.chorus_embedding(chorus) # .half()
324
 
@@ -332,10 +306,10 @@ class LLM(torch.nn.Module):
332
  else:
333
  audio_token_emb = torch.zeros(1, 0, self.llm_input_size, dtype=text.dtype).to(device)
334
 
335
- if prompt_audio_token_len:
336
- prompt_audio_token_emb = self.speech_embedding(prompt_audio_token)
337
- else:
338
- prompt_audio_token_emb = torch.zeros(1, 0, self.llm_input_size, dtype=text.dtype).to(device)
339
  # Check if removing prompt audio token will fail decoding.
340
 
341
  if task == "continuation":
@@ -344,31 +318,18 @@ class LLM(torch.nn.Module):
344
  chorus_embed, text, task_id_emb, audio_token_emb], dim=1)
345
 
346
  if infer_cfg:
347
- audio_cfg = self.speech_embedding(
348
- audio_token.new_zeros(audio_token.shape))
349
- lm_cf_input = torch.concat(
350
- [sos_eos_emb, torch.rand_like(time_start_embed),
351
- torch.rand_like(time_end_embed),
352
- torch.rand_like(chorus_embed), text_cfg, task_id_emb,
353
- audio_cfg], dim=1)
354
  lm_input = torch.cat([lm_input, lm_cf_input], 0)
355
  else:
356
- lm_input = torch.concat(
357
- [sos_eos_emb, time_start_embed, time_end_embed,
358
- chorus_embed, text, task_id_emb], dim=1)
359
  if infer_cfg:
360
- lm_cf_input = torch.concat(
361
- [sos_eos_emb, torch.rand_like(time_start_embed),
362
- torch.rand_like(time_end_embed),
363
- torch.rand_like(chorus_embed), text_cfg, task_id_emb],
364
- dim=1)
365
  lm_input = torch.cat([lm_input, lm_cf_input], 0)
366
 
367
  # 4. cal min/max_length
368
- min_len = 0.9 * duration_to_gen * token_rate
369
  max_len = duration_to_gen * token_rate
370
- logging.info(
371
- f"LLM generation sequence length: {max_len}, generate audio length {duration_to_gen}s.")
372
 
373
  # 5. step by step decode
374
  out_tokens = []
@@ -376,7 +337,7 @@ class LLM(torch.nn.Module):
376
  state = None
377
 
378
  for i in range(int(max_len)):
379
- y_pred, _, state = self.llm.forward_one_step(lm_input, torch.ones(lm_input.shape[0], lm_input.shape[1], device=lm_input.device).to(torch.bool), cache=state)
380
  logits = self.llm_decoder(y_pred[:, -1])
381
  if infer_cfg:
382
  # perform context free guidance
@@ -389,10 +350,7 @@ class LLM(torch.nn.Module):
389
  logp = logp.squeeze(dim=0)
390
 
391
  if i < int(min_len):
392
- logp[self.audio_token_size] = torch.tensor(float('-inf'), dtype=torch.float16)
393
-
394
- if i < int(min_len):
395
- logp[self.audio_token_size] = torch.tensor(float('-inf'), dtype=torch.float16)
396
 
397
  top_ids = self.sampling_ids(logp, out_tokens, ignore_eos=i < min_len).item()
398
 
 
50
  length_normalized_loss: bool = True,
51
  lsm_weight: float = 0.0,
52
  frozen_input_embed: bool = False,
53
+ dtype: str = "fp16",
54
+ text_token_size: int = 151643,
55
  **kwargs,
56
  ):
57
  super().__init__()
58
+
59
+ if dtype == "fp16":
60
+ self.dtype = torch.float16
61
+ elif dtype == "bf16":
62
+ self.dtype = torch.bfloat16
63
+ else:
64
+ self.dtype = torch.float32
65
+
66
  self.llm_input_size = llm_input_size
67
  self.audio_token_size = audio_token_size
68
  # 1. build text token inputs related modules
 
125
 
126
  encoder_name = encoder_conf.pop("name", "transformer")
127
  model = None
128
+
129
+ if "qwen" in encoder_name:
130
+ from inspiremusic.transformer.qwen_encoder import QwenEncoder
131
  model = QwenEncoder(
132
  **encoder_conf,
133
  input_size=self.input_size,
 
222
  time_end_embed = self.time_embedding(time_end).to(text_token.dtype)
223
  chorus_embed = self.chorus_embedding(chorus)
224
 
225
+ lm_target = [torch.tensor([IGNORE_ID] * (4 + text_token_len[i]) + audio_token[i,:audio_token_len[i]].tolist() + [self.audio_token_size]) for i in range(text_token.size(0))]
 
226
 
227
  lm_target = pad_sequence(lm_target, batch_first=True, padding_value=IGNORE_ID).to(device)
228
 
 
234
  audio_token = self.speech_embedding(audio_token)
235
 
236
  # 5. unpad and pad
237
+ lm_input, lm_input_len = self.pad_unpad_sequence(sos_eos_emb, [time_start_embed, time_end_embed, chorus_embed], text_token, text_token_len, task_id_emb, audio_token, audio_token_len, seg_len)
238
  # 6. run lm forward
239
+ lm_output, lm_output_mask = self.llm(lm_input.to(self.dtype), lm_input_len.to(device))
240
  logits = self.llm_decoder(lm_output)
241
  loss = self.criterion_ce(logits, lm_target)
242
 
 
265
  prompt_audio_token: torch.Tensor,
266
  prompt_audio_token_len: torch.Tensor,
267
  embeddings: List,
268
+ duration_to_gen: float = 30,
269
  task: str = "continuation",
270
  token_rate: int = 75,
271
  limit_audio_prompt_len: int = 5,
 
292
  time_end_embed = self.time_embedding(time_end).reshape(1, 1, -1) # .half()
293
  chorus_embed = self.chorus_embedding(chorus).reshape(1, 1, -1) # .half()
294
  else:
295
+ time_start_embed = self.time_embedding(time_start.view(-1)).reshape(1, chorus.size(1), -1) # .half()
 
296
  time_end_embed = self.time_embedding(time_end.view(-1)).reshape(1, chorus.size(1), -1) # .half()
297
  chorus_embed = self.chorus_embedding(chorus) # .half()
298
 
 
306
  else:
307
  audio_token_emb = torch.zeros(1, 0, self.llm_input_size, dtype=text.dtype).to(device)
308
 
309
+ #if prompt_audio_token_len:
310
+ # prompt_audio_token_emb = self.speech_embedding(prompt_audio_token)
311
+ #else:
312
+ # prompt_audio_token_emb = torch.zeros(1, 0, self.llm_input_size, dtype=text.dtype).to(device)
313
  # Check if removing prompt audio token will fail decoding.
314
 
315
  if task == "continuation":
 
318
  chorus_embed, text, task_id_emb, audio_token_emb], dim=1)
319
 
320
  if infer_cfg:
321
+ audio_cfg = self.speech_embedding(audio_token.new_zeros(audio_token.shape))
322
+ lm_cf_input = torch.concat([sos_eos_emb, torch.rand_like(time_start_embed), torch.rand_like(time_end_embed), torch.rand_like(chorus_embed), text_cfg, task_id_emb, audio_cfg], dim=1)
323
  lm_input = torch.cat([lm_input, lm_cf_input], 0)
324
  else:
325
+ lm_input = torch.concat([sos_eos_emb, time_start_embed, time_end_embed, chorus_embed, text, task_id_emb], dim=1)
 
 
326
  if infer_cfg:
327
+ lm_cf_input = torch.concat([sos_eos_emb, torch.rand_like(time_start_embed), torch.rand_like(time_end_embed), torch.rand_like(chorus_embed), text_cfg, task_id_emb], dim=1)
 
 
 
 
328
  lm_input = torch.cat([lm_input, lm_cf_input], 0)
329
 
330
  # 4. cal min/max_length
331
+ min_len = int(0.9 * duration_to_gen * token_rate)
332
  max_len = duration_to_gen * token_rate
 
 
333
 
334
  # 5. step by step decode
335
  out_tokens = []
 
337
  state = None
338
 
339
  for i in range(int(max_len)):
340
+ y_pred, _, state = self.llm.forward_one_step(lm_input.to(self.dtype), torch.ones(lm_input.shape[0], lm_input.shape[1], device=lm_input.device).to(torch.bool), cache=state)
341
  logits = self.llm_decoder(y_pred[:, -1])
342
  if infer_cfg:
343
  # perform context free guidance
 
350
  logp = logp.squeeze(dim=0)
351
 
352
  if i < int(min_len):
353
+ logp[self.audio_token_size] = torch.tensor(float('-inf'), dtype=self.dtype)
 
 
 
354
 
355
  top_ids = self.sampling_ids(logp, out_tokens, ignore_eos=i < min_len).item()
356
 
inspiremusic/transformer/qwen_encoder.py CHANGED
@@ -22,6 +22,7 @@ class QwenEncoder(nn.Module):
22
  def __init__(
23
  self,
24
  input_size: int,
 
25
  pretrain_path: str = "Qwen/Qwen2.0-0.5B",
26
  trainable: bool = False,
27
  do_fusion_emb: bool = False,
@@ -30,7 +31,15 @@ class QwenEncoder(nn.Module):
30
  super(QwenEncoder, self).__init__()
31
  self.input_size = input_size
32
  self.trainable = trainable
33
- self.model = AutoModelForCausalLM.from_pretrained(pretrain_path, device_map="cpu")
34
  self._output_size = self.model.config.hidden_size
35
  self.do_fusion_emb = do_fusion_emb
36
  self.hidden_norm = torch.nn.LayerNorm(self._output_size)
@@ -88,14 +97,19 @@ class QwenEmbeddingEncoder(nn.Module):
88
  def __init__(
89
  self,
90
  input_size: int,
 
91
  pretrain_path: str = "Qwen/Qwen2.0-0.5B",
92
  ):
93
  super(QwenEmbeddingEncoder, self).__init__()
94
  self.input_size = input_size
95
  from transformers import Qwen2ForCausalLM
96
- # self.model = Qwen2ForCausalLM.from_pretrained(pretrain_path, device_map="cpu", attn_implementation="flash_attention_2")
97
- self.model = Qwen2ForCausalLM.from_pretrained(pretrain_path,
98
- device_map="cpu")
99
  self._output_size = self.model.config.hidden_size
100
 
101
  def output_size(self) -> int:
@@ -137,14 +151,19 @@ class QwenInputOnlyEncoder(nn.Module):
137
  def __init__(
138
  self,
139
  input_size: int,
 
140
  pretrain_path: str = "Qwen/Qwen2.0-0.5B",
141
  ):
142
  super(QwenInputOnlyEncoder, self).__init__()
143
144
  from transformers import Qwen2ForCausalLM
145
- # model = Qwen2ForCausalLM.from_pretrained(pretrain_path, device_map="cpu", attn_implementation="flash_attention_2")
146
- model = Qwen2ForCausalLM.from_pretrained(pretrain_path,
147
- device_map="cpu")
148
  self.embed = model.model.embed_tokens
149
  for p in self.embed.parameters():
150
  p.requires_grad = False
 
22
  def __init__(
23
  self,
24
  input_size: int,
25
+ dtype: str = "fp16",
26
  pretrain_path: str = "Qwen/Qwen2.0-0.5B",
27
  trainable: bool = False,
28
  do_fusion_emb: bool = False,
 
31
  super(QwenEncoder, self).__init__()
32
  self.input_size = input_size
33
  self.trainable = trainable
34
+
35
+ if dtype == "fp16":
36
+ self.dtype = torch.float16
37
+ elif dtype == "bf16":
38
+ self.dtype = torch.bfloat16
39
+ else:
40
+ self.dtype = torch.float32
41
+
42
+ self.model = AutoModelForCausalLM.from_pretrained(pretrain_path, device_map="auto", attn_implementation="flash_attention_2", torch_dtype=self.dtype)
43
  self._output_size = self.model.config.hidden_size
44
  self.do_fusion_emb = do_fusion_emb
45
  self.hidden_norm = torch.nn.LayerNorm(self._output_size)
 
97
  def __init__(
98
  self,
99
  input_size: int,
100
+ dtype: str = "fp16",
101
  pretrain_path: str = "Qwen/Qwen2.0-0.5B",
102
  ):
103
  super(QwenEmbeddingEncoder, self).__init__()
104
  self.input_size = input_size
105
+ if dtype == "fp16":
106
+ self.dtype = torch.float16
107
+ elif dtype == "bf16":
108
+ self.dtype = torch.bfloat16
109
+ else:
110
+ self.dtype = torch.float32
111
  from transformers import Qwen2ForCausalLM
112
+ self.model = Qwen2ForCausalLM.from_pretrained(pretrain_path, device_map="auto", attn_implementation="flash_attention_2", torch_dtype=self.dtype)
 
 
113
  self._output_size = self.model.config.hidden_size
114
 
115
  def output_size(self) -> int:
 
151
  def __init__(
152
  self,
153
  input_size: int,
154
+ dtype: str = "fp16",
155
  pretrain_path: str = "Qwen/Qwen2.0-0.5B",
156
  ):
157
  super(QwenInputOnlyEncoder, self).__init__()
158
  self.input_size = input_size
159
+ if dtype == "fp16":
160
+ self.dtype = torch.float16
161
+ elif dtype == "bf16":
162
+ self.dtype = torch.bfloat16
163
+ else:
164
+ self.dtype = torch.float32
165
  from transformers import Qwen2ForCausalLM
166
+ model = Qwen2ForCausalLM.from_pretrained(pretrain_path, device_map="auto", attn_implementation="flash_attention_2", torch_dtype=self.dtype)
 
 
167
  self.embed = model.model.embed_tokens
168
  for p in self.embed.parameters():
169
  p.requires_grad = False
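All three Qwen encoders now load their backbone with device_map="auto", an explicit torch_dtype, and attn_implementation="flash_attention_2". FlashAttention-2 is an optional dependency, so here is a hedged sketch with a fallback to the stock attention kernels; the try/except is my addition and assumes a transformers release with attn_implementation support (roughly 4.36+).

import torch
from transformers import AutoModelForCausalLM

def load_qwen_backbone(pretrain_path: str, dtype: torch.dtype = torch.float16):
    # Prefer FlashAttention-2 when flash-attn is installed, otherwise fall back.
    try:
        return AutoModelForCausalLM.from_pretrained(
            pretrain_path,
            device_map="auto",
            torch_dtype=dtype,
            attn_implementation="flash_attention_2",
        )
    except (ImportError, ValueError):
        return AutoModelForCausalLM.from_pretrained(
            pretrain_path, device_map="auto", torch_dtype=dtype
        )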
inspiremusic/utils/common.py CHANGED
@@ -16,12 +16,9 @@
16
  """Unility functions for Transformer."""
17
 
18
  from typing import List
19
-
20
  import torch
21
  IGNORE_ID = -1
22
 
23
- MUSIC_STRUCTURE_LABELS = ["intro", "verse1", "chorus", "verse2", "outro"]
24
-
25
  def pad_list(xs: List[torch.Tensor], pad_value: int):
26
  """Perform padding for the list of tensors.
27
 
@@ -92,16 +89,61 @@ def th_accuracy(pad_outputs: torch.Tensor, pad_targets: torch.Tensor,
92
  denominator = torch.sum(mask)
93
  return (numerator / denominator).detach()
94
 
95
-
96
  def get_padding(kernel_size, dilation=1):
97
  return int((kernel_size * dilation - dilation) / 2)
98
 
99
-
100
  def init_weights(m, mean=0.0, std=0.01):
101
  classname = m.__class__.__name__
102
  if classname.find("Conv") != -1:
103
  m.weight.data.normal_(mean, std)
104
 
105
  def topk_sampling(weighted_scores, decoded_tokens, top_k=25):
106
  zeros = weighted_scores.new_ones(weighted_scores.shape) * float('-inf')
107
  values,indices = torch.topk(weighted_scores,top_k)
 
16
  """Unility functions for Transformer."""
17
 
18
  from typing import List
 
19
  import torch
20
  IGNORE_ID = -1
21
 
 
 
22
  def pad_list(xs: List[torch.Tensor], pad_value: int):
23
  """Perform padding for the list of tensors.
24
 
 
89
  denominator = torch.sum(mask)
90
  return (numerator / denominator).detach()
91
 
 
92
  def get_padding(kernel_size, dilation=1):
93
  return int((kernel_size * dilation - dilation) / 2)
94
 
 
95
  def init_weights(m, mean=0.0, std=0.01):
96
  classname = m.__class__.__name__
97
  if classname.find("Conv") != -1:
98
  m.weight.data.normal_(mean, std)
99
 
100
+ def keep_rhythm(next_token, current_time_signature):
101
+ allowed_durations = get_allowed_durations(current_time_signature)
102
+ if next_token not in allowed_durations:
103
+ next_token = random.choice(allowed_durations)
104
+ return next_token
105
+
106
+ def keep_harmony(next_token, current_chord):
107
+ allowed_notes = get_allowed_notes(current_chord) # Define allowed notes for the chord
108
+ if next_token not in allowed_notes:
109
+ next_token = random.choice(allowed_notes) # Replace with a valid note
110
+ return next_token
111
+
112
+ def relieve_repetition(weighted_scores, recent_tokens, repetition_penalty=1.2):
113
+ for token in recent_tokens:
114
+ if weighted_scores[token] > 0:
115
+ weighted_scores[token] /= repetition_penalty
116
+ return weighted_scores
117
+
118
+ def top_p_sampling_with_constraints(weighted_scores, decoded_tokens, top_p=0.85, temperature=1.1, current_chord=None, current_time_signature=None, recent_tokens=None):
119
+ # Apply temperature scaling
120
+ weighted_scores = weighted_scores ** (1 / temperature)
121
+ weighted_scores /= weighted_scores.sum()
122
+
123
+ if recent_tokens:
124
+ weighted_scores = relieve_repetition(weighted_scores, recent_tokens)
125
+
126
+ # Sort weighted scores in descending order
127
+ sorted_weighted_scores, _ = torch.sort(weighted_scores, descending=True)
128
+
129
+ # Compute cumulative weighted scores
130
+ cumulative_weighted_scores = torch.cumsum(sorted_weighted_scores, dim=0)
131
+
132
+ # Find the threthold index of top-p
133
+ cutoff_index = torch.where(cumulative_weighted_scores >= top_p)[0][0]
134
+ selected_weighted_scores = sorted_weighted_scores[:cutoff_index + 1]
135
+
136
+ # Apply domain-specific constraints
137
+ if current_chord:
138
+ selected_weighted_scores = keep_harmony(selected_weighted_scores, current_chord)
139
+ if current_time_signature:
140
+ selected_weighted_scores = keep_rhythm(selected_weighted_scores, current_time_signature)
141
+
142
+ # Normalize selected probabilities
143
+ selected_weighted_scores /= selected_weighted_scores.sum()
144
+
145
+ # Sample top-p tokens from the distribution
146
+ return random_sampling(selected_weighted_scores, decoded_tokens)
147
  def topk_sampling(weighted_scores, decoded_tokens, top_k=25):
148
  zeros = weighted_scores.new_ones(weighted_scores.shape) * float('-inf')
149
  values,indices = torch.topk(weighted_scores,top_k)
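The new top_p_sampling_with_constraints combines temperature scaling, a repetition penalty, a top-p cutoff, and optional harmony/rhythm constraints; its helpers (get_allowed_durations, get_allowed_notes, random_sampling) and the random import are not part of this hunk. For reference, a minimal sketch of the unconstrained core over a 1-D non-negative score vector:

import torch

def top_p_sample(weighted_scores: torch.Tensor, top_p: float = 0.85,
                 temperature: float = 1.1) -> int:
    # Temperature-scale the scores, keep the smallest prefix whose cumulative
    # probability reaches top_p, renormalize, and sample one token id.
    scores = weighted_scores.clamp_min(1e-12) ** (1.0 / temperature)
    probs = scores / scores.sum()
    sorted_probs, sorted_ids = torch.sort(probs, descending=True)
    cumulative = torch.cumsum(sorted_probs, dim=0)
    cutoff = int(torch.searchsorted(cumulative, top_p).item()) + 1
    kept = sorted_probs[:cutoff] / sorted_probs[:cutoff].sum()
    choice = torch.multinomial(kept, 1).item()
    return int(sorted_ids[choice].item())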
inspiremusic/utils/executor.py CHANGED
@@ -24,13 +24,19 @@ from inspiremusic.utils.train_utils import update_parameter_and_lr, log_per_step
24
  from torch.cuda.amp import GradScaler, autocast
25
 
26
  class Executor:
27
-
28
  def __init__(self):
29
  self.step = 0
30
  self.epoch = 0
31
  self.rank = int(os.environ.get('RANK', 0))
32
- self.device = torch.device('cuda:{}'.format(self.rank))
33
-
 
 
  def train_one_epoch(self, model, optimizer, scheduler, train_data_loader, cv_data_loader, writer, info_dict, group_join, scaler=None):
35
  ''' Train one epoch
36
  '''
 
24
  from torch.cuda.amp import GradScaler, autocast
25
 
26
  class Executor:
 
27
  def __init__(self):
28
  self.step = 0
29
  self.epoch = 0
30
  self.rank = int(os.environ.get('RANK', 0))
31
+ if torch.cuda.is_available():
32
+ if torch.cuda.is_available():
33
+ self.device = torch.device('cuda:{}'.format(self.rank))
34
+ elif torch.backends.mps.is_available():
35
+ self.device = torch.device('mps')
36
+ elif torch.xpu.is_available():
37
+ self.device = torch.device('xpu')
38
+ else:
39
+ self.device = torch.device('cpu')
40
  def train_one_epoch(self, model, optimizer, scheduler, train_data_loader, cv_data_loader, writer, info_dict, group_join, scaler=None):
41
  ''' Train one epoch
42
  '''
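The Executor keeps binding to cuda:<rank> when CUDA is present and otherwise falls back like the other files in this commit; as written, the hunk tests torch.cuda.is_available() twice, in both the outer and inner if. A compact sketch of the intended per-rank binding, with the redundant check dropped and torch.xpu guarded as before:

import os
import torch

def rank_device(rank: int = None) -> torch.device:
    # Bind this process to cuda:<rank> if possible, else probe other backends.
    rank = int(os.environ.get("RANK", 0)) if rank is None else rank
    if torch.cuda.is_available():
        return torch.device(f"cuda:{rank}")
    if torch.backends.mps.is_available():
        return torch.device("mps")
    if hasattr(torch, "xpu") and torch.xpu.is_available():
        return torch.device("xpu")
    return torch.device("cpu")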
inspiremusic/utils/utils.py CHANGED
@@ -1,5 +1,27 @@
1
  import os
2
  import sys
3
 
4
  def align_trans_scp_file(trans, scp):
5
  trans_dict = {}
@@ -14,9 +36,4 @@ def align_trans_scp_file(trans, scp):
14
  scp_dict[sec[0]] = sec[1]
15
  with open("text", "w") as f:
16
  for k, v in scp_dict.items():
17
- f.write("%s\t%s\n"%(k,trans_dict[k]))
18
-
19
- if __name__ == '__main__':
20
- trans = sys.argv[1]
21
- scp = sys.argv[2]
22
- align_trans_scp_file(trans, scp)
 
1
  import os
2
  import sys
3
+ import subprocess
4
+
5
+ def download_model(repo_url: str, output_dir: str = None, token: str = None):
6
+ try:
7
+ if token:
8
+ repo_url = repo_url.replace("https://", f"https://USER:{token}@")
9
+ else:
10
+ repo_url = f"https://www.modelscope.cn/models/iic/{repo_url}"
11
+
12
+ cmd = ["git", "clone", repo_url]
13
+ if output_dir:
14
+ cmd.append(output_dir)
15
+
16
+ result = subprocess.run(
17
+ cmd,
18
+ check=True,
19
+ capture_output=True,
20
+ text=True
21
+ )
22
+ print("Success:", result.stdout)
23
+ except subprocess.CalledProcessError as e:
24
+ print("Error:", e.stderr)
25
 
26
  def align_trans_scp_file(trans, scp):
27
  trans_dict = {}
 
36
  scp_dict[sec[0]] = sec[1]
37
  with open("text", "w") as f:
38
  for k, v in scp_dict.items():
39
+ f.write("%s\t%s\n"%(k,trans_dict[k]))
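The new download_model helper shells out to git clone, embedding the access token in the URL when one is given and otherwise prefixing the ModelScope base URL, so the positional argument is a bare model name in the no-token case. A hypothetical usage sketch; the destination paths and token value are placeholders:

# Public repo by name: resolves to https://www.modelscope.cn/models/iic/InspireMusic-1.5B-Long
download_model("InspireMusic-1.5B-Long",
               output_dir="pretrained_models/InspireMusic-1.5B-Long")

# Private repo via full URL, with the token embedded into the clone URL
download_model("https://www.modelscope.cn/models/iic/InspireMusic-1.5B-Long.git",
               output_dir="pretrained_models/InspireMusic-1.5B-Long",
               token="<personal-access-token>")

Note that checkpoints on these hubs are stored with Git LFS, so the clone only retrieves pointer files unless git-lfs is installed; that caveat is mine, not stated in the diff.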
inspiremusic/wavtokenizer/.DS_Store DELETED
Binary file (6.15 kB)