delete useless message
- app.py +3 -1
- codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/README.md +0 -65
- codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/EAT_pretraining_AS2M.yaml +1 -1
- codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/data/mert_dataset.py +9 -52
- codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/models/mert/chroma_torch.py +0 -12
- codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/models/mert/mert_model.py +0 -4
- codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/tasks/mert_pretraining.py +2 -35
app.py
CHANGED
@@ -66,12 +66,14 @@ def generate_song(lyric, description=None, prompt_audio=None, genre=None, cfg_co
     # format lyric
     lyric = lyric.replace("[intro]", "[intro-short]").replace("[inst]", "[inst-short]").replace("[outro]", "[outro-short]")
     paragraphs = [p.strip() for p in lyric.strip().split('\n\n') if p.strip()]
+    if len(paragraphs) < 1:
+        return None, json.dumps("Lyrics can not be left blank")
     paragraphs_norm = []
     for para in paragraphs:
         lines = para.splitlines()
         struct_tag = lines[0].strip().lower()
         if struct_tag not in STRUCTS:
-            return None, json.dumps(f"
+            return None, json.dumps(f"Segments should start with a structure tag in {STRUCTS}")
         if struct_tag in ['[verse]', '[chorus]', '[bridge]']:
             if len(lines) < 2 or not [line.strip() for line in lines[1:] if line.strip()]:
                 return None, json.dumps("The following segments require lyrics: [verse], [chorus], [bridge]")
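For reference, the added checks can be exercised in isolation. This is a minimal sketch, not the app itself: STRUCTS below is a hypothetical stand-in for the list defined in app.py, and check_lyric only mirrors the validation branches shown in this hunk.

```
import json

# Hypothetical stand-in for the STRUCTS list defined elsewhere in app.py.
STRUCTS = ['[verse]', '[chorus]', '[bridge]', '[intro-short]', '[inst-short]', '[outro-short]']

def check_lyric(lyric):
    """Return a JSON-encoded error message, or None if the lyric passes the new checks."""
    lyric = lyric.replace("[intro]", "[intro-short]").replace("[inst]", "[inst-short]").replace("[outro]", "[outro-short]")
    paragraphs = [p.strip() for p in lyric.strip().split('\n\n') if p.strip()]
    if len(paragraphs) < 1:
        return json.dumps("Lyrics can not be left blank")
    for para in paragraphs:
        lines = para.splitlines()
        struct_tag = lines[0].strip().lower()
        if struct_tag not in STRUCTS:
            return json.dumps(f"Segments should start with a structure tag in {STRUCTS}")
        if struct_tag in ['[verse]', '[chorus]', '[bridge]']:
            if len(lines) < 2 or not [line.strip() for line in lines[1:] if line.strip()]:
                return json.dumps("The following segments require lyrics: [verse], [chorus], [bridge]")
    return None

print(check_lyric(""))                                   # empty lyric -> error message
print(check_lyric("[verse]\nfirst line\nsecond line"))   # valid segment -> None
```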
codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/README.md
DELETED
@@ -1,65 +0,0 @@
-# Our MERT & BEST-RQ
-Our implementation of the MERT model. Files modified:
-- mert_fairseq/models/mert/mert_model.py
-- mert_fairseq/data/mert_dataset.py
-- run_training_mulNodes_wotorchdist_womodelparsize.sh
-
-# Prepare
-
-The MERT training is implemented with [fairseq](https://github.com/pytorch/fairseq). You need to clone the fairseq repo inside our repo at ./src/fairseq and add the MERT implementation code as a fairseq example project.
-
-You can do that by following these steps:
-```
-mkdir -p ./src/fairseq
-cd ./src
-git clone https://github.com/pytorch/fairseq
-```
-
-
-# Docker
-```
-mirrors.tencent.com/cloudezhou/mert:v3
-```
-
-# Start
-
-### 1-node training
-
-```
-bash run_training_sglNodes.sh 0 dummy MERT_RVQ-VAE_CQT_330M_multinodes_debug1node
-```
-
-
-### 1-node training (BEST-RQ)
-
-```
-bash run_training_sglNodes.sh 0 dummy MERT_RVQ-VAE_CQT_95M_bestrq
-```
-
-### 4-node training
-```
-bash run_training_mulNodes_wotorchdist_womodelparsize.sh 0 dummy MERT_RVQ-VAE_CQT_330M_multinodes
-bash run_training_mulNodes_wotorchdist_womodelparsize.sh 1 dummy MERT_RVQ-VAE_CQT_330M_multinodes
-bash run_training_mulNodes_wotorchdist_womodelparsize.sh 2 dummy MERT_RVQ-VAE_CQT_330M_multinodes
-bash run_training_mulNodes_wotorchdist_womodelparsize.sh 3 dummy MERT_RVQ-VAE_CQT_330M_multinodes
-```
-
-
-### 4-node training (BEST-RQ)
-```
-bash run_training_mulNodes_wotorchdist_womodelparsize.sh $INDEX dummy MERT_RVQ-VAE_CQT_95M_bestrq_multinodes BEST_RQ $CHIEF_IP
-```
-
-### 4-node training (MusicFM)
-```
-bash run_training_mulNodes_wotorchdist_womodelparsize.sh $INDEX dummy MusicFM_95M_multinodes MUSICFM $CHIEF_IP
-```
-
-### 4-node training (EAT)
-```
-bash run_training_eat.sh $INDEX dummy EAT_pretraining_music_multinodes EAT $CHIEF_IP
-```
-
-You can set the parameters in [mert_fairseq/config/pretrain/MERT_RVQ-VAE_CQT_330M.yaml](mert_fairseq/config/pretrain/MERT_RVQ-VAE_CQT_330M.yaml)
-
-Our latest checkpoint is at [data/fairseq_savedir/ckpt_MERT_RVQ-VAE_CQT/MERT_RVQ-VAE_CQT_330M/checkpoint_last.pt](data/fairseq_savedir/ckpt_MERT_RVQ-VAE_CQT/MERT_RVQ-VAE_CQT_330M/checkpoint_last.pt)
codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/EAT_pretraining_AS2M.yaml
CHANGED
@@ -18,7 +18,7 @@ checkpoint:
 
 task:
   _name: mae_image_pretraining
-  data:
+  data: unbalanced_train
   rebuild_batches: true
   key: source
   precompute_mask_config: {}
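The change fills in the previously empty data key for the mae_image_pretraining task. A quick way to sanity-check the merged value is to load the file with OmegaConf (already imported elsewhere in this repo); this is only a sketch, and the relative path assumes you run it from the repository root.

```
from omegaconf import OmegaConf

cfg = OmegaConf.load(
    "codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/EAT_pretraining_AS2M.yaml"
)
print(cfg.task["_name"])  # mae_image_pretraining
print(cfg.task.data)      # expected to be "unbalanced_train" after this change
```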
codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/data/mert_dataset.py
CHANGED
@@ -274,11 +274,6 @@ class MERTDataset(FairseqDataset):
         dataset_len:int = 128*3000,
         clip_secs = 5,
     ):
-        # self.audio_root, self.audio_names, inds, tot, self.sizes = load_audio(
-        #     manifest_path, max_keep_sample_size, min_keep_sample_size
-        # )
-
-        # manifest_path = '/apdcephfs_cq2/share_1297902/speech_user/erichtchen/shixisheng/zhouyz/MERT/music_data/all_v4/train.json'
         self.sample_rate = sample_rate
         self.shuffle = shuffle
         self.random_crop = random_crop
@@ -308,15 +303,8 @@ class MERTDataset(FairseqDataset):
            self.label_list = [load_label(p, inds, tot) for p in label_paths]
         else:
            self.label_paths = label_paths
-
-            #     load_label_offset(p, inds, tot) for p in label_paths
-            # ]
+
         assert label_processors is None or len(label_processors) == self.num_labels
-        # logger.info('skip verify labels and audio lengths')
-        # for label_path, label_rate in zip(label_paths, self.label_rates):
-        #     verify_label_lengths(
-        #         self.sizes, sample_rate, label_path, label_rate, inds, tot
-        #     )
 
         self.max_sample_size = (
             max_sample_size if max_sample_size is not None else sys.maxsize
@@ -330,9 +318,6 @@ class MERTDataset(FairseqDataset):
 
         self.augmentation_effects = augmentation_effects
         self.augmentation_probs = augmentation_probs
-        # if len(self.augmentation_effects) > 0:
-        #     self.augmentor_init()
-        #     self.apply_augmentation = self.augmentation_factry(sample_rate)
 
         self.inbatch_noise_augment_len_range = inbatch_noise_augment_len_range
         self.inbatch_noise_augment_number_range = inbatch_noise_augment_number_range
@@ -397,10 +382,7 @@ class MERTDataset(FairseqDataset):
 
         return augmented_audio
     def get_audio_by_slice(self,index):
-
-        # wav_path = os.path.join('/apdcephfs/share_1316500/cloudezhou/MERT/MERT/converted', self.audio_names[index])
         wav_path = self.datas[index]['path']
-        # print(wav_path)
         audio_info = torchaudio.info(wav_path)
         origin_sample_rate = audio_info.sample_rate
         origin_duration = audio_info.num_frames / origin_sample_rate
@@ -408,32 +390,14 @@ class MERTDataset(FairseqDataset):
         wav, *ignored = self.reader(wav_path, origin_duration,origin_sample_rate)
         wav = wav.float()
 
-        # _path, slice_ptr = parse_path(wav_path)  # this probably needs to change as well
-        # original way
-        # if len(slice_ptr) == 0:
-        #     wav, cur_sample_rate = sf.read(_path)
-        # else:
-        #     assert _path.endswith(".zip")
-        #     data = read_from_stored_zip(_path, slice_ptr[0], slice_ptr[1])
-        #     f = io.BytesIO(data)
-        #     wav, cur_sample_rate = sf.read(f)
-        #     wav = torch.from_numpy(wav).float()
-        # print(wav.shape)
         wav = wav.permute(1,0)
         wav = self.postprocess(wav, self.sample_rate)  # downmix to mono, check the sample rate, normalize
-        # print(wav.shape)
-
-        # wav = wav.squeeze(0)
         return wav
+
     def get_audio(self, index):
         import soundfile as sf
-
-
-        wav_path = os.path.join('/apdcephfs/share_1316500/cloudezhou/MERT/MERT/converted', self.audio_names[index])
-        # print(wav_path)
-        # self.reader()
-        _path, slice_ptr = parse_path(wav_path)  # this probably needs to change as well
-        # original way
+        wav_path = self.audio_names[index]
+        _path, slice_ptr = parse_path(wav_path)
         if len(slice_ptr) == 0:
             wav, cur_sample_rate = sf.read(_path)
         else:
@@ -448,8 +412,6 @@ class MERTDataset(FairseqDataset):
         return wav
 
     def get_label(self, index, label_idx):
-        # label_idx selects which dictionary to use (8 dictionaries by default)
-
        if self.store_labels and (not self.npmemmap):
            label = self.label_list[label_idx][index]
        elif self.store_labels and self.npmemmap:
@@ -570,11 +532,6 @@ class MERTDataset(FairseqDataset):
         cqt_labels = self.encoder_cqt_model(collated_audios.float(), forward_type='compute_cqt')
 
         for i, _ in enumerate(audios):
-            # compute cqt labels in advance
-            # cqt_labels
-
-            # yizhilll: apply audio augmentation effects here
-            # the audio should be as the type torch.Tensor, in the shape [1, length] TODO?
             if len(self.augmentation_effects) > 0:
                 with torch.no_grad():
                     for effect, prob in zip(self.augmentation_effects, self.augmentation_probs):
@@ -597,12 +554,12 @@ class MERTDataset(FairseqDataset):
 
     def collater_frm_label(self, targets, audio_size, audio_starts, label_rate, pad):
         assert label_rate > 0
-        s2f = label_rate / self.sample_rate
-        frm_starts = [int(round(s * s2f)) for s in audio_starts]
-        frm_size = int(round(audio_size * s2f))
+        s2f = label_rate / self.sample_rate
+        frm_starts = [int(round(s * s2f)) for s in audio_starts]
+        frm_size = int(round(audio_size * s2f))
         if not self.pad_audio:
-            rem_size = [len(t) - s for t, s in zip(targets, frm_starts)]
-            frm_size = min(frm_size, *rem_size)
+            rem_size = [len(t) - s for t, s in zip(targets, frm_starts)]
+            frm_size = min(frm_size, *rem_size)
         targets = [t[s : s + frm_size] for t, s in zip(targets, frm_starts)]
         logger.debug(f"audio_starts={audio_starts}")
         logger.debug(f"frame_starts={frm_starts}")
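The re-indented lines in collater_frm_label are a plain sample-to-frame conversion. A worked sketch with made-up numbers (the sample rate, label rate, and crop offsets below are illustrative, not values taken from this repo):

```
# Illustrative numbers: map audio-sample offsets to label-frame offsets.
sample_rate = 24000            # audio samples per second (assumed)
label_rate = 75                # label frames per second (assumed)
audio_starts = [0, 12000]      # crop start positions, in samples
audio_size = 120000            # crop length, in samples (5 s)

s2f = label_rate / sample_rate                              # 0.003125 frames per sample
frm_starts = [int(round(s * s2f)) for s in audio_starts]    # [0, 38]
frm_size = int(round(audio_size * s2f))                     # 375 frames

# The pad_audio=False branch also clips to the shortest remaining label sequence.
targets = [list(range(400)), list(range(380))]              # fake label streams
rem_size = [len(t) - s for t, s in zip(targets, frm_starts)]
frm_size = min(frm_size, *rem_size)                         # min(375, 400, 342) = 342
print(frm_starts, frm_size)
```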
codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/models/mert/chroma_torch.py
CHANGED
@@ -247,15 +247,3 @@ class ChromaSpectrogram(torch.nn.Module):
         chroma_spectrogram[chroma_spectrogram < 0] = 0.0
         chroma_spectrogram = torch.nn.functional.normalize(chroma_spectrogram, p=2, dim=-2)
         return chroma_spectrogram
-
-if __name__ == '__main__':
-    import numpy as np
-    import librosa
-    audio_path = 'speech_data/pretrain/music_42/226849998.flac'
-    sr = 24000
-    freq = 75
-    hop = int(sr // freq)
-    y, _sr = librosa.load(audio_path, duration=5, sr=sr)
-
-    chroma_extractor = ChromaSpectrogram(sample_rate=sr, hop_length=hop, n_fft=2048, use_cqt=True)
-    chroma_tr = chroma_extractor(torch.from_numpy(y)).numpy()
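The deleted block was only an inline smoke test. An equivalent standalone sketch for anyone who wants to try the module after this change; the import path is inferred from the file location, and the audio file is a placeholder:

```
import torch
import librosa

# Import path inferred from mert_fairseq/models/mert/chroma_torch.py; adjust to your setup.
from mert_fairseq.models.mert.chroma_torch import ChromaSpectrogram

sr = 24000
freq = 75
hop = int(sr // freq)

y, _sr = librosa.load("some_clip.flac", duration=5, sr=sr)   # any short audio file
chroma_extractor = ChromaSpectrogram(sample_rate=sr, hop_length=hop, n_fft=2048, use_cqt=True)
chroma = chroma_extractor(torch.from_numpy(y))
print(chroma.shape)   # chroma bins over time frames
```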
codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/models/mert/mert_model.py
CHANGED
@@ -1293,10 +1293,8 @@ class MERTModel(BaseFairseqModel):
         feat_tsz = features.size(2)
         targ_tsz = min([t.size(1) for t in target_list])
         if self.feat2tar_ratio * feat_tsz > targ_tsz:
-            # @yizhilll: if feature * 2 > 3000, then crop the features
             feat_tsz = int(targ_tsz / self.feat2tar_ratio)
             features = features[..., :feat_tsz]
-        # @yizhilll: select only the first pseudo label if there are multiple labels
         target_inds = torch.arange(feat_tsz).float() * self.feat2tar_ratio
         target_list = [t[:, target_inds.long()] for t in target_list]
         return features, target_list
@@ -1507,8 +1505,6 @@ class MERTModel(BaseFairseqModel):
 
         if not self.skip_masked:
             masked_indices = torch.logical_and(~padding_mask, mask_indices)
-
-            # @yizhilll: TODO merge the codes here
             if self.random_codebook <= 0:
                 proj_x_m = self.final_proj(x[masked_indices])  # project the features into a lower-dimensional space
                 if self.untie_final_proj:
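The surviving logic crops the feature sequence so that feature frames and pseudo-label frames stay aligned through feat2tar_ratio. A small numeric sketch of that alignment (the shapes and the ratio are made up for illustration):

```
import torch

feat2tar_ratio = 2.0                                 # assumed ratio of label frames per feature frame
features = torch.randn(1, 768, 400)                  # (batch, dim, feat_tsz)
target_list = [torch.randint(0, 1024, (1, 750))]     # one pseudo-label stream, 750 frames

feat_tsz = features.size(2)
targ_tsz = min(t.size(1) for t in target_list)
if feat2tar_ratio * feat_tsz > targ_tsz:             # 800 > 750, so crop the features
    feat_tsz = int(targ_tsz / feat2tar_ratio)        # 375
    features = features[..., :feat_tsz]
target_inds = torch.arange(feat_tsz).float() * feat2tar_ratio
target_list = [t[:, target_inds.long()] for t in target_list]
print(features.shape, target_list[0].shape)          # torch.Size([1, 768, 375]) torch.Size([1, 375])
```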
codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/tasks/mert_pretraining.py
CHANGED
@@ -20,8 +20,7 @@ from fairseq.tasks import register_task
 from fairseq.tasks.fairseq_task import FairseqTask
 from omegaconf import MISSING
 
-
-from ..data.mert_dataset import MERTDataset  # this feels problematic, we need a better approach
+from ..data.mert_dataset import MERTDataset
 from ..data.ark_dataset import ArkDataset
 
 logger = logging.getLogger(__name__)
@@ -32,8 +31,6 @@ class LabelEncoder(object):
         self.dictionary = dictionary
 
     def __call__(self, label: str) -> List[str]:
-        # @yizhilll: https://fairseq.readthedocs.io/en/latest/_modules/fairseq/data/dictionary.html \
-        # encode_line returns a torch.IntTensor, should be all 1 for vanilla HuBERT
         return self.dictionary.encode_line(
             label,
             append_eos=False,
@@ -45,17 +42,6 @@ class PaddedNumpyLabelEncoder(object):
         pass
 
     def __call__(self, label):
-        # @yizhilll: https://fairseq.readthedocs.io/en/latest/_modules/fairseq/data/dictionary.html \
-        # encode_line returns a torch.IntTensor, should be all 1 for vanilla HuBERT
-        # return self.dictionary.encode_line(
-        #     label,
-        #     append_eos=False,
-        #     add_if_not_exist=False,
-        # )
-        # if isinstance(label, np.memmap):
-
-        # assert isinstance(label, np.memmap)
-        # t = torch.IntTensor(np.asarray(label).copy())
         t = torch.IntTensor(np.asarray(label))
         t = t[t>=0] # remove padded -1 values at the end
         return t
@@ -262,9 +248,7 @@ class MERTPretrainingTask(FairseqTask):
         else:
             self.state.add_factory("dictionaries", self.load_dictionaries)
 
-        self.blank_symbol = "<s>"
-
-        # @yizhilll: use eval() to pass list parameters, skirt the fairseq/torch error: Can't pickle <enum 'Choices'>: attribute lookup Choices on fairseq.dataclass.constants failed
+        self.blank_symbol = "<s>"
         self.augmentation_effects = eval(self.cfg.augmentation_effects)
         self.augmentation_probs = eval(self.cfg.augmentation_probs)
         if len(self.augmentation_effects) > 0:
@@ -321,14 +305,6 @@ class MERTPretrainingTask(FairseqTask):
             return self.cfg.data
         return self.cfg.label_dir
 
-    # def has_sharded_data(self, split):
-    #     """overwrite this function to let the trainer reload the dataset when the dynamic cropping changes"""
-    #     logger.info(f"check whether to re-load dataset for epoch {epoch} by overwriting task.has_sharded_data()")
-    #     # find the threshold that holds epoch \in [threshold, next_threshold)
-    #     is_reload_dataset = epoch in self.dynamic_crops_epoches
-
-    #     return os.pathsep in getattr(self.cfg, "data", "") or is_reload_dataset
-    # def is_force_load_dataset(self, epoch):
     def is_force_load_dataset(self, epoch, training_restore=False):
         # find the threshold that holds epoch \in [threshold, next_threshold)
         return (epoch in self.dynamic_crops_epoches) or training_restore or (self.cfg.sharding_data > 1)
@@ -340,15 +316,6 @@ class MERTPretrainingTask(FairseqTask):
 
     def set_dynamic_crop_max_sample(self, epoch):
         """ force to set the max_sample_size config for the dynamic cropping function"""
-        # pass
-        # @yizhilll: the parameter "epoch" is passed into this function in trainer.py#688,
-        # contained in "**kwargs"
-        # if 'train' in split:
-        #     epoch = kwargs['epoch']
-
-        # find the threshold that holds epoch \in [threshold, next_threshold)
-        # is_reload_dataset = epoch in self.dynamic_crops_epoches # test again
-        # if is_reload_dataset:
         if epoch in self.dynamic_crops_epoches:
             for idx in range(len(self.dynamic_crops_epoches)):
                 if (idx == len(self.dynamic_crops_epoches)-1) or \