import os

import hydra
import librosa
import torch
import torchaudio
import yaml
from omegaconf import DictConfig
from prodict import Prodict

from musiclm_pytorch import (
    AudioSpectrogramTransformerPretrained,
    TextTransformerPretrained,
    MuLaN,
    MuLaNEmbedder,
)
def get_pretrained_config(root, name):
    if root is None:
        return name
    path = os.path.join(root, name)
    # resolve the single snapshot directory under the local cache entry
    config_dir = os.path.join(path, 'snapshots')
    config_files = os.listdir(config_dir)
    assert len(config_files) == 1
    config_path = os.path.join(config_dir, config_files[0])
    return config_path
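# Usage sketch (assumption, not from the original file): with a local cache laid
# out as <root>/<name>/snapshots/<revision>/, the helper returns that snapshot
# directory; with root=None the hub model name is passed through unchanged.
# The model name below is hypothetical.
#
#   get_pretrained_config(None, "m-a-p/MERT-v1-95M")   # -> "m-a-p/MERT-v1-95M"
#   get_pretrained_config("/data/hf_cache", "MERT-v1-95M")  # -> ".../MERT-v1-95M/snapshots/<revision>"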
def create_MuLaN_from_config(config: DictConfig):
    """
    Create a MuLaN model from a configuration file.
    """
    pretraind_root = config.model.pretraind_root
    audio_model_name = get_pretrained_config(pretraind_root, config.model.audio_model.name)
    audio_transformer = AudioSpectrogramTransformerPretrained(
        model_name = audio_model_name,
        model_dim = config.model.audio_model.model_dim,
        use_layer_idx = config.model.audio_model.use_layer_idx,
        **config.model.audio_transformer
    )
    text_model_name = get_pretrained_config(pretraind_root, config.model.text_model.name)
    text_transformer = TextTransformerPretrained(
        model_name = text_model_name,
        **config.model.text_transformer
    )
    mulan = MuLaN(
        audio_transformer = audio_transformer,
        text_transformer = text_transformer,
        **config.model.mulan
    )
    return mulan
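# A minimal sketch of the YAML this function reads. Only the keys accessed above
# (pretraind_root, audio_model.*, text_model.name, and the audio_transformer /
# text_transformer / mulan kwarg blocks, plus checkpoint_path used by get_mulan)
# come from this file; all values and model names shown are assumptions.
#
#   checkpoint_path: checkpoints/mulan.pt
#   model:
#     pretraind_root: null            # or a local cache directory
#     audio_model:
#       name: m-a-p/MERT-v1-95M       # hypothetical audio backbone
#       model_dim: 768
#       use_layer_idx: -1
#     audio_transformer: {}           # extra kwargs for AudioSpectrogramTransformerPretrained
#     text_model:
#       name: bert-base-uncased       # hypothetical text backbone
#     text_transformer: {}            # extra kwargs for TextTransformerPretrained
#     mulan: {}                       # extra kwargs for MuLaN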
def create_CLAP_model(model_kwargs = None, ckpt_path = None):
    from functools import partial

    import laion_clap
    from torch import nn
    from torchaudio.functional import resample

    from musiclm_pytorch import SoftmaxContrastiveLearning

    # quantization: round-trip through int16 to match CLAP's audio preprocessing
    def int16_to_float32(x):
        return (x / 32767.0).float()

    def float32_to_int16(x):
        x = torch.clip(x, min=-1., max=1.)
        return (x * 32767.).int()

    model = laion_clap.CLAP_Module(enable_fusion=False, **(model_kwargs or {}))
    if ckpt_path is not None:
        model.load_ckpt(ckpt_path)
    else:
        model.load_ckpt()

    class CLAP_Model(nn.Module):
        def __init__(self, model, sr = 24000, decoupled_contrastive_learning = True):
            super().__init__()
            self.model = model
            self.model.eval()
            self.orig_sr = sr
            klass = partial(SoftmaxContrastiveLearning, decoupled_contrastive_learning = decoupled_contrastive_learning)
            self.contrast = klass()

        def forward(self, wavs, raw_texts):
            with torch.no_grad():
                # CLAP expects 48 kHz audio, so resample from the original rate first
                wavs = int16_to_float32(float32_to_int16(resample(wavs, self.orig_sr, 48000)))
                audio_latents = self.model.get_audio_embedding_from_data(x = wavs, use_tensor=True).float()
                text_latents = self.model.get_text_embedding(raw_texts, use_tensor=True)
            cl_loss = self.contrast(audio_latents, text_latents)
            return cl_loss

    clap = CLAP_Model(model)
    return clap
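# Usage sketch (assumption): scores a batch of 24 kHz waveforms against captions
# with the frozen LAION-CLAP model; tensor shapes and captions are illustrative.
#
#   clap = create_CLAP_model()                      # loads the default checkpoint
#   wavs = torch.randn(2, 24000 * 10)               # (batch, samples) at 24 kHz
#   loss = clap(wavs, ["a solo piano melody", "an energetic drum loop"])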
def get_mulan(config):
    with open(config, "r") as stream:
        mulan_config = yaml.safe_load(stream)
    mulan_config = Prodict.from_dict(mulan_config)
    ckpt_path = mulan_config.checkpoint_path
    mulan = create_MuLaN_from_config(mulan_config)
    mulan_embedder = MuLaNEmbedder(mulan, checkpoint_path = ckpt_path)
    mulan_embedder.eval()
    return mulan_embedder
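# Usage sketch (assumption): the path is illustrative; the YAML must provide a
# top-level checkpoint_path plus the model block outlined above.
#
#   mulan_embedder = get_mulan("configs/mulan.yaml")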
def extract_mert_embeds(mulan_embd_extractor, layer_num, filename):
    input_audios, fs = torchaudio.load(filename)
    mulan_sr = 24000
    # resample to MuLaN's expected sampling rate if needed
    if fs != mulan_sr:
        input_audios = torchaudio.functional.resample(input_audios, fs, mulan_sr)
        fs = mulan_sr
    inputs = mulan_embd_extractor.mulan.audio.processor(input_audios, sampling_rate=mulan_embd_extractor.mulan.audio.sr, return_tensors="pt")
    input_values = inputs['input_values'].squeeze(0).to(input_audios.device, dtype = input_audios.dtype)
    # hidden states of the requested layer: (batch_size, time_steps, 1024 feature_dim)
    prompt_embeds = mulan_embd_extractor.mulan.audio.model(input_values, output_hidden_states=True).hidden_states[layer_num]
    return prompt_embeds
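# Usage sketch (assumption): file path and layer index are illustrative.
#
#   mulan_embedder = get_mulan("configs/mulan.yaml")
#   embeds = extract_mert_embeds(mulan_embedder, layer_num=6, filename="example.wav")
#   print(embeds.shape)   # (batch_size, time_steps, feature_dim)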