# Spaces: tencent / SongGeneration (running on L40S)
# File size: 3,957 Bytes
# 258fd02

import torch,torchaudio
import os,sys,json
from tqdm import tqdm

#from codeclm_song_v1.codeclm.semantic_extractor.SpeechDecoder_v01.generate import Tango
from generate_septoken import Tango
import kaldiio
from kaldiio import WriteHelper
from audio import AudioFile

def read_wav(fname, sample_rate=48_000):
    """Load an audio file and return its samples at ``sample_rate``.

    ``torchaudio.load`` is tried first; if it raises, the file is read with
    the project's ``AudioFile`` reader instead.

    Args:
        fname: Path of the audio file to read.
        sample_rate: Target sampling rate in Hz. Audio stored at any other
            rate is resampled with ``torchaudio.functional.resample``.

    Returns:
        The loaded samples, resampled if necessary. When the first dimension
        of the result has size 1, it is duplicated along that dimension, so
        single-channel audio comes back with two identical channels.

    Raises:
        Exception: Whatever ``AudioFile`` raises when the fallback reader
            cannot read the file either.
    """
    try:
        orig_samples, fs = torchaudio.load(fname)
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that Ctrl-C and
        # SystemExit are not mistaken for a decoding failure.
        af = AudioFile(fname)
        orig_samples = af.read()
        fs = af.samplerate()
        # NOTE(review): presumably drops a leading batch-like axis returned by
        # AudioFile.read() -- confirm against the AudioFile implementation.
        orig_samples = orig_samples[0]
    if fs != sample_rate:
        orig_samples = torchaudio.functional.resample(orig_samples, fs, sample_rate)
    if orig_samples.shape[0] == 1:
        # Mono input: repeat the single channel to get a two-channel signal.
        orig_samples = torch.cat([orig_samples, orig_samples], 0)
    return orig_samples

def load_manifest(json_path):
    """Read a JSON-lines manifest and return the decoded entries as a list.

    Each non-blank line is parsed with ``json.loads``. Blank or
    whitespace-only lines (e.g. a trailing newline at the end of the file)
    are skipped instead of aborting the run with a decode error.
    """
    entries = []
    with open(json_path) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            entries.append(json.loads(line))
    return entries


def resolve_audio_paths(item, share_root='/mnt/share/'):
    """Return ``(vocal_path, bgm_paths)`` for one manifest entry.

    If ``item['vocal_path']`` exists on disk, the vocal path and the list in
    ``item['bgm_path']`` are used unchanged. Otherwise ``share_root`` is
    prepended to the vocal path and to every background path. Only the vocal
    path is probed; the background paths follow the same decision.

    Raises:
        KeyError: If ``item`` lacks ``'vocal_path'`` or ``'bgm_path'``.
    """
    vocal_path = item['vocal_path']
    bgm_paths = item['bgm_path']
    if os.path.exists(vocal_path):
        return vocal_path, bgm_paths
    return share_root + vocal_path, [share_root + p for p in bgm_paths]


def main(json_path, outdir, model_path="./saved/model_septoken/model_2.safetensors"):
    """Tokenize the vocal and background stems listed in a manifest.

    For every manifest entry the vocal file and the sum of the background
    files are passed to ``Tango.sound2code``; the two results are written
    under the entry's ``idx`` to ``token_vocal.{ark,scp}`` and
    ``token_bgm.{ark,scp}`` inside ``outdir``.

    Entries that fail for any reason are reported and skipped so that one bad
    file does not stop the whole run.

    Args:
        json_path: Path of the JSON-lines manifest. Each entry needs the keys
            ``idx``, ``vocal_path`` and ``bgm_path`` (a list of paths).
        outdir: Directory receiving the ark/scp files; created if missing.
        model_path: Checkpoint handed to ``Tango``.
    """
    mus_infos = load_manifest(json_path)

    tango = Tango(model_path=model_path)

    # WriteHelper cannot open its output files in a directory that does not exist.
    os.makedirs(outdir, exist_ok=True)
    vocal_spec = 'ark,scp:{}/token_vocal.ark,{}/token_vocal.scp'.format(outdir, outdir)
    bgm_spec = 'ark,scp:{}/token_bgm.ark,{}/token_bgm.scp'.format(outdir, outdir)

    first_time = True
    with WriteHelper(vocal_spec, write_function="pickle") as writer_vocal, \
            WriteHelper(bgm_spec, write_function="pickle") as writer_bgm:
        print(vocal_spec)
        print(bgm_spec)
        for item in tqdm(mus_infos):
            try:
                idx = item['idx']
                vocal_path, bgm_paths = resolve_audio_paths(item)
                vocal_tensor = read_wav(vocal_path)
                # NOTE: an earlier variant derived the background as
                # (full mix - vocal); the separate stems are summed instead.
                bgm_tensor = sum(read_wav(p) for p in bgm_paths)
                codes_vocal, codes_bgm = tango.sound2code(vocal_tensor, bgm_tensor)
                # Move both results to the CPU before writing either one, so a
                # failed conversion cannot leave a vocal entry without its
                # matching bgm entry.
                codes_vocal, codes_bgm = codes_vocal.cpu(), codes_bgm.cpu()
                writer_vocal(str(idx), codes_vocal)
                writer_bgm(str(idx), codes_bgm)
                if first_time:
                    # Print the code shapes once as a sanity check.
                    first_time = False
                    print(codes_vocal.shape, codes_bgm.shape)
            except Exception as err:
                # Best effort: report the entry and carry on. ``Exception``
                # (not a bare ``except``) keeps Ctrl-C working, and ``.get``
                # keeps a malformed entry from crashing the handler itself.
                info = item if isinstance(item, dict) else {}
                print(info.get('vocal_path'))
                print(info.get('bgm_path'))
                print('skipped idx={!r}: {}: {}'.format(
                    info.get('idx'), type(err).__name__, err), file=sys.stderr)
                continue


if __name__ == "__main__":
    if len(sys.argv) < 3:
        sys.exit('usage: {} <manifest.jsonl> <outdir>'.format(sys.argv[0]))
    main(sys.argv[1], sys.argv[2])