# Spaces: Running on L40S
import torch,torchaudio | |
import os,sys,json | |
from tqdm import tqdm | |
#from codeclm_song_v1.codeclm.semantic_extractor.SpeechDecoder_v01.generate import Tango | |
from generate_septoken import Tango | |
import kaldiio | |
from kaldiio import WriteHelper | |
from audio import AudioFile | |
def read_wav(fname, sample_rate=48_000):
    """Load an audio file and return its samples at ``sample_rate``.

    ``torchaudio.load`` is tried first; if it raises, the file is read through
    the project's ``AudioFile`` wrapper instead.

    Args:
        fname: Path of the audio file to read.
        sample_rate: Target sampling rate in Hz. The audio is resampled only
            when the rate reported for the file differs from this value.

    Returns:
        The sample tensor. When its first dimension has size 1 the data is
        concatenated with itself along that dimension (presumably turning a
        mono signal into two identical channels -- confirm with callers).
    """
    try:
        orig_samples, fs = torchaudio.load(fname)
    except Exception:
        # ``Exception`` instead of a bare ``except`` so that Ctrl-C and
        # SystemExit still abort the run instead of triggering the fallback.
        af = AudioFile(fname)
        orig_samples = af.read()
        fs = af.samplerate()
        # NOTE(review): assumes AudioFile.read() returns a sequence whose first
        # element is the sample tensor, and that this indexing belongs to the
        # fallback branch only -- confirm against the AudioFile implementation.
        orig_samples = orig_samples[0]
    if fs != sample_rate:
        orig_samples = torchaudio.functional.resample(orig_samples, fs, sample_rate)
    if orig_samples.shape[0] == 1:
        # Duplicate the single row so the result always has two rows.
        orig_samples = torch.cat([orig_samples, orig_samples], 0)
    return orig_samples
if __name__ == "__main__":
    # Usage: <script> <json_path> <outdir>
    #   json_path: JSON-lines manifest, one object per line. The loop below
    #              reads the keys 'idx', 'vocal_path' and 'bgm_path' (an
    #              iterable of paths).
    #   outdir:    directory that receives token_vocal.{ark,scp} and
    #              token_bgm.{ark,scp}.
    if len(sys.argv) < 3:
        sys.exit("usage: {} <json_path> <outdir>".format(sys.argv[0]))
    json_path = sys.argv[1]
    outdir = sys.argv[2]

    # Load the manifest. Blank lines (e.g. a trailing newline) are skipped
    # instead of crashing json.loads.
    mus_infos = []
    with open(json_path) as f:
        for line in f:
            if line.strip():
                mus_infos.append(json.loads(line))

    # Define Model
    tango = Tango(model_path="./saved/model_septoken/model_2.safetensors")

    # Prefix applied to manifest paths that do not exist as written.
    share_root = '/mnt/share/'
    vocal_spec = 'ark,scp:{}/token_vocal.ark,{}/token_vocal.scp'.format(outdir, outdir)
    bgm_spec = 'ark,scp:{}/token_bgm.ark,{}/token_bgm.scp'.format(outdir, outdir)

    # Feature extraction loop
    first_time = True
    with WriteHelper(vocal_spec, write_function="pickle") as writer_vocal, \
            WriteHelper(bgm_spec, write_function="pickle") as writer_bgm:
        print(vocal_spec)
        print(bgm_spec)
        for item in tqdm(mus_infos):
            try:
                idx = item['idx']
                # The existence of the vocal path decides whether both the
                # vocal and the bgm paths are used as-is or prefixed.
                if os.path.exists(item['vocal_path']):
                    vocal_path = item['vocal_path']
                    bgm_paths = item['bgm_path']
                else:
                    vocal_path = share_root + item['vocal_path']
                    bgm_paths = [share_root + p for p in item['bgm_path']]
                vocal_tensor = read_wav(vocal_path)
                # The accompaniment is the sum of all bgm files. (An earlier
                # variant derived it as "full mix minus vocal"; that path is
                # no longer used.)
                bgm_tensor = sum(read_wav(p) for p in bgm_paths)
                codes_vocal, codes_bgm = tango.sound2code(vocal_tensor, bgm_tensor)
                writer_vocal(str(idx), codes_vocal.cpu())
                writer_bgm(str(idx), codes_bgm.cpu())
                if first_time:
                    # Print the code shapes once as a sanity check.
                    first_time = False
                    print(codes_vocal.shape, codes_bgm.shape)
            except Exception as err:
                # Best effort: report the failing item and keep going.
                # ``.get`` keeps a missing key from raising inside the handler
                # (which would abort the whole run), and ``Exception`` lets
                # Ctrl-C stop the loop.
                print(item.get('vocal_path'))
                print(item.get('bgm_path'))
                print(repr(err))
                continue