SongGeneration / codeclm /tokenizer /Flow1dVAE /extract_codes_stereo_7_1x1_sep.py
hainazhu
Add application file
258fd02
raw
history blame
3.96 kB
import torch,torchaudio
import os,sys,json
from tqdm import tqdm
#from codeclm_song_v1.codeclm.semantic_extractor.SpeechDecoder_v01.generate import Tango
from generate_septoken import Tango
import kaldiio
from kaldiio import WriteHelper
from audio import AudioFile
def read_wav(fname, sample_rate=48_000):
    """Load an audio file, resample it to ``sample_rate`` and upmix mono to stereo.

    ``torchaudio.load`` is tried first; if it raises, the project-local
    ``AudioFile`` reader is used as a fallback.

    Args:
        fname: Path of the audio file to read.
        sample_rate: Target sampling rate in Hz. The signal is resampled only
            when the file's own rate differs.

    Returns:
        The samples as a tensor. A leading dimension of size 1 (mono) is
        duplicated to size 2; any other channel count is returned unchanged.

    Raises:
        Exception: Whatever ``AudioFile`` raises when the fallback reader
            fails as well.
    """
    try:
        orig_samples, fs = torchaudio.load(fname)
    except Exception:
        # `except Exception` rather than a bare `except` so that Ctrl-C and
        # SystemExit are not mistaken for a decode failure.
        af = AudioFile(fname)
        orig_samples = af.read()
        fs = af.samplerate()
        # NOTE(review): presumably AudioFile.read() returns a tensor with an
        # extra leading (batch) dimension that is dropped here -- confirm
        # against the `audio` module.
        orig_samples = orig_samples[0]
    if fs != sample_rate:
        orig_samples = torchaudio.functional.resample(orig_samples, fs, sample_rate)
    if orig_samples.shape[0] == 1:
        # Mono input: repeat the single channel so the result has two channels.
        orig_samples = torch.cat([orig_samples, orig_samples], 0)
    return orig_samples
if __name__ == "__main__":
    # Usage: <script> <items.jsonl> <outdir>
    #
    # <items.jsonl> holds one JSON object per line. Each object must provide
    # 'idx', 'vocal_path' and 'bgm_path' (a list of paths). For every item the
    # vocal track and the summed background tracks are passed to
    # Tango.sound2code, and the two results are written, keyed by str(idx), to
    # <outdir>/token_vocal.{ark,scp} and <outdir>/token_bgm.{ark,scp}.
    if len(sys.argv) < 3:
        sys.exit("usage: {} <items.jsonl> <outdir>".format(sys.argv[0]))
    json_path = sys.argv[1]
    outdir = sys.argv[2]
    if outdir:
        # Create the output directory up front so a missing directory is not
        # discovered only after the model has been loaded.
        os.makedirs(outdir, exist_ok=True)

    mus_infos = []
    with open(json_path, encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            mus_infos.append(json.loads(line))

    # Define Model
    tango = Tango(model_path="./saved/model_septoken/model_2.safetensors")

    vocal_spec = f"ark,scp:{outdir}/token_vocal.ark,{outdir}/token_vocal.scp"
    bgm_spec = f"ark,scp:{outdir}/token_bgm.ark,{outdir}/token_bgm.scp"

    # Feature extraction loop
    first_time = True
    with WriteHelper(vocal_spec, write_function="pickle") as writer_vocal, \
            WriteHelper(bgm_spec, write_function="pickle") as writer_bgm:
        print(vocal_spec)
        print(bgm_spec)
        for item in tqdm(mus_infos):
            try:
                idx = item['idx']
                # Paths are used as given when the vocal file exists;
                # otherwise all of them are taken relative to /mnt/share/.
                if os.path.exists(item['vocal_path']):
                    vocal_path = item['vocal_path']
                    bgm_paths = item['bgm_path']
                else:
                    vocal_path = '/mnt/share/' + item['vocal_path']
                    bgm_paths = ['/mnt/share/' + p for p in item['bgm_path']]
                vocal_tensor = read_wav(vocal_path)
                # The background signal is the element-wise sum of all stems.
                bgm_tensor = sum(read_wav(p) for p in bgm_paths)
                codes_vocal, codes_bgm = tango.sound2code(vocal_tensor, bgm_tensor)
                # Move both results to CPU before writing either one, so a
                # device-side failure cannot leave a vocal entry without its
                # bgm counterpart.
                codes_vocal = codes_vocal.cpu()
                codes_bgm = codes_bgm.cpu()
                writer_vocal(str(idx), codes_vocal)
                writer_bgm(str(idx), codes_bgm)
                if first_time:
                    first_time = False
                    print(codes_vocal.shape, codes_bgm.shape)
            except Exception as exc:
                # Best effort: report the failing item and carry on with the
                # next one. `Exception` (not a bare `except`) lets Ctrl-C
                # stop the run; `.get` keeps the report itself from raising
                # when a key is missing.
                info = item if isinstance(item, dict) else {}
                print(info.get('vocal_path'))
                print(info.get('bgm_path'))
                print(repr(exc))
                continue
# idx = item['idx']
# # print(idx)
# full_path = item['path']
# vocal_path = item['vocal_path']
# bgm_paths = item['bgm_path']
# full_tensor = read_wav(full_path)
# vocal_tensor = read_wav(vocal_path)
# length = min(full_tensor.shape[-1], vocal_tensor.shape[-1])
# full_tensor, vocal_tensor = full_tensor[:, 0:length], vocal_tensor[:, 0:length]
# bgm_tensor = full_tensor - vocal_tensor
# codes_vocal, codes_bgm = tango.sound2code(vocal_tensor, bgm_tensor)
# writer_vocal(str(idx), codes_vocal.cpu())
# writer_bgm(str(idx), codes_bgm.cpu())
# if(first_time):
# first_time = False
# print(codes_vocal.shape, codes_bgm.shape)