Spaces:

tencent
/

SongGeneration

Running on L40S

SongGeneration / codeclm /tokenizer /Flow1dVAE /extract_codes_stereo_7_1x1_sep.py

hainazhu

Add application file

258fd02 14 days ago

3.96 kB

	import torch,torchaudio
	import os,sys,json
	from tqdm import tqdm

	#from codeclm_song_v1.codeclm.semantic_extractor.SpeechDecoder_v01.generate import Tango
	from generate_septoken import Tango
	import kaldiio
	from kaldiio import WriteHelper
	from audio import AudioFile

	def read_wav(fname, sample_rate=48_000):
	    """Load an audio file and return its samples at ``sample_rate``.

	    ``torchaudio.load`` is tried first. If it raises, the file is read
	    through ``AudioFile`` instead and the first element of the value
	    returned by ``AudioFile.read()`` is used as the samples.

	    Args:
	        fname: Path of the audio file to read.
	        sample_rate: Target sampling rate; the samples are resampled with
	            ``torchaudio.functional.resample`` when the file's rate differs.

	    Returns:
	        The sample tensor. When its first dimension has size 1 it is
	        duplicated along that dimension, so the result then has size 2
	        there (presumably mono -> stereo with channels first; confirm
	        against the loaders' output layout).
	    """
	    try:
	        orig_samples, fs = torchaudio.load(fname)
	    except Exception:
	        # Only ordinary errors trigger the fallback reader; KeyboardInterrupt
	        # and SystemExit must propagate instead of being swallowed here.
	        af = AudioFile(fname)
	        orig_samples = af.read()[0]
	        fs = af.samplerate()
	    if fs != sample_rate:
	        orig_samples = torchaudio.functional.resample(orig_samples, fs, sample_rate)
	    if orig_samples.shape[0] == 1:
	        # Repeat the single row so callers always get two rows.
	        orig_samples = torch.cat([orig_samples, orig_samples], 0)
	    return orig_samples

	if __name__ == "__main__":
	    # Command line: <json_path> <outdir>
	    #   json_path: JSON-lines file; each line is an object providing 'idx',
	    #              'vocal_path' and 'bgm_path' (an iterable of paths).
	    #   outdir:    directory receiving token_vocal.{ark,scp} and
	    #              token_bgm.{ark,scp}.
	    if len(sys.argv) < 3:
	        sys.exit('usage: {} <json_path> <outdir>'.format(sys.argv[0]))
	    json_path = sys.argv[1]
	    outdir = sys.argv[2]
	    if outdir:
	        # The writers open files inside outdir, so make sure it exists.
	        os.makedirs(outdir, exist_ok=True)

	    # Load one metadata record per non-empty line.
	    mus_infos = []
	    with open(json_path) as f:
	        for line in f:
	            line = line.strip()
	            if not line:
	                # Tolerate blank lines (e.g. a trailing newline at EOF).
	                continue
	            mus_infos.append(json.loads(line))

	    # Define Model
	    tango = Tango(model_path="./saved/model_septoken/model_2.safetensors")

	    vocal_spec = 'ark,scp:{}/token_vocal.ark,{}/token_vocal.scp'.format(outdir, outdir)
	    bgm_spec = 'ark,scp:{}/token_bgm.ark,{}/token_bgm.scp'.format(outdir, outdir)

	    # Feature extraction loop
	    first_time = True
	    with WriteHelper(vocal_spec, write_function="pickle") as writer_vocal, \
	            WriteHelper(bgm_spec, write_function="pickle") as writer_bgm:
	        print(vocal_spec)
	        print(bgm_spec)
	        for item in tqdm(mus_infos):
	            try:
	                idx = item['idx']
	                # Paths that do not exist as given are looked up under
	                # the shared mount instead.
	                if os.path.exists(item['vocal_path']):
	                    vocal_path = item['vocal_path']
	                    bgm_paths = item['bgm_path']
	                else:
	                    vocal_path = '/mnt/share/' + item['vocal_path']
	                    bgm_paths = ['/mnt/share/' + p for p in item['bgm_path']]
	                vocal_tensor = read_wav(vocal_path)
	                # The background input is the sum of every waveform listed
	                # in 'bgm_path'.
	                bgm_tensor = sum([read_wav(p) for p in bgm_paths])
	                codes_vocal, codes_bgm = tango.sound2code(vocal_tensor, bgm_tensor)
	                writer_vocal(str(idx), codes_vocal.cpu())
	                writer_bgm(str(idx), codes_bgm.cpu())
	                if first_time:
	                    # Print the code shapes once as a sanity check.
	                    first_time = False
	                    print(codes_vocal.shape, codes_bgm.shape)
	            except Exception as err:
	                # Best effort: report the failing item and move on.
	                # KeyboardInterrupt / SystemExit are not caught, so the job
	                # can still be interrupted.
	                print('skipped ({}: {})'.format(type(err).__name__, err))
	                if isinstance(item, dict):
	                    print(item.get('vocal_path'))
	                    print(item.get('bgm_path'))

	# idx = item['idx']
	# # print(idx)
	# full_path = item['path']
	# vocal_path = item['vocal_path']
	# bgm_paths = item['bgm_path']
	# full_tensor = read_wav(full_path)
	# vocal_tensor = read_wav(vocal_path)
	# length = min(full_tensor.shape[-1], vocal_tensor.shape[-1])
	# full_tensor, vocal_tensor = full_tensor[:, 0:length], vocal_tensor[:, 0:length]
	# bgm_tensor = full_tensor - vocal_tensor
	# codes_vocal, codes_bgm = tango.sound2code(vocal_tensor, bgm_tensor)
	# writer_vocal(str(idx), codes_vocal.cpu())
	# writer_bgm(str(idx), codes_bgm.cpu())
	# if(first_time):
	# first_time = False
	# print(codes_vocal.shape, codes_bgm.shape)