from fastai.basics import *
from ..vocab import *
from ..utils.top_k_top_p import top_k_top_p
from ..utils.midifile import is_empty_midi
from ..music_transformer.transform import *
from ..music_transformer.learner import filter_invalid_indexes
from .model import get_multitask_model
from .dataloader import *
def multitask_model_learner(data:DataBunch, config:dict=None, drop_mult:float=1.,
                            pretrained_path:PathOrStr=None, **learn_kwargs) -> 'MultitaskLearner':
    "Create a `MultitaskLearner` with a multitask transformer model built from `data`."
vocab = data.vocab
vocab_size = len(vocab)
if pretrained_path:
state = torch.load(pretrained_path, map_location='cpu')
if config is None: config = state['config']
model = get_multitask_model(vocab_size, config=config, drop_mult=drop_mult, pad_idx=vocab.pad_idx)
metrics = [AverageMultiMetric(partial(m, pad_idx=vocab.pad_idx)) for m in [mask_acc, lm_acc, c2m_acc, m2c_acc]]
loss_func = MultiLoss(ignore_index=data.vocab.pad_idx)
learn = MultitaskLearner(data, model, loss_func=loss_func, metrics=metrics, **learn_kwargs)
if pretrained_path:
get_model(model).load_state_dict(state['model'], strict=False)
if not hasattr(learn, 'opt'): learn.create_opt(defaults.lr, learn.wd)
try: learn.opt.load_state_dict(state['opt'])
        except: pass # saved optimizer state may not match the new parameter groups
del state
gc.collect()
return learn
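
# Example usage (a sketch, not executed on import): `data` is assumed to be a
# DataBunch built by this package's multitask dataloader, and `my_config` /
# the checkpoint name are hypothetical.
#
#   learn = multitask_model_learner(data, config=my_config)
#   learn.fit_one_cycle(4)
#   learn.save('multitask_checkpoint', config=my_config)  # stores the config next to the weights
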
class MultitaskLearner(Learner):
def save(self, file:PathLikeOrBinaryStream=None, with_opt:bool=True, config=None):
"Save model and optimizer state (if `with_opt`) with `file` to `self.model_dir`. `file` can be file-like (file or buffer)"
out_path = super().save(file, return_path=True, with_opt=with_opt)
if config and out_path:
state = torch.load(out_path)
state['config'] = config
torch.save(state, out_path)
del state
gc.collect()
return out_path
def predict_nw(self, item:MusicItem, n_words:int=128,
                   temperatures=(1.0,1.0), min_bars=4,
top_k=30, top_p=0.6):
"Return the `n_words` that come after `text`."
self.model.reset()
new_idx = []
vocab = self.data.vocab
x, pos = item.to_tensor(), item.get_pos_tensor()
last_pos = pos[-1] if len(pos) else 0
y = torch.tensor([0])
start_pos = last_pos
sep_count = 0
bar_len = SAMPLE_FREQ * 4 # assuming 4/4 time
repeat_count = 0
for i in progress_bar(range(n_words), leave=True):
batch = { 'lm': { 'x': x[None], 'pos': pos[None] } }, y
logits = self.pred_batch(batch=batch)['lm'][-1][-1]
prev_idx = new_idx[-1] if len(new_idx) else vocab.pad_idx
# Temperature
            # Use the first temperature value if the last prediction was a duration
temperature = temperatures[0] if vocab.is_duration_or_pad(prev_idx) else temperatures[1]
            # Raise the temperature when sampling keeps collapsing to very few choices (see repeat_count below)
            repeat_penalty = max(0, np.log((repeat_count+1)/4)/5) * temperature
            temperature += repeat_penalty
if temperature != 1.: logits = logits / temperature
            # Filter
            # bar = SAMPLE_FREQ * 4 = 16 position steps (assuming 4/4 time)
            filter_value = -float('Inf')
            # Suppress the BOS token until at least `min_bars` bars have been generated
            if ((last_pos - start_pos) // 16) <= min_bars: logits[vocab.bos_idx] = filter_value
logits = filter_invalid_indexes(logits, prev_idx, vocab, filter_value=filter_value)
logits = top_k_top_p(logits, top_k=top_k, top_p=top_p, filter_value=filter_value)
# Sample
probs = F.softmax(logits, dim=-1)
idx = torch.multinomial(probs, 1).item()
# Update repeat count
num_choices = len(probs.nonzero().view(-1))
if num_choices <= 2: repeat_count += 1
else: repeat_count = repeat_count // 2
if prev_idx==vocab.sep_idx:
duration = idx - vocab.dur_range[0]
last_pos = last_pos + duration
bars_pred = (last_pos - start_pos) // 16
abs_bar = last_pos // 16
# if (bars % 8 == 0) and (bars_pred > min_bars): break
                # Stop after 80% of the requested length, but only on a 4-bar boundary
                if (i / n_words > 0.80) and (abs_bar % 4 == 0): break
if idx==vocab.bos_idx:
print('Predicted BOS token. Returning prediction...')
break
new_idx.append(idx)
x = x.new_tensor([idx])
pos = pos.new_tensor([last_pos])
pred = vocab.to_music_item(np.array(new_idx))
full = item.append(pred)
return pred, full
def predict_mask(self, masked_item:MusicItem,
                     temperatures=(1.0,1.0),
top_k=20, top_p=0.8):
x = masked_item.to_tensor()
pos = masked_item.get_pos_tensor()
y = torch.tensor([0])
vocab = self.data.vocab
self.model.reset()
mask_idxs = (x == vocab.mask_idx).nonzero().view(-1)
repeat_count = 0
for midx in progress_bar(mask_idxs, leave=True):
prev_idx = x[midx-1]
            # Use the original positions; otherwise the model drifts off track
# pos = torch.tensor(-position_enc(xb[0].cpu().numpy()), device=xb.device)[None]
# Next Word
logits = self.pred_batch(batch=({ 'msk': { 'x': x[None], 'pos': pos[None] } }, y) )['msk'][0][midx]
# Temperature
            # Use the first temperature value if the last prediction was a duration
temperature = temperatures[0] if vocab.is_duration_or_pad(prev_idx) else temperatures[1]
repeat_penalty = max(0, np.log((repeat_count+1)/4)/5) * temperature
temperature += repeat_penalty
if temperature != 1.: logits = logits / temperature
# Filter
filter_value = -float('Inf')
special_idxs = [vocab.bos_idx, vocab.sep_idx, vocab.stoi[EOS]]
logits[special_idxs] = filter_value # Don't allow any special tokens (as we are only removing notes and durations)
logits = filter_invalid_indexes(logits, prev_idx, vocab, filter_value=filter_value)
logits = top_k_top_p(logits, top_k=top_k, top_p=top_p, filter_value=filter_value)
# Sampling
probs = F.softmax(logits, dim=-1)
idx = torch.multinomial(probs, 1).item()
# Update repeat count
num_choices = len(probs.nonzero().view(-1))
if num_choices <= 2: repeat_count += 1
else: repeat_count = repeat_count // 2
x[midx] = idx
return vocab.to_music_item(x.cpu().numpy())
def predict_s2s(self, input_item:MusicItem, target_item:MusicItem, n_words:int=256,
                    temperatures=(1.0,1.0), top_k=30, top_p=0.8,
use_memory=True):
vocab = self.data.vocab
# Input doesn't change. We can reuse the encoder output on each prediction
with torch.no_grad():
inp, inp_pos = input_item.to_tensor(), input_item.get_pos_tensor()
x_enc = self.model.encoder(inp[None], inp_pos[None])
# target
targ = target_item.data.tolist()
targ_pos = target_item.position.tolist()
last_pos = targ_pos[-1]
self.model.reset()
repeat_count = 0
max_pos = input_item.position[-1] + SAMPLE_FREQ * 4 # Only predict until both tracks/parts have the same length
x, pos = inp.new_tensor(targ), inp_pos.new_tensor(targ_pos)
for i in progress_bar(range(n_words), leave=True):
# Predict
with torch.no_grad():
dec = self.model.decoder(x[None], pos[None], x_enc)
logits = self.model.head(dec)[-1, -1]
# Temperature
            # Use the first temperature value if the last prediction was a duration
prev_idx = targ[-1] if len(targ) else vocab.pad_idx
temperature = temperatures[0] if vocab.is_duration_or_pad(prev_idx) else temperatures[1]
repeat_penalty = max(0, np.log((repeat_count+1)/4)/5) * temperature
temperature += repeat_penalty
if temperature != 1.: logits = logits / temperature
# Filter
filter_value = -float('Inf')
logits = filter_invalid_indexes(logits, prev_idx, vocab, filter_value=filter_value)
logits = top_k_top_p(logits, top_k=top_k, top_p=top_p, filter_value=filter_value)
# Sample
probs = F.softmax(logits, dim=-1)
idx = torch.multinomial(probs, 1).item()
# Update repeat count
num_choices = len(probs.nonzero().view(-1))
if num_choices <= 2: repeat_count += 1
else: repeat_count = repeat_count // 2
            if idx == vocab.bos_idx or idx == vocab.stoi[EOS]:
print('Predicting BOS/EOS')
break
if prev_idx == vocab.sep_idx:
duration = idx - vocab.dur_range[0]
last_pos = last_pos + duration
if last_pos > max_pos:
print('Predicted past counter-part length. Returning early')
break
targ_pos.append(last_pos)
targ.append(idx)
if use_memory:
                # Rely on the cached key/value memory; only the last predicted token needs to be fed
x, pos = inp.new_tensor([targ[-1]]), inp_pos.new_tensor([targ_pos[-1]])
else:
                # Reset memory after each prediction, since we're feeding the whole sequence every time
self.model.reset()
x, pos = inp.new_tensor(targ), inp_pos.new_tensor(targ_pos)
return vocab.to_music_item(np.array(targ))
# High-level prediction functions from a MIDI file
def nw_predict_from_midi(learn, midi=None, n_words=400,
temperatures=(1.0,1.0), top_k=30, top_p=0.6, seed_len=None, **kwargs):
vocab = learn.data.vocab
seed = MusicItem.from_file(midi, vocab) if not is_empty_midi(midi) else MusicItem.empty(vocab)
if seed_len is not None: seed = seed.trim_to_beat(seed_len)
pred, full = learn.predict_nw(seed, n_words=n_words, temperatures=temperatures, top_k=top_k, top_p=top_p, **kwargs)
return full
def s2s_predict_from_midi(learn, midi=None, n_words=200,
temperatures=(1.0,1.0), top_k=24, top_p=0.7, seed_len=None, pred_melody=True, **kwargs):
multitrack_item = MultitrackItem.from_file(midi, learn.data.vocab)
melody, chords = multitrack_item.melody, multitrack_item.chords
inp, targ = (chords, melody) if pred_melody else (melody, chords)
    # If seed_len is passed, cut off the target sequence so the model predicts the rest
if seed_len is not None: targ = targ.trim_to_beat(seed_len)
targ = targ.remove_eos()
pred = learn.predict_s2s(inp, targ, n_words=n_words, temperatures=temperatures, top_k=top_k, top_p=top_p, **kwargs)
part_order = (pred, inp) if pred_melody else (inp, pred)
return MultitrackItem(*part_order)
def mask_predict_from_midi(learn, midi=None, predict_notes=True,
temperatures=(1.0,1.0), top_k=30, top_p=0.7, section=None, **kwargs):
item = MusicItem.from_file(midi, learn.data.vocab)
masked_item = item.mask_pitch(section) if predict_notes else item.mask_duration(section)
pred = learn.predict_mask(masked_item, temperatures=temperatures, top_k=top_k, top_p=top_p, **kwargs)
return pred
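
# Example usage of the high-level helpers (a sketch; 'song.mid', 'duet.mid' and the
# trained `learn` object are assumptions, not provided by this module):
#
#   full  = nw_predict_from_midi(learn, 'song.mid', n_words=400, seed_len=16)   # continue a seed
#   duet  = s2s_predict_from_midi(learn, 'duet.mid', pred_melody=True)          # melody from chords
#   remix = mask_predict_from_midi(learn, 'song.mid', predict_notes=True)       # re-predict masked pitches
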
# LOSS AND METRICS
class MultiLoss():
def __init__(self, ignore_index=None):
"Loss mult - Mask, NextWord, Seq2Seq"
self.loss = CrossEntropyFlat(ignore_index=ignore_index)
def __call__(self, inputs:Dict[str,Tensor], targets:Dict[str,Tensor])->Rank0Tensor:
losses = [self.loss(inputs[key], target) for key,target in targets.items()]
return sum(losses)
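
# Shape sketch (illustrative, inferred from the metric keys below): `inputs` and
# `targets` are dicts keyed by task ('msk', 'lm', 'c2m', 'm2c'); each input holds
# [bs, seq_len, vocab_size] logits and each target [bs, seq_len] token indices, so
# the total loss is the sum of the per-task flattened cross entropies.
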
def acc_ignore_pad(input:Tensor, targ:Tensor, pad_idx)->Rank0Tensor:
if input is None or targ is None: return None
n = targ.shape[0]
input = input.argmax(dim=-1).view(n,-1)
targ = targ.view(n,-1)
mask = targ != pad_idx
return (input[mask]==targ[mask]).float().mean()
def acc_index(inputs, targets, key, pad_idx):
return acc_ignore_pad(inputs.get(key), targets.get(key), pad_idx)
def mask_acc(inputs, targets, pad_idx): return acc_index(inputs, targets, 'msk', pad_idx)
def lm_acc(inputs, targets, pad_idx): return acc_index(inputs, targets, 'lm', pad_idx)
def c2m_acc(inputs, targets, pad_idx): return acc_index(inputs, targets, 'c2m', pad_idx)
def m2c_acc(inputs, targets, pad_idx): return acc_index(inputs, targets, 'm2c', pad_idx)
class AverageMultiMetric(AverageMetric):
"Updated fastai.AverageMetric to support multi task metrics."
def on_batch_end(self, last_output, last_target, **kwargs):
"Update metric computation with `last_output` and `last_target`."
if not is_listy(last_target): last_target=[last_target]
val = self.func(last_output, *last_target)
if val is None: return
self.count += first_el(last_target).size(0)
if self.world:
val = val.clone()
dist.all_reduce(val, op=dist.ReduceOp.SUM)
val /= self.world
self.val += first_el(last_target).size(0) * val.detach().cpu()
def on_epoch_end(self, last_metrics, **kwargs):
"Set the final result in `last_metrics`."
if self.count == 0: return add_metrics(last_metrics, 0)
return add_metrics(last_metrics, self.val/self.count)
# TRAINING CALLBACK
class MTTrainer(LearnerCallback):
"`Callback` that regroups lr adjustment to seq_len, AR and TAR."
def __init__(self, learn:Learner, dataloaders=None, starting_mask_window=1):
super().__init__(learn)
self.count = 1
self.mw_start = starting_mask_window
self.dataloaders = dataloaders
def on_epoch_begin(self, **kwargs):
"Reset the hidden state of the model."
model = get_model(self.learn.model)
model.reset()
model.encoder.mask_steps = max(self.count+self.mw_start, 100)
def on_epoch_end(self, last_metrics, **kwargs):
"Finish the computation and sends the result to the Recorder."
if self.dataloaders is not None:
self.learn.data = self.dataloaders[self.count % len(self.dataloaders)]
self.count += 1
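
# Example wiring (a sketch; `data_small` and `data_large` are hypothetical DataBunch
# objects): attach MTTrainer so each epoch alternates dataloaders and widens the
# encoder mask window.
#
#   learn = multitask_model_learner(data_small)
#   learn.callbacks.append(MTTrainer(learn, dataloaders=[data_small, data_large]))
#   learn.fit_one_cycle(8)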