sweetcocoa susnato commited on
Commit
1a572e4
·
1 Parent(s): 705ff7f

codebase_change (#6)

Browse files

- codebase switched to HF transformers (26cb74172642e5881ada1f6b423c37f43e16c1e0)
- removed the last 2 examples (46b58a01fee498aabd4d3498364679fd1c7e0ad9)


Co-authored-by: Susnato Dhar <[email protected]>

README.md CHANGED
@@ -1,10 +1,12 @@
1
  ---
2
- title: Pop2Piano Demo
3
- emoji: 🎹
4
- colorFrom: black
5
- colorTo: white
6
  sdk: gradio
7
- sdk_version: 3.8.2
8
  app_file: app.py
9
- pinned: true
10
- ---
 
 
 
1
  ---
2
+ title: Pop2piano Dev
3
+ emoji: 🏢
4
+ colorFrom: pink
5
+ colorTo: green
6
  sdk: gradio
7
+ sdk_version: 3.42.0
8
  app_file: app.py
9
+ pinned: false
10
+ ---
11
+
12
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -1,54 +1,83 @@
 
1
  import torch
 
 
 
 
 
 
2
  import gradio as gr
3
- import os
4
- from transformer_wrapper import TransformerWrapper
5
- from omegaconf import OmegaConf
6
-
7
-
8
- def get_file_content_as_string(path):
9
- return open(path, "r", encoding="utf-8").read()
10
-
11
-
12
- def model_load():
13
- config = OmegaConf.load("config.yaml")
14
- wrapper = TransformerWrapper(config)
15
- wrapper = wrapper.load_from_checkpoint(
16
- "https://huggingface.co/sweetcocoa/pop2piano/resolve/main/model-1999-val_0.67311615.ckpt",
17
- config=config,
18
- map_location="cpu",
19
- )
20
- model_id = "dpipqxiy"
21
- wrapper.eval()
22
- if torch.cuda.is_available():
23
- wrapper = wrapper.cuda()
24
-
25
- return wrapper, model_id, config
26
-
27
-
28
- wrapper, model_id, config = model_load()
29
- composers = list(config.composer_to_feature_token.keys())
30
- dest_dir = "ytsamples"
31
- os.makedirs(dest_dir, exist_ok=True)
32
-
33
-
34
- def inference(file_up, composer):
35
-
36
- midi, arranger, mix_path, midi_path = wrapper.generate(
37
- audio_path=file_up,
38
- composer=composer,
39
- model=model_id,
40
- ignore_duplicate=True,
41
- show_plot=False,
42
- save_midi=True,
43
- save_mix=True,
44
- midi_path="output.mid",
45
- )
46
-
47
- return mix_path, midi_path
48
-
49
-
50
- block = gr.Blocks()
51
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
 
53
  with block:
54
  gr.HTML(
@@ -67,38 +96,77 @@ with block:
67
  </h1>
68
  </div>
69
  <p style="margin-bottom: 10px; font-size: 94%">
70
- A demo for Pop2Piano:Pop Audio-based Piano Cover Generation. Please select the composer and upload the pop audio to submit.
 
71
  </p>
72
  </div>
73
  """
74
  )
75
  with gr.Group():
76
- with gr.Box():
77
- with gr.Row().style(mobile_collapse=False, equal_height=True):
78
- file_up = gr.Audio(label="Upload an audio", type="filepath")
79
- composer = gr.Dropdown(label="Arranger", choices=composers, value="composer1")
80
- btn = gr.Button("Convert")
81
- with gr.Box():
82
- with gr.Row().style(mobile_collapse=False, equal_height=True):
83
- out = gr.Audio(label="Output")
84
- midi_out = gr.File(label="Download Midi")
85
- btn.click(inference, inputs=[file_up, composer], outputs=[out, midi_out])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  gr.Examples([
87
- ["./examples/BornThisWay.mp3", "composer1"],
88
- ["./examples/Sk8erBoi.mp3", "composer2"]
89
  ],
90
  fn=inference,
91
- inputs=[file_up, composer],
92
- outputs=[out, midi_out],
93
  cache_examples=True
94
  )
95
  gr.HTML(
96
  """
97
  <div class="footer">
98
- <p><a href="http://sweetcocoa.github.io/pop2piano_samples" style="text-decoration: underline;" target="_blank">Project Page</a>
 
 
 
 
 
 
 
 
 
 
99
  </p>
100
  </div>
101
  """
102
  )
103
 
104
- block.launch(debug=True)
 
1
+ import os
2
  import torch
3
+ import librosa
4
+ import binascii
5
+ import warnings
6
+ import midi2audio # to convert midi to wav
7
+ import numpy as np
8
+ import pytube as pt # to download the youtube videos as audios
9
  import gradio as gr
10
+ import soundfile as sf # to make the stereo mix
11
+ from transformers import Pop2PianoForConditionalGeneration, Pop2PianoProcessor
12
+
13
+
14
+ yt_video_dir = "./yt_dir"
15
+ outputs_dir = "./midi_wav_outputs"
16
+ os.makedirs(outputs_dir, exist_ok=True)
17
+ os.makedirs(yt_video_dir, exist_ok=True)
18
+
19
+ device = "cuda" if torch.cuda.is_available() else "cpu"
20
+ model = Pop2PianoForConditionalGeneration.from_pretrained("sweetcocoa/pop2piano").to(device)
21
+ processor = Pop2PianoProcessor.from_pretrained("sweetcocoa/pop2piano")
22
+ composers = model.generation_config.composer_to_feature_token.keys()
23
+
24
+
25
+ def get_audio_from_yt_video(yt_link):
26
+ try:
27
+ yt = pt.YouTube(yt_link)
28
+ t = yt.streams.filter(only_audio=True)
29
+ filename = os.path.join(yt_video_dir, binascii.hexlify(os.urandom(8)).decode() + ".mp4")
30
+ t[0].download(filename=filename)
31
+ except:
32
+ warnings.warn(f"Video Not Found at {yt_link}")
33
+ filename = None
34
+
35
+ return filename, filename
36
+
37
+ def inference(file_uploaded, composer):
38
+ # to save the native sampling rate of the file, sr=None is used, but this can cause some silent errors where the
39
+ # generated output will not be up to the desired quality. If that happens, please consider switching sr to 44100 Hz.
40
+ waveform, sr = librosa.load(file_uploaded, sr=None)
41
+
42
+ inputs = processor(audio=waveform, sampling_rate=sr, return_tensors="pt").to(device)
43
+ model_output = model.generate(input_features=inputs["input_features"], composer=composer)
44
+ tokenizer_output = processor.batch_decode(token_ids=model_output.to("cpu"), feature_extractor_output=inputs.to("cpu"))["pretty_midi_objects"]
45
+
46
+ return prepare_output_file(tokenizer_output, sr)
47
+
48
+ def prepare_output_file(tokenizer_output, sr):
49
+ # Add some random values so that no two file names are the same
50
+ output_file_name = "output_" + binascii.hexlify(os.urandom(8)).decode()
51
+ midi_output = os.path.join(outputs_dir, output_file_name + ".mid")
52
+
53
+ # write the .mid file
54
+ tokenizer_output[0].write(midi_output)
55
+
56
+ # convert .mid file to .wav using `midi2audio`
57
+ wav_output = midi_output.replace(".mid", ".wav")
58
+ midi2audio.FluidSynth().midi_to_audio(midi_output, wav_output)
59
+
60
+ return wav_output, wav_output, midi_output
61
+
62
+ def get_stereo(pop_path, midi, pop_scale=0.5):
63
+ pop_y, sr = librosa.load(pop_path, sr=None)
64
+ midi_y, _ = librosa.load(midi.name, sr=None)
65
+
66
+ if len(pop_y) > len(midi_y):
67
+ midi_y = np.pad(midi_y, (0, len(pop_y) - len(midi_y)))
68
+ elif len(pop_y) < len(midi_y):
69
+ pop_y = np.pad(pop_y, (0, -len(pop_y) + len(midi_y)))
70
+ stereo = np.stack((midi_y, pop_y * pop_scale))
71
+
72
+ stereo_mix_path = pop_path.replace("output", "output_stereo_mix")
73
+ sf.write(file=stereo_mix_path, data=stereo.T, samplerate=sr, format="wav",)
74
+
75
+ return stereo_mix_path, stereo_mix_path
76
+
77
+
78
+ # Thanks a lot to "https://huggingface.co/Taithrah" for this theme.
79
+ # taken from https://huggingface.co/spaces/NoCrypt/miku
80
+ block = gr.Blocks(theme="Taithrah/Minimal")
81
 
82
  with block:
83
  gr.HTML(
 
96
  </h1>
97
  </div>
98
  <p style="margin-bottom: 10px; font-size: 94%">
99
+ A demo for Pop2Piano:Pop Audio-based Piano Cover Generation.<br>
100
+ Please select the composer(Arranger) and upload the pop audio or enter the YouTube link and then click Generate.
101
  </p>
102
  </div>
103
  """
104
  )
105
  with gr.Group():
106
+ with gr.Row(equal_height=True):
107
+ with gr.Column():
108
+ file_uploaded = gr.Audio(label="Upload an audio", type="filepath")
109
+ with gr.Column():
110
+ with gr.Row():
111
+ yt_link = gr.Textbox(label="Enter YouTube Link of the Video", autofocus=True, lines=3)
112
+ yt_btn = gr.Button("Download Audio from YouTube Link", size="lg")
113
+
114
+ yt_audio_path = gr.Audio(label="Audio Extracted from the YouTube Video", interactive=False)
115
+ yt_btn.click(get_audio_from_yt_video, inputs=[yt_link], outputs=[yt_audio_path, file_uploaded])
116
+
117
+ with gr.Group():
118
+ with gr.Column():
119
+ composer = gr.Dropdown(label="Arranger", choices=composers, value="composer1")
120
+ generate_btn = gr.Button("Generate")
121
+
122
+ with gr.Row().style(mobile_collapse=False, equal_height=True):
123
+ wav_output2 = gr.File(label="Download the Generated MIDI (.wav)")
124
+ wav_output1 = gr.Audio(label="Listen to the Generated MIDI")
125
+ midi_output = gr.File(label="Download the Generated MIDI (.mid)")
126
+ generate_btn.click(inference,
127
+ inputs=[file_uploaded, composer],
128
+ outputs=[wav_output1, wav_output2, midi_output])
129
+
130
+ with gr.Group():
131
+ gr.HTML(
132
+ """
133
+ <div> <h3> <center> Get the Stereo Mix from the Pop Music and Generated MIDI </h3> </div>
134
+ """
135
+ )
136
+ pop_scale = gr.Slider(0, 1, value=0.5, label="Choose the ratio between Pop and MIDI", info="1.0 = Only Pop, 0.0=Only MIDI", interactive=True),
137
+ stereo_btn = gr.Button("Get Stereo Mix")
138
+ with gr.Row():
139
+ stereo_mix1 = gr.Audio(label="Listen to the Stereo Mix")
140
+ stereo_mix2 = gr.File(label="Download the Stereo Mix")
141
+
142
+ stereo_btn.click(get_stereo, inputs=[file_uploaded, wav_output2, pop_scale[0]], outputs=[stereo_mix1, stereo_mix2])
143
+
144
+ with gr.Group():
145
  gr.Examples([
146
+ ["./examples/custom_song.mp3", "composer1"],
 
147
  ],
148
  fn=inference,
149
+ inputs=[file_uploaded, composer],
150
+ outputs=[wav_output1, wav_output2, midi_output],
151
  cache_examples=True
152
  )
153
  gr.HTML(
154
  """
155
  <div class="footer">
156
+ <center>The design for this Space is taken from <a href="https://huggingface.co/spaces/NoCrypt/miku"> NoCrypt/miku </a>
157
+ </div>
158
+ """
159
+ )
160
+
161
+ gr.HTML(
162
+ """
163
+ <div class="footer">
164
+ <center><p><a href="http://sweetcocoa.github.io/pop2piano_samples" style="text-decoration: underline;" target="_blank">Project Page</a>
165
+ <center><a href="https://huggingface.co/docs/transformers/main/model_doc/pop2piano" style="text-decoration: underline;" target="_blank">HuggingFace Model Docs</a>
166
+ <center><a href="https://github.com/sweetcocoa/pop2piano" style="text-decoration: underline;" target="_blank">Github</a>
167
  </p>
168
  </div>
169
  """
170
  )
171
 
172
+ block.launch(debug=False)
config.yaml DELETED
@@ -1,61 +0,0 @@
1
- project: pop2piano
2
- dataset:
3
- target_length: 256
4
- input_length: 1024
5
- n_bars: 2
6
- sample_rate: 22050
7
- use_mel: true
8
- mel_is_conditioned: true
9
- composer_to_feature_token:
10
- composer1: 2052
11
- composer2: 2053
12
- composer3: 2054
13
- composer4: 2055
14
- composer5: 2056
15
- composer6: 2057
16
- composer7: 2058
17
- composer8: 2059
18
- composer9: 2060
19
- composer10: 2061
20
- composer11: 2062
21
- composer12: 2063
22
- composer13: 2064
23
- composer14: 2065
24
- composer15: 2066
25
- composer16: 2067
26
- composer17: 2068
27
- composer18: 2069
28
- composer19: 2070
29
- composer20: 2071
30
- composer21: 2072
31
- t5:
32
- feed_forward_proj: gated-gelu
33
- tie_word_embeddings: false
34
- tie_encoder_decoder: false
35
- vocab_size: 2400
36
- n_positions: 1024
37
- relative_attention_num_buckets: 32
38
- tokenizer:
39
- vocab_size:
40
- special: 4
41
- note: 128
42
- velocity: 2
43
- time: 100
44
- training:
45
- seed: 3407
46
- resume: false
47
- offline: false
48
- num_gpu: 1
49
- max_epochs: 5000
50
- accumulate_grad_batches: 1
51
- check_val_every_n_epoch: 20
52
- find_lr: false
53
- optimizer: adafactor
54
- version: none
55
- lr: 0.001
56
- lr_min: 1.0e-06
57
- lr_scheduler: false
58
- lr_decay: 0.99
59
- batch_size: 32
60
- num_workers: 32
61
- gradient_clip_val: 3.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
examples/BornThisWay.mp3 DELETED
Binary file (482 kB)
 
examples/Sk8erBoi.mp3 DELETED
Binary file (673 kB)
 
examples/custom_song.mp3 ADDED
Binary file (247 kB). View file
 
layer/__init__.py DELETED
File without changes
layer/input.py DELETED
@@ -1,46 +0,0 @@
1
- import torch
2
- import torch.nn as nn
3
- import torchaudio
4
-
5
-
6
- class LogMelSpectrogram(nn.Module):
7
- def __init__(self) -> None:
8
- super().__init__()
9
- self.melspectrogram = torchaudio.transforms.MelSpectrogram(
10
- sample_rate=22050,
11
- n_fft=4096,
12
- hop_length=1024,
13
- f_min=10.0,
14
- n_mels=512,
15
- )
16
-
17
- def forward(self, x):
18
- # x : audio(batch, sample)
19
- # X : melspec (batch, freq, frame)
20
- with torch.no_grad():
21
- with torch.cuda.amp.autocast(enabled=False):
22
- X = self.melspectrogram(x)
23
- X = X.clamp(min=1e-6).log()
24
-
25
- return X
26
-
27
-
28
- class ConcatEmbeddingToMel(nn.Module):
29
- def __init__(self, embedding_offset, n_vocab, n_dim) -> None:
30
- super().__init__()
31
- self.embedding = nn.Embedding(num_embeddings=n_vocab, embedding_dim=n_dim)
32
- self.embedding_offset = embedding_offset
33
-
34
- def forward(self, feature, index_value):
35
- """
36
- index_value : (batch, )
37
- feature : (batch, time, feature_dim)
38
- """
39
- index_shifted = index_value - self.embedding_offset
40
-
41
- # (batch, 1, feature_dim)
42
- composer_embedding = self.embedding(index_shifted).unsqueeze(1)
43
- # print(composer_embedding.shape, feature.shape)
44
- # (batch, 1 + time, feature_dim)
45
- inputs_embeds = torch.cat([composer_embedding, feature], dim=1)
46
- return inputs_embeds
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
midi_tokenizer.py DELETED
@@ -1,430 +0,0 @@
1
- import numpy as np
2
- from numba import jit
3
- import pretty_midi
4
- import scipy.interpolate as interp
5
-
6
- TOKEN_SPECIAL: int = 0
7
- TOKEN_NOTE: int = 1
8
- TOKEN_VELOCITY: int = 2
9
- TOKEN_TIME: int = 3
10
-
11
- DEFAULT_VELOCITY: int = 77
12
-
13
- TIE: int = 2
14
- EOS: int = 1
15
- PAD: int = 0
16
-
17
-
18
- def extrapolate_beat_times(beat_times, n_extend=1):
19
- beat_times_function = interp.interp1d(
20
- np.arange(beat_times.size),
21
- beat_times,
22
- bounds_error=False,
23
- fill_value="extrapolate",
24
- )
25
-
26
- ext_beats = beat_times_function(
27
- np.linspace(0, beat_times.size + n_extend - 1, beat_times.size + n_extend)
28
- )
29
-
30
- return ext_beats
31
-
32
-
33
- @jit(nopython=True, cache=True)
34
- def fast_tokenize(idx, token_type, n_special, n_note, n_velocity):
35
- if token_type == TOKEN_TIME:
36
- return n_special + n_note + n_velocity + idx
37
- elif token_type == TOKEN_VELOCITY:
38
- return n_special + n_note + idx
39
- elif token_type == TOKEN_NOTE:
40
- return n_special + idx
41
- elif token_type == TOKEN_SPECIAL:
42
- return idx
43
- else:
44
- return -1
45
-
46
-
47
- @jit(nopython=True, cache=True)
48
- def fast_detokenize(idx, n_special, n_note, n_velocity, time_idx_offset):
49
- if idx >= n_special + n_note + n_velocity:
50
- return (TOKEN_TIME, (idx - (n_special + n_note + n_velocity)) + time_idx_offset)
51
- elif idx >= n_special + n_note:
52
- return TOKEN_VELOCITY, idx - (n_special + n_note)
53
- elif idx >= n_special:
54
- return TOKEN_NOTE, idx - n_special
55
- else:
56
- return TOKEN_SPECIAL, idx
57
-
58
-
59
- class MidiTokenizer:
60
- def __init__(self, config) -> None:
61
- self.config = config
62
-
63
- def tokenize_note(self, idx, token_type):
64
- rt = fast_tokenize(
65
- idx,
66
- token_type,
67
- self.config.vocab_size.special,
68
- self.config.vocab_size.note,
69
- self.config.vocab_size.velocity,
70
- )
71
- if rt == -1:
72
- raise ValueError(f"type {type} is not a predefined token type.")
73
- else:
74
- return rt
75
-
76
- def notes_to_tokens(self, notes):
77
- """
78
- notes : (onset idx, offset idx, pitch, velocity)
79
- """
80
- max_time_idx = notes[:, :2].max()
81
-
82
- times = [[] for i in range((max_time_idx + 1))]
83
- for onset, offset, pitch, velocity in notes:
84
- times[onset].append([pitch, velocity])
85
- times[offset].append([pitch, 0])
86
-
87
- tokens = []
88
- current_velocity = 0
89
- for i, time in enumerate(times):
90
- if len(time) == 0:
91
- continue
92
- tokens.append(self.tokenize_note(i, TOKEN_TIME))
93
- for pitch, velocity in time:
94
- velocity = int(velocity > 0)
95
- if current_velocity != velocity:
96
- current_velocity = velocity
97
- tokens.append(self.tokenize_note(velocity, TOKEN_VELOCITY))
98
- tokens.append(self.tokenize_note(pitch, TOKEN_NOTE))
99
-
100
- return np.array(tokens, dtype=int)
101
-
102
- def detokenize(self, token, time_idx_offset):
103
- type, value = fast_detokenize(
104
- token,
105
- n_special=self.config.vocab_size.special,
106
- n_note=self.config.vocab_size.note,
107
- n_velocity=self.config.vocab_size.velocity,
108
- time_idx_offset=time_idx_offset,
109
- )
110
- if type != TOKEN_TIME:
111
- value = int(value)
112
- return [type, value]
113
-
114
- def to_string(self, tokens, time_idx_offset=0):
115
- nums = [
116
- self.detokenize(token, time_idx_offset=time_idx_offset) for token in tokens
117
- ]
118
- strings = []
119
- for i in range(len(nums)):
120
- type = nums[i][0]
121
- value = nums[i][1]
122
-
123
- if type == TOKEN_TIME:
124
- type = "time"
125
- elif type == TOKEN_SPECIAL:
126
- if value == EOS:
127
- value = "EOS"
128
- elif value == PAD:
129
- value = "PAD"
130
- elif value == TIE:
131
- value = "TIE"
132
- else:
133
- value = "Unknown Special"
134
- elif type == TOKEN_NOTE:
135
- type = "note"
136
- elif type == TOKEN_VELOCITY:
137
- type = "velocity"
138
- strings.append((type, value))
139
- return strings
140
-
141
- def split_notes(self, notes, beatsteps, time_from, time_to):
142
- """
143
- Assumptions
144
- - notes are sorted by onset time
145
- - beatsteps are sorted by time
146
- """
147
- start_idx = np.searchsorted(beatsteps, time_from)
148
- start_note = np.searchsorted(notes[:, 0], start_idx)
149
-
150
- end_idx = np.searchsorted(beatsteps, time_to)
151
- end_note = np.searchsorted(notes[:, 0], end_idx)
152
- splited_notes = notes[start_note:end_note]
153
-
154
- return splited_notes, (start_idx, end_idx, start_note, end_note)
155
-
156
- def notes_to_relative_tokens(
157
- self, notes, offset_idx, add_eos=False, add_composer=False, composer_value=None
158
- ):
159
- """
160
- notes : (onset idx, offset idx, pitch, velocity)
161
- """
162
-
163
- def _add_eos(tokens):
164
- tokens = np.concatenate((tokens, np.array([EOS], dtype=tokens.dtype)))
165
- return tokens
166
-
167
- def _add_composer(tokens, composer_value):
168
- tokens = np.concatenate(
169
- (np.array([composer_value], dtype=tokens.dtype), tokens)
170
- )
171
- return tokens
172
-
173
- if len(notes) == 0:
174
- tokens = np.array([], dtype=int)
175
- if add_eos:
176
- tokens = _add_eos(tokens)
177
- if add_composer:
178
- tokens = _add_composer(tokens, composer_value=composer_value)
179
- return tokens
180
-
181
- max_time_idx = notes[:, :2].max()
182
-
183
- # times[time_idx] = [[pitch, .. ], [pitch, 0], ..]
184
- times = [[] for i in range((max_time_idx + 1 - offset_idx))]
185
- for abs_onset, abs_offset, pitch, velocity in notes:
186
- rel_onset = abs_onset - offset_idx
187
- rel_offset = abs_offset - offset_idx
188
- times[rel_onset].append([pitch, velocity])
189
- times[rel_offset].append([pitch, 0])
190
-
191
- # From here on, everything is relative to time 0 (the offset)
192
- tokens = []
193
- current_velocity = 0
194
- current_time_idx = 0
195
-
196
- for rel_idx, time in enumerate(times):
197
- if len(time) == 0:
198
- continue
199
- time_idx_shift = rel_idx - current_time_idx
200
- current_time_idx = rel_idx
201
-
202
- tokens.append(self.tokenize_note(time_idx_shift, TOKEN_TIME))
203
- for pitch, velocity in time:
204
- velocity = int(velocity > 0)
205
- if current_velocity != velocity:
206
- current_velocity = velocity
207
- tokens.append(self.tokenize_note(velocity, TOKEN_VELOCITY))
208
- tokens.append(self.tokenize_note(pitch, TOKEN_NOTE))
209
-
210
- tokens = np.array(tokens, dtype=int)
211
- if add_eos:
212
- tokens = _add_eos(tokens)
213
- if add_composer:
214
- tokens = _add_composer(tokens, composer_value=composer_value)
215
- return tokens
216
-
217
- def relative_batch_tokens_to_midi(
218
- self,
219
- tokens,
220
- beatstep,
221
- beat_offset_idx=None,
222
- bars_per_batch=None,
223
- cutoff_time_idx=None,
224
- ):
225
- """
226
- tokens : (batch, sequence)
227
- beatstep : (times, )
228
- """
229
- beat_offset_idx = 0 if beat_offset_idx is None else beat_offset_idx
230
- notes = None
231
- bars_per_batch = 2 if bars_per_batch is None else bars_per_batch
232
-
233
- N = len(tokens)
234
- for n in range(N):
235
- _tokens = tokens[n]
236
- _start_idx = beat_offset_idx + n * bars_per_batch * 4
237
- _cutoff_time_idx = cutoff_time_idx + _start_idx
238
- _notes = self.relative_tokens_to_notes(
239
- _tokens,
240
- start_idx=_start_idx,
241
- cutoff_time_idx=_cutoff_time_idx,
242
- )
243
- # print(_notes, "\n-------")
244
- if len(_notes) == 0:
245
- pass
246
- # print("_notes zero")
247
- elif notes is None:
248
- notes = _notes
249
- else:
250
- notes = np.concatenate((notes, _notes), axis=0)
251
-
252
- if notes is None:
253
- notes = []
254
- midi = self.notes_to_midi(notes, beatstep, offset_sec=beatstep[beat_offset_idx])
255
- return midi, notes
256
-
257
- def relative_tokens_to_notes(self, tokens, start_idx, cutoff_time_idx=None):
258
- # TODO remove legacy
259
- # Case where the first decoded token is the arranger (composer) token
260
- if tokens[0] >= sum(self.config.vocab_size.values()):
261
- tokens = tokens[1:]
262
-
263
- words = [self.detokenize(token, time_idx_offset=0) for token in tokens]
264
-
265
- if hasattr(start_idx, "item"):
266
- """
267
- if numpy or torch tensor
268
- """
269
- start_idx = start_idx.item()
270
-
271
- current_idx = start_idx
272
- current_velocity = 0
273
- note_onsets_ready = [None for i in range(self.config.vocab_size.note + 1)]
274
- notes = []
275
- for type, number in words:
276
- if type == TOKEN_SPECIAL:
277
- if number == EOS:
278
- break
279
- elif type == TOKEN_TIME:
280
- current_idx += number
281
- if cutoff_time_idx is not None:
282
- current_idx = min(current_idx, cutoff_time_idx)
283
-
284
- elif type == TOKEN_VELOCITY:
285
- current_velocity = number
286
- elif type == TOKEN_NOTE:
287
- pitch = number
288
- if current_velocity == 0:
289
- # note_offset
290
- if note_onsets_ready[pitch] is None:
291
- # offset without onset
292
- pass
293
- else:
294
- onset_idx = note_onsets_ready[pitch]
295
- if onset_idx >= current_idx:
296
- # No time shift after previous note_on
297
- pass
298
- else:
299
- offset_idx = current_idx
300
- notes.append(
301
- [onset_idx, offset_idx, pitch, DEFAULT_VELOCITY]
302
- )
303
- note_onsets_ready[pitch] = None
304
- else:
305
- # note_on
306
- if note_onsets_ready[pitch] is None:
307
- note_onsets_ready[pitch] = current_idx
308
- else:
309
- # note-on already exists
310
- onset_idx = note_onsets_ready[pitch]
311
- if onset_idx >= current_idx:
312
- # No time shift after previous note_on
313
- pass
314
- else:
315
- offset_idx = current_idx
316
- notes.append(
317
- [onset_idx, offset_idx, pitch, DEFAULT_VELOCITY]
318
- )
319
- note_onsets_ready[pitch] = current_idx
320
- else:
321
- raise ValueError
322
-
323
- for pitch, note_on in enumerate(note_onsets_ready):
324
- # force offset if no offset for each pitch
325
- if note_on is not None:
326
- if cutoff_time_idx is None:
327
- cutoff = note_on + 1
328
- else:
329
- cutoff = max(cutoff_time_idx, note_on + 1)
330
-
331
- offset_idx = max(current_idx, cutoff)
332
- notes.append([note_on, offset_idx, pitch, DEFAULT_VELOCITY])
333
-
334
- if len(notes) == 0:
335
- return []
336
- else:
337
- notes = np.array(notes)
338
- note_order = notes[:, 0] * 128 + notes[:, 1]
339
- notes = notes[note_order.argsort()]
340
- return notes
341
-
342
- def notes_to_midi(self, notes, beatstep, offset_sec=None):
343
- new_pm = pretty_midi.PrettyMIDI(resolution=384, initial_tempo=120.0)
344
- new_inst = pretty_midi.Instrument(program=0)
345
- new_notes = []
346
- if offset_sec is None:
347
- offset_sec = 0.0
348
-
349
- for onset_idx, offset_idx, pitch, velocity in notes:
350
- new_note = pretty_midi.Note(
351
- velocity=velocity,
352
- pitch=pitch,
353
- start=beatstep[onset_idx] - offset_sec,
354
- end=beatstep[offset_idx] - offset_sec,
355
- )
356
- new_notes.append(new_note)
357
- new_inst.notes = new_notes
358
- new_pm.instruments.append(new_inst)
359
- new_pm.remove_invalid_notes()
360
- return new_pm
361
-
362
-
363
- @jit(nopython=True, cache=False)
364
- def fast_notes_to_relative_tokens(
365
- notes, offset_idx, max_time_idx, n_special, n_note, n_velocity
366
- ):
367
- """
368
- notes : (onset idx, offset idx, pitch, velocity)
369
- """
370
-
371
- times_p = [np.array([], dtype=int) for i in range((max_time_idx + 1 - offset_idx))]
372
- times_v = [np.array([], dtype=int) for i in range((max_time_idx + 1 - offset_idx))]
373
-
374
- for abs_onset, abs_offset, pitch, velocity in notes:
375
- rel_onset = abs_onset - offset_idx
376
- rel_offset = abs_offset - offset_idx
377
- times_p[rel_onset] = np.append(times_p[rel_onset], pitch)
378
- times_v[rel_onset] = np.append(times_v[rel_onset], velocity)
379
- times_p[rel_offset] = np.append(times_p[rel_offset], pitch)
380
- times_v[rel_offset] = np.append(times_v[rel_offset], velocity)
381
-
382
- # From here on, everything is relative to time 0 (the offset)
383
- tokens = []
384
- current_velocity = np.array([0])
385
- current_time_idx = np.array([0])
386
-
387
- # The range may be empty (length 0).
388
- for i in range(len(times_p)):
389
- rel_idx = i
390
- notes_at_time = times_p[i]
391
- if len(notes_at_time) == 0:
392
- continue
393
-
394
- time_idx_shift = rel_idx - current_time_idx[0]
395
- current_time_idx[0] = rel_idx
396
-
397
- token = fast_tokenize(
398
- time_idx_shift,
399
- TOKEN_TIME,
400
- n_special=n_special,
401
- n_note=n_note,
402
- n_velocity=n_velocity,
403
- )
404
- tokens.append(token)
405
-
406
- for j in range(len(notes_at_time)):
407
- pitch = times_p[j]
408
- velocity = times_v[j]
409
- # for pitch, velocity in time:
410
- velocity = int(velocity > 0)
411
- if current_velocity[0] != velocity:
412
- current_velocity[0] = velocity
413
- token = fast_tokenize(
414
- velocity,
415
- TOKEN_VELOCITY,
416
- n_special=n_special,
417
- n_note=n_note,
418
- n_velocity=n_velocity,
419
- )
420
- tokens.append(token)
421
- token = fast_tokenize(
422
- pitch,
423
- TOKEN_NOTE,
424
- n_special=n_special,
425
- n_note=n_note,
426
- n_velocity=n_velocity,
427
- )
428
- tokens.append(token)
429
-
430
- return np.array(tokens)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
packages.txt CHANGED
@@ -1 +1,2 @@
1
- fluidsynth
 
 
1
+ fluidsynth
2
+ ffmpeg
preprocess/README.md DELETED
@@ -1,36 +0,0 @@
1
- # Preprocess Scripts
2
- ---
3
- - Note : the order of these scripts is IMPORTANT.
4
- - The preprocessing steps themselves are easy, but setting up the environment is not. Please understand.
5
- - If you encounter any problems, please do not hesitate to email me or open an issue on GitHub.
6
-
7
- 1. Transcribe piano wavs to midi
8
- - You should transcribe {piano_cover_file.wav} -> {piano_cover_file.mid}
9
- - I recommend using the original code from this repo: [High-resolution Piano Transcription with Pedals by Regressing Onsets and Offsets Times](https://github.com/qiuqiangkong/piano_transcription_inference)
10
-
11
- - Alternatively, you can use my Docker script.
12
- ```bash
13
- docker run -it --gpus all --rm -v /DIRECTORY_THAT_CONTAINS_PIANO_WAV/:/input -v /DIRECTORY_THAT_MIDI_OUTPUT/:/output jonghochoi/piano_transcribe:bytedance1
14
- ```
15
- - If you are using an RTX 30XX GPU or newer, this script may not work properly, because the version of PyTorch is too low (1.4).
16
- - In that case, upgrade the version of PyTorch in the Docker image.
17
-
18
- 2. Estimate Pop's beats
19
- ```bash
20
- python bpm_quantize.py DATA_DIR
21
- ```
22
-
23
- 3. synchronize midi
24
- ```bash
25
- python pop_align.py DATA_DIR
26
- ```
27
-
28
- 4. get separated vocal track
29
- ```bash
30
- python split_spleeter.py DATA_DIR
31
- ```
32
-
33
- 5. caculate melody chroma accuracy
34
- ```bash
35
- python melody_accuracy.py DATA_DIR
36
- ```
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
preprocess/beat_quantizer.py DELETED
@@ -1,111 +0,0 @@
1
- import copy
2
- import librosa
3
- import essentia
4
- import essentia.standard
5
- import numpy as np
6
- import scipy.interpolate as interp
7
- import note_seq
8
-
9
- SAMPLERATE = 44100
10
-
11
-
12
- def nearest_onset_offset_digitize(on, off, bins):
13
- intermediate = (bins[1:] + bins[:-1]) / 2
14
- on_idx = np.digitize(on, intermediate)
15
- off_idx = np.digitize(off, intermediate)
16
- off_idx[on_idx == off_idx] += 1
17
- # off_idx = np.clip(off_idx, a_min=0, a_max=len(bins) - 1)
18
- return on_idx, off_idx
19
-
20
-
21
- def apply_sustain_pedal(pm):
22
- ns = note_seq.midi_to_note_sequence(pm)
23
- susns = note_seq.apply_sustain_control_changes(ns)
24
- suspm = note_seq.note_sequence_to_pretty_midi(susns)
25
- return suspm
26
-
27
-
28
- def interpolate_beat_times(beat_times, steps_per_beat, extend=False):
29
- beat_times_function = interp.interp1d(
30
- np.arange(beat_times.size),
31
- beat_times,
32
- bounds_error=False,
33
- fill_value="extrapolate",
34
- )
35
- if extend:
36
- beat_steps_8th = beat_times_function(
37
- np.linspace(0, beat_times.size, beat_times.size * steps_per_beat + 1)
38
- )
39
- else:
40
- beat_steps_8th = beat_times_function(
41
- np.linspace(0, beat_times.size - 1, beat_times.size * steps_per_beat - 1)
42
- )
43
- return beat_steps_8th
44
-
45
-
46
- def midi_quantize_by_beats(
47
- sample, beat_times, steps_per_beat, ignore_sustain_pedal=False
48
- ):
49
- ns = note_seq.midi_file_to_note_sequence(sample.midi)
50
- if ignore_sustain_pedal:
51
- susns = ns
52
- else:
53
- susns = note_seq.apply_sustain_control_changes(ns)
54
-
55
- qns = copy.deepcopy(susns)
56
-
57
- notes = np.array([[n.start_time, n.end_time] for n in susns.notes])
58
- note_attributes = np.array([[n.pitch, n.velocity] for n in susns.notes])
59
-
60
- note_ons = np.array(notes[:, 0])
61
- note_offs = np.array(notes[:, 1])
62
-
63
- beat_steps_8th = interpolate_beat_times(beat_times, steps_per_beat, extend=False)
64
-
65
- on_idx, off_idx = nearest_onset_offset_digitize(note_ons, note_offs, beat_steps_8th)
66
-
67
- beat_steps_8th = interpolate_beat_times(beat_times, steps_per_beat, extend=True)
68
-
69
- discrete_notes = np.concatenate(
70
- (np.stack((on_idx, off_idx), axis=1), note_attributes), axis=1
71
- )
72
-
73
- def delete_duplicate_notes(dnotes):
74
- note_order = dnotes[:, 0] * 128 + dnotes[:, 2]
75
- dnotes = dnotes[note_order.argsort()]
76
- indices = []
77
- for i in range(1, len(dnotes)):
78
- if dnotes[i, 0] == dnotes[i - 1, 0] and dnotes[i, 2] == dnotes[i - 1, 2]:
79
- indices.append(i)
80
- dnotes = np.delete(dnotes, indices, axis=0)
81
- note_order = dnotes[:, 0] * 128 + dnotes[:, 1]
82
- dnotes = dnotes[note_order.argsort()]
83
- return dnotes
84
-
85
- discrete_notes = delete_duplicate_notes(discrete_notes)
86
-
87
- digitized_note_ons, digitized_note_offs = (
88
- beat_steps_8th[on_idx],
89
- beat_steps_8th[off_idx],
90
- )
91
-
92
- for i, note in enumerate(qns.notes):
93
- note.start_time = digitized_note_ons[i]
94
- note.end_time = digitized_note_offs[i]
95
-
96
- return qns, discrete_notes, beat_steps_8th
97
-
98
-
99
- def extract_rhythm(song, y=None):
100
- if y is None:
101
- y, sr = librosa.load(song, sr=SAMPLERATE)
102
-
103
- essentia_tracker = essentia.standard.RhythmExtractor2013(method="multifeature")
104
- (
105
- bpm,
106
- beat_times,
107
- confidence,
108
- estimates,
109
- essentia_beat_intervals,
110
- ) = essentia_tracker(y)
111
- return bpm, beat_times, confidence, estimates, essentia_beat_intervals
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
preprocess/bpm_quantize.py DELETED
@@ -1,98 +0,0 @@
1
- import glob
2
- import sys
3
- import os
4
-
5
-
6
- import librosa
7
- import soundfile as sf
8
- import numpy as np
9
-
10
- import note_seq
11
- from omegaconf import OmegaConf
12
- from beat_quantizer import extract_rhythm, midi_quantize_by_beats
13
-
14
- sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
15
- from midiaudiopair import MidiAudioPair
16
- from utils.dsp import get_stereo
17
-
18
-
19
def estimate(meta_file, ignore_sustain_pedal):
    """Beat-track one song, quantize its paired MIDI to the beats and save everything.

    Args:
        meta_file: Path of the YAML metadata file describing the pair.
        ignore_sustain_pedal: Forwarded unchanged to ``midi_quantize_by_beats``.

    Side effects:
        Writes the quantized MIDI (``sample.qmidi``), a stereo FLAC mix
        (``sample.qmix``), four ``.npy`` arrays, and stores ``tempo.bpm`` /
        ``tempo.confidence`` back into ``meta_file``.
    """
    pair = MidiAudioPair(meta_file)

    # Pairs carrying one of these error codes are skipped without any output.
    skip_codes = (
        MidiAudioPair.NO_PIANO,
        MidiAudioPair.NO_SONG_DIR,
        MidiAudioPair.NO_SONG,
    )
    if pair.error_code in skip_codes:
        return

    bpm, beat_times, confidence, _estimates, beat_intervals = extract_rhythm(pair.song)
    beat_times = np.array(beat_times)
    beat_intervals = np.array(beat_intervals)

    quantized_ns, discrete_notes, beat_steps_8th = midi_quantize_by_beats(
        pair, beat_times, 2, ignore_sustain_pedal=ignore_sustain_pedal
    )

    # Export the quantized notes as MIDI with the control changes cleared.
    quantized_pm = note_seq.note_sequence_to_pretty_midi(quantized_ns)
    quantized_pm.instruments[0].control_changes = []
    quantized_pm.write(pair.qmidi)

    # Synthesize the MIDI at the song's native rate and store a song/piano mix.
    song_y, song_sr = librosa.load(pair.song, sr=None)
    piano_y = quantized_pm.fluidsynth(song_sr)
    mix = get_stereo(song_y, piano_y, 0.4)
    sf.write(file=pair.qmix, data=mix.T, samplerate=song_sr, format="flac")

    # Record the tempo estimate in the metadata file.
    meta = OmegaConf.load(meta_file)
    meta.tempo = OmegaConf.create()
    meta.tempo.bpm = bpm
    meta.tempo.confidence = confidence
    OmegaConf.save(meta, meta_file)

    outputs = (
        (pair.notes, discrete_notes),
        (pair.beatstep, beat_steps_8th),
        (pair.beattime, beat_times),
        (pair.beatinterval, beat_intervals),
    )
    for target, array in outputs:
        np.save(target, array)
55
-
56
-
57
def main(meta_files, ignore_sustain_pedal):
    """Run ``estimate`` over every metadata file in parallel.

    Args:
        meta_files: Iterable of YAML metadata paths.
        ignore_sustain_pedal: Forwarded to ``estimate`` for every file.
    """
    from tqdm import tqdm
    import multiprocessing
    from joblib import Parallel, delayed

    def files():
        # Yield the paths through a progress bar labelled with the current file.
        pbar = tqdm(meta_files)
        for meta_file in pbar:
            pbar.set_description(meta_file)
            yield meta_file

    # Use half of the cores, but never 0: joblib rejects n_jobs == 0, which
    # is what cpu_count() // 2 evaluates to on a single-core machine.
    n_jobs = max(1, multiprocessing.cpu_count() // 2)
    Parallel(n_jobs=n_jobs)(
        delayed(estimate)(meta_file, ignore_sustain_pedal) for meta_file in files()
    )
71
-
72
-
73
if __name__ == "__main__":
    # Command-line entry point: beat-quantize every pair described by the
    # ``*.yaml`` files found directly inside ``data_dir``.
    import argparse

    parser = argparse.ArgumentParser(description="bpm estimate using essentia")

    parser.add_argument(
        "data_dir",
        type=str,
        default=None,
        help="directory contains {id}/{pop_filename.wav}",
    )

    parser.add_argument(
        "--ignore_sustain_pedal",
        default=False,
        action="store_true",
        # The previous help text ("whether dry_run") was copied from another
        # script; this flag is forwarded to midi_quantize_by_beats.
        help="forward ignore_sustain_pedal=True to the MIDI beat quantizer",
    )

    args = parser.parse_args()

    # os.path.join also copes with a data_dir given with a trailing separator.
    meta_files = sorted(glob.glob(os.path.join(args.data_dir, "*.yaml")))
    print("meta ", len(meta_files))

    main(meta_files, args.ignore_sustain_pedal)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
preprocess/melody_accuracy.py DELETED
@@ -1,81 +0,0 @@
1
- import glob
2
- import sys
3
- import os
4
-
5
- import librosa
6
- import pretty_midi
7
-
8
- from omegaconf import OmegaConf
9
-
10
- sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
11
- from midiaudiopair import MidiAudioPair
12
- from evaluate import midi_melody_accuracy as ma
13
-
14
-
15
def estimate(meta_file):
    """Score the melody of one quantized MIDI against the song's vocal track.

    Args:
        meta_file: Path of the YAML metadata file describing the pair.

    Side effects:
        Stores ``eval.melody_chroma_accuracy`` and
        ``eval.melody_pitch_accuracy`` back into ``meta_file``. Nothing is
        written for pairs that are skipped.
    """
    import warnings

    # Silence all warnings raised while processing this file.
    warnings.filterwarnings(action="ignore")

    pair = MidiAudioPair(meta_file)

    skip_codes = (
        MidiAudioPair.NO_PIANO,
        MidiAudioPair.NO_SONG_DIR,
        MidiAudioPair.NO_SONG,
    )
    if pair.error_code in skip_codes:
        return

    if "vocals" in pair.invalids:
        print("no vocal:", meta_file)
        return

    quantized_midi = pretty_midi.PrettyMIDI(pair.qmidi)
    vocal_y, vocal_sr = librosa.load(pair.vocals, sr=44100)

    chroma_accuracy, pitch_accuracy = ma.evaluate_melody(
        quantized_midi, vocal_y, sr=vocal_sr, hop_length=1024
    )

    # .item() turns the returned scalars into plain Python numbers for YAML.
    meta = OmegaConf.load(meta_file)
    meta.eval = OmegaConf.create()
    meta.eval.melody_chroma_accuracy = chroma_accuracy.item()
    meta.eval.melody_pitch_accuracy = pitch_accuracy.item()
    OmegaConf.save(meta, meta_file)
45
-
46
-
47
def main(meta_files):
    """Run ``estimate`` over every metadata file in parallel.

    Args:
        meta_files: Iterable of YAML metadata paths.
    """
    from tqdm import tqdm
    import multiprocessing
    from joblib import Parallel, delayed

    def files():
        # Yield the paths through a progress bar labelled with the current file.
        pbar = tqdm(meta_files)
        for meta_file in pbar:
            pbar.set_description(meta_file)
            yield meta_file

    # Use half of the cores, but never 0: joblib rejects n_jobs == 0, which
    # is what cpu_count() // 2 evaluates to on a single-core machine.
    n_jobs = max(1, multiprocessing.cpu_count() // 2)
    Parallel(n_jobs=n_jobs)(delayed(estimate)(meta_file) for meta_file in files())
61
-
62
-
63
if __name__ == "__main__":
    # Command-line entry point: evaluate every pair described by a ``*.yaml``
    # file found anywhere below ``data_dir``.
    import argparse

    # The previous description ("bpm estimate using essentia") was copied
    # from the beat-quantization script.
    parser = argparse.ArgumentParser(
        description="melody accuracy of quantized midi against separated vocals"
    )

    parser.add_argument(
        "data_dir",
        type=str,
        default=None,
        help="directory contains {id}/{pop_filename.wav}",
    )

    args = parser.parse_args()

    meta_files = sorted(
        glob.glob(os.path.join(args.data_dir, "**", "*.yaml"), recursive=True)
    )
    print("meta ", len(meta_files))

    main(meta_files)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
preprocess/pop_align.py DELETED
@@ -1,331 +0,0 @@
1
- import librosa
2
- import soundfile as sf
3
- import glob
4
- import os
5
- import copy
6
- import sys
7
-
8
- import numpy as np
9
- import pyrubberband as pyrb
10
- import pretty_midi
11
- from omegaconf import OmegaConf
12
- from tqdm.auto import tqdm
13
-
14
- from synctoolbox.dtw.mrmsdtw import sync_via_mrmsdtw
15
- from synctoolbox.dtw.utils import (
16
- compute_optimal_chroma_shift,
17
- shift_chroma_vectors,
18
- make_path_strictly_monotonic,
19
- )
20
- from synctoolbox.feature.chroma import (
21
- pitch_to_chroma,
22
- quantize_chroma,
23
- quantized_chroma_to_CENS,
24
- )
25
- from synctoolbox.feature.dlnco import pitch_onset_features_to_DLNCO
26
- from synctoolbox.feature.pitch import audio_to_pitch_features
27
- from synctoolbox.feature.pitch_onset import audio_to_pitch_onset_features
28
- from synctoolbox.feature.utils import estimate_tuning
29
-
30
- sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
31
- print(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
32
- from utils.dsp import normalize, get_stereo
33
- from midiaudiopair import MidiAudioPair
34
-
35
# Sample rate used when loading the song and when synthesizing MIDI.
Fs = 22050
# Feature rate handed to the synctoolbox feature extractors and to the DTW.
feature_rate = 50
# ``step_weights`` argument of sync_via_mrmsdtw.
step_weights = np.array([1.5, 1.5, 2.0])
# ``threshold_rec`` argument of sync_via_mrmsdtw.
threshold_rec = 1_000_000
39
-
40
-
41
def save_delayed_song(
    sample,
    dry_run,
):
    """Align one song/MIDI pair and write the aligned audio, MIDI and metadata.

    Args:
        sample: Pair object providing ``original_song``, ``original_midi``,
            ``song``, ``midi``, ``yaml`` and ``yaml_path``.
        dry_run: When true, the audio and MIDI writes are replaced by a
            printed message.

    Notes:
        Failed writes are reported with ``print`` and do not abort the run.
        NOTE(review): the YAML metadata is saved even when ``dry_run`` is
        set - confirm that this is intended.
    """
    import warnings

    warnings.filterwarnings(action="ignore")

    # ``sr`` is keyword-only in recent librosa releases.
    song_audio, _ = librosa.load(sample.original_song, sr=Fs)
    midi_pm = pretty_midi.PrettyMIDI(sample.original_midi)

    if np.power(song_audio, 2).sum() < 1:  # low energy: invalid file
        print("invalid audio :", sample.original_song)
        sample.delete_files_myself()
        return

    rd = get_aligned_results(midi_pm=midi_pm, song_audio=song_audio)

    song_pitch_shifted = rd["song_pitch_shifted"]
    midi_warped_pm = rd["midi_warped_pm"]
    pitch_shift_for_song_audio = rd["pitch_shift_for_song_audio"]
    tuning_offset_song = rd["tuning_offset_song"]
    tuning_offset_piano = rd["tuning_offset_piano"]

    # ``except Exception`` keeps the best-effort behavior while letting
    # KeyboardInterrupt / SystemExit propagate.
    try:
        if dry_run:
            print("write audio files: ", sample.song)
        else:
            sf.write(
                file=sample.song,
                data=song_pitch_shifted,
                samplerate=Fs,
                format="wav",
            )
    except Exception:
        print("Fail : ", sample.song)

    try:
        if dry_run:
            print("write warped midi :", sample.midi)
        else:
            midi_warped_pm.write(sample.midi)
    except Exception:
        # Ad-hoc retry: reuse the tick scales of the original MIDI.
        midi_warped_pm._tick_scales = midi_pm._tick_scales
        try:
            if dry_run:
                print("write warped midi2 :", sample.midi)
            else:
                midi_warped_pm.write(sample.midi)
        except Exception:
            print("ad-hoc failed midi : ", sample.midi)
        print("ad-hoc midi : ", sample.midi)

    sample.yaml.song.pitch_shift = pitch_shift_for_song_audio.item()
    sample.yaml.song.tuning_offset = tuning_offset_song.item()
    sample.yaml.piano.tuning_offset = tuning_offset_piano.item()
    OmegaConf.save(sample.yaml, sample.yaml_path)
101
-
102
-
103
def get_aligned_results(midi_pm, song_audio):
    """Align a piano MIDI to a song with MrMsDTW and return the aligned material.

    Args:
        midi_pm: MIDI object offering ``fluidsynth``; it is deep-copied before
            its note times are warped.
        song_audio: Song waveform passed to ``normalize`` and the feature
            extractors at sample rate ``Fs``.

    Returns:
        dict with the keys ``mix_song``, ``song_pitch_shifted``,
        ``midi_warped_pm``, ``pitch_shift_for_song_audio``,
        ``tuning_offset_song`` and ``tuning_offset_piano``.
    """
    piano_audio = midi_pm.fluidsynth(Fs)
    song_audio = normalize(song_audio)

    # Why tuning is estimated:
    # https://www.audiolabs-erlangen.de/resources/MIR/FMP/C3/C3S1_TranspositionTuning.html
    song_tuning = estimate_tuning(song_audio, Fs)
    piano_tuning = estimate_tuning(piano_audio, Fs)

    # Quantized chroma (turned into CENS below, which MrMsDTW requires) plus
    # DLNCO onset features (Ewert, Mueller, Grosche, ICASSP 2009), which help
    # synchronization accuracy for music with clear onsets.
    song_chroma, song_dlnco = get_features_from_audio(song_audio, song_tuning)
    piano_chroma, piano_dlnco = get_features_from_audio(piano_audio, piano_tuning)

    def to_cens(chroma):
        return quantized_chroma_to_CENS(chroma, 201, 50, feature_rate)[0]

    # Different keys in the two signals degrade the alignment, so the piano
    # features are shifted by the best-matching chroma offset.
    opt_chroma_shift = compute_optimal_chroma_shift(
        to_cens(song_chroma), to_cens(piano_chroma)
    )
    piano_chroma = shift_chroma_vectors(piano_chroma, opt_chroma_shift)
    piano_dlnco = shift_chroma_vectors(piano_dlnco, opt_chroma_shift)

    warping_path = sync_via_mrmsdtw(
        f_chroma1=song_chroma,
        f_onset1=song_dlnco,
        f_chroma2=piano_chroma,
        f_onset2=piano_dlnco,
        input_feature_rate=feature_rate,
        step_weights=step_weights,
        threshold_rec=threshold_rec,
        verbose=False,
    )
    warping_path = make_path_strictly_monotonic(warping_path)

    # Fold the opposite shift into the range (-6, 6] semitones.
    semitones = -opt_chroma_shift % 12
    if semitones > 6:
        semitones = semitones - 12

    shifted_song = (
        pyrb.pitch_shift(song_audio, Fs, semitones) if semitones != 0 else song_audio
    )

    # Warping path in seconds: row 1 holds MIDI times, row 0 song times.
    path_seconds = warping_path / feature_rate
    warped_midi = simple_adjust_times(
        copy.deepcopy(midi_pm), path_seconds[1], path_seconds[0]
    )
    warped_piano_audio = warped_midi.fluidsynth(Fs)

    shifted_song = normalize(shifted_song)
    stereo_mix = get_stereo(shifted_song, warped_piano_audio)

    return {
        "mix_song": stereo_mix,
        "song_pitch_shifted": shifted_song,
        "midi_warped_pm": warped_midi,
        "pitch_shift_for_song_audio": semitones,
        "tuning_offset_song": song_tuning,
        "tuning_offset_piano": piano_tuning,
    }
177
-
178
-
179
def simple_adjust_times(pm, original_times, new_times):
    """Warp note and event times of ``pm`` from one time axis onto another.

    Most of this code comes from pretty_midi:
    https://github.com/craffel/pretty-midi/blob/main/pretty_midi/pretty_midi.py

    Args:
        pm: Object with ``instruments`` (each having ``notes``,
            ``pitch_bends`` and ``control_changes``) and
            ``remove_invalid_notes()``. It is modified in place.
        original_times: Reference times on the current time axis.
        new_times: Matching times on the target axis.

    Returns:
        The same ``pm`` object.
    """
    first_time, last_time = original_times[0], original_times[-1]

    # Keep (copies of) only the notes lying fully inside the mapped range.
    for inst in pm.instruments:
        kept = []
        for note in inst.notes:
            if note.start >= first_time and note.end <= last_time:
                kept.append(copy.deepcopy(note))
        inst.notes = kept

    all_notes = [note for inst in pm.instruments for note in inst.notes]
    warped_starts = np.interp(
        np.array([note.start for note in all_notes]), original_times, new_times
    )
    warped_ends = np.interp(
        np.array([note.end for note in all_notes]), original_times, new_times
    )
    for note, start, end in zip(all_notes, warped_starts, warped_ends):
        # Times that are not positive collapse to zero.
        note.start = (start > 0) * start
        note.end = (end > 0) * end

    # After warping, some notes may end on or before their start: drop them.
    pm.remove_invalid_notes()

    def adjust_events(event_getter):
        """Warp the events that ``event_getter(instrument)`` returns."""
        for inst in pm.instruments:
            event_getter(inst).sort(key=lambda e: e.time)

        all_events = [ev for inst in pm.instruments for ev in event_getter(inst)]
        warped = np.interp(
            np.array([ev.time for ev in all_events]), original_times, new_times
        )
        for ev, new_time in zip(all_events, warped):
            ev.time = new_time

        lower, upper = new_times[0], new_times[-1]
        for inst in pm.instruments:
            events = event_getter(inst)
            # Of the events sitting exactly on the first new time only the
            # final one survives ...
            kept = [ev for ev in events if ev.time == lower][-1:]
            # ... followed by everything strictly inside the new range.
            kept += [ev for ev in events if lower < ev.time < upper]
            events[:] = kept

    # Pitch bends and control changes are warped the same way.
    adjust_events(lambda i: i.pitch_bends)
    adjust_events(lambda i: i.control_changes)

    return pm
256
-
257
-
258
def get_features_from_audio(audio, tuning_offset, visualize=False):
    """Compute the quantized chroma and DLNCO features used for alignment.

    Args:
        audio: Waveform handed to the synctoolbox extractors at rate ``Fs``.
        tuning_offset: Tuning offset forwarded to both extractors.
        visualize: Forwarded as ``verbose`` / ``visualize`` to synctoolbox.

    Returns:
        ``(f_chroma_quantized, f_DLNCO)``.
    """
    pitch_features = audio_to_pitch_features(
        f_audio=audio,
        Fs=Fs,
        tuning_offset=tuning_offset,
        feature_rate=feature_rate,
        verbose=visualize,
    )
    chroma_quantized = quantize_chroma(f_chroma=pitch_to_chroma(f_pitch=pitch_features))

    onset_peaks = audio_to_pitch_onset_features(
        f_audio=audio, Fs=Fs, tuning_offset=tuning_offset, verbose=visualize
    )
    # The DLNCO sequence is given the same number of frames as the chroma.
    dlnco = pitch_onset_features_to_DLNCO(
        f_peaks=onset_peaks,
        feature_rate=feature_rate,
        feature_sequence_length=chroma_quantized.shape[1],
        visualize=visualize,
    )
    return chroma_quantized, dlnco
279
-
280
-
281
def main(samples, dry_run):
    """Run ``save_delayed_song`` over all samples in parallel.

    Args:
        samples: Iterable of pair objects to align.
        dry_run: Forwarded to ``save_delayed_song`` for every sample.
    """
    import multiprocessing
    from joblib import Parallel, delayed

    # Use half of the cores, but never 0: joblib rejects n_jobs == 0, which
    # is what cpu_count() // 2 evaluates to on a single-core machine.
    n_jobs = max(1, multiprocessing.cpu_count() // 2)
    Parallel(n_jobs=n_jobs)(
        delayed(save_delayed_song)(sample=sample, dry_run=dry_run)
        for sample in tqdm(samples)
    )
289
-
290
-
291
if __name__ == "__main__":
    # Command-line entry point: align every pair described by the ``*.yaml``
    # files found directly inside ``data_dir``.
    import argparse

    # The previous description ("piano cover downloader") did not match this
    # script, which downloads nothing.
    parser = argparse.ArgumentParser(
        description="align piano cover midi to the song audio"
    )

    parser.add_argument(
        "data_dir",
        type=str,
        default=None,
        help="directory contains {id}/{song_filename.wav}",
    )
    parser.add_argument(
        "--dry_run", default=False, action="store_true", help="whether dry_run"
    )

    args = parser.parse_args()

    def getfiles():
        """Collect the pairs that still have a song and drop a marker file for each."""
        meta_files = sorted(glob.glob(os.path.join(args.data_dir, "*.yaml")))
        print("meta ", len(meta_files))

        samples = []
        for meta_file in tqdm(meta_files):
            m = MidiAudioPair(meta_file, auto_remove_no_song=True)
            if m.error_code != MidiAudioPair.NO_SONG:
                # Marker file named after the (truncated) piano and song titles.
                aux_txt = os.path.join(
                    m.audio_dir,
                    m.yaml.piano.ytid,
                    f"{m.yaml.piano.title[:50]}___{m.yaml.song.title[:50]}.txt",
                )
                with open(aux_txt, "w", encoding="utf-8") as f:
                    f.write(".")
                samples.append(m)

        print(f"files available {len(samples)}")
        return samples

    samples = getfiles()
    main(samples=samples, dry_run=args.dry_run)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
preprocess/split_spleeter.py DELETED
@@ -1,72 +0,0 @@
1
- import glob
2
- import os
3
- import random
4
- import sys
5
-
6
- from tqdm.auto import tqdm
7
-
8
- sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
9
- from midiaudiopair import MidiAudioPair
10
-
11
-
12
def split_spleeter(meta_files):
    """Separate the vocals of every song with spleeter and save them as MP3.

    Args:
        meta_files: Iterable of YAML metadata paths. Pairs whose error code
            is ``NO_SONG`` or whose vocals file already exists are skipped.
    """
    # The audio adapter is used explicitly to load and save waveforms.
    from spleeter.audio.adapter import AudioAdapter
    from spleeter.separator import Separator
    import spleeter

    sample_rate = 44100
    audio_loader = AudioAdapter.default()

    # Embedded 2-stems configuration.
    separator = Separator("spleeter:2stems")

    for meta_file in tqdm(meta_files):
        pair = MidiAudioPair(meta_file)
        if pair.error_code == MidiAudioPair.NO_SONG or os.path.exists(pair.vocals):
            continue

        waveform, _ = audio_loader.load(pair.song, sample_rate=sample_rate)
        stems = separator.separate(waveform)

        # Only the first channel of the vocal stem is stored.
        audio_loader.save(
            path=pair.vocals,
            data=stems["vocals"][:, 0:1],
            codec=spleeter.audio.Codec.MP3,
            sample_rate=sample_rate,
        )
42
-
43
-
44
if __name__ == "__main__":
    # Command-line entry point: separate vocals for every pair described by
    # the ``*.yaml`` files found directly inside ``data_dir``.
    import argparse

    # The previous description ("bpm estimate using essentia") was copied
    # from the beat-quantization script.
    parser = argparse.ArgumentParser(description="vocal separation using spleeter")

    parser.add_argument(
        "data_dir",
        type=str,
        default=None,
        help="directory contains {id}/{pop_filename.wav}",
    )

    parser.add_argument(
        "--random_order",
        default=False,
        action="store_true",
        help="Random order process (to run multiple process)",
    )

    args = parser.parse_args()

    meta_files = sorted(glob.glob(os.path.join(args.data_dir, "*.yaml")))
    if args.random_order:
        random.shuffle(meta_files)

    print("meta ", len(meta_files))

    split_spleeter(meta_files)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -1,10 +1,11 @@
 
 
1
  pretty-midi==0.2.9
2
- omegaconf==2.1.1
3
- transformers==4.16.1
4
- pytorch-lightning==1.8.3
5
- essentia==2.1b6.dev1034
6
- note-seq==0.0.5
7
  pyFluidSynth==1.3.0
8
- --extra-index-url https://download.pytorch.org/whl/cpu
9
- torch==1.13.1
10
- torchaudio==0.13.1
 
 
 
 
1
+ torch
2
+ librosa
3
  pretty-midi==0.2.9
4
+ essentia==2.1b6.dev1034
 
 
 
 
5
  pyFluidSynth==1.3.0
6
+ git+https://github.com/huggingface/transformers
7
+ midi2audio
8
+ pytube
9
+ gradio
10
+ resampy
11
+ soundfile
transformer_wrapper.py DELETED
@@ -1,330 +0,0 @@
1
- import os
2
- import random
3
-
4
- import numpy as np
5
- import librosa
6
- import torch
7
-
8
- import pytorch_lightning as pl
9
- import soundfile as sf
10
- from torch.nn.utils.rnn import pad_sequence
11
- from transformers import T5Config, T5ForConditionalGeneration
12
-
13
- from midi_tokenizer import MidiTokenizer, extrapolate_beat_times
14
- from layer.input import LogMelSpectrogram, ConcatEmbeddingToMel
15
- from preprocess.beat_quantizer import extract_rhythm, interpolate_beat_times
16
- from utils.dsp import get_stereo
17
-
18
-
19
# Composer table used when the mel input is not composer-conditioned.
DEFAULT_COMPOSERS = {"various composer": 2052}


class TransformerWrapper(pl.LightningModule):
    """LightningModule bundling the T5 model, MIDI tokenizer and mel front end.

    The T5 configuration starts from ``t5-small`` and is overridden by every
    entry of ``config.t5``. Only the mel-spectrogram input path
    (``config.dataset.use_mel``) is implemented for inference.
    """

    def __init__(self, config):
        """Build the tokenizer, the T5 model and the optional mel modules.

        Args:
            config: Configuration object providing ``tokenizer``, ``t5``,
                ``dataset``, ``training`` and ``composer_to_feature_token``.
        """
        super().__init__()
        self.config = config

        self.tokenizer = MidiTokenizer(config.tokenizer)
        self.t5config = T5Config.from_pretrained("t5-small")

        for k, v in config.t5.items():
            setattr(self.t5config, k, v)

        self.transformer = T5ForConditionalGeneration(self.t5config)
        self.use_mel = self.config.dataset.use_mel
        self.mel_is_conditioned = self.config.dataset.mel_is_conditioned
        self.composer_to_feature_token = config.composer_to_feature_token

        if self.use_mel and not self.mel_is_conditioned:
            self.composer_to_feature_token = DEFAULT_COMPOSERS

        if self.use_mel:
            self.spectrogram = LogMelSpectrogram()
            if self.mel_is_conditioned:
                n_dim = 512
                composer_n_vocab = len(self.composer_to_feature_token)
                embedding_offset = min(self.composer_to_feature_token.values())
                self.mel_conditioner = ConcatEmbeddingToMel(
                    embedding_offset=embedding_offset,
                    n_vocab=composer_n_vocab,
                    n_dim=n_dim,
                )
        else:
            self.spectrogram = None

        self.lr = config.training.lr

    def forward(self, input_ids, labels):
        """
        Deprecated.
        """
        rt = self.transformer(input_ids=input_ids, labels=labels)
        return rt

    @torch.no_grad()
    def single_inference(
        self,
        feature_tokens=None,
        audio=None,
        beatstep=None,
        max_length=256,
        max_batch_size=64,
        n_bars=None,
        composer_value=None,
    ):
        """
        generate a long audio sequence

        feature_tokens or audio : shape (time, )

        beatstep : shape (time, )
            - the beatstep values that the input corresponds to
              (offset removed, i.e. beatstep[0] == 0)
            - beatstep[-1] : time at which the input ends
              (i.e. beatstep[-1] == len(y)//sr)

        Returns ``(relative_tokens, notes, pm)``: the generated token batch
        (right-padded with the pad token) and the ``notes`` / ``pm`` values
        produced by the tokenizer from it.
        """

        assert feature_tokens is not None or audio is not None
        assert beatstep is not None

        if feature_tokens is not None:
            assert len(feature_tokens.shape) == 1

        if audio is not None:
            assert len(audio.shape) == 1

        config = self.config
        PAD = self.t5config.pad_token_id
        n_bars = config.dataset.n_bars if n_bars is None else n_bars

        if beatstep[0] > 0.01:
            # The message was missing its f-prefix, so the placeholder was
            # printed literally instead of the offending value.
            print(
                f"inference warning : beatstep[0] is not 0 ({beatstep[0]}). all beatstep will be shifted."
            )
            beatstep = beatstep - beatstep[0]

        if self.use_mel:
            input_ids = None
            inputs_embeds, ext_beatstep = self.prepare_inference_mel(
                audio,
                beatstep,
                n_bars=n_bars,
                padding_value=PAD,
                composer_value=composer_value,
            )
            batch_size = inputs_embeds.shape[0]
        else:
            raise NotImplementedError

        # Considering GPU capacity, some sequence would not be generated at once.
        relative_tokens = list()
        for i in range(0, batch_size, max_batch_size):
            start = i
            end = min(batch_size, i + max_batch_size)

            if input_ids is None:
                _input_ids = None
                _inputs_embeds = inputs_embeds[start:end]
            else:
                _input_ids = input_ids[start:end]
                _inputs_embeds = None

            _relative_tokens = self.transformer.generate(
                input_ids=_input_ids,
                inputs_embeds=_inputs_embeds,
                max_length=max_length,
            )
            _relative_tokens = _relative_tokens.cpu().numpy()
            relative_tokens.append(_relative_tokens)

        # Chunks may differ in generated length: right-pad them to the
        # longest one before concatenating along the batch axis.
        longest = max(rt.shape[-1] for rt in relative_tokens)
        relative_tokens = np.concatenate(
            [
                np.pad(
                    rt,
                    [(0, 0), (0, longest - rt.shape[-1])],
                    constant_values=PAD,
                )
                for rt in relative_tokens
            ]
        )

        pm, notes = self.tokenizer.relative_batch_tokens_to_midi(
            relative_tokens,
            beatstep=ext_beatstep,
            bars_per_batch=n_bars,
            cutoff_time_idx=(n_bars + 1) * 4,
        )

        return relative_tokens, notes, pm

    def prepare_inference_mel(self, audio, beatstep, n_bars, padding_value, composer_value=None):
        """Cut ``audio`` into ``n_bars``-long segments and turn them into model inputs.

        Args:
            audio: 1-D tensor sliced per beat interval.
            beatstep: Beat step times in seconds, used to find the slice
                boundaries.
            n_bars: Number of bars per segment (``n_bars * 4`` beat steps).
            padding_value: Value used to pad the shorter audio segments.
            composer_value: Composer token, only used when the mel input is
                composer-conditioned.

        Returns:
            ``(inputs_embeds, ext_beatstep)`` - the stacked spectrogram
            inputs and the extrapolated beat steps.
        """
        n_steps = n_bars * 4
        n_target_step = len(beatstep)
        sample_rate = self.config.dataset.sample_rate
        ext_beatstep = extrapolate_beat_times(beatstep, (n_bars + 1) * 4 + 1)

        def split_audio(audio):
            # Split audio corresponding beat intervals.
            # Each audio's lengths are different.
            # Because each corresponding beat interval times are different.
            batch = []

            for i in range(0, n_target_step, n_steps):

                start_idx = i
                end_idx = min(i + n_steps, n_target_step)

                start_sample = int(ext_beatstep[start_idx] * sample_rate)
                end_sample = int(ext_beatstep[end_idx] * sample_rate)
                feature = audio[start_sample:end_sample]
                batch.append(feature)
            return batch

        def pad_and_stack_batch(batch):
            batch = pad_sequence(batch, batch_first=True, padding_value=padding_value)
            return batch

        batch = split_audio(audio)
        batch = pad_and_stack_batch(batch)

        inputs_embeds = self.spectrogram(batch).transpose(-1, -2)
        if self.mel_is_conditioned:
            composer_value = torch.tensor(composer_value).to(self.device)
            composer_value = composer_value.repeat(inputs_embeds.shape[0])
            inputs_embeds = self.mel_conditioner(inputs_embeds, composer_value)
        return inputs_embeds, ext_beatstep

    @torch.no_grad()
    def generate(
        self,
        audio_path=None,
        composer=None,
        model="generated",
        steps_per_beat=2,
        stereo_amp=0.5,
        n_bars=2,
        ignore_duplicate=True,
        show_plot=False,
        save_midi=False,
        save_mix=False,
        midi_path=None,
        mix_path=None,
        click_amp=0.2,
        add_click=False,
        max_batch_size=None,
        beatsteps=None,
        mix_sample_rate=None,
        audio_y=None,
        audio_sr=None,
    ):
        """Generate a piano MIDI for a song and optionally save MIDI and mix.

        Args:
            audio_path: Path of the song; used to load audio and to derive
                default output paths.
            composer: Key of ``composer_to_feature_token``; a random one is
                picked when ``None``.
            model: Tag inserted into the default output file names.
            steps_per_beat: Beat subdivision used when beats are extracted.
            stereo_amp: ``pop_scale`` passed to ``get_stereo`` for the mix.
            n_bars: Bars per generated segment.
            ignore_duplicate: When false and ``midi_path`` already exists,
                nothing is generated and ``None`` is returned.
            show_plot / save_midi / save_mix: Output switches.
            midi_path / mix_path: Explicit output paths.
            click_amp / add_click: Optional click track mixed into the song.
            max_batch_size: Segments per generation call
                (default ``64 // n_bars``).
            beatsteps: Precomputed beat steps; extracted from the audio when
                ``None``.
            mix_sample_rate: Sample rate of the mix (default dataset rate).
            audio_y / audio_sr: Already-decoded audio and its sample rate.

        Returns:
            ``(pm, composer, mix_path, midi_path)``.
        """
        config = self.config
        device = self.device

        composer_to_feature_token = self.composer_to_feature_token

        # Resolve the composer first so that the default output names carry
        # the composer actually used instead of "None".
        if composer is None:
            composer = random.sample(list(composer_to_feature_token.keys()), 1)[0]
        composer_value = composer_to_feature_token[composer]

        if audio_path is not None:
            # Swap only the final extension. str.replace() would rewrite every
            # occurrence of it and misbehaves on paths without an extension.
            stem = os.path.splitext(audio_path)[0]
            if mix_path is None:
                mix_path = f"{stem}.{model}.{composer}.wav"
            if midi_path is None:
                midi_path = f"{stem}.{model}.{composer}.mid"

        max_batch_size = 64 // n_bars if max_batch_size is None else max_batch_size
        mix_sample_rate = config.dataset.sample_rate if mix_sample_rate is None else mix_sample_rate

        if not ignore_duplicate:
            if midi_path is not None and os.path.exists(midi_path):
                return

        ESSENTIA_SAMPLERATE = 44100

        if beatsteps is None:
            y, sr = librosa.load(audio_path, sr=ESSENTIA_SAMPLERATE)
            (
                bpm,
                beat_times,
                confidence,
                estimates,
                essentia_beat_intervals,
            ) = extract_rhythm(audio_path, y=y)
            beat_times = np.array(beat_times)
            beatsteps = interpolate_beat_times(beat_times, steps_per_beat, extend=True)
        else:
            y = None

        if self.use_mel:
            target_sr = config.dataset.sample_rate
            if audio_y is not None:
                # Caller-supplied audio wins; bring it to the dataset rate.
                if audio_sr != target_sr:
                    audio_y = librosa.core.resample(
                        audio_y, orig_sr=audio_sr, target_sr=target_sr
                    )
                y = audio_y
                sr = target_sr
            elif y is None:
                # Beat steps were supplied, so nothing has been loaded yet.
                # Previously this case left ``y`` as None (and ``sr``
                # unbound) whenever the dataset rate equalled 44100.
                y, sr = librosa.load(audio_path, sr=target_sr)
            elif target_sr != ESSENTIA_SAMPLERATE:
                y = librosa.core.resample(
                    y,
                    orig_sr=ESSENTIA_SAMPLERATE,
                    target_sr=target_sr,
                )
                sr = target_sr

            # Only the audio between the first and the last beat step is used.
            start_sample = int(beatsteps[0] * sr)
            end_sample = int(beatsteps[-1] * sr)
            _audio = torch.from_numpy(y)[start_sample:end_sample].to(device)
            fzs = None
        else:
            raise NotImplementedError

        relative_tokens, notes, pm = self.single_inference(
            feature_tokens=fzs,
            audio=_audio,
            beatstep=beatsteps - beatsteps[0],
            max_length=config.dataset.target_length * max(1, (n_bars // config.dataset.n_bars)),
            max_batch_size=max_batch_size,
            n_bars=n_bars,
            composer_value=composer_value,
        )

        # Inference ran on beat steps shifted to start at 0; shift notes back.
        for n in pm.instruments[0].notes:
            n.start += beatsteps[0]
            n.end += beatsteps[0]

        if show_plot or save_mix:
            if mix_sample_rate != sr:
                y = librosa.core.resample(y, orig_sr=sr, target_sr=mix_sample_rate)
                sr = mix_sample_rate
            if add_click:
                clicks = librosa.clicks(times=beatsteps, sr=sr, length=len(y)) * click_amp
                y = y + clicks
            pm_y = pm.fluidsynth(sr)
            stereo = get_stereo(y, pm_y, pop_scale=stereo_amp)

        if show_plot:
            import note_seq

            note_seq.plot_sequence(note_seq.midi_to_note_sequence(pm))

        if save_mix:
            sf.write(
                file=mix_path,
                data=stereo.T,
                samplerate=sr,
                format="wav",
            )

        if save_midi:
            pm.write(midi_path)

        return pm, composer, mix_path, midi_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
utils/__init__.py DELETED
File without changes
utils/dsp.py DELETED
@@ -1,63 +0,0 @@
1
- import numpy as np
2
- from scipy.interpolate import interp1d
3
-
4
-
5
def normalize(audio, min_y=-1.0, max_y=1.0, eps=1e-8):
    """Linearly rescale a 1-D signal so its extrema land just inside a range.

    The smallest sample is mapped to ``min_y + eps`` and the largest to
    ``max_y - eps``; everything in between is scaled linearly.

    Parameters
    ----------
    audio : np.ndarray
        One-dimensional array of samples.
    min_y : float
        Lower bound of the target range.
    max_y : float
        Upper bound of the target range.
    eps : float
        Margin kept between the rescaled extrema and the bounds.

    Returns
    -------
    np.ndarray
        Rescaled signal with the same shape as ``audio``. An empty input is
        returned as an empty float array, and a constant input (no dynamic
        range) is mapped to the midpoint of the target range instead of
        producing NaNs from a 0/0 division.
    """
    assert len(audio.shape) == 1

    # Nothing to rescale; ``max()``/``min()`` would raise on an empty array.
    if audio.shape[0] == 0:
        return np.asarray(audio, dtype=float)

    max_y -= eps
    min_y += eps
    amax = audio.max()
    amin = audio.min()
    span = amax - amin

    # A flat signal has zero span: dividing by it would yield NaN everywhere.
    # Place it at the centre of the target range instead.
    if span == 0:
        is_float = np.issubdtype(audio.dtype, np.floating)
        out_dtype = audio.dtype if is_float else float
        return np.full(audio.shape, (max_y + min_y) / 2.0, dtype=out_dtype)

    audio = (max_y - min_y) * (audio - amin) / span + min_y
    return audio
13
-
14
-
15
def get_stereo(pop_y, midi_y, pop_scale=0.99):
    """Stack a synthesized MIDI track and the original audio as two channels.

    The shorter of the two signals is zero-padded at its end so both have the
    same length. Row 0 of the result is ``midi_y``; row 1 is ``pop_y``
    multiplied by ``pop_scale``.

    Parameters
    ----------
    pop_y : np.ndarray
        1-D samples of the original audio.
    midi_y : np.ndarray
        1-D samples of the synthesized MIDI.
    pop_scale : float
        Gain applied to the original-audio channel.

    Returns
    -------
    np.ndarray
        Array of shape ``(2, max(len(pop_y), len(midi_y)))``.
    """
    # Positive: the MIDI track is shorter; negative: the audio is shorter.
    shortfall = len(pop_y) - len(midi_y)
    if shortfall > 0:
        midi_y = np.pad(midi_y, (0, shortfall))
    elif shortfall < 0:
        pop_y = np.pad(pop_y, (0, -shortfall))

    channels = (midi_y, pop_y * pop_scale)
    return np.stack(channels)
22
-
23
-
24
def generate_variable_f0_sine_wave(f0, len_y, sr):
    """Synthesize a pure tone whose frequency follows the contour ``f0``.

    The frame-level contour is linearly interpolated to ``len_y`` samples,
    NaN values are replaced through ``np.nan_to_num``, and the per-sample
    phase increments are accumulated (integrated) before taking the sine.

    Parameters
    ----------
    f0 : array-like
        Instantaneous frequency values, one per frame.
    len_y : int
        Number of output samples.
    sr : int or float
        Sampling rate used to turn frequency into per-sample phase steps.

    Returns
    -------
    np.ndarray
        Sine wave of length ``len_y``.
    """
    frame_index = np.arange(len(f0))
    contour = interp1d(frame_index, f0, kind="linear")

    # Resample the contour onto ``len_y`` evenly spaced positions that span
    # the first to the last frame.
    sample_positions = np.linspace(0, len(f0) - 1, len_y)
    f0_per_sample = contour(sample_positions)

    # Per-sample phase increment in radians; the running sum is the phase.
    phase_step = np.nan_to_num(f0_per_sample) / sr * 2 * np.pi
    phase = phase_step.cumsum()
    return np.sin(phase)
33
-
34
-
35
def fluidsynth_without_normalize(self, fs=44100, sf2_path=None):
    """Synthesize using fluidsynth, leaving the mixed signal un-normalized.

    Parameters
    ----------
    fs : int
        Sampling rate to synthesize at.
    sf2_path : str
        Path to a .sf2 file.
        Default ``None``, which uses the TimGM6mb.sf2 file included with
        ``pretty_midi``.

    Returns
    -------
    synthesized : np.ndarray
        Waveform of the MIDI data, synthesized at ``fs``. Empty when there
        are no instruments or none of them has any notes.
    """
    instruments = self.instruments

    # Bail out early when there is nothing to render.
    has_notes = any(len(inst.notes) != 0 for inst in instruments)
    if len(instruments) == 0 or not has_notes:
        return np.array([])

    # Render every instrument separately.
    rendered = [inst.fluidsynth(fs=fs, sf2_path=sf2_path) for inst in instruments]

    # The mix is as long as the longest rendered instrument.
    total_samples = np.max([wave.shape[0] for wave in rendered])
    synthesized = np.zeros(total_samples)

    # Accumulate each instrument into the head of the mix buffer.
    for wave in rendered:
        synthesized[: wave.shape[0]] += wave

    # Deliberately no peak normalization here: the summed amplitude is
    # returned as-is.
    return synthesized