Surn committed
Commit b76a81b
1 Parent(s): 9766876

Major advancement on melody-conditioned generation.


Full output length is now consistent.
Fades between sections may need more work.
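For context, a conventional crossfade blends two segments by summing a faded-out tail with a faded-in head over the same window; this commit instead concatenates a trimmed fade-out with the fade-in (see the app.py diff below). A minimal sketch of the conventional blend, assuming (batch, channels, samples) tensors; the crossfade_join helper is illustrative and not part of this repo:

import torch

def crossfade_join(prev: torch.Tensor, nxt: torch.Tensor, overlap_samples: int) -> torch.Tensor:
    # Fade the tail of the previous segment down and the head of the next segment up.
    fade_out = torch.linspace(1.0, 0.0, overlap_samples, device=prev.device)
    fade_in = torch.linspace(0.0, 1.0, overlap_samples, device=nxt.device)
    tail = prev[:, :, -overlap_samples:] * fade_out
    head = nxt[:, :, :overlap_samples] * fade_in
    # Sum the faded regions so the seam occupies exactly overlap_samples.
    blended = tail + head
    return torch.cat([prev[:, :, :-overlap_samples], blended, nxt[:, :, overlap_samples:]], dim=2)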

app.py CHANGED
@@ -15,7 +15,7 @@ import time
 import warnings
 from audiocraft.models import MusicGen
 from audiocraft.data.audio import audio_write
-from audiocraft.data.audio_utils import apply_fade
+from audiocraft.data.audio_utils import apply_fade, apply_tafade
 from audiocraft.utils.extend import generate_music_segments, add_settings_to_image, INTERRUPTING
 import numpy as np
 import random
@@ -165,12 +165,14 @@ def predict(model, text, melody, duration, dimension, topk, topp, temperature, c
 overlap_samples = overlap * MODEL.sample_rate
 #stack tracks and fade out/in
 overlapping_output_fadeout = output[:, :, -overlap_samples:]
-overlapping_output_fadeout = apply_fade(overlapping_output_fadeout,sample_rate=MODEL.sample_rate,duration=overlap,out=True,start=True, curve_end=0.9, current_device=MODEL.device)
+overlapping_output_fadeout = apply_fade(overlapping_output_fadeout,sample_rate=MODEL.sample_rate,duration=overlap,out=True,start=True, curve_end=0.0, current_device=MODEL.device)
+#overlapping_output_fadeout = apply_tafade(overlapping_output_fadeout,sample_rate=MODEL.sample_rate,duration=overlap,out=True,start=True,shape="exponential")
 
 overlapping_output_fadein = output_segments[i][:, :, :overlap_samples]
-overlapping_output_fadein = apply_fade(overlapping_output_fadein,sample_rate=MODEL.sample_rate,duration=overlap,out=False,start=False, curve_start=0.1, current_device=MODEL.device)
+overlapping_output_fadein = apply_fade(overlapping_output_fadein,sample_rate=MODEL.sample_rate,duration=overlap,out=False,start=False, curve_start=0.0, current_device=MODEL.device)
+#overlapping_output_fadein = apply_tafade(overlapping_output_fadein,sample_rate=MODEL.sample_rate,duration=overlap,out=False,start=False, shape="linear")
 
-overlapping_output = (overlapping_output_fadeout + overlapping_output_fadein) / 2
+overlapping_output = torch.cat([overlapping_output_fadeout[:, :, :-(overlap_samples // 2)], overlapping_output_fadein],dim=2)
 print(f" overlap size Fade:{overlapping_output.size()}\n output: {output.size()}\n segment: {output_segments[i].size()}")
 ##overlapping_output = torch.cat([output[:, :, -overlap_samples:], output_segments[i][:, :, :overlap_samples]], dim=1) #stack tracks
 ##print(f" overlap size stack:{overlapping_output.size()}\n output: {output.size()}\n segment: {output_segments[i].size()}")
@@ -190,7 +192,7 @@ def predict(model, text, melody, duration, dimension, topk, topp, temperature, c
 background = add_settings_to_image(title, video_description, background_path=background, font=settings_font, font_color=settings_font_color)
 audio_write(
     file.name, output, MODEL.sample_rate, strategy="loudness",
-    loudness_headroom_db=19, loudness_compressor=True, add_suffix=False, channels=2)
+    loudness_headroom_db=18, loudness_compressor=True, add_suffix=False, channels=2)
 waveform_video = make_waveform(file.name,bg_image=background, bar_count=45)
 if MOVE_TO_CPU:
     MODEL.to('cpu')
@@ -245,7 +247,7 @@ def ui(**kwargs):
 model = gr.Radio(["melody", "medium", "small", "large"], label="Model", value="melody", interactive=True)
 with gr.Row():
     duration = gr.Slider(minimum=1, maximum=720, value=10, label="Duration", interactive=True)
-    overlap = gr.Slider(minimum=1, maximum=29, value=5, step=1, label="Overlap", interactive=True)
+    overlap = gr.Slider(minimum=1, maximum=15, value=5, step=1, label="Overlap", interactive=True)
     dimension = gr.Slider(minimum=-2, maximum=2, value=2, step=1, label="Dimension", info="determines which direction to add new segements of audio. (1 = stack tracks, 2 = lengthen, -2..0 = ?)", interactive=True)
 with gr.Row():
     topk = gr.Number(label="Top-k", value=250, precision=0, interactive=True)
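Tracing the new torch.cat splice with concrete numbers (sample rate and overlap are assumed values; MusicGen outputs 32 kHz audio) shows that the joined seam now spans 1.5x the overlap window rather than a blend inside it, which may be part of why the commit notes that fades between sections still need work:

# Illustrative length check for the new splice above (assumed values).
sample_rate = 32000                      # MusicGen sample rate
overlap = 5                              # seconds (slider default)
overlap_samples = overlap * sample_rate  # 160000 samples

fadeout_kept = overlap_samples - (overlap_samples // 2)  # 80000 samples kept from the faded-out tail
fadein_kept = overlap_samples                            # 160000 samples from the faded-in head
print(fadeout_kept + fadein_kept)                        # 240000 = 1.5 * overlap_samples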
audiocraft/data/audio_utils.py CHANGED
@@ -173,7 +173,66 @@ def i16_pcm(wav: torch.Tensor) -> torch.Tensor:
     assert wav.dtype == torch.int16
     return wav
 
+def apply_tafade(audio: torch.Tensor, sample_rate, duration=3.0, out=True, start=True, shape: str = "linear") -> torch.Tensor:
+    """
+    Apply fade-in and/or fade-out effects to the audio tensor.
+
+    Args:
+        audio (torch.Tensor): The input audio tensor of shape (C, L).
+        sample_rate (int): The sample rate of the audio.
+        duration (float, optional): The duration of the fade in seconds. Defaults to 3.0.
+        out (bool, optional): Determines whether to apply fade-in (False) or fade-out (True) effect. Defaults to True.
+        start (bool, optional): Determines whether the fade is applied to the beginning (True) or end (False) of the audio. Defaults to True.
+        shape (str, optional): The shape of the fade. Must be one of: "quarter_sine", "half_sine", "linear", "logarithmic", "exponential". Defaults to "linear".
+
+    Returns:
+        torch.Tensor: The audio tensor with the fade effect applied.
+
+    """
+    fade_samples = int(sample_rate * duration)  # Number of samples for the fade duration
+
+    # Create the fade transform
+    fade_transform = torchaudio.transforms.Fade(fade_in_len=fade_samples, fade_out_len=fade_samples, fade_shape=shape)
+
+    if out:
+        fade_transform.fade_out_len = fade_samples
+        fade_transform.fade_out_shape = shape
+
+    # Select the portion of the audio to apply the fade
+    if start:
+        audio_fade_section = audio[:, :fade_samples]
+    else:
+        audio_fade_section = audio[:, -fade_samples:]
+
+    # Apply the fade transform to the audio section
+    audio_faded = fade_transform(audio)
+
+    # Replace the selected portion of the audio with the faded section
+    if start:
+        audio_faded[:, :fade_samples] = audio_fade_section
+    else:
+        audio_faded[:, -fade_samples:] = audio_fade_section
+
+    return audio_faded
+
 def apply_fade(audio: torch.Tensor, sample_rate, duration=3.0, out=True, start=True, curve_start:float=0.0, curve_end:float=1.0, current_device:str="cpu") -> torch.Tensor:
+    """
+    Apply fade-in and/or fade-out effects to the audio tensor.
+
+    Args:
+        audio (torch.Tensor): The input audio tensor of shape (C, L).
+        sample_rate (int): The sample rate of the audio.
+        duration (float, optional): The duration of the fade in seconds. Defaults to 3.0.
+        out (bool, optional): Determines whether to apply fade-in (False) or fade-out (True) effect. Defaults to True.
+        start (bool, optional): Determines whether the fade is applied to the beginning (True) or end (False) of the audio. Defaults to True.
+        curve_start (float, optional): The starting amplitude of the fade curve. Defaults to 0.0.
+        curve_end (float, optional): The ending amplitude of the fade curve. Defaults to 1.0.
+        current_device (str, optional): The device on which the fade curve tensor should be created. Defaults to "cpu".
+
+    Returns:
+        torch.Tensor: The audio tensor with the fade effect applied.
+
+    """
     fade_samples = int(sample_rate * duration) # Number of samples for the fade duration
     fade_curve = torch.linspace(curve_start, curve_end, fade_samples, device=current_device) # Generate linear fade curve
 
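The new apply_tafade helper wraps torchaudio.transforms.Fade, which fades the first fade_in_len and last fade_out_len samples of whatever tensor it is given. A minimal standalone usage sketch of that transform for reference (values are illustrative, not taken from the repo):

import torch
import torchaudio

sample_rate = 32000
waveform = torch.randn(2, sample_rate * 10)   # (channels, samples): 10 s of noise
fade_len = int(sample_rate * 3)               # 3 s fade

# Fade-out only: ease the last fade_len samples down to silence.
fade = torchaudio.transforms.Fade(fade_in_len=0, fade_out_len=fade_len, fade_shape="exponential")
faded = fade(waveform)                        # same shape as the input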
audiocraft/models/musicgen.py CHANGED
@@ -207,8 +207,8 @@ class MusicGen:
 convert_audio(wav, sample_rate, self.sample_rate, self.audio_channels)
 if wav is not None else None
 for wav in melody_wavs]
-attributes, prompt_tokens = self._prepare_tokens_and_attributes(descriptions=descriptions, prompt=None,
-melody_wavs=melody_wavs)
+#attributes, prompt_tokens = self._prepare_tokens_and_attributes(descriptions=descriptions, prompt=None,
+# melody_wavs=melody_wavs)
 
 if prompt is not None:
     if prompt.dim() == 2:
@@ -219,11 +219,11 @@
 if descriptions is None:
     descriptions = [None] * len(prompt)
 
-if prompt is not None:
-    attributes_gen, prompt_tokens = self._prepare_tokens_and_attributes(descriptions, prompt)
+#if prompt is not None:
+# attributes_gen, prompt_tokens = self._prepare_tokens_and_attributes(descriptions, prompt)
 
-#attributes, prompt_tokens = self._prepare_tokens_and_attributes(descriptions=descriptions, prompt=prompt,
-# melody_wavs=melody_wavs)
+attributes, prompt_tokens = self._prepare_tokens_and_attributes(descriptions=descriptions, prompt=prompt,
+melody_wavs=melody_wavs)
 if prompt is not None:
     assert prompt_tokens is not None
 else:
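The net effect is that _prepare_tokens_and_attributes is now called once with the text descriptions, the audio prompt, and the melody wavs together, so a continuation can be conditioned on all three at once. For reference, this is the call shape extend.py (next file) uses to drive it through the fork's generate_with_all helper; the variables are those in that function, shown as an excerpt rather than standalone code:

output = MODEL.generate_with_all(
    descriptions=[text],      # text conditioning
    melody_wavs=verse,        # melody conditioning for this segment
    sample_rate=sr,
    progress=False,
    prompt=prompt_segment,    # audio prompt carried across segments for consistency
)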
audiocraft/utils/extend.py CHANGED
@@ -1,3 +1,4 @@
+from tabnanny import verbose
 import torch
 import math
 from audiocraft.models import MusicGen
@@ -54,9 +55,21 @@ def generate_music_segments(text, melody, seed, MODEL, duration:int=10, overlap:
 
 # Calculate the total number of segments
 total_segments = max(math.ceil(duration / segment_duration),1)
+#calculate duration loss from segment overlap
+duration_loss = max(total_segments - 1,0) * math.ceil(overlap / 2)
 #calc excess duration
 excess_duration = segment_duration - (total_segments * segment_duration - duration)
-print(f"total Segments to Generate: {total_segments} for {duration} seconds. Each segment is {segment_duration} seconds. Excess {excess_duration}")
+print(f"total Segments to Generate: {total_segments} for {duration} seconds. Each segment is {segment_duration} seconds. Excess {excess_duration} Overlap Loss {duration_loss}")
+duration += duration_loss
+while excess_duration + duration_loss > segment_duration:
+    total_segments += 1
+    #calculate duration loss from segment overlap
+    duration_loss = max(total_segments - 1,0) * math.ceil(overlap / 2)
+    #calc excess duration
+    excess_duration = segment_duration - (total_segments * segment_duration - duration)
+    print(f"total Segments to Generate: {total_segments} for {duration} seconds. Each segment is {segment_duration} seconds. Excess {excess_duration} Overlap Loss {duration_loss}")
+    if excess_duration + duration_loss > segment_duration:
+        duration += duration_loss
 
 # If melody_segments is shorter than total_segments, repeat the segments until the total_segments is reached
 if len(melody_segments) < total_segments:
@@ -83,24 +96,70 @@ def generate_music_segments(text, melody, seed, MODEL, duration:int=10, overlap:
 torch.manual_seed(seed)
 for idx, verse in enumerate(melodys):
     if INTERRUPTING:
-        return output_segments, duration - (segment_duration * len(output_segments))
+        return output_segments, duration
+
+    print(f'Segment duration: {segment_duration}, duration: {duration}, overlap: {overlap} Overlap Loss: {duration_loss}')
+    # Compensate for the length of final segment
+    if (idx + 1) == len(melodys):
+        print(f'Modify Last verse length, duration: {duration}, overlap: {overlap} Overlap Loss: {duration_loss}')
+        MODEL.set_generation_params(
+            use_sampling=True,
+            top_k=MODEL.generation_params["top_k"],
+            top_p=MODEL.generation_params["top_p"],
+            temperature=MODEL.generation_params["temp"],
+            cfg_coef=MODEL.generation_params["cfg_coef"],
+            duration=duration,
+            two_step_cfg=False,
+            rep_penalty=0.5
+        )
+        try:
+            # get last chunk
+            verse = verse[:, :, -duration*MODEL.sample_rate:]
+            prompt_segment = prompt_segment[:, :, -duration*MODEL.sample_rate:]
+        except:
+            # get first chunk
+            verse = verse[:, :, :duration*MODEL.sample_rate]
+            prompt_segment = prompt_segment[:, :, :duration*MODEL.sample_rate]
+
+    else:
+        MODEL.set_generation_params(
+            use_sampling=True,
+            top_k=MODEL.generation_params["top_k"],
+            top_p=MODEL.generation_params["top_p"],
+            temperature=MODEL.generation_params["temp"],
+            cfg_coef=MODEL.generation_params["cfg_coef"],
+            duration=segment_duration,
+            two_step_cfg=False,
+            rep_penalty=0.5
+        )
+
+    # Generate a new prompt segment based on the first verse. This will be applied to all segments for consistency
+    if idx == 0:
+        print(f"Generating New Prompt Segment: {text}\r")
+        prompt_segment = MODEL.generate_with_all(
+            descriptions=[text],
+            melody_wavs=verse,
+            sample_rate=sr,
+            progress=False,
+            prompt=None,
+        )
 
     print(f"Generating New Melody Segment {idx + 1}: {text}\r")
-    if output_segments:
-        # If this isn't the first segment, use the last chunk of the previous segment as the input
-        last_chunk = output_segments[-1][:, :, -overlap*MODEL.sample_rate:]
     output = MODEL.generate_with_all(
        descriptions=[text],
        melody_wavs=verse,
        sample_rate=sr,
        progress=False,
-       prompt=last_chunk if len(last_chunk) > 0 else None,
+       prompt=prompt_segment,
     )
 
     # Append the generated output to the list of segments
     #output_segments.append(output[:, :segment_duration])
     output_segments.append(output)
     print(f"output_segments: {len(output_segments)}: shape: {output.shape} dim {output.dim()}")
+    #track duration
+    if duration > segment_duration:
+        duration -= segment_duration
 return output_segments, excess_duration
 
 def save_image(image):
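The new duration bookkeeping adds roughly ceil(overlap / 2) seconds per seam back onto the requested duration so the trimmed crossfades do not shorten the final render. A worked example of the arithmetic with assumed values:

import math

duration = 65          # seconds requested (assumed value)
segment_duration = 30  # seconds per generated segment (assumed value)
overlap = 5            # seconds of crossfade between segments (assumed value)

total_segments = max(math.ceil(duration / segment_duration), 1)                       # 3
duration_loss = max(total_segments - 1, 0) * math.ceil(overlap / 2)                   # 2 seams * 3 s = 6
excess_duration = segment_duration - (total_segments * segment_duration - duration)   # 30 - 25 = 5
duration += duration_loss                                                             # 71 s generated in total

# generate_music_segments keeps looping (adding segments) until
# excess_duration + duration_loss fits within a single segment.
print(total_segments, duration_loss, excess_duration, duration)                       # 3 6 5 71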