Major Advancement on Melody-Conditioned Generation.
Full length is now consistent. Fades between sections may need more work.
- app.py +8 -6
- audiocraft/data/audio_utils.py +59 -0
- audiocraft/models/musicgen.py +6 -6
- audiocraft/utils/extend.py +65 -6
app.py
CHANGED
@@ -15,7 +15,7 @@ import time
import warnings
from audiocraft.models import MusicGen
from audiocraft.data.audio import audio_write
-from audiocraft.data.audio_utils import apply_fade
+from audiocraft.data.audio_utils import apply_fade, apply_tafade
from audiocraft.utils.extend import generate_music_segments, add_settings_to_image, INTERRUPTING
import numpy as np
import random
@@ -165,12 +165,14 @@ def predict(model, text, melody, duration, dimension, topk, topp, temperature, c
overlap_samples = overlap * MODEL.sample_rate
#stack tracks and fade out/in
overlapping_output_fadeout = output[:, :, -overlap_samples:]
-overlapping_output_fadeout = apply_fade(overlapping_output_fadeout,sample_rate=MODEL.sample_rate,duration=overlap,out=True,start=True, curve_end=0.
+overlapping_output_fadeout = apply_fade(overlapping_output_fadeout,sample_rate=MODEL.sample_rate,duration=overlap,out=True,start=True, curve_end=0.0, current_device=MODEL.device)
+#overlapping_output_fadeout = apply_tafade(overlapping_output_fadeout,sample_rate=MODEL.sample_rate,duration=overlap,out=True,start=True,shape="exponential")

overlapping_output_fadein = output_segments[i][:, :, :overlap_samples]
-overlapping_output_fadein = apply_fade(overlapping_output_fadein,sample_rate=MODEL.sample_rate,duration=overlap,out=False,start=False, curve_start=0.
+overlapping_output_fadein = apply_fade(overlapping_output_fadein,sample_rate=MODEL.sample_rate,duration=overlap,out=False,start=False, curve_start=0.0, current_device=MODEL.device)
+#overlapping_output_fadein = apply_tafade(overlapping_output_fadein,sample_rate=MODEL.sample_rate,duration=overlap,out=False,start=False, shape="linear")

-overlapping_output = (overlapping_output_fadeout
+overlapping_output = torch.cat([overlapping_output_fadeout[:, :, :-(overlap_samples // 2)], overlapping_output_fadein],dim=2)
print(f" overlap size Fade:{overlapping_output.size()}\n output: {output.size()}\n segment: {output_segments[i].size()}")
##overlapping_output = torch.cat([output[:, :, -overlap_samples:], output_segments[i][:, :, :overlap_samples]], dim=1) #stack tracks
##print(f" overlap size stack:{overlapping_output.size()}\n output: {output.size()}\n segment: {output_segments[i].size()}")
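As a side note on the crossfade logic above: the snippet below is a minimal, self-contained sketch (not the Space's exact code) of what the overlap handling in predict() amounts to. Two dummy segments are faded out/in over the overlap region with linear ramps of the kind apply_fade builds via torch.linspace, then joined along the time axis; all shapes and the sample rate are made-up example values.

import torch

# Hypothetical sizes: batch of 1, stereo, 10 s segments at 32 kHz, 5 s overlap
sample_rate = 32000
overlap = 5
overlap_samples = overlap * sample_rate
prev = torch.randn(1, 2, 10 * sample_rate)   # previously accumulated output (B, C, T)
nxt = torch.randn(1, 2, 10 * sample_rate)    # next generated segment

# Linear ramps over the overlap region (apply_fade builds these with torch.linspace)
fade_out = torch.linspace(1.0, 0.0, overlap_samples)
fade_in = torch.linspace(0.0, 1.0, overlap_samples)

tail = prev[:, :, -overlap_samples:] * fade_out   # end of the previous track, fading to silence
head = nxt[:, :, :overlap_samples] * fade_in      # start of the next track, fading up from silence

# Join along the time axis; like the diff, trim half the overlap from the faded tail first
overlapping_output = torch.cat([tail[:, :, :-(overlap_samples // 2)], head], dim=2)
print(overlapping_output.shape)  # torch.Size([1, 2, overlap_samples + overlap_samples // 2])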
@@ -190,7 +192,7 @@ def predict(model, text, melody, duration, dimension, topk, topp, temperature, c
background = add_settings_to_image(title, video_description, background_path=background, font=settings_font, font_color=settings_font_color)
audio_write(
    file.name, output, MODEL.sample_rate, strategy="loudness",
-    loudness_headroom_db=
+    loudness_headroom_db=18, loudness_compressor=True, add_suffix=False, channels=2)
waveform_video = make_waveform(file.name,bg_image=background, bar_count=45)
if MOVE_TO_CPU:
    MODEL.to('cpu')
@@ -245,7 +247,7 @@ def ui(**kwargs):
model = gr.Radio(["melody", "medium", "small", "large"], label="Model", value="melody", interactive=True)
with gr.Row():
    duration = gr.Slider(minimum=1, maximum=720, value=10, label="Duration", interactive=True)
-   overlap = gr.Slider(minimum=1, maximum=
+   overlap = gr.Slider(minimum=1, maximum=15, value=5, step=1, label="Overlap", interactive=True)
    dimension = gr.Slider(minimum=-2, maximum=2, value=2, step=1, label="Dimension", info="determines which direction to add new segements of audio. (1 = stack tracks, 2 = lengthen, -2..0 = ?)", interactive=True)
with gr.Row():
    topk = gr.Number(label="Top-k", value=250, precision=0, interactive=True)
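For reference, the loudness-normalized write that predict() now performs can be exercised on its own. The call below is a usage sketch with a made-up tensor and file stem; strategy, loudness_headroom_db, loudness_compressor and add_suffix are regular audiocraft audio_write arguments, while the channels=2 argument added in this diff appears to come from this Space's patched copy of audio_write and is therefore omitted here.

import torch
from audiocraft.data.audio import audio_write

wav = torch.randn(2, 32000 * 10)  # hypothetical 10 s stereo clip at 32 kHz, shape (C, T)
# Same normalization settings the diff passes in predict(); writes ./segment.wav
audio_write("segment.wav", wav, 32000, strategy="loudness",
            loudness_headroom_db=18, loudness_compressor=True, add_suffix=False)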
audiocraft/data/audio_utils.py
CHANGED
@@ -173,7 +173,66 @@ def i16_pcm(wav: torch.Tensor) -> torch.Tensor:
    assert wav.dtype == torch.int16
    return wav

+def apply_tafade(audio: torch.Tensor, sample_rate, duration=3.0, out=True, start=True, shape: str = "linear") -> torch.Tensor:
+    """
+    Apply fade-in and/or fade-out effects to the audio tensor.
+
+    Args:
+        audio (torch.Tensor): The input audio tensor of shape (C, L).
+        sample_rate (int): The sample rate of the audio.
+        duration (float, optional): The duration of the fade in seconds. Defaults to 3.0.
+        out (bool, optional): Determines whether to apply fade-in (False) or fade-out (True) effect. Defaults to True.
+        start (bool, optional): Determines whether the fade is applied to the beginning (True) or end (False) of the audio. Defaults to True.
+        shape (str, optional): The shape of the fade. Must be one of: "quarter_sine", "half_sine", "linear", "logarithmic", "exponential". Defaults to "linear".
+
+    Returns:
+        torch.Tensor: The audio tensor with the fade effect applied.
+    """
+    fade_samples = int(sample_rate * duration)  # Number of samples for the fade duration
+
+    # Create the fade transform
+    fade_transform = torchaudio.transforms.Fade(fade_in_len=fade_samples, fade_out_len=fade_samples, fade_shape=shape)
+
+    if out:
+        fade_transform.fade_out_len = fade_samples
+        fade_transform.fade_out_shape = shape
+
+    # Select the portion of the audio to apply the fade
+    if start:
+        audio_fade_section = audio[:, :fade_samples]
+    else:
+        audio_fade_section = audio[:, -fade_samples:]
+
+    # Apply the fade transform to the audio section
+    audio_faded = fade_transform(audio)
+
+    # Replace the selected portion of the audio with the faded section
+    if start:
+        audio_faded[:, :fade_samples] = audio_fade_section
+    else:
+        audio_faded[:, -fade_samples:] = audio_fade_section
+
+    return audio_faded
+
def apply_fade(audio: torch.Tensor, sample_rate, duration=3.0, out=True, start=True, curve_start:float=0.0, curve_end:float=1.0, current_device:str="cpu") -> torch.Tensor:
+    """
+    Apply fade-in and/or fade-out effects to the audio tensor.
+
+    Args:
+        audio (torch.Tensor): The input audio tensor of shape (C, L).
+        sample_rate (int): The sample rate of the audio.
+        duration (float, optional): The duration of the fade in seconds. Defaults to 3.0.
+        out (bool, optional): Determines whether to apply fade-in (False) or fade-out (True) effect. Defaults to True.
+        start (bool, optional): Determines whether the fade is applied to the beginning (True) or end (False) of the audio. Defaults to True.
+        curve_start (float, optional): The starting amplitude of the fade curve. Defaults to 0.0.
+        curve_end (float, optional): The ending amplitude of the fade curve. Defaults to 1.0.
+        current_device (str, optional): The device on which the fade curve tensor should be created. Defaults to "cpu".
+
+    Returns:
+        torch.Tensor: The audio tensor with the fade effect applied.
+    """
    fade_samples = int(sample_rate * duration)  # Number of samples for the fade duration
    fade_curve = torch.linspace(curve_start, curve_end, fade_samples, device=current_device)  # Generate linear fade curve
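A quick way to sanity-check the new helpers is to run them on a dummy clip. This is only a usage sketch: the shapes are made up, and it assumes apply_tafade and apply_fade behave as their docstrings above describe (apply_tafade wraps torchaudio.transforms.Fade, apply_fade applies a hand-rolled linear ramp).

import torch
from audiocraft.data.audio_utils import apply_fade, apply_tafade

sr = 32000
clip = torch.randn(2, sr * 10)  # hypothetical 10 s stereo clip, shape (C, L)

# torchaudio-backed fade: request a fade-out over the last 3 s with an exponential curve
faded_ta = apply_tafade(clip, sample_rate=sr, duration=3.0, out=True, start=False, shape="exponential")

# linear-ramp fade: bring the first 3 s up from silence to full level
faded_lin = apply_fade(clip, sample_rate=sr, duration=3.0, out=False, start=True,
                       curve_start=0.0, curve_end=1.0, current_device="cpu")

print(faded_ta.shape, faded_lin.shape)  # both keep the input shape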
audiocraft/models/musicgen.py
CHANGED
@@ -207,8 +207,8 @@ class MusicGen:
convert_audio(wav, sample_rate, self.sample_rate, self.audio_channels)
if wav is not None else None
for wav in melody_wavs]
-attributes, prompt_tokens = self._prepare_tokens_and_attributes(descriptions=descriptions, prompt=None,
-
+#attributes, prompt_tokens = self._prepare_tokens_and_attributes(descriptions=descriptions, prompt=None,
+#                                                                melody_wavs=melody_wavs)

if prompt is not None:
    if prompt.dim() == 2:
@@ -219,11 +219,11 @@ class MusicGen:
if descriptions is None:
    descriptions = [None] * len(prompt)

-if prompt is not None:
-
+#if prompt is not None:
+#    attributes_gen, prompt_tokens = self._prepare_tokens_and_attributes(descriptions, prompt)

-
-
+attributes, prompt_tokens = self._prepare_tokens_and_attributes(descriptions=descriptions, prompt=prompt,
+                                                                melody_wavs=melody_wavs)
if prompt is not None:
    assert prompt_tokens is not None
else:
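The net effect of the musicgen.py change is that the audio prompt and the melody conditioning are now handed to _prepare_tokens_and_attributes together, so a previously generated chunk can seed the next one while still following the melody. Below is a hedged sketch of driving the Space's custom generate_with_all entry point; only the arguments visible in this diff are used, the model name and tensor shapes are example values, and upstream audiocraft exposes generate_with_chroma / generate_continuation rather than this combined method.

import torch
from audiocraft.models import MusicGen

model = MusicGen.get_pretrained("melody")      # model id as used by this Space's UI
model.set_generation_params(duration=10)

sr = model.sample_rate
melody = torch.randn(1, 1, sr * 10)            # hypothetical mono melody clip (B, C, T)
prev_tail = torch.randn(1, 1, sr * 5)          # tail of the previously generated segment

# generate_with_all is this repository's custom method (see the extend.py diff below);
# it conditions on the text, the melody, and the audio prompt at the same time.
segment = model.generate_with_all(
    descriptions=["upbeat synthwave"],
    melody_wavs=melody,
    sample_rate=sr,
    progress=False,
    prompt=prev_tail,
)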
audiocraft/utils/extend.py
CHANGED
@@ -1,3 +1,4 @@
+from tabnanny import verbose
import torch
import math
from audiocraft.models import MusicGen
@@ -54,9 +55,21 @@ def generate_music_segments(text, melody, seed, MODEL, duration:int=10, overlap:

# Calculate the total number of segments
total_segments = max(math.ceil(duration / segment_duration),1)
+#calculate duration loss from segment overlap
+duration_loss = max(total_segments - 1,0) * math.ceil(overlap / 2)
#calc excess duration
excess_duration = segment_duration - (total_segments * segment_duration - duration)
-print(f"total Segments to Generate: {total_segments} for {duration} seconds. Each segment is {segment_duration} seconds. Excess {excess_duration}")
+print(f"total Segments to Generate: {total_segments} for {duration} seconds. Each segment is {segment_duration} seconds. Excess {excess_duration} Overlap Loss {duration_loss}")
+duration += duration_loss
+while excess_duration + duration_loss > segment_duration:
+    total_segments += 1
+    #calculate duration loss from segment overlap
+    duration_loss = max(total_segments - 1,0) * math.ceil(overlap / 2)
+    #calc excess duration
+    excess_duration = segment_duration - (total_segments * segment_duration - duration)
+    print(f"total Segments to Generate: {total_segments} for {duration} seconds. Each segment is {segment_duration} seconds. Excess {excess_duration} Overlap Loss {duration_loss}")
+    if excess_duration + duration_loss > segment_duration:
+        duration += duration_loss

# If melody_segments is shorter than total_segments, repeat the segments until the total_segments is reached
if len(melody_segments) < total_segments:
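To see what the new overlap bookkeeping does, here is the same arithmetic run outside the function; the numbers (90 s requested, 30 s segments, 5 s overlap) are purely illustrative.

import math

duration = 90          # requested length in seconds (example value)
segment_duration = 30  # length the model generates per segment
overlap = 5            # seconds of crossfade between segments

total_segments = max(math.ceil(duration / segment_duration), 1)                       # 3
duration_loss = max(total_segments - 1, 0) * math.ceil(overlap / 2)                   # 2 * 3 = 6
excess_duration = segment_duration - (total_segments * segment_duration - duration)   # 30

# Pad the requested duration so the crossfaded result still reaches the full length,
# growing the segment count while the leftover would not fit into a single segment.
duration += duration_loss                                                              # 96
while excess_duration + duration_loss > segment_duration:
    total_segments += 1
    duration_loss = max(total_segments - 1, 0) * math.ceil(overlap / 2)
    excess_duration = segment_duration - (total_segments * segment_duration - duration)
    if excess_duration + duration_loss > segment_duration:
        duration += duration_loss

print(total_segments, duration, excess_duration, duration_loss)  # -> 4 96 6 9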
@@ -83,24 +96,70 @@ def generate_music_segments(text, melody, seed, MODEL, duration:int=10, overlap:
torch.manual_seed(seed)
for idx, verse in enumerate(melodys):
    if INTERRUPTING:
-        return output_segments, duration
+        return output_segments, duration
+
+    print(f'Segment duration: {segment_duration}, duration: {duration}, overlap: {overlap} Overlap Loss: {duration_loss}')
+    # Compensate for the length of final segment
+    if (idx + 1) == len(melodys):
+        print(f'Modify Last verse length, duration: {duration}, overlap: {overlap} Overlap Loss: {duration_loss}')
+        MODEL.set_generation_params(
+            use_sampling=True,
+            top_k=MODEL.generation_params["top_k"],
+            top_p=MODEL.generation_params["top_p"],
+            temperature=MODEL.generation_params["temp"],
+            cfg_coef=MODEL.generation_params["cfg_coef"],
+            duration=duration,
+            two_step_cfg=False,
+            rep_penalty=0.5
+        )
+        try:
+            # get last chunk
+            verse = verse[:, :, -duration*MODEL.sample_rate:]
+            prompt_segment = prompt_segment[:, :, -duration*MODEL.sample_rate:]
+        except:
+            # get first chunk
+            verse = verse[:, :, :duration*MODEL.sample_rate]
+            prompt_segment = prompt_segment[:, :, :duration*MODEL.sample_rate]
+
+    else:
+        MODEL.set_generation_params(
+            use_sampling=True,
+            top_k=MODEL.generation_params["top_k"],
+            top_p=MODEL.generation_params["top_p"],
+            temperature=MODEL.generation_params["temp"],
+            cfg_coef=MODEL.generation_params["cfg_coef"],
+            duration=segment_duration,
+            two_step_cfg=False,
+            rep_penalty=0.5
+        )
+
+    # Generate a new prompt segment based on the first verse. This will be applied to all segments for consistency
+    if idx == 0:
+        print(f"Generating New Prompt Segment: {text}\r")
+        prompt_segment = MODEL.generate_with_all(
+            descriptions=[text],
+            melody_wavs=verse,
+            sample_rate=sr,
+            progress=False,
+            prompt=None,
+        )

    print(f"Generating New Melody Segment {idx + 1}: {text}\r")
-    if output_segments:
-        # If this isn't the first segment, use the last chunk of the previous segment as the input
-        last_chunk = output_segments[-1][:, :, -overlap*MODEL.sample_rate:]
    output = MODEL.generate_with_all(
        descriptions=[text],
        melody_wavs=verse,
        sample_rate=sr,
        progress=False,
-        prompt=
+        prompt=prompt_segment,
    )

    # Append the generated output to the list of segments
    #output_segments.append(output[:, :segment_duration])
    output_segments.append(output)
    print(f"output_segments: {len(output_segments)}: shape: {output.shape} dim {output.dim()}")
+    #track duration
+    if duration > segment_duration:
+        duration -= segment_duration
return output_segments, excess_duration

def save_image(image):
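Putting the pieces together: generate_music_segments() returns a list of per-segment tensors, and app.py's predict() crossfades consecutive segments over the overlap region. The stitcher below is a simplified, hypothetical illustration of that assembly, not the Space's exact code; it sums the faded tail and head (a conventional crossfade), whereas the diff above concatenates the two faded pieces with a half-overlap trim, which may be related to the commit note that fades between sections still need work.

import torch

def stitch(segments, sample_rate, overlap):
    # Join a list of (B, C, T) segments with a linear crossfade of `overlap` seconds.
    overlap_samples = overlap * sample_rate
    fade_out = torch.linspace(1.0, 0.0, overlap_samples)
    fade_in = torch.linspace(0.0, 1.0, overlap_samples)
    out = segments[0]
    for seg in segments[1:]:
        tail = out[:, :, -overlap_samples:] * fade_out   # fade the accumulated track out
        head = seg[:, :, :overlap_samples] * fade_in     # fade the new segment in
        out = torch.cat([out[:, :, :-overlap_samples],   # untouched part of the track
                         tail + head,                    # mixed crossfade region
                         seg[:, :, overlap_samples:]],   # remainder of the new segment
                        dim=2)
    return out

sr, overlap = 32000, 5
segments = [torch.randn(1, 2, sr * 30) for _ in range(3)]  # three fake 30 s segments
print(stitch(segments, sr, overlap).shape)  # 80 s: 3 * 30 s minus two 5 s overlaps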