Major Advancement on Melody-Conditioned Generation.
Full length is now consistent. Fades between sections may need more work.
- app.py +8 -6
- audiocraft/data/audio_utils.py +59 -0
- audiocraft/models/musicgen.py +6 -6
- audiocraft/utils/extend.py +65 -6
app.py
CHANGED
@@ -15,7 +15,7 @@ import time
import warnings
from audiocraft.models import MusicGen
from audiocraft.data.audio import audio_write
-from audiocraft.data.audio_utils import apply_fade
+from audiocraft.data.audio_utils import apply_fade, apply_tafade
from audiocraft.utils.extend import generate_music_segments, add_settings_to_image, INTERRUPTING
import numpy as np
import random
@@ -165,12 +165,14 @@ def predict(model, text, melody, duration, dimension, topk, topp, temperature, c
overlap_samples = overlap * MODEL.sample_rate
#stack tracks and fade out/in
overlapping_output_fadeout = output[:, :, -overlap_samples:]
-overlapping_output_fadeout = apply_fade(overlapping_output_fadeout,sample_rate=MODEL.sample_rate,duration=overlap,out=True,start=True, curve_end=0.
+overlapping_output_fadeout = apply_fade(overlapping_output_fadeout,sample_rate=MODEL.sample_rate,duration=overlap,out=True,start=True, curve_end=0.0, current_device=MODEL.device)
+#overlapping_output_fadeout = apply_tafade(overlapping_output_fadeout,sample_rate=MODEL.sample_rate,duration=overlap,out=True,start=True,shape="exponential")

overlapping_output_fadein = output_segments[i][:, :, :overlap_samples]
-overlapping_output_fadein = apply_fade(overlapping_output_fadein,sample_rate=MODEL.sample_rate,duration=overlap,out=False,start=False, curve_start=0.
+overlapping_output_fadein = apply_fade(overlapping_output_fadein,sample_rate=MODEL.sample_rate,duration=overlap,out=False,start=False, curve_start=0.0, current_device=MODEL.device)
+#overlapping_output_fadein = apply_tafade(overlapping_output_fadein,sample_rate=MODEL.sample_rate,duration=overlap,out=False,start=False, shape="linear")

-overlapping_output = (overlapping_output_fadeout
+overlapping_output = torch.cat([overlapping_output_fadeout[:, :, :-(overlap_samples // 2)], overlapping_output_fadein],dim=2)
print(f" overlap size Fade:{overlapping_output.size()}\n output: {output.size()}\n segment: {output_segments[i].size()}")
##overlapping_output = torch.cat([output[:, :, -overlap_samples:], output_segments[i][:, :, :overlap_samples]], dim=1) #stack tracks
##print(f" overlap size stack:{overlapping_output.size()}\n output: {output.size()}\n segment: {output_segments[i].size()}")
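As a side note on the crossfade logic above: the snippet below is a minimal, self-contained sketch (not the Space's exact code) of what the overlap handling in predict() amounts to. Two dummy segments are faded out/in over the overlap region with linear ramps of the kind apply_fade builds via torch.linspace, then joined along the time axis; all shapes and the sample rate are made-up example values.

import torch

# Hypothetical sizes: batch of 1, stereo, 10 s segments at 32 kHz, 5 s overlap
sample_rate = 32000
overlap = 5
overlap_samples = overlap * sample_rate
prev = torch.randn(1, 2, 10 * sample_rate)   # previously accumulated output (B, C, T)
nxt = torch.randn(1, 2, 10 * sample_rate)    # next generated segment

# Linear ramps over the overlap region (apply_fade builds these with torch.linspace)
fade_out = torch.linspace(1.0, 0.0, overlap_samples)
fade_in = torch.linspace(0.0, 1.0, overlap_samples)

tail = prev[:, :, -overlap_samples:] * fade_out   # end of the previous track, fading to silence
head = nxt[:, :, :overlap_samples] * fade_in      # start of the next track, fading up from silence

# Join along the time axis; like the diff, trim half the overlap from the faded tail first
overlapping_output = torch.cat([tail[:, :, :-(overlap_samples // 2)], head], dim=2)
print(overlapping_output.shape)  # torch.Size([1, 2, overlap_samples + overlap_samples // 2])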
@@ -190,7 +192,7 @@ def predict(model, text, melody, duration, dimension, topk, topp, temperature, c
background = add_settings_to_image(title, video_description, background_path=background, font=settings_font, font_color=settings_font_color)
audio_write(
    file.name, output, MODEL.sample_rate, strategy="loudness",
-    loudness_headroom_db=
+    loudness_headroom_db=18, loudness_compressor=True, add_suffix=False, channels=2)
waveform_video = make_waveform(file.name,bg_image=background, bar_count=45)
if MOVE_TO_CPU:
    MODEL.to('cpu')
@@ -245,7 +247,7 @@ def ui(**kwargs):
model = gr.Radio(["melody", "medium", "small", "large"], label="Model", value="melody", interactive=True)
with gr.Row():
    duration = gr.Slider(minimum=1, maximum=720, value=10, label="Duration", interactive=True)
-   overlap = gr.Slider(minimum=1, maximum=
+   overlap = gr.Slider(minimum=1, maximum=15, value=5, step=1, label="Overlap", interactive=True)
    dimension = gr.Slider(minimum=-2, maximum=2, value=2, step=1, label="Dimension", info="determines which direction to add new segements of audio. (1 = stack tracks, 2 = lengthen, -2..0 = ?)", interactive=True)
with gr.Row():
    topk = gr.Number(label="Top-k", value=250, precision=0, interactive=True)
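For reference, the loudness-normalized write that predict() now performs can be exercised on its own. The call below is a usage sketch with a made-up tensor and file stem; strategy, loudness_headroom_db, loudness_compressor and add_suffix are regular audiocraft audio_write arguments, while the channels=2 argument added in this diff appears to come from this Space's patched copy of audio_write and is therefore omitted here.

import torch
from audiocraft.data.audio import audio_write

wav = torch.randn(2, 32000 * 10)  # hypothetical 10 s stereo clip at 32 kHz, shape (C, T)
# Same normalization settings the diff passes in predict(); writes ./segment.wav
audio_write("segment.wav", wav, 32000, strategy="loudness",
            loudness_headroom_db=18, loudness_compressor=True, add_suffix=False)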
audiocraft/data/audio_utils.py
CHANGED
@@ -173,7 +173,66 @@ def i16_pcm(wav: torch.Tensor) -> torch.Tensor:
    assert wav.dtype == torch.int16
    return wav

+def apply_tafade(audio: torch.Tensor, sample_rate, duration=3.0, out=True, start=True, shape: str = "linear") -> torch.Tensor:
+    """
+    Apply fade-in and/or fade-out effects to the audio tensor.
+
+    Args:
+        audio (torch.Tensor): The input audio tensor of shape (C, L).
+        sample_rate (int): The sample rate of the audio.
+        duration (float, optional): The duration of the fade in seconds. Defaults to 3.0.
+        out (bool, optional): Determines whether to apply fade-in (False) or fade-out (True) effect. Defaults to True.
+        start (bool, optional): Determines whether the fade is applied to the beginning (True) or end (False) of the audio. Defaults to True.
+        shape (str, optional): The shape of the fade. Must be one of: "quarter_sine", "half_sine", "linear", "logarithmic", "exponential". Defaults to "linear".
+
+    Returns:
+        torch.Tensor: The audio tensor with the fade effect applied.
+    """
+    fade_samples = int(sample_rate * duration)  # Number of samples for the fade duration
+
+    # Create the fade transform
+    fade_transform = torchaudio.transforms.Fade(fade_in_len=fade_samples, fade_out_len=fade_samples, fade_shape=shape)
+
+    if out:
+        fade_transform.fade_out_len = fade_samples
+        fade_transform.fade_out_shape = shape
+
+    # Select the portion of the audio to apply the fade
+    if start:
+        audio_fade_section = audio[:, :fade_samples]
+    else:
+        audio_fade_section = audio[:, -fade_samples:]
+
+    # Apply the fade transform to the audio section
+    audio_faded = fade_transform(audio)
+
+    # Replace the selected portion of the audio with the faded section
+    if start:
+        audio_faded[:, :fade_samples] = audio_fade_section
+    else:
+        audio_faded[:, -fade_samples:] = audio_fade_section
+
+    return audio_faded
+
def apply_fade(audio: torch.Tensor, sample_rate, duration=3.0, out=True, start=True, curve_start:float=0.0, curve_end:float=1.0, current_device:str="cpu") -> torch.Tensor:
+    """
+    Apply fade-in and/or fade-out effects to the audio tensor.
+
+    Args:
+        audio (torch.Tensor): The input audio tensor of shape (C, L).
+        sample_rate (int): The sample rate of the audio.
+        duration (float, optional): The duration of the fade in seconds. Defaults to 3.0.
+        out (bool, optional): Determines whether to apply fade-in (False) or fade-out (True) effect. Defaults to True.
+        start (bool, optional): Determines whether the fade is applied to the beginning (True) or end (False) of the audio. Defaults to True.
+        curve_start (float, optional): The starting amplitude of the fade curve. Defaults to 0.0.
+        curve_end (float, optional): The ending amplitude of the fade curve. Defaults to 1.0.
+        current_device (str, optional): The device on which the fade curve tensor should be created. Defaults to "cpu".
+
+    Returns:
+        torch.Tensor: The audio tensor with the fade effect applied.
+    """
    fade_samples = int(sample_rate * duration)  # Number of samples for the fade duration
    fade_curve = torch.linspace(curve_start, curve_end, fade_samples, device=current_device)  # Generate linear fade curve
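A quick way to sanity-check the new helpers is to run them on a dummy clip. This is only a usage sketch: the shapes are made up, and it assumes apply_tafade and apply_fade behave as their docstrings above describe (apply_tafade wraps torchaudio.transforms.Fade, apply_fade applies a hand-rolled linear ramp).

import torch
from audiocraft.data.audio_utils import apply_fade, apply_tafade

sr = 32000
clip = torch.randn(2, sr * 10)  # hypothetical 10 s stereo clip, shape (C, L)

# torchaudio-backed fade: request a fade-out over the last 3 s with an exponential curve
faded_ta = apply_tafade(clip, sample_rate=sr, duration=3.0, out=True, start=False, shape="exponential")

# linear-ramp fade: bring the first 3 s up from silence to full level
faded_lin = apply_fade(clip, sample_rate=sr, duration=3.0, out=False, start=True,
                       curve_start=0.0, curve_end=1.0, current_device="cpu")

print(faded_ta.shape, faded_lin.shape)  # both keep the input shape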
audiocraft/models/musicgen.py
CHANGED
@@ -207,8 +207,8 @@ class MusicGen:
convert_audio(wav, sample_rate, self.sample_rate, self.audio_channels)
if wav is not None else None
for wav in melody_wavs]
-attributes, prompt_tokens = self._prepare_tokens_and_attributes(descriptions=descriptions, prompt=None,
-
+#attributes, prompt_tokens = self._prepare_tokens_and_attributes(descriptions=descriptions, prompt=None,
+#                                                                melody_wavs=melody_wavs)

if prompt is not None:
    if prompt.dim() == 2:
@@ -219,11 +219,11 @@ class MusicGen:
if descriptions is None:
    descriptions = [None] * len(prompt)

-if prompt is not None:
-
+#if prompt is not None:
+#    attributes_gen, prompt_tokens = self._prepare_tokens_and_attributes(descriptions, prompt)

-
-
+attributes, prompt_tokens = self._prepare_tokens_and_attributes(descriptions=descriptions, prompt=prompt,
+                                                                melody_wavs=melody_wavs)
if prompt is not None:
    assert prompt_tokens is not None
else:
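The net effect of the musicgen.py change is that the audio prompt and the melody conditioning are now handed to _prepare_tokens_and_attributes together, so a previously generated chunk can seed the next one while still following the melody. Below is a hedged sketch of driving the Space's custom generate_with_all entry point; only the arguments visible in this diff are used, the model name and tensor shapes are example values, and upstream audiocraft exposes generate_with_chroma / generate_continuation rather than this combined method.

import torch
from audiocraft.models import MusicGen

model = MusicGen.get_pretrained("melody")      # model id as used by this Space's UI
model.set_generation_params(duration=10)

sr = model.sample_rate
melody = torch.randn(1, 1, sr * 10)            # hypothetical mono melody clip (B, C, T)
prev_tail = torch.randn(1, 1, sr * 5)          # tail of the previously generated segment

# generate_with_all is this repository's custom method (see the extend.py diff below);
# it conditions on the text, the melody, and the audio prompt at the same time.
segment = model.generate_with_all(
    descriptions=["upbeat synthwave"],
    melody_wavs=melody,
    sample_rate=sr,
    progress=False,
    prompt=prev_tail,
)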
audiocraft/utils/extend.py
CHANGED
@@ -1,3 +1,4 @@
+from tabnanny import verbose
import torch
import math
from audiocraft.models import MusicGen
@@ -54,9 +55,21 @@ def generate_music_segments(text, melody, seed, MODEL, duration:int=10, overlap:

# Calculate the total number of segments
total_segments = max(math.ceil(duration / segment_duration),1)
+#calculate duration loss from segment overlap
+duration_loss = max(total_segments - 1,0) * math.ceil(overlap / 2)
#calc excess duration
excess_duration = segment_duration - (total_segments * segment_duration - duration)
-print(f"total Segments to Generate: {total_segments} for {duration} seconds. Each segment is {segment_duration} seconds. Excess {excess_duration}")
+print(f"total Segments to Generate: {total_segments} for {duration} seconds. Each segment is {segment_duration} seconds. Excess {excess_duration} Overlap Loss {duration_loss}")
+duration += duration_loss
+while excess_duration + duration_loss > segment_duration:
+    total_segments += 1
+    #calculate duration loss from segment overlap
+    duration_loss = max(total_segments - 1,0) * math.ceil(overlap / 2)
+    #calc excess duration
+    excess_duration = segment_duration - (total_segments * segment_duration - duration)
+    print(f"total Segments to Generate: {total_segments} for {duration} seconds. Each segment is {segment_duration} seconds. Excess {excess_duration} Overlap Loss {duration_loss}")
+    if excess_duration + duration_loss > segment_duration:
+        duration += duration_loss

# If melody_segments is shorter than total_segments, repeat the segments until the total_segments is reached
if len(melody_segments) < total_segments:
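To see what the new overlap bookkeeping does, here is the same arithmetic run outside the function; the numbers (90 s requested, 30 s segments, 5 s overlap) are purely illustrative.

import math

duration = 90          # requested length in seconds (example value)
segment_duration = 30  # length the model generates per segment
overlap = 5            # seconds of crossfade between segments

total_segments = max(math.ceil(duration / segment_duration), 1)                       # 3
duration_loss = max(total_segments - 1, 0) * math.ceil(overlap / 2)                   # 2 * 3 = 6
excess_duration = segment_duration - (total_segments * segment_duration - duration)   # 30

# Pad the requested duration so the crossfaded result still reaches the full length,
# growing the segment count while the leftover would not fit into a single segment.
duration += duration_loss                                                              # 96
while excess_duration + duration_loss > segment_duration:
    total_segments += 1
    duration_loss = max(total_segments - 1, 0) * math.ceil(overlap / 2)
    excess_duration = segment_duration - (total_segments * segment_duration - duration)
    if excess_duration + duration_loss > segment_duration:
        duration += duration_loss

print(total_segments, duration, excess_duration, duration_loss)  # -> 4 96 6 9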
@@ -83,24 +96,70 @@ def generate_music_segments(text, melody, seed, MODEL, duration:int=10, overlap:
torch.manual_seed(seed)
for idx, verse in enumerate(melodys):
    if INTERRUPTING:
-        return output_segments, duration
+        return output_segments, duration
+
+    print(f'Segment duration: {segment_duration}, duration: {duration}, overlap: {overlap} Overlap Loss: {duration_loss}')
+    # Compensate for the length of final segment
+    if (idx + 1) == len(melodys):
+        print(f'Modify Last verse length, duration: {duration}, overlap: {overlap} Overlap Loss: {duration_loss}')
+        MODEL.set_generation_params(
+            use_sampling=True,
+            top_k=MODEL.generation_params["top_k"],
+            top_p=MODEL.generation_params["top_p"],
+            temperature=MODEL.generation_params["temp"],
+            cfg_coef=MODEL.generation_params["cfg_coef"],
+            duration=duration,
+            two_step_cfg=False,
+            rep_penalty=0.5
+        )
+        try:
+            # get last chunk
+            verse = verse[:, :, -duration*MODEL.sample_rate:]
+            prompt_segment = prompt_segment[:, :, -duration*MODEL.sample_rate:]
+        except:
+            # get first chunk
+            verse = verse[:, :, :duration*MODEL.sample_rate]
+            prompt_segment = prompt_segment[:, :, :duration*MODEL.sample_rate]
+
+    else:
+        MODEL.set_generation_params(
+            use_sampling=True,
+            top_k=MODEL.generation_params["top_k"],
+            top_p=MODEL.generation_params["top_p"],
+            temperature=MODEL.generation_params["temp"],
+            cfg_coef=MODEL.generation_params["cfg_coef"],
+            duration=segment_duration,
+            two_step_cfg=False,
+            rep_penalty=0.5
+        )
+
+    # Generate a new prompt segment based on the first verse. This will be applied to all segments for consistency
+    if idx == 0:
+        print(f"Generating New Prompt Segment: {text}\r")
+        prompt_segment = MODEL.generate_with_all(
+            descriptions=[text],
+            melody_wavs=verse,
+            sample_rate=sr,
+            progress=False,
+            prompt=None,
+        )

    print(f"Generating New Melody Segment {idx + 1}: {text}\r")
-    if output_segments:
-        # If this isn't the first segment, use the last chunk of the previous segment as the input
-        last_chunk = output_segments[-1][:, :, -overlap*MODEL.sample_rate:]
    output = MODEL.generate_with_all(
        descriptions=[text],
        melody_wavs=verse,
        sample_rate=sr,
        progress=False,
-        prompt=
+        prompt=prompt_segment,
    )

    # Append the generated output to the list of segments
    #output_segments.append(output[:, :segment_duration])
    output_segments.append(output)
    print(f"output_segments: {len(output_segments)}: shape: {output.shape} dim {output.dim()}")
+    #track duration
+    if duration > segment_duration:
+        duration -= segment_duration
return output_segments, excess_duration

def save_image(image):
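Putting the pieces together: generate_music_segments() returns a list of per-segment tensors, and app.py's predict() crossfades consecutive segments over the overlap region. The stitcher below is a simplified, hypothetical illustration of that assembly, not the Space's exact code; it sums the faded tail and head (a conventional crossfade), whereas the diff above concatenates the two faded pieces with a half-overlap trim, which may be related to the commit note that fades between sections still need work.

import torch

def stitch(segments, sample_rate, overlap):
    # Join a list of (B, C, T) segments with a linear crossfade of `overlap` seconds.
    overlap_samples = overlap * sample_rate
    fade_out = torch.linspace(1.0, 0.0, overlap_samples)
    fade_in = torch.linspace(0.0, 1.0, overlap_samples)
    out = segments[0]
    for seg in segments[1:]:
        tail = out[:, :, -overlap_samples:] * fade_out   # fade the accumulated track out
        head = seg[:, :, :overlap_samples] * fade_in     # fade the new segment in
        out = torch.cat([out[:, :, :-overlap_samples],   # untouched part of the track
                         tail + head,                    # mixed crossfade region
                         seg[:, :, overlap_samples:]],   # remainder of the new segment
                        dim=2)
    return out

sr, overlap = 32000, 5
segments = [torch.randn(1, 2, sr * 30) for _ in range(3)]  # three fake 30 s segments
print(stitch(segments, sr, overlap).shape)  # 80 s: 3 * 30 s minus two 5 s overlaps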