Merge branch 'main' into our_hf
Files changed:
- README.md +11 -5
- app.py +32 -23
- app_batched.py +24 -25
- audiocraft/models/loaders.py +37 -10
- audiocraft/models/musicgen.py +15 -20
- audiocraft/utils/utils.py +1 -1
- hf_loading.py +0 -61
- mypy.ini +1 -1
- requirements.txt +1 -0
README.md
CHANGED
@@ -56,15 +56,21 @@ You can play with MusicGen by running the jupyter notebook at [`demo.ipynb`](./demo.ipynb)
 ## API
 
 We provide a simple API and 4 pre-trained models. The pre-trained models are:
-- `small`: 300M model, text to music only
-- `medium`: 1.5B model, text to music only
-- `melody`: 1.5B model, text to music and text+melody to music
-- `large`: 3.3B model, text to music only.
+- `small`: 300M model, text to music only - [🤗 Hub](https://huggingface.co/facebook/musicgen-small)
+- `medium`: 1.5B model, text to music only - [🤗 Hub](https://huggingface.co/facebook/musicgen-medium)
+- `melody`: 1.5B model, text to music and text+melody to music - [🤗 Hub](https://huggingface.co/facebook/musicgen-melody)
+- `large`: 3.3B model, text to music only - [🤗 Hub](https://huggingface.co/facebook/musicgen-large)
 
 We observe the best trade-off between quality and compute with the `medium` or `melody` model.
 In order to use MusicGen locally **you must have a GPU**. We recommend 16GB of memory, but smaller
 GPUs will be able to generate short sequences, or longer sequences with the `small` model.
 
+**Note**: Please make sure to have [ffmpeg](https://ffmpeg.org/download.html) installed when using a newer version of `torchaudio`.
+You can install it with:
+```
+apt-get install ffmpeg
+```
+
 Below is a quick example of using the API.
 
 ```python
@@ -84,7 +90,7 @@ wav = model.generate_with_chroma(descriptions, melody[None].expand(3, -1, -1), sr)
 
 for idx, one_wav in enumerate(wav):
     # Will save under {idx}.wav, with loudness normalization at -14 dB LUFS.
-    audio_write(f'{idx}', one_wav, model.sample_rate, strategy="loudness")
+    audio_write(f'{idx}', one_wav.cpu(), model.sample_rate, strategy="loudness")
 ```
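Taken together, the documented quick-start now reads as follows. This is a minimal sketch assuming the `audiocraft` package at this commit; the `small` checkpoint, the prompt text, and the 8-second duration are illustrative choices, not part of the diff.

```python
# Minimal end-to-end sketch of the README quick-start (illustrative values).
from audiocraft.models import MusicGen
from audiocraft.data.audio import audio_write

model = MusicGen.get_pretrained('small')              # resolved via the HF Hub after this merge
model.set_generation_params(duration=8)               # seconds of audio to generate
wav = model.generate(['upbeat funk with slap bass'])  # batch with a single description

for idx, one_wav in enumerate(wav):
    # Saves {idx}.wav with loudness normalization at -14 dB LUFS.
    audio_write(f'{idx}', one_wav.cpu(), model.sample_rate, strategy="loudness")
```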
app.py
CHANGED
@@ -6,9 +6,12 @@ This source code is licensed under the license found in the
 LICENSE file in the root directory of this source tree.
 """
 
+from tempfile import NamedTemporaryFile
 import torch
 import gradio as gr
-from hf_loading import get_pretrained
+from audiocraft.models import MusicGen
+
+from audiocraft.data.audio import audio_write
 
 
 MODEL = None
@@ -16,7 +19,7 @@ MODEL = None
 
 def load_model(version):
     print("Loading model", version)
-    return get_pretrained(version)
+    return MusicGen.get_pretrained(version)
 
 
 def predict(model, text, melody, duration, topk, topp, temperature, cfg_coef):
@@ -51,8 +54,11 @@ def predict(model, text, melody, duration, topk, topp, temperature, cfg_coef):
     else:
         output = MODEL.generate(descriptions=[text], progress=False)
 
-    output = output.detach().cpu().float()[0]
-    return MODEL.sample_rate, output.numpy()
+    output = output.detach().cpu().float()[0]
+    with NamedTemporaryFile("wb", suffix=".wav", delete=False) as file:
+        audio_write(file.name, output, MODEL.sample_rate, strategy="loudness", add_suffix=False)
+        waveform_video = gr.make_waveform(file.name)
+        return waveform_video
 
 
 with gr.Blocks() as demo:
@@ -60,25 +66,12 @@ with gr.Blocks() as demo:
         """
         # MusicGen
 
-        This is the demo for MusicGen, a simple and controllable model for music generation
-
-        Below we present 3 model variations:
-        1. Melody -- a music generation model capable of generating music condition on text and melody inputs. **Note**, you can also use text only.
-        2. Small -- a 300M transformer decoder conditioned on text only.
-        3. Medium -- a 1.5B transformer decoder conditioned on text only.
-        4. Large -- a 3.3B transformer decoder conditioned on text only (might OOM for the longest sequences.)
-
-        When the optional melody conditioning wav is provided, the model will extract
-        a broad melody and try to follow it in the generated samples.
-
-        For skipping queue, you can duplicate this space, and upgrade to GPU in the settings.
+        This is the demo for [MusicGen](https://github.com/facebookresearch/audiocraft), a simple and controllable model for music generation
+        presented at: ["Simple and Controllable Music Generation"](https://huggingface.co/papers/2306.05284).
         <br/>
-        <a href="https://huggingface.co/spaces/musicgen/MusicGen?duplicate=true">
-        <img style="margin-..." src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
-        </p>
-
-        See [github.com/facebookresearch/audiocraft](https://github.com/facebookresearch/audiocraft)
-        for more details.
+        <a href="https://huggingface.co/spaces/musicgen/MusicGen?duplicate=true" style="display: inline-block;margin-top: .5em;margin-right: .25em;" target="_blank">
+        <img style="margin-bottom: 0em;display: inline;margin-top: -.25em;" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
+        for longer sequences, more control and no queue.</p>
         """
     )
     with gr.Row():
@@ -98,7 +91,7 @@ with gr.Blocks() as demo:
                 temperature = gr.Number(label="Temperature", value=1.0, interactive=True)
                 cfg_coef = gr.Number(label="Classifier Free Guidance", value=3.0, interactive=True)
         with gr.Column():
-            output = gr.Audio(label="Generated Music", type="numpy")
+            output = gr.Video(label="Generated Music")
     submit.click(predict, inputs=[model, text, melody, duration, topk, topp, temperature, cfg_coef], outputs=[output])
     gr.Examples(
         fn=predict,
@@ -132,5 +125,21 @@ with gr.Blocks() as demo:
         inputs=[text, melody, model],
         outputs=[output]
     )
+    gr.Markdown(
+        """
+        ### More details
+
+        By typing a description of the music you want and an optional audio used for melody conditioning,
+        you can generate music with one of the four model variations below:
+        1. Melody -- a music generation model capable of generating music conditioned on text and melody inputs. **Note**, you can also use text only.
+        2. Small -- a 300M transformer decoder conditioned on text only.
+        3. Medium -- a 1.5B transformer decoder conditioned on text only.
+        4. Large -- a 3.3B transformer decoder conditioned on text only (might OOM for the longest sequences.)
+
+        When the optional melody conditioning wav is provided, the model will extract
+        a broad melody and try to follow it in the generated samples.
+        """
+    )
 
 demo.launch()
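The new tail of `predict` converts the first generated waveform into a video with `gr.make_waveform` so it can feed the `gr.Video` component. A standalone sketch of that pattern; the function name is hypothetical, and it assumes gradio 3.x (where `gr.make_waveform` exists) plus audiocraft's `audio_write`:

```python
# Sketch of the save-then-visualize pattern predict() now uses.
# Assumes: gradio 3.x, audiocraft's audio_write, and `wav` being a
# (channels, samples) float tensor on any device.
from tempfile import NamedTemporaryFile

import gradio as gr
from audiocraft.data.audio import audio_write

def to_waveform_video(wav, sample_rate: int) -> str:  # hypothetical helper
    wav = wav.detach().cpu().float()
    with NamedTemporaryFile("wb", suffix=".wav", delete=False) as file:
        # add_suffix=False because NamedTemporaryFile already appended ".wav".
        audio_write(file.name, wav, sample_rate, strategy="loudness", add_suffix=False)
        return gr.make_waveform(file.name)  # returns a path to an .mp4 video
```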
app_batched.py
CHANGED
@@ -11,7 +11,7 @@ import torch
 import gradio as gr
 from audiocraft.data.audio_utils import convert_audio
 from audiocraft.data.audio import audio_write
-from hf_loading import get_pretrained
+from audiocraft.models import MusicGen
 
 
 MODEL = None
@@ -19,7 +19,7 @@ MODEL = None
 
 def load_model():
     print("Loading model")
-    return get_pretrained("melody")
+    return MusicGen.get_pretrained("melody")
 
 
 def predict(texts, melodies):
@@ -58,8 +58,9 @@ def predict(texts, melodies):
     for output in outputs:
         with NamedTemporaryFile("wb", suffix=".wav", delete=False) as file:
             audio_write(file.name, output, MODEL.sample_rate, strategy="loudness", add_suffix=False)
-            out_files.append(file.name)
-    return [out_files]
+            waveform_video = gr.make_waveform(file.name)
+            out_files.append(waveform_video)
+    return [out_files]
 
 
 with gr.Blocks() as demo:
@@ -67,35 +68,23 @@ with gr.Blocks() as demo:
         """
         # MusicGen
 
-        This is the demo for MusicGen, a simple and controllable model for music generation
-        presented at: "Simple and Controllable Music Generation".
-
-        Enter the description of the music you want and an optional audio used for melody conditioning.
-        The model will extract the broad melody from the uploaded wav if provided.
-        This will generate a 12s extract with the `melody` model.
-
-        For generating longer sequences (up to 30 seconds) and skipping queue, you can duplicate
-        to full demo space, which contains more control and upgrade to GPU in the settings.
+        This is the demo for [MusicGen](https://github.com/facebookresearch/audiocraft), a simple and controllable model for music generation
+        presented at: ["Simple and Controllable Music Generation"](https://huggingface.co/papers/2306.05284).
         <br/>
-        <a href="https://huggingface.co/spaces/musicgen/MusicGen?duplicate=true">
-        <img style="margin-..." src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
-        </p>
-
-        You can also use your own GPU or a Google Colab by following the instructions on our repo.
-
-        See [github.com/facebookresearch/audiocraft](https://github.com/facebookresearch/audiocraft)
-        for more details.
+        <a href="https://huggingface.co/spaces/musicgen/MusicGen?duplicate=true" style="display: inline-block;margin-top: .5em;margin-right: .25em;" target="_blank">
+        <img style="margin-bottom: 0em;display: inline;margin-top: -.25em;" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
+        for longer sequences, more control and no queue</p>
         """
     )
     with gr.Row():
         with gr.Column():
             with gr.Row():
-                text = gr.Text(label="Input Text", interactive=True)
-                melody = gr.Audio(source="upload", type="numpy", label="Melody Condition (optional)", interactive=True)
+                text = gr.Text(label="Describe your music", lines=2, interactive=True)
+                melody = gr.Audio(source="upload", type="numpy", label="Condition on a melody (optional)", interactive=True)
             with gr.Row():
-                submit = gr.Button("Submit")
+                submit = gr.Button("Generate")
         with gr.Column():
-            output = gr.Audio(label="Generated Music", type="filepath")
+            output = gr.Video(label="Generated Music")
     submit.click(predict, inputs=[text, melody], outputs=[output], batch=True, max_batch_size=12)
     gr.Examples(
         fn=predict,
@@ -124,5 +113,15 @@ with gr.Blocks() as demo:
         inputs=[text, melody],
         outputs=[output]
     )
+    gr.Markdown("""
+    ### More details
+    Type a description of the music you want and, optionally, upload an audio file for melody conditioning;
+    the model will extract the broad melody from the uploaded wav if provided and generate a 12s extract with the `melody` model.
+
+    You can also use your own GPU or a Google Colab by following the instructions on our repo.
+
+    See [github.com/facebookresearch/audiocraft](https://github.com/facebookresearch/audiocraft)
+    for more details.
+    """)
 
 demo.queue(max_size=15).launch()
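The `batch=True, max_batch_size=12` wiring makes gradio coalesce up to 12 queued requests into a single `predict(texts, melodies)` call, which must then return one list per output component. A toy sketch of that contract; the `echo` handler is hypothetical and not from this repo:

```python
# Sketch of gradio's batching contract used above (assumes gradio 3.x).
# With batch=True, the handler receives lists (one entry per queued request)
# and must return a list of per-sample output lists, one per output component.
import gradio as gr

def echo(texts: list[str]) -> list[list[str]]:  # hypothetical demo handler
    return [[t.upper() for t in texts]]         # one output component -> one inner list

with gr.Blocks() as demo:
    inp = gr.Text()
    out = gr.Text()
    btn = gr.Button("Run")
    btn.click(echo, inputs=[inp], outputs=[out], batch=True, max_batch_size=12)

demo.queue(max_size=15).launch()
```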
audiocraft/models/loaders.py
CHANGED
@@ -20,7 +20,9 @@ of the returned model.
 """
 
 from pathlib import Path
+from huggingface_hub import hf_hub_download
 import typing as tp
+import os
 
 from omegaconf import OmegaConf
 import torch
@@ -28,18 +30,43 @@ import torch
 from . import builders
 
 
-def _get_state_dict(file_or_url: tp.Union[Path, str], device='cpu'):
+HF_MODEL_CHECKPOINTS_MAP = {
+    "small": "facebook/musicgen-small",
+    "medium": "facebook/musicgen-medium",
+    "large": "facebook/musicgen-large",
+    "melody": "facebook/musicgen-melody",
+}
+
+
+def _get_state_dict(
+    file_or_url_or_id: tp.Union[Path, str],
+    filename: tp.Optional[str] = None,
+    device='cpu',
+    cache_dir: tp.Optional[str] = None,
+):
     # Return the state dict either from a file or url
-    file_or_url = str(file_or_url)
-    assert isinstance(file_or_url, str)
-    if file_or_url.startswith('https://'):
-        return torch.hub.load_state_dict_from_url(file_or_url, map_location=device, check_hash=True)
+    file_or_url_or_id = str(file_or_url_or_id)
+    assert isinstance(file_or_url_or_id, str)
+
+    if os.path.isfile(file_or_url_or_id):
+        return torch.load(file_or_url_or_id, map_location=device)
+
+    elif file_or_url_or_id.startswith('https://'):
+        return torch.hub.load_state_dict_from_url(file_or_url_or_id, map_location=device, check_hash=True)
+
+    elif file_or_url_or_id in HF_MODEL_CHECKPOINTS_MAP:
+        assert filename is not None, "filename needs to be defined if using HF checkpoints"
+
+        repo_id = HF_MODEL_CHECKPOINTS_MAP[file_or_url_or_id]
+        file = hf_hub_download(repo_id=repo_id, filename=filename, cache_dir=cache_dir)
+        return torch.load(file, map_location=device)
+
     else:
-        return torch.load(file_or_url, map_location=device)
+        raise ValueError(f"{file_or_url_or_id} is not a valid name, path or link that can be loaded.")
 
 
-def load_compression_model(file_or_url: tp.Union[Path, str], device='cpu'):
-    pkg = _get_state_dict(file_or_url)
+def load_compression_model(file_or_url_or_id: tp.Union[Path, str], device='cpu', cache_dir: tp.Optional[str] = None):
+    pkg = _get_state_dict(file_or_url_or_id, filename="compression_state_dict.bin", cache_dir=cache_dir)
     cfg = OmegaConf.create(pkg['xp.cfg'])
     cfg.device = str(device)
     model = builders.get_compression_model(cfg)
@@ -48,8 +75,8 @@ def load_compression_model(file_or_url: tp.Union[Path, str], device='cpu'):
     return model
 
 
-def load_lm_model(file_or_url: tp.Union[Path, str], device='cpu'):
-    pkg = _get_state_dict(file_or_url)
+def load_lm_model(file_or_url_or_id: tp.Union[Path, str], device='cpu', cache_dir: tp.Optional[str] = None):
+    pkg = _get_state_dict(file_or_url_or_id, filename="state_dict.bin", cache_dir=cache_dir)
     cfg = OmegaConf.create(pkg['xp.cfg'])
     cfg.device = str(device)
     if cfg.device == 'cpu':
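The rewritten `_get_state_dict` resolves its argument in three ways: an existing local path is loaded directly with `torch.load`, an `https://` URL goes through `torch.hub.load_state_dict_from_url`, and a short name from `HF_MODEL_CHECKPOINTS_MAP` is downloaded from the Hugging Face Hub. A hedged usage sketch; the local path, URL, and cache directory below are hypothetical placeholders:

```python
# Sketch of the three resolution branches (Hub id, local file, URL).
from audiocraft.models.loaders import load_lm_model

# 1. A short name from HF_MODEL_CHECKPOINTS_MAP downloads "state_dict.bin"
#    from the matching Hub repo, honoring cache_dir.
lm = load_lm_model("melody", device="cpu", cache_dir="/tmp/musicgen")

# 2. A local checkpoint path is loaded directly with torch.load.
# lm = load_lm_model("/checkpoints/state_dict.bin", device="cpu")

# 3. An https:// URL goes through torch.hub.load_state_dict_from_url.
# lm = load_lm_model("https://example.com/state_dict.bin", device="cpu")
```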
audiocraft/models/musicgen.py
CHANGED
@@ -17,7 +17,7 @@ import torch
 from .encodec import CompressionModel
 from .lm import LMModel
 from .builders import get_debug_compression_model, get_debug_lm_model
-from .loaders import load_compression_model, load_lm_model
+from .loaders import load_compression_model, load_lm_model, HF_MODEL_CHECKPOINTS_MAP
 from ..data.audio_utils import convert_audio
 from ..modules.conditioners import ConditioningAttributes, WavCondition
 from ..utils.autocast import TorchAutocast
@@ -67,10 +67,10 @@ class MusicGen:
     @staticmethod
     def get_pretrained(name: str = 'melody', device='cuda'):
         """Return pretrained model, we provide four models:
-        - small (300M), text to music,
-        - medium (1.5B), text to music,
-        - melody (1.5B) text to music and text+melody to music,
-        - large (3.3B), text to music.
+        - small (300M), text to music,  # see: https://huggingface.co/facebook/musicgen-small
+        - medium (1.5B), text to music,  # see: https://huggingface.co/facebook/musicgen-medium
+        - melody (1.5B) text to music and text+melody to music,  # see: https://huggingface.co/facebook/musicgen-melody
+        - large (3.3B), text to music,  # see: https://huggingface.co/facebook/musicgen-large
         """
 
         if name == 'debug':
@@ -79,21 +79,16 @@ class MusicGen:
             lm = get_debug_lm_model(device)
             return MusicGen(name, compression_model, lm)
 
-        if name not in ['small', 'medium', 'melody', 'large']:
-            ...
-        names = {
-            ...
-            'large': '9b6e835c-1f0cf17b5e',
-            'melody': 'f79af192-61305ffc49',
-        }
-        sig = names[name]
-        lm = load_lm_model(ROOT + f'{sig}.th', device=device)
+        if name not in HF_MODEL_CHECKPOINTS_MAP:
+            raise ValueError(
+                f"{name} is not a valid checkpoint name. "
+                f"Choose one of {', '.join(HF_MODEL_CHECKPOINTS_MAP.keys())}"
+            )
+
+        cache_dir = os.environ.get('MUSICGEN_ROOT', None)
+        compression_model = load_compression_model(name, device=device, cache_dir=cache_dir)
+        lm = load_lm_model(name, device=device, cache_dir=cache_dir)
+
         return MusicGen(name, compression_model, lm)
 
     def set_generation_params(self, use_sampling: bool = True, top_k: int = 250,
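`get_pretrained` now validates the requested name against `HF_MODEL_CHECKPOINTS_MAP` and reads the `MUSICGEN_ROOT` environment variable to choose the download cache. A minimal sketch of the resulting call pattern; the cache path is an arbitrary example:

```python
# Sketch: pick a checkpoint and control where Hub downloads are cached.
import os

os.environ["MUSICGEN_ROOT"] = "/data/musicgen_cache"  # optional; falls back to the HF cache

from audiocraft.models import MusicGen

# Valid names after this commit: small, medium, melody, large.
model = MusicGen.get_pretrained("medium", device="cuda")
```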
audiocraft/utils/utils.py
CHANGED
@@ -122,7 +122,7 @@ def sample_top_p(probs: torch.Tensor, p: float) -> torch.Tensor:
     probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True)
     probs_sum = torch.cumsum(probs_sort, dim=-1)
     mask = probs_sum - probs_sort > p
-    probs_sort *= (~mask).float(
+    probs_sort *= (~mask).float()
     probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))
     next_token = multinomial(probs_sort, num_samples=1)
     next_token = torch.gather(probs_idx, -1, next_token)
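The fixed line completes top-p (nucleus) sampling: after sorting, a token is kept while the probability mass *before* it is at most `p`, the tail is zeroed, and the survivors are renormalized before the multinomial draw. A toy walk-through of the same tensor math:

```python
# Toy example of the masking math in sample_top_p (verifiable by hand).
import torch

probs = torch.tensor([[0.5, 0.3, 0.15, 0.05]])
p = 0.7
probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True)
probs_sum = torch.cumsum(probs_sort, dim=-1)           # [0.50, 0.80, 0.95, 1.00]
mask = probs_sum - probs_sort > p                      # [False, False, True, True]
probs_sort *= (~mask).float()                          # tail zeroed: [0.5, 0.3, 0, 0]
probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))  # renormalized: [0.625, 0.375, 0, 0]
# audiocraft uses its own multinomial helper here; torch's works for the toy case.
next_token = torch.gather(probs_idx, -1, torch.multinomial(probs_sort, num_samples=1))
```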
hf_loading.py
DELETED
@@ -1,61 +0,0 @@
-"""Utility for loading the models from HF."""
-from pathlib import Path
-import typing as tp
-
-from omegaconf import OmegaConf
-from huggingface_hub import hf_hub_download
-import torch
-
-from audiocraft.models import builders, MusicGen
-
-MODEL_CHECKPOINTS_MAP = {
-    "small": "facebook/musicgen-small",
-    "medium": "facebook/musicgen-medium",
-    "large": "facebook/musicgen-large",
-    "melody": "facebook/musicgen-melody",
-}
-
-
-def _get_state_dict(file_or_url: tp.Union[Path, str],
-                    filename="state_dict.bin", device='cpu'):
-    # Return the state dict either from a file or url
-    print("loading", file_or_url, filename)
-    file_or_url = str(file_or_url)
-    assert isinstance(file_or_url, str)
-    return torch.load(
-        hf_hub_download(repo_id=file_or_url, filename=filename), map_location=device)
-
-
-def load_compression_model(file_or_url: tp.Union[Path, str], device='cpu'):
-    pkg = _get_state_dict(file_or_url, filename="compression_state_dict.bin")
-    cfg = OmegaConf.create(pkg['xp.cfg'])
-    cfg.device = str(device)
-    model = builders.get_compression_model(cfg)
-    model.load_state_dict(pkg['best_state'])
-    model.eval()
-    model.cfg = cfg
-    return model
-
-
-def load_lm_model(file_or_url: tp.Union[Path, str], device='cpu'):
-    pkg = _get_state_dict(file_or_url)
-    cfg = OmegaConf.create(pkg['xp.cfg'])
-    cfg.device = str(device)
-    if cfg.device == 'cpu':
-        cfg.transformer_lm.memory_efficient = False
-        cfg.transformer_lm.custom = True
-        cfg.dtype = 'float32'
-    else:
-        cfg.dtype = 'float16'
-    model = builders.get_lm_model(cfg)
-    model.load_state_dict(pkg['best_state'])
-    model.eval()
-    model.cfg = cfg
-    return model
-
-
-def get_pretrained(name: str = 'small', device='cuda'):
-    model_id = MODEL_CHECKPOINTS_MAP[name]
-    compression_model = load_compression_model(model_id, device=device)
-    lm = load_lm_model(model_id, device=device)
-    return MusicGen(name, compression_model, lm)
mypy.ini
CHANGED
@@ -1,4 +1,4 @@
 [mypy]
 
-[mypy-treetable,torchaudio.*,soundfile,einops.*,av.*,tqdm.*,num2words.*,spacy,xformers.*,scipy]
+[mypy-treetable,torchaudio.*,soundfile,einops.*,av.*,tqdm.*,num2words.*,spacy,xformers.*,scipy,huggingface_hub]
 ignore_missing_imports = True
requirements.txt
CHANGED
@@ -11,6 +11,7 @@ sentencepiece
 spacy==3.5.2
 torch>=2.0.0
 torchaudio>=2.0.0
+huggingface_hub
 tqdm
 transformers
 xformers