MusicGen

Build error

App Files Files Community

adefossez committed on Jun 9, 2023

Commit

756be9c

2 Parent(s): 1897b6f 9138f15

Merge branch 'main' into our_hf2

Browse files

Files changed (9) hide show

README.md +11 -5
app.py +2 -2
app_batched.py +2 -2
audiocraft/models/loaders.py +37 -10
audiocraft/models/musicgen.py +15 -20
audiocraft/utils/utils.py +1 -1
hf_loading.py +0 -61
mypy.ini +1 -1
requirements.txt +1 -0

README.md CHANGED Viewed

@@ -56,15 +56,21 @@ You can play with MusicGen by running the jupyter notebook at [`demo.ipynb`](./d
 ## API
 We provide a simple API and 4 pre-trained models. The pre-trained models are:
-- `small`: 300M model, text to music only,
-- `medium`: 1.5B model, text to music only,
-- `melody`: 1.5B model, text to music and text+melody to music,
-- `large`: 3.3B model, text to music only.
 We observe the best trade-off between quality and compute with the `medium` or `melody` model.
 In order to use MusicGen locally **you must have a GPU**. We recommend 16GB of memory, but smaller
 GPUs will be able to generate short sequences, or longer sequences with the `small` model.
 See below for a quick example of using the API.
 ```python
@@ -84,7 +90,7 @@ wav = model.generate_with_chroma(descriptions, melody[None].expand(3, -1, -1), s
 for idx, one_wav in enumerate(wav):
     # Will save under {idx}.wav, with loudness normalization at -14 db LUFS.
-    audio_write(f'{idx}', one_wav, model.sample_rate, strategy="loudness")
 ```

 ## API
 We provide a simple API and 4 pre-trained models. The pre-trained models are:
+- `small`: 300M model, text to music only - [🤗 Hub](https://huggingface.co/facebook/musicgen-small)
+- `medium`: 1.5B model, text to music only - [🤗 Hub](https://huggingface.co/facebook/musicgen-medium)
+- `melody`: 1.5B model, text to music and text+melody to music - [🤗 Hub](https://huggingface.co/facebook/musicgen-melody)
+- `large`: 3.3B model, text to music only - [🤗 Hub](https://huggingface.co/facebook/musicgen-large)
 We observe the best trade-off between quality and compute with the `medium` or `melody` model.
 In order to use MusicGen locally **you must have a GPU**. We recommend 16GB of memory, but smaller
 GPUs will be able to generate short sequences, or longer sequences with the `small` model.
+**Note**: Please make sure to have [ffmpeg](https://ffmpeg.org/download.html) installed when using a newer version of `torchaudio`.
+You can install it with:
+```
+apt-get install ffmpeg
+```
 See below for a quick example of using the API.
 ```python
 for idx, one_wav in enumerate(wav):
     # Will save under {idx}.wav, with loudness normalization at -14 db LUFS.
+    audio_write(f'{idx}', one_wav.cpu(), model.sample_rate, strategy="loudness")
 ```

app.py CHANGED Viewed

@@ -9,7 +9,7 @@ LICENSE file in the root directory of this source tree.
 from tempfile import NamedTemporaryFile
 import torch
 import gradio as gr
-from hf_loading import get_pretrained
 from audiocraft.data.audio import audio_write
@@ -19,7 +19,7 @@ MODEL = None
 def load_model(version):
     print("Loading model", version)
-    return get_pretrained(version)
 def predict(model, text, melody, duration, topk, topp, temperature, cfg_coef):

 from tempfile import NamedTemporaryFile
 import torch
 import gradio as gr
+from audiocraft.models import MusicGen
 from audiocraft.data.audio import audio_write
 def load_model(version):
     print("Loading model", version)
+    return MusicGen.get_pretrained(version)
 def predict(model, text, melody, duration, topk, topp, temperature, cfg_coef):

app_batched.py CHANGED Viewed

@@ -11,7 +11,7 @@ import torch
 import gradio as gr
 from audiocraft.data.audio_utils import convert_audio
 from audiocraft.data.audio import audio_write
-from hf_loading import get_pretrained
 MODEL = None
@@ -19,7 +19,7 @@ MODEL = None
 def load_model():
     print("Loading model")
-    return get_pretrained("melody")
 def predict(texts, melodies):

 import gradio as gr
 from audiocraft.data.audio_utils import convert_audio
 from audiocraft.data.audio import audio_write
+from audiocraft.models import MusicGen
 MODEL = None
 def load_model():
     print("Loading model")
+    return MusicGen.get_pretrained("melody")
 def predict(texts, melodies):

audiocraft/models/loaders.py CHANGED Viewed

@@ -20,7 +20,9 @@ of the returned model.
 """
 from pathlib import Path
 import typing as tp
 from omegaconf import OmegaConf
 import torch
@@ -28,18 +30,43 @@ import torch
 from . import builders
-def _get_state_dict(file_or_url: tp.Union[Path, str], device='cpu'):
     # Return the state dict either from a file or url
-    file_or_url = str(file_or_url)
-    assert isinstance(file_or_url, str)
-    if file_or_url.startswith('https://'):
-        return torch.hub.load_state_dict_from_url(file_or_url, map_location=device, check_hash=True)
     else:
-        return torch.load(file_or_url, device)
-def load_compression_model(file_or_url: tp.Union[Path, str], device='cpu'):
-    pkg = _get_state_dict(file_or_url)
     cfg = OmegaConf.create(pkg['xp.cfg'])
     cfg.device = str(device)
     model = builders.get_compression_model(cfg)
@@ -48,8 +75,8 @@ def load_compression_model(file_or_url: tp.Union[Path, str], device='cpu'):
     return model
-def load_lm_model(file_or_url: tp.Union[Path, str], device='cpu'):
-    pkg = _get_state_dict(file_or_url)
     cfg = OmegaConf.create(pkg['xp.cfg'])
     cfg.device = str(device)
     if cfg.device == 'cpu':

 """
 from pathlib import Path
+from huggingface_hub import hf_hub_download
 import typing as tp
+import os
 from omegaconf import OmegaConf
 import torch
 from . import builders
+HF_MODEL_CHECKPOINTS_MAP = {
+    "small": "facebook/musicgen-small",
+    "medium": "facebook/musicgen-medium",
+    "large": "facebook/musicgen-large",
+    "melody": "facebook/musicgen-melody",
+}
+def _get_state_dict(
+    file_or_url_or_id: tp.Union[Path, str],
+    filename: tp.Optional[str] = None,
+    device='cpu',
+    cache_dir: tp.Optional[str] = None,
+):
     # Return the state dict either from a file or url
+    file_or_url_or_id = str(file_or_url_or_id)
+    assert isinstance(file_or_url_or_id, str)
+    if os.path.isfile(file_or_url_or_id):
+        return torch.load(file_or_url_or_id, map_location=device)
+    elif file_or_url_or_id.startswith('https://'):
+        return torch.hub.load_state_dict_from_url(file_or_url_or_id, map_location=device, check_hash=True)
+    elif file_or_url_or_id in HF_MODEL_CHECKPOINTS_MAP:
+        assert filename is not None, "filename needs to be defined if using HF checkpoints"
+        repo_id = HF_MODEL_CHECKPOINTS_MAP[file_or_url_or_id]
+        file = hf_hub_download(repo_id=repo_id, filename=filename, cache_dir=cache_dir)
+        return torch.load(file, map_location=device)
     else:
+        raise ValueError(f"{file_or_url_or_id} is not a valid name, path or link that can be loaded.")
+def load_compression_model(file_or_url_or_id: tp.Union[Path, str], device='cpu', cache_dir: tp.Optional[str] = None):
+    pkg = _get_state_dict(file_or_url_or_id, filename="compression_state_dict.bin", cache_dir=cache_dir)
     cfg = OmegaConf.create(pkg['xp.cfg'])
     cfg.device = str(device)
     model = builders.get_compression_model(cfg)
     return model
+def load_lm_model(file_or_url_or_id: tp.Union[Path, str], device='cpu', cache_dir: tp.Optional[str] = None):
+    pkg = _get_state_dict(file_or_url_or_id, filename="state_dict.bin", cache_dir=cache_dir)
     cfg = OmegaConf.create(pkg['xp.cfg'])
     cfg.device = str(device)
     if cfg.device == 'cpu':

audiocraft/models/musicgen.py CHANGED Viewed

@@ -17,7 +17,7 @@ import torch
 from .encodec import CompressionModel
 from .lm import LMModel
 from .builders import get_debug_compression_model, get_debug_lm_model
-from .loaders import load_compression_model, load_lm_model
 from ..data.audio_utils import convert_audio
 from ..modules.conditioners import ConditioningAttributes, WavCondition
 from ..utils.autocast import TorchAutocast
@@ -67,10 +67,10 @@ class MusicGen:
     @staticmethod
     def get_pretrained(name: str = 'melody', device='cuda'):
         """Return pretrained model, we provide four models:
-        - small (300M), text to music,
-        - medium (1.5B), text to music,
-        - melody (1.5B) text to music and text+melody to music,
-        - large (3.3B), text to music.
         """
         if name == 'debug':
@@ -79,21 +79,16 @@ class MusicGen:
             lm = get_debug_lm_model(device)
             return MusicGen(name, compression_model, lm)
-        if 'MUSICGEN_ROOT' in os.environ:
-            ROOT = os.environ['MUSICGEN_ROOT']
-            if not ROOT.endswith('/'):
-                ROOT += '/'
-        else:
-            ROOT = 'https://dl.fbaipublicfiles.com/audiocraft/musicgen/v0/'
-        compression_model = load_compression_model(ROOT + 'b0dbef54-37d256b525.th', device=device)
-        names = {
-            'small': 'ba7a97ba-830fe5771e',
-            'medium': 'aa73ae27-fbc9f401db',
-            'large': '9b6e835c-1f0cf17b5e',
-            'melody': 'f79af192-61305ffc49',
-        }
-        sig = names[name]
-        lm = load_lm_model(ROOT + f'{sig}.th', device=device)
         return MusicGen(name, compression_model, lm)
     def set_generation_params(self, use_sampling: bool = True, top_k: int = 250,

 from .encodec import CompressionModel
 from .lm import LMModel
 from .builders import get_debug_compression_model, get_debug_lm_model
+from .loaders import load_compression_model, load_lm_model, HF_MODEL_CHECKPOINTS_MAP
 from ..data.audio_utils import convert_audio
 from ..modules.conditioners import ConditioningAttributes, WavCondition
 from ..utils.autocast import TorchAutocast
     @staticmethod
     def get_pretrained(name: str = 'melody', device='cuda'):
         """Return pretrained model, we provide four models:
+        - small (300M), text to music, # see: https://huggingface.co/facebook/musicgen-small
+        - medium (1.5B), text to music, # see: https://huggingface.co/facebook/musicgen-medium
+        - melody (1.5B) text to music and text+melody to music, # see: https://huggingface.co/facebook/musicgen-melody
+        - large (3.3B), text to music, # see: https://huggingface.co/facebook/musicgen-large
         """
         if name == 'debug':
             lm = get_debug_lm_model(device)
             return MusicGen(name, compression_model, lm)
+        if name not in HF_MODEL_CHECKPOINTS_MAP:
+            raise ValueError(
+                f"{name} is not a valid checkpoint name. "
+                f"Choose one of {', '.join(HF_MODEL_CHECKPOINTS_MAP.keys())}"
+            )
+        cache_dir = os.environ.get('MUSICGEN_ROOT', None)
+        compression_model = load_compression_model(name, device=device, cache_dir=cache_dir)
+        lm = load_lm_model(name, device=device, cache_dir=cache_dir)
         return MusicGen(name, compression_model, lm)
     def set_generation_params(self, use_sampling: bool = True, top_k: int = 250,

audiocraft/utils/utils.py CHANGED Viewed

@@ -122,7 +122,7 @@ def sample_top_p(probs: torch.Tensor, p: float) -> torch.Tensor:
     probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True)
     probs_sum = torch.cumsum(probs_sort, dim=-1)
     mask = probs_sum - probs_sort > p
-    probs_sort *= (~mask).float(0)
     probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))
     next_token = multinomial(probs_sort, num_samples=1)
     next_token = torch.gather(probs_idx, -1, next_token)

     probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True)
     probs_sum = torch.cumsum(probs_sort, dim=-1)
     mask = probs_sum - probs_sort > p
+    probs_sort *= (~mask).float()
     probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))
     next_token = multinomial(probs_sort, num_samples=1)
     next_token = torch.gather(probs_idx, -1, next_token)

hf_loading.py DELETED Viewed

@@ -1,61 +0,0 @@
-"""Utility for loading the models from HF."""
-from pathlib import Path
-import typing as tp
-from omegaconf import OmegaConf
-from huggingface_hub import hf_hub_download
-import torch
-from audiocraft.models import builders, MusicGen
-MODEL_CHECKPOINTS_MAP = {
-    "small": "facebook/musicgen-small",
-    "medium": "facebook/musicgen-medium",
-    "large": "facebook/musicgen-large",
-    "melody": "facebook/musicgen-melody",
-}
-def _get_state_dict(file_or_url: tp.Union[Path, str],
-                    filename="state_dict.bin", device='cpu'):
-    # Return the state dict either from a file or url
-    print("loading", file_or_url, filename)
-    file_or_url = str(file_or_url)
-    assert isinstance(file_or_url, str)
-    return torch.load(
-        hf_hub_download(repo_id=file_or_url, filename=filename), map_location=device)
-def load_compression_model(file_or_url: tp.Union[Path, str], device='cpu'):
-    pkg = _get_state_dict(file_or_url, filename="compression_state_dict.bin")
-    cfg = OmegaConf.create(pkg['xp.cfg'])
-    cfg.device = str(device)
-    model = builders.get_compression_model(cfg)
-    model.load_state_dict(pkg['best_state'])
-    model.eval()
-    model.cfg = cfg
-    return model
-def load_lm_model(file_or_url: tp.Union[Path, str], device='cpu'):
-    pkg = _get_state_dict(file_or_url)
-    cfg = OmegaConf.create(pkg['xp.cfg'])
-    cfg.device = str(device)
-    if cfg.device == 'cpu':
-        cfg.transformer_lm.memory_efficient = False
-        cfg.transformer_lm.custom = True
-        cfg.dtype = 'float32'
-    else:
-        cfg.dtype = 'float16'
-    model = builders.get_lm_model(cfg)
-    model.load_state_dict(pkg['best_state'])
-    model.eval()
-    model.cfg = cfg
-    return model
-def get_pretrained(name: str = 'small', device='cuda'):
-    model_id = MODEL_CHECKPOINTS_MAP[name]
-    compression_model = load_compression_model(model_id, device=device)
-    lm = load_lm_model(model_id, device=device)
-    return MusicGen(name, compression_model, lm)

mypy.ini CHANGED Viewed

@@ -1,4 +1,4 @@
 [mypy]
-[mypy-treetable,torchaudio.*,soundfile,einops.*,av.*,tqdm.*,num2words.*,spacy,xformers.*,scipy]
 ignore_missing_imports = True

 [mypy]
+[mypy-treetable,torchaudio.*,soundfile,einops.*,av.*,tqdm.*,num2words.*,spacy,xformers.*,scipy,huggingface_hub]
 ignore_missing_imports = True

requirements.txt CHANGED Viewed

@@ -11,6 +11,7 @@ sentencepiece
 spacy==3.5.2
 torch>=2.0.0
 torchaudio>=2.0.0
 tqdm
 transformers
 xformers

 spacy==3.5.2
 torch>=2.0.0
 torchaudio>=2.0.0
+huggingface_hub
 tqdm
 transformers
 xformers