Phoenixak99 committed
Commit dc480b5
1 Parent(s): 30b75e1

Update handler.py

Files changed (1): handler.py +88 -44
handler.py CHANGED
@@ -2,62 +2,104 @@ import logging
 from typing import Dict, Any
 from transformers import AutoProcessor, MusicgenForConditionalGeneration
 import torch
+import gc
 
-# Set up logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 class EndpointHandler:
     def __init__(self, path=""):
-        # Load the processor and model from the specified path
-        self.processor = AutoProcessor.from_pretrained(path)
+        # Enable CUDA optimizations
+        torch.backends.cuda.matmul.allow_tf32 = True
+        torch.backends.cudnn.benchmark = True
+
+        # Load the processor with a fast tokenizer
+        logger.info("Loading processor...")
+        self.processor = AutoProcessor.from_pretrained(
+            path,
+            use_fast=True  # Use the faster Rust-backed tokenizer
+        )
+
+        logger.info("Loading model...")
         self.model = MusicgenForConditionalGeneration.from_pretrained(
-            path, torch_dtype=torch.float16
+            path,
+            torch_dtype=torch.float16,
+            low_cpu_mem_usage=True
         ).to("cuda")
+
+        # Inference only: disable dropout etc.
+        self.model.eval()
+
+        # Cache the sampling rate from the audio encoder config
        self.sampling_rate = self.model.config.audio_encoder.sampling_rate
+
+        # Free any memory left over from loading
+        torch.cuda.empty_cache()
+        gc.collect()
+
+        # Quick warmup so the first real request does not pay lazy-init costs
+        logger.info("Warming up model...")
+        self._warmup()
+
+    def _warmup(self):
+        """Perform a minimal forward pass to warm up the model."""
+        try:
+            with torch.no_grad():
+                dummy_input = self.processor(
+                    text=["test"],
+                    padding=True,
+                    return_tensors="pt"
+                ).to("cuda")
+
+                # Minimal generation
+                self.model.generate(
+                    **dummy_input,
+                    max_new_tokens=10,
+                    do_sample=False
+                )
+        except Exception as e:
+            logger.warning(f"Warmup failed (non-critical): {e}")
 
     def __call__(self, data: Dict[str, Any]) -> Any:
-        """
-        Args:
-            data (dict): The payload with the text prompt and generation parameters.
-        """
         try:
-            # Extract inputs and parameters from the payload
+            # Extract inputs and parameters
             inputs = data.get("inputs", data)
             parameters = data.get("parameters", {})
 
-            # Handle inputs
-            if isinstance(inputs, str):
-                prompt = inputs
-                duration = 10  # Default duration
-            elif isinstance(inputs, dict):
+            # Accept either a dict payload or a bare prompt string
+            if isinstance(inputs, dict):
                 prompt = inputs.get("text") or inputs.get("prompt")
                 duration = inputs.get("duration", 10)
             else:
-                prompt = None
+                prompt = inputs if isinstance(inputs, str) else None
                 duration = 10
 
-            # Override duration if provided in parameters
+            # A "duration" in parameters overrides the one in inputs
             if 'duration' in parameters:
                 duration = parameters.pop('duration')
 
-            # Validate the prompt
             if not prompt:
                 return {"error": "No prompt provided."}
 
-            # Preprocess the prompt
+            # Preprocess the prompt, truncating overly long inputs
             input_ids = self.processor(
                 text=[prompt],
                 padding=True,
                 return_tensors="pt",
+                truncation=True,
+                max_length=512  # Limit input length
             ).to("cuda")
 
-            # Set generation parameters
+            # Default generation settings; MusicGen produces ~50 audio tokens per second
             gen_kwargs = {
-                "max_new_tokens": int(duration * 50),  # MusicGen uses 50 tokens per second
+                "max_new_tokens": int(duration * 50),
+                "use_cache": True,  # Enable the KV cache
+                "do_sample": True,
+                "temperature": 0.8,
+                "top_k": 50,
+                "top_p": 0.95
             }
 
-            # Filter out unsupported parameters
+            # Overlay any caller-supplied parameters that the model supports
             supported_params = [
                 "max_length", "min_length", "do_sample", "early_stopping", "num_beams",
                 "temperature", "top_k", "top_p", "repetition_penalty", "bad_words_ids",
@@ -66,24 +108,26 @@ class EndpointHandler:
             for param in supported_params:
                 if param in parameters:
                     gen_kwargs[param] = parameters[param]
 
-            logger.info(f"Received prompt: {prompt}")
+            logger.info(f"Generating with prompt: {prompt}")
             logger.info(f"Generation parameters: {gen_kwargs}")
 
-            # Generate audio
-            with torch.autocast("cuda"):
-            outputs = self.model.generate(**input_ids, **gen_kwargs)
+            # Generate without autograd overhead, in mixed precision
+            with torch.inference_mode(), torch.autocast("cuda"):
+                outputs = self.model.generate(**input_ids, **gen_kwargs)
 
-            # Convert the output audio tensor to a list of lists (channel-wise)
-            audio_tensor = outputs[0].cpu()  # Shape: [num_channels, seq_len]
-            audio_list = audio_tensor.numpy().tolist()  # [[channel1_data], [channel2_data]]
+            # Convert the output audio tensor, shape [num_channels, seq_len], to nested lists
+            audio_tensor = outputs[0].cpu()
+            audio_list = audio_tensor.numpy().tolist()
 
-            return [
-                {
-                    "generated_audio": audio_list,
-                    "sample_rate": self.sampling_rate,
-                }
-            ]
+            # Release GPU memory before returning
+            torch.cuda.empty_cache()
+
+            return [{
+                "generated_audio": audio_list,
+                "sample_rate": self.sampling_rate,
+            }]
+
         except Exception as e:
-            logger.error(f"Exception during generation: {e}")
+            logger.error(f"Generation failed: {e}")
             return {"error": str(e)}