Healthydater
/

musicgen-melody-large-endpoint

PyTorch

musicgen

Inference Endpoints

Model card Files Files and versions Community

Phoenixak99 committed on Nov 8, 2024

Commit

3570981

•

1 Parent(s): 9d2438d

Update handler.py

Browse files

Files changed (1) hide show

handler.py +41 -68

handler.py CHANGED Viewed

@@ -1,78 +1,51 @@
-# app.py
-from fastapi import FastAPI, Request
-from handler import EndpointHandler
-import json
-app = FastAPI()
-handler = None
-@app.on_event("startup")
-async def startup_event():
-    global handler
-    handler = EndpointHandler()
-@app.post("/")
-async def process_request(request: Request):
-    body = await request.json()
-    response = handler(body)
-    return response
-# handler.py
 from typing import Dict, Any
 from transformers import AutoProcessor, MusicgenForConditionalGeneration
 import torch
 class EndpointHandler:
-    def __init__(self, path="/repository"):
-        """Initialize the model and processor."""
         self.processor = AutoProcessor.from_pretrained(path)
         self.model = MusicgenForConditionalGeneration.from_pretrained(
-            path,
-            torch_dtype=torch.float16,
-            device_map="auto"
         ).to("cuda")
     def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
-        """Process the input data and generate audio."""
-        try:
-            # Extract inputs and parameters
-            inputs = data.pop("inputs", data)
-            parameters = data.pop("parameters", {})
-            # Get prompt and duration
-            prompt = inputs.get("prompt", "")
-            duration = inputs.get("duration", 30)
-            # Calculate max_new_tokens based on duration
-            samples_per_token = 1024
-            sampling_rate = 32000
-            max_new_tokens = int((duration * sampling_rate) / samples_per_token)
-            # Process input text
-            model_inputs = self.processor(
-                text=[prompt],
-                padding=True,
-                return_tensors="pt"
-            ).to("cuda")
-            # Set default generation parameters
-            generation_params = {
-                "do_sample": True,
-                "guidance_scale": 3,
-                "max_new_tokens": max_new_tokens
             }
-            # Update with any user-provided parameters
-            generation_params.update(parameters)
-            # Generate audio with autocast for memory efficiency
-            with torch.cuda.amp.autocast():
-                audio_values = self.model.generate(**model_inputs, **generation_params)
-            # Convert to list for JSON serialization
-            audio_data = audio_values.cpu().numpy().tolist()
-            return [{"generated_audio": audio_data}]
-        except Exception as e:
-            return {"error": str(e)}

 from typing import Dict, Any
 from transformers import AutoProcessor, MusicgenForConditionalGeneration
 import torch
 class EndpointHandler:
+    def __init__(self, path=""):
+        # Load the processor and model from the specified path
         self.processor = AutoProcessor.from_pretrained(path)
         self.model = MusicgenForConditionalGeneration.from_pretrained(
+            path, torch_dtype=torch.float16
         ).to("cuda")
+        self.sampling_rate = self.model.config.audio_encoder.sampling_rate
     def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Args:
+            data (dict): The payload with the text prompt and generation parameters.
+        """
+        # Extract inputs and parameters from the payload
+        inputs = data.get("inputs", {})
+        prompt = inputs.get("prompt", "")
+        duration = inputs.get("duration", 10)
+        parameters = data.get("parameters", {})
+        # Preprocess the prompt
+        input_ids = self.processor(
+            text=[prompt],
+            padding=True,
+            return_tensors="pt",
+        ).to("cuda")
+        # Set generation parameters
+        gen_kwargs = {
+            "max_new_tokens": int(duration * 50),  # MusicGen uses 50 tokens per second
+            **parameters,
+        }
+        # Generate audio
+        with torch.autocast("cuda"):
+            outputs = self.model.generate(**input_ids, **gen_kwargs)
+        # Convert the output audio tensor to a list of lists (channel-wise)
+        audio_tensor = outputs[0].cpu()  # Shape: [num_channels, seq_len]
+        audio_list = audio_tensor.numpy().tolist()  # [[channel1_data], [channel2_data]]
+        return [
+            {
+                "generated_audio": audio_list,
+                "sample_rate": self.sampling_rate,
             }
+        ]