File size: 5,517 Bytes
1f45bd9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1ab8e51
1f45bd9
1ab8e51
1f45bd9
 
 
 
1ab8e51
1f45bd9
1ab8e51
 
 
1f45bd9
 
 
 
1ab8e51
1f45bd9
 
 
 
 
 
 
 
 
 
 
 
854113f
1f45bd9
 
 
 
 
 
854113f
 
 
 
 
 
 
 
 
 
 
 
 
 
1f45bd9
854113f
 
 
 
 
20fcd2b
 
 
 
 
 
1f45bd9
 
991a5af
1f45bd9
 
 
 
 
 
 
20fcd2b
1f45bd9
20fcd2b
1f45bd9
 
 
 
20fcd2b
1f45bd9
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import gradio as gr
import requests
import json
# from volcenginesdkarkruntime import Ark
import torch
import torchaudio
from einops import rearrange
import argparse
import json
import os
import spaces
from tqdm import tqdm
import random
import numpy as np
import sys
import base64
from diffrhythm.infer.infer_utils import (
    get_reference_latent,
    get_lrc_token,
    get_style_prompt,
    prepare_model,
    get_negative_style_prompt
)
from diffrhythm.infer.infer import inference

# Upper bound for randomly drawn seeds: the largest signed 32-bit integer.
MAX_SEED = np.iinfo(np.int32).max
# Device string handed to prepare_model below.
device='cuda'
# Load the models once at import time; infer_music reads cfm, tokenizer, muq
# and vae as module-level names.
cfm, tokenizer, muq, vae = prepare_model(device)
# Replace the CFM model with its torch.compile-wrapped version before use.
cfm = torch.compile(cfm)

def infer_music(lrc, ref_audio_path, seed=42, randomize_seed=False, steps=32, file_type='wav', max_frames=2048, device='cuda'):
    """Run the DiffRhythm inference pipeline for one set of lyrics.

    Args:
        lrc: Lyrics text handed to ``get_lrc_token`` (presumably LRC-style
            timestamped lines -- confirm against the helper).
        ref_audio_path: Reference audio handed to ``get_style_prompt``.
        seed: Seed applied with ``torch.manual_seed`` unless randomized.
        randomize_seed: When true, ignore ``seed`` and draw one uniformly
            from ``[0, MAX_SEED]``.
        steps: Step count forwarded to ``inference``; values below 32 also
            switch ``sway_sampling_coef`` to ``-1``.
        file_type: Output format forwarded to ``inference``.
        max_frames: Used both for the reference latent and as ``duration``.
        device: Device string forwarded to the prompt/latent helpers.

    Returns:
        Whatever ``inference`` returns for the generated song.
    """
    # Seed torch first so every helper below runs under the chosen seed.
    chosen_seed = random.randint(0, MAX_SEED) if randomize_seed else seed
    torch.manual_seed(chosen_seed)

    if steps < 32:
        sway = -1
    else:
        sway = None

    # Conditioning inputs, built in the same order as before.
    lyric_tokens, first_timestamp = get_lrc_token(lrc, tokenizer, device)
    style = get_style_prompt(muq, ref_audio_path)
    negative_style = get_negative_style_prompt(device)
    reference_latent = get_reference_latent(device, max_frames)

    inference_kwargs = {
        "cfm_model": cfm,
        "vae_model": vae,
        "cond": reference_latent,
        "text": lyric_tokens,
        "duration": max_frames,
        "style_prompt": style,
        "negative_style_prompt": negative_style,
        "steps": steps,
        "sway_sampling_coef": sway,
        "start_time": first_timestamp,
        "file_type": file_type,
    }
    return inference(**inference_kwargs)

import re 
from transformers import pipeline

# Hugging Face model id used for the lyric-writing LLM below.
zephyr_model = "HuggingFaceH4/zephyr-7b-beta"
# NOTE(review): not referenced in the visible part of this file -- presumably
# kept as an alternative model id; confirm before removing.
mixtral_model = "mistralai/Mixtral-8x7B-Instruct-v0.1"

# Text-generation pipeline built once at import time (bfloat16 weights,
# automatic device placement); prepare_lyrics_with_llm calls it as `pipe`.
pipe = pipeline("text-generation", model=zephyr_model, torch_dtype=torch.bfloat16, device_map="auto")

def prepare_lyrics_with_llm(theme, tags, lyrics):
    """Ask the LLM for timestamped lyrics matching a theme and style.

    Builds a Zephyr-style chat prompt (system / user / assistant turns),
    runs the module-level text-generation ``pipe`` on it and returns only
    the model's completion.

    Args:
        theme: Theme of the song, interpolated into the prompt.
        tags: Music style tags, interpolated into the prompt.
        lyrics: Optional user lyrics to be formatted. ``None``, an empty
            string or whitespace-only text all mean "no lyrics supplied".

    Returns:
        The generated lyrics text with leading newlines removed.
    """
    # An empty textbox arrives as "", but the prompt below keys on the
    # literal "None" to signal that no lyrics were supplied.
    if isinstance(lyrics, str):
        lyrics = lyrics.strip() or None

    language = "en"
    standard_sys = f"""
Please generate a complete song with lyrics in {language}, following the {tags} style and centered around the theme "{theme}". If {lyrics} is provided, format it accordingly. If {lyrics} is None, generate original lyrics based on the given theme and style. Strictly adhere to the following requirements:

### Mandatory Formatting Rules
1. Only output the formatted lyrics—do not include any explanations, introductions, or additional messages.
2. Only include timestamps and lyrics. Do not use brackets, side notes, or section markers (e.g., chorus, instrumental, outro).
3. **Each line must start with a timestamp**, following the format [mm:ss.xx]Lyrics content, with no spaces between the timestamp and lyrics. The lyrics should be continuous and complete.
4. The total song length must not exceed 1 minute 30 seconds.
5. Timestamps should be naturally distributed. **The first lyric must not start at [00:00.00]**—there should always be an intro with no lyrics, and the first lyric should start around 8 to 10 seconds into the song. Do not start timestamps at [00:00.00].
6. The intro time should always be left blank (with no lyrics) before the first lyric, ensuring the song naturally begins after an intro section.
7. **Every single line must begin with a timestamp.** No line should be missing a timestamp.

### Prohibited Examples (Do Not Include)
- Incorrect: [01:30.00](Piano solo)
- Incorrect: [00:45.00][Chorus]
- Incorrect: Lyrics without a timestamp at the beginning of the line.
"""

    instruction = f"""
<|system|>
{standard_sys}</s>
<|user|>
theme: {theme}
tags: {tags}
lyrics: {lyrics}
"""

    # Close the user turn and open the assistant turn explicitly, as the
    # Zephyr chat template does, instead of hoping the model emits the
    # marker on its own.
    prompt = f"{instruction.strip()}</s>\n<|assistant|>\n"

    # return_full_text=False makes the pipeline return only the newly
    # generated text, so the system prompt can no longer leak into the
    # lyrics when the model does not reproduce the "<|assistant|>" marker.
    outputs = pipe(
        prompt,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
        return_full_text=False,
    )
    completion = outputs[0]["generated_text"]

    # Safety net: drop any echoed system..assistant span if the model still
    # reproduces the chat template inside its completion.
    pattern = r'\<\|system\|\>(.*?)\<\|assistant\|\>'
    cleaned_text = re.sub(pattern, '', completion, flags=re.DOTALL)

    print(f"SUGGESTED Lyrics: {cleaned_text}")
    return cleaned_text.lstrip("\n")

from gradio_client import Client
def generate_audio_ref(tags):
    """Fetch a reference audio clip for ``tags`` from the Mustango app.

    Connects to the ``declare-lab/mustango`` Gradio app and calls its
    ``/predict`` endpoint with the tags as prompt, 200 steps and a guidance
    value of 3.

    Args:
        tags: Music style tags sent as the text prompt.

    Returns:
        Whatever the remote ``/predict`` call returns; it is also printed.
    """
    mustango = Client("declare-lab/mustango")
    request = {
        "prompt": tags,
        "steps": 200,
        "guidance": 3,
        "api_name": "/predict",
    }
    reference = mustango.predict(**request)
    print(reference)
    return reference

def general_process(theme, tags, lyrics):
    """Run the full pipeline: lyrics, reference audio, then the song.

    Args:
        theme: Theme of the song, forwarded to the lyrics generator.
        tags: Music style tags, used for both the lyrics and the audio ref.
        lyrics: Optional user-provided lyrics, forwarded to the lyrics
            generator.

    Returns:
        A tuple ``(audio_ref, lyrics_result, generated_song)``.
        ``generated_song`` is ``None`` when either the lyrics or the audio
        reference came back empty, so no song could be generated.
    """
    gr.Info("Generating Lyrics")
    lyrics_result = prepare_lyrics_with_llm(theme, tags, lyrics)

    gr.Info("Generating audio ref")
    audio_ref = generate_audio_ref(tags)

    # Bind the result up front: previously it was only assigned inside the
    # `if`, so an empty lyrics/audio result raised UnboundLocalError at the
    # return statement instead of reporting what was missing.
    generated_song = None
    if lyrics_result and audio_ref:
        gr.Info("Generating Song")
        generated_song = infer_music(lyrics_result, audio_ref)
    else:
        gr.Warning("Song generation skipped: lyrics or audio reference is missing")

    return audio_ref, lyrics_result, generated_song

    
# Gradio UI: one column holding the three text inputs, the submit button and
# the three output components, in the order they are created below.
with gr.Blocks() as demo:
    with gr.Column():
        gr.Markdown("# Simpler Diff Rythm")

        # Inputs passed to general_process.
        theme_song = gr.Textbox(label="Theme")
        style_tags = gr.Textbox(label="Music style tags")
        lyrics = gr.Textbox(label="Lyrics optional")
        submit_btn = gr.Button("Submit")
        # Outputs filled from general_process's returned tuple.
        audio_ref = gr.Audio(label="Audio ref used")
        generated_lyrics = gr.Textbox(label="Generated Lyrics")
        song_result = gr.Audio(label="Your generated Song")

    # Clicking Submit runs the whole pipeline; the outputs list matches the
    # (audio_ref, lyrics_result, generated_song) order returned by
    # general_process.
    submit_btn.click(
        fn = general_process,
        inputs = [theme_song, style_tags, lyrics],
        outputs = [audio_ref, generated_lyrics, song_result]
    )

# Enable the request queue and start the app with the API page hidden,
# errors surfaced in the UI, and server-side rendering turned off.
demo.queue().launch(show_api=False, show_error=True, ssr_mode=False)