Spaces:
Running
on
Zero
Running
on
Zero
full model
Browse files- app.py +21 -15
- diffrhythm/infer/infer_utils.py +14 -5
- diffrhythm/model/cfm.py +1 -1
- diffrhythm/model/dit.py +6 -5
app.py
CHANGED
@@ -27,12 +27,18 @@ from diffrhythm.infer.infer import inference
|
|
27 |
|
28 |
MAX_SEED = np.iinfo(np.int32).max
|
29 |
device='cuda'
|
30 |
-
cfm, tokenizer, muq, vae = prepare_model(device)
|
31 |
cfm = torch.compile(cfm)
|
|
|
32 |
|
33 |
@spaces.GPU(duration=20)
|
34 |
-
def infer_music(lrc, ref_audio_path, text_prompt, current_prompt_type, seed=42, randomize_seed=False, steps=32, cfg_strength=4.0, file_type='wav', odeint_method='euler',
|
35 |
-
|
|
|
|
|
|
|
|
|
|
|
36 |
if randomize_seed:
|
37 |
seed = random.randint(0, MAX_SEED)
|
38 |
torch.manual_seed(seed)
|
@@ -48,7 +54,7 @@ def infer_music(lrc, ref_audio_path, text_prompt, current_prompt_type, seed=42,
|
|
48 |
raise gr.Error(f"Error: {str(e)}")
|
49 |
negative_style_prompt = get_negative_style_prompt(device)
|
50 |
latent_prompt = get_reference_latent(device, max_frames)
|
51 |
-
generated_song = inference(cfm_model=
|
52 |
vae_model=vae,
|
53 |
cond=latent_prompt,
|
54 |
text=lrc_prompt,
|
@@ -185,7 +191,7 @@ with gr.Blocks(css=css) as demo:
|
|
185 |
lines=12,
|
186 |
max_lines=50,
|
187 |
elem_classes="lyrics-scroll-box",
|
188 |
-
value="""[00:
|
189 |
)
|
190 |
|
191 |
current_prompt_type = gr.State(value="audio")
|
@@ -215,21 +221,20 @@ with gr.Blocks(css=css) as demo:
|
|
215 |
[00:10.00]Moonlight spills through broken blinds
|
216 |
[00:13.20]Your shadow dances on the dashboard shrine
|
217 |
```
|
218 |
-
|
219 |
-
|
220 |
-
- Total timestamps should not exceed 01:35.00 (95 seconds)
|
221 |
-
3. **Audio Prompt Requirements**
|
222 |
- Reference audio should be ≥ 1 second, audio >10 seconds will be randomly clipped into 10 seconds
|
223 |
- For optimal results, the 10-second clips should be carefully selected
|
224 |
- Shorter clips may lead to incoherent generation
|
225 |
-
|
226 |
- **Chinese and English**
|
227 |
- More languages coming soon
|
228 |
|
229 |
-
|
230 |
- If loading audio result is slow, you can select Output Format as mp3 in Advanced Settings.
|
231 |
|
232 |
""")
|
|
|
233 |
|
234 |
lyrics_btn = gr.Button("Generate", variant="primary")
|
235 |
audio_output = gr.Audio(label="Audio Result", type="filepath", elem_id="audio_output")
|
@@ -302,9 +307,9 @@ with gr.Blocks(css=css) as demo:
|
|
302 |
|
303 |
gr.Examples(
|
304 |
examples=[
|
305 |
-
["""[00:
|
306 |
-
["""[00:04.
|
307 |
-
["""[00:
|
308 |
],
|
309 |
|
310 |
inputs=[lrc],
|
@@ -313,6 +318,7 @@ with gr.Blocks(css=css) as demo:
|
|
313 |
elem_id="lrc-examples-container",
|
314 |
)
|
315 |
|
|
|
316 |
# page 2
|
317 |
with gr.Tab("Lyrics Generate", id=1):
|
318 |
with gr.Row():
|
@@ -403,7 +409,7 @@ with gr.Blocks(css=css) as demo:
|
|
403 |
|
404 |
lyrics_btn.click(
|
405 |
fn=infer_music,
|
406 |
-
inputs=[lrc, audio_prompt, text_prompt, current_prompt_type, seed, randomize_seed, steps, cfg_strength, file_type, odeint_method],
|
407 |
outputs=audio_output
|
408 |
)
|
409 |
|
|
|
27 |
|
28 |
MAX_SEED = np.iinfo(np.int32).max
|
29 |
device='cuda'
|
30 |
+
cfm, cfm_full, tokenizer, muq, vae = prepare_model(device)
|
31 |
cfm = torch.compile(cfm)
|
32 |
+
cfm_full = torch.compile(cfm_full)
|
33 |
|
34 |
@spaces.GPU(duration=20)
|
35 |
+
def infer_music(lrc, ref_audio_path, text_prompt, current_prompt_type, seed=42, randomize_seed=False, steps=32, cfg_strength=4.0, file_type='wav', odeint_method='euler', Music_Duration='95s', device='cuda'):
|
36 |
+
if Music_Duration == '95s':
|
37 |
+
max_frames = 2048
|
38 |
+
cfm_model = cfm
|
39 |
+
else:
|
40 |
+
max_frames = 6144
|
41 |
+
cfm_model = cfm_full
|
42 |
if randomize_seed:
|
43 |
seed = random.randint(0, MAX_SEED)
|
44 |
torch.manual_seed(seed)
|
|
|
54 |
raise gr.Error(f"Error: {str(e)}")
|
55 |
negative_style_prompt = get_negative_style_prompt(device)
|
56 |
latent_prompt = get_reference_latent(device, max_frames)
|
57 |
+
generated_song = inference(cfm_model=cfm_model,
|
58 |
vae_model=vae,
|
59 |
cond=latent_prompt,
|
60 |
text=lrc_prompt,
|
|
|
191 |
lines=12,
|
192 |
max_lines=50,
|
193 |
elem_classes="lyrics-scroll-box",
|
194 |
+
value="""[00:04.34]Tell me that I'm special\n[00:06.57]Tell me I look pretty\n[00:08.46]Tell me I'm a little angel\n[00:10.58]Sweetheart of your city\n[00:13.64]Say what I'm dying to hear\n[00:17.35]Cause I'm dying to hear you\n[00:20.86]Tell me I'm that new thing\n[00:22.93]Tell me that I'm relevant\n[00:24.96]Tell me that I got a big heart\n[00:27.04]Then back it up with evidence\n[00:29.94]I need it and I don't know why\n[00:34.28]This late at night\n[00:36.32]Isn't it lonely\n[00:39.24]I'd do anything to make you want me\n[00:43.40]I'd give it all up if you told me\n[00:47.42]That I'd be\n[00:49.43]The number one girl in your eyes\n[00:52.85]Your one and only\n[00:55.74]So what's it gon' take for you to want me\n[00:59.78]I'd give it all up if you told me\n[01:03.89]That I'd be\n[01:05.94]The number one girl in your eyes\n[01:11.34]Tell me I'm going real big places\n[01:14.32]Down to earth so friendly\n[01:16.30]And even through all the phases\n[01:18.46]Tell me you accept me\n[01:21.56]Well that's all I'm dying to hear\n[01:25.30]Yeah I'm dying to hear you\n[01:28.91]Tell me that you need me\n[01:30.85]Tell me that I'm loved\n[01:32.90]Tell me that I'm worth it\n[01:34.95]And that I'm enough\n[01:37.91]I need it and I don't know why\n[01:42.08]This late at night\n[01:44.24]Isn't it lonely\n[01:47.18]I'd do anything to make you want me\n[01:51.30]I'd give it all up if you told me\n[01:55.32]That I'd be\n[01:57.35]The number one girl in your eyes\n[02:00.72]Your one and only\n[02:03.57]So what's it gon' take for you to want me\n[02:07.78]I'd give it all up if you told me\n[02:11.74]That I'd be\n[02:13.86]The number one girl in your eyes\n[02:17.03]The girl in your eyes\n[02:21.05]The girl in your eyes\n[02:26.30]Tell me I'm the number one girl\n[02:28.44]I'm the number one girl in your eyes\n[02:33.49]The girl in your eyes\n[02:37.58]The girl in your eyes\n[02:42.74]Tell me I'm the number one girl\n[02:44.88]I'm the number one girl in your eyes\n[02:49.91]Well 
isn't it lonely\n[02:53.19]I'd do anything to make you want me\n[02:57.10]I'd give it all up if you told me\n[03:01.15]That I'd be\n[03:03.31]The number one girl in your eyes\n[03:06.57]Your one and only\n[03:09.42]So what's it gon' take for you to want me\n[03:13.50]I'd give it all up if you told me\n[03:17.56]That I'd be\n[03:19.66]The number one girl in your eyes\n[03:25.74]The number one girl in your eyes"""
|
195 |
)
|
196 |
|
197 |
current_prompt_type = gr.State(value="audio")
|
|
|
221 |
[00:10.00]Moonlight spills through broken blinds
|
222 |
[00:13.20]Your shadow dances on the dashboard shrine
|
223 |
```
|
224 |
+
|
225 |
+
2. **Audio Prompt Requirements**
|
|
|
|
|
226 |
- Reference audio should be ≥ 1 second, audio >10 seconds will be randomly clipped into 10 seconds
|
227 |
- For optimal results, the 10-second clips should be carefully selected
|
228 |
- Shorter clips may lead to incoherent generation
|
229 |
+
3. **Supported Languages**
|
230 |
- **Chinese and English**
|
231 |
- More languages coming soon
|
232 |
|
233 |
+
4. **Others**
|
234 |
- If loading audio result is slow, you can select Output Format as mp3 in Advanced Settings.
|
235 |
|
236 |
""")
|
237 |
+
Music_Duration = gr.Radio(["95s", "285s"], label="Music Duration", value="95s")
|
238 |
|
239 |
lyrics_btn = gr.Button("Generate", variant="primary")
|
240 |
audio_output = gr.Audio(label="Audio Result", type="filepath", elem_id="audio_output")
|
|
|
307 |
|
308 |
gr.Examples(
|
309 |
examples=[
|
310 |
+
["""[00:04.34]Tell me that I'm special\n[00:06.57]Tell me I look pretty\n[00:08.46]Tell me I'm a little angel\n[00:10.58]Sweetheart of your city\n[00:13.64]Say what I'm dying to hear\n[00:17.35]Cause I'm dying to hear you\n[00:20.86]Tell me I'm that new thing\n[00:22.93]Tell me that I'm relevant\n[00:24.96]Tell me that I got a big heart\n[00:27.04]Then back it up with evidence\n[00:29.94]I need it and I don't know why\n[00:34.28]This late at night\n[00:36.32]Isn't it lonely\n[00:39.24]I'd do anything to make you want me\n[00:43.40]I'd give it all up if you told me\n[00:47.42]That I'd be\n[00:49.43]The number one girl in your eyes\n[00:52.85]Your one and only\n[00:55.74]So what's it gon' take for you to want me\n[00:59.78]I'd give it all up if you told me\n[01:03.89]That I'd be\n[01:05.94]The number one girl in your eyes\n[01:11.34]Tell me I'm going real big places\n[01:14.32]Down to earth so friendly\n[01:16.30]And even through all the phases\n[01:18.46]Tell me you accept me\n[01:21.56]Well that's all I'm dying to hear\n[01:25.30]Yeah I'm dying to hear you\n[01:28.91]Tell me that you need me\n[01:30.85]Tell me that I'm loved\n[01:32.90]Tell me that I'm worth it\n[01:34.95]And that I'm enough\n[01:37.91]I need it and I don't know why\n[01:42.08]This late at night\n[01:44.24]Isn't it lonely\n[01:47.18]I'd do anything to make you want me\n[01:51.30]I'd give it all up if you told me\n[01:55.32]That I'd be\n[01:57.35]The number one girl in your eyes\n[02:00.72]Your one and only\n[02:03.57]So what's it gon' take for you to want me\n[02:07.78]I'd give it all up if you told me\n[02:11.74]That I'd be\n[02:13.86]The number one girl in your eyes\n[02:17.03]The girl in your eyes\n[02:21.05]The girl in your eyes\n[02:26.30]Tell me I'm the number one girl\n[02:28.44]I'm the number one girl in your eyes\n[02:33.49]The girl in your eyes\n[02:37.58]The girl in your eyes\n[02:42.74]Tell me I'm the number one girl\n[02:44.88]I'm the number one girl in your eyes\n[02:49.91]Well isn't 
it lonely\n[02:53.19]I'd do anything to make you want me\n[02:57.10]I'd give it all up if you told me\n[03:01.15]That I'd be\n[03:03.31]The number one girl in your eyes\n[03:06.57]Your one and only\n[03:09.42]So what's it gon' take for you to want me\n[03:13.50]I'd give it all up if you told me\n[03:17.56]That I'd be\n[03:19.66]The number one girl in your eyes\n[03:25.74]The number one girl in your eyes"""],
|
311 |
+
["""[00:00.52]Abracadabra abracadabra\n[00:03.97]Ha\n[00:04.66]Abracadabra abracadabra\n[00:12.02]Yeah\n[00:15.80]Pay the toll to the angels\n[00:19.08]Drawin' circles in the clouds\n[00:23.31]Keep your mind on the distance\n[00:26.67]When the devil turns around\n[00:30.95]Hold me in your heart tonight\n[00:34.11]In the magic of the dark moonlight\n[00:38.44]Save me from this empty fight\n[00:43.83]In the game of life\n[00:45.84]Like a poem said by a lady in red\n[00:49.45]You hear the last few words of your life\n[00:53.15]With a haunting dance now you're both in a trance\n[00:56.90]It's time to cast your spell on the night\n[01:01.40]Abracadabra ama-ooh-na-na\n[01:04.88]Abracadabra porta-ooh-ga-ga\n[01:08.92]Abracadabra abra-ooh-na-na\n[01:12.30]In her tongue she's sayin'\n[01:14.76]Death or love tonight\n[01:18.61]Abracadabra abracadabra\n[01:22.18]Abracadabra abracadabra\n[01:26.08]Feel the beat under your feet\n[01:27.82]The floor's on fire\n[01:29.90]Abracadabra abracadabra\n[01:33.78]Choose the road on the west side\n[01:37.09]As the dust flies watch it burn\n[01:41.45]Don't waste time on feeling\n[01:44.64]Your depression won't return\n[01:49.15]Hold me in your heart tonight\n[01:52.21]In the magic of the dark moonlight\n[01:56.54]Save me from this empty fight\n[02:01.77]In the game of life\n[02:03.94]Like a poem said by a lady in red\n[02:07.52]You hear the last few words of your life\n[02:11.19]With a haunting dance now you're both in a trance\n[02:14.95]It's time to cast your spell on the night\n[02:19.53]Abracadabra ama-ooh-na-na\n[02:22.71]Abracadabra porta-ooh-ga-ga\n[02:26.94]Abracadabra abra-ooh-na-na\n[02:30.42]In her tongue she's sayin'\n[02:32.83]Death or love tonight\n[02:36.55]Abracadabra abracadabra\n[02:40.27]Abracadabra abracadabra\n[02:44.19]Feel the beat under your feet\n[02:46.14]The floor's on fire\n[02:47.95]Abracadabra abracadabra\n[02:51.17]Phantom of the dance floor come to me\n[02:58.46]Sing for me a sinful 
melody\n[03:06.51]Ah-ah-ah-ah-ah ah-ah ah-ah\n[03:13.76]Ah-ah-ah-ah-ah ah-ah ah-ah\n[03:22.39]Abracadabra ama-ooh-na-na\n[03:25.66]Abracadabra porta-ooh-ga-ga\n[03:29.87]Abracadabra abra-ooh-na-na\n[03:33.16]In her tongue she's sayin'\n[03:35.55]Death or love tonight"""],
|
312 |
+
["""[00:00.27]只因你太美 baby 只因你太美 baby\n[00:08.95]只因你实在是太美 baby\n[00:13.99]只因你太美 baby\n[00:18.89]迎面走来的你让我如此蠢蠢欲动\n[00:20.88]这种感觉我从未有\n[00:21.79]Cause I got a crush on you who you\n[00:25.74]你是我的我是你的谁\n[00:28.09]再多一眼看一眼就会爆炸\n[00:30.31]再近一点靠近点快被融化\n[00:32.49]想要把你占为己有 baby bae\n[00:34.60]不管走到哪里\n[00:35.44]都会想起的人是你 you you\n[00:38.12]我应该拿你怎样\n[00:39.61]Uh 所有人都在看着你\n[00:42.36]我的心总是不安\n[00:44.18]Oh 我现在已病入膏肓\n[00:46.63]Eh oh\n[00:47.84]难道真的因你而疯狂吗\n[00:51.57]我本来不是这种人\n[00:53.59]因你变成奇怪的人\n[00:55.77]第一次呀变成这样的我\n[01:01.23]不管我怎么去否认\n[01:03.21]只因你太美 baby 只因你太美 baby\n[01:11.46]只因你实在是太美 baby\n[01:16.75]只因你太美 baby\n[01:21.09]Oh eh oh\n[01:22.82]现在确认地告诉我\n[01:25.26]Oh eh oh\n[01:27.31]你到底属于谁\n[01:29.98]Oh eh oh\n[01:31.70]现在确认地告诉我\n[01:34.45]Oh eh oh\n[01:36.35]你到底属于谁\n[01:37.65]就是现在告诉我\n[01:40.00]跟着那节奏 缓缓 make wave\n[01:42.42]甜蜜的奶油 it's your birthday cake\n[01:44.66]男人们的 game call me 你恋人\n[01:46.83]别被欺骗愉快的 I wanna play\n[01:48.83]我的脑海每分每秒为你一人沉醉\n[01:50.90]最迷人让我神魂颠倒是你身上香水\n[01:53.30]Oh right baby I'm fall in love with you\n[01:55.20]我的一切你都拿走\n[01:56.40]只要有你就已足够\n[01:58.56]我到底应该怎样\n[02:00.37]Uh 我心里一直很不安\n[02:03.12]其他男人们的视线\n[02:04.84]Oh 全都只看着你的脸\n[02:07.33]Eh oh\n[02:08.39]难道真的因你而疯狂吗\n[02:12.43]我本来不是这种人\n[02:14.35]因你变成奇怪的人\n[02:16.59]第一次呀变成这样的我\n[02:21.76]不管我怎么去否认\n[02:24.03]只因你太美 baby 只因你太美 baby\n[02:32.37]只因你实在是太美 baby\n[02:37.49]只因你太美 baby\n[02:43.66]我愿意把我的全部都给你\n[02:47.19]我每天在梦里都梦见你\n[02:49.13]还有我闭着眼睛也能看到你\n[02:52.58]现在开始我只准你看我\n[02:56.28]I don't wanna wake up in dream\n[02:57.92]我只想看你这是真心话\n[02:59.86]只因你太美 baby 只因你太美 baby\n[03:08.20]只因你实在是太美 baby\n[03:13.22]只因你太美 baby\n[03:17.69]Oh eh oh\n[03:19.36]现在确认的告诉我\n[03:21.91]Oh eh oh\n[03:23.85]你到底属于谁\n[03:26.58]Oh eh oh\n[03:28.32]现在确认的告诉我\n[03:30.95]Oh eh oh\n[03:32.82]你到底属于谁就是现在告诉我"""]
|
313 |
],
|
314 |
|
315 |
inputs=[lrc],
|
|
|
318 |
elem_id="lrc-examples-container",
|
319 |
)
|
320 |
|
321 |
+
|
322 |
# page 2
|
323 |
with gr.Tab("Lyrics Generate", id=1):
|
324 |
with gr.Row():
|
|
|
409 |
|
410 |
lyrics_btn.click(
|
411 |
fn=infer_music,
|
412 |
+
inputs=[lrc, audio_prompt, text_prompt, current_prompt_type, seed, randomize_seed, steps, cfg_strength, file_type, odeint_method, Music_Duration],
|
413 |
outputs=audio_output
|
414 |
)
|
415 |
|
diffrhythm/infer/infer_utils.py
CHANGED
@@ -13,18 +13,27 @@ from diffrhythm.model import DiT, CFM
|
|
13 |
def prepare_model(device):
|
14 |
# prepare cfm model
|
15 |
dit_ckpt_path = hf_hub_download(repo_id="ASLP-lab/DiffRhythm-base", filename="cfm_model.pt")
|
|
|
16 |
dit_config_path = "./diffrhythm/config/diffrhythm-1b.json"
|
17 |
with open(dit_config_path) as f:
|
18 |
model_config = json.load(f)
|
19 |
dit_model_cls = DiT
|
20 |
cfm = CFM(
|
21 |
-
transformer=dit_model_cls(**model_config["model"], use_style_prompt=True),
|
22 |
num_channels=model_config["model"]['mel_dim'],
|
23 |
use_style_prompt=True
|
24 |
)
|
25 |
cfm = cfm.to(device)
|
26 |
cfm = load_checkpoint(cfm, dit_ckpt_path, device=device, use_ema=False)
|
27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
# prepare tokenizer
|
29 |
tokenizer = CNENTokenizer()
|
30 |
|
@@ -35,8 +44,9 @@ def prepare_model(device):
|
|
35 |
# prepare vae
|
36 |
vae_ckpt_path = hf_hub_download(repo_id="ASLP-lab/DiffRhythm-vae", filename="vae_model.pt")
|
37 |
vae = torch.jit.load(vae_ckpt_path, map_location='cpu').to(device)
|
38 |
-
return cfm, tokenizer, muq, vae
|
39 |
|
|
|
|
|
40 |
|
41 |
# for song edit, will be added in the future
|
42 |
def get_reference_latent(device, max_frames):
|
@@ -114,9 +124,8 @@ class CNENTokenizer():
|
|
114 |
def decode(self, token):
|
115 |
return "|".join([self.id2phone[x-1] for x in token])
|
116 |
|
117 |
-
def get_lrc_token(text, tokenizer, device):
|
118 |
|
119 |
-
max_frames = 2048
|
120 |
lyrics_shift = 0
|
121 |
sampling_rate = 44100
|
122 |
downsample_rate = 2048
|
@@ -138,7 +147,7 @@ def get_lrc_token(text, tokenizer, device):
|
|
138 |
lrc_with_time = modified_lrc_with_time
|
139 |
|
140 |
lrc_with_time = [(time_start, line) for (time_start, line) in lrc_with_time if time_start < max_secs]
|
141 |
-
lrc_with_time = lrc_with_time[:-1] if len(lrc_with_time) >= 1 else lrc_with_time
|
142 |
|
143 |
normalized_start_time = 0.
|
144 |
|
|
|
13 |
def prepare_model(device):
|
14 |
# prepare cfm model
|
15 |
dit_ckpt_path = hf_hub_download(repo_id="ASLP-lab/DiffRhythm-base", filename="cfm_model.pt")
|
16 |
+
dit_full_ckpt_path = hf_hub_download(repo_id="ASLP-lab/DiffRhythm-full", filename="cfm_model.pt")
|
17 |
dit_config_path = "./diffrhythm/config/diffrhythm-1b.json"
|
18 |
with open(dit_config_path) as f:
|
19 |
model_config = json.load(f)
|
20 |
dit_model_cls = DiT
|
21 |
cfm = CFM(
|
22 |
+
transformer=dit_model_cls(**model_config["model"], use_style_prompt=True, max_pos=2048),
|
23 |
num_channels=model_config["model"]['mel_dim'],
|
24 |
use_style_prompt=True
|
25 |
)
|
26 |
cfm = cfm.to(device)
|
27 |
cfm = load_checkpoint(cfm, dit_ckpt_path, device=device, use_ema=False)
|
28 |
|
29 |
+
cfm_full = CFM(
|
30 |
+
transformer=dit_model_cls(**model_config["model"], use_style_prompt=True, max_pos=6144),
|
31 |
+
num_channels=model_config["model"]['mel_dim'],
|
32 |
+
use_style_prompt=True
|
33 |
+
)
|
34 |
+
cfm_full = cfm_full.to(device)
|
35 |
+
cfm_full = load_checkpoint(cfm_full, dit_full_ckpt_path, device=device, use_ema=False)
|
36 |
+
|
37 |
# prepare tokenizer
|
38 |
tokenizer = CNENTokenizer()
|
39 |
|
|
|
44 |
# prepare vae
|
45 |
vae_ckpt_path = hf_hub_download(repo_id="ASLP-lab/DiffRhythm-vae", filename="vae_model.pt")
|
46 |
vae = torch.jit.load(vae_ckpt_path, map_location='cpu').to(device)
|
|
|
47 |
|
48 |
+
return cfm, cfm_full, tokenizer, muq, vae
|
49 |
+
|
50 |
|
51 |
# for song edit, will be added in the future
|
52 |
def get_reference_latent(device, max_frames):
|
|
|
124 |
def decode(self, token):
|
125 |
return "|".join([self.id2phone[x-1] for x in token])
|
126 |
|
127 |
+
def get_lrc_token(max_frames, text, tokenizer, device):
|
128 |
|
|
|
129 |
lyrics_shift = 0
|
130 |
sampling_rate = 44100
|
131 |
downsample_rate = 2048
|
|
|
147 |
lrc_with_time = modified_lrc_with_time
|
148 |
|
149 |
lrc_with_time = [(time_start, line) for (time_start, line) in lrc_with_time if time_start < max_secs]
|
150 |
+
# lrc_with_time = lrc_with_time[:-1] if len(lrc_with_time) >= 1 else lrc_with_time
|
151 |
|
152 |
normalized_start_time = 0.
|
153 |
|
diffrhythm/model/cfm.py
CHANGED
@@ -105,7 +105,7 @@ class CFM(nn.Module):
|
|
105 |
cfg_strength=4.0,
|
106 |
sway_sampling_coef=None,
|
107 |
seed: int | None = None,
|
108 |
-
max_duration=
|
109 |
vocoder: Callable[[float["b d n"]], float["b nw"]] | None = None, # noqa: F722
|
110 |
no_ref_audio=False,
|
111 |
duplicate_test=False,
|
|
|
105 |
cfg_strength=4.0,
|
106 |
sway_sampling_coef=None,
|
107 |
seed: int | None = None,
|
108 |
+
max_duration=6144,
|
109 |
vocoder: Callable[[float["b d n"]], float["b nw"]] | None = None, # noqa: F722
|
110 |
no_ref_audio=False,
|
111 |
duplicate_test=False,
|
diffrhythm/model/dit.py
CHANGED
@@ -31,13 +31,13 @@ from diffrhythm.model.modules import (
|
|
31 |
|
32 |
# Text embedding
|
33 |
class TextEmbedding(nn.Module):
|
34 |
-
def __init__(self, text_num_embeds, text_dim, conv_layers=0, conv_mult=2):
|
35 |
super().__init__()
|
36 |
self.text_embed = nn.Embedding(text_num_embeds + 1, text_dim) # use 0 as filler token
|
37 |
|
38 |
if conv_layers > 0:
|
39 |
self.extra_modeling = True
|
40 |
-
self.precompute_max_pos =
|
41 |
self.register_buffer("freqs_cis", precompute_freqs_cis(text_dim, self.precompute_max_pos), persistent=False)
|
42 |
self.text_blocks = nn.Sequential(
|
43 |
*[ConvNeXtV2Block(text_dim, text_dim * conv_mult) for _ in range(conv_layers)]
|
@@ -103,7 +103,8 @@ class DiT(nn.Module):
|
|
103 |
text_dim=None,
|
104 |
conv_layers=0,
|
105 |
long_skip_connection=False,
|
106 |
-
use_style_prompt=False
|
|
|
107 |
):
|
108 |
super().__init__()
|
109 |
|
@@ -112,14 +113,14 @@ class DiT(nn.Module):
|
|
112 |
self.start_time_embed = TimestepEmbedding(cond_dim)
|
113 |
if text_dim is None:
|
114 |
text_dim = mel_dim
|
115 |
-
self.text_embed = TextEmbedding(text_num_embeds, text_dim, conv_layers=conv_layers)
|
116 |
self.input_embed = InputEmbedding(mel_dim, text_dim, dim, cond_dim=cond_dim)
|
117 |
|
118 |
|
119 |
self.dim = dim
|
120 |
self.depth = depth
|
121 |
|
122 |
-
llama_config = LlamaConfig(hidden_size=dim, intermediate_size=dim * ff_mult, hidden_act='silu')
|
123 |
llama_config._attn_implementation = 'sdpa'
|
124 |
|
125 |
self.transformer_blocks = nn.ModuleList(
|
|
|
31 |
|
32 |
# Text embedding
|
33 |
class TextEmbedding(nn.Module):
|
34 |
+
def __init__(self, text_num_embeds, text_dim, max_pos, conv_layers=0, conv_mult=2):
|
35 |
super().__init__()
|
36 |
self.text_embed = nn.Embedding(text_num_embeds + 1, text_dim) # use 0 as filler token
|
37 |
|
38 |
if conv_layers > 0:
|
39 |
self.extra_modeling = True
|
40 |
+
self.precompute_max_pos = max_pos # caller-supplied number of positions used to precompute the freqs_cis buffer
|
41 |
self.register_buffer("freqs_cis", precompute_freqs_cis(text_dim, self.precompute_max_pos), persistent=False)
|
42 |
self.text_blocks = nn.Sequential(
|
43 |
*[ConvNeXtV2Block(text_dim, text_dim * conv_mult) for _ in range(conv_layers)]
|
|
|
103 |
text_dim=None,
|
104 |
conv_layers=0,
|
105 |
long_skip_connection=False,
|
106 |
+
use_style_prompt=False,
|
107 |
+
max_pos=2048,
|
108 |
):
|
109 |
super().__init__()
|
110 |
|
|
|
113 |
self.start_time_embed = TimestepEmbedding(cond_dim)
|
114 |
if text_dim is None:
|
115 |
text_dim = mel_dim
|
116 |
+
self.text_embed = TextEmbedding(text_num_embeds, text_dim, conv_layers=conv_layers, max_pos=max_pos)
|
117 |
self.input_embed = InputEmbedding(mel_dim, text_dim, dim, cond_dim=cond_dim)
|
118 |
|
119 |
|
120 |
self.dim = dim
|
121 |
self.depth = depth
|
122 |
|
123 |
+
llama_config = LlamaConfig(hidden_size=dim, intermediate_size=dim * ff_mult, hidden_act='silu', max_position_embeddings=max_pos)
|
124 |
llama_config._attn_implementation = 'sdpa'
|
125 |
|
126 |
self.transformer_blocks = nn.ModuleList(
|