Integrate unlimited non-melody guided music (no 30 second limit)
Files changed:
- app.py +67 -37
- audiocraft/utils/extend.py +6 -2
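In outline: MusicGen can only sample up to its training window (MODEL.lm.cfg.dataset.segment_duration, 30 seconds) in one pass, so the new predict loop generates the track window by window, re-priming each later window with the last few seconds of the previous one via generate_continuation. A minimal sketch of that idea, assuming audiocraft's MusicGen API (set_generation_params, generate, generate_continuation); generate_long and its parameters are hypothetical names:

def generate_long(model, text: str, duration: int, overlap: int = 5):
    """Sketch only: chain fixed-size windows into one long generation."""
    window = model.lm.cfg.dataset.segment_duration  # training window, ~30 s
    segments = []
    while duration > 0:
        if not segments:  # first window: plain text-to-music
            segment_duration = min(duration, window)
            model.set_generation_params(duration=segment_duration)
            segments.append(model.generate(descriptions=[text], progress=True))
            duration -= segment_duration
        else:  # later windows: re-prime with the last `overlap` seconds
            segment_duration = min(duration + overlap, window)
            model.set_generation_params(duration=segment_duration)
            last_chunk = segments[-1][:, :, -overlap * model.sample_rate:]
            segments.append(model.generate_continuation(
                last_chunk, model.sample_rate, descriptions=[text], progress=True))
            duration -= segment_duration - overlap
    return segments  # stitched together downstream, overlap trimmed at each seam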
app.py
CHANGED
@@ -33,47 +33,77 @@ def predict(model, text, melody, duration, dimension, topk, topp, temperature, c
     if MODEL is None or MODEL.name != model:
         MODEL = load_model(model)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    output = None
+    segment_duration = duration
+    initial_duration = duration
+    output_segments = []
+    while duration > 0:
+        if not output_segments: # first pass of long or short song
+            if segment_duration > MODEL.lm.cfg.dataset.segment_duration:
+                segment_duration = MODEL.lm.cfg.dataset.segment_duration
+            else:
+                segment_duration = duration
+        else: # next pass of long song
+            if duration + overlap < MODEL.lm.cfg.dataset.segment_duration:
+                segment_duration = duration + overlap
+            else:
+                segment_duration = MODEL.lm.cfg.dataset.segment_duration
+        # implement seed
+        if seed < 0:
+            seed = random.randint(0, 0xffff_ffff_ffff)
+        torch.manual_seed(seed)
 
-
-
-
+        print(f'Segment duration: {segment_duration}, duration: {duration}, overlap: {overlap}')
+        MODEL.set_generation_params(
+            use_sampling=True,
+            top_k=topk,
+            top_p=topp,
+            temperature=temperature,
+            cfg_coef=cfg_coef,
+            duration=segment_duration,
+        )
+
+        if melody:
+            # todo return excess duration, load next model and continue in loop structure building up output_segments
+            if duration > MODEL.lm.cfg.dataset.segment_duration:
+                output_segments, duration = generate_music_segments(text, melody, MODEL, seed, duration, overlap, MODEL.lm.cfg.dataset.segment_duration)
+            else:
+                # pure original code
+                sr, melody = melody[0], torch.from_numpy(melody[1]).to(MODEL.device).float().t().unsqueeze(0)
+                print(melody.shape)
+                if melody.dim() == 2:
+                    melody = melody[None]
+                melody = melody[..., :int(sr * MODEL.lm.cfg.dataset.segment_duration)]
+                output = MODEL.generate_with_chroma(
+                    descriptions=[text],
+                    melody_wavs=melody,
+                    melody_sample_rate=sr,
+                    progress=True
+                )
+            # All output_segments are populated, so we can break the loop or set duration to 0
+            break
         else:
-            #
-
-
-
-
-
-
-
-
-                melody_sample_rate=sr,
-                progress=True
-            )
-    else:
-        output = MODEL.generate(descriptions=[text], progress=False)
+            #output = MODEL.generate(descriptions=[text], progress=False)
+            if not output_segments:
+                next_segment = MODEL.generate(descriptions=[text], progress=True)
+                duration -= segment_duration
+            else:
+                last_chunk = output_segments[-1][:, :, -overlap*MODEL.sample_rate:]
+                next_segment = MODEL.generate_continuation(last_chunk, MODEL.sample_rate, descriptions=[text], progress=True)
+                duration -= segment_duration - overlap
+            output_segments.append(next_segment)
 
     if output_segments:
         try:
-            # Combine the output segments into one long audio file
-            output_segments = [segment.detach().cpu().float()[0] for segment in output_segments]
-            output = torch.cat(output_segments, dim=dimension)
+            # Combine the output segments into one long audio file or stack tracks
+            #output_segments = [segment.detach().cpu().float()[0] for segment in output_segments]
+            #output = torch.cat(output_segments, dim=dimension)
+
+            output = output_segments[0]
+            for i in range(1, len(output_segments)):
+                overlap_samples = overlap * MODEL.sample_rate
+                output = torch.cat([output[:, :, :-overlap_samples], output_segments[i][:, :, overlap_samples:]], dim=2)
+            output = output.detach().cpu().float()[0]
         except Exception as e:
             print(f"Error combining segments: {e}. Using the first segment only.")
             output = output_segments[0].detach().cpu().float()[0]
@@ -81,7 +111,7 @@ def predict(model, text, melody, duration, dimension, topk, topp, temperature, c
         output = output.detach().cpu().float()[0]
     with NamedTemporaryFile("wb", suffix=".wav", delete=False) as file:
         if include_settings:
-            video_description = f"{text}\n Duration: {str(
+            video_description = f"{text}\n Duration: {str(initial_duration)} Dimension: {dimension}\n Top-k:{topk} Top-p:{topp}\n Randomness:{temperature}\n cfg:{cfg_coef} overlap: {overlap}\n Seed: {seed}"
            background = add_settings_to_image(title, video_description, background_path=background, font=settings_font, font_color=settings_font_color)
         audio_write(
             file.name, output, MODEL.sample_rate, strategy="loudness",
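A note on the stitching arithmetic in the try block above: each torch.cat trims overlap_samples from the tail of the running output and from the head of the next segment. A self-contained sketch with dummy tensors (the 32 kHz rate and segment sizes are illustrative):

import torch

sample_rate = 32000      # illustrative; MusicGen outputs 32 kHz audio
overlap = 5              # seconds shared between neighbouring segments
overlap_samples = overlap * sample_rate

# Dummy segments shaped [batch, channels, samples], like MODEL.generate output.
segments = [torch.randn(1, 1, 30 * sample_rate) for _ in range(3)]

# Same stitch as the diff: drop the trailing overlap of the running output
# and the leading overlap of the next segment, then join on the sample axis.
output = segments[0]
for seg in segments[1:]:
    output = torch.cat([output[:, :, :-overlap_samples],
                        seg[:, :, overlap_samples:]], dim=2)

# Each of the two seams removes 2 * overlap seconds: 3*30 - 2*10 = 70.
print(output.shape[-1] / sample_rate)  # 70.0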
audiocraft/utils/extend.py
CHANGED
@@ -43,10 +43,14 @@ def generate_music_segments(text, melody, MODEL, seed, duration:int=10, overlap:
 
     # Calculate the total number of segments
     total_segments = max(math.ceil(duration / segment_duration),1)
-
+
+    #calc excess duration
+    excess_duration = total_segments * segment_duration - duration
+    print(f"total Segments to Generate: {total_segments} for {duration} seconds. Each segment is {segment_duration} seconds. Excess {excess_duration}")
 
     # If melody_segments is shorter than total_segments, repeat the segments until the total_segments is reached
     if len(melody_segments) < total_segments:
+        #fix melody_segments
         for i in range(total_segments - len(melody_segments)):
             segment = melody_segments[i]
             melody_segments.append(segment)
@@ -78,7 +82,7 @@ def generate_music_segments(text, melody, MODEL, seed, duration:int=10, overlap:
         #output_segments.append(output[:, :segment_duration])
         output_segments.append(output)
         print(f"output_segments: {len(output_segments)}: shape: {output.shape} dim {output.dim()}")
-    return output_segments
+    return output_segments, excess_duration
 
 def save_image(image):
     """