from music.music import get_random_spit, get_albums
from vits.models import SynthesizerInfer
from omegaconf import OmegaConf
import torchcrepe
import torch
import os
import gradio as gr
import librosa
import numpy as np
import soundfile
import random
import logging

logging.getLogger('numba').setLevel(logging.WARNING)
logging.getLogger('markdown_it').setLevel(logging.WARNING)
logging.getLogger('urllib3').setLevel(logging.WARNING)
logging.getLogger('matplotlib').setLevel(logging.WARNING)


def load_svc_model(checkpoint_path, model):
    assert os.path.isfile(checkpoint_path)
    checkpoint_dict = torch.load(checkpoint_path, map_location="cpu")
    saved_state_dict = checkpoint_dict["model_g"]
    state_dict = model.state_dict()
    new_state_dict = {}
    # Copy weights key-by-key against the model's own state_dict, so a
    # checkpoint missing any generator key fails fast with a KeyError
    for k, v in state_dict.items():
        new_state_dict[k] = saved_state_dict[k]
    model.load_state_dict(new_state_dict)
    return model
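
# Note on the loader above: only the "model_g" (generator) entry of the
# checkpoint is read; any other entries the training run may have saved
# (optimizer state, discriminator weights, and so on) are ignored at inference.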


def compute_f0_nn(filename, device):
    # Load audio at the 16 kHz rate CREPE expects
    audio, sr = librosa.load(filename, sr=16000)
    assert sr == 16000
    audio = torch.tensor(np.copy(audio))[None]
    # Here we'll use a 20 millisecond hop length
    hop_length = 320
    # Provide a sensible frequency range for your domain (torchcrepe's upper
    # limit is 2006 Hz); this is a reasonable range for speech and singing
    fmin = 50
    fmax = 1000
    # Select a model capacity: one of "tiny" or "full"
    model = "full"
    # Pick a batch size that doesn't cause memory errors on your GPU
    batch_size = 512
    # Compute pitch and periodicity on the chosen device
    pitch, periodicity = torchcrepe.predict(
        audio,
        sr,
        hop_length,
        fmin,
        fmax,
        model,
        batch_size=batch_size,
        device=device,
        return_periodicity=True,
    )
    # Duplicate each frame: one 320-sample hop -> two 160-sample hops.
    # repeat_interleave keeps these as tensors; np.repeat would silently
    # convert them to numpy arrays, which the torchcrepe filters below reject.
    pitch = torch.repeat_interleave(pitch, 2, dim=-1)
    periodicity = torch.repeat_interleave(periodicity, 2, dim=-1)
    # CREPE was not trained on silent audio, so smooth the estimates and
    # zero out frames whose low periodicity indicates silence or noise
    periodicity = torchcrepe.filter.median(periodicity, 9)
    pitch = torchcrepe.filter.mean(pitch, 9)
    pitch[periodicity < 0.1] = 0
    pitch = pitch.squeeze(0)
    return pitch
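
# Frame-rate bookkeeping (a sketch of the arithmetic, assuming the usual
# 48 kHz config with a 480-sample hop): a 320-sample hop at 16 kHz is 20 ms,
# i.e. 50 frames per second; repeat-interleaving by 2 gives 10 ms frames,
# which is 160 samples at 16 kHz and 480 samples at 48 kHz, so pitch, PPG,
# and the synthesizer's hop all line up one-to-one.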


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
hp = OmegaConf.load("configs/base.yaml")
model = SynthesizerInfer(
    hp.data.filter_length // 2 + 1,
    hp.data.segment_size // hp.data.hop_length,
    hp)
load_svc_model("vits_pretrain/sovits5.0-48k-debug.pth", model)
model.eval()
model.to(device)
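
# Constructor arithmetic (the actual numbers come from configs/base.yaml): the
# first argument is the spectrogram bin count, filter_length // 2 + 1 (e.g.
# 1025 for a 2048-point FFT), and the second is frames per segment,
# segment_size // hop_length; hp supplies the remaining hyperparameters.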


def svc_change(argswave, argsspk):
    argsppg = "svc_tmp.ppg.npy"
    # Extract Whisper PPG (content) features with the bundled helper script
    os.system(f"python whisper/inference.py -w {argswave} -p {argsppg}")

    spk = np.load(argsspk)
    spk = torch.FloatTensor(spk)

    ppg = np.load(argsppg)
    ppg = np.repeat(ppg, 2, 0)  # one 320-sample PPG frame -> two 160-sample frames
    ppg = torch.FloatTensor(ppg)

    pit = compute_f0_nn(argswave, device)
    pit = pit.float()  # compute_f0_nn already returns a CPU tensor

    # Trim pitch and PPG to a common frame count
    len_pit = pit.size()[0]
    len_ppg = ppg.size()[0]
    len_min = min(len_pit, len_ppg)
    pit = pit[:len_min]
    ppg = ppg[:len_min, :]

    with torch.no_grad():
        spk = spk.unsqueeze(0).to(device)
        source = pit.unsqueeze(0).to(device)
        source = model.pitch2source(source)

        hop_size = hp.data.hop_length
        all_frame = len_min
        hop_frame = 10    # overlap frames used as warm-up context, then trimmed
        out_chunk = 2500  # 2500 frames * 10 ms = 25 s per chunk
        out_index = 0
        out_audio = []
        has_audio = False

        while (out_index + out_chunk < all_frame):
            has_audio = True
            if (out_index == 0):  # first chunk: no left context
                cut_s = out_index
                cut_s_48k = 0
            else:
                cut_s = out_index - hop_frame
                cut_s_48k = hop_frame * hop_size
            if (out_index + out_chunk + hop_frame > all_frame):  # no room for right context
                cut_e = out_index + out_chunk
                cut_e_48k = None  # keep the tail; a 0 here would make the slice empty
            else:
                cut_e = out_index + out_chunk + hop_frame
                cut_e_48k = -1 * hop_frame * hop_size
            sub_ppg = ppg[cut_s:cut_e, :].unsqueeze(0).to(device)
            sub_pit = pit[cut_s:cut_e].unsqueeze(0).to(device)
            sub_len = torch.LongTensor([cut_e - cut_s]).to(device)
            sub_har = source[:, :, cut_s * hop_size:cut_e * hop_size].to(device)
            sub_out = model.inference(sub_ppg, sub_pit, spk, sub_len, sub_har)
            sub_out = sub_out[0, 0].data.cpu().detach().numpy()
            # Drop the warm-up context so adjacent chunks join without artifacts
            sub_out = sub_out[cut_s_48k:cut_e_48k]
            out_audio.extend(sub_out)
            out_index = out_index + out_chunk

        if (out_index < all_frame):  # remaining tail, shorter than one chunk
            if (has_audio):
                cut_s = out_index - hop_frame
                cut_s_48k = hop_frame * hop_size
            else:
                cut_s = 0
                cut_s_48k = 0
            sub_ppg = ppg[cut_s:, :].unsqueeze(0).to(device)
            sub_pit = pit[cut_s:].unsqueeze(0).to(device)
            sub_len = torch.LongTensor([all_frame - cut_s]).to(device)
            sub_har = source[:, :, cut_s * hop_size:].to(device)
            sub_out = model.inference(sub_ppg, sub_pit, spk, sub_len, sub_har)
            sub_out = sub_out[0, 0].data.cpu().detach().numpy()
            sub_out = sub_out[cut_s_48k:]
            out_audio.extend(sub_out)

    out_audio = np.asarray(out_audio)
    return out_audio
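
# Chunking arithmetic (assuming hop_size == 480 at 48 kHz): each 2500-frame
# chunk is 25 s of output, and hop_frame = 10 frames is 100 ms of context,
# i.e. 10 * 480 = 4800 samples trimmed from a chunk edge. A 3-minute song is
# thus synthesized in 8 passes through model.inference instead of one,
# bounding peak memory at roughly the footprint of a single 25 s chunk.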


def svc_main(sid, input_audio):
    if input_audio is None:
        return "You need to upload audio first", None
    sampling_rate, audio = input_audio
    # gr.Audio delivers integer PCM; normalize to float32 in [-1, 1]
    audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
    if len(audio.shape) > 1:
        audio = librosa.to_mono(audio.transpose(1, 0))
    if sampling_rate != 16000:
        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
    # Cap the input at 100 seconds of 16 kHz audio
    if len(audio) > 16000 * 100:
        audio = audio[:16000 * 100]
    wav_path = "temp.wav"
    soundfile.write(wav_path, audio, 16000, format="wav")
    out_audio = svc_change(wav_path, f"configs/singers/singer00{sid}.npy")
    return "Success", (48000, out_audio)


def auto_search(name):
    # musicdl-style search settings: log file, download directory, number of
    # results to pull per source, and HTTP proxies
    config = {'logfilepath': 'musicdl.log', 'savedir': 'downloaded',
              'search_size_per_source': 5, 'proxies': {}}
    albums = get_albums(keywords=name, config=config)
    album = random.choice(albums)
    save_path = get_random_spit(album)
    return save_path
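
# A usage sketch (hypothetical keyword): auto_search("yesterday") picks a
# random album from the search results, downloads one track from it into
# downloaded/, and returns the file path, which the click handler below
# loads into the vc_input3 audio widget.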


app = gr.Blocks()
with app:
    title = "Singer Voice Clone 0.1 Demo"
    desc = """A small singer voice cloning demo app. <br />
    Enter keywords to auto-search music to clone, or upload music yourself.
    This is just a simplified demo; more advanced features can be used to optimize music quality. <br />"""
    tutorial_link = "https://docs.cworld.ai"
    gr.HTML(
        f"""
        <div style="text-align: center; margin: 0 auto;">
          <div
            style="
              display: inline-flex;
              align-items: center;
              gap: 0.8rem;
              font-size: 1.75rem;
            "
          >
            <svg height="100%" stroke-miterlimit="10" style="fill-rule:nonzero;clip-rule:evenodd;stroke-linecap:round;stroke-linejoin:round;" version="1.1" viewBox="0 0 100 100" width="100%" xml:space="preserve" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
              <defs/>
              <clipPath id="ArtboardFrame">
                <rect height="100" width="100" x="0" y="0"/>
              </clipPath>
              <g clip-path="url(#ArtboardFrame)" id="SvgjsG2907">
                <g opacity="1">
                  <g opacity="1">
                    <path d="M49.5597 6.74187C73.4486 6.74187 92.893 26.1863 92.893 50.0752C92.893 73.9641 73.4486 93.4085 49.5597 93.4085C25.6708 93.4085 6.22637 73.9641 6.22637 50.0752C6.22637 26.1863 25.6708 6.74187 49.5597 6.74187M49.5597 0.075206C21.893 0.075206-0.440293 22.4085-0.440293 50.0752C-0.440293 77.7419 21.893 100.075 49.5597 100.075C77.2264 100.075 99.5597 77.7419 99.5597 50.0752C99.5597 22.4085 77.2264 0.075206 49.5597 0.075206L49.5597 0.075206Z" fill="#111111" fill-rule="nonzero" opacity="1" stroke="none"/>
                    <path d="M55.1153 77.853L44.0042 77.853L44.0042 72.2974C44.0042 69.1863 46.4486 66.7419 49.5597 66.7419L49.5597 66.7419C52.6708 66.7419 55.1153 69.1863 55.1153 72.2974L55.1153 77.853Z" fill="#111111" fill-rule="nonzero" opacity="1" stroke="none"/>
                    <path d="M21.7819 33.4085L32.893 33.4085L32.893 33.4085L32.893 55.6308L32.893 55.6308L21.7819 55.6308L21.7819 55.6308L21.7819 33.4085L21.7819 33.4085Z" fill="#111111" fill-rule="nonzero" opacity="1" stroke="none"/>
                    <path d="M66.2264 33.4085L77.3375 33.4085L77.3375 33.4085L77.3375 55.6308L77.3375 55.6308L66.2264 55.6308L66.2264 55.6308L66.2264 33.4085L66.2264 33.4085Z" fill="#111111" fill-rule="nonzero" opacity="1" stroke="none"/>
                  </g>
                </g>
              </g>
            </svg>
            <h1 style="font-weight: 900; margin-bottom: 7px;margin-top:5px">
              {title}
            </h1>
          </div>
          <p style="margin-bottom: 10px; font-size: 94%; line-height: 23px;">
            {desc}
            See the <a href="{tutorial_link}">tutorial</a> for details.
          </p>
        </div>
        """
    )
    sid = gr.Dropdown(label="Singer", choices=["22", "33", "47", "51"], value="47")
    vc_input2 = gr.Textbox(label="Music Name")
    vc_search = gr.Button("Auto Search", variant="primary")
    vc_input3 = gr.Audio(label="Upload Music Yourself")
    vc_search.click(auto_search, [vc_input2], [vc_input3])
    vc_submit = gr.Button("Convert", variant="primary")
    vc_output1 = gr.Textbox(label="Run Status")
    vc_output2 = gr.Audio(label="Result Audio")
    vc_submit.click(svc_main, [sid, vc_input3], [vc_output1, vc_output2])

app.launch()