# Source: file view of a Hugging Face Space (status shown: Sleeping).
# File size: 11,492 bytes; revision c48371b.
import re, os
import requests
import json
import torch
# Run on the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Browser-like user agent sent with the Bilibili search request.
_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"
headers = {"user-agent": _USER_AGENT}

# Matches a protocol-relative Bilibili video link up to (not including) the
# next double quote.
pattern = r'//www\.bilibili\.com/video[^"]*'
def get_bilibili_video_id(url):
    """Extract the video id (e.g. a BV number) from a Bilibili video URL.

    Args:
        url: A URL containing a ``/video/<id>`` path segment. The id may be
            followed by ``/``, ``?``, ``#`` or the end of the string.

    Returns:
        The alphanumeric id that follows ``/video/``.

    Raises:
        ValueError: If ``url`` contains no ``/video/<id>`` segment.
    """
    # The lookahead accepts the id whether or not a trailing slash is present,
    # while still refusing to return a truncated id.
    match = re.search(r'/video/([a-zA-Z0-9]+)(?=[/?#]|$)', url)
    if match is None:
        raise ValueError(f"No Bilibili video id found in URL: {url!r}")
    return match.group(1)
# Get bilibili audio
def find_first_appearance_with_neighborhood(text, pattern):
    """Return the first substring of ``text`` matched by ``pattern``.

    Args:
        text: The string to search.
        pattern: A regular expression handed to ``re.search``.

    Returns:
        The full text of the earliest match, or ``None`` when the pattern
        does not occur in ``text``.
    """
    first_hit = re.search(pattern, text)
    return first_hit.group() if first_hit else None
def search_bilibili(keyword):
    """Search Bilibili and return the URL of the first video link found.

    The query uses ``duration=1`` for every search; keywords that do not
    start with ``"BV"`` additionally send ``tids=3`` and ``page=1``.

    Args:
        keyword: A BV number or free-text search terms.

    Returns:
        An ``https:`` URL built from the first video link in the search page.

    Raises:
        ValueError: If the search page contains no video link.
        requests.RequestException: If the HTTP request fails or times out.
    """
    query = {"keyword": keyword, "duration": 1}
    if not keyword.startswith("BV"):
        query["tids"] = 3
        query["page"] = 1

    # Passing the query via ``params`` lets requests URL-encode the keyword,
    # so characters such as '&' or '#' cannot corrupt the query string.
    page = requests.get(
        "https://search.bilibili.com/all",
        params=query,
        headers=headers,
        timeout=30,
    ).text

    relative_link = find_first_appearance_with_neighborhood(page, pattern)
    if relative_link is None:
        raise ValueError(f"No Bilibili video found for keyword: {keyword!r}")
    return "https:" + relative_link
def get_response(html_url, timeout=30):
    """Fetch ``html_url`` with a Bilibili referer and a browser user agent.

    Args:
        html_url: The URL to request.
        timeout: Seconds to wait for the connection and for each read before
            giving up. It bounds individual socket waits, not the total
            download time.

    Returns:
        The ``requests.Response`` for the GET request. The status code is not
        checked here.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    request_headers = {
        "referer": "https://www.bilibili.com/",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"
    }
    # Without a timeout a stalled server would block the caller forever.
    return requests.get(html_url, headers=request_headers, timeout=timeout)
def get_video_info(html_url):
    """Read the first audio and video stream URLs from a Bilibili video page.

    The page is expected to embed a ``window.__playinfo__`` JSON object in a
    ``<script>`` tag; the URLs are taken from its ``data.dash`` section.

    Args:
        html_url: URL of the video page.

    Returns:
        A ``(audio_url, video_url)`` tuple. The audio URL is the first backup
        URL of the first audio track when one exists, otherwise that track's
        base URL. The video URL is the base URL of the first video track.

    Raises:
        ValueError: If the page contains no ``window.__playinfo__`` script.
        KeyError / IndexError: If the embedded JSON lacks the expected fields.
    """
    response = get_response(html_url)
    playinfo_blobs = re.findall('<script>window.__playinfo__=(.*?)</script>', response.text)
    if not playinfo_blobs:
        raise ValueError(f"No playinfo data found on page: {html_url!r}")

    dash = json.loads(playinfo_blobs[0])['data']['dash']
    audio_track = dash['audio'][0]

    # Fall back to the base URL when the backup list is missing, None or empty.
    backup_urls = audio_track.get('backupUrl')
    audio_url = backup_urls[0] if backup_urls else audio_track['baseUrl']

    video_url = dash['video'][0]['baseUrl']
    return audio_url, video_url
def save_audio(title, html_url):
    """Download the audio stream of a video page into ``<title>.mp3``.

    The downloaded bytes are written unchanged; only the file name carries
    the ``.mp3`` extension.

    Args:
        title: Output file name without extension.
        html_url: URL of the video page to read the audio stream from.
    """
    audio_url, _video_url = get_video_info(html_url)
    audio_bytes = get_response(audio_url).content
    output_name = title + '.mp3'
    with open(output_name, mode='wb') as output_file:
        output_file.write(audio_bytes)
    print("音乐内容保存完成")
    # The video stream is not downloaded; only the audio is saved.
from uvr5.vr import AudioPre
# Directory that holds the UVR5 model weights.
weight_uvr5_root = "uvr5/uvr_model"

# Available model names: every '.pth' file (with that extension removed) plus
# every directory entry whose name contains 'onnx'.
uvr5_names = [
    entry.replace(".pth", "")
    for entry in os.listdir(weight_uvr5_root)
    if entry.endswith(".pth") or "onnx" in entry
]

func = AudioPre

# Options shared by both separator instances.
# NOTE(review): is_half=True is passed even when `device` is 'cpu' — confirm
# that AudioPre supports half precision on CPU.
_separator_options = {"agg": 10, "device": device, "is_half": True}

# Both models are loaded once at import time and reused by the downloaders.
pre_fun_hp2 = func(
    model_path=os.path.join(weight_uvr5_root, "UVR-HP2.pth"),
    **_separator_options,
)
pre_fun_hp5 = func(
    model_path=os.path.join(weight_uvr5_root, "UVR-HP5.pth"),
    **_separator_options,
)
import webrtcvad
from pydub import AudioSegment
from pydub.utils import make_chunks
import os
import librosa
import soundfile
import gradio as gr
def vad(audio_name, output_path="voiced_audio.wav"):
    """Keep only the frames WebRTC VAD classifies as speech and export them.

    The input is converted to 48 kHz, mono, 16-bit audio, cut into 30 ms
    frames, and every frame flagged as speech is appended to the output.
    If no frame is flagged, a zero-length WAV file is written.

    Args:
        audio_name: Path of the WAV file to analyse.
        output_path: Where the voiced-only WAV is written. Defaults to
            ``"voiced_audio.wav"``.

    Returns:
        ``output_path``.
    """
    audio = AudioSegment.from_file(audio_name, format="wav")
    # WebRTC VAD supports only 8000, 16000, 32000 or 48000 Hz, and the frame
    # size check below assumes 2 bytes per sample, so force 16-bit mono.
    audio = audio.set_frame_rate(48000).set_channels(1).set_sample_width(2)

    detector = webrtcvad.Vad()
    # Aggressiveness is an integer between 0 and 3; 3 is the most aggressive.
    detector.set_mode(3)

    frame_duration = 30  # Duration of a frame in ms
    samples_per_frame = int(audio.frame_rate * frame_duration / 1000)
    bytes_per_frame = samples_per_frame * audio.sample_width

    voiced_frames = []
    for frame in make_chunks(audio, frame_duration):
        # Only the final chunk can be short; it cannot be classified, so stop.
        if len(frame.raw_data) < bytes_per_frame:
            break
        if detector.is_speech(frame.raw_data, audio.frame_rate):
            voiced_frames.append(frame)

    # Join the voiced frames back into one segment, starting from empty audio.
    voiced_audio = sum(voiced_frames, AudioSegment.silent(duration=0))
    voiced_audio.export(output_path, format="wav")
    return output_path
def youtube_downloader(
    video_identifier,
    filename,
    split_model,
    start_time
):
    """Download a video's audio, cut a 45 s excerpt and run the separator.

    Despite the name, the audio is fetched through ``get_video_info`` and
    ``get_response``. The download is stored in a temporary
    ``<filename>.wav`` in the working directory, trimmed to at most 45
    seconds starting at ``start_time``, passed to the selected separator, and
    then deleted.

    Args:
        video_identifier: URL of the video page.
        filename: Base name for the temporary file and the output folder;
            surrounding whitespace is ignored.
        split_model: ``"UVR-HP2"`` selects ``pre_fun_hp2``; any other value
            selects ``pre_fun_hp5``.
        start_time: Excerpt start in seconds. ``None`` and negative values
            are treated as 0.

    Returns:
        A ``(vocal_path, instrument_path)`` tuple of the locations where the
        separator is expected to write its results (not verified here).
    """
    print(video_identifier)
    audio_url = get_video_info(video_identifier)[0]
    print(audio_url)
    audio_content = get_response(audio_url).content

    # Strip once so the temp file, output folder and returned paths agree.
    name = filename.strip()
    audio_path = name + ".wav"
    output_dir = f"./output/{split_model}/{name}/"

    start_ms = int(max(start_time or 0, 0) * 1000)

    # make dir output
    os.makedirs("output", exist_ok=True)
    pre_fun = pre_fun_hp2 if split_model == "UVR-HP2" else pre_fun_hp5

    try:
        with open(audio_path, mode="wb") as f:
            f.write(audio_content)

        audio_orig = AudioSegment.from_file(audio_path)
        # Cap the excerpt at the end of the track when fewer than 45 s remain.
        end_ms = min(start_ms + 45000, len(audio_orig))
        segment = audio_orig[start_ms:end_ms]
        segment.export(audio_path, format="wav")
        pre_fun._path_audio_(audio_path, output_dir, output_dir, "wav")
    finally:
        # Remove the temporary download even when separation fails.
        if os.path.exists(audio_path):
            os.remove(audio_path)

    return f"./output/{split_model}/{name}/vocal_{name}.wav_10.wav", f"./output/{split_model}/{name}/instrument_{name}.wav_10.wav"
def youtube_downloader_100s(
    video_identifier,
    filename,
    split_model
):
    """Download a video's audio, keep up to 110 s and run the separator.

    Despite the name, the audio is fetched through ``get_video_info`` and
    ``get_response``. Tracks longer than 120 s are trimmed to the span from
    10 s to 120 s; shorter tracks are passed to the separator as downloaded.
    The temporary ``<filename>.wav`` is deleted afterwards.

    Args:
        video_identifier: URL of the video page.
        filename: Base name for the temporary file and the output folder;
            surrounding whitespace is ignored.
        split_model: ``"UVR-HP2"`` selects ``pre_fun_hp2``; any other value
            selects ``pre_fun_hp5``.

    Returns:
        A ``(vocal_path, instrument_path)`` tuple of the locations where the
        separator is expected to write its results (not verified here).
    """
    print(video_identifier)
    audio_url = get_video_info(video_identifier)[0]
    print(audio_url)
    audio_content = get_response(audio_url).content

    # Strip once so the temp file, output folder and returned paths agree.
    name = filename.strip()
    audio_path = name + ".wav"
    output_dir = f"./output/{split_model}/{name}/"

    pre_fun = pre_fun_hp2 if split_model == "UVR-HP2" else pre_fun_hp5
    os.makedirs("output", exist_ok=True)

    try:
        with open(audio_path, mode="wb") as f:
            f.write(audio_content)

        audio_orig = AudioSegment.from_file(audio_path)
        if len(audio_orig) > 120000:
            # Skip the first 10 s and keep the following 110 s.
            start_ms = 10000
            end_ms = start_ms + 110000
            audio_orig[start_ms:end_ms].export(audio_path, format="wav")
        pre_fun._path_audio_(audio_path, output_dir, output_dir, "wav")
    finally:
        # Remove the temporary download even when separation fails.
        if os.path.exists(audio_path):
            os.remove(audio_path)

    return f"./output/{split_model}/{name}/vocal_{name}.wav_10.wav", f"./output/{split_model}/{name}/instrument_{name}.wav_10.wav"
def convert(start_time, song_name_src, song_name_ref, check_song, key_shift, vocal_vol, inst_vol):
    """Produce an AI cover of one song using the voice found in another.

    Both songs are looked up on Bilibili. The reference song's vocals are
    separated (or reused from ``./output`` if already present) and reduced to
    voiced frames; the source song's vocals and accompaniment are separated
    from a 45 s excerpt. ``inference.py`` is then run on the two vocal files
    and its result is mixed with the source accompaniment.

    Args:
        start_time: Excerpt start within the source song, in seconds.
        song_name_src: Name or BV number of the song to be covered.
        song_name_ref: Name or BV number of a song with the target voice.
        check_song: True when the reference audio is singing; when false,
            ``--speech_enroll`` is passed to ``inference.py``.
        key_shift: Key shift passed to ``inference.py``.
        vocal_vol: Gain in dB applied to the converted vocals.
        inst_vol: Gain in dB applied to the accompaniment.

    Returns:
        Path of the mixed WAV file, named ``<song_name_src>-AI翻唱.wav``.

    Raises:
        subprocess.CalledProcessError: If ``inference.py`` exits with an error.
    """
    import subprocess
    import sys

    split_model = "UVR-HP5"

    song_name_ref = song_name_ref.strip().replace(" ", "")
    video_identifier = search_bilibili(song_name_ref)
    song_id = get_bilibili_video_id(video_identifier)

    song_name_src = song_name_src.strip().replace(" ", "")
    video_identifier_src = search_bilibili(song_name_src)
    song_id_src = get_bilibili_video_id(video_identifier_src)

    # Reuse previously separated reference vocals. Check the file itself, not
    # just its folder, so a failed earlier run does not poison the cache.
    ref_vocal_path = f"./output/{split_model}/{song_id}/vocal_{song_id}.wav_10.wav"
    if not os.path.isfile(ref_vocal_path):
        ref_vocal_path = youtube_downloader_100s(video_identifier, song_id, split_model)[0]
    audio, sr = librosa.load(ref_vocal_path, sr=24000, mono=True)
    soundfile.write("audio_ref.wav", audio, sr)
    # Writes voiced_audio.wav, which is used as the reference below.
    vad("audio_ref.wav")

    # The source excerpt depends on start_time, so it is always regenerated.
    src_vocal_path = youtube_downloader(video_identifier_src, song_id_src, split_model, start_time)[0]
    audio_src, sr_src = librosa.load(src_vocal_path, sr=24000, mono=True)
    soundfile.write("audio_src.wav", audio_src, sr_src)

    # Remove any stale result so an old file is never mistaken for a new one.
    if os.path.isfile("output_svc/NeuCoSVCv2.wav"):
        os.remove("output_svc/NeuCoSVCv2.wav")

    # An argument list (no shell) keeps user-controlled values out of a shell
    # command line; check=True surfaces inference failures immediately.
    command = [
        sys.executable, "inference.py",
        "--src_wav_path", "audio_src.wav",
        "--ref_wav_path", "voiced_audio.wav",
        "--key_shift", str(int(key_shift)),
    ]
    if not check_song:
        command.append("--speech_enroll")
    subprocess.run(command, check=True)

    audio_vocal = AudioSegment.from_file("output_svc/NeuCoSVCv2.wav", format="wav")
    audio_inst = AudioSegment.from_file(f"output/{split_model}/{song_id_src}/instrument_{song_id_src}.wav_10.wav", format="wav")

    # Apply the requested gain (in dB) to each track.
    audio_vocal = audio_vocal + vocal_vol
    audio_inst = audio_inst + inst_vol

    # Mix the accompaniment underneath the converted vocals.
    combined_audio = audio_vocal.overlay(audio_inst)

    # The song name is user input: replace characters that are unsafe in file
    # names so it cannot escape the working directory or break the export.
    safe_name = re.sub(r'[\\/:*?"<>|]', "_", song_name_src)
    output_path = f"{safe_name}-AI翻唱.wav"
    combined_audio.export(output_path, format="wav")
    return output_path
# Gradio front end: song inputs and tuning controls in the left column, the
# resulting audio in the right column.
# NOTE(review): the source had no indentation; the nesting below is
# reconstructed from the order of the widgets — confirm the layout.
app = gr.Blocks()
with app:
    gr.Markdown("# <center>🥳💕🎶 NeuCoSVC v2 AI歌手全明星,无需训练、一键翻唱、重磅更新!</center>")
    gr.Markdown("## <center>🌟 只需 1 个歌曲名,一键翻唱任意歌手的任意歌曲,支持说话语音翻唱,随时随地,听你想听!</center>")
    gr.Markdown("### <center>🌊 NeuCoSVC v2 先享版 Powered by Tencent ARC Lab & Tsinghua University 💕</center>")
    with gr.Row():
        with gr.Column():
            # Song to cover and song that provides the target voice.
            with gr.Row():
                inp1 = gr.Textbox(label="请填写想要AI翻唱的歌曲或BV号", info="直接填写BV号的得到的歌曲最匹配,也可以选择填写“歌曲名+歌手名”")
                inp2 = gr.Textbox(label="请填写含有目标音色的歌曲或BV号", info="例如您希望使用AI周杰伦的音色,就在此处填写周杰伦的任意一首歌")
            # Excerpt start, reference type (singing vs. speech) and key shift.
            with gr.Row():
                inp0 = gr.Number(value=0, label="起始时间 (秒)", info="此程序将自动从起始时间开始提取45秒的翻唱歌曲")
                inp3 = gr.Checkbox(label="参考音频是否为歌曲演唱,默认为是", info="如果参考音频为正常说话语音,请取消打勾", value=True)
                inp4 = gr.Slider(minimum=-12, maximum=12, value=0, step=1, label="歌曲人声升降调", info="默认为0,+2为升高2个key,以此类推")
            # Volume adjustments for vocals and accompaniment.
            with gr.Row():
                inp5 = gr.Slider(minimum=-3, maximum=3, value=0, step=1, label="调节人声音量,默认为0")
                inp6 = gr.Slider(minimum=-3, maximum=3, value=0, step=1, label="调节伴奏音量,默认为0")
            btn = gr.Button("一键开启AI翻唱之旅吧💕", variant="primary")
        with gr.Column():
            out = gr.Audio(label="AI歌手为您倾情演唱的歌曲", type="filepath", interactive=True)
    # Input order matches the parameter order of convert().
    btn.click(convert, [inp0, inp1, inp2, inp3, inp4, inp5, inp6], out)
    gr.Markdown("### <center>注意❗:请不要生成会对个人以及组织造成侵害的内容,此程序仅供科研、学习及个人娱乐使用。</center>")
    gr.HTML('''
<div class="footer">
<p>🌊🏞️🎶 - 江水东流急,滔滔无尽声。 明·顾璘
</p>
</div>
''')
# Queue requests and start the server with a public share link.
app.queue().launch(share=True, show_error=True)