"""Build a karaoke video: download a YouTube video, strip the vocals from its
audio track with an MDX-Net source-separation model, and remux the
instrumental back onto the video."""

import numpy as np
import soundfile as sf

# import torch
from moviepy import AudioFileClip, VideoFileClip
from pydub import AudioSegment
from pytubefix import YouTube
from pytubefix.cli import on_progress

# from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from source_separation import Predictor


def token_verifier():
    # pytubefix calls this to obtain a (visitor_data, po_token) pair for
    # proof-of-origin token authentication.
    visitor_data = "CgtkTUVqS2hIcUR3SSjm-ee6BjIKCgJVUxIEGgAgIA%3D%3D"
    po_token = "MnSwBAM1XYDp6bA8Z_JBCNCoNW8B0QpC-m_9mKWKsG5JWIjKIGCD2GZDzXXoz41VM9SGWki1uE1KmqxAu9rYVLoTChUn_wlHYvQ5GMmpZLtF1sbo5zeWzhSALHMSrjxIhGV5-xPF3QfwVU-TbY8MUGZKXvSlRA=="
    return (visitor_data, po_token)


def download_from_youtube(url, folder_path):
    yt = YouTube(
        url,
        on_progress_callback=on_progress,
        use_po_token=True,
        po_token_verifier=token_verifier,
    )
    print(yt.title)
    # Progressive streams carry both video and audio, so the audio track can
    # be extracted from the downloaded file later.
    ys = yt.streams.get_highest_resolution()
    ys.download(output_path=folder_path, filename="temp.mp4")


def separate_video_and_audio(video_path, audio_path):
    # Load the video clip
    video_clip = VideoFileClip(video_path)

    # Extract the audio track and write it to a separate file
    audio_clip = video_clip.audio
    audio_clip.write_audiofile(audio_path)
    video_clip.close()


def load_audio(audio_path, sample_rate=44_100):
    audio = AudioSegment.from_file(audio_path)
    print("Preprocessing audio")

    # Resample and reformat: target sample rate, 16-bit samples, mono
    audio = audio.set_frame_rate(sample_rate)
    audio = audio.set_sample_width(2)
    audio = audio.set_channels(1)
    print("Audio converted to 16-bit mono PCM")

    # Calculate the gain needed to reach the target loudness
    target_dBFS = -20
    gain = target_dBFS - audio.dBFS
    print(f"Gain needed to reach {target_dBFS} dBFS: {gain:.2f} dB")

    # Normalize volume, limiting the applied gain to the range [-3, 3] dB
    normalized_audio = audio.apply_gain(min(max(gain, -3), 3))

    waveform = np.array(normalized_audio.get_array_of_samples(), dtype=np.float32)
    max_amplitude = np.max(np.abs(waveform))
    if max_amplitude > 0:  # guard against division by zero on silent audio
        waveform /= max_amplitude  # scale to [-1.0, 1.0]

    print(f"Waveform shape: {waveform.shape}, dtype: {waveform.dtype}")
    return waveform, sample_rate


# Configuration for the MDX-Net instrumental model used by the Predictor
# from the local source_separation module.
args = {
    "model_path": "data/models/UVR-MDX-NET-Inst_HQ_3.onnx",
    "denoise": True,
    "margin": 44100,
    "chunks": 15,
    "n_fft": 6144,
    "dim_t": 8,
    "dim_f": 3072,
}
separate_predictor = Predictor(args=args, device="cpu")


def source_separation(waveform):
    """
    Separate the audio into vocals and non-vocals using the MDX-Net predictor.

    Args:
        waveform (np.ndarray): Mono float32 waveform, normalized to [-1, 1].

    Returns:
        tuple: (vocals, no_vocals), each a mono np.ndarray.
    """
""" vocals, no_vocals = separate_predictor.predict(waveform) vocals = vocals[:, 0] # vocals is stereo, only use one channel no_vocals = no_vocals[:, 0] # no_vocals is stereo, only use one channel return vocals, no_vocals def export_to_wav(vocals, no_vocals, sample_rate, folder_path): """Export segmented audio to WAV files.""" sf.write(folder_path + "temp_vocals.wav", vocals, sample_rate) sf.write(folder_path + "temp_no_vocals.wav", no_vocals, sample_rate) def combine_video_and_audio(video_path, no_vocals_path, output_path): my_clip = VideoFileClip(video_path, audio=False) audio_background = AudioFileClip(no_vocals_path) my_clip.audio = audio_background my_clip.write_videofile(output_path) # https://www.youtube.com/watch?v=1jZEyU_eO1s def get_karaoke(url): folder_path = "data/samples/" video_path = folder_path + "temp.mp4" audio_path = folder_path + "temp.mp3" no_vocals_path = folder_path + "temp_no_vocals.wav" output_path = folder_path + "result.mp4" download_from_youtube(url, folder_path) separate_video_and_audio(video_path, audio_path) waveform, sample_rate = load_audio(audio_path) vocals, no_vocals = source_separation(waveform) export_to_wav(vocals, no_vocals, sample_rate, folder_path) combine_video_and_audio(video_path, no_vocals_path, output_path) return output_path