Spaces:
Sleeping
Sleeping
Upload 5 files
Browse files- src/modules/Audio/denoise.py +35 -0
- src/modules/Audio/separation.py +25 -0
- src/modules/Audio/silence_processing.py +120 -0
- src/modules/Audio/vocal_chunks.py +89 -0
- src/modules/Audio/youtube.py +89 -0
src/modules/Audio/denoise.py
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Reduce noise from audio"""
|
2 |
+
|
3 |
+
import ffmpeg
|
4 |
+
|
5 |
+
from modules.console_colors import ULTRASINGER_HEAD, blue_highlighted
|
6 |
+
|
7 |
+
|
8 |
+
def ffmpeg_reduce_noise(input_file_path: str, output_file: str) -> None:
    """Denoise a vocal track with ffmpeg's FFT denoiser (``afftdn``).

    The filter is always run with the same settings:

    * ``nr=70``  - noise reduction in dB (ffmpeg allows 0.01 to 97, default 12).
    * ``nf=-80`` - noise floor in dB (ffmpeg allows -80 to -20, default -50).
    * ``tn=1``   - noise floor tracking enabled (disabled by default); with it
      the noise floor is adjusted automatically.

    Args:
        input_file_path: Audio file handed to ffmpeg as input.
        output_file: Destination file; it is overwritten if it exists.

    Raises:
        ffmpeg.Error: Re-raised after the captured stdout and stderr of the
            ffmpeg run have been printed.
    """
    print(
        f"{ULTRASINGER_HEAD} Reduce noise from vocal audio with {blue_highlighted('ffmpeg')}."
    )

    denoise_filter = "afftdn=nr=70:nf=-80:tn=1"
    stream = ffmpeg.input(input_file_path)
    stream = stream.output(output_file, af=denoise_filter)
    stream = stream.overwrite_output()

    try:
        stream.run(capture_stdout=True, capture_stderr=True)
    except ffmpeg.Error as ffmpeg_exception:
        # Show what ffmpeg itself reported before passing the error on.
        print("ffmpeg stdout:", ffmpeg_exception.stdout.decode("utf8"))
        print("ffmpeg stderr:", ffmpeg_exception.stderr.decode("utf8"))
        raise
|
src/modules/Audio/separation.py
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Separate vocals from audio"""
|
2 |
+
|
3 |
+
import subprocess
|
4 |
+
|
5 |
+
from modules.console_colors import (
|
6 |
+
ULTRASINGER_HEAD,
|
7 |
+
blue_highlighted,
|
8 |
+
red_highlighted,
|
9 |
+
)
|
10 |
+
from modules.os_helper import current_executor_path, move, path_join
|
11 |
+
|
12 |
+
|
13 |
+
def separate_audio(input_file_path: str, output_file: str, device="cpu") -> None:
    """Separate vocals from audio with demucs.

    Runs the ``demucs`` command line tool in two-stem (vocals) mode with
    float32 output, then moves the ``separated`` folder found under the
    current executor path to ``output_file``.

    Args:
        input_file_path: Audio file passed to demucs.
        output_file: Destination the ``separated`` folder is moved to.
        device: Value passed to demucs' ``-d`` option (default ``"cpu"``).

    Raises:
        subprocess.CalledProcessError: If demucs exits with a non-zero status.
            Without this check a failed run would go on to move whatever
            ``separated`` folder happens to exist from an earlier run.
    """

    print(
        f"{ULTRASINGER_HEAD} Separating vocals from audio with {blue_highlighted('demucs')} and {red_highlighted(device)} as worker."
    )
    # Model selection?
    # -n htdemucs_ft
    subprocess.run(
        ["demucs", "-d", device, "--two-stems=vocals", "--float32", input_file_path],
        check=True,
    )
    separated_folder = path_join(current_executor_path(), "separated")
    move(separated_folder, output_file)
|
src/modules/Audio/silence_processing.py
ADDED
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Silence processing module"""
|
2 |
+
|
3 |
+
from pydub import AudioSegment, silence
|
4 |
+
|
5 |
+
from modules.console_colors import ULTRASINGER_HEAD
|
6 |
+
from modules.Speech_Recognition.TranscribedData import TranscribedData
|
7 |
+
|
8 |
+
def remove_silence_from_transcription_data(
    audio_path: str, transcribed_data: list[TranscribedData]
) -> list[TranscribedData]:
    """Strip the silent parts of an audio file out of its transcription.

    Args:
        audio_path: Audio file whose silent sections are detected.
        transcribed_data: Transcribed entries to adjust.

    Returns:
        The list produced by ``remove_silence`` for the detected sections.
    """
    print(f"{ULTRASINGER_HEAD} Removing silent parts from transcription data")

    return remove_silence(get_silence_sections(audio_path), transcribed_data)
|
19 |
+
|
20 |
+
|
21 |
+
def get_silence_sections(audio_path: str,
                         min_silence_len=50,
                         silence_thresh=-50) -> list[tuple[float, float]]:
    """Find the silent ranges of a WAV file.

    Args:
        audio_path: WAV file loaded through pydub.
        min_silence_len: Passed to ``pydub.silence.detect_silence`` unchanged
            (pydub documents it in milliseconds).
        silence_thresh: Passed to ``pydub.silence.detect_silence`` unchanged
            (pydub documents it in dBFS).

    Returns:
        ``(start, stop)`` tuples, each value being pydub's result divided by
        1000, i.e. converted to seconds.
    """
    audio = AudioSegment.from_wav(audio_path)
    detected = silence.detect_silence(
        audio, min_silence_len=min_silence_len, silence_thresh=silence_thresh
    )

    sections = []
    for start, stop in detected:
        sections.append((start / 1000, stop / 1000))  # convert to sec
    return sections
|
28 |
+
|
29 |
+
|
30 |
+
def remove_silence(silence_parts_list: list[tuple[float, float]], transcribed_data: list[TranscribedData]):
    """Trim and split transcribed words so they no longer cover silence.

    Every word is compared with every silence range:

    * Silence lying inside the word: the word is cut to end where the silence
      starts and a continuation entry (``"~"`` or ``"~ "``) is appended for
      the part after the silence.
    * Silence starting before and ending after the word: the word is dropped.
    * Silence overlapping only the start or only the end of the word: that
      boundary is moved to the edge of the silence.

    Pieces that are 0.1 long or shorter are not kept as separate entries.

    The entries of ``transcribed_data`` are modified in place.

    Args:
        silence_parts_list: ``(start, end)`` silence ranges. Assumed to be
            sorted by start and in the same time unit as the entries'
            ``start``/``end`` - verify against the caller.
        transcribed_data: Entries exposing ``start``, ``end``, ``word``,
            ``conf`` and ``is_word_end``.

    Returns:
        A new list holding the surviving entries plus any continuation
        entries created by splitting.
    """
    new_transcribed_data = []

    for data in transcribed_data:
        new_transcribed_data.append(data)

        origin_end = data.end
        was_split = False

        # enumerate() supplies the position directly; looking it up with
        # list.index() rescans the list and finds the wrong entry when the
        # same range occurs twice.
        for index, (silence_start, silence_end) in enumerate(silence_parts_list):

            # Silence lies completely before or after the word: no overlap.
            if silence_start > origin_end or silence_end < data.start:
                continue

            # Silence lies completely inside the word: split the word.
            if silence_start >= data.start and silence_end <= origin_end:
                next_index = index + 1
                if next_index < len(silence_parts_list) and silence_parts_list[next_index][0] < origin_end:
                    # Another silence begins inside this word, so the piece
                    # after the current silence ends where that one begins.
                    split_end = silence_parts_list[next_index][0]

                    if silence_parts_list[next_index][1] >= origin_end:
                        split_word = "~ "
                        is_word_end = True
                    else:
                        split_word = "~"
                        is_word_end = False
                else:
                    split_end = origin_end
                    split_word = "~ "
                    is_word_end = True

                split_data = TranscribedData({"conf": data.conf, "word": split_word, "end": split_end, "start": silence_end, "is_word_end": is_word_end})

                if not was_split:
                    data.end = silence_start

                    if data.end - data.start < 0.1:
                        # The part before the silence is too short to keep:
                        # let the word itself take the range after it.
                        data.start = silence_end
                        data.end = split_end
                        continue

                    if split_data.end - split_data.start <= 0.1:
                        continue

                    data.is_word_end = False

                    # Remove last whitespace from the data.word
                    # (endswith also copes with an empty word).
                    if data.word.endswith(" "):
                        data.word = data.word[:-1]

                if split_data.end - split_data.start > 0.1:
                    was_split = True
                    new_transcribed_data.append(split_data)
                elif split_word == "~ " and not data.is_word_end:
                    # The final piece is too short to keep, so the previous
                    # entry becomes the end of the word instead.
                    if not new_transcribed_data[-1].word.endswith(" "):
                        new_transcribed_data[-1].word += " "
                    new_transcribed_data[-1].is_word_end = True

                continue

            # Silence starts before and ends after the word: drop the word.
            if silence_start < data.start and silence_end > origin_end:
                new_transcribed_data.remove(data)
                break

            # Silence overlaps the beginning of the word.
            if silence_start < data.start:
                data.start = silence_end

            # Silence overlaps the end of the word.
            if silence_end > origin_end:
                data.end = silence_start

            # Silence starts after the word.
            if silence_start > origin_end:
                # Nothing to do with this word anymore, go to next word
                break
    return new_transcribed_data
|
src/modules/Audio/vocal_chunks.py
ADDED
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Vocal chunks module."""
|
2 |
+
|
3 |
+
import os
|
4 |
+
import re
|
5 |
+
import wave
|
6 |
+
|
7 |
+
from modules.console_colors import ULTRASINGER_HEAD
|
8 |
+
from modules.os_helper import create_folder
|
9 |
+
from modules.Ultrastar.ultrastar_converter import (
|
10 |
+
get_end_time_from_ultrastar,
|
11 |
+
get_start_time_from_ultrastar,
|
12 |
+
)
|
13 |
+
from modules.Ultrastar.ultrastar_txt import UltrastarTxtValue
|
14 |
+
|
15 |
+
|
16 |
+
class AudioManipulation:
    """Placeholder class; its body contains nothing but this docstring."""
|
18 |
+
|
19 |
+
|
20 |
+
def export_chunks_from_transcribed_data(
    audio_filename: str, transcribed_data: list, output_folder_name: str
) -> None:
    """Export transcribed_data as vocal chunks wav files.

    For every entry the audio between ``entry.start`` and ``entry.end`` is
    read from ``audio_filename`` and written to its own wav file in
    ``output_folder_name``. This function does not create that folder.

    Args:
        audio_filename: Wav file to cut the chunks from.
        transcribed_data: Entries exposing ``start``, ``end`` and ``word``;
            ``start``/``end`` are multiplied by the file's frame rate, so
            they are assumed to be seconds - verify against the caller.
        output_folder_name: Folder the chunk files are written to.
    """
    print(
        f"{ULTRASINGER_HEAD} Export transcribed data as vocal chunks wav files"
    )

    # The context manager closes the reader even if exporting a chunk fails.
    with wave.open(audio_filename, "rb") as wave_file:
        sample_rate = wave_file.getframerate()

        for i, data in enumerate(transcribed_data):
            # Wave readers are positioned in frames and a frame already holds
            # one sample per channel, so the channel count must not be
            # multiplied in.
            start_frame = int(data.start * sample_rate)
            end_frame = int(data.end * sample_rate)

            chunk = get_chunk(end_frame, start_frame, wave_file)
            export_chunk_to_wav_file(
                chunk, output_folder_name, i, data.word, wave_file
            )
|
41 |
+
|
42 |
+
|
43 |
+
def export_chunks_from_ultrastar_data(
    audio_filename: str, ultrastar_data: UltrastarTxtValue, folder_name: str
) -> None:
    """Export ultrastar data as vocal chunks wav files.

    Creates ``folder_name`` and writes one wav file per entry of
    ``ultrastar_data.words``, covering the audio between that entry's start
    and end time.

    Args:
        audio_filename: Wav file to cut the chunks from.
        ultrastar_data: Ultrastar data; its ``words`` are iterated and the
            times come from ``get_start_time_from_ultrastar`` and
            ``get_end_time_from_ultrastar``. The times are multiplied by the
            file's frame rate, so they are assumed to be seconds - verify
            against those helpers.
        folder_name: Folder the chunk files are written to.
    """
    print(f"{ULTRASINGER_HEAD} Export Ultrastar data as vocal chunks wav files")

    create_folder(folder_name)

    # The context manager closes the reader when the export is done or fails;
    # previously the file was left open.
    with wave.open(audio_filename, "rb") as wave_file:
        sample_rate = wave_file.getframerate()

        for i, word in enumerate(ultrastar_data.words):
            start_time = get_start_time_from_ultrastar(ultrastar_data, i)
            end_time = get_end_time_from_ultrastar(ultrastar_data, i)

            # Wave readers are positioned in frames and a frame already holds
            # one sample per channel, so the channel count must not be
            # multiplied in.
            start_frame = int(start_time * sample_rate)
            end_frame = int(end_time * sample_rate)

            chunk = get_chunk(end_frame, start_frame, wave_file)
            export_chunk_to_wav_file(
                chunk, folder_name, i, word, wave_file
            )
|
65 |
+
|
66 |
+
|
67 |
+
def export_chunk_to_wav_file(chunk, folder_name: str, i: int, word: str, wave_file) -> None:
    """Write one chunk of audio frames to ``chunk_<i>_<word>.wav``.

    Args:
        chunk: Raw audio frames to write.
        folder_name: Folder the file is created in.
        i: Number used in the file name.
        word: Word used in the file name; every character other than
            ``A-Z``, ``a-z`` and ``0-9`` is removed first.
        wave_file: Open wave reader whose parameters (``getparams()``) are
            copied to the new file.
    """
    safe_word = re.sub("[^A-Za-z0-9]+", "", word)
    target_path = os.path.join(folder_name, f"chunk_{i}_{safe_word}.wav")
    source_params = wave_file.getparams()

    # todo: Progress?
    with wave.open(target_path, "wb") as chunk_file:
        chunk_file.setparams(source_params)
        chunk_file.writeframes(chunk)
|
78 |
+
|
79 |
+
|
80 |
+
def get_chunk(end_byte: int, start_byte: int, wave_file):
    """
    Read a section of an open wave reader.

    ``start_byte`` is handed to ``wave_file.setpos`` and
    ``end_byte - start_byte`` to ``wave_file.readframes``. For a standard
    library ``wave`` reader both of these count frames, despite the
    parameter names.

    Returns the frames read, as a bytes object.
    """
    # todo: get out of position error message
    frame_count = end_byte - start_byte
    wave_file.setpos(start_byte)  # ({:.2f})
    return wave_file.readframes(frame_count)
|
src/modules/Audio/youtube.py
ADDED
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""YouTube Downloader"""
|
2 |
+
|
3 |
+
import io
|
4 |
+
import os
|
5 |
+
|
6 |
+
import yt_dlp
|
7 |
+
from PIL import Image
|
8 |
+
|
9 |
+
from modules.console_colors import ULTRASINGER_HEAD
|
10 |
+
from modules.Image.image_helper import crop_image_to_square
|
11 |
+
|
12 |
+
|
13 |
+
def get_youtube_title(url: str) -> tuple[str, str]:
    """Get the artist and title of the YouTube video.

    The pair is taken from the first of these that applies:

    1. The ``artist`` and ``track`` fields, when both are present and
       non-empty.
    2. The video title split at ``-``: the text before the first hyphen and
       the text between the first and second hyphen.
    3. The channel name together with the full video title.

    Args:
        url: Address of the video to look up; nothing is downloaded.

    Returns:
        ``(artist, title)``, both stripped of surrounding whitespace.
    """

    ydl_opts = {}
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        result = ydl.extract_info(
            url, download=False  # We just want to extract the info
        )

    # The fields can be absent or None; only use them when both hold text,
    # otherwise fall back to the title instead of failing.
    artist = result.get("artist")
    track = result.get("track")
    if artist and track:
        return artist.strip(), track.strip()

    title = result["title"]
    if "-" in title:
        parts = title.split("-")
        return parts[0].strip(), parts[1].strip()
    return result["channel"].strip(), title.strip()
|
27 |
+
|
28 |
+
|
29 |
+
def download_youtube_audio(url: str, clear_filename: str, output_path: str):
    """Fetch the audio of a YouTube video and have it extracted as mp3.

    Args:
        url: Address of the video.
        clear_filename: File name (without extension) used in the output
            template.
        output_path: Folder used in the output template.
    """
    print(f"{ULTRASINGER_HEAD} Downloading Audio")

    extract_mp3 = {"key": "FFmpegExtractAudio", "preferredcodec": "mp3"}
    target_template = output_path + "/" + clear_filename
    ydl_opts = {
        "format": "bestaudio/best",
        "outtmpl": target_template,
        "postprocessors": [extract_mp3],
    }

    start_download(ydl_opts, url)
|
42 |
+
|
43 |
+
|
44 |
+
def download_youtube_thumbnail(url: str, clear_filename: str, output_path: str):
    """Fetch the thumbnail of a YouTube video.

    Builds the yt-dlp options (``skip_download`` and ``writethumbnail`` both
    set) and hands the work to ``download_and_convert_thumbnail``.

    Args:
        url: Address of the video.
        clear_filename: Base name passed on for the image file.
        output_path: Folder passed on for the image file.
    """
    print(f"{ULTRASINGER_HEAD} Downloading thumbnail")

    thumbnail_only_opts = {"skip_download": True, "writethumbnail": True}
    download_and_convert_thumbnail(
        thumbnail_only_opts, url, clear_filename, output_path
    )
|
54 |
+
|
55 |
+
|
56 |
+
def download_and_convert_thumbnail(ydl_opts, url: str, clear_filename: str, output_path: str) -> None:
    """Save a video's thumbnail as a JPEG and crop it to a square.

    The thumbnail address is read from the video info. When the info has no
    thumbnail, nothing is written.

    Args:
        ydl_opts: Options for the ``yt_dlp.YoutubeDL`` instance.
        url: Address of the video.
        clear_filename: Base name; the image is saved as
            ``<clear_filename> [CO].jpg``.
        output_path: Folder the image is saved in.
    """
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        thumbnail_url = ydl.extract_info(url, download=False).get("thumbnail")
        if not thumbnail_url:
            return

        raw_bytes = ydl.urlopen(thumbnail_url).read()
        # Convert to RGB to avoid transparency or RGBA issues
        picture = Image.open(io.BytesIO(raw_bytes)).convert('RGB')

        image_path = os.path.join(output_path, clear_filename + " [CO].jpg")
        picture.save(image_path, "JPEG")
        crop_image_to_square(image_path)
|
70 |
+
|
71 |
+
|
72 |
+
def download_youtube_video(url: str, clear_filename: str, output_path: str) -> None:
    """Fetch a YouTube video as an mp4 file.

    Args:
        url: Address of the video.
        clear_filename: File name without extension; ``.mp4`` is appended.
        output_path: Folder used in the output template.
    """
    print(f"{ULTRASINGER_HEAD} Downloading Video")

    target_template = output_path + "/" + clear_filename + ".mp4"
    ydl_opts = {"format": "bestvideo[ext=mp4]/mp4", "outtmpl": target_template}
    start_download(ydl_opts, url)
|
81 |
+
|
82 |
+
|
83 |
+
def start_download(ydl_opts, url: str) -> None:
    """Run a yt-dlp download with the given options.

    Args:
        ydl_opts: Options for the ``yt_dlp.YoutubeDL`` instance.
        url: Address to download.

    Raises:
        Exception: If ``download`` returns a truthy value; the value is
            included in the message.
    """
    with yt_dlp.YoutubeDL(ydl_opts) as downloader:
        result_code = downloader.download(url)
        if not result_code:
            return
        message = "Download failed with error: " + str(result_code)
        raise Exception(message)
|