Spaces:
Sleeping
Sleeping
File size: 4,459 Bytes
f5799e7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 |
"""Silence processing module"""
from pydub import AudioSegment, silence
from modules.console_colors import ULTRASINGER_HEAD
from modules.Speech_Recognition.TranscribedData import TranscribedData
def remove_silence_from_transcription_data(
    audio_path: str,
    transcribed_data: list[TranscribedData],
) -> list[TranscribedData]:
    """Remove silence from given transcription data.

    Prints a progress line, detects the silent sections of the audio file at
    ``audio_path`` via ``get_silence_sections`` (default settings) and hands
    them, together with ``transcribed_data``, to ``remove_silence``.

    Returns:
        The list produced by ``remove_silence``.
    """
    print(f"{ULTRASINGER_HEAD} Removing silent parts from transcription data")

    silent_sections = get_silence_sections(audio_path)
    return remove_silence(silent_sections, transcribed_data)
def get_silence_sections(audio_path: str,
                         min_silence_len=50,
                         silence_thresh=-50) -> list[tuple[float, float]]:
    """Detect the silent sections of a WAV file.

    The file is loaded with ``AudioSegment.from_wav`` and scanned with
    pydub's ``silence.detect_silence``; ``min_silence_len`` and
    ``silence_thresh`` are forwarded to that call unchanged (see the pydub
    documentation for their exact meaning and units).

    Returns:
        One ``(start, stop)`` tuple per detected section, with both values
        divided by 1000 to convert pydub's output to seconds.
    """
    audio = AudioSegment.from_wav(audio_path)
    detected = silence.detect_silence(
        audio,
        min_silence_len=min_silence_len,
        silence_thresh=silence_thresh,
    )

    # Convert every boundary to seconds.
    sections = []
    for begin, finish in detected:
        sections.append((begin / 1000, finish / 1000))
    return sections
def remove_silence(silence_parts_list: list[tuple[float, float]],
                   transcribed_data: list[TranscribedData]) -> list[TranscribedData]:
    """Cut the given silent sections out of the transcribed words.

    Every word is compared against every ``(start, end)`` silence section:

    * A silence that does not touch the word is ignored.
    * A silence lying inside the word splits it. The word is shortened to end
      where the silence starts, and the part after the silence becomes a new
      ``TranscribedData`` whose word is ``"~"``. That part runs until the
      start of the next silence inside the word, or until the word's original
      end; the last such part is ``"~ "`` with ``is_word_end`` set.
      Pieces of 0.1 or shorter (same time unit as the inputs) are not kept:
      a too-short leading piece makes the word start after the silence
      instead, and a too-short trailing piece is dropped.
    * A silence that strictly covers the whole word removes the word.
    * A silence overlapping only the start or only the end trims that side.

    The ``TranscribedData`` objects in ``transcribed_data`` are modified in
    place; the returned list contains them plus the newly created parts.

    Args:
        silence_parts_list: Silence sections as ``(start, end)`` pairs.
        transcribed_data: Words to process.

    Returns:
        A new list with the remaining words and inserted ``"~"`` parts, in
        processing order.
    """
    new_transcribed_data = []
    silence_count = len(silence_parts_list)

    for data in transcribed_data:
        new_transcribed_data.append(data)

        # Remember the untrimmed end: all overlap checks below refer to the
        # word's original extent, even after data.end has been shortened.
        origin_end = data.end
        was_split = False

        # enumerate() gives the position directly; looking the pair up again
        # with list.index() would cost O(n) per silence and fail for sections
        # that are lists instead of tuples.
        for index, (silence_start, silence_end) in enumerate(silence_parts_list):

            # |       ****  | silence
            # | **          | data     (or silence entirely before the data)
            if silence_start > origin_end or silence_end < data.start:
                continue

            # |   **   **   | silence
            # | **********  | data
            if silence_start >= data.start and silence_end <= origin_end:
                next_index = index + 1
                if next_index < silence_count and silence_parts_list[next_index][0] < origin_end:
                    # Another silence begins inside this word: the new part
                    # only lasts until that silence starts.
                    next_start, next_end = silence_parts_list[next_index]
                    split_end = next_start
                    # If the next silence reaches the word's end, nothing
                    # follows this part, so it closes the word.
                    is_word_end = next_end >= origin_end
                    split_word = "~ " if is_word_end else "~"
                else:
                    split_end = origin_end
                    split_word = "~ "
                    is_word_end = True

                split_data = TranscribedData({"conf": data.conf, "word": split_word, "end": split_end, "start": silence_end, "is_word_end": is_word_end})
                split_duration = split_data.end - split_data.start

                if not was_split:
                    data.end = silence_start

                    # Leading piece too short to keep: let the word start
                    # after the silence instead of splitting it.
                    if data.end - data.start < 0.1:
                        data.start = silence_end
                        data.end = split_end
                        continue

                    # Trailing piece too short to keep: the word stays
                    # shortened, no new part is created.
                    if split_duration <= 0.1:
                        continue

                    data.is_word_end = False

                    # Remove last whitespace from the data.word
                    # (endswith() is safe for an empty word, word[-1] is not).
                    if data.word.endswith(" "):
                        data.word = data.word[:-1]

                if split_duration > 0.1:
                    was_split = True
                    new_transcribed_data.append(split_data)
                elif split_word == "~ " and not data.is_word_end:
                    # The closing part is too short to keep, so the previously
                    # added entry becomes the end of the word.
                    last_entry = new_transcribed_data[-1]
                    if not last_entry.word.endswith(" "):
                        last_entry.word += " "
                    last_entry.is_word_end = True

                continue

            # |  ********   | silence
            # |    ****     | data
            if silence_start < data.start and silence_end > origin_end:
                new_transcribed_data.remove(data)
                break

            # |  ****       | silence
            # |    ****     | data
            if silence_start < data.start:
                data.start = silence_end

            # |      ****   | silence
            # |    ****     | data
            if silence_end > origin_end:
                data.end = silence_start

            # NOTE: a silence starting after origin_end never reaches this
            # point, because the first guard of the loop already skips it. The
            # former trailing "break" for that case was therefore unreachable
            # and has been dropped. Turning it into a real early exit would
            # only be valid if silence_parts_list were guaranteed to be sorted.

    return new_transcribed_data