Spaces:

TIMBOVILL
/

UltraSingerUI

Sleeping

File size: 4,459 Bytes

f5799e7

"""Silence processing module"""

from pydub import AudioSegment, silence

from modules.console_colors import ULTRASINGER_HEAD
from modules.Speech_Recognition.TranscribedData import TranscribedData

def remove_silence_from_transcription_data(
    audio_path: str, transcribed_data: list[TranscribedData]
) -> list[TranscribedData]:
    """Strip silent sections of the audio file from the transcription data.

    Announces the step on stdout, detects the silent sections of the audio at
    ``audio_path`` with the default detection settings, and returns the
    transcription entries adjusted around those sections.
    """
    print(f"{ULTRASINGER_HEAD} Removing silent parts from transcription data")

    # Detection runs first; its result feeds straight into the trimming step.
    return remove_silence(get_silence_sections(audio_path), transcribed_data)


def get_silence_sections(audio_path: str,
                         min_silence_len=50,
                         silence_thresh=-50) -> list[tuple[float, float]]:
    """Return the silent sections of a WAV file as (start, stop) pairs in seconds.

    ``min_silence_len`` and ``silence_thresh`` are forwarded unchanged to
    pydub's ``silence.detect_silence`` (per the pydub docs: minimum length in
    milliseconds and loudness threshold in dBFS).
    """
    audio = AudioSegment.from_wav(audio_path)
    detected_ranges = silence.detect_silence(
        audio,
        min_silence_len=min_silence_len,
        silence_thresh=silence_thresh,
    )

    # pydub reports the boundaries in milliseconds; convert each one to seconds.
    sections = []
    for start_ms, stop_ms in detected_ranges:
        sections.append((start_ms / 1000, stop_ms / 1000))
    return sections


def remove_silence(silence_parts_list: list[tuple[float, float]], transcribed_data: list[TranscribedData]):
    """Trim, split or drop transcribed words so they do not cover silent sections.

    Each word is checked against every silence section, in list order:

    * silence lies inside the word: the word is cut at the silence start and
      the audible remainder after the silence is emitted as an extra entry
      whose word is ``"~"`` (or ``"~ "`` when it closes the word);
    * silence covers the whole word: the word is dropped;
    * silence overlaps only the beginning or the end of the word: the word's
      ``start`` / ``end`` is moved to the silence boundary.

    Args:
        silence_parts_list: ``(start, stop)`` pairs describing silent sections,
            in the same time unit as the words' ``start``/``end`` (seconds when
            produced by ``get_silence_sections``).
        transcribed_data: the words to process. The objects are modified in
            place (``start``, ``end``, ``word`` and ``is_word_end`` may be
            reassigned).

    Returns:
        A new list containing the surviving (mutated) input entries, with the
        newly created split entries inserted after the word they came from.
    """
    # Pieces at or below this duration are considered too short to keep.
    min_duration = 0.1
    silence_count = len(silence_parts_list)
    new_transcribed_data = []

    for data in transcribed_data:
        # Remember the slot of this word so it can later be dropped by position
        # instead of by an equality scan over the whole result list.
        data_index = len(new_transcribed_data)
        new_transcribed_data.append(data)

        # data.end is rewritten while splitting; all comparisons below use the
        # word's original end.
        origin_end = data.end
        was_split = False

        for index, (silence_start, silence_end) in enumerate(silence_parts_list):

            # |    ****    | silence
            # |  **    **  | data
            # |0 1 2 3 4 5 | time
            # Silence entirely before or after the word: nothing to do.
            if silence_start > origin_end or silence_end < data.start:
                continue

            # |    **  **    | silence
            # |  **********  | data
            # |0 1 2 3 4 5 6 | time
            if silence_start >= data.start and silence_end <= origin_end:
                # The piece after this silence lasts until the next silence
                # begins, or until the end of the word if none starts inside it.
                next_index = index + 1
                if next_index < silence_count and silence_parts_list[next_index][0] < origin_end:
                    split_end = silence_parts_list[next_index][0]

                    # If the next silence reaches the end of the word, this
                    # piece is the last audible part of the word.
                    if silence_parts_list[next_index][1] >= origin_end:
                        split_word = "~ "
                        is_word_end = True
                    else:
                        split_word = "~"
                        is_word_end = False
                else:
                    split_end = origin_end
                    split_word = "~ "
                    is_word_end = True

                split_data = TranscribedData({"conf": data.conf, "word": split_word, "end": split_end, "start": silence_end, "is_word_end": is_word_end})

                if not was_split:
                    data.end = silence_start

                    # The part before the silence is too short: let the word
                    # itself take the place of the piece after the silence.
                    if data.end - data.start < min_duration:
                        data.start = silence_end
                        data.end = split_end
                        continue

                    # The part after the silence is too short: keep only the
                    # shortened word.
                    if split_data.end - split_data.start <= min_duration:
                        continue

                    data.is_word_end = False

                    # Remove last whitespace from the data.word
                    # (endswith also copes with an empty word).
                    if data.word.endswith(" "):
                        data.word = data.word[:-1]

                if split_data.end - split_data.start > min_duration:
                    was_split = True
                    new_transcribed_data.append(split_data)
                elif split_word == "~ " and not data.is_word_end:
                    # The closing piece was too short to emit, so the most
                    # recently emitted entry becomes the end of the word.
                    if not new_transcribed_data[-1].word.endswith(" "):
                        new_transcribed_data[-1].word += " "
                    new_transcribed_data[-1].is_word_end = True

                continue

            # |    ****  | silence
            # |     **   | data
            # |0 1 2 3 4 | time
            if silence_start < data.start and silence_end > origin_end:
                del new_transcribed_data[data_index]
                break

            # |    ****    | silence
            # |      ****  | data
            # |0 1 2 3 4 5 | time
            if silence_start < data.start:
                data.start = silence_end

            # |    ****  | silence
            # |  ****    | data
            # |0 1 2 3 4 | time
            if silence_end > origin_end:
                data.end = silence_start

            # Silence sections that start after the word's end never reach this
            # point: the first guard of the loop already skips them.
    return new_transcribed_data