File size: 4,459 Bytes
f5799e7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
"""Silence processing module"""

from pydub import AudioSegment, silence

from modules.console_colors import ULTRASINGER_HEAD
from modules.Speech_Recognition.TranscribedData import TranscribedData

def remove_silence_from_transcription_data(audio_path: str, transcribed_data: list[TranscribedData]) -> list[
    TranscribedData]:
    """Detect the silent sections of an audio file and cut them out of the transcription.

    Prints a progress line, looks up the silent sections of the file at
    ``audio_path`` via ``get_silence_sections`` and hands them, together with
    ``transcribed_data``, to ``remove_silence``. The list returned by
    ``remove_silence`` is passed back to the caller unchanged.
    """
    print(f"{ULTRASINGER_HEAD} Removing silent parts from transcription data")

    # Detection runs first; its result feeds straight into the clean-up step.
    return remove_silence(get_silence_sections(audio_path), transcribed_data)


def get_silence_sections(audio_path: str,
                         min_silence_len=50,
                         silence_thresh=-50) -> list[tuple[float, float]]:
    """Return the silent sections of a WAV file as ``(start, end)`` pairs in seconds.

    The file is loaded with ``AudioSegment.from_wav``. ``min_silence_len`` and
    ``silence_thresh`` are forwarded unchanged to pydub's
    ``silence.detect_silence``; every boundary it reports is divided by 1000
    to convert it to seconds.
    """
    audio = AudioSegment.from_wav(audio_path)
    detected_ranges = silence.detect_silence(
        audio,
        min_silence_len=min_silence_len,
        silence_thresh=silence_thresh,
    )

    sections_in_seconds = []
    for begin, finish in detected_ranges:
        # convert to sec
        sections_in_seconds.append((begin / 1000, finish / 1000))
    return sections_in_seconds


def remove_silence(silence_parts_list: list[tuple[float, float]], transcribed_data: list[TranscribedData]) -> list[TranscribedData]:
    """Trim, split or drop transcribed words so they no longer cover silent sections.

    Every word is compared against every ``(start, end)`` silence section:

    * a section lying completely before or after the word is skipped,
    * a section lying inside the word ends the word at the silence start and
      may append a ``"~"`` continuation segment for the part after the silence,
    * a section that starts before and ends after the word removes the word,
    * a section overlapping only the beginning (or only the end) of the word
      moves the word's start (or end) to the edge of the silence.

    NOTE: the objects in ``transcribed_data`` are modified in place (``start``,
    ``end``, ``word`` and ``is_word_end`` may be rewritten); only the returned
    list itself is new.

    Args:
        silence_parts_list: Silence sections as ``(start, end)`` pairs. The split
            handling inspects the entry that follows the current one, so the
            list is presumably expected to be ordered by time - TODO confirm.
        transcribed_data: Words to process. Each item needs ``start``, ``end``,
            ``word``, ``conf`` and ``is_word_end`` attributes.

    Returns:
        The surviving words plus any continuation segments created by splits.
    """
    new_transcribed_data = []

    for data in transcribed_data:
        new_transcribed_data.append(data)

        # data.end is rewritten while splitting, so all comparisons use the
        # end the word had before any silence was applied.
        origin_end = data.end
        was_split = False

        # enumerate() provides the position of the current section directly.
        # A list.index() lookup would be a linear search per iteration, would
        # pick the wrong entry for duplicated sections and would fail for
        # sections stored as lists instead of tuples.
        for index, (silence_start, silence_end) in enumerate(silence_parts_list):

            # |    ****    | silence
            # |  **    **  | data
            # |0 1 2 3 4 5 | time
            # The silence does not touch this word at all: skip it. This also
            # covers silences that start after the word has ended.
            if silence_start > origin_end or silence_end < data.start:
                continue

            # |    **  **    | silence
            # |  **********  | data
            # |0 1 2 3 4 5 6 | time
            if silence_start >= data.start and silence_end <= origin_end:
                next_index = index + 1
                if next_index < len(silence_parts_list) and silence_parts_list[next_index][0] < origin_end:
                    # Another silence begins inside the word: the continuation
                    # segment only lasts until that next silence starts.
                    split_end = silence_parts_list[next_index][0]

                    if silence_parts_list[next_index][1] >= origin_end:
                        # The next silence runs to (or past) the end of the
                        # word, so this segment is the last one of the word.
                        split_word = "~ "
                        is_word_end = True
                    else:
                        split_word = "~"
                        is_word_end = False
                else:
                    # No further silence inside the word: the continuation
                    # segment runs to the original end of the word.
                    split_end = origin_end
                    split_word = "~ "
                    is_word_end = True

                split_data = TranscribedData({"conf": data.conf, "word": split_word, "end": split_end, "start": silence_end, "is_word_end": is_word_end})

                if not was_split:
                    data.end = silence_start

                    # The part before the silence is too short (< 0.1) to keep:
                    # move the word itself behind the silence instead.
                    if data.end - data.start < 0.1:
                        data.start = silence_end
                        data.end = split_end
                        continue

                    # The part after the silence is too short to become its
                    # own segment: keep the shortened word as it is.
                    if split_data.end - split_data.start <= 0.1:
                        continue

                    data.is_word_end = False

                    # Remove last whitespace from the data.word
                    # (endswith() is safe for an empty word, indexing is not).
                    if data.word.endswith(" "):
                        data.word = data.word[:-1]

                if split_data.end - split_data.start > 0.1:
                    was_split = True
                    new_transcribed_data.append(split_data)
                elif split_word == "~ " and not data.is_word_end:
                    # The final segment was too short to add, so the previously
                    # added entry has to close the word instead.
                    if not new_transcribed_data[-1].word.endswith(" "):
                        new_transcribed_data[-1].word += " "
                    new_transcribed_data[-1].is_word_end = True

                continue

            # |    ****  | silence
            # |     **   | data
            # |0 1 2 3 4 | time
            if silence_start < data.start and silence_end > origin_end:
                new_transcribed_data.remove(data)
                break

            # |    ****    | silence
            # |      ****  | data
            # |0 1 2 3 4 5 | time
            if silence_start < data.start:
                data.start = silence_end

            # |    ****  | silence
            # |  ****    | data
            # |0 1 2 3 4 | time
            if silence_end > origin_end:
                data.end = silence_start

            # No early exit for silences that start after the word: the guard
            # at the top of the loop already skips them, so a further
            # "silence_start > origin_end" check here could never be reached.
    return new_transcribed_data