File size: 6,990 Bytes
710db5f
 
 
 
 
 
 
 
 
 
9a005ca
 
 
 
 
1ca8bfc
9a005ca
 
 
 
710db5f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5034a0a
 
 
 
6f8989f
939d453
cabbcc5
6f8989f
 
 
 
 
8e1fa57
 
fc0b265
5034a0a
d0fe6ef
 
fc0b265
2399e19
 
 
 
 
8dc4fc2
9287297
710db5f
 
 
 
 
 
 
 
 
 
 
 
b9fa8bc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
710db5f
b9fa8bc
710db5f
b9fa8bc
 
710db5f
b9fa8bc
 
 
 
 
 
 
 
 
 
 
 
 
de83212
710db5f
0de4f47
b9fa8bc
0de4f47
b9fa8bc
 
0de4f47
b9fa8bc
 
 
 
 
 
 
 
c1ab764
b9fa8bc
 
 
 
de83212
710db5f
ddfea13
710db5f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
import re


def timeformat_srt(time):
    """Format a time offset in seconds as an SRT timestamp (``HH:MM:SS,mmm``).

    Args:
        time: Offset in seconds (int or float).

    Returns:
        The zero-padded timestamp string, with a comma before the
        milliseconds.
    """
    # Convert to whole milliseconds once, rounding to the nearest one.
    # Truncating the float fraction instead turns 1.001 into "000" ms,
    # because the fraction is stored as 0.000999...
    total_ms = int(round(time * 1000))
    total_seconds, milliseconds = divmod(total_ms, 1000)
    hours, remainder = divmod(total_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"

def timeformat_txt(time):
    """Format a time offset in seconds as ``MM:SS``, or ``HH:MM:SS`` past one hour.

    Fractions of a second are dropped (truncated, not rounded).
    """
    whole_hours = time // 3600
    past_hour = time - whole_hours * 3600
    whole_minutes = past_hour // 60
    leftover_seconds = past_hour - whole_minutes * 60

    clock = f"{int(whole_minutes):02d}:{int(leftover_seconds):02d}"
    # The hours field is only shown once at least one full hour has elapsed.
    if whole_hours > 0:
        clock = f"{int(whole_hours):02d}:" + clock
    return clock
    
def timeformat_vtt(time):
    """Format a time offset in seconds as a WebVTT timestamp (``HH:MM:SS.mmm``).

    Args:
        time: Offset in seconds (int or float).

    Returns:
        The zero-padded timestamp string, with a dot before the
        milliseconds.
    """
    # Convert to whole milliseconds once, rounding to the nearest one.
    # Truncating the float fraction instead turns 1.001 into "000" ms,
    # because the fraction is stored as 0.000999...
    total_ms = int(round(time * 1000))
    total_seconds, milliseconds = divmod(total_ms, 1000)
    hours, remainder = divmod(total_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}.{milliseconds:03d}"


def write_file(subtitle, output_file):
    """Write the string ``subtitle`` to ``output_file`` as UTF-8.

    The file is opened in write mode, so any existing content is replaced.
    """
    with open(output_file, mode='w', encoding='utf-8') as handle:
        handle.write(subtitle)


def get_srt(segments):
    """Render transcription segments as SubRip (SRT) text.

    Args:
        segments: Iterable of mappings providing ``'start'`` and ``'end'``
            (both passed to ``timeformat_srt``) and ``'text'`` (str).

    Returns:
        One cue per segment, numbered from 1 and each followed by a blank
        line; an empty string when there are no segments.
    """
    cues = []
    for number, segment in enumerate(segments, start=1):
        text = segment['text']
        # Drop a single leading space on a local copy only, so the caller's
        # segment dicts are not modified as a side effect of rendering.
        if text.startswith(' '):
            text = text[1:]
        cues.append(
            f"{number}\n"
            f"{timeformat_srt(segment['start'])} --> {timeformat_srt(segment['end'])}\n"
            f"{text}\n\n"
        )
    return "".join(cues)

def get_csv(segments):
    """Render transcription segments as semicolon-separated text with a header row.

    When any segment text contains a ``SPEAKER NN: `` label, every row gets an
    extra "Speaker" column holding whatever precedes the first colon of that
    segment's text.

    Args:
        segments: Sequence (iterated twice) of mappings providing ``'start'``
            and ``'end'`` (both passed to ``timeformat_srt``) and ``'text'``
            (str).

    Returns:
        The header line followed by one line per segment, joined with
        newlines and without a trailing newline.
    """
    # Speaker columns are emitted for the whole file as soon as one segment
    # carries a speaker label.
    has_speakers = any(
        re.search(r'SPEAKER [0-9][0-9]: ', segment['text']) is not None
        for segment in segments
    )

    if has_speakers:
        rows = ["Line;Start time;End time;Speaker;Text;"]
    else:
        rows = ["Line;Start time;End time;Text;"]

    # NOTE: field values are written verbatim; a ';' inside the text is not
    # quoted or escaped.
    for number, segment in enumerate(segments, start=1):
        start = timeformat_srt(segment['start'])
        end = timeformat_srt(segment['end'])
        if has_speakers:
            speaker_id, separator, speaker_text = segment['text'].partition(":")
            if not separator:
                # Segment without any colon: keep the column count stable
                # with an empty speaker instead of raising IndexError.
                speaker_id, speaker_text = "", segment['text']
            rows.append(
                f"{number};{start};{end};{speaker_id.strip()};{speaker_text.strip()};"
            )
        else:
            rows.append(f"{number};{start};{end};{segment['text'].strip()};")

    return "\n".join(rows)

def get_vtt(segments):
    """Render transcription segments as WebVTT-style text.

    Args:
        segments: Iterable of mappings providing ``'start'`` and ``'end'``
            (both passed to ``timeformat_vtt``) and ``'text'`` (str).

    Returns:
        The header line, a blank line, then one cue per segment numbered
        from 1, each followed by a blank line.
    """
    # NOTE(review): the WebVTT specification spells the file signature
    # "WEBVTT"; the mixed-case header is kept unchanged here for compatibility
    # with existing consumers of this output -- confirm before changing it.
    parts = ["WebVTT\n\n"]
    for number, segment in enumerate(segments, start=1):
        text = segment['text']
        # Drop a single leading space on a local copy only, so the caller's
        # segment dicts are not modified as a side effect of rendering.
        if text.startswith(' '):
            text = text[1:]
        parts.append(
            f"{number}\n"
            f"{timeformat_vtt(segment['start'])} --> {timeformat_vtt(segment['end'])}\n"
            f"{text}\n\n"
        )
    return "".join(parts)


#def get_txt(segments):
#    output = ""
#    for i, segment in enumerate(segments):
#        if segment['text'].startswith(' '):
#            segment['text'] = segment['text'][1:]
#        #output += f"{segment['text']}\n"
#        output += f"{timeformat_txt(segment['start'])}\t{segment['text']}\n"
#    return output

#def get_plaintext(segments):
#    output = ""
#    for i, segment in enumerate(segments):
#        if segment['text'].startswith(' '):
#            segment['text'] = segment['text'][1:]
#        output += f"{segment['text']}\n"
#    return output

def get_txt(segments):
    """Render transcription segments as tab-separated lines with a start time.

    When any segment text contains a ``SPEAKER NN: `` label, every line gets
    a speaker column (whatever precedes the first colon of that segment's
    text) between the time and the text.

    Args:
        segments: Sequence (iterated twice) of mappings providing ``'start'``
            (passed to ``timeformat_txt``) and ``'text'`` (str).

    Returns:
        One newline-terminated line per segment; an empty string when there
        are no segments.
    """
    # Speaker columns are emitted for the whole file as soon as one segment
    # carries a speaker label.
    has_speakers = any(
        re.search(r'SPEAKER [0-9][0-9]: ', segment['text']) is not None
        for segment in segments
    )

    lines = []
    for segment in segments:
        stamp = timeformat_txt(segment['start'])
        if has_speakers:
            speaker_id, separator, speaker_text = segment['text'].partition(":")
            if not separator:
                # Segment without any colon: keep the column count stable
                # with an empty speaker instead of raising IndexError.
                speaker_id, speaker_text = "", segment['text']
            lines.append(f"{stamp}\t{speaker_id.strip()}\t{speaker_text.strip()}\n")
        else:
            lines.append(f"{stamp}\t{segment['text'].strip()}\n")

    return "".join(lines)

def get_plaintext(segments):
    """Render transcription segments as plain text, one segment per line.

    When any segment text contains a ``SPEAKER NN: `` label, every line is
    written as ``<speaker>\\t<text>``, where the speaker is whatever precedes
    the first colon of that segment's text.

    Args:
        segments: Sequence (iterated twice) of mappings providing ``'text'``
            (str).

    Returns:
        One newline-terminated line per segment; an empty string when there
        are no segments.
    """
    # Speaker columns are emitted for the whole file as soon as one segment
    # carries a speaker label.
    has_speakers = any(
        re.search(r'SPEAKER [0-9][0-9]: ', segment['text']) is not None
        for segment in segments
    )

    lines = []
    for segment in segments:
        if has_speakers:
            speaker_id, separator, speaker_text = segment['text'].partition(":")
            if not separator:
                # Segment without any colon: keep the column count stable
                # with an empty speaker instead of raising IndexError.
                speaker_id, speaker_text = "", segment['text']
            lines.append(f"{speaker_id.strip()}\t{speaker_text.strip()}\n")
        else:
            lines.append(f"{segment['text'].strip()}\n")

    return "".join(lines)

def parse_srt(file_path):
    """Read an SRT file and return its cues as a list of dicts.

    Args:
        file_path: Path of the SRT file to read (UTF-8, with or without a
            byte-order mark).

    Returns:
        A list with one dict per cue, in file order. Each dict holds the
        strings ``"index"`` (first line of the block), ``"timestamp"``
        (second line) and ``"sentence"`` (remaining lines joined by single
        spaces). Blocks with fewer than two lines are skipped.
    """
    # 'utf-8-sig' decodes plain UTF-8 too, but also drops a leading BOM that
    # would otherwise end up glued to the first cue's index.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        srt_data = file.read()

    data = []
    for block in srt_data.split('\n\n'):
        lines = block.strip().split('\n')
        # Covers both empty blocks and stray single lines, which have no
        # timestamp line to read.
        if len(lines) < 2:
            continue
        data.append({
            "index": lines[0],
            "timestamp": lines[1],
            "sentence": ' '.join(lines[2:])
        })
    return data


def parse_vtt(file_path):
    """Read a WebVTT file and return its cues as a list of dicts.

    Args:
        file_path: Path of the WebVTT file to read (UTF-8, with or without a
            byte-order mark).

    Returns:
        A list with one dict per cue, in file order. Each dict holds the
        strings ``"index"`` (the cue identifier, or ``""`` when the cue has
        none), ``"timestamp"`` and ``"sentence"`` (text lines joined by
        single spaces). Header blocks and blocks without a timestamp line
        are skipped.
    """
    # 'utf-8-sig' decodes plain UTF-8 too, but also drops a leading BOM that
    # would otherwise hide the header from the check below.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        webvtt_data = file.read()

    data = []
    for block in webvtt_data.split('\n\n'):
        stripped = block.strip()
        # Match the header case-insensitively so both the "WebVTT" spelling
        # written by this module and the upper-case "WEBVTT" are recognised.
        if stripped == '' or stripped.upper().startswith("WEBVTT"):
            continue

        lines = stripped.split('\n')
        if '-->' in lines[0]:
            # Cue without an identifier: the block starts at the timing line.
            index, timestamp, text_lines = '', lines[0], lines[1:]
        elif len(lines) >= 2:
            index, timestamp, text_lines = lines[0], lines[1], lines[2:]
        else:
            # A single non-timing line carries no cue to return.
            continue

        data.append({
            "index": index,
            "timestamp": timestamp,
            "sentence": ' '.join(text_lines)
        })

    return data


def get_serialized_srt(dicts):
    """Serialize cue dicts back into SRT text.

    Each dict contributes its ``"index"``, ``"timestamp"`` and ``"sentence"``
    values on consecutive lines, followed by a blank line.
    """
    return "".join(
        f'{cue["index"]}\n{cue["timestamp"]}\n{cue["sentence"]}\n\n'
        for cue in dicts
    )


def get_serialized_vtt(dicts):
    """Serialize cue dicts back into WebVTT-style text.

    The output starts with the header line and a blank line; each dict then
    contributes its ``"index"``, ``"timestamp"`` and ``"sentence"`` values on
    consecutive lines, followed by a blank line.
    """
    pieces = ["WebVTT\n\n"]
    pieces.extend(
        f'{cue["index"]}\n{cue["timestamp"]}\n{cue["sentence"]}\n\n'
        for cue in dicts
    )
    return "".join(pieces)


def safe_filename(name, max_length=20):
    """Return ``name`` with unsafe characters replaced and its length capped.

    Characters matching ``<>:"/\\|?*`` and the control characters
    ``\\x00``-``\\x1f`` are each replaced by an underscore. A result longer than
    ``max_length`` is shortened; the text after the last dot is kept as the
    extension when it fits.

    Args:
        name: The proposed file name.
        max_length: Maximum length of the returned name. Defaults to 20.

    Returns:
        The sanitized name, at most ``max_length`` characters long.

    Raises:
        ValueError: If ``max_length`` is smaller than 1.
    """
    if max_length < 1:
        raise ValueError("max_length must be at least 1")

    invalid_filename_chars = r'[<>:"/\\|?*\x00-\x1f]'
    safe_name = re.sub(invalid_filename_chars, '_', name)
    if len(safe_name) <= max_length:
        return safe_name

    # Without a dot, split() yields the whole name, which is too long to be
    # kept as an extension and falls through to the plain cut below.
    file_extension = safe_name.split('.')[-1]
    suffix_length = len(file_extension) + 1  # extension plus its dot
    if suffix_length < max_length:
        return safe_name[:max_length - suffix_length] + '.' + file_extension
    return safe_name[:max_length]