File size: 2,618 Bytes
e7a6721
f0b42bb
fee32f6
 
edc4e08
fee32f6
 
 
 
 
 
 
 
 
 
 
 
 
f0b42bb
edc4e08
 
 
fee32f6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
edc4e08
f0b42bb
 
 
fee32f6
f0b42bb
 
fee32f6
f0b42bb
 
fee32f6
 
edc4e08
fee32f6
f0b42bb
edc4e08
 
e7a6721
 
 
edc4e08
42e8713
 
 
e7a6721
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import gradio as gr
import json
from faster_whisper import WhisperModel  # Assuming you have installed this library

def split_text_into_lines(data, max_chars, max_duration, max_gap):
    """Group word-level timestamps into subtitle lines.

    A line is closed when its accumulated spoken duration exceeds
    ``max_duration``, its joined text exceeds ``max_chars``, or the silence
    between two consecutive words exceeds ``max_gap`` (in which case the word
    after the silence starts the NEXT line).

    Parameters
    ----------
    data : list[dict]
        Word entries with ``"word"``, ``"start"`` and ``"end"`` keys,
        in chronological order.
    max_chars : int
        Maximum character length of a line's joined text.
    max_duration : float
        Maximum summed per-word duration (seconds) of a line.
    max_gap : float
        Maximum allowed silence (seconds) between consecutive words.

    Returns
    -------
    list[dict]
        One dict per line with ``"word"`` (joined text), ``"start"``,
        ``"end"``, and ``"textcontents"`` (the original word entries).
    """
    subtitles = []
    line = []
    line_duration = 0.0

    def _flush():
        # Emit the current line (if any) as a subtitle entry and reset state.
        nonlocal line, line_duration
        if line:
            subtitles.append({
                "word": " ".join(item["word"] for item in line),
                "start": line[0]["start"],
                "end": line[-1]["end"],
                "textcontents": line,
            })
            line = []
            line_duration = 0.0

    for idx, word_data in enumerate(data):
        # A word that follows a long silence must OPEN a new line, so the
        # gap check happens before the word is appended.
        if idx > 0 and (word_data["start"] - data[idx - 1]["end"]) > max_gap:
            _flush()

        line.append(word_data)
        line_duration += word_data["end"] - word_data["start"]

        # Char/duration limits are checked after appending, so the word that
        # crosses the limit stays in the line being flushed.
        joined = " ".join(item["word"] for item in line)
        if line_duration > max_duration or len(joined) > max_chars:
            _flush()

    # Flush any trailing words that never hit a limit.
    _flush()
    return subtitles

def transcribe_audio(audiofilename, max_chars, max_duration, max_gap):
    """Transcribe an audio file and return line-level subtitle entries.

    Parameters
    ----------
    audiofilename : str
        Path to the audio file to transcribe.
    max_chars, max_duration, max_gap :
        Line-splitting limits, forwarded to :func:`split_text_into_lines`.

    Returns
    -------
    list[dict]
        Subtitle lines as produced by :func:`split_text_into_lines`.
    """
    # Load the Whisper model once and reuse it across calls: constructing
    # WhisperModel is expensive (weights are loaded from disk/downloaded),
    # and this function is invoked per Gradio request.
    model = getattr(transcribe_audio, "_model", None)
    if model is None:
        model = WhisperModel("medium")
        transcribe_audio._model = model

    # word_timestamps=True makes faster-whisper attach per-word start/end times.
    segments, info = model.transcribe(audiofilename, word_timestamps=True)
    segments = list(segments)  # The generator is lazy; listing it runs the transcription.

    wordlevel_info = [
        {'word': word.word, 'start': word.start, 'end': word.end}
        for segment in segments
        for word in segment.words
    ]

    return split_text_into_lines(wordlevel_info, max_chars, max_duration, max_gap)

def audio_transcription(audiofile, max_chars, max_duration, max_gap):
    """Gradio callback: transcribe *audiofile* and return the line-level
    subtitles serialized as pretty-printed JSON text."""
    return json.dumps(
        transcribe_audio(audiofile, max_chars, max_duration, max_gap),
        indent=4,
    )

# Wire the transcription callback into a simple Gradio UI: one audio upload
# plus the three line-splitting limits in, JSON text out.
iface = gr.Interface(
    fn=audio_transcription,
    inputs=[
        gr.Audio(sources="upload", type="filepath"),
        gr.Number(label="MaxChars"),
        gr.Number(label="MaxDuration"),
        gr.Number(label="MaxGap"),
    ],
    outputs="text",
    description="Upload an audio file and get its transcription in JSON format.",
)
iface.launch()