File size: 8,144 Bytes
856d805
 
5fd1d62
 
 
baafc0a
 
 
 
 
 
5fd1d62
 
1044a67
 
 
 
 
 
 
 
baafc0a
 
 
 
 
 
 
 
 
 
 
 
 
 
1044a67
 
 
 
 
 
 
 
 
 
 
856d805
 
 
 
 
 
baafc0a
1044a67
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6c05ea2
1044a67
6c05ea2
1044a67
6c05ea2
1044a67
baafc0a
 
 
856d805
baafc0a
 
 
 
 
5fd1d62
baafc0a
1044a67
7aa414b
5fd1d62
baafc0a
 
856d805
baafc0a
 
 
856d805
 
baafc0a
 
 
 
 
 
 
 
 
 
 
 
 
5fd1d62
baafc0a
 
 
 
 
 
 
856d805
baafc0a
 
 
5fd1d62
baafc0a
5fd1d62
 
1044a67
 
 
 
 
 
 
 
 
 
 
856d805
 
 
 
 
 
 
5fd1d62
7aa414b
1044a67
7a98cb1
856d805
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1044a67
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
import os

import gradio as gr
import torch
import whisper
from moviepy.editor import (
    AudioFileClip,
    ColorClip,
    VideoFileClip,
    concatenate_videoclips,
)


def _format_srt_timestamp(seconds: float) -> str:
    """Format a time offset in seconds as an SRT timestamp (HH:MM:SS,mmm)."""
    # Convert to whole milliseconds once, with rounding. Deriving the
    # millisecond field from `(t % 1) * 1000` truncates float error, so a
    # value such as 2.001 s would be rendered as ",000" instead of ",001".
    # Negative offsets are clamped to zero since SRT has no negative times.
    total_ms = max(0, int(round(float(seconds) * 1000)))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, millis = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"


def generate_srt_file(
    transcription_result: dict, srt_file_path: str, lag: float = 0
) -> None:
    """
    Write and save an SRT file from the transcription result.

    Args:
        transcription_result: The transcription result from Whisper model.
            Must contain a "segments" list whose items have "start", "end"
            (seconds) and "text" keys.
        srt_file_path: The path to save the SRT file.
        lag: Number of seconds added to every segment's start and end time.
    """
    # Explicit UTF-8 so non-ASCII transcripts are written identically on
    # every platform instead of depending on the locale's default encoding.
    with open(srt_file_path, "w", encoding="utf-8") as file:
        for index, segment in enumerate(transcription_result["segments"], start=1):
            # Shift both boundaries by the requested lag.
            start_srt = _format_srt_timestamp(segment["start"] + lag)
            end_srt = _format_srt_timestamp(segment["end"] + lag)
            # Drop surrounding whitespace so cues do not start with a stray space.
            text = segment["text"].strip()

            file.write(f"{index}\n{start_srt} --> {end_srt}\n{text}\n\n")


def get_srt_filename(video_path: str, audio_path: str = None) -> str:
    """
    Derive the SRT filename from the input media file.

    The video path takes precedence; the audio path is only used when no
    video path is given. Only the base name is kept (directories are
    dropped) and its last extension is replaced by ".srt".

    Args:
        video_path: The path to the video file, or None.
        audio_path: The path to the audio file, used when video_path is None.

    Returns:
        The SRT filename (no directory component).
    """
    source_path = audio_path if video_path is None else video_path
    stem, _extension = os.path.splitext(os.path.basename(source_path))
    return stem + ".srt"


def generate_video(
    audio_path: str,
    video_path: str,
    input: str,
    language: str,
    lag: int,
    progress: gr.Progress = gr.Progress(track_tqdm=True),
) -> tuple[str, str]:
    """
    Generate a subtitled video from the input audio or video file.

    For video input the audio track is extracted to a WAV file in the working
    directory and transcribed; for audio input the audio is transcribed
    directly and a black 1280x720 video is rendered to carry it. The SRT file
    is written to the working directory under the name returned by
    get_srt_filename.

    Args:
        audio_path: The path to the audio file.
        video_path: The path to the video file.
        input: The type of input file ("Audio" or "Video").
        language: The language code for transcription.
        lag: The lag time in seconds to delay the transcription.
        progress: The progress bar to show the progress of the task.

    Returns:
        The path to the generated video file and the SRT file.

    Raises:
        gr.Error: If the required file is missing, the video has no audio
            track, or no speech segments were transcribed.
    """
    if audio_path is None and video_path is None:
        raise gr.Error("Please upload an audio or video file.")
    if input == "Video" and video_path is None:
        raise gr.Error("Please upload a video file.")
    if input == "Audio" and audio_path is None:
        raise gr.Error("Please upload an audio file.")
    progress(0.0, "Checking input...")
    if input == "Video":
        progress(0.0, "Extracting audio from video...")
        audio_path = f"./{os.path.splitext(os.path.basename(video_path))[0]}.wav"
        video = VideoFileClip(video_path)
        try:
            # A silent video has no audio clip; report it instead of failing
            # with an AttributeError on None.
            if video.audio is None:
                raise gr.Error("The uploaded video has no audio track.")
            video.audio.write_audiofile(audio_path)
        finally:
            # Release the file reader even when extraction fails.
            video.close()
        progress(0.1, "Audio extracted!")

    # Transcribe audio
    progress(0.1, "Transcribing audio...")
    result = MODEL.transcribe(audio_path, language=language)
    progress(0.30, "Audio transcribed!")

    # Bail out before writing anything: otherwise an empty SRT file is left
    # on disk and later looks like a valid result.
    if not result["segments"]:
        raise gr.Error("No speech detected in the audio.")

    # Generate SRT file
    progress(0.30, "Generating SRT file...")
    srt_file_path = get_srt_filename(video_path, audio_path)
    generate_srt_file(result, srt_file_path, lag=lag)
    progress(0.40, "SRT file generated!")

    output_video_path = "./transcribed_video.mp4"

    if input == "Video":
        if lag == 0:
            # Subtitles line up with the original video; no re-encode needed.
            return video_path, srt_file_path

        # We simply extend the original video with a black screen of
        # duration `lag` at the end, so the delayed subtitles still fit.
        video = VideoFileClip(video_path)
        try:
            black_screen = ColorClip(
                size=video.size, color=(0, 0, 0), duration=lag
            ).set_fps(1)
            final_video = concatenate_videoclips([video, black_screen])
            final_video.write_videofile(
                output_video_path, codec="libx264", audio_codec="aac"
            )
        finally:
            # The concatenated clip reads from `video`, so close it only
            # after writing has finished (or failed).
            video.close()
        return output_video_path, srt_file_path

    # Audio input: render a black video long enough for the audio plus lag.
    audio_clip = AudioFileClip(audio_path)
    try:
        duration = audio_clip.duration + lag
        video_clip = ColorClip(
            size=(1280, 720), color=(0, 0, 0), duration=duration
        ).set_fps(1)
        video_clip = video_clip.set_audio(audio_clip)
        video_clip.write_videofile(
            output_video_path, codec="libx264", audio_codec="aac"
        )
    finally:
        audio_clip.close()
    return output_video_path, srt_file_path


def download_srt(audio_input: str, video_input: str) -> str:
    """
    Return the previously generated SRT file for the given input media.

    The SRT filename is derived from the inputs with get_srt_filename (video
    path first, audio path otherwise) and looked up relative to the current
    working directory.

    Args:
        audio_input: The path to the audio file.
        video_input: The path to the video file.

    Returns:
        The path to the existing SRT file.

    Raises:
        gr.Error: If no input file is provided or the SRT file does not exist.
    """
    # With neither file uploaded there is nothing to derive a filename from;
    # get_srt_filename would fail with a TypeError on None.
    if audio_input is None and video_input is None:
        raise gr.Error("No SRT file found. Please generate subtitles first.")

    srt_file_path = get_srt_filename(video_input, audio_input)
    if not os.path.exists(srt_file_path):
        raise gr.Error("No SRT file found. Please generate subtitles first.")
    return srt_file_path


if __name__ == "__main__":
    # Load the Whisper model once at startup, on the GPU when one is available.
    # MODEL is read as a module-level global by generate_video.
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    MODEL = whisper.load_model("base", device=DEVICE)

    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        # Page header and short usage instructions.
        gr.Markdown(
            """
        <div style="text-align: center;">
            <h1 style="color: #4A90E2; font-size: 3em;">Audio Transcription & Subtitled Video Generator 🎥✨</h1>
            <p style="font-size: 1.2em; color: #333; max-width: 1000px; margin: auto; text-align: left;">
                Transform your audio or video files into subtitled content effortlessly! <br>
                1. Upload your audio or video file, select the language, and receive a video with synchronized subtitles. <br>
                2. You can view the subtitled video directly here or download the subtitles as an SRT file for your use.
            </p>
        </div>
        """
        )

        with gr.Row():
            # Left column: media inputs.
            with gr.Column():
                audio_input = gr.Audio(
                    sources=["upload", "microphone"],
                    type="filepath",
                    label="🎵 Upload Audio File",
                )
                video_input = gr.Video(
                    label="📹 Or Upload Video File", sources=["upload", "webcam"]
                )
            # Middle column: options and action buttons.
            with gr.Column():
                file_type = gr.Dropdown(
                    ["Video", "Audio"],
                    label="File Type",
                    value="Video",
                    interactive=True,
                )
                language = gr.Dropdown(
                    ["en", "es", "fr", "de", "it", "nl", "ru", "no", "zh"],
                    label="Select Language",
                    value="en",
                    interactive=True,
                )
                lag_slider = gr.Slider(
                    minimum=0,
                    maximum=10,
                    step=1,
                    value=0,
                    label="⏱ Lag (seconds): delay the transcription by this amount of time.",
                )
                transcribe_button = gr.Button(
                    "🎬 Generate Subtitled Video", variant="primary"
                )
                download_button = gr.Button("💾 Download SRT File", variant="secondary")

            # Right column: results.
            with gr.Column():
                video_output = gr.Video(
                    label="Play Video with Subtitles", show_download_button=False
                )
                srt_file_output = gr.File(label="Download Subtitle (SRT)")

        # generate_video returns a (video path, SRT path) pair that is sent
        # to the single video output component.
        transcribe_button.click(
            fn=generate_video,
            inputs=[audio_input, video_input, file_type, language, lag_slider],
            outputs=video_output,
        )

        # Serve the SRT file written by an earlier "Generate" run.
        download_button.click(
            fn=download_srt,
            inputs=[audio_input, video_input],
            outputs=srt_file_output,
        )

    demo.launch()