File size: 2,312 Bytes
5fd1d62
 
 
7a98cb1
5fd1d62
 
 
7a98cb1
5fd1d62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7a98cb1
 
 
5fd1d62
 
7a98cb1
 
5fd1d62
 
 
 
 
 
 
 
 
 
7a98cb1
 
 
cc9d80e
5fd1d62
7a98cb1
 
5fd1d62
 
 
 
bfe569d
 
 
5fd1d62
 
 
7a98cb1
 
 
 
 
 
 
 
5fd1d62
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import gradio as gr
import torch
import whisper
from moviepy.editor import AudioFileClip, ColorClip, concatenate_videoclips
from moviepy.video.VideoClip import TextClip


def generate_video(audio_path, language, lag):
    """Render a captioned video from an audio file and its transcription.

    The audio is transcribed with the module-level Whisper ``model``; each
    transcription segment becomes a full-frame text clip shown for the
    segment's duration. Silent stretches between segments (and after the last
    one) are filled with black frames so captions stay aligned with the audio
    track and the whole recording is included in the output.

    Args:
        audio_path: Path of the audio file to transcribe and use as the
            soundtrack.
        language: Language code passed to ``model.transcribe``.
        lag: Seconds by which the captions are delayed relative to the audio.
            Values <= 0 add no delay.

    Returns:
        Path of the written MP4 file.

    Raises:
        ValueError: If there is nothing to render (no captions, no lag and an
            audio track with no duration).
    """
    frame_size = (1280, 720)

    def black_clip(duration):
        # Filler shown whenever no caption is on screen.
        return ColorClip(frame_size, color=(0, 0, 0)).set_duration(duration)

    # Transcribe audio
    result = model.transcribe(audio_path, language=language)

    clips = []
    lead_in = lag if lag > 0 else 0
    if lead_in:
        clips.append(black_clip(lead_in))

    # concatenate_videoclips lays clips back to back and ignores any start
    # time set on them, so gaps between segments must be real filler clips.
    cursor = 0.0  # audio time (seconds) already covered by `clips`
    for segment in result["segments"]:
        start = max(segment["start"], cursor)
        end = segment["end"]
        text = segment["text"].strip()
        if end <= start or not text:
            # Nothing displayable; the span is filled with black later on.
            continue
        if start > cursor:
            clips.append(black_clip(start - cursor))
        clips.append(
            TextClip(
                text,
                fontsize=24,
                font="Arial",
                color="white",
                bg_color="black",
                size=frame_size,
            ).set_duration(end - start)
        )
        cursor = end

    audio = AudioFileClip(audio_path)
    try:
        # Pad the tail so the audio is not cut at the end of the last caption.
        tail = audio.duration - (lead_in + cursor)
        if tail > 0:
            clips.append(black_clip(tail))

        if not clips:
            raise ValueError("Nothing to render: empty transcription and audio.")

        # Concatenate clips and attach the original audio
        video = concatenate_videoclips(clips, method="compose").set_audio(audio)

        # NOTE: fixed output path; concurrent calls overwrite the same file.
        output_path = "./transcribed_video.mp4"
        video.write_videofile(
            output_path, fps=6, codec="libx264", audio_codec="aac"
        )
    finally:
        # Release the audio reader held by the AudioFileClip.
        audio.close()

    return output_path


if __name__ == "__main__":
    # Run Whisper on the GPU when PyTorch reports one, otherwise on the CPU.
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    # Kept at module scope: generate_video() reads `model` as a global.
    model = whisper.load_model("base", device=DEVICE)

    # Input widgets, in the same order as generate_video's parameters.
    audio_input = gr.Audio(
        sources=["upload", "microphone"],
        type="filepath",
        label="Audio File",
    )
    language_input = gr.Dropdown(
        ["en", "es", "fr", "de", "it", "nl", "ru", "zh"],
        label="Language",
        value="en",
    )
    lag_input = gr.Slider(
        minimum=0,
        maximum=10,
        step=1,
        value=0,
        label="Lag (seconds): delay the transcription by this amount of time.",
    )
    video_output = gr.Video(label="Play Video", show_download_button=True)

    # Assemble the Gradio interface and start serving it.
    iface = gr.Interface(
        fn=generate_video,
        inputs=[audio_input, language_input, lag_input],
        outputs=video_output,
        title="Audio Transcription Video Generator",
        description="Upload your audio file and select the language for transcription.",
    )
    iface.launch()