import gradio as gr
import numpy as np
import torch
import whisper
from moviepy.editor import AudioFileClip, CompositeVideoClip, TextClip
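
# Load the Whisper model once at startup, on the GPU when one is available.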
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
model = whisper.load_model("base", device=DEVICE)


def generate_video(audio_path, language):
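    # Whisper returns timestamped "segments": the spoken text of each
    # utterance plus its start and end times in seconds.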
    result = model.transcribe(audio_path, language=language)
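
    # Render one full-frame subtitle clip per segment, timed to the speech.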
    clips = []
    for segment in result["segments"]:
        text_clip = (
            TextClip(
                segment["text"],
                fontsize=24,
                font="Arial",
                color="white",
                bg_color="black",
                size=(1280, 720),
            )
            .set_duration(segment["end"] - segment["start"])
            .set_start(segment["start"])
        )
        clips.append(text_clip)
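
    # Composite the timed clips so each caption appears at its segment's
    # start time; concatenate_videoclips would override the start times and
    # collapse silent gaps, desynchronizing the captions from the audio.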
    audio = AudioFileClip(audio_path)
    video = CompositeVideoClip(clips, size=(1280, 720)).set_duration(audio.duration)
    video = video.set_audio(audio)
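
    # Static captions tolerate a low frame rate, which keeps encoding fast.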
    output_path = "./transcribed_video.mp4"
    video.write_videofile(output_path, fps=6, codec="libx264", audio_codec="aac")

    return output_path


if __name__ == "__main__":
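    # Report whether the loaded checkpoint is multilingual and how large it is.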
    print(
        f"Model is {'multilingual' if model.is_multilingual else 'English-only'} "
        f"and has {sum(np.prod(p.shape) for p in model.parameters()):,} parameters."
    )
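
    # Wire the pipeline into a Gradio UI: audio in, captioned video out.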
    iface = gr.Interface(
        fn=generate_video,
        inputs=[
            gr.Audio(
                sources=["upload", "microphone"], type="filepath", label="Audio File"
            ),
            gr.Dropdown(
                ["en", "es", "fr", "de", "it", "nl", "ru", "zh"],
                label="Language",
            ),
        ],
        outputs=gr.Video(label="Play Video", show_download_button=True),
        title="Audio Transcription Video Generator",
        description="Upload your audio file and select the language for transcription.",
    )

    iface.launch()