"""Gradio app: transcribe audio or video with Whisper and produce a subtitled video plus an SRT file."""
import os
import gradio as gr
import torch
import whisper
from moviepy.editor import (
AudioFileClip,
ColorClip,
VideoFileClip,
concatenate_videoclips,
)
def generate_srt_file(transcription_result: dict, srt_file_path: str, lag: float = 0) -> None:
    """
    Write and save an SRT file from the transcription result.

    Args:
        transcription_result: The transcription result from the Whisper model;
            expects a "segments" list of dicts with "start", "end", "text" keys.
        srt_file_path: The path to save the SRT file.
        lag: Seconds added to every segment's start/end time to delay subtitles.
    """

    def _to_srt_timestamp(seconds: float) -> str:
        # SRT timestamp format: HH:MM:SS,mmm
        hours = int(seconds // 3600)
        minutes = int((seconds % 3600) // 60)
        secs = int(seconds % 60)
        millis = int((seconds % 1) * 1000)
        return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"

    # Explicit utf-8 so non-ASCII transcriptions survive regardless of the
    # platform's default encoding.
    with open(srt_file_path, "w", encoding="utf-8") as file:
        for i, segment in enumerate(transcription_result["segments"], start=1):
            # Shift both boundaries by the requested lag.
            start_srt = _to_srt_timestamp(segment["start"] + lag)
            end_srt = _to_srt_timestamp(segment["end"] + lag)
            file.write(f"{i}\n{start_srt} --> {end_srt}\n{segment['text']}\n\n")
def get_srt_filename(video_path: str, audio_path: str = None) -> str:
    """
    Derive the SRT filename from the input video or audio file.

    Args:
        video_path: The path to the video file (preferred when present).
        audio_path: The path to the audio file (used when no video is given).

    Returns:
        The SRT filename (basename of the source with a .srt extension).
    """
    source = video_path if video_path is not None else audio_path
    stem, _ext = os.path.splitext(os.path.basename(source))
    return stem + ".srt"
def generate_video(
    audio_path: str,
    video_path: str,
    input: str,
    language: str,
    lag: int,
    progress: gr.Progress = gr.Progress(track_tqdm=True),
) -> tuple[str, str]:
    """
    Generate a subtitled video from the input audio or video file.

    Args:
        audio_path: The path to the audio file.
        video_path: The path to the video file.
        input: The type of input file ("Audio" or "Video").
        language: The language code for transcription.
        lag: The lag time in seconds to delay the transcription.
        progress: The progress bar to show the progress of the task.

    Returns:
        The path to the generated video file and the SRT file.

    Raises:
        gr.Error: If the required input file is missing or no speech is detected.
    """
    # Validate that the file matching the selected input type was uploaded.
    if audio_path is None and video_path is None:
        raise gr.Error("Please upload an audio or video file.")
    if input == "Video" and video_path is None:
        raise gr.Error("Please upload a video file.")
    if input == "Audio" and audio_path is None:
        raise gr.Error("Please upload an audio file.")
    progress(0.0, "Checking input...")
    if input == "Video":
        # Whisper needs a plain audio track, so extract it from the video first.
        progress(0.0, "Extracting audio from video...")
        audio_path = f"./{os.path.splitext(os.path.basename(video_path))[0]}.wav"
        video = VideoFileClip(video_path)
        try:
            video.audio.write_audiofile(audio_path)
        finally:
            # Release moviepy's ffmpeg resources even if extraction fails.
            video.close()
        progress(0.1, "Audio extracted!")
    # Transcribe audio
    progress(0.1, "Transcribing audio...")
    result = MODEL.transcribe(audio_path, language=language)
    progress(0.30, "Audio transcribed!")
    # Bail out BEFORE writing an SRT file, so download_srt cannot pick up an
    # empty subtitle file for silent inputs.
    if not result["segments"]:
        raise gr.Error("No speech detected in the audio.")
    # Generate SRT file
    progress(0.30, "Generating SRT file...")
    srt_file_path = get_srt_filename(video_path, audio_path)
    generate_srt_file(result, srt_file_path, lag=lag)
    progress(0.40, "SRT file generated!")
    if input == "Video":
        if lag == 0:
            # Subtitles align with the original timeline; reuse the video as-is.
            return video_path, srt_file_path
        # Extend the original video with a black screen of duration `lag` at
        # the end so the delayed subtitles still fit inside the video.
        video = VideoFileClip(video_path)
        black_screen = ColorClip(
            size=video.size, color=(0, 0, 0), duration=lag
        ).set_fps(1)  # 1 fps is enough for a static black frame
        final_video = concatenate_videoclips([video, black_screen])
        output_video_path = "./transcribed_video.mp4"
        final_video.write_videofile(
            output_video_path, codec="libx264", audio_codec="aac"
        )
        return output_video_path, srt_file_path
    else:
        # Audio-only input: synthesize a black 720p video carrying the audio,
        # padded by `lag` seconds so delayed subtitles remain visible.
        output_video_path = "./transcribed_video.mp4"
        audio_clip = AudioFileClip(audio_path)
        duration = audio_clip.duration + lag
        video_clip = ColorClip(
            size=(1280, 720), color=(0, 0, 0), duration=duration
        ).set_fps(1)
        video_clip = video_clip.set_audio(audio_clip)
        video_clip.write_videofile(
            output_video_path, codec="libx264", audio_codec="aac"
        )
        return output_video_path, srt_file_path
def download_srt(audio_input: str, video_input: str) -> str:
    """
    Download the SRT file based on the input audio or video file.

    Args:
        audio_input: The path to the audio file.
        video_input: The path to the video file.

    Returns:
        The path to the downloaded SRT file.

    Raises:
        gr.Error: If no SRT file has been generated yet for this input.
    """
    srt_file_path = get_srt_filename(video_input, audio_input)
    # Guard clause: subtitles must have been generated before download.
    if not os.path.exists(srt_file_path):
        raise gr.Error("No SRT file found. Please generate subtitles first.")
    return srt_file_path
if __name__ == "__main__":
    # Load Whisper once at startup; GPU when available, CPU otherwise.
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    MODEL = whisper.load_model("base", device=DEVICE)
    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown(
            """
        <div style="text-align: center;">
            <h1 style="color: #4A90E2; font-size: 3em;">Audio Transcription & Subtitled Video Generator 🎥✨</h1>
            <p style="font-size: 1.2em; color: #333; max-width: 1000px; margin: auto; text-align: left;">
                Transform your audio or video files into subtitled content effortlessly! <br>
                1. Upload your audio or video file, select the language, and receive a video with synchronized subtitles. <br>
                2. You can view the subtitled video directly here or download the subtitles as an SRT file for your use.
            </p>
        </div>
        """
        )
        with gr.Row():
            # Left column: file inputs.
            with gr.Column():
                audio_input = gr.Audio(
                    sources=["upload", "microphone"],
                    type="filepath",
                    label="🎵 Upload Audio File",
                )
                video_input = gr.Video(
                    label="📹 Or Upload Video File", sources=["upload", "webcam"]
                )
            # Middle column: options and actions.
            with gr.Column():
                file_type = gr.Dropdown(
                    ["Video", "Audio"],
                    label="File Type",
                    value="Video",
                    interactive=True,
                )
                language = gr.Dropdown(
                    ["en", "es", "fr", "de", "it", "nl", "ru", "no", "zh"],
                    label="Select Language",
                    value="en",
                    interactive=True,
                )
                lag_slider = gr.Slider(
                    minimum=0,
                    maximum=10,
                    step=1,
                    value=0,
                    label="⏱ Lag (seconds): delay the transcription by this amount of time.",
                )
                transcribe_button = gr.Button(
                    "🎬 Generate Subtitled Video", variant="primary"
                )
                download_button = gr.Button("💾 Download SRT File", variant="secondary")
            # Right column: results.
            with gr.Column():
                video_output = gr.Video(
                    label="Play Video with Subtitles", show_download_button=False
                )
                srt_file_output = gr.File(label="Download Subtitle (SRT)")
        # generate_video returns (video_path, srt_path), so BOTH output
        # components must be wired up — a single output would mismatch the
        # function's 2-tuple return.
        transcribe_button.click(
            fn=generate_video,
            inputs=[audio_input, video_input, file_type, language, lag_slider],
            outputs=[video_output, srt_file_output],
        )
        download_button.click(
            fn=download_srt,
            inputs=[audio_input, video_input],
            outputs=srt_file_output,
        )
    demo.launch()