File size: 6,935 Bytes
856d805 5fd1d62 baafc0a 5fd1d62 baafc0a 856d805 baafc0a 6c05ea2 baafc0a 856d805 baafc0a 5fd1d62 baafc0a 5fd1d62 7aa414b 5fd1d62 baafc0a 856d805 baafc0a 856d805 baafc0a 5fd1d62 baafc0a 856d805 baafc0a 5fd1d62 baafc0a 5fd1d62 856d805 5fd1d62 7aa414b 7a98cb1 856d805 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 |
import os
import gradio as gr
import torch
import whisper
from moviepy.editor import (
AudioFileClip,
ColorClip,
CompositeVideoClip,
VideoFileClip,
concatenate_videoclips,
)
from moviepy.video.VideoClip import TextClip
def _format_srt_timestamp(seconds):
    """Convert a time in seconds to the SRT timestamp format ``HH:MM:SS,mmm``."""
    hours = int(seconds // 3600)
    minutes = int((seconds % 3600) // 60)
    secs = int(seconds % 60)
    millis = int((seconds % 1) * 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"


def generate_srt_file(transcription_result, srt_file_path, lag=0):
    """Write a Whisper transcription result to *srt_file_path* as an SRT file.

    Parameters
    ----------
    transcription_result : dict
        Whisper output; must contain a ``"segments"`` list of dicts with
        ``"start"`` / ``"end"`` times in seconds and a ``"text"`` string.
    srt_file_path : str
        Destination path; any existing file is overwritten.
    lag : float, optional
        Seconds to shift every subtitle forward (default 0).
    """
    # Explicit UTF-8: transcriptions are frequently non-ASCII and the
    # platform-default encoding (e.g. cp1252 on Windows) would raise
    # UnicodeEncodeError or corrupt the subtitles.
    with open(srt_file_path, "w", encoding="utf-8") as file:
        for i, segment in enumerate(transcription_result["segments"], start=1):
            # Shift both boundaries by the requested lag.
            start_srt = _format_srt_timestamp(segment["start"] + lag)
            end_srt = _format_srt_timestamp(segment["end"] + lag)
            file.write(f"{i}\n{start_srt} --> {end_srt}\n{segment['text']}\n\n")
def get_srt_filename(video_path, audio_path):
    """Derive the .srt filename from the video path, falling back to the audio path."""
    source = video_path if video_path is not None else audio_path
    base, _ext = os.path.splitext(os.path.basename(source))
    return base + ".srt"
def generate_video(
    audio_path, video_path, input, language, lag, progress=gr.Progress(track_tqdm=True)
):
    """Transcribe the uploaded media with Whisper and return a subtitled-video pair.

    Reads the module-level ``model`` (a Whisper model loaded in ``__main__``).

    Parameters:
        audio_path: filepath of the uploaded audio, or None.
        video_path: filepath of the uploaded video, or None.
        input: either "Video" or "Audio" — which upload to transcribe.
            NOTE(review): the name shadows the ``input`` builtin; kept as-is
            because it is part of the function's call interface.
        language: language code passed to Whisper (e.g. "en").
        lag: seconds to delay the subtitles; the output video is padded with
            black screen so the delayed subtitles still fit.
        progress: Gradio progress tracker (injected by Gradio).

    Returns:
        (video_file_path, srt_file_path) tuple.

    Raises:
        ValueError: when the required upload for the selected input type is missing.
        gr.Error: when the transcription produced no segments.
    """
    # Input validation: the selected file type must actually be uploaded.
    if audio_path is None and video_path is None:
        raise ValueError("Please upload an audio or video file.")
    if input == "Video" and video_path is None:
        raise ValueError("Please upload a video file.")
    if input == "Audio" and audio_path is None:
        raise ValueError("Please upload an audio file.")
    progress(0.0, "Checking input...")
    if input == "Video":
        # Extract the audio track to a WAV next to the CWD; Whisper consumes audio only.
        progress(0.0, "Extracting audio from video...")
        audio_path = f"./{os.path.splitext(os.path.basename(video_path))[0]}.wav"
        video = VideoFileClip(video_path)
        video.audio.write_audiofile(audio_path)
        video.close()
        progress(0.1, "Audio extracted!")
    # Transcribe audio with the globally loaded Whisper model.
    progress(0.1, "Transcribing audio...")
    result = model.transcribe(audio_path, language=language)
    progress(0.30, "Audio transcribed!")
    # Generate SRT file (named after the original upload, shifted by `lag`).
    progress(0.30, "Generating SRT file...")
    srt_file_path = get_srt_filename(video_path, audio_path)
    generate_srt_file(result, srt_file_path, lag=lag)
    progress(0.40, "SRT file generated!")
    # NOTE(review): this check runs after the SRT is written, so an empty
    # .srt file is left on disk when no speech was detected — confirm intended.
    if result["segments"] == []:
        raise gr.Error("No speech detected in the audio.")
    if input == "Video":
        if lag == 0:
            # No padding needed: return the original video untouched.
            return video_path, srt_file_path
        else:
            # we simply extend the original video with a black screen at the end of duration lag
            video = VideoFileClip(video_path)
            fps = video.fps
            # 1 fps is enough for a static black frame; keeps encoding fast.
            black_screen = ColorClip(
                size=video.size, color=(0, 0, 0), duration=lag
            ).set_fps(1)
            final_video = concatenate_videoclips([video, black_screen])
            output_video_path = "./transcribed_video.mp4"
            final_video.write_videofile(
                output_video_path, codec="libx264", audio_codec="aac"
            )
            return output_video_path, srt_file_path
    else:
        # Audio-only input: synthesize a 720p black video carrying the audio,
        # padded by `lag` seconds so delayed subtitles still fit.
        output_video_path = "./transcribed_video.mp4"
        audio_clip = AudioFileClip(audio_path)
        duration = audio_clip.duration + lag
        video_clip = ColorClip(
            size=(1280, 720), color=(0, 0, 0), duration=duration
        ).set_fps(1)
        video_clip = video_clip.set_audio(audio_clip)
        video_clip.write_videofile(
            output_video_path, codec="libx264", audio_codec="aac"
        )
        return output_video_path, srt_file_path
def download_srt(audio_input, video_input):
    """Return the path of the previously generated SRT file for download.

    Raises:
        gr.Error: when no SRT file exists yet (transcription not run).
    """
    srt_path = get_srt_filename(video_input, audio_input)
    if not os.path.exists(srt_path):
        raise gr.Error("No SRT file found. Please generate subtitles first.")
    return srt_path
if __name__ == "__main__":
    # Load the Whisper model once at startup; generate_video reads this global.
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    model = whisper.load_model("base", device=DEVICE)

    # Gradio Blocks implementation
    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown(
            """
            <div style="text-align: center;">
            <h1 style="color: #4A90E2; font-size: 3em;">Audio Transcription & Subtitled Video Generator π₯β¨</h1>
            <p style="font-size: 1.2em; color: #333; max-width: 1000px; margin: auto; text-align: left;">
            Transform your audio or video files into subtitled content effortlessly! <br>
            1. Upload your audio or video file, select the language, and receive a video with synchronized subtitles. <br>
            2. You can view the subtitled video directly here or download the subtitles as an SRT file for your use.
            </p>
            </div>
            """
        )
        with gr.Row():
            # Column 1: media uploads.
            with gr.Column():
                audio_input = gr.Audio(
                    sources=["upload", "microphone"],
                    type="filepath",
                    label="π΅ Upload Audio File",
                )
                video_input = gr.Video(
                    label="πΉ Or Upload Video File", sources=["upload", "webcam"]
                )
            # Column 2: settings and actions.
            with gr.Column():
                file_type = gr.Dropdown(
                    ["Video", "Audio"],
                    label="File Type",
                    value="Video",
                    interactive=True,
                )
                language = gr.Dropdown(
                    ["en", "es", "fr", "de", "it", "nl", "ru", "no", "zh"],
                    label="Select Language",
                    value="en",
                    interactive=True,
                )
                lag_slider = gr.Slider(
                    minimum=0,
                    maximum=10,
                    step=1,
                    value=0,
                    label="β± Lag (seconds): delay the transcription by this amount of time.",
                )
                transcribe_button = gr.Button(
                    "π¬ Generate Subtitled Video", variant="primary"
                )
                download_button = gr.Button("πΎ Download SRT File", variant="secondary")
            # Column 3: outputs.
            with gr.Column():
                video_output = gr.Video(
                    label="Play Video with Subtitles", show_download_button=False
                )
                srt_file_output = gr.File(label="Download Subtitle (SRT)")
        transcribe_button.click(
            fn=generate_video,
            inputs=[audio_input, video_input, file_type, language, lag_slider],
            # BUG FIX: generate_video returns (video_path, srt_file_path); the
            # original wiring listed only video_output, so the second return
            # value was never delivered to the UI.
            outputs=[video_output, srt_file_output],
        )
        download_button.click(
            fn=download_srt,
            inputs=[audio_input, video_input],
            outputs=srt_file_output,
        )
        demo.launch()
|