# subtify / app.py
# Author: Maximofn
# Commit 218960f: Improve code documentation and add docstrings
import gradio as gr
import argparse
import spaces
import os
import torch
import shutil
from time import sleep
from tqdm import tqdm
from lang_list import union_language_dict
# import pyperclip
import re
from PIL import Image
# import urllib.request
from ui_config import (
BACKGROUND_COLOR, BUTTON_COLOR, SVG_COLOR, PANEL_COLOR,
PRIMARY_TEXT_COLOR, SUBDUED_TEXT_COLOR, BACKGROUND_PRIMARY_COLOR,
BACKGROUND_SECONDARY_COLOR, PRIMARY_BODER_COLOR, BLOCK_TITLE_TEXT_COLOR,
INPUT_BACKGROUND_COLOR, INPUT_BORDER_COLOR, INPUT_PLACEHOLDER_COLOR,
ERROR_BACKGROUND_COLOR, ERROR_TEXT_COLOR, ERROR_BORDER_COLOR,
BUTTON_SECONDARY_BACKGROUND_COLOR, BUTTON_SECONDARY_BORDER_COLOR,
BUTTON_SECONDARY_TEXT_COLOR, RED, GREEN, BLUE,
html_social_media, get_html_subtify_logo_big, get_html_subtify_logo_small, html_buy_me_a_coffe
)
# from url_manager import get_youtube_thumbnail, is_valid_youtube_url, is_valid_twitch_url, is_valid_url
from slice_audio import slice_audio as slice_audio_main
from audio import get_audio_from_video
from transcribe import transcribe, get_language_dict
from diarize_library import diarize_audio
import json
NUMBER = 100
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# DEVICE = "cpu"
DOWNLOAD = True
SLICE_AUDIO = True
TRANSCRIBE_AUDIO = True
CONCATENATE_TRANSCRIPTIONS = True
TRANSLATE_TRANSCRIPTIONS = True
ADD_SUBTITLES_TO_VIDEO = True
REMOVE_FILES = True
if DEVICE == "cpu":
# Assuming we are on huggingface server
ram = 16000
factor = 1
CHUNK_SECONDS = int(ram*factor)
CHUNK_SECONDS = 30
CHUNK_OVERLAP_SECONDS = 5
print(f"RAM: {ram}, CHUNK_SECONDS: {CHUNK_SECONDS}, CHUNK_OVERLAP_SECONDS: {CHUNK_OVERLAP_SECONDS}")
else:
# Assuming we are on local machine
CHUNK_SECONDS = 30
CHUNK_OVERLAP_SECONDS = 5
YOUTUBE = "youtube"
TWITCH = "twitch"
ERROR = "error"
VIEW_OUTPUTS = True
DEBUG = True
subtify_logo = Image.open("assets/subtify_logo-scaled.png")
subtify_logo_width, subtify_logo_height = subtify_logo.size
factor = 4
new_width = subtify_logo_width // factor
new_height = subtify_logo_height // factor
html_subtify_logo_big = get_html_subtify_logo_big(new_width, new_height)
html_subtify_logo_small = get_html_subtify_logo_small(new_width, new_height)
language_dict = union_language_dict()
def remove_all_files():
"""Remove all temporary files and folders"""
if os.path.exists("audios"):
command = f"rm -r audios"
os.system(command)
if os.path.exists("chunks"):
command = f"rm -r chunks"
os.system(command)
if os.path.exists("concatenated_transcriptions"):
command = f"rm -r concatenated_transcriptions"
os.system(command)
if os.path.exists("transcriptions"):
command = f"rm -r transcriptions"
os.system(command)
if os.path.exists("translated_transcriptions"):
command = f"rm -r translated_transcriptions"
os.system(command)
if os.path.exists("videos"):
command = f"rm -r videos"
os.system(command)
if os.path.exists("vocals"):
command = f"rm -r vocals"
os.system(command)
def reset_frontend():
"""Reset all frontend elements to their default state"""
visible = False
return (
None,
gr.Image(visible=visible),
gr.Dropdown(visible=visible),
gr.Dropdown(visible=visible),
gr.Dropdown(visible=visible),
gr.Accordion(visible=visible),
gr.Button(visible=visible),
gr.Textbox(visible=visible),
gr.Textbox(visible=visible),
gr.Textbox(visible=visible),
gr.Textbox(visible=visible),
gr.Textbox(visible=visible),
gr.Textbox(visible=visible),
gr.Textbox(visible=visible),
gr.Textbox(visible=visible),
gr.Textbox(visible=visible),
gr.Video(visible=visible),
)
def show_auxiliar_block1():
"""Show auxiliary block 1 with URL checked message"""
return gr.Textbox(value="URL checked", visible=False)
def change_visibility_texboxes():
"""Change visibility of progress info textboxes"""
return (
gr.update(value="Done"), # auxiliar_block1
gr.update(visible=True), # get_audio_from_video_info
gr.update(visible=True), # merged_transcription
gr.update(visible=True), # video_sliced_progress_info
gr.update(visible=True), # video_transcribed_progress_info
gr.update(visible=True), # diarization_progress_info
gr.update(visible=True), # transcriptions_concatenated_progress_info
gr.update(visible=True), # video_translated_progress_info
gr.update(visible=True), # video_subtitled_progress_info
)
def get_audio(video_path):
"""
Extract audio from video file.
Args:
video_path (str): Path to video file
Returns:
list: Status update and audio file path
"""
print('*'*NUMBER)
print(f"Getting audio from video {video_path}")
audios_folder = "audios"
if DEBUG:
audio_file = f"{audios_folder}/download_audio.mp3"
if os.path.exists(audio_file):
return [
gr.update(value="Loaded"), # get_audio_from_video_info
gr.update(value=audio_file) # original_audio_path
]
try:
audio_path = get_audio_from_video(video_path, audios_folder)
return [
gr.update(value="Ok"), # get_audio_from_video_info
gr.update(value=audio_path) # original_audio_path
]
except Exception as e:
print(f"Error: {str(e)}")
return [
gr.update(value="Error"), # get_audio_from_video_info
gr.update(value="") # original_audio_path
]
def slice_audio(input_audio_path):
"""
Slice audio into chunks.
Args:
input_audio_path (str): Path to input audio file
"""
print('*'*NUMBER)
print(f"Slicing audio {input_audio_path} in chunks of {CHUNK_SECONDS} seconds with {CHUNK_OVERLAP_SECONDS} seconds overlap")
# Create vocals and chunks folders
print("Creating vocals and chunks folders")
folder_vocals = "vocals"
folder_chunck = "chunks"
if not os.path.exists(folder_vocals):
os.makedirs(folder_vocals)
if not os.path.exists(folder_chunck):
os.makedirs(folder_chunck)
slice_audio_main(input_audio_path, folder_chunck, CHUNK_SECONDS, CHUNK_OVERLAP_SECONDS)
return (
gr.update(value="Ok"), # video_sliced_progress_info
)
def diarize(input_audio_path, num_speakers, min_speakers, max_speakers):
"""
Perform speaker diarization on audio file.
Args:
input_audio_path (str): Path to audio file
num_speakers (int): Expected number of speakers
min_speakers (int): Minimum number of speakers
max_speakers (int): Maximum number of speakers
"""
print('*'*NUMBER)
print(f"Diarize {input_audio_path}")
# Diarization file
diarization_file = f"diarization/diarization.json"
if DEBUG:
if os.path.exists(diarization_file):
with open(diarization_file, "r") as f:
diarization = f.read()
return [
gr.update(value="Loaded"),
gr.update(value=diarization)
]
# Diarize audio
diarization = diarize_audio(input_audio_path, num_speakers, min_speakers, max_speakers, DEVICE)
# Save diarization
with open(diarization_file, "w") as f:
json.dump(diarization, f)
return [
gr.update(value="Ok"),
gr.update(value=diarization)
]
def trascribe_audio(input_audio_path, source_languaje):
print('*'*NUMBER)
print(f"Transcript {input_audio_path}")
# Transcription file
transcription_file = f"transcriptions/transcription_{source_languaje}.json"
if DEBUG:
if os.path.exists(transcription_file):
transcription = open(transcription_file, "r").read()
transcription = json.loads(transcription)
return [
gr.update(value="Loaded"),
gr.update(value=transcription)
]
# Get language dict
language_dict = get_language_dict()
# Transcribe audio file
transcription_str, transcription_dict = transcribe(input_audio_path, language_dict[source_languaje]["transcriber"], DEVICE, CHUNK_SECONDS, CHUNK_OVERLAP_SECONDS)
# Save transcription
with open(transcription_file, "w") as f:
transcription_json = json.dumps(transcription_dict)
f.write(transcription_json)
return (
gr.update(value="Ok"),
gr.update(value=transcription_dict)
)
def concatenate_transcriptions():
print('*'*NUMBER)
print("Concatenate transcriptions")
folder_concatenated = "concatenated_transcriptions"
if not os.path.exists(folder_concatenated):
os.makedirs(folder_concatenated)
chunck_file = "chunks/output_files.txt"
python_file = "concat_transcriptions.py"
command = f"python {python_file} {chunck_file} {CHUNK_SECONDS} {CHUNK_OVERLAP_SECONDS}"
os.system(command)
with open(chunck_file, 'r') as f:
files = f.read().splitlines()
for file in files:
file_name, _ = file.split(".")
_, file_name = file_name.split("/")
transcriptions_folder = "transcriptions"
transcription_extension = "srt"
command = f"rm {transcriptions_folder}/{file_name}.{transcription_extension}"
os.system(command)
audio_transcribed = "concatenated_transcriptions/download_audio.srt"
return (
gr.Textbox(value="Ok"),
gr.Textbox(value=audio_transcribed),
)
def translate_transcription(original_audio_transcribed_path, source_languaje, target_languaje):
print('*'*NUMBER)
print("Translate transcription")
folder_translated_transcriptions = "translated_transcriptions"
if not os.path.exists(folder_translated_transcriptions):
os.makedirs(folder_translated_transcriptions)
python_file = "translate_transcriptions.py"
command = f"python {python_file} {original_audio_transcribed_path} --source_languaje {source_languaje} --target_languaje {target_languaje} --device {DEVICE}"
os.system(command)
translated_transcription = f"translated_transcriptions/download_audio_{target_languaje}.srt"
transcription_file = "concatenated_transcriptions/download_audio.srt"
if os.path.exists(transcription_file):
command = f"rm {transcription_file}"
os.system(command)
return (
gr.Textbox(value="Ok"),
gr.Textbox(value=translated_transcription)
)
def add_translated_subtitles_to_video(original_video_path, original_audio_path, original_audio_translated_path):
print('*'*NUMBER)
print("Add subtitles to video")
python_file = "add_subtitles_to_video.py"
command = f"python {python_file} {original_audio_translated_path} {original_video_path} {original_audio_path}"
os.system(command)
if os.path.exists(original_video_path):
command = f"rm {original_video_path}"
os.system(command)
if os.path.exists(original_audio_path):
command = f"rm {original_audio_path}"
os.system(command)
if os.path.exists(original_audio_translated_path):
command = f"rm {original_audio_translated_path}"
os.system(command)
if os.path.exists("chunks/output_files.txt"):
command = f"rm chunks/output_files.txt"
os.system(command)
subtitled_video = "videos/download_video_with_subtitles.mp4"
visible = False
return (
gr.Video(value=subtitled_video, visible=True),
gr.Textbox(value="Ok", visible=visible),
gr.Textbox(value="Ok"),
)
def hide_textbobes_progress_info():
visible = False
return (
gr.Textbox(value="Waiting", visible=visible),
gr.Textbox(value="Waiting", visible=visible),
gr.Textbox(value="Waiting", visible=visible),
gr.Textbox(value="Waiting", visible=visible),
gr.Textbox(value="Waiting", visible=visible),
gr.Textbox(value="Waiting", visible=visible),
)
def process_uploaded_video(video_path):
# Create videos folder
videos_folder = "videos"
if not os.path.exists(videos_folder):
os.makedirs(videos_folder)
if DEBUG:
video_file = f"{videos_folder}/download_video.mp4"
if os.path.exists(video_file):
return [
gr.update(label="Video uploaded"), # video_input
gr.update(visible=True), # config_block
gr.update(value=video_file), # original_video_path
gr.update(value=html_subtify_logo_small) # html_subtify_logo_component
]
# Copy uploaded video to videos folder
new_video_path = os.path.join(videos_folder, "download_video.mp4")
shutil.copy(video_path, new_video_path)
# Return updated config block with new scale and the new video path
return [
gr.update(label="Video uploaded"), # video_input
gr.update(visible=True), # config_block
gr.update(value=new_video_path), # original_video_path
gr.update(value=html_subtify_logo_small) # html_subtify_logo_component
]
def merge_transcription_and_diarization():
"""
Merge transcription and diarization results to assign speakers to each word.
Returns:
dict: Combined transcription with speaker information
"""
print('*'*NUMBER)
print("Merge transcription and diarization")
if DEBUG:
merged_transcription_path = "merged_transcription_diarization/merged.json"
if os.path.exists(merged_transcription_path):
with open(merged_transcription_path, 'r') as f:
merged_transcription = json.load(f)
return [
gr.update(value="Loaded"),
gr.update(value=merged_transcription)
]
# Load JSON files
transcription_path = "transcriptions/transcription_English.json"
diarization_path = "diarization/diarization.json"
with open(transcription_path, 'r') as f:
transcription = json.load(f)
with open(diarization_path, 'r') as f:
diarization = json.load(f)
# Create new list for combined chunks
merged_chunks = []
# For each word in transcription
for chunk in transcription.get('chunks', []):
# Verify chunk has valid timestamps
if not (isinstance(chunk.get('start'), (int, float)) and
isinstance(chunk.get('end'), (int, float))):
continue
word_start = float(chunk['start'])
word_end = float(chunk['end'])
# Find corresponding speaker in diarization
speaker = None
for segment in diarization:
# Verify segment has valid timestamps
if not (isinstance(segment.get('start'), (int, float)) and
isinstance(segment.get('end'), (int, float))):
continue
segment_start = float(segment['start'])
segment_end = float(segment['end'])
# If word is within segment time range
if (word_start >= segment_start and word_end <= segment_end):
speaker = segment['speaker']
break
# If word is mostly within segment (>50% duration)
word_duration = word_end - word_start
overlap_start = max(word_start, segment_start)
overlap_end = min(word_end, segment_end)
overlap_duration = max(0, overlap_end - overlap_start)
if overlap_duration > word_duration * 0.5:
speaker = segment['speaker']
break
# Create new chunk with speaker information
merged_chunk = {
'start': word_start,
'end': word_end,
'text': chunk['text'],
'speaker': speaker if speaker else 'UNKNOWN'
}
merged_chunks.append(merged_chunk)
# Create final dictionary
merged_transcription = {
'text': transcription.get('text', ''),
'chunks': merged_chunks
}
# Create directory if it doesn't exist
if not os.path.exists(merged_transcription_path):
os.makedirs(merged_transcription_path)
# Save result to new directory
with open(merged_transcription_path, 'w', encoding='utf-8') as f:
json.dump(merged_transcription, f, ensure_ascii=False, indent=2)
return [
gr.update(value="Ok"),
gr.update(value=merged_transcription)
]
@spaces.GPU
def subtify():
with gr.Blocks(
theme=gr.themes.Default().set
(
body_background_fill=BACKGROUND_COLOR,
body_background_fill_dark=BACKGROUND_COLOR,
body_text_color=PRIMARY_TEXT_COLOR,
body_text_color_dark=PRIMARY_TEXT_COLOR,
body_text_color_subdued=SUBDUED_TEXT_COLOR,
body_text_color_subdued_dark=SUBDUED_TEXT_COLOR,
background_fill_primary=BACKGROUND_PRIMARY_COLOR,
background_fill_primary_dark=BACKGROUND_PRIMARY_COLOR,
background_fill_secondary=BACKGROUND_SECONDARY_COLOR,
background_fill_secondary_dark=BACKGROUND_SECONDARY_COLOR,
border_color_primary=PRIMARY_BODER_COLOR,
border_color_primary_dark=PRIMARY_BODER_COLOR,
block_background_fill=BACKGROUND_PRIMARY_COLOR,
block_background_fill_dark=BACKGROUND_PRIMARY_COLOR,
block_title_text_color=BLOCK_TITLE_TEXT_COLOR,
block_title_text_color_dark=BLOCK_TITLE_TEXT_COLOR,
input_background_fill=INPUT_BACKGROUND_COLOR,
input_background_fill_dark=INPUT_BACKGROUND_COLOR,
input_border_color=INPUT_BORDER_COLOR,
input_border_color_dark=INPUT_BORDER_COLOR,
input_placeholder_color=INPUT_PLACEHOLDER_COLOR,
input_placeholder_color_dark=INPUT_PLACEHOLDER_COLOR,
error_background_fill=ERROR_BACKGROUND_COLOR,
error_background_fill_dark=ERROR_BACKGROUND_COLOR,
error_text_color=ERROR_TEXT_COLOR,
error_text_color_dark=ERROR_TEXT_COLOR,
error_border_color=ERROR_BORDER_COLOR,
error_border_color_dark=ERROR_BORDER_COLOR,
button_secondary_background_fill=BUTTON_SECONDARY_BACKGROUND_COLOR,
button_secondary_background_fill_dark=BUTTON_SECONDARY_BACKGROUND_COLOR,
button_secondary_border_color=BUTTON_SECONDARY_BORDER_COLOR,
button_primary_background_fill_dark=BUTTON_SECONDARY_BORDER_COLOR,
button_secondary_text_color=BUTTON_SECONDARY_TEXT_COLOR,
button_secondary_text_color_dark=BUTTON_SECONDARY_TEXT_COLOR,
)
) as demo:
num_speaker = []
for i in range(100, 0, -1):
num_speaker.append(i)
# Layout
gr.HTML(html_social_media)
gr.HTML("<h1 style='text-align: center;'>Subtify</h1>")
html_subtify_logo_component = gr.HTML(html_subtify_logo_big)
# Input block, where the user can upload a video and configure the subtify process
visible = False
input_block = gr.Row(variant="panel")
with input_block:
input_video_block = gr.Row(scale=2)
with input_video_block:
video_input = gr.Video(
label="Upload video",
sources=["upload"],
scale=1,
interactive=True
)
delete_button = gr.Button(size="sm", icon="icons/delete.svg", value="clear", min_width="10px", scale=0)
config_block = gr.Column(scale=1, visible=visible)
with config_block:
with gr.Row():
source_languaje = gr.Dropdown(visible=True, label="Source languaje", show_label=True, value="English", choices=language_dict, scale=1, interactive=True, info="Language of the video")
target_languaje = gr.Dropdown(visible=True, label="Target languaje", show_label=True, value="Español", choices=language_dict, scale=1, interactive=True, info="Language to translate the subtitles")
with gr.Accordion("Advanced settings", open=True, visible=True) as Advanced_setings:
number_of_speakers = gr.Number(visible=True, label="Number of speakers", show_label=True, value=0, interactive=True, info="Number of speakers in the video, if you don't know, select 0")
min_speakers = gr.Number(visible=True, label="Min speakers", show_label=True, value=0, scale=0, interactive=True, info="Minimum number of speakers in the video")
max_speakers = gr.Number(visible=True, label="Max speakers", show_label=True, value=0, scale=0, interactive=True, info="Maximum number of speakers in the video")
subtify_button = gr.Button(size="lg", value="subtify", min_width="10px", scale=0, visible=True)
auxiliar_block1 = gr.Textbox(placeholder="", interactive=False, visible=visible)
with gr.Row():
get_audio_from_video_info = gr.Textbox(placeholder="Waiting", label="Get audio from video info", elem_id="get_audio_from_video_info", interactive=False, visible=visible)
video_transcribed_progress_info = gr.Textbox(placeholder="Waiting", label="Transcribe progress info", elem_id="video_transcribed_progress_info", interactive=False, visible=visible)
diarization_progress_info = gr.Textbox(placeholder="Waiting", label="Diarize progress info", elem_id="diarization_progress_info", interactive=False, visible=visible)
merged_transcription_progress_info = gr.Textbox(placeholder="Waiting", label="Merge transcription and diarization progress info", elem_id="merged_transcription_progress_info", interactive=False, visible=visible)
transcriptions_concatenated_progress_info = gr.Textbox(placeholder="Waiting", label="Concatenate progress info", elem_id="transcriptions_concatenated_progress_info", interactive=False, visible=visible)
video_translated_progress_info = gr.Textbox(placeholder="Waiting", label="Translate progress info", elem_id="transcription_translated_progress_info", interactive=False, visible=visible)
video_subtitled_progress_info = gr.Textbox(placeholder="Waiting", label="Video subtitle progress info", elem_id="video_subtitled_progress_info", interactive=False, visible=visible)
original_audio_path = gr.Textbox(label="Original audio path", elem_id="original_audio_path", visible=visible)
original_video_path = gr.Textbox(label="Original video path", visible=visible)
transcription = gr.Textbox(label="transcription", elem_id="transcription", visible=VIEW_OUTPUTS)
diarization = gr.Textbox(label="diarization", elem_id="diarization", visible=VIEW_OUTPUTS)
merged_transcription = gr.Textbox(label="merged_transcription", elem_id="merged_transcription", visible=VIEW_OUTPUTS)
original_audio_translated_path = gr.Textbox(label="Original audio translated", elem_id="original_audio_translated", visible=visible)
subtitled_video = gr.Video(label="Subtitled video", elem_id="subtitled_video", visible=visible, interactive=visible)
auxiliar_block3 = gr.Textbox(placeholder="Waiting", label="Auxiliar block 3", elem_id="auxiliar_block3", interactive=False, visible=visible)
gr.HTML(html_buy_me_a_coffe)
# Events
# paste_button.click(fn=paste_url_from_clipboard, outputs=url_textbox)
delete_button.click(
fn=reset_frontend,
outputs=[
video_input,
source_languaje,
target_languaje,
Advanced_setings,
number_of_speakers,
subtify_button,
auxiliar_block1,
video_transcribed_progress_info,
transcriptions_concatenated_progress_info,
video_translated_progress_info,
video_subtitled_progress_info,
subtitled_video,
]
)
video_input.change(
fn=process_uploaded_video,
inputs=[video_input],
outputs=[video_input, config_block, original_video_path, html_subtify_logo_component]
)
subtify_button.click(
fn=change_visibility_texboxes,
outputs=[auxiliar_block1, get_audio_from_video_info, merged_transcription_progress_info, video_transcribed_progress_info, diarization_progress_info, transcriptions_concatenated_progress_info, video_translated_progress_info, video_subtitled_progress_info]
)
auxiliar_block1.change(
fn=get_audio,
inputs=[original_video_path],
outputs=[get_audio_from_video_info, original_audio_path]
)
get_audio_from_video_info.change(
fn=trascribe_audio,
inputs=[original_audio_path, source_languaje],
outputs=[video_transcribed_progress_info, transcription]
)
video_transcribed_progress_info.change(
fn=diarize,
inputs=[original_audio_path, number_of_speakers, min_speakers, max_speakers],
outputs=[diarization_progress_info, diarization]
)
diarization_progress_info.change(
fn=merge_transcription_and_diarization,
outputs=[merged_transcription_progress_info, merged_transcription]
)
# transcriptions_concatenated_progress_info.change(
# fn=translate_transcription,
# inputs=[original_audio_transcribed_path, source_languaje, target_languaje],
# outputs=[video_translated_progress_info, original_audio_translated_path]
# )
# video_translated_progress_info.change(
# fn=add_translated_subtitles_to_video,
# inputs=[original_video_path, original_audio_path, original_audio_translated_path],
# outputs=[subtitled_video, video_subtitled_progress_info, auxiliar_block3]
# )
# auxiliar_block3.change(
# fn=hide_textbobes_progress_info,
# outputs=[video_sliced_progress_info, video_transcribed_progress_info, transcriptions_concatenated_progress_info, video_translated_progress_info, video_subtitled_progress_info]
# )
demo.launch()
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--no_ui", action="store_true")
parser.add_argument("--remove_all_files", action="store_true")
args = parser.parse_args()
if args.no_ui:
subtify_no_ui()
elif args.remove_all_files:
remove_all_files()
else:
subtify()