import gradio as gr import argparse import spaces import os import torch import shutil from time import sleep from tqdm import tqdm from lang_list import union_language_dict # import pyperclip import re from PIL import Image # import urllib.request from ui_config import ( BACKGROUND_COLOR, BUTTON_COLOR, SVG_COLOR, PANEL_COLOR, PRIMARY_TEXT_COLOR, SUBDUED_TEXT_COLOR, BACKGROUND_PRIMARY_COLOR, BACKGROUND_SECONDARY_COLOR, PRIMARY_BODER_COLOR, BLOCK_TITLE_TEXT_COLOR, INPUT_BACKGROUND_COLOR, INPUT_BORDER_COLOR, INPUT_PLACEHOLDER_COLOR, ERROR_BACKGROUND_COLOR, ERROR_TEXT_COLOR, ERROR_BORDER_COLOR, BUTTON_SECONDARY_BACKGROUND_COLOR, BUTTON_SECONDARY_BORDER_COLOR, BUTTON_SECONDARY_TEXT_COLOR, RED, GREEN, BLUE, html_social_media, get_html_subtify_logo_big, get_html_subtify_logo_small, html_buy_me_a_coffe ) # from url_manager import get_youtube_thumbnail, is_valid_youtube_url, is_valid_twitch_url, is_valid_url from slice_audio import slice_audio as slice_audio_main from audio import get_audio_from_video from transcribe import transcribe, get_language_dict from diarize_library import diarize_audio import json NUMBER = 100 DEVICE = "cuda" if torch.cuda.is_available() else "cpu" # DEVICE = "cpu" DOWNLOAD = True SLICE_AUDIO = True TRANSCRIBE_AUDIO = True CONCATENATE_TRANSCRIPTIONS = True TRANSLATE_TRANSCRIPTIONS = True ADD_SUBTITLES_TO_VIDEO = True REMOVE_FILES = True if DEVICE == "cpu": # Assuming we are on huggingface server ram = 16000 factor = 1 CHUNK_SECONDS = int(ram*factor) CHUNK_SECONDS = 30 CHUNK_OVERLAP_SECONDS = 5 print(f"RAM: {ram}, CHUNK_SECONDS: {CHUNK_SECONDS}, CHUNK_OVERLAP_SECONDS: {CHUNK_OVERLAP_SECONDS}") else: # Assuming we are on local machine CHUNK_SECONDS = 30 CHUNK_OVERLAP_SECONDS = 5 YOUTUBE = "youtube" TWITCH = "twitch" ERROR = "error" VIEW_OUTPUTS = True DEBUG = True subtify_logo = Image.open("assets/subtify_logo-scaled.png") subtify_logo_width, subtify_logo_height = subtify_logo.size factor = 4 new_width = subtify_logo_width // factor new_height = subtify_logo_height // factor html_subtify_logo_big = get_html_subtify_logo_big(new_width, new_height) html_subtify_logo_small = get_html_subtify_logo_small(new_width, new_height) language_dict = union_language_dict() def remove_all_files(): """Remove all temporary files and folders""" if os.path.exists("audios"): command = f"rm -r audios" os.system(command) if os.path.exists("chunks"): command = f"rm -r chunks" os.system(command) if os.path.exists("concatenated_transcriptions"): command = f"rm -r concatenated_transcriptions" os.system(command) if os.path.exists("transcriptions"): command = f"rm -r transcriptions" os.system(command) if os.path.exists("translated_transcriptions"): command = f"rm -r translated_transcriptions" os.system(command) if os.path.exists("videos"): command = f"rm -r videos" os.system(command) if os.path.exists("vocals"): command = f"rm -r vocals" os.system(command) def reset_frontend(): """Reset all frontend elements to their default state""" visible = False return ( None, gr.Image(visible=visible), gr.Dropdown(visible=visible), gr.Dropdown(visible=visible), gr.Dropdown(visible=visible), gr.Accordion(visible=visible), gr.Button(visible=visible), gr.Textbox(visible=visible), gr.Textbox(visible=visible), gr.Textbox(visible=visible), gr.Textbox(visible=visible), gr.Textbox(visible=visible), gr.Textbox(visible=visible), gr.Textbox(visible=visible), gr.Textbox(visible=visible), gr.Textbox(visible=visible), gr.Video(visible=visible), ) def show_auxiliar_block1(): """Show auxiliary block 1 with URL checked message""" return gr.Textbox(value="URL checked", visible=False) def change_visibility_texboxes(): """Change visibility of progress info textboxes""" return ( gr.update(value="Done"), # auxiliar_block1 gr.update(visible=True), # get_audio_from_video_info gr.update(visible=True), # merged_transcription gr.update(visible=True), # video_sliced_progress_info gr.update(visible=True), # video_transcribed_progress_info gr.update(visible=True), # diarization_progress_info gr.update(visible=True), # transcriptions_concatenated_progress_info gr.update(visible=True), # video_translated_progress_info gr.update(visible=True), # video_subtitled_progress_info ) def get_audio(video_path): """ Extract audio from video file. Args: video_path (str): Path to video file Returns: list: Status update and audio file path """ print('*'*NUMBER) print(f"Getting audio from video {video_path}") audios_folder = "audios" if DEBUG: audio_file = f"{audios_folder}/download_audio.mp3" if os.path.exists(audio_file): return [ gr.update(value="Loaded"), # get_audio_from_video_info gr.update(value=audio_file) # original_audio_path ] try: audio_path = get_audio_from_video(video_path, audios_folder) return [ gr.update(value="Ok"), # get_audio_from_video_info gr.update(value=audio_path) # original_audio_path ] except Exception as e: print(f"Error: {str(e)}") return [ gr.update(value="Error"), # get_audio_from_video_info gr.update(value="") # original_audio_path ] def slice_audio(input_audio_path): """ Slice audio into chunks. Args: input_audio_path (str): Path to input audio file """ print('*'*NUMBER) print(f"Slicing audio {input_audio_path} in chunks of {CHUNK_SECONDS} seconds with {CHUNK_OVERLAP_SECONDS} seconds overlap") # Create vocals and chunks folders print("Creating vocals and chunks folders") folder_vocals = "vocals" folder_chunck = "chunks" if not os.path.exists(folder_vocals): os.makedirs(folder_vocals) if not os.path.exists(folder_chunck): os.makedirs(folder_chunck) slice_audio_main(input_audio_path, folder_chunck, CHUNK_SECONDS, CHUNK_OVERLAP_SECONDS) return ( gr.update(value="Ok"), # video_sliced_progress_info ) def diarize(input_audio_path, num_speakers, min_speakers, max_speakers): """ Perform speaker diarization on audio file. Args: input_audio_path (str): Path to audio file num_speakers (int): Expected number of speakers min_speakers (int): Minimum number of speakers max_speakers (int): Maximum number of speakers """ print('*'*NUMBER) print(f"Diarize {input_audio_path}") # Diarization file diarization_file = f"diarization/diarization.json" if DEBUG: if os.path.exists(diarization_file): with open(diarization_file, "r") as f: diarization = f.read() return [ gr.update(value="Loaded"), gr.update(value=diarization) ] # Diarize audio diarization = diarize_audio(input_audio_path, num_speakers, min_speakers, max_speakers, DEVICE) # Save diarization with open(diarization_file, "w") as f: json.dump(diarization, f) return [ gr.update(value="Ok"), gr.update(value=diarization) ] def trascribe_audio(input_audio_path, source_languaje): print('*'*NUMBER) print(f"Transcript {input_audio_path}") # Transcription file transcription_file = f"transcriptions/transcription_{source_languaje}.json" if DEBUG: if os.path.exists(transcription_file): transcription = open(transcription_file, "r").read() transcription = json.loads(transcription) return [ gr.update(value="Loaded"), gr.update(value=transcription) ] # Get language dict language_dict = get_language_dict() # Transcribe audio file transcription_str, transcription_dict = transcribe(input_audio_path, language_dict[source_languaje]["transcriber"], DEVICE, CHUNK_SECONDS, CHUNK_OVERLAP_SECONDS) # Save transcription with open(transcription_file, "w") as f: transcription_json = json.dumps(transcription_dict) f.write(transcription_json) return ( gr.update(value="Ok"), gr.update(value=transcription_dict) ) def concatenate_transcriptions(): print('*'*NUMBER) print("Concatenate transcriptions") folder_concatenated = "concatenated_transcriptions" if not os.path.exists(folder_concatenated): os.makedirs(folder_concatenated) chunck_file = "chunks/output_files.txt" python_file = "concat_transcriptions.py" command = f"python {python_file} {chunck_file} {CHUNK_SECONDS} {CHUNK_OVERLAP_SECONDS}" os.system(command) with open(chunck_file, 'r') as f: files = f.read().splitlines() for file in files: file_name, _ = file.split(".") _, file_name = file_name.split("/") transcriptions_folder = "transcriptions" transcription_extension = "srt" command = f"rm {transcriptions_folder}/{file_name}.{transcription_extension}" os.system(command) audio_transcribed = "concatenated_transcriptions/download_audio.srt" return ( gr.Textbox(value="Ok"), gr.Textbox(value=audio_transcribed), ) def translate_transcription(original_audio_transcribed_path, source_languaje, target_languaje): print('*'*NUMBER) print("Translate transcription") folder_translated_transcriptions = "translated_transcriptions" if not os.path.exists(folder_translated_transcriptions): os.makedirs(folder_translated_transcriptions) python_file = "translate_transcriptions.py" command = f"python {python_file} {original_audio_transcribed_path} --source_languaje {source_languaje} --target_languaje {target_languaje} --device {DEVICE}" os.system(command) translated_transcription = f"translated_transcriptions/download_audio_{target_languaje}.srt" transcription_file = "concatenated_transcriptions/download_audio.srt" if os.path.exists(transcription_file): command = f"rm {transcription_file}" os.system(command) return ( gr.Textbox(value="Ok"), gr.Textbox(value=translated_transcription) ) def add_translated_subtitles_to_video(original_video_path, original_audio_path, original_audio_translated_path): print('*'*NUMBER) print("Add subtitles to video") python_file = "add_subtitles_to_video.py" command = f"python {python_file} {original_audio_translated_path} {original_video_path} {original_audio_path}" os.system(command) if os.path.exists(original_video_path): command = f"rm {original_video_path}" os.system(command) if os.path.exists(original_audio_path): command = f"rm {original_audio_path}" os.system(command) if os.path.exists(original_audio_translated_path): command = f"rm {original_audio_translated_path}" os.system(command) if os.path.exists("chunks/output_files.txt"): command = f"rm chunks/output_files.txt" os.system(command) subtitled_video = "videos/download_video_with_subtitles.mp4" visible = False return ( gr.Video(value=subtitled_video, visible=True), gr.Textbox(value="Ok", visible=visible), gr.Textbox(value="Ok"), ) def hide_textbobes_progress_info(): visible = False return ( gr.Textbox(value="Waiting", visible=visible), gr.Textbox(value="Waiting", visible=visible), gr.Textbox(value="Waiting", visible=visible), gr.Textbox(value="Waiting", visible=visible), gr.Textbox(value="Waiting", visible=visible), gr.Textbox(value="Waiting", visible=visible), ) def process_uploaded_video(video_path): # Create videos folder videos_folder = "videos" if not os.path.exists(videos_folder): os.makedirs(videos_folder) if DEBUG: video_file = f"{videos_folder}/download_video.mp4" if os.path.exists(video_file): return [ gr.update(label="Video uploaded"), # video_input gr.update(visible=True), # config_block gr.update(value=video_file), # original_video_path gr.update(value=html_subtify_logo_small) # html_subtify_logo_component ] # Copy uploaded video to videos folder new_video_path = os.path.join(videos_folder, "download_video.mp4") shutil.copy(video_path, new_video_path) # Return updated config block with new scale and the new video path return [ gr.update(label="Video uploaded"), # video_input gr.update(visible=True), # config_block gr.update(value=new_video_path), # original_video_path gr.update(value=html_subtify_logo_small) # html_subtify_logo_component ] def merge_transcription_and_diarization(): """ Merge transcription and diarization results to assign speakers to each word. Returns: dict: Combined transcription with speaker information """ print('*'*NUMBER) print("Merge transcription and diarization") if DEBUG: merged_transcription_path = "merged_transcription_diarization/merged.json" if os.path.exists(merged_transcription_path): with open(merged_transcription_path, 'r') as f: merged_transcription = json.load(f) return [ gr.update(value="Loaded"), gr.update(value=merged_transcription) ] # Load JSON files transcription_path = "transcriptions/transcription_English.json" diarization_path = "diarization/diarization.json" with open(transcription_path, 'r') as f: transcription = json.load(f) with open(diarization_path, 'r') as f: diarization = json.load(f) # Create new list for combined chunks merged_chunks = [] # For each word in transcription for chunk in transcription.get('chunks', []): # Verify chunk has valid timestamps if not (isinstance(chunk.get('start'), (int, float)) and isinstance(chunk.get('end'), (int, float))): continue word_start = float(chunk['start']) word_end = float(chunk['end']) # Find corresponding speaker in diarization speaker = None for segment in diarization: # Verify segment has valid timestamps if not (isinstance(segment.get('start'), (int, float)) and isinstance(segment.get('end'), (int, float))): continue segment_start = float(segment['start']) segment_end = float(segment['end']) # If word is within segment time range if (word_start >= segment_start and word_end <= segment_end): speaker = segment['speaker'] break # If word is mostly within segment (>50% duration) word_duration = word_end - word_start overlap_start = max(word_start, segment_start) overlap_end = min(word_end, segment_end) overlap_duration = max(0, overlap_end - overlap_start) if overlap_duration > word_duration * 0.5: speaker = segment['speaker'] break # Create new chunk with speaker information merged_chunk = { 'start': word_start, 'end': word_end, 'text': chunk['text'], 'speaker': speaker if speaker else 'UNKNOWN' } merged_chunks.append(merged_chunk) # Create final dictionary merged_transcription = { 'text': transcription.get('text', ''), 'chunks': merged_chunks } # Create directory if it doesn't exist if not os.path.exists(merged_transcription_path): os.makedirs(merged_transcription_path) # Save result to new directory with open(merged_transcription_path, 'w', encoding='utf-8') as f: json.dump(merged_transcription, f, ensure_ascii=False, indent=2) return [ gr.update(value="Ok"), gr.update(value=merged_transcription) ] @spaces.GPU def subtify(): with gr.Blocks( theme=gr.themes.Default().set ( body_background_fill=BACKGROUND_COLOR, body_background_fill_dark=BACKGROUND_COLOR, body_text_color=PRIMARY_TEXT_COLOR, body_text_color_dark=PRIMARY_TEXT_COLOR, body_text_color_subdued=SUBDUED_TEXT_COLOR, body_text_color_subdued_dark=SUBDUED_TEXT_COLOR, background_fill_primary=BACKGROUND_PRIMARY_COLOR, background_fill_primary_dark=BACKGROUND_PRIMARY_COLOR, background_fill_secondary=BACKGROUND_SECONDARY_COLOR, background_fill_secondary_dark=BACKGROUND_SECONDARY_COLOR, border_color_primary=PRIMARY_BODER_COLOR, border_color_primary_dark=PRIMARY_BODER_COLOR, block_background_fill=BACKGROUND_PRIMARY_COLOR, block_background_fill_dark=BACKGROUND_PRIMARY_COLOR, block_title_text_color=BLOCK_TITLE_TEXT_COLOR, block_title_text_color_dark=BLOCK_TITLE_TEXT_COLOR, input_background_fill=INPUT_BACKGROUND_COLOR, input_background_fill_dark=INPUT_BACKGROUND_COLOR, input_border_color=INPUT_BORDER_COLOR, input_border_color_dark=INPUT_BORDER_COLOR, input_placeholder_color=INPUT_PLACEHOLDER_COLOR, input_placeholder_color_dark=INPUT_PLACEHOLDER_COLOR, error_background_fill=ERROR_BACKGROUND_COLOR, error_background_fill_dark=ERROR_BACKGROUND_COLOR, error_text_color=ERROR_TEXT_COLOR, error_text_color_dark=ERROR_TEXT_COLOR, error_border_color=ERROR_BORDER_COLOR, error_border_color_dark=ERROR_BORDER_COLOR, button_secondary_background_fill=BUTTON_SECONDARY_BACKGROUND_COLOR, button_secondary_background_fill_dark=BUTTON_SECONDARY_BACKGROUND_COLOR, button_secondary_border_color=BUTTON_SECONDARY_BORDER_COLOR, button_primary_background_fill_dark=BUTTON_SECONDARY_BORDER_COLOR, button_secondary_text_color=BUTTON_SECONDARY_TEXT_COLOR, button_secondary_text_color_dark=BUTTON_SECONDARY_TEXT_COLOR, ) ) as demo: num_speaker = [] for i in range(100, 0, -1): num_speaker.append(i) # Layout gr.HTML(html_social_media) gr.HTML("

Subtify

") html_subtify_logo_component = gr.HTML(html_subtify_logo_big) # Input block, where the user can upload a video and configure the subtify process visible = False input_block = gr.Row(variant="panel") with input_block: input_video_block = gr.Row(scale=2) with input_video_block: video_input = gr.Video( label="Upload video", sources=["upload"], scale=1, interactive=True ) delete_button = gr.Button(size="sm", icon="icons/delete.svg", value="clear", min_width="10px", scale=0) config_block = gr.Column(scale=1, visible=visible) with config_block: with gr.Row(): source_languaje = gr.Dropdown(visible=True, label="Source languaje", show_label=True, value="English", choices=language_dict, scale=1, interactive=True, info="Language of the video") target_languaje = gr.Dropdown(visible=True, label="Target languaje", show_label=True, value="EspaƱol", choices=language_dict, scale=1, interactive=True, info="Language to translate the subtitles") with gr.Accordion("Advanced settings", open=True, visible=True) as Advanced_setings: number_of_speakers = gr.Number(visible=True, label="Number of speakers", show_label=True, value=0, interactive=True, info="Number of speakers in the video, if you don't know, select 0") min_speakers = gr.Number(visible=True, label="Min speakers", show_label=True, value=0, scale=0, interactive=True, info="Minimum number of speakers in the video") max_speakers = gr.Number(visible=True, label="Max speakers", show_label=True, value=0, scale=0, interactive=True, info="Maximum number of speakers in the video") subtify_button = gr.Button(size="lg", value="subtify", min_width="10px", scale=0, visible=True) auxiliar_block1 = gr.Textbox(placeholder="", interactive=False, visible=visible) with gr.Row(): get_audio_from_video_info = gr.Textbox(placeholder="Waiting", label="Get audio from video info", elem_id="get_audio_from_video_info", interactive=False, visible=visible) video_transcribed_progress_info = gr.Textbox(placeholder="Waiting", label="Transcribe progress info", elem_id="video_transcribed_progress_info", interactive=False, visible=visible) diarization_progress_info = gr.Textbox(placeholder="Waiting", label="Diarize progress info", elem_id="diarization_progress_info", interactive=False, visible=visible) merged_transcription_progress_info = gr.Textbox(placeholder="Waiting", label="Merge transcription and diarization progress info", elem_id="merged_transcription_progress_info", interactive=False, visible=visible) transcriptions_concatenated_progress_info = gr.Textbox(placeholder="Waiting", label="Concatenate progress info", elem_id="transcriptions_concatenated_progress_info", interactive=False, visible=visible) video_translated_progress_info = gr.Textbox(placeholder="Waiting", label="Translate progress info", elem_id="transcription_translated_progress_info", interactive=False, visible=visible) video_subtitled_progress_info = gr.Textbox(placeholder="Waiting", label="Video subtitle progress info", elem_id="video_subtitled_progress_info", interactive=False, visible=visible) original_audio_path = gr.Textbox(label="Original audio path", elem_id="original_audio_path", visible=visible) original_video_path = gr.Textbox(label="Original video path", visible=visible) transcription = gr.Textbox(label="transcription", elem_id="transcription", visible=VIEW_OUTPUTS) diarization = gr.Textbox(label="diarization", elem_id="diarization", visible=VIEW_OUTPUTS) merged_transcription = gr.Textbox(label="merged_transcription", elem_id="merged_transcription", visible=VIEW_OUTPUTS) original_audio_translated_path = gr.Textbox(label="Original audio translated", elem_id="original_audio_translated", visible=visible) subtitled_video = gr.Video(label="Subtitled video", elem_id="subtitled_video", visible=visible, interactive=visible) auxiliar_block3 = gr.Textbox(placeholder="Waiting", label="Auxiliar block 3", elem_id="auxiliar_block3", interactive=False, visible=visible) gr.HTML(html_buy_me_a_coffe) # Events # paste_button.click(fn=paste_url_from_clipboard, outputs=url_textbox) delete_button.click( fn=reset_frontend, outputs=[ video_input, source_languaje, target_languaje, Advanced_setings, number_of_speakers, subtify_button, auxiliar_block1, video_transcribed_progress_info, transcriptions_concatenated_progress_info, video_translated_progress_info, video_subtitled_progress_info, subtitled_video, ] ) video_input.change( fn=process_uploaded_video, inputs=[video_input], outputs=[video_input, config_block, original_video_path, html_subtify_logo_component] ) subtify_button.click( fn=change_visibility_texboxes, outputs=[auxiliar_block1, get_audio_from_video_info, merged_transcription_progress_info, video_transcribed_progress_info, diarization_progress_info, transcriptions_concatenated_progress_info, video_translated_progress_info, video_subtitled_progress_info] ) auxiliar_block1.change( fn=get_audio, inputs=[original_video_path], outputs=[get_audio_from_video_info, original_audio_path] ) get_audio_from_video_info.change( fn=trascribe_audio, inputs=[original_audio_path, source_languaje], outputs=[video_transcribed_progress_info, transcription] ) video_transcribed_progress_info.change( fn=diarize, inputs=[original_audio_path, number_of_speakers, min_speakers, max_speakers], outputs=[diarization_progress_info, diarization] ) diarization_progress_info.change( fn=merge_transcription_and_diarization, outputs=[merged_transcription_progress_info, merged_transcription] ) # transcriptions_concatenated_progress_info.change( # fn=translate_transcription, # inputs=[original_audio_transcribed_path, source_languaje, target_languaje], # outputs=[video_translated_progress_info, original_audio_translated_path] # ) # video_translated_progress_info.change( # fn=add_translated_subtitles_to_video, # inputs=[original_video_path, original_audio_path, original_audio_translated_path], # outputs=[subtitled_video, video_subtitled_progress_info, auxiliar_block3] # ) # auxiliar_block3.change( # fn=hide_textbobes_progress_info, # outputs=[video_sliced_progress_info, video_transcribed_progress_info, transcriptions_concatenated_progress_info, video_translated_progress_info, video_subtitled_progress_info] # ) demo.launch() if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--no_ui", action="store_true") parser.add_argument("--remove_all_files", action="store_true") args = parser.parse_args() if args.no_ui: subtify_no_ui() elif args.remove_all_files: remove_all_files() else: subtify()