Spaces:

Maximofn
/

subtify

Build error

Maximofn committed on Feb 2

Commit

e015c08

1 Parent(s): 85c53a9

Refactor project structure and update dependencies

- Modularize code by creating separate files for audio, transcription, and UI configuration
- Update requirements.txt to remove unnecessary dependencies
- Simplify audio extraction and transcription processes
- Modify app.py to use new modular structure
- Add new utility files like ui_config.py and audio.py
- Update .gitignore to include __pycache__

Files changed (8) hide show

.gitignore +2 -1
app.py +137 -506
audio.py +46 -0
requirements.txt +21 -10
slice_audio.py +53 -33
transcribe.py +77 -40
ui_config.py +57 -0
url_manager.py +87 -0

.gitignore CHANGED Viewed

@@ -14,4 +14,5 @@ sepformer.ipynb
 modelscope.ipynb
 audio_cache
 *.png
-.DS_Store

 modelscope.ipynb
 audio_cache
 *.png
+.DS_Store
+__pycache__

app.py CHANGED Viewed

@@ -3,14 +3,28 @@ import argparse
 import spaces
 import os
 import torch
 from time import sleep
 from tqdm import tqdm
 from lang_list import union_language_dict
 # import pyperclip
-from pytube import YouTube
 import re
 from PIL import Image
 # import urllib.request
 NUMBER = 100
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
@@ -25,317 +39,33 @@ REMOVE_FILES = True
 if DEVICE == "cpu":
     # I supose that I am on huggingface server
     # Get RAM space
-    ram = int(os.popen("free -m | grep Mem | awk '{print $2}'").read())
     factor = 1
-    SECONDS = int(ram*factor)
-    print(f"RAM: {ram}, SECONDS: {SECONDS}")
 else:
     # I supose that I am on my computer
     # Get VRAM space
-    SECONDS = 300
 YOUTUBE = "youtube"
 TWITCH = "twitch"
 ERROR = "error"
-subtify_logo = Image.open("./assets/subtify_logo-scaled.png")
 subtify_logo_width, subtify_logo_height = subtify_logo.size
 factor = 4
 new_width = subtify_logo_width // factor
 new_height = subtify_logo_height // factor
-BACKGROUND_COLOR = "#0b0f19"
-BUTTON_COLOR = "#47515f"
-SVG_COLOR = "#f3f4f6"
-PANEL_COLOR = "#101827"
-PRIMARY_TEXT_COLOR = "#f3f4f6"
-SUBDUED_TEXT_COLOR = "#59616f"
-BACKGROUND_PRIMARY_COLOR = "#1f2937"
-BACKGROUND_SECONDARY_COLOR = "#101827"
-PRIMARY_BODER_COLOR = "#323c4c"
-BLOCK_TITLE_TEXT_COLOR = "#dfe2e6"
-INPUT_BACKGROUND_COLOR = "#2f3947"
-INPUT_BORDER_COLOR = "#313b4b"
-INPUT_PLACEHOLDER_COLOR = "#616977"
-ERROR_BACKGROUND_COLOR = "#101827"
-ERROR_TEXT_COLOR = "#f7f2f2"
-ERROR_BORDER_COLOR = "#9b3339"
-BUTTON_SECONDARY_BACKGROUND_COLOR = "#434d5c"
-BUTTON_SECONDARY_BORDER_COLOR = "#444d5b"
-BUTTON_SECONDARY_TEXT_COLOR = "#c5c9cc"
-RED = "#ff0000"
-GREEN = "#00ff00"
-BLUE = "#0000ff"
-html_social_media = f'''
-<div style="float: right;">
-    <a href="https://maximofn.com/" rel="noopener noreferrer" aria-disabled="false" class="sm secondary  svelte-cmf5ev" id="component-1" style="flex-grow: 100;" target="_blank">
-        <svg xmlns="http://www.w3.org/2000/svg" height="1em" viewBox="0 0 576 512">
-            <style>
-                svg {"{"}
-                    fill: {SVG_COLOR}
-                {"}"}
-            </style>
-            <path d="M208 80c0-26.5 21.5-48 48-48h64c26.5 0 48 21.5 48 48v64c0 26.5-21.5 48-48 48h-8v40H464c30.9 0 56 25.1 56 56v32h8c26.5 0 48 21.5 48 48v64c0 26.5-21.5 48-48 48H464c-26.5 0-48-21.5-48-48V368c0-26.5 21.5-48 48-48h8V288c0-4.4-3.6-8-8-8H312v40h8c26.5 0 48 21.5 48 48v64c0 26.5-21.5 48-48 48H256c-26.5 0-48-21.5-48-48V368c0-26.5 21.5-48 48-48h8V280H112c-4.4 0-8 3.6-8 8v32h8c26.5 0 48 21.5 48 48v64c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V368c0-26.5 21.5-48 48-48h8V288c0-30.9 25.1-56 56-56H264V192h-8c-26.5 0-48-21.5-48-48V80z"/>
-        </svg>
-    </a>
-    <a href="http://github.com/maximofn" rel="noopener noreferrer" aria-disabled="false" class="sm secondary  svelte-cmf5ev" id="component-1" style="flex-grow: 100;" target="_blank">
-        <svg xmlns="http://www.w3.org/2000/svg" height="1em" viewBox="0 0 496 512">
-            <style>
-                svg {"{"}
-                    fill: {SVG_COLOR}
-                {"}"}
-            </style>
-            <path d="M165.9 397.4c0 2-2.3 3.6-5.2 3.6-3.3.3-5.6-1.3-5.6-3.6 0-2 2.3-3.6 5.2-3.6 3-.3 5.6 1.3 5.6 3.6zm-31.1-4.5c-.7 2 1.3 4.3 4.3 4.9 2.6 1 5.6 0 6.2-2s-1.3-4.3-4.3-5.2c-2.6-.7-5.5.3-6.2 2.3zm44.2-1.7c-2.9.7-4.9 2.6-4.6 4.9.3 2 2.9 3.3 5.9 2.6 2.9-.7 4.9-2.6 4.6-4.6-.3-1.9-3-3.2-5.9-2.9zM244.8 8C106.1 8 0 113.3 0 252c0 110.9 69.8 205.8 169.5 239.2 12.8 2.3 17.3-5.6 17.3-12.1 0-6.2-.3-40.4-.3-61.4 0 0-70 15-84.7-29.8 0 0-11.4-29.1-27.8-36.6 0 0-22.9-15.7 1.6-15.4 0 0 24.9 2 38.6 25.8 21.9 38.6 58.6 27.5 72.9 20.9 2.3-16 8.8-27.1 16-33.7-55.9-6.2-112.3-14.3-112.3-110.5 0-27.5 7.6-41.3 23.6-58.9-2.6-6.5-11.1-33.3 2.6-67.9 20.9-6.5 69 27 69 27 20-5.6 41.5-8.5 62.8-8.5s42.8 2.9 62.8 8.5c0 0 48.1-33.6 69-27 13.7 34.7 5.2 61.4 2.6 67.9 16 17.7 25.8 31.5 25.8 58.9 0 96.5-58.9 104.2-114.8 110.5 9.2 7.9 17 22.9 17 46.4 0 33.7-.3 75.4-.3 83.6 0 6.5 4.6 14.4 17.3 12.1C428.2 457.8 496 362.9 496 252 496 113.3 383.5 8 244.8 8zM97.2 352.9c-1.3 1-1 3.3.7 5.2 1.6 1.6 3.9 2.3 5.2 1 1.3-1 1-3.3-.7-5.2-1.6-1.6-3.9-2.3-5.2-1zm-10.8-8.1c-.7 1.3.3 2.9 2.3 3.9 1.6 1 3.6.7 4.3-.7.7-1.3-.3-2.9-2.3-3.9-2-.6-3.6-.3-4.3.7zm32.4 35.6c-1.6 1.3-1 4.3 1.3 6.2 2.3 2.3 5.2 2.6 6.5 1 1.3-1.3.7-4.3-1.3-6.2-2.2-2.3-5.2-2.6-6.5-1zm-11.4-14.7c-1.6 1-1.6 3.6 0 5.9 1.6 2.3 4.3 3.3 5.6 2.3 1.6-1.3 1.6-3.9 0-6.2-1.4-2.3-4-3.3-5.6-2z"/>
-        </svg>
-    </a>
-    <a href="http://linkedin.com/in/MaximoFN/" rel="noopener noreferrer" aria-disabled="false" class="sm secondary  svelte-cmf5ev" id="component-1" style="flex-grow: 100;" target="_blank">
-        <svg xmlns="http://www.w3.org/2000/svg" height="1em" viewBox="0 0 448 512">
-            <style>
-                svg {"{"}
-                    fill: {SVG_COLOR}
-                {"}"}
-            </style>
-            <path d="M416 32H31.9C14.3 32 0 46.5 0 64.3v383.4C0 465.5 14.3 480 31.9 480H416c17.6 0 32-14.5 32-32.3V64.3c0-17.8-14.4-32.3-32-32.3zM135.4 416H69V202.2h66.5V416zm-33.2-243c-21.3 0-38.5-17.3-38.5-38.5S80.9 96 102.2 96c21.2 0 38.5 17.3 38.5 38.5 0 21.3-17.2 38.5-38.5 38.5zm282.1 243h-66.4V312c0-24.8-.5-56.7-34.5-56.7-34.6 0-39.9 27-39.9 54.9V416h-66.4V202.2h63.7v29.2h.9c8.9-16.8 30.6-34.5 62.9-34.5 67.2 0 79.7 44.3 79.7 101.9V416z"/>
-        </svg>
-    </a>
-    <a href="http://kaggle.com/maximofn" rel="noopener noreferrer" aria-disabled="false" class="sm secondary  svelte-cmf5ev" id="component-1" style="flex-grow: 100;" target="_blank">
-        <svg xmlns="http://www.w3.org/2000/svg" height="1em" viewBox="0 0 320 512">
-            <style>
-                svg {"{"}
-                    fill: {SVG_COLOR}
-                {"}"}
-            </style>
-            <path d="M304.2 501.5L158.4 320.3 298.2 185c2.6-2.7 1.7-10.5-5.3-10.5h-69.2c-3.5 0-7 1.8-10.5 5.3L80.9 313.5V7.5q0-7.5-7.5-7.5H21.5Q14 0 14 7.5v497q0 7.5 7.5 7.5h51.9q7.5 0 7.5-7.5v-109l30.8-29.3 110.5 140.6c3 3.5 6.5 5.3 10.5 5.3h66.9q5.25 0 6-3z"/>
-        </svg>
-    </a>
-    <a href="https://twitter.com/Maximo_fn" rel="noopener noreferrer" aria-disabled="false" class="sm secondary  svelte-cmf5ev" id="component-1" style="flex-grow: 100;" target="_blank">
-        <svg xmlns="http://www.w3.org/2000/svg" height="1em" viewBox="0 0 512 512">
-            <style>
-                svg {"{"}
-                    fill: {SVG_COLOR}
-                {"}"}
-            </style>
-            <path d="M389.2 48h70.6L305.6 224.2 487 464H345L233.7 318.6 106.5 464H35.8L200.7 275.5 26.8 48H172.4L272.9 180.9 389.2 48zM364.4 421.8h39.1L151.1 88h-42L364.4 421.8z"/>
-        </svg>
-    </a>
-    <a href="https://www.instagram.com/maximo__fn/" rel="noopener noreferrer" aria-disabled="false" class="sm secondary  svelte-cmf5ev" id="component-1" style="flex-grow: 100;" target="_blank">
-        <svg xmlns="http://www.w3.org/2000/svg" height="1em" viewBox="0 0 448 512">
-            <style>
-                svg {"{"}
-                    fill: {SVG_COLOR}
-                {"}"}
-            </style>
-            <path d="M224.1 141c-63.6 0-114.9 51.3-114.9 114.9s51.3 114.9 114.9 114.9S339 319.5 339 255.9 287.7 141 224.1 141zm0 189.6c-41.1 0-74.7-33.5-74.7-74.7s33.5-74.7 74.7-74.7 74.7 33.5 74.7 74.7-33.6 74.7-74.7 74.7zm146.4-194.3c0 14.9-12 26.8-26.8 26.8-14.9 0-26.8-12-26.8-26.8s12-26.8 26.8-26.8 26.8 12 26.8 26.8zm76.1 27.2c-1.7-35.9-9.9-67.7-36.2-93.9-26.2-26.2-58-34.4-93.9-36.2-37-2.1-147.9-2.1-184.9 0-35.8 1.7-67.6 9.9-93.9 36.1s-34.4 58-36.2 93.9c-2.1 37-2.1 147.9 0 184.9 1.7 35.9 9.9 67.7 36.2 93.9s58 34.4 93.9 36.2c37 2.1 147.9 2.1 184.9 0 35.9-1.7 67.7-9.9 93.9-36.2 26.2-26.2 34.4-58 36.2-93.9 2.1-37 2.1-147.8 0-184.8zM398.8 388c-7.8 19.6-22.9 34.7-42.6 42.6-29.5 11.7-99.5 9-132.1 9s-102.7 2.6-132.1-9c-19.6-7.8-34.7-22.9-42.6-42.6-11.7-29.5-9-99.5-9-132.1s-2.6-102.7 9-132.1c7.8-19.6 22.9-34.7 42.6-42.6 29.5-11.7 99.5-9 132.1-9s102.7-2.6 132.1 9c19.6 7.8 34.7 22.9 42.6 42.6 11.7 29.5 9 99.5 9 132.1s2.7 102.7-9 132.1z"/>
-        </svg>
-    </a>
-    <a href="https://www.youtube.com/channel/UCdQwg2JU_fWRsHn3yIlf3tw" rel="noopener noreferrer" aria-disabled="false" class="sm secondary  svelte-cmf5ev" id="component-1" style="flex-grow: 100;" target="_blank">
-        <svg xmlns="http://www.w3.org/2000/svg" height="1em" viewBox="0 0 576 512">
-            <style>
-                svg {"{"}
-                    fill: {SVG_COLOR}
-                {"}"}
-            </style>
-            <path d="M549.655 124.083c-6.281-23.65-24.787-42.276-48.284-48.597C458.781 64 288 64 288 64S117.22 64 74.629 75.486c-23.497 6.322-42.003 24.947-48.284 48.597-11.412 42.867-11.412 132.305-11.412 132.305s0 89.438 11.412 132.305c6.281 23.65 24.787 41.5 48.284 47.821C117.22 448 288 448 288 448s170.78 0 213.371-11.486c23.497-6.321 42.003-24.171 48.284-47.821 11.412-42.867 11.412-132.305 11.412-132.305s0-89.438-11.412-132.305zm-317.51 213.508V175.185l142.739 81.205-142.739 81.201z"/>
-        </svg>
-    </a>
-    <a href="https://www.facebook.com/profile.php?id=100085177670661" rel="noopener noreferrer" aria-disabled="false" class="sm secondary  svelte-cmf5ev" id="component-1" style="flex-grow: 100;" target="_blank">
-        <svg xmlns="http://www.w3.org/2000/svg" height="1em" viewBox="0 0 512 512">
-            <style>
-                svg {"{"}
-                    fill: {SVG_COLOR}
-                {"}"}
-            </style>
-            <path d="M504 256C504 119 393 8 256 8S8 119 8 256c0 123.78 90.69 226.38 209.25 245V327.69h-63V256h63v-54.64c0-62.15 37-96.48 93.67-96.48 27.14 0 55.52 4.84 55.52 4.84v61h-31.28c-30.8 0-40.41 19.12-40.41 38.73V256h68.78l-11 71.69h-57.78V501C413.31 482.38 504 379.78 504 256z"/>
-        </svg>
-    </a>
-    <a href="https://www.tiktok.com/@maximo__fn" rel="noopener noreferrer" aria-disabled="false" class="sm secondary  svelte-cmf5ev" id="component-1" style="flex-grow: 100;" target="_blank">
-        <svg xmlns="http://www.w3.org/2000/svg" height="1em" viewBox="0 0 448 512">
-            <style>
-                svg {"{"}
-                    fill: {SVG_COLOR}
-                {"}"}
-            </style>
-            <path d="M448,209.91a210.06,210.06,0,0,1-122.77-39.25V349.38A162.55,162.55,0,1,1,185,188.31V278.2a74.62,74.62,0,1,0,52.23,71.18V0l88,0a121.18,121.18,0,0,0,1.86,22.17h0A122.18,122.18,0,0,0,381,102.39a121.43,121.43,0,0,0,67,20.14Z"/>
-        </svg>
-    </a>
-    <a href="https://www.twitch.tv/maximofn/" rel="noopener noreferrer" aria-disabled="false" class="sm secondary  svelte-cmf5ev" id="component-1" style="flex-grow: 100;" target="_blank">
-        <svg xmlns="http://www.w3.org/2000/svg" height="1em" viewBox="0 0 512 512">
-            <style>
-                svg {"{"}
-                    fill: {SVG_COLOR}
-                {"}"}
-            </style>
-            <path d="M391.17,103.47H352.54v109.7h38.63ZM285,103H246.37V212.75H285ZM120.83,0,24.31,91.42V420.58H140.14V512l96.53-91.42h77.25L487.69,256V0ZM449.07,237.75l-77.22,73.12H294.61l-67.6,64v-64H140.14V36.58H449.07Z"/>
-        </svg>
-    </a>
-</div>
-'''
-html_subtify_logo = f"""
-<div style="display: flex; justify-content: center; align-items: center;">
-    <img src='https://pub-fb664c455eca46a2ba762a065ac900f7.r2.dev/subtify_logo-scaled.webp' width={new_width}px height={new_height}px >
-</div>
-"""
-html_buy_me_a_coffe = '''
-<div style="float: right;">
-    <a href="https://www.buymeacoffee.com/maximofn" target="_blank">
-        <img src="https://img.shields.io/badge/Buy_Me_A_Coffee-support_my_work-FFDD00?style=for-the-badge&logo=buy-me-a-coffee&logoColor=white&labelColor=101010" alt="buy me a coffe">
-    </a>
-</div>
-'''
 language_dict = union_language_dict()
-# def subtify_no_ui():
-#     number_works = 6
-#     progress_bar = tqdm(total=number_works, desc="Subtify")
-#     folder_chunck = "chunks"
-#     folder_concatenated = "concatenated_transcriptions"
-#     folder_translated_transcriptions = "translated_transcriptions"
-#     if not os.path.exists(folder_chunck):
-#         os.makedirs(folder_chunck)
-#     if not os.path.exists(folder_concatenated):
-#         os.makedirs(folder_concatenated)
-#     if not os.path.exists(folder_translated_transcriptions):
-#         os.makedirs(folder_translated_transcriptions)
-#     ################## Download video and audio ##################
-#     if DOWNLOAD:
-#         print('*'*NUMBER)
-#         # url = "https://www.twitch.tv/videos/1936119752"             # twitch Rob Mula 2 horas
-#         # url = "https://www.youtube.com/watch?v=yX5EJf4R77s"         # ✅ debate, varios hablantes, 3 minutos
-#         # url = "https://www.youtube.com/watch?v=cgx0QnXo1OU"         # ✅ smart home, un solo hablante, 4:42 minutos
-#         # url = "https://www.youtube.com/watch?v=dgOBxhi19T8"         # ✅ rob mula, muchos hablantes, 4:28 minutos
-#         # url = "https://www.youtube.com/watch?v=Coj72EzmX20"         # rob mula, un solo hablante, 16 minutos
-#         # url = "https://www.youtube.com/watch?v=Tqth0fKo0_g"           # Conversación short
-#         url = "https://www.youtube.com/watch?v=h9xPrgTYP_0"         # Letitia 40 segundos
-#         print(f"Downloading video and audio from {url}")
-#         python_file = "download.py"
-#         command = f"python {python_file} {url}"
-#         os.system(command)
-#         sleep(1)
-#         print('*'*NUMBER)
-#         print("\n\n")
-#     progress_bar.update(1)
-#     ################## Slice audio ##################
-#     if SLICE_AUDIO:
-#         print('*'*NUMBER)
-#         print("Slicing audio")
-#         python_file = "slice_audio.py"
-#         audio = "audios/download_audio.mp3"
-#         command = f"python {python_file} {audio} {SECONDS}"
-#         os.system(command)
-#         print('*'*NUMBER)
-#         print("\n\n")
-#     progress_bar.update(1)
-#     ################# Transcript slices ##################
-#     if TRANSCRIBE_AUDIO:
-#         print('*'*NUMBER)
-#         print("Transcript slices")
-#         chunks_folder = "chunks"
-#         if not os.path.exists(chunks_folder):
-#             os.makedirs(chunks_folder)
-#         python_file = "transcribe.py"
-#         chunks_file = "chunks/output_files.txt"
-#         number_of_speakers = 10
-#         source_languaje = "English"
-#         command = f"python {python_file} {chunks_file} {source_languaje} {number_of_speakers} {DEVICE}"
-#         os.system(command)
-#         if REMOVE_FILES:
-#             with open(chunks_file, 'r') as f:
-#                 files = f.read().splitlines()
-#             for file in files:
-#                 audios_extension = "mp3"
-#                 file_name, _ = file.split(".")
-#                 _, file_name = file_name.split("/")
-#                 vocal = f'{chunks_folder}/{file_name}.{audios_extension}'
-#                 command = f"rm {vocal}"
-#                 os.system(command)
-#         print('*'*NUMBER)
-#         print("\n\n")
-#     progress_bar.update(1)
-#     ################## Concatenate transcriptions ##################
-#     if CONCATENATE_TRANSCRIPTIONS:
-#         print('*'*NUMBER)
-#         print("Concatenate transcriptions")
-#         folder_concatenated = "concatenated_transcriptions"
-#         if not os.path.exists(folder_concatenated):
-#             os.makedirs(folder_concatenated)
-#         chunck_file = "chunks/output_files.txt"
-#         python_file = "concat_transcriptions.py"
-#         command = f"python {python_file} {chunck_file} {SECONDS}"
-#         os.system(command)
-#         if REMOVE_FILES:
-#             with open(chunck_file, 'r') as f:
-#                 files = f.read().splitlines()
-#             for file in files:
-#                 file_name, _ = file.split(".")
-#                 _, file_name = file_name.split("/")
-#                 transcriptions_folder = "transcriptions"
-#                 transcription_extension = "srt"
-#                 command = f"rm {transcriptions_folder}/{file_name}.{transcription_extension}"
-#                 os.system(command)
-#         print('*'*NUMBER)
-#         print("\n\n")
-#     progress_bar.update(1)
-#     ################## Translate transcription ##################
-#     target_languaje = "Español"
-#     if TRANSLATE_TRANSCRIPTIONS:
-#         print('*'*NUMBER)
-#         print("Translate transcription")
-#         transcription_file = "concatenated_transcriptions/download_audio.srt"
-#         source_languaje = "English"
-#         python_file = "translate_transcriptions.py"
-#         command = f"python {python_file} {transcription_file} --source_languaje {source_languaje} --target_languaje {target_languaje} --device {DEVICE}"
-#         os.system(command)
-#         if REMOVE_FILES:
-#             if os.path.exists(transcription_file):
-#                 command = f"rm {transcription_file}"
-#                 os.system(command)
-#         print('*'*NUMBER)
-#         print("\n\n")
-#     progress_bar.update(1)
-#     ################## Add subtitles to video ##################
-#     if ADD_SUBTITLES_TO_VIDEO:
-#         print('*'*NUMBER)
-#         print("Add subtitles to video")
-#         python_file = "add_subtitles_to_video.py"
-#         transcription_file = f"translated_transcriptions/download_audio_{target_languaje}.srt"
-#         input_video_file = "videos/download_video.mp4"
-#         input_audio_file = "audios/download_audio.mp3"
-#         command = f"python {python_file} {transcription_file} {input_video_file} {input_audio_file}"
-#         os.system(command)
-#         if REMOVE_FILES:
-#             if os.path.exists(input_video_file):
-#                 command = f"rm {input_video_file}"
-#                 os.system(command)
-#             if os.path.exists(input_audio_file):
-#                 command = f"rm {input_audio_file}"
-#                 os.system(command)
-#             if os.path.exists(transcription_file):
-#                 command = f"rm {transcription_file}"
-#                 os.system(command)
-#             if os.path.exists("chunks/output_files.txt"):
-#                 command = f"rm chunks/output_files.txt"
-#                 os.system(command)
-#             if os.path.exists("chunks"):
-#                 command = f"rm -r chunks"
-#                 os.system(command)
-#             if os.path.exists("vocals/speakers.txt"):
-#                 command = f"rm vocals/speakers.txt"
-#                 os.system(command)
-#         print('*'*NUMBER)
-#         print("\n\n")
-#     progress_bar.update(1)
 def remove_all_files():
     if os.path.exists("audios"):
         command = f"rm -r audios"
@@ -359,13 +89,10 @@ def remove_all_files():
         command = f"rm -r vocals"
         os.system(command)
-# def paste_url_from_clipboard():
-#     return pyperclip.paste()
 def reset_frontend():
     visible = False
     return (
-        "",
         gr.Image(visible=visible),
         gr.Dropdown(visible=visible),
         gr.Dropdown(visible=visible),
@@ -381,142 +108,47 @@ def reset_frontend():
         gr.Textbox(visible=visible),
         gr.Textbox(visible=visible),
         gr.Textbox(visible=visible),
-        gr.Textbox(visible=visible),
-        gr.Textbox(visible=visible),
         gr.Video(visible=visible),
     )
 def show_auxiliar_block1():
     return gr.Textbox(value="URL checked", visible=False)
-def get_youtube_thumbnail(url):
-    yt = YouTube(url)
-    thumbnail_url = yt.thumbnail_url
-    return thumbnail_url
-def is_valid_youtube_url(url):
-    # This regular expression should match the following YouTube URL formats:
-    # - https://youtube.com/watch?v=video_id
-    # - https://www.youtube.com/watch?v=video_id
-    # - https://youtu.be/video_id
-    patron_youtube = r'(https?://)?(www\.)?(youtube\.com/watch\?v=|youtu\.be/)[\w-]+'
-    return bool(re.match(patron_youtube, url))
-def is_valid_twitch_url(url):
-    # This regular expression should match the following Twitch URL formats:
-    # - https://twitch.tv/channel_name
-    # - https://www.twitch.tv/channel_name
-    # - https://twitch.tv/videos/video_id
-    twitch_pattern = r'(https?://)?(www\.)?twitch\.tv/(videos/\d+|\w+)'
-    return bool(re.match(twitch_pattern, url))
-def is_valid_url(url):
-    num_speaker = [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
-    source_languaje = gr.Dropdown(visible=True, label="Source languaje", show_label=True, value="English", choices=language_dict, scale=1, interactive=True)
-    target_languaje = gr.Dropdown(visible=True, label="Target languaje", show_label=True, value="Español", choices=language_dict, scale=1, interactive=True)
-    advanced_setings = gr.Accordion(visible=True)
-    number_of_speakers = gr.Dropdown(visible=True, label="Number of speakers", show_label=True, value=10, choices=num_speaker, scale=1, interactive=True)
-    subtify_button = gr.Button(size="lg", value="subtify", min_width="10px", scale=0, visible=True)
-    # Youtube
-    if "youtube" in url.lower() or "youtu.be" in url.lower():
-        if is_valid_youtube_url(url):
-            thumbnail = get_youtube_thumbnail(url)
-            if thumbnail:
-                return (
-                    gr.Image(value=thumbnail, visible=True, show_download_button=False, container=False),
-                    source_languaje,
-                    target_languaje,
-                    advanced_setings,
-                    number_of_speakers,
-                    subtify_button,
-                )
-            else:
-                return (
-                    gr.Image(value="assets/youtube-no-thumbnails.webp", visible=True, show_download_button=False, container=False),
-                    source_languaje,
-                    target_languaje,
-                    advanced_setings,
-                    number_of_speakers,
-                    subtify_button,
-                )
-    # Twitch
-    elif "twitch" in url.lower() or "twitch.tv" in url.lower():
-        if is_valid_twitch_url(url):
-            return (
-                gr.Image(value="assets/twitch.webp", visible=True, show_download_button=False, container=False),
-                source_languaje,
-                target_languaje,
-                advanced_setings,
-                number_of_speakers,
-                subtify_button,
-            )
-    # Error
-    visible = False
-    image = gr.Image(value="assets/youtube_error.webp", visible=visible, show_download_button=False, container=False)
-    source_languaje = gr.Dropdown(visible=visible, label="Source languaje", show_label=True, value="English", choices=language_dict, scale=1, interactive=True)
-    target_languaje = gr.Dropdown(visible=visible, label="Target languaje", show_label=True, value="Español", choices=language_dict, scale=1, interactive=True)
-    advanced_setings = gr.Accordion(visible=visible)
-    number_of_speakers = gr.Dropdown(visible=visible, label="Number of speakers", show_label=True, value=10, choices=num_speaker, scale=1, interactive=True)
-    subtify_button = gr.Button(size="lg", value="subtify", min_width="10px", scale=0, visible=visible)
-    return (
-        image,
-        source_languaje,
-        target_languaje,
-        advanced_setings,
-        number_of_speakers,
-        subtify_button,
-    )
 def change_visibility_texboxes():
     return (
-        gr.Textbox(value="Done"),
-        gr.Textbox(visible=True),
-        gr.Textbox(visible=True),
-        gr.Textbox(visible=True),
-        gr.Textbox(visible=True),
-        gr.Textbox(visible=True),
-        gr.Textbox(visible=True),
-        gr.Textbox(visible=False),
     )
-def get_audio_and_video_from_video(url):
     print('*'*NUMBER)
-    print(f"Downloading video and audio from {url}")
-    audios_folder = "audios"
-    videos_folder = "videos"
-    if not os.path.exists(audios_folder):
-        os.makedirs(audios_folder)
-    if not os.path.exists(videos_folder):
-        os.makedirs(videos_folder)
-    python_file = "download.py"
-    command = f"python {python_file} {url}"
-    os.system(command)
-    sleep(1)
-    audio = "audios/download_audio.mp3"
-    video = "videos/download_video.mp4"
-    if not os.path.exists(audio):
-        raise Exception("Error downloading audio")
-    if not os.path.exists(video):
-        raise Exception("Error downloading video")
-    return (
-        gr.Textbox(value="Ok"),
-        gr.Textbox(value=audio),
-        gr.Textbox(value=video),
-    )
-def slice_audio(audio_path):
     print('*'*NUMBER)
-    print("Slicing audio")
     folder_vocals = "vocals"
     folder_chunck = "chunks"
     if not os.path.exists(folder_vocals):
@@ -524,34 +156,22 @@ def slice_audio(audio_path):
     if not os.path.exists(folder_chunck):
         os.makedirs(folder_chunck)
-    python_file = "slice_audio.py"
-    command = f"python {python_file} {audio_path} {SECONDS}"
-    os.system(command)
     return (
-        gr.Textbox(value="Ok")
     )
-def trascribe_audio(source_languaje, number_of_speakers):
     print('*'*NUMBER)
-    print("Transcript slices")
-    folder_chunks = "chunks"
-    python_file = "transcribe.py"
-    chunks_file = "chunks/output_files.txt"
-    command = f"python {python_file} {chunks_file} {source_languaje} {number_of_speakers} {DEVICE}"
-    os.system(command)
-    with open(chunks_file, 'r') as f:
-        files = f.read().splitlines()
-    for file in files:
-        audios_extension = "mp3"
-        file_name, _ = file.split(".")
-        _, file_name = file_name.split("/")
-        vocal = f'{folder_chunks}/{file_name}.{audios_extension}'
-        command = f"rm {vocal}"
-        os.system(command)
     return (
         gr.Textbox(value="Ok")
     )
@@ -566,7 +186,7 @@ def concatenate_transcriptions():
     chunck_file = "chunks/output_files.txt"
     python_file = "concat_transcriptions.py"
-    command = f"python {python_file} {chunck_file} {SECONDS}"
     os.system(command)
     with open(chunck_file, 'r') as f:
@@ -651,6 +271,23 @@ def hide_textbobes_progress_info():
         gr.Textbox(value="Waiting", visible=visible),
     )
 @spaces.GPU
 def subtify():
     with gr.Blocks(
@@ -700,54 +337,59 @@ def subtify():
         gr.HTML(html_social_media)
         gr.HTML("<h1 style='text-align: center;'>Subtify</h1>")
         gr.HTML(html_subtify_logo)
-        with gr.Row(variant="panel"):
-            url_textbox = gr.Textbox(placeholder="Add video URL here and wait a moment", label="Video URL", elem_id="video_url", scale=1, interactive=True)
-            # paste_button   = gr.Button(size="sm", icon="icons/paste.svg",   value="paste", min_width="10px", scale=0)
-            delete_button = gr.Button(size="sm", icon="icons/delete.svg", value="clear", min_width="10px", scale=0)
         visible = False
-        auxiliar_block1 = gr.Textbox(label="Auxiliar block 1", elem_id="auxiliar_block1", interactive=False, visible=visible)
-        with gr.Row(equal_height=False):
-            image = gr.Image(visible=visible, scale=1)
-            with gr.Column():
                 with gr.Row():
-                    source_languaje = gr.Dropdown(visible=visible, label="Source languaje", show_label=True, value="English", choices=language_dict, scale=1, interactive=True, info="Language of the video")
-                    target_languaje = gr.Dropdown(visible=visible, label="Target languaje", show_label=True, value="Español", choices=language_dict, scale=1, interactive=True, info="Language to translate the subtitles")
-                with gr.Accordion("Advanced settings", open=False, visible=visible) as Advanced_setings:
-                    number_of_speakers = gr.Dropdown(visible=visible, label="Number of speakers", show_label=True, value=10, choices=num_speaker, scale=1, interactive=True, info="Number of speakers in the video, if you don't know, select 10")
-                subtify_button = gr.Button(size="lg", value="subtify", min_width="10px", scale=0, visible=visible)
-        auxiliar_block2 = gr.Textbox(placeholder="Waiting", label="Auxiliar block 2", elem_id="auxiliar_block2", interactive=False, visible=visible)
         with gr.Row():
-            video_donwloaded_progress_info = gr.Textbox(placeholder="Waiting", label="Video download progress info", elem_id="video_donwloaded_progress_info", interactive=False, visible=visible)
-            video_sliced_progress_info = gr.Textbox(placeholder="Waiting", label="Video slice progress info", elem_id="video_sliced_progress_info", interactive=False, visible=visible)
             video_transcribed_progress_info = gr.Textbox(placeholder="Waiting", label="Transcribe progress info", elem_id="video_transcribed_progress_info", interactive=False, visible=visible)
             transcriptions_concatenated_progress_info = gr.Textbox(placeholder="Waiting", label="Concatenate progress info", elem_id="transcriptions_concatenated_progress_info", interactive=False, visible=visible)
             video_translated_progress_info = gr.Textbox(placeholder="Waiting", label="Translate progress info", elem_id="transcription_translated_progress_info", interactive=False, visible=visible)
             video_subtitled_progress_info = gr.Textbox(placeholder="Waiting", label="Video subtitle progress info", elem_id="video_subtitled_progress_info", interactive=False, visible=visible)
         original_audio_path = gr.Textbox(label="Original audio path", elem_id="original_audio_path", visible=visible)
-        original_video_path = gr.Textbox(label="Original video path", elem_id="original_video_path", visible=visible)
         original_audio_transcribed_path = gr.Textbox(label="Original audio transcribed", elem_id="original_audio_transcribed", visible=visible)
         original_audio_translated_path = gr.Textbox(label="Original audio translated", elem_id="original_audio_translated", visible=visible)
         subtitled_video = gr.Video(label="Subtitled video", elem_id="subtitled_video", visible=visible, interactive=visible)
         auxiliar_block3 = gr.Textbox(placeholder="Waiting", label="Auxiliar block 3", elem_id="auxiliar_block3", interactive=False, visible=visible)
         # Events
         # paste_button.click(fn=paste_url_from_clipboard, outputs=url_textbox)
         delete_button.click(
             fn=reset_frontend,
             outputs=[
-                url_textbox,
-                image,
                 source_languaje,
                 target_languaje,
                 Advanced_setings,
                 number_of_speakers,
                 subtify_button,
-                auxiliar_block2,
-                video_donwloaded_progress_info,
-                video_sliced_progress_info,
                 video_transcribed_progress_info,
                 transcriptions_concatenated_progress_info,
                 video_translated_progress_info,
@@ -755,54 +397,43 @@ def subtify():
                 subtitled_video,
             ]
         )
-        url_textbox.change(
-            fn=show_auxiliar_block1,
-            outputs=[auxiliar_block1]
-        )
-        auxiliar_block1.change(
-            fn=is_valid_url,
-            inputs=url_textbox,
-            outputs=[image, source_languaje, target_languaje, Advanced_setings, number_of_speakers, subtify_button]
         )
         subtify_button.click(
             fn=change_visibility_texboxes,
-            outputs=[auxiliar_block2, video_donwloaded_progress_info, video_sliced_progress_info, video_transcribed_progress_info, transcriptions_concatenated_progress_info, video_translated_progress_info, video_subtitled_progress_info, auxiliar_block1]
-        )
-        auxiliar_block2.change(
-            fn=get_audio_and_video_from_video,
-            inputs=[url_textbox],
-            outputs=[video_donwloaded_progress_info, original_audio_path, original_video_path]
         )
-        video_donwloaded_progress_info.change(
-            fn=slice_audio,
-            inputs=[original_audio_path],
-            outputs=[video_sliced_progress_info]
         )
-        video_sliced_progress_info.change(
             fn=trascribe_audio,
-            inputs=[source_languaje, number_of_speakers],
             outputs=[video_transcribed_progress_info]
         )
-        video_transcribed_progress_info.change(
-            fn=concatenate_transcriptions,
-            outputs=[transcriptions_concatenated_progress_info, original_audio_transcribed_path]
-        )
-        transcriptions_concatenated_progress_info.change(
-            fn=translate_transcription,
-            inputs=[original_audio_transcribed_path, source_languaje, target_languaje],
-            outputs=[video_translated_progress_info, original_audio_translated_path]
-        )
-        video_translated_progress_info.change(
-            fn=add_translated_subtitles_to_video,
-            inputs=[original_video_path, original_audio_path, original_audio_translated_path],
-            outputs=[subtitled_video, video_subtitled_progress_info, auxiliar_block3]
-        )
-        auxiliar_block3.change(
-            fn=hide_textbobes_progress_info,
-            outputs=[video_donwloaded_progress_info, video_sliced_progress_info, video_transcribed_progress_info, transcriptions_concatenated_progress_info, video_translated_progress_info, video_subtitled_progress_info]
-        )
-        gr.HTML(html_buy_me_a_coffe)
     demo.launch()

 import spaces
 import os
 import torch
+import shutil
 from time import sleep
 from tqdm import tqdm
 from lang_list import union_language_dict
 # import pyperclip
 import re
 from PIL import Image
 # import urllib.request
+from ui_config import (
+    BACKGROUND_COLOR, BUTTON_COLOR, SVG_COLOR, PANEL_COLOR,
+    PRIMARY_TEXT_COLOR, SUBDUED_TEXT_COLOR, BACKGROUND_PRIMARY_COLOR,
+    BACKGROUND_SECONDARY_COLOR, PRIMARY_BODER_COLOR, BLOCK_TITLE_TEXT_COLOR,
+    INPUT_BACKGROUND_COLOR, INPUT_BORDER_COLOR, INPUT_PLACEHOLDER_COLOR,
+    ERROR_BACKGROUND_COLOR, ERROR_TEXT_COLOR, ERROR_BORDER_COLOR,
+    BUTTON_SECONDARY_BACKGROUND_COLOR, BUTTON_SECONDARY_BORDER_COLOR,
+    BUTTON_SECONDARY_TEXT_COLOR, RED, GREEN, BLUE,
+    html_social_media, get_html_subtify_logo, html_buy_me_a_coffe
+)
+# from url_manager import get_youtube_thumbnail, is_valid_youtube_url, is_valid_twitch_url, is_valid_url
+from slice_audio import slice_audio as slice_audio_main
+from audio import get_audio_from_video
+from transcribe import transcribe, get_language_dict
 NUMBER = 100
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 if DEVICE == "cpu":
     # I supose that I am on huggingface server
     # Get RAM space
+    # ram = int(os.popen("free -m | grep Mem | awk '{print $2}'").read())
+    ram = 16000
     factor = 1
+    CHUNK_SECONDS = int(ram*factor)
+    CHUNK_SECONDS = 30
+    CHUNK_OVERLAP_SECONDS = 5
+    print(f"RAM: {ram}, CHUNK_SECONDS: {CHUNK_SECONDS}, CHUNK_OVERLAP_SECONDS: {CHUNK_OVERLAP_SECONDS}")
 else:
     # I supose that I am on my computer
     # Get VRAM space
+    CHUNK_SECONDS = 30
+    CHUNK_OVERLAP_SECONDS = 5
 YOUTUBE = "youtube"
 TWITCH = "twitch"
 ERROR = "error"
+subtify_logo = Image.open("assets/subtify_logo-scaled.png")
 subtify_logo_width, subtify_logo_height = subtify_logo.size
 factor = 4
 new_width = subtify_logo_width // factor
 new_height = subtify_logo_height // factor
+html_subtify_logo = get_html_subtify_logo(new_width, new_height)
 language_dict = union_language_dict()
 def remove_all_files():
     if os.path.exists("audios"):
         command = f"rm -r audios"
         command = f"rm -r vocals"
         os.system(command)
 def reset_frontend():
     visible = False
     return (
+        None,
         gr.Image(visible=visible),
         gr.Dropdown(visible=visible),
         gr.Dropdown(visible=visible),
         gr.Textbox(visible=visible),
         gr.Textbox(visible=visible),
         gr.Textbox(visible=visible),
         gr.Video(visible=visible),
     )
 def show_auxiliar_block1():
+    """Return a hidden Textbox whose value is set to "URL checked"."""
     return gr.Textbox(value="URL checked", visible=False)
 def change_visibility_texboxes():
+    """Mark the pipeline as started and reveal the progress textboxes.
+
+    Returns one update per output of the `subtify_button.click` handler,
+    in the same order: `auxiliar_block1` receives the value "Done" and the
+    five progress textboxes are made visible.
+    """
+    # The entry for video_sliced_progress_info was dropped: the click handler
+    # lists six outputs, so a seventh return value had no component to go to.
     return (
+        gr.update(value="Done"), # auxiliar_block1
+        gr.update(visible=True), # get_audio_from_video_info
+        gr.update(visible=True), # video_transcribed_progress_info
+        gr.update(visible=True), # transcriptions_concatenated_progress_info
+        gr.update(visible=True), # video_translated_progress_info
+        gr.update(visible=True), # video_subtitled_progress_info
     )
+def get_audio(video_path):
+    """Extract the audio track of `video_path` into the "audios" folder.
+
+    Returns two Gradio updates: the status text ("Ok" or "Error") and the
+    path of the extracted audio (an empty string when extraction failed).
+    """
     print('*'*NUMBER)
+    print(f"Getting audio from video {video_path}")
+    status = "Ok"
+    extracted_audio_path = ""
+    try:
+        extracted_audio_path = get_audio_from_video(video_path, "audios")
+    except Exception as e:
+        # Any failure is reported to the UI instead of propagating
+        print(f"Error: {str(e)}")
+        status = "Error"
+    return [
+        gr.update(value=status),  # get_audio_from_video_info
+        gr.update(value=extracted_audio_path)  # original_audio_path
+    ]
+def slice_audio(input_audio_path):
     print('*'*NUMBER)
+    print(f"Slicing audio {input_audio_path} in chunks of {CHUNK_SECONDS} seconds with {CHUNK_OVERLAP_SECONDS} seconds overlap")
+    # Create vocals and chunks folders
+    print("Creating vocals and chunks folders")
     folder_vocals = "vocals"
     folder_chunck = "chunks"
     if not os.path.exists(folder_vocals):
     if not os.path.exists(folder_chunck):
         os.makedirs(folder_chunck)
+    slice_audio_main(input_audio_path, folder_chunck, CHUNK_SECONDS, CHUNK_OVERLAP_SECONDS)
     return (
+        gr.update(value="Ok"),  # video_sliced_progress_info
     )
+def trascribe_audio(input_audio_path, source_languaje):
+    """Transcribe the audio at `input_audio_path` and report "Ok" to the UI.
+
+    `source_languaje` is used as a key of the dictionary returned by
+    get_language_dict(); a name missing from that dictionary raises KeyError.
+    """
     print('*'*NUMBER)
+    print(f"Transcript {input_audio_path}")
+    # Get language dict (this local name shadows the module-level `language_dict`)
+    language_dict = get_language_dict()
+    # Transcribe audio file
+    transcribe(input_audio_path, language_dict[source_languaje]["transcriber"], DEVICE, CHUNK_SECONDS, CHUNK_OVERLAP_SECONDS)
+    # The parentheses below only group the expression: a single Textbox is returned, not a tuple
     return (
         gr.Textbox(value="Ok")
     )
     chunck_file = "chunks/output_files.txt"
     python_file = "concat_transcriptions.py"
+    command = f"python {python_file} {chunck_file} {CHUNK_SECONDS} {CHUNK_OVERLAP_SECONDS}"
     os.system(command)
     with open(chunck_file, 'r') as f:
         gr.Textbox(value="Waiting", visible=visible),
     )
+def process_uploaded_video(video_path):
+    """Copy the uploaded video into the "videos" folder and reveal the config block.
+
+    Args:
+        video_path: Path of the uploaded file as provided by the video input,
+            or None/empty when the input has been cleared.
+
+    Returns:
+        list: Three Gradio updates, in order: the video input (new label),
+        the config block (visibility) and the original-video-path textbox.
+    """
+    # The input is also set to None by reset_frontend; with no file there is
+    # nothing to copy, so restore the initial state instead of failing in
+    # shutil.copy.
+    if not video_path:
+        return [
+            gr.update(label="Upload video"),  # video_input
+            gr.update(visible=False),  # config_block
+            gr.update(value="")  # original_video_path
+        ]
+    # Create videos folder
+    videos_folder = "videos"
+    os.makedirs(videos_folder, exist_ok=True)
+    # Copy uploaded video to videos folder under a fixed name
+    new_video_path = os.path.join(videos_folder, "download_video.mp4")
+    shutil.copy(video_path, new_video_path)
+    # Relabel the input, show the config block and publish the new video path
+    return [
+        gr.update(label="Video uploaded"),  # video_input
+        gr.update(visible=True),  # config_block
+        gr.update(value=new_video_path)  # original_video_path
+    ]
 @spaces.GPU
 def subtify():
     with gr.Blocks(
         gr.HTML(html_social_media)
         gr.HTML("<h1 style='text-align: center;'>Subtify</h1>")
         gr.HTML(html_subtify_logo)
+        # Input block, where the user can upload a video and configure the subtify process
         visible = False
+        input_block = gr.Row(variant="panel")
+        with input_block:
+            input_video_block = gr.Row(scale=2)
+            with input_video_block:
+                video_input = gr.Video(
+                    label="Upload video",
+                    sources=["upload"],
+                    scale=1,
+                    interactive=True
+                )
+                delete_button = gr.Button(size="sm", icon="icons/delete.svg", value="clear", min_width="10px", scale=0)
+            config_block = gr.Column(scale=1, visible=visible)
+            with config_block:
                 with gr.Row():
+                    source_languaje = gr.Dropdown(visible=True, label="Source languaje", show_label=True, value="English", choices=language_dict, scale=1, interactive=True, info="Language of the video")
+                    target_languaje = gr.Dropdown(visible=True, label="Target languaje", show_label=True, value="Español", choices=language_dict, scale=1, interactive=True, info="Language to translate the subtitles")
+                with gr.Accordion("Advanced settings", open=False, visible=True) as Advanced_setings:
+                    number_of_speakers = gr.Dropdown(visible=True, label="Number of speakers", show_label=True, value=10, choices=num_speaker, scale=1, interactive=True, info="Number of speakers in the video, if you don't know, select 10")
+                subtify_button = gr.Button(size="lg", value="subtify", min_width="10px", scale=0, visible=True)
+        auxiliar_block1 = gr.Textbox(placeholder="", interactive=False, visible=visible)
         with gr.Row():
+            get_audio_from_video_info = gr.Textbox(placeholder="Waiting", label="Get audio from video info", elem_id="get_audio_from_video_info", interactive=False, visible=visible)
             video_transcribed_progress_info = gr.Textbox(placeholder="Waiting", label="Transcribe progress info", elem_id="video_transcribed_progress_info", interactive=False, visible=visible)
             transcriptions_concatenated_progress_info = gr.Textbox(placeholder="Waiting", label="Concatenate progress info", elem_id="transcriptions_concatenated_progress_info", interactive=False, visible=visible)
             video_translated_progress_info = gr.Textbox(placeholder="Waiting", label="Translate progress info", elem_id="transcription_translated_progress_info", interactive=False, visible=visible)
             video_subtitled_progress_info = gr.Textbox(placeholder="Waiting", label="Video subtitle progress info", elem_id="video_subtitled_progress_info", interactive=False, visible=visible)
         original_audio_path = gr.Textbox(label="Original audio path", elem_id="original_audio_path", visible=visible)
+        original_video_path = gr.Textbox(label="Original video path", visible=visible)
         original_audio_transcribed_path = gr.Textbox(label="Original audio transcribed", elem_id="original_audio_transcribed", visible=visible)
         original_audio_translated_path = gr.Textbox(label="Original audio translated", elem_id="original_audio_translated", visible=visible)
         subtitled_video = gr.Video(label="Subtitled video", elem_id="subtitled_video", visible=visible, interactive=visible)
         auxiliar_block3 = gr.Textbox(placeholder="Waiting", label="Auxiliar block 3", elem_id="auxiliar_block3", interactive=False, visible=visible)
+        gr.HTML(html_buy_me_a_coffe)
         # Events
         # paste_button.click(fn=paste_url_from_clipboard, outputs=url_textbox)
         delete_button.click(
             fn=reset_frontend,
             outputs=[
+                video_input,
                 source_languaje,
                 target_languaje,
                 Advanced_setings,
                 number_of_speakers,
                 subtify_button,
+                auxiliar_block1,
                 video_transcribed_progress_info,
                 transcriptions_concatenated_progress_info,
                 video_translated_progress_info,
                 subtitled_video,
             ]
         )
+        video_input.change(
+            fn=process_uploaded_video,
+            inputs=[video_input],
+            outputs=[video_input, config_block, original_video_path]
         )
         subtify_button.click(
             fn=change_visibility_texboxes,
+            outputs=[auxiliar_block1, get_audio_from_video_info, video_transcribed_progress_info, transcriptions_concatenated_progress_info, video_translated_progress_info, video_subtitled_progress_info]
         )
+        auxiliar_block1.change(
+            fn=get_audio,
+            inputs=[original_video_path],
+            outputs=[get_audio_from_video_info, original_audio_path]
         )
+        get_audio_from_video_info.change(
             fn=trascribe_audio,
+            inputs=[original_audio_path, source_languaje],
             outputs=[video_transcribed_progress_info]
         )
+        # video_transcribed_progress_info.change(
+        #     fn=concatenate_transcriptions,
+        #     outputs=[transcriptions_concatenated_progress_info, original_audio_transcribed_path]
+        # )
+        # transcriptions_concatenated_progress_info.change(
+        #     fn=translate_transcription,
+        #     inputs=[original_audio_transcribed_path, source_languaje, target_languaje],
+        #     outputs=[video_translated_progress_info, original_audio_translated_path]
+        # )
+        # video_translated_progress_info.change(
+        #     fn=add_translated_subtitles_to_video,
+        #     inputs=[original_video_path, original_audio_path, original_audio_translated_path],
+        #     outputs=[subtitled_video, video_subtitled_progress_info, auxiliar_block3]
+        # )
+        # auxiliar_block3.change(
+        #     fn=hide_textbobes_progress_info,
+        #     outputs=[video_sliced_progress_info, video_transcribed_progress_info, transcriptions_concatenated_progress_info, video_translated_progress_info, video_subtitled_progress_info]
+        # )
     demo.launch()

audio.py ADDED Viewed

	@@ -0,0 +1,46 @@

+import os
+import ffmpeg
+DEBUG=True
+def get_audio_from_video(video_path: str, output_folder: str) -> str:
+    """
+    Extract audio from video and save it as mp3.
+    Args:
+        video_path (str): Path to the video file
+        output_folder (str): Path to folder where audio will be saved
+    Returns:
+        str: Path to the saved audio file
+    Raises:
+        FileNotFoundError: If video file doesn't exist
+        Exception: If there's an error extracting the audio
+    """
+    # Validate video exists
+    if not os.path.exists(video_path):
+        raise FileNotFoundError(f"Video file not found: {video_path}")
+    # Create output folder if it doesn't exist
+    os.makedirs(output_folder, exist_ok=True)
+    # Generate output path (the file name is fixed, whatever the input video)
+    audio_filename = "download_audio.mp3"
+    audio_path = os.path.join(output_folder, audio_filename)
+    # In DEBUG mode reuse a previous extraction, but only when it is at least
+    # as recent as the video: the output name never changes, so an older file
+    # may belong to a different video.
+    if (
+        DEBUG
+        and os.path.exists(audio_path)
+        and os.path.getmtime(audio_path) >= os.path.getmtime(video_path)
+    ):
+        return audio_path
+    try:
+        # Extract audio using ffmpeg
+        stream = ffmpeg.input(video_path)
+        stream = ffmpeg.output(stream, audio_path, acodec='libmp3lame')
+        ffmpeg.run(stream, overwrite_output=True)
+    except Exception as e:
+        # Keep the original error attached as the cause
+        raise Exception(f"Error extracting audio from video: {str(e)}") from e
+    return audio_path

requirements.txt CHANGED Viewed

@@ -1,20 +1,31 @@
 # gradio
 gradio
-# Get environment
-transformers
 # Download youtube and twitch videos
-pytube
-yt-dlp
-twitch-dl
 # Trascribe audios
-git+https://github.com/m-bain/whisperx.git
 pyannote.audio
-# Translate
-protobuf
-# Add subtitles to videos
-opencv-python

 # gradio
 gradio
+# spaces
+spaces
+# pytorch
+torch
+torchvision
+torchaudio
+# Transformers
+transformers
+accelerate
+# ffmpeg
+ffmpeg-python
 # Download youtube and twitch videos
+# pytube
+# yt-dlp
+# twitch-dl
 # Trascribe audios
+# git+https://github.com/m-bain/whisperx.git
 pyannote.audio
+# # Translate
+# protobuf
+# # Add subtitles to videos
+# opencv-python

slice_audio.py CHANGED Viewed

@@ -4,6 +4,7 @@ from tqdm import tqdm
 START = 00
 FOLDER = "chunks"
 def seconds_to_hms(seconds):
     hour = 00
@@ -22,53 +23,72 @@ def seconds_to_hms(seconds):
 def hms_to_seconds(hour, minute, second):
     return hour*3600 + minute*60 + second
-def main(args):
-    input = args.input
-    # name, extension = input.split(".")
-    path, filename = os.path.split(input)
     name, extension = os.path.splitext(filename)
-    seconds = int(args.seconds)
     # Get audio duration in seconds
-    duration = float(os.popen(f'ffprobe -v error -show_entries format=duration -of default=noprint_wrappers=1:nokey=1 {input}').read())
     hour, minute, second = seconds_to_hms(int(duration))
-    # Number of chunks
-    num_chunks = -(-int(duration) // seconds)  # Redondeo hacia arriba
-    # Slice audio into seconds chunks
-    hour, minute, second = seconds_to_hms(seconds) # Duration of each chunk
     output_files = []
     progress_bar = tqdm(total=num_chunks, desc="Slice audio into chunks progress")
     for chunk in range(num_chunks):
-        start_time = chunk * seconds
-        hour_start, minute_start, second_start = seconds_to_hms(start_time) # Start time of each chunk
-        if start_time + seconds > duration:
-            hour, minute, second = seconds_to_hms(duration - start_time)
-        else:
-            hour, minute, second = seconds_to_hms(seconds)
-        output = f"{FOLDER}/{name}_chunk{chunk:003d}{extension}"
-        if start_time + seconds > duration:
-            command = f'ffmpeg -i {input} -ss {hour_start:02d}:{minute_start:02d}:{second_start:02d} -loglevel error {output}'
         else:
-            command = f'ffmpeg -i {input} -ss {hour_start:02d}:{minute_start:02d}:{second_start:02d} -t {hour:02}:{minute:02}:{second:02} -loglevel error {output}'
         os.system(command)
         output_files.append(output)
         progress_bar.update(1)
-    # write output files to a txt file
-    with open(f"{FOLDER}/output_files.txt", "w") as f:
         for output_file in output_files:
             f.write(f"{output_file}\n")
-if __name__ == "__main__":
-    argparser = argparse.ArgumentParser(description='Slice audio into smaller chunks')
-    argparser.add_argument('input', help='Input audio file')
-    argparser.add_argument('seconds', help='Duration of each chunk in seconds')
-    args = argparser.parse_args()
-    main(args)

 START = 00
 FOLDER = "chunks"
+DEBUG = True
 def seconds_to_hms(seconds):
     hour = 00
 def hms_to_seconds(hour, minute, second):
+    """Convert an hours/minutes/seconds triple into a total number of seconds."""
+    hours_in_seconds = hour*3600
+    minutes_in_seconds = minute*60
+    return hours_in_seconds + minutes_in_seconds + second
+def slice_audio(input_audio_path, output_folder, chunks_seconds, chunk_overlap_seconds):
+    """
+    Slice audio into chunks with specified duration and overlap.
+    Args:
+        input_audio_path (str): Path to input audio file
+        output_folder (str): Path to output folder
+        chunks_seconds (int): Duration of each chunk in seconds
+        chunk_overlap_seconds (int): Overlap between chunks in seconds
+    Raises:
+        ValueError: If the overlap is not smaller than the chunk duration,
+            or if ffprobe does not report a duration for the input file
+    """
+    # Imported here because the commands below no longer go through a shell
+    import subprocess
+    # Calculate effective chunk duration considering overlap; validated first
+    # so that bad arguments fail before any external program is started
+    effective_chunk = chunks_seconds - chunk_overlap_seconds
+    if effective_chunk <= 0:
+        raise ValueError("Overlap duration must be less than chunk duration")
+    _, filename = os.path.split(input_audio_path)
     name, extension = os.path.splitext(filename)
     # Get audio duration in seconds
+    # Arguments are passed as a list (no shell), so paths containing spaces
+    # or shell metacharacters are handled correctly
+    try:
+        probe = subprocess.run(
+            ['ffprobe', '-v', 'error', '-show_entries', 'format=duration',
+             '-of', 'default=noprint_wrappers=1:nokey=1', input_audio_path],
+            capture_output=True,
+            text=True,
+        )
+        duration = float(probe.stdout.strip())
+    except (OSError, ValueError) as e:
+        raise ValueError(f"Could not read audio duration from {input_audio_path}") from e
     hour, minute, second = seconds_to_hms(int(duration))
+    print(f"\tDuration ({duration} seconds): {hour:02d}:{minute:02d}:{second:02d}")
+    # Calculate number of chunks needed (ceiling division). At least one
+    # chunk, so audio shorter than the overlap is not silently dropped.
+    num_chunks = max(1, -(-int(duration - chunk_overlap_seconds) // effective_chunk))
+    # Slice audio into chunks with overlap
     output_files = []
     progress_bar = tqdm(total=num_chunks, desc="Slice audio into chunks progress")
     for chunk in range(num_chunks):
+        # Calculate start time for this chunk and convert it to HH:MM:SS
+        start_time = chunk * effective_chunk
+        hour_start, minute_start, second_start = seconds_to_hms(start_time)
+        start_stamp = f"{hour_start:02d}:{minute_start:02d}:{second_start:02d}"
+        # Generate output filename
+        output = f"{output_folder}/{name}_chunk{chunk:003d}{extension}"
+        # In DEBUG mode reuse an existing chunk, but only when it is at least
+        # as recent as the input audio: chunk names repeat between runs
+        if (
+            DEBUG
+            and os.path.exists(output)
+            and os.path.getmtime(output) >= os.path.getmtime(input_audio_path)
+        ):
+            output_files.append(output)
+            progress_bar.update(1)
+            continue
+        # Build ffmpeg command with -y flag to overwrite without asking
+        command = ['ffmpeg', '-y', '-i', input_audio_path, '-ss', start_stamp]
+        if chunk != num_chunks - 1:
+            # Every chunk but the last is limited with -t; the last one runs
+            # to the end of the input
+            chunk_duration = min(start_time + chunks_seconds, duration) - start_time
+            hour_duration, minute_duration, second_duration = seconds_to_hms(chunk_duration)
+            command += ['-t', f"{hour_duration:02d}:{minute_duration:02d}:{second_duration:02d}"]
+        command += ['-loglevel', 'error', output]
+        # Execute command; a failure is reported but, as before, does not
+        # abort the loop
+        try:
+            failed = subprocess.run(command).returncode != 0
+        except OSError:
+            failed = True
+        if failed:
+            print(f"\tffmpeg failed for chunk {output}")
         output_files.append(output)
         progress_bar.update(1)
+    progress_bar.close()
+    # Write output files to a txt file (with overwrite)
+    with open(f"{output_folder}/output_files.txt", "w") as f:
         for output_file in output_files:
             f.write(f"{output_file}\n")

transcribe.py CHANGED Viewed

@@ -2,50 +2,86 @@ import os
 import argparse
 from lang_list import LANGUAGE_NAME_TO_CODE, WHISPER_LANGUAGES
 from tqdm import tqdm
-# For pyannote.audio diarize
-from pyannote.audio import Model
-model = Model.from_pretrained("pyannote/segmentation-3.0", use_auth_token="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")
-language_dict = {}
-# Iterate over the LANGUAGE_NAME_TO_CODE dictionary
-for language_name, language_code in LANGUAGE_NAME_TO_CODE.items():
-    # Extract the language code (the first two characters before the underscore)
-    lang_code = language_code.split('_')[0].lower()
-    # Check if the language code is present in WHISPER_LANGUAGES
-    if lang_code in WHISPER_LANGUAGES:
-        # Construct the entry for the resulting dictionary
-        language_dict[language_name] = {
-            "transcriber": lang_code,
-            "translator": language_code
-        }
-def transcribe(audio_file, language, num_speakers, device):
     output_folder = "transcriptions"
-    # Transcribe audio file
-    model = "large-v2"
-    # word_timestamps = True
-    print_progress = False
-    if device == "cpu":
-        # I supose that I am on huggingface server
-        compute_type = "float32"
-    else:
-        compute_type = "float16"
-    fp16 = True
-    batch_size = 8
-    verbose = False
-    min_speakers = 1
-    max_speakers = num_speakers
-    threads = 4
-    output_format = "srt"
-    hf_token = "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"
-    command = f'whisperx {audio_file} --model {model} --batch_size {batch_size} --compute_type {compute_type} \
---output_dir {output_folder} --output_format {output_format} --verbose {verbose} --language {language} \
---fp16 {fp16} --threads {threads} --print_progress {print_progress} --device {device} \
---diarize --max_speakers {max_speakers} --min_speakers {min_speakers} --hf_token {hf_token}'
-    os.system(command)
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description='Transcribe audio files')
@@ -66,5 +102,6 @@ if __name__ == "__main__":
         _, input_name = input_file.split('/')
         extension = "mp3"
         file = f'{chunks_folder}/{input_name}.{extension}'
         transcribe(file, language_dict[args.language]["transcriber"], args.num_speakers, args.device)
-        progress_bar.update(1)

 import argparse
 from lang_list import LANGUAGE_NAME_TO_CODE, WHISPER_LANGUAGES
 from tqdm import tqdm
+import torch
+from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
+def get_language_dict():
+    """Map each language name to its transcriber and translator codes.
+
+    Only languages whose short code (the part of the code before the first
+    underscore, lower-cased) appears in WHISPER_LANGUAGES are included.
+    """
+    def short_code(language_code):
+        # Part of the code before the first underscore, lower-cased
+        return language_code.split('_')[0].lower()
+
+    return {
+        language_name: {
+            "transcriber": short_code(language_code),
+            "translator": language_code
+        }
+        for language_name, language_code in LANGUAGE_NAME_TO_CODE.items()
+        if short_code(language_code) in WHISPER_LANGUAGES
+    }
+def transcribe(audio_file, language, device, chunk_length_s=30, stride_length_s=5):
+    """
+    Transcribe audio file using Whisper model.
+    Args:
+        audio_file (str): Path to audio file
+        language (str): Language code for transcription
+        device (str): Device to use for inference ('cuda' or 'cpu')
+        chunk_length_s (int): Length of audio chunks in seconds
+        stride_length_s (int): Stride length between chunks in seconds
+    Returns:
+        The object returned by the speech-recognition pipeline call
+        (it is also printed)
+    """
     output_folder = "transcriptions"
+    os.makedirs(output_folder, exist_ok=True)
+    # Get output filename
+    audio_filename = os.path.basename(audio_file)
+    filename_without_ext = os.path.splitext(audio_filename)[0]
+    # NOTE: the .srt path is computed but nothing is written to it yet
+    output_file = os.path.join(output_folder, f"{filename_without_ext}.srt")
+    # Choose the dtype from the device that was actually requested: half
+    # precision is only used on CUDA, so device="cpu" gets float32 even on a
+    # machine where CUDA is available
+    torch_dtype = torch.float16 if str(device).startswith("cuda") else torch.float32
+    # Load model and processor
+    model_id = "openai/whisper-large-v3-turbo"
+    model = AutoModelForSpeechSeq2Seq.from_pretrained(
+        model_id,
+        torch_dtype=torch_dtype,
+        low_cpu_mem_usage=True,
+        use_safetensors=True
+    )
+    model.to(device)
+    processor = AutoProcessor.from_pretrained(model_id)
+    # Create pipeline with timestamp generation
+    pipe = pipeline(
+        "automatic-speech-recognition",
+        model=model,
+        tokenizer=processor.tokenizer,
+        feature_extractor=processor.feature_extractor,
+        torch_dtype=torch_dtype,
+        device=device,
+        chunk_length_s=chunk_length_s,
+        stride_length_s=stride_length_s,
+        return_timestamps=True
+    )
+    # Transcribe with timestamps
+    result = pipe(
+        audio_file,
+        return_timestamps=True,
+        generate_kwargs={
+            "language": language,
+            "task": "transcribe",
+            "use_cache": True,
+            "num_beams": 1
+        }
+    )
+    print(result)
+    # Hand the transcription back to the caller instead of only printing it
+    return result
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description='Transcribe audio files')
         _, input_name = input_file.split('/')
         extension = "mp3"
         file = f'{chunks_folder}/{input_name}.{extension}'
+        language_dict = get_language_dict()
         transcribe(file, language_dict[args.language]["transcriber"], args.num_speakers, args.device)
+        progress_bar.update(1)

ui_config.py ADDED Viewed

	@@ -0,0 +1,57 @@

+# Color definitions (hex strings used by the UI)
+BACKGROUND_COLOR = "#0b0f19"
+BUTTON_COLOR = "#47515f"
+SVG_COLOR = "#f3f4f6"
+PANEL_COLOR = "#101827"
+PRIMARY_TEXT_COLOR = "#f3f4f6"
+SUBDUED_TEXT_COLOR = "#59616f"
+BACKGROUND_PRIMARY_COLOR = "#1f2937"
+BACKGROUND_SECONDARY_COLOR = "#101827"
+# NOTE: "BODER" is a misspelling of "BORDER", but app.py imports this exact name
+PRIMARY_BODER_COLOR = "#323c4c"
+BLOCK_TITLE_TEXT_COLOR = "#dfe2e6"
+INPUT_BACKGROUND_COLOR = "#2f3947"
+INPUT_BORDER_COLOR = "#313b4b"
+INPUT_PLACEHOLDER_COLOR = "#616977"
+ERROR_BACKGROUND_COLOR = "#101827"
+ERROR_TEXT_COLOR = "#f7f2f2"
+ERROR_BORDER_COLOR = "#9b3339"
+BUTTON_SECONDARY_BACKGROUND_COLOR = "#434d5c"
+BUTTON_SECONDARY_BORDER_COLOR = "#444d5b"
+BUTTON_SECONDARY_TEXT_COLOR = "#c5c9cc"
+RED = "#ff0000"
+GREEN = "#00ff00"
+BLUE = "#0000ff"
+# HTML for the social media links
+html_social_media = f'''
+<div style="float: right;">
+    <a href="https://maximofn.com/" rel="noopener noreferrer" aria-disabled="false" class="sm secondary  svelte-cmf5ev" id="component-1" style="flex-grow: 100;" target="_blank">
+        <svg xmlns="http://www.w3.org/2000/svg" height="1em" viewBox="0 0 576 512">
+            <style>
+                svg {"{"}
+                    fill: {SVG_COLOR}
+                {"}"}
+            </style>
+            <path d="M208 80c0-26.5 21.5-48 48-48h64c26.5 0 48 21.5 48 48v64c0 26.5-21.5 48-48 48h-8v40H464c30.9 0 56 25.1 56 56v32h8c26.5 0 48 21.5 48 48v64c0 26.5-21.5 48-48 48H464c-26.5 0-48-21.5-48-48V368c0-26.5 21.5-48 48-48h8V288c0-4.4-3.6-8-8-8H312v40h8c26.5 0 48 21.5 48 48v64c0 26.5-21.5 48-48 48H256c-26.5 0-48-21.5-48-48V368c0-26.5 21.5-48 48-48h8V280H112c-4.4 0-8 3.6-8 8v32h8c26.5 0 48 21.5 48 48v64c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V368c0-26.5 21.5-48 48-48h8V288c0-30.9 25.1-56 56-56H264V192h-8c-26.5 0-48-21.5-48-48V80z"/>
+        </svg>
+    </a>
+    <!-- Resto de los enlaces de redes sociales... -->
+</div>
+'''
+# HTML for the logo
+def get_html_subtify_logo(new_width, new_height):
+    """Build the centered HTML snippet that shows the Subtify logo.
+
+    `new_width` and `new_height` are inserted into the image's width and
+    height attributes, each followed by "px".
+    """
+    logo_markup = f"""
+    <div style="display: flex; justify-content: center; align-items: center;">
+        <img src='https://pub-fb664c455eca46a2ba762a065ac900f7.r2.dev/subtify_logo-scaled.webp' width={new_width}px height={new_height}px >
+    </div>
+    """
+    return logo_markup
+# HTML for the Buy Me a Coffee button
+html_buy_me_a_coffe = '''
+<div style="float: right;">
+    <a href="https://www.buymeacoffee.com/maximofn" target="_blank">
+        <img src="https://img.shields.io/badge/Buy_Me_A_Coffee-support_my_work-FFDD00?style=for-the-badge&logo=buy-me-a-coffee&logoColor=white&labelColor=101010" alt="buy me a coffe">
+    </a>
+</div>
+'''

url_manager.py ADDED Viewed

	@@ -0,0 +1,87 @@

+import re
+import gradio as gr
+from pytube import YouTube
+from lang_list import union_language_dict
+language_dict = union_language_dict()
+def get_youtube_thumbnail(url):
+    """Return the `thumbnail_url` that pytube reports for the video at `url`."""
+    return YouTube(url).thumbnail_url
+def is_valid_youtube_url(url):
+    """Tell whether `url` begins with a recognised YouTube video URL.
+
+    Accepted forms (scheme and "www." are optional):
+    - https://youtube.com/watch?v=video_id
+    - https://www.youtube.com/watch?v=video_id
+    - https://youtu.be/video_id
+    """
+    # Only the start of the string is checked; trailing text is not rejected
+    youtube_pattern = r'(https?://)?(www\.)?(youtube\.com/watch\?v=|youtu\.be/)[\w-]+'
+    return re.match(youtube_pattern, url) is not None
+def is_valid_twitch_url(url):
+    """Tell whether `url` begins with a recognised Twitch URL.
+
+    Accepted forms (scheme and "www." are optional):
+    - https://twitch.tv/channel_name
+    - https://www.twitch.tv/channel_name
+    - https://twitch.tv/videos/video_id
+    """
+    # Only the start of the string is checked; trailing text is not rejected
+    found = re.match(r'(https?://)?(www\.)?twitch\.tv/(videos/\d+|\w+)', url)
+    return found is not None
+def is_valid_url(url):
+    num_speaker = [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
+    source_languaje = gr.Dropdown(visible=True, label="Source languaje", show_label=True, value="English", choices=language_dict, scale=1, interactive=True)
+    target_languaje = gr.Dropdown(visible=True, label="Target languaje", show_label=True, value="Español", choices=language_dict, scale=1, interactive=True)
+    advanced_setings = gr.Accordion(visible=True)
+    number_of_speakers = gr.Dropdown(visible=True, label="Number of speakers", show_label=True, value=10, choices=num_speaker, scale=1, interactive=True)
+    subtify_button = gr.Button(size="lg", value="subtify", min_width="10px", scale=0, visible=True)
+    # Youtube
+    if "youtube" in url.lower() or "youtu.be" in url.lower():
+        if is_valid_youtube_url(url):
+            thumbnail = get_youtube_thumbnail(url)
+            if thumbnail:
+                return (
+                    gr.Image(value=thumbnail, visible=True, show_download_button=False, container=False),
+                    source_languaje,
+                    target_languaje,
+                    advanced_setings,
+                    number_of_speakers,
+                    subtify_button,
+                )
+            else:
+                return (
+                    gr.Image(value="assets/youtube-no-thumbnails.webp", visible=True, show_download_button=False, container=False),
+                    source_languaje,
+                    target_languaje,
+                    advanced_setings,
+                    number_of_speakers,
+                    subtify_button,
+                )
+    # Twitch
+    elif "twitch" in url.lower() or "twitch.tv" in url.lower():
+        if is_valid_twitch_url(url):
+            return (
+                gr.Image(value="assets/twitch.webp", visible=True, show_download_button=False, container=False),
+                source_languaje,
+                target_languaje,
+                advanced_setings,
+                number_of_speakers,
+                subtify_button,
+            )
+    # Error
+    visible = False
+    image = gr.Image(value="assets/youtube_error.webp", visible=visible, show_download_button=False, container=False)
+    source_languaje = gr.Dropdown(visible=visible, label="Source languaje", show_label=True, value="English", choices=language_dict, scale=1, interactive=True)
+    target_languaje = gr.Dropdown(visible=visible, label="Target languaje", show_label=True, value="Español", choices=language_dict, scale=1, interactive=True)
+    advanced_setings = gr.Accordion(visible=visible)
+    number_of_speakers = gr.Dropdown(visible=visible, label="Number of speakers", show_label=True, value=10, choices=num_speaker, scale=1, interactive=True)
+    subtify_button = gr.Button(size="lg", value="subtify", min_width="10px", scale=0, visible=visible)
+    return (
+        image,
+        source_languaje,
+        target_languaje,
+        advanced_setings,
+        number_of_speakers,
+        subtify_button,
+    )