Refactor project structure and update dependencies
Browse files
- Modularize code by creating separate files for audio, transcription, and UI configuration
- Update requirements.txt to remove unnecessary dependencies
- Simplify audio extraction and transcription processes
- Modify app.py to use new modular structure
- Add new utility files like ui_config.py and audio.py
- Update .gitignore to include __pycache__
- .gitignore +2 -1
- app.py +137 -506
- audio.py +46 -0
- requirements.txt +21 -10
- slice_audio.py +53 -33
- transcribe.py +77 -40
- ui_config.py +57 -0
- url_manager.py +87 -0
.gitignore
CHANGED
@@ -14,4 +14,5 @@ sepformer.ipynb
|
|
14 |
modelscope.ipynb
|
15 |
audio_cache
|
16 |
*.png
|
17 |
-
.DS_Store
|
|
|
|
14 |
modelscope.ipynb
|
15 |
audio_cache
|
16 |
*.png
|
17 |
+
.DS_Store
|
18 |
+
__pycache__
|
app.py
CHANGED
@@ -3,14 +3,28 @@ import argparse
|
|
3 |
import spaces
|
4 |
import os
|
5 |
import torch
|
|
|
6 |
from time import sleep
|
7 |
from tqdm import tqdm
|
8 |
from lang_list import union_language_dict
|
9 |
# import pyperclip
|
10 |
-
from pytube import YouTube
|
11 |
import re
|
12 |
from PIL import Image
|
13 |
# import urllib.request
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
|
15 |
NUMBER = 100
|
16 |
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
@@ -25,317 +39,33 @@ REMOVE_FILES = True
|
|
25 |
if DEVICE == "cpu":
|
26 |
# I supose that I am on huggingface server
|
27 |
# Get RAM space
|
28 |
-
ram = int(os.popen("free -m | grep Mem | awk '{print $2}'").read())
|
|
|
29 |
factor = 1
|
30 |
-
|
31 |
-
|
|
|
|
|
32 |
else:
|
33 |
# I supose that I am on my computer
|
34 |
# Get VRAM space
|
35 |
-
|
|
|
36 |
|
37 |
YOUTUBE = "youtube"
|
38 |
TWITCH = "twitch"
|
39 |
ERROR = "error"
|
40 |
|
41 |
-
subtify_logo = Image.open("
|
42 |
subtify_logo_width, subtify_logo_height = subtify_logo.size
|
43 |
factor = 4
|
44 |
new_width = subtify_logo_width // factor
|
45 |
new_height = subtify_logo_height // factor
|
46 |
|
47 |
-
|
48 |
-
BUTTON_COLOR = "#47515f"
|
49 |
-
SVG_COLOR = "#f3f4f6"
|
50 |
-
PANEL_COLOR = "#101827"
|
51 |
-
PRIMARY_TEXT_COLOR = "#f3f4f6"
|
52 |
-
SUBDUED_TEXT_COLOR = "#59616f"
|
53 |
-
BACKGROUND_PRIMARY_COLOR = "#1f2937"
|
54 |
-
BACKGROUND_SECONDARY_COLOR = "#101827"
|
55 |
-
PRIMARY_BODER_COLOR = "#323c4c"
|
56 |
-
BLOCK_TITLE_TEXT_COLOR = "#dfe2e6"
|
57 |
-
INPUT_BACKGROUND_COLOR = "#2f3947"
|
58 |
-
INPUT_BORDER_COLOR = "#313b4b"
|
59 |
-
INPUT_PLACEHOLDER_COLOR = "#616977"
|
60 |
-
ERROR_BACKGROUND_COLOR = "#101827"
|
61 |
-
ERROR_TEXT_COLOR = "#f7f2f2"
|
62 |
-
ERROR_BORDER_COLOR = "#9b3339"
|
63 |
-
BUTTON_SECONDARY_BACKGROUND_COLOR = "#434d5c"
|
64 |
-
BUTTON_SECONDARY_BORDER_COLOR = "#444d5b"
|
65 |
-
BUTTON_SECONDARY_TEXT_COLOR = "#c5c9cc"
|
66 |
-
RED = "#ff0000"
|
67 |
-
GREEN = "#00ff00"
|
68 |
-
BLUE = "#0000ff"
|
69 |
-
|
70 |
-
html_social_media = f'''
|
71 |
-
<div style="float: right;">
|
72 |
-
<a href="https://maximofn.com/" rel="noopener noreferrer" aria-disabled="false" class="sm secondary svelte-cmf5ev" id="component-1" style="flex-grow: 100;" target="_blank">
|
73 |
-
<svg xmlns="http://www.w3.org/2000/svg" height="1em" viewBox="0 0 576 512">
|
74 |
-
<style>
|
75 |
-
svg {"{"}
|
76 |
-
fill: {SVG_COLOR}
|
77 |
-
{"}"}
|
78 |
-
</style>
|
79 |
-
<path d="M208 80c0-26.5 21.5-48 48-48h64c26.5 0 48 21.5 48 48v64c0 26.5-21.5 48-48 48h-8v40H464c30.9 0 56 25.1 56 56v32h8c26.5 0 48 21.5 48 48v64c0 26.5-21.5 48-48 48H464c-26.5 0-48-21.5-48-48V368c0-26.5 21.5-48 48-48h8V288c0-4.4-3.6-8-8-8H312v40h8c26.5 0 48 21.5 48 48v64c0 26.5-21.5 48-48 48H256c-26.5 0-48-21.5-48-48V368c0-26.5 21.5-48 48-48h8V280H112c-4.4 0-8 3.6-8 8v32h8c26.5 0 48 21.5 48 48v64c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V368c0-26.5 21.5-48 48-48h8V288c0-30.9 25.1-56 56-56H264V192h-8c-26.5 0-48-21.5-48-48V80z"/>
|
80 |
-
</svg>
|
81 |
-
</a>
|
82 |
-
<a href="http://github.com/maximofn" rel="noopener noreferrer" aria-disabled="false" class="sm secondary svelte-cmf5ev" id="component-1" style="flex-grow: 100;" target="_blank">
|
83 |
-
<svg xmlns="http://www.w3.org/2000/svg" height="1em" viewBox="0 0 496 512">
|
84 |
-
<style>
|
85 |
-
svg {"{"}
|
86 |
-
fill: {SVG_COLOR}
|
87 |
-
{"}"}
|
88 |
-
</style>
|
89 |
-
<path d="M165.9 397.4c0 2-2.3 3.6-5.2 3.6-3.3.3-5.6-1.3-5.6-3.6 0-2 2.3-3.6 5.2-3.6 3-.3 5.6 1.3 5.6 3.6zm-31.1-4.5c-.7 2 1.3 4.3 4.3 4.9 2.6 1 5.6 0 6.2-2s-1.3-4.3-4.3-5.2c-2.6-.7-5.5.3-6.2 2.3zm44.2-1.7c-2.9.7-4.9 2.6-4.6 4.9.3 2 2.9 3.3 5.9 2.6 2.9-.7 4.9-2.6 4.6-4.6-.3-1.9-3-3.2-5.9-2.9zM244.8 8C106.1 8 0 113.3 0 252c0 110.9 69.8 205.8 169.5 239.2 12.8 2.3 17.3-5.6 17.3-12.1 0-6.2-.3-40.4-.3-61.4 0 0-70 15-84.7-29.8 0 0-11.4-29.1-27.8-36.6 0 0-22.9-15.7 1.6-15.4 0 0 24.9 2 38.6 25.8 21.9 38.6 58.6 27.5 72.9 20.9 2.3-16 8.8-27.1 16-33.7-55.9-6.2-112.3-14.3-112.3-110.5 0-27.5 7.6-41.3 23.6-58.9-2.6-6.5-11.1-33.3 2.6-67.9 20.9-6.5 69 27 69 27 20-5.6 41.5-8.5 62.8-8.5s42.8 2.9 62.8 8.5c0 0 48.1-33.6 69-27 13.7 34.7 5.2 61.4 2.6 67.9 16 17.7 25.8 31.5 25.8 58.9 0 96.5-58.9 104.2-114.8 110.5 9.2 7.9 17 22.9 17 46.4 0 33.7-.3 75.4-.3 83.6 0 6.5 4.6 14.4 17.3 12.1C428.2 457.8 496 362.9 496 252 496 113.3 383.5 8 244.8 8zM97.2 352.9c-1.3 1-1 3.3.7 5.2 1.6 1.6 3.9 2.3 5.2 1 1.3-1 1-3.3-.7-5.2-1.6-1.6-3.9-2.3-5.2-1zm-10.8-8.1c-.7 1.3.3 2.9 2.3 3.9 1.6 1 3.6.7 4.3-.7.7-1.3-.3-2.9-2.3-3.9-2-.6-3.6-.3-4.3.7zm32.4 35.6c-1.6 1.3-1 4.3 1.3 6.2 2.3 2.3 5.2 2.6 6.5 1 1.3-1.3.7-4.3-1.3-6.2-2.2-2.3-5.2-2.6-6.5-1zm-11.4-14.7c-1.6 1-1.6 3.6 0 5.9 1.6 2.3 4.3 3.3 5.6 2.3 1.6-1.3 1.6-3.9 0-6.2-1.4-2.3-4-3.3-5.6-2z"/>
|
90 |
-
</svg>
|
91 |
-
</a>
|
92 |
-
<a href="http://linkedin.com/in/MaximoFN/" rel="noopener noreferrer" aria-disabled="false" class="sm secondary svelte-cmf5ev" id="component-1" style="flex-grow: 100;" target="_blank">
|
93 |
-
<svg xmlns="http://www.w3.org/2000/svg" height="1em" viewBox="0 0 448 512">
|
94 |
-
<style>
|
95 |
-
svg {"{"}
|
96 |
-
fill: {SVG_COLOR}
|
97 |
-
{"}"}
|
98 |
-
</style>
|
99 |
-
<path d="M416 32H31.9C14.3 32 0 46.5 0 64.3v383.4C0 465.5 14.3 480 31.9 480H416c17.6 0 32-14.5 32-32.3V64.3c0-17.8-14.4-32.3-32-32.3zM135.4 416H69V202.2h66.5V416zm-33.2-243c-21.3 0-38.5-17.3-38.5-38.5S80.9 96 102.2 96c21.2 0 38.5 17.3 38.5 38.5 0 21.3-17.2 38.5-38.5 38.5zm282.1 243h-66.4V312c0-24.8-.5-56.7-34.5-56.7-34.6 0-39.9 27-39.9 54.9V416h-66.4V202.2h63.7v29.2h.9c8.9-16.8 30.6-34.5 62.9-34.5 67.2 0 79.7 44.3 79.7 101.9V416z"/>
|
100 |
-
</svg>
|
101 |
-
</a>
|
102 |
-
<a href="http://kaggle.com/maximofn" rel="noopener noreferrer" aria-disabled="false" class="sm secondary svelte-cmf5ev" id="component-1" style="flex-grow: 100;" target="_blank">
|
103 |
-
<svg xmlns="http://www.w3.org/2000/svg" height="1em" viewBox="0 0 320 512">
|
104 |
-
<style>
|
105 |
-
svg {"{"}
|
106 |
-
fill: {SVG_COLOR}
|
107 |
-
{"}"}
|
108 |
-
</style>
|
109 |
-
<path d="M304.2 501.5L158.4 320.3 298.2 185c2.6-2.7 1.7-10.5-5.3-10.5h-69.2c-3.5 0-7 1.8-10.5 5.3L80.9 313.5V7.5q0-7.5-7.5-7.5H21.5Q14 0 14 7.5v497q0 7.5 7.5 7.5h51.9q7.5 0 7.5-7.5v-109l30.8-29.3 110.5 140.6c3 3.5 6.5 5.3 10.5 5.3h66.9q5.25 0 6-3z"/>
|
110 |
-
</svg>
|
111 |
-
</a>
|
112 |
-
<a href="https://twitter.com/Maximo_fn" rel="noopener noreferrer" aria-disabled="false" class="sm secondary svelte-cmf5ev" id="component-1" style="flex-grow: 100;" target="_blank">
|
113 |
-
<svg xmlns="http://www.w3.org/2000/svg" height="1em" viewBox="0 0 512 512">
|
114 |
-
<style>
|
115 |
-
svg {"{"}
|
116 |
-
fill: {SVG_COLOR}
|
117 |
-
{"}"}
|
118 |
-
</style>
|
119 |
-
<path d="M389.2 48h70.6L305.6 224.2 487 464H345L233.7 318.6 106.5 464H35.8L200.7 275.5 26.8 48H172.4L272.9 180.9 389.2 48zM364.4 421.8h39.1L151.1 88h-42L364.4 421.8z"/>
|
120 |
-
</svg>
|
121 |
-
</a>
|
122 |
-
<a href="https://www.instagram.com/maximo__fn/" rel="noopener noreferrer" aria-disabled="false" class="sm secondary svelte-cmf5ev" id="component-1" style="flex-grow: 100;" target="_blank">
|
123 |
-
<svg xmlns="http://www.w3.org/2000/svg" height="1em" viewBox="0 0 448 512">
|
124 |
-
<style>
|
125 |
-
svg {"{"}
|
126 |
-
fill: {SVG_COLOR}
|
127 |
-
{"}"}
|
128 |
-
</style>
|
129 |
-
<path d="M224.1 141c-63.6 0-114.9 51.3-114.9 114.9s51.3 114.9 114.9 114.9S339 319.5 339 255.9 287.7 141 224.1 141zm0 189.6c-41.1 0-74.7-33.5-74.7-74.7s33.5-74.7 74.7-74.7 74.7 33.5 74.7 74.7-33.6 74.7-74.7 74.7zm146.4-194.3c0 14.9-12 26.8-26.8 26.8-14.9 0-26.8-12-26.8-26.8s12-26.8 26.8-26.8 26.8 12 26.8 26.8zm76.1 27.2c-1.7-35.9-9.9-67.7-36.2-93.9-26.2-26.2-58-34.4-93.9-36.2-37-2.1-147.9-2.1-184.9 0-35.8 1.7-67.6 9.9-93.9 36.1s-34.4 58-36.2 93.9c-2.1 37-2.1 147.9 0 184.9 1.7 35.9 9.9 67.7 36.2 93.9s58 34.4 93.9 36.2c37 2.1 147.9 2.1 184.9 0 35.9-1.7 67.7-9.9 93.9-36.2 26.2-26.2 34.4-58 36.2-93.9 2.1-37 2.1-147.8 0-184.8zM398.8 388c-7.8 19.6-22.9 34.7-42.6 42.6-29.5 11.7-99.5 9-132.1 9s-102.7 2.6-132.1-9c-19.6-7.8-34.7-22.9-42.6-42.6-11.7-29.5-9-99.5-9-132.1s-2.6-102.7 9-132.1c7.8-19.6 22.9-34.7 42.6-42.6 29.5-11.7 99.5-9 132.1-9s102.7-2.6 132.1 9c19.6 7.8 34.7 22.9 42.6 42.6 11.7 29.5 9 99.5 9 132.1s2.7 102.7-9 132.1z"/>
|
130 |
-
</svg>
|
131 |
-
</a>
|
132 |
-
<a href="https://www.youtube.com/channel/UCdQwg2JU_fWRsHn3yIlf3tw" rel="noopener noreferrer" aria-disabled="false" class="sm secondary svelte-cmf5ev" id="component-1" style="flex-grow: 100;" target="_blank">
|
133 |
-
<svg xmlns="http://www.w3.org/2000/svg" height="1em" viewBox="0 0 576 512">
|
134 |
-
<style>
|
135 |
-
svg {"{"}
|
136 |
-
fill: {SVG_COLOR}
|
137 |
-
{"}"}
|
138 |
-
</style>
|
139 |
-
<path d="M549.655 124.083c-6.281-23.65-24.787-42.276-48.284-48.597C458.781 64 288 64 288 64S117.22 64 74.629 75.486c-23.497 6.322-42.003 24.947-48.284 48.597-11.412 42.867-11.412 132.305-11.412 132.305s0 89.438 11.412 132.305c6.281 23.65 24.787 41.5 48.284 47.821C117.22 448 288 448 288 448s170.78 0 213.371-11.486c23.497-6.321 42.003-24.171 48.284-47.821 11.412-42.867 11.412-132.305 11.412-132.305s0-89.438-11.412-132.305zm-317.51 213.508V175.185l142.739 81.205-142.739 81.201z"/>
|
140 |
-
</svg>
|
141 |
-
</a>
|
142 |
-
<a href="https://www.facebook.com/profile.php?id=100085177670661" rel="noopener noreferrer" aria-disabled="false" class="sm secondary svelte-cmf5ev" id="component-1" style="flex-grow: 100;" target="_blank">
|
143 |
-
<svg xmlns="http://www.w3.org/2000/svg" height="1em" viewBox="0 0 512 512">
|
144 |
-
<style>
|
145 |
-
svg {"{"}
|
146 |
-
fill: {SVG_COLOR}
|
147 |
-
{"}"}
|
148 |
-
</style>
|
149 |
-
<path d="M504 256C504 119 393 8 256 8S8 119 8 256c0 123.78 90.69 226.38 209.25 245V327.69h-63V256h63v-54.64c0-62.15 37-96.48 93.67-96.48 27.14 0 55.52 4.84 55.52 4.84v61h-31.28c-30.8 0-40.41 19.12-40.41 38.73V256h68.78l-11 71.69h-57.78V501C413.31 482.38 504 379.78 504 256z"/>
|
150 |
-
</svg>
|
151 |
-
</a>
|
152 |
-
<a href="https://www.tiktok.com/@maximo__fn" rel="noopener noreferrer" aria-disabled="false" class="sm secondary svelte-cmf5ev" id="component-1" style="flex-grow: 100;" target="_blank">
|
153 |
-
<svg xmlns="http://www.w3.org/2000/svg" height="1em" viewBox="0 0 448 512">
|
154 |
-
<style>
|
155 |
-
svg {"{"}
|
156 |
-
fill: {SVG_COLOR}
|
157 |
-
{"}"}
|
158 |
-
</style>
|
159 |
-
<path d="M448,209.91a210.06,210.06,0,0,1-122.77-39.25V349.38A162.55,162.55,0,1,1,185,188.31V278.2a74.62,74.62,0,1,0,52.23,71.18V0l88,0a121.18,121.18,0,0,0,1.86,22.17h0A122.18,122.18,0,0,0,381,102.39a121.43,121.43,0,0,0,67,20.14Z"/>
|
160 |
-
</svg>
|
161 |
-
</a>
|
162 |
-
<a href="https://www.twitch.tv/maximofn/" rel="noopener noreferrer" aria-disabled="false" class="sm secondary svelte-cmf5ev" id="component-1" style="flex-grow: 100;" target="_blank">
|
163 |
-
<svg xmlns="http://www.w3.org/2000/svg" height="1em" viewBox="0 0 512 512">
|
164 |
-
<style>
|
165 |
-
svg {"{"}
|
166 |
-
fill: {SVG_COLOR}
|
167 |
-
{"}"}
|
168 |
-
</style>
|
169 |
-
<path d="M391.17,103.47H352.54v109.7h38.63ZM285,103H246.37V212.75H285ZM120.83,0,24.31,91.42V420.58H140.14V512l96.53-91.42h77.25L487.69,256V0ZM449.07,237.75l-77.22,73.12H294.61l-67.6,64v-64H140.14V36.58H449.07Z"/>
|
170 |
-
</svg>
|
171 |
-
</a>
|
172 |
-
</div>
|
173 |
-
'''
|
174 |
-
|
175 |
-
html_subtify_logo = f"""
|
176 |
-
<div style="display: flex; justify-content: center; align-items: center;">
|
177 |
-
<img src='https://pub-fb664c455eca46a2ba762a065ac900f7.r2.dev/subtify_logo-scaled.webp' width={new_width}px height={new_height}px >
|
178 |
-
</div>
|
179 |
-
"""
|
180 |
-
|
181 |
-
html_buy_me_a_coffe = '''
|
182 |
-
<div style="float: right;">
|
183 |
-
<a href="https://www.buymeacoffee.com/maximofn" target="_blank">
|
184 |
-
<img src="https://img.shields.io/badge/Buy_Me_A_Coffee-support_my_work-FFDD00?style=for-the-badge&logo=buy-me-a-coffee&logoColor=white&labelColor=101010" alt="buy me a coffe">
|
185 |
-
</a>
|
186 |
-
</div>
|
187 |
-
'''
|
188 |
|
189 |
language_dict = union_language_dict()
|
190 |
|
191 |
-
# def subtify_no_ui():
|
192 |
-
# number_works = 6
|
193 |
-
# progress_bar = tqdm(total=number_works, desc="Subtify")
|
194 |
-
# folder_chunck = "chunks"
|
195 |
-
# folder_concatenated = "concatenated_transcriptions"
|
196 |
-
# folder_translated_transcriptions = "translated_transcriptions"
|
197 |
-
# if not os.path.exists(folder_chunck):
|
198 |
-
# os.makedirs(folder_chunck)
|
199 |
-
# if not os.path.exists(folder_concatenated):
|
200 |
-
# os.makedirs(folder_concatenated)
|
201 |
-
# if not os.path.exists(folder_translated_transcriptions):
|
202 |
-
# os.makedirs(folder_translated_transcriptions)
|
203 |
-
|
204 |
-
# ################## Download video and audio ##################
|
205 |
-
# if DOWNLOAD:
|
206 |
-
# print('*'*NUMBER)
|
207 |
-
# # url = "https://www.twitch.tv/videos/1936119752" # twitch Rob Mula 2 horas
|
208 |
-
# # url = "https://www.youtube.com/watch?v=yX5EJf4R77s" # ✅ debate, varios hablantes, 3 minutos
|
209 |
-
# # url = "https://www.youtube.com/watch?v=cgx0QnXo1OU" # ✅ smart home, un solo hablante, 4:42 minutos
|
210 |
-
# # url = "https://www.youtube.com/watch?v=dgOBxhi19T8" # ✅ rob mula, muchos hablantes, 4:28 minutos
|
211 |
-
# # url = "https://www.youtube.com/watch?v=Coj72EzmX20" # rob mula, un solo hablante, 16 minutos
|
212 |
-
# # url = "https://www.youtube.com/watch?v=Tqth0fKo0_g" # Conversación short
|
213 |
-
# url = "https://www.youtube.com/watch?v=h9xPrgTYP_0" # Letitia 40 segundos
|
214 |
-
# print(f"Downloading video and audio from {url}")
|
215 |
-
# python_file = "download.py"
|
216 |
-
# command = f"python {python_file} {url}"
|
217 |
-
# os.system(command)
|
218 |
-
# sleep(1)
|
219 |
-
# print('*'*NUMBER)
|
220 |
-
# print("\n\n")
|
221 |
-
# progress_bar.update(1)
|
222 |
-
|
223 |
-
# ################## Slice audio ##################
|
224 |
-
# if SLICE_AUDIO:
|
225 |
-
# print('*'*NUMBER)
|
226 |
-
# print("Slicing audio")
|
227 |
-
# python_file = "slice_audio.py"
|
228 |
-
# audio = "audios/download_audio.mp3"
|
229 |
-
# command = f"python {python_file} {audio} {SECONDS}"
|
230 |
-
# os.system(command)
|
231 |
-
# print('*'*NUMBER)
|
232 |
-
# print("\n\n")
|
233 |
-
# progress_bar.update(1)
|
234 |
-
|
235 |
-
# ################# Transcript slices ##################
|
236 |
-
# if TRANSCRIBE_AUDIO:
|
237 |
-
# print('*'*NUMBER)
|
238 |
-
# print("Transcript slices")
|
239 |
-
# chunks_folder = "chunks"
|
240 |
-
# if not os.path.exists(chunks_folder):
|
241 |
-
# os.makedirs(chunks_folder)
|
242 |
-
# python_file = "transcribe.py"
|
243 |
-
# chunks_file = "chunks/output_files.txt"
|
244 |
-
# number_of_speakers = 10
|
245 |
-
# source_languaje = "English"
|
246 |
-
# command = f"python {python_file} {chunks_file} {source_languaje} {number_of_speakers} {DEVICE}"
|
247 |
-
# os.system(command)
|
248 |
-
# if REMOVE_FILES:
|
249 |
-
# with open(chunks_file, 'r') as f:
|
250 |
-
# files = f.read().splitlines()
|
251 |
-
# for file in files:
|
252 |
-
# audios_extension = "mp3"
|
253 |
-
# file_name, _ = file.split(".")
|
254 |
-
# _, file_name = file_name.split("/")
|
255 |
-
# vocal = f'{chunks_folder}/{file_name}.{audios_extension}'
|
256 |
-
# command = f"rm {vocal}"
|
257 |
-
# os.system(command)
|
258 |
-
# print('*'*NUMBER)
|
259 |
-
# print("\n\n")
|
260 |
-
# progress_bar.update(1)
|
261 |
-
|
262 |
-
# ################## Concatenate transcriptions ##################
|
263 |
-
# if CONCATENATE_TRANSCRIPTIONS:
|
264 |
-
# print('*'*NUMBER)
|
265 |
-
# print("Concatenate transcriptions")
|
266 |
-
# folder_concatenated = "concatenated_transcriptions"
|
267 |
-
# if not os.path.exists(folder_concatenated):
|
268 |
-
# os.makedirs(folder_concatenated)
|
269 |
-
|
270 |
-
# chunck_file = "chunks/output_files.txt"
|
271 |
-
# python_file = "concat_transcriptions.py"
|
272 |
-
# command = f"python {python_file} {chunck_file} {SECONDS}"
|
273 |
-
# os.system(command)
|
274 |
-
# if REMOVE_FILES:
|
275 |
-
# with open(chunck_file, 'r') as f:
|
276 |
-
# files = f.read().splitlines()
|
277 |
-
# for file in files:
|
278 |
-
# file_name, _ = file.split(".")
|
279 |
-
# _, file_name = file_name.split("/")
|
280 |
-
# transcriptions_folder = "transcriptions"
|
281 |
-
# transcription_extension = "srt"
|
282 |
-
# command = f"rm {transcriptions_folder}/{file_name}.{transcription_extension}"
|
283 |
-
# os.system(command)
|
284 |
-
# print('*'*NUMBER)
|
285 |
-
# print("\n\n")
|
286 |
-
# progress_bar.update(1)
|
287 |
-
|
288 |
-
# ################## Translate transcription ##################
|
289 |
-
# target_languaje = "Español"
|
290 |
-
# if TRANSLATE_TRANSCRIPTIONS:
|
291 |
-
# print('*'*NUMBER)
|
292 |
-
# print("Translate transcription")
|
293 |
-
# transcription_file = "concatenated_transcriptions/download_audio.srt"
|
294 |
-
# source_languaje = "English"
|
295 |
-
# python_file = "translate_transcriptions.py"
|
296 |
-
# command = f"python {python_file} {transcription_file} --source_languaje {source_languaje} --target_languaje {target_languaje} --device {DEVICE}"
|
297 |
-
# os.system(command)
|
298 |
-
# if REMOVE_FILES:
|
299 |
-
# if os.path.exists(transcription_file):
|
300 |
-
# command = f"rm {transcription_file}"
|
301 |
-
# os.system(command)
|
302 |
-
# print('*'*NUMBER)
|
303 |
-
# print("\n\n")
|
304 |
-
# progress_bar.update(1)
|
305 |
-
|
306 |
-
# ################## Add subtitles to video ##################
|
307 |
-
# if ADD_SUBTITLES_TO_VIDEO:
|
308 |
-
# print('*'*NUMBER)
|
309 |
-
# print("Add subtitles to video")
|
310 |
-
# python_file = "add_subtitles_to_video.py"
|
311 |
-
# transcription_file = f"translated_transcriptions/download_audio_{target_languaje}.srt"
|
312 |
-
# input_video_file = "videos/download_video.mp4"
|
313 |
-
# input_audio_file = "audios/download_audio.mp3"
|
314 |
-
# command = f"python {python_file} {transcription_file} {input_video_file} {input_audio_file}"
|
315 |
-
# os.system(command)
|
316 |
-
# if REMOVE_FILES:
|
317 |
-
# if os.path.exists(input_video_file):
|
318 |
-
# command = f"rm {input_video_file}"
|
319 |
-
# os.system(command)
|
320 |
-
# if os.path.exists(input_audio_file):
|
321 |
-
# command = f"rm {input_audio_file}"
|
322 |
-
# os.system(command)
|
323 |
-
# if os.path.exists(transcription_file):
|
324 |
-
# command = f"rm {transcription_file}"
|
325 |
-
# os.system(command)
|
326 |
-
# if os.path.exists("chunks/output_files.txt"):
|
327 |
-
# command = f"rm chunks/output_files.txt"
|
328 |
-
# os.system(command)
|
329 |
-
# if os.path.exists("chunks"):
|
330 |
-
# command = f"rm -r chunks"
|
331 |
-
# os.system(command)
|
332 |
-
# if os.path.exists("vocals/speakers.txt"):
|
333 |
-
# command = f"rm vocals/speakers.txt"
|
334 |
-
# os.system(command)
|
335 |
-
# print('*'*NUMBER)
|
336 |
-
# print("\n\n")
|
337 |
-
# progress_bar.update(1)
|
338 |
-
|
339 |
def remove_all_files():
|
340 |
if os.path.exists("audios"):
|
341 |
command = f"rm -r audios"
|
@@ -359,13 +89,10 @@ def remove_all_files():
|
|
359 |
command = f"rm -r vocals"
|
360 |
os.system(command)
|
361 |
|
362 |
-
# def paste_url_from_clipboard():
|
363 |
-
# return pyperclip.paste()
|
364 |
-
|
365 |
def reset_frontend():
|
366 |
visible = False
|
367 |
return (
|
368 |
-
|
369 |
gr.Image(visible=visible),
|
370 |
gr.Dropdown(visible=visible),
|
371 |
gr.Dropdown(visible=visible),
|
@@ -381,142 +108,47 @@ def reset_frontend():
|
|
381 |
gr.Textbox(visible=visible),
|
382 |
gr.Textbox(visible=visible),
|
383 |
gr.Textbox(visible=visible),
|
384 |
-
gr.Textbox(visible=visible),
|
385 |
-
gr.Textbox(visible=visible),
|
386 |
gr.Video(visible=visible),
|
387 |
)
|
388 |
|
389 |
def show_auxiliar_block1():
|
390 |
return gr.Textbox(value="URL checked", visible=False)
|
391 |
|
392 |
-
def get_youtube_thumbnail(url):
|
393 |
-
yt = YouTube(url)
|
394 |
-
thumbnail_url = yt.thumbnail_url
|
395 |
-
return thumbnail_url
|
396 |
-
|
397 |
-
def is_valid_youtube_url(url):
|
398 |
-
# This regular expression should match the following YouTube URL formats:
|
399 |
-
# - https://youtube.com/watch?v=video_id
|
400 |
-
# - https://www.youtube.com/watch?v=video_id
|
401 |
-
# - https://youtu.be/video_id
|
402 |
-
patron_youtube = r'(https?://)?(www\.)?(youtube\.com/watch\?v=|youtu\.be/)[\w-]+'
|
403 |
-
return bool(re.match(patron_youtube, url))
|
404 |
-
|
405 |
-
def is_valid_twitch_url(url):
|
406 |
-
# This regular expression should match the following Twitch URL formats:
|
407 |
-
# - https://twitch.tv/channel_name
|
408 |
-
# - https://www.twitch.tv/channel_name
|
409 |
-
# - https://twitch.tv/videos/video_id
|
410 |
-
twitch_pattern = r'(https?://)?(www\.)?twitch\.tv/(videos/\d+|\w+)'
|
411 |
-
return bool(re.match(twitch_pattern, url))
|
412 |
-
|
413 |
-
def is_valid_url(url):
|
414 |
-
num_speaker = [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
|
415 |
-
source_languaje = gr.Dropdown(visible=True, label="Source languaje", show_label=True, value="English", choices=language_dict, scale=1, interactive=True)
|
416 |
-
target_languaje = gr.Dropdown(visible=True, label="Target languaje", show_label=True, value="Español", choices=language_dict, scale=1, interactive=True)
|
417 |
-
advanced_setings = gr.Accordion(visible=True)
|
418 |
-
number_of_speakers = gr.Dropdown(visible=True, label="Number of speakers", show_label=True, value=10, choices=num_speaker, scale=1, interactive=True)
|
419 |
-
subtify_button = gr.Button(size="lg", value="subtify", min_width="10px", scale=0, visible=True)
|
420 |
-
|
421 |
-
# Youtube
|
422 |
-
if "youtube" in url.lower() or "youtu.be" in url.lower():
|
423 |
-
if is_valid_youtube_url(url):
|
424 |
-
thumbnail = get_youtube_thumbnail(url)
|
425 |
-
if thumbnail:
|
426 |
-
return (
|
427 |
-
gr.Image(value=thumbnail, visible=True, show_download_button=False, container=False),
|
428 |
-
source_languaje,
|
429 |
-
target_languaje,
|
430 |
-
advanced_setings,
|
431 |
-
number_of_speakers,
|
432 |
-
subtify_button,
|
433 |
-
)
|
434 |
-
else:
|
435 |
-
return (
|
436 |
-
gr.Image(value="assets/youtube-no-thumbnails.webp", visible=True, show_download_button=False, container=False),
|
437 |
-
source_languaje,
|
438 |
-
target_languaje,
|
439 |
-
advanced_setings,
|
440 |
-
number_of_speakers,
|
441 |
-
subtify_button,
|
442 |
-
)
|
443 |
-
|
444 |
-
# Twitch
|
445 |
-
elif "twitch" in url.lower() or "twitch.tv" in url.lower():
|
446 |
-
if is_valid_twitch_url(url):
|
447 |
-
return (
|
448 |
-
gr.Image(value="assets/twitch.webp", visible=True, show_download_button=False, container=False),
|
449 |
-
source_languaje,
|
450 |
-
target_languaje,
|
451 |
-
advanced_setings,
|
452 |
-
number_of_speakers,
|
453 |
-
subtify_button,
|
454 |
-
)
|
455 |
-
|
456 |
-
# Error
|
457 |
-
visible = False
|
458 |
-
image = gr.Image(value="assets/youtube_error.webp", visible=visible, show_download_button=False, container=False)
|
459 |
-
source_languaje = gr.Dropdown(visible=visible, label="Source languaje", show_label=True, value="English", choices=language_dict, scale=1, interactive=True)
|
460 |
-
target_languaje = gr.Dropdown(visible=visible, label="Target languaje", show_label=True, value="Español", choices=language_dict, scale=1, interactive=True)
|
461 |
-
advanced_setings = gr.Accordion(visible=visible)
|
462 |
-
number_of_speakers = gr.Dropdown(visible=visible, label="Number of speakers", show_label=True, value=10, choices=num_speaker, scale=1, interactive=True)
|
463 |
-
subtify_button = gr.Button(size="lg", value="subtify", min_width="10px", scale=0, visible=visible)
|
464 |
-
return (
|
465 |
-
image,
|
466 |
-
source_languaje,
|
467 |
-
target_languaje,
|
468 |
-
advanced_setings,
|
469 |
-
number_of_speakers,
|
470 |
-
subtify_button,
|
471 |
-
)
|
472 |
-
|
473 |
def change_visibility_texboxes():
|
474 |
-
|
475 |
return (
|
476 |
-
gr.
|
477 |
-
gr.
|
478 |
-
gr.
|
479 |
-
gr.
|
480 |
-
gr.
|
481 |
-
gr.
|
482 |
-
gr.
|
483 |
-
gr.Textbox(visible=False),
|
484 |
)
|
485 |
|
486 |
-
def
|
487 |
print('*'*NUMBER)
|
488 |
-
print(f"
|
489 |
-
|
490 |
-
audios_folder = "audios"
|
491 |
-
videos_folder = "videos"
|
492 |
-
if not os.path.exists(audios_folder):
|
493 |
-
os.makedirs(audios_folder)
|
494 |
-
if not os.path.exists(videos_folder):
|
495 |
-
os.makedirs(videos_folder)
|
496 |
|
497 |
-
|
498 |
-
|
499 |
-
|
500 |
-
|
501 |
-
|
502 |
-
|
503 |
-
|
504 |
-
|
505 |
-
|
506 |
-
|
507 |
-
|
508 |
-
|
509 |
-
|
510 |
-
|
511 |
-
|
512 |
-
gr.Textbox(value=audio),
|
513 |
-
gr.Textbox(value=video),
|
514 |
-
)
|
515 |
-
|
516 |
-
def slice_audio(audio_path):
|
517 |
print('*'*NUMBER)
|
518 |
-
print("Slicing audio")
|
519 |
|
|
|
|
|
520 |
folder_vocals = "vocals"
|
521 |
folder_chunck = "chunks"
|
522 |
if not os.path.exists(folder_vocals):
|
@@ -524,34 +156,22 @@ def slice_audio(audio_path):
|
|
524 |
if not os.path.exists(folder_chunck):
|
525 |
os.makedirs(folder_chunck)
|
526 |
|
527 |
-
|
528 |
-
command = f"python {python_file} {audio_path} {SECONDS}"
|
529 |
-
os.system(command)
|
530 |
|
531 |
return (
|
532 |
-
gr.
|
533 |
)
|
534 |
|
535 |
-
def trascribe_audio(
|
536 |
print('*'*NUMBER)
|
537 |
-
print("Transcript
|
538 |
-
|
539 |
-
folder_chunks = "chunks"
|
540 |
-
python_file = "transcribe.py"
|
541 |
-
chunks_file = "chunks/output_files.txt"
|
542 |
-
command = f"python {python_file} {chunks_file} {source_languaje} {number_of_speakers} {DEVICE}"
|
543 |
-
os.system(command)
|
544 |
|
545 |
-
|
546 |
-
|
547 |
-
for file in files:
|
548 |
-
audios_extension = "mp3"
|
549 |
-
file_name, _ = file.split(".")
|
550 |
-
_, file_name = file_name.split("/")
|
551 |
-
vocal = f'{folder_chunks}/{file_name}.{audios_extension}'
|
552 |
-
command = f"rm {vocal}"
|
553 |
-
os.system(command)
|
554 |
|
|
|
|
|
|
|
555 |
return (
|
556 |
gr.Textbox(value="Ok")
|
557 |
)
|
@@ -566,7 +186,7 @@ def concatenate_transcriptions():
|
|
566 |
|
567 |
chunck_file = "chunks/output_files.txt"
|
568 |
python_file = "concat_transcriptions.py"
|
569 |
-
command = f"python {python_file} {chunck_file} {
|
570 |
os.system(command)
|
571 |
|
572 |
with open(chunck_file, 'r') as f:
|
@@ -651,6 +271,23 @@ def hide_textbobes_progress_info():
|
|
651 |
gr.Textbox(value="Waiting", visible=visible),
|
652 |
)
|
653 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
654 |
@spaces.GPU
|
655 |
def subtify():
|
656 |
with gr.Blocks(
|
@@ -700,54 +337,59 @@ def subtify():
|
|
700 |
gr.HTML(html_social_media)
|
701 |
gr.HTML("<h1 style='text-align: center;'>Subtify</h1>")
|
702 |
gr.HTML(html_subtify_logo)
|
703 |
-
|
704 |
-
|
705 |
-
# paste_button = gr.Button(size="sm", icon="icons/paste.svg", value="paste", min_width="10px", scale=0)
|
706 |
-
delete_button = gr.Button(size="sm", icon="icons/delete.svg", value="clear", min_width="10px", scale=0)
|
707 |
-
|
708 |
visible = False
|
709 |
-
|
710 |
-
with
|
711 |
-
|
712 |
-
with
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
713 |
with gr.Row():
|
714 |
-
source_languaje = gr.Dropdown(visible=
|
715 |
-
target_languaje = gr.Dropdown(visible=
|
716 |
-
with gr.Accordion("Advanced settings", open=False, visible=
|
717 |
-
number_of_speakers = gr.Dropdown(visible=
|
718 |
-
subtify_button = gr.Button(size="lg", value="subtify", min_width="10px", scale=0, visible=
|
719 |
|
720 |
-
|
721 |
with gr.Row():
|
722 |
-
|
723 |
-
video_sliced_progress_info = gr.Textbox(placeholder="Waiting", label="Video slice progress info", elem_id="video_sliced_progress_info", interactive=False, visible=visible)
|
724 |
video_transcribed_progress_info = gr.Textbox(placeholder="Waiting", label="Transcribe progress info", elem_id="video_transcribed_progress_info", interactive=False, visible=visible)
|
725 |
transcriptions_concatenated_progress_info = gr.Textbox(placeholder="Waiting", label="Concatenate progress info", elem_id="transcriptions_concatenated_progress_info", interactive=False, visible=visible)
|
726 |
video_translated_progress_info = gr.Textbox(placeholder="Waiting", label="Translate progress info", elem_id="transcription_translated_progress_info", interactive=False, visible=visible)
|
727 |
video_subtitled_progress_info = gr.Textbox(placeholder="Waiting", label="Video subtitle progress info", elem_id="video_subtitled_progress_info", interactive=False, visible=visible)
|
728 |
|
729 |
original_audio_path = gr.Textbox(label="Original audio path", elem_id="original_audio_path", visible=visible)
|
730 |
-
original_video_path = gr.Textbox(label="Original video path",
|
731 |
original_audio_transcribed_path = gr.Textbox(label="Original audio transcribed", elem_id="original_audio_transcribed", visible=visible)
|
732 |
original_audio_translated_path = gr.Textbox(label="Original audio translated", elem_id="original_audio_translated", visible=visible)
|
733 |
subtitled_video = gr.Video(label="Subtitled video", elem_id="subtitled_video", visible=visible, interactive=visible)
|
734 |
auxiliar_block3 = gr.Textbox(placeholder="Waiting", label="Auxiliar block 3", elem_id="auxiliar_block3", interactive=False, visible=visible)
|
735 |
|
|
|
|
|
736 |
# Events
|
737 |
# paste_button.click(fn=paste_url_from_clipboard, outputs=url_textbox)
|
738 |
delete_button.click(
|
739 |
fn=reset_frontend,
|
740 |
outputs=[
|
741 |
-
|
742 |
-
image,
|
743 |
source_languaje,
|
744 |
target_languaje,
|
745 |
Advanced_setings,
|
746 |
number_of_speakers,
|
747 |
subtify_button,
|
748 |
-
|
749 |
-
video_donwloaded_progress_info,
|
750 |
-
video_sliced_progress_info,
|
751 |
video_transcribed_progress_info,
|
752 |
transcriptions_concatenated_progress_info,
|
753 |
video_translated_progress_info,
|
@@ -755,54 +397,43 @@ def subtify():
|
|
755 |
subtitled_video,
|
756 |
]
|
757 |
)
|
758 |
-
|
759 |
-
fn=
|
760 |
-
|
761 |
-
|
762 |
-
auxiliar_block1.change(
|
763 |
-
fn=is_valid_url,
|
764 |
-
inputs=url_textbox,
|
765 |
-
outputs=[image, source_languaje, target_languaje, Advanced_setings, number_of_speakers, subtify_button]
|
766 |
)
|
767 |
subtify_button.click(
|
768 |
fn=change_visibility_texboxes,
|
769 |
-
outputs=[
|
770 |
-
)
|
771 |
-
auxiliar_block2.change(
|
772 |
-
fn=get_audio_and_video_from_video,
|
773 |
-
inputs=[url_textbox],
|
774 |
-
outputs=[video_donwloaded_progress_info, original_audio_path, original_video_path]
|
775 |
)
|
776 |
-
|
777 |
-
fn=
|
778 |
-
inputs=[
|
779 |
-
outputs=[
|
780 |
)
|
781 |
-
|
782 |
fn=trascribe_audio,
|
783 |
-
inputs=[
|
784 |
outputs=[video_transcribed_progress_info]
|
785 |
)
|
786 |
-
video_transcribed_progress_info.change(
|
787 |
-
|
788 |
-
|
789 |
-
)
|
790 |
-
transcriptions_concatenated_progress_info.change(
|
791 |
-
|
792 |
-
|
793 |
-
|
794 |
-
)
|
795 |
-
video_translated_progress_info.change(
|
796 |
-
|
797 |
-
|
798 |
-
|
799 |
-
)
|
800 |
-
auxiliar_block3.change(
|
801 |
-
|
802 |
-
|
803 |
-
)
|
804 |
-
|
805 |
-
gr.HTML(html_buy_me_a_coffe)
|
806 |
|
807 |
demo.launch()
|
808 |
|
|
|
3 |
import spaces
|
4 |
import os
|
5 |
import torch
|
6 |
+
import shutil
|
7 |
from time import sleep
|
8 |
from tqdm import tqdm
|
9 |
from lang_list import union_language_dict
|
10 |
# import pyperclip
|
|
|
11 |
import re
|
12 |
from PIL import Image
|
13 |
# import urllib.request
|
14 |
+
from ui_config import (
|
15 |
+
BACKGROUND_COLOR, BUTTON_COLOR, SVG_COLOR, PANEL_COLOR,
|
16 |
+
PRIMARY_TEXT_COLOR, SUBDUED_TEXT_COLOR, BACKGROUND_PRIMARY_COLOR,
|
17 |
+
BACKGROUND_SECONDARY_COLOR, PRIMARY_BODER_COLOR, BLOCK_TITLE_TEXT_COLOR,
|
18 |
+
INPUT_BACKGROUND_COLOR, INPUT_BORDER_COLOR, INPUT_PLACEHOLDER_COLOR,
|
19 |
+
ERROR_BACKGROUND_COLOR, ERROR_TEXT_COLOR, ERROR_BORDER_COLOR,
|
20 |
+
BUTTON_SECONDARY_BACKGROUND_COLOR, BUTTON_SECONDARY_BORDER_COLOR,
|
21 |
+
BUTTON_SECONDARY_TEXT_COLOR, RED, GREEN, BLUE,
|
22 |
+
html_social_media, get_html_subtify_logo, html_buy_me_a_coffe
|
23 |
+
)
|
24 |
+
# from url_manager import get_youtube_thumbnail, is_valid_youtube_url, is_valid_twitch_url, is_valid_url
|
25 |
+
from slice_audio import slice_audio as slice_audio_main
|
26 |
+
from audio import get_audio_from_video
|
27 |
+
from transcribe import transcribe, get_language_dict
|
28 |
|
29 |
NUMBER = 100
|
30 |
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
|
|
39 |
# Chunking parameters handed to the audio slicer and to the transcriber.
if DEVICE == "cpu":
    # No CUDA device: presumably running on the Hugging Face server.
    # RAM used to be probed with `free -m`; a fixed figure is used instead and
    # is only reported in the log line below (it no longer drives CHUNK_SECONDS).
    ram = 16000
    CHUNK_SECONDS = 30
    CHUNK_OVERLAP_SECONDS = 5
    print(f"RAM: {ram}, CHUNK_SECONDS: {CHUNK_SECONDS}, CHUNK_OVERLAP_SECONDS: {CHUNK_OVERLAP_SECONDS}")
else:
    # CUDA device available: presumably running on a local machine.
    CHUNK_SECONDS = 30
    CHUNK_OVERLAP_SECONDS = 5
|
54 |
|
55 |
YOUTUBE = "youtube"
|
56 |
TWITCH = "twitch"
|
57 |
ERROR = "error"
|
58 |
|
59 |
+
subtify_logo = Image.open("assets/subtify_logo-scaled.png")
|
60 |
subtify_logo_width, subtify_logo_height = subtify_logo.size
|
61 |
factor = 4
|
62 |
new_width = subtify_logo_width // factor
|
63 |
new_height = subtify_logo_height // factor
|
64 |
|
65 |
+
html_subtify_logo = get_html_subtify_logo(new_width, new_height)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
66 |
|
67 |
language_dict = union_language_dict()
|
68 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
69 |
def remove_all_files():
|
70 |
if os.path.exists("audios"):
|
71 |
command = f"rm -r audios"
|
|
|
89 |
command = f"rm -r vocals"
|
90 |
os.system(command)
|
91 |
|
|
|
|
|
|
|
92 |
def reset_frontend():
|
93 |
visible = False
|
94 |
return (
|
95 |
+
None,
|
96 |
gr.Image(visible=visible),
|
97 |
gr.Dropdown(visible=visible),
|
98 |
gr.Dropdown(visible=visible),
|
|
|
108 |
gr.Textbox(visible=visible),
|
109 |
gr.Textbox(visible=visible),
|
110 |
gr.Textbox(visible=visible),
|
|
|
|
|
111 |
gr.Video(visible=visible),
|
112 |
)
|
113 |
|
114 |
def show_auxiliar_block1():
    """Build the hidden helper textbox whose value is set to "URL checked"."""
    hidden_marker = gr.Textbox(value="URL checked", visible=False)
    return hidden_marker
|
116 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
117 |
def change_visibility_texboxes():
    """Start the pipeline and reveal the progress textboxes.

    Returns:
        tuple: One update per component listed as output of
        ``subtify_button.click``, in the same order. The first sets
        ``auxiliar_block1`` to "Done"; the remaining five make the progress
        textboxes visible.
    """
    # The number of returned updates must equal the number of outputs wired to
    # the click event (six); there is no "video sliced" textbox in the UI.
    return (
        gr.update(value="Done"),  # auxiliar_block1
        gr.update(visible=True),  # get_audio_from_video_info
        gr.update(visible=True),  # video_transcribed_progress_info
        gr.update(visible=True),  # transcriptions_concatenated_progress_info
        gr.update(visible=True),  # video_translated_progress_info
        gr.update(visible=True),  # video_subtitled_progress_info
    )
|
127 |
|
128 |
+
def get_audio(video_path):
    """Extract the audio track of ``video_path`` into the "audios" folder.

    Args:
        video_path: Path of the video whose audio is extracted.

    Returns:
        list: Two updates, for ``get_audio_from_video_info`` ("Ok" or "Error")
        and for ``original_audio_path`` (the audio path, or "" on failure).
    """
    print('*'*NUMBER)
    print(f"Getting audio from video {video_path}")

    try:
        extracted_path = get_audio_from_video(video_path, "audios")
    except Exception as e:
        # Any failure is reported through the status textbox instead of raising.
        print(f"Error: {str(e)}")
        status, path_value = "Error", ""
    else:
        status, path_value = "Ok", extracted_path

    return [
        gr.update(value=status),      # get_audio_from_video_info
        gr.update(value=path_value),  # original_audio_path
    ]
|
145 |
+
|
146 |
+
def slice_audio(input_audio_path):
|
|
|
|
|
|
|
|
|
|
|
147 |
print('*'*NUMBER)
|
148 |
+
print(f"Slicing audio {input_audio_path} in chunks of {CHUNK_SECONDS} seconds with {CHUNK_OVERLAP_SECONDS} seconds overlap")
|
149 |
|
150 |
+
# Create vocals and chunks folders
|
151 |
+
print("Creating vocals and chunks folders")
|
152 |
folder_vocals = "vocals"
|
153 |
folder_chunck = "chunks"
|
154 |
if not os.path.exists(folder_vocals):
|
|
|
156 |
if not os.path.exists(folder_chunck):
|
157 |
os.makedirs(folder_chunck)
|
158 |
|
159 |
+
slice_audio_main(input_audio_path, folder_chunck, CHUNK_SECONDS, CHUNK_OVERLAP_SECONDS)
|
|
|
|
|
160 |
|
161 |
return (
|
162 |
+
gr.update(value="Ok"), # video_sliced_progress_info
|
163 |
)
|
164 |
|
165 |
+
def trascribe_audio(input_audio_path, source_languaje):
    """Transcribe the extracted audio and report the outcome.

    Args:
        input_audio_path: Path of the audio file to transcribe. It is empty
            when the audio extraction step failed.
        source_languaje: Language name selected in the UI; it is looked up in
            the dictionary returned by ``get_language_dict()``.

    Returns:
        gr.Textbox: Update for ``video_transcribed_progress_info`` with value
        "Ok", or "Error" when there is no audio to transcribe.
    """
    print('*'*NUMBER)
    print(f"Transcript {input_audio_path}")

    # This handler runs whenever the extraction status changes, including when
    # extraction failed and left the audio path empty; do not run the model then.
    if not input_audio_path:
        print("Error: no audio file to transcribe")
        return gr.Textbox(value="Error")

    # Map the language name shown in the UI to the transcriber language code
    language_dict = get_language_dict()

    # Transcribe audio file
    transcribe(input_audio_path, language_dict[source_languaje]["transcriber"], DEVICE, CHUNK_SECONDS, CHUNK_OVERLAP_SECONDS)

    return (
        gr.Textbox(value="Ok")
    )
|
|
|
186 |
|
187 |
chunck_file = "chunks/output_files.txt"
|
188 |
python_file = "concat_transcriptions.py"
|
189 |
+
command = f"python {python_file} {chunck_file} {CHUNK_SECONDS} {CHUNK_OVERLAP_SECONDS}"
|
190 |
os.system(command)
|
191 |
|
192 |
with open(chunck_file, 'r') as f:
|
|
|
271 |
gr.Textbox(value="Waiting", visible=visible),
|
272 |
)
|
273 |
|
274 |
+
def process_uploaded_video(video_path):
    """Copy the uploaded video into the "videos" folder and show the config panel.

    Args:
        video_path: Path of the uploaded file, or ``None``/empty when the
            video component has been cleared.

    Returns:
        list: Three updates, for ``video_input`` (label), ``config_block``
        (visibility) and ``original_video_path`` (stored path).
    """
    # The change event also fires when the video is cleared; there is nothing
    # to copy then, so hide the configuration panel and forget the old path.
    if not video_path:
        return [
            gr.update(label="Upload video"),  # video_input
            gr.update(visible=False),         # config_block
            gr.update(value=""),              # original_video_path
        ]

    # Create videos folder
    videos_folder = "videos"
    os.makedirs(videos_folder, exist_ok=True)

    # Copy uploaded video to videos folder under a fixed name
    new_video_path = os.path.join(videos_folder, "download_video.mp4")
    shutil.copy(video_path, new_video_path)

    return [
        gr.update(label="Video uploaded"),  # video_input
        gr.update(visible=True),            # config_block
        gr.update(value=new_video_path),    # original_video_path
    ]
|
290 |
+
|
291 |
@spaces.GPU
|
292 |
def subtify():
|
293 |
with gr.Blocks(
|
|
|
337 |
gr.HTML(html_social_media)
|
338 |
gr.HTML("<h1 style='text-align: center;'>Subtify</h1>")
|
339 |
gr.HTML(html_subtify_logo)
|
340 |
+
|
341 |
+
# Input block, where the user can upload a video and configure the subtify process
|
|
|
|
|
|
|
342 |
visible = False
|
343 |
+
input_block = gr.Row(variant="panel")
|
344 |
+
with input_block:
|
345 |
+
input_video_block = gr.Row(scale=2)
|
346 |
+
with input_video_block:
|
347 |
+
video_input = gr.Video(
|
348 |
+
label="Upload video",
|
349 |
+
sources=["upload"],
|
350 |
+
scale=1,
|
351 |
+
interactive=True
|
352 |
+
)
|
353 |
+
delete_button = gr.Button(size="sm", icon="icons/delete.svg", value="clear", min_width="10px", scale=0)
|
354 |
+
|
355 |
+
config_block = gr.Column(scale=1, visible=visible)
|
356 |
+
with config_block:
|
357 |
with gr.Row():
|
358 |
+
source_languaje = gr.Dropdown(visible=True, label="Source languaje", show_label=True, value="English", choices=language_dict, scale=1, interactive=True, info="Language of the video")
|
359 |
+
target_languaje = gr.Dropdown(visible=True, label="Target languaje", show_label=True, value="Español", choices=language_dict, scale=1, interactive=True, info="Language to translate the subtitles")
|
360 |
+
with gr.Accordion("Advanced settings", open=False, visible=True) as Advanced_setings:
|
361 |
+
number_of_speakers = gr.Dropdown(visible=True, label="Number of speakers", show_label=True, value=10, choices=num_speaker, scale=1, interactive=True, info="Number of speakers in the video, if you don't know, select 10")
|
362 |
+
subtify_button = gr.Button(size="lg", value="subtify", min_width="10px", scale=0, visible=True)
|
363 |
|
364 |
+
auxiliar_block1 = gr.Textbox(placeholder="", interactive=False, visible=visible)
|
365 |
with gr.Row():
|
366 |
+
get_audio_from_video_info = gr.Textbox(placeholder="Waiting", label="Get audio from video info", elem_id="get_audio_from_video_info", interactive=False, visible=visible)
|
|
|
367 |
video_transcribed_progress_info = gr.Textbox(placeholder="Waiting", label="Transcribe progress info", elem_id="video_transcribed_progress_info", interactive=False, visible=visible)
|
368 |
transcriptions_concatenated_progress_info = gr.Textbox(placeholder="Waiting", label="Concatenate progress info", elem_id="transcriptions_concatenated_progress_info", interactive=False, visible=visible)
|
369 |
video_translated_progress_info = gr.Textbox(placeholder="Waiting", label="Translate progress info", elem_id="transcription_translated_progress_info", interactive=False, visible=visible)
|
370 |
video_subtitled_progress_info = gr.Textbox(placeholder="Waiting", label="Video subtitle progress info", elem_id="video_subtitled_progress_info", interactive=False, visible=visible)
|
371 |
|
372 |
original_audio_path = gr.Textbox(label="Original audio path", elem_id="original_audio_path", visible=visible)
|
373 |
+
original_video_path = gr.Textbox(label="Original video path", visible=visible)
|
374 |
original_audio_transcribed_path = gr.Textbox(label="Original audio transcribed", elem_id="original_audio_transcribed", visible=visible)
|
375 |
original_audio_translated_path = gr.Textbox(label="Original audio translated", elem_id="original_audio_translated", visible=visible)
|
376 |
subtitled_video = gr.Video(label="Subtitled video", elem_id="subtitled_video", visible=visible, interactive=visible)
|
377 |
auxiliar_block3 = gr.Textbox(placeholder="Waiting", label="Auxiliar block 3", elem_id="auxiliar_block3", interactive=False, visible=visible)
|
378 |
|
379 |
+
gr.HTML(html_buy_me_a_coffe)
|
380 |
+
|
381 |
# Events
|
382 |
# paste_button.click(fn=paste_url_from_clipboard, outputs=url_textbox)
|
383 |
delete_button.click(
|
384 |
fn=reset_frontend,
|
385 |
outputs=[
|
386 |
+
video_input,
|
|
|
387 |
source_languaje,
|
388 |
target_languaje,
|
389 |
Advanced_setings,
|
390 |
number_of_speakers,
|
391 |
subtify_button,
|
392 |
+
auxiliar_block1,
|
|
|
|
|
393 |
video_transcribed_progress_info,
|
394 |
transcriptions_concatenated_progress_info,
|
395 |
video_translated_progress_info,
|
|
|
397 |
subtitled_video,
|
398 |
]
|
399 |
)
|
400 |
+
video_input.change(
|
401 |
+
fn=process_uploaded_video,
|
402 |
+
inputs=[video_input],
|
403 |
+
outputs=[video_input, config_block, original_video_path]
|
|
|
|
|
|
|
|
|
404 |
)
|
405 |
subtify_button.click(
|
406 |
fn=change_visibility_texboxes,
|
407 |
+
outputs=[auxiliar_block1, get_audio_from_video_info, video_transcribed_progress_info, transcriptions_concatenated_progress_info, video_translated_progress_info, video_subtitled_progress_info]
|
|
|
|
|
|
|
|
|
|
|
408 |
)
|
409 |
+
auxiliar_block1.change(
|
410 |
+
fn=get_audio,
|
411 |
+
inputs=[original_video_path],
|
412 |
+
outputs=[get_audio_from_video_info, original_audio_path]
|
413 |
)
|
414 |
+
get_audio_from_video_info.change(
|
415 |
fn=trascribe_audio,
|
416 |
+
inputs=[original_audio_path, source_languaje],
|
417 |
outputs=[video_transcribed_progress_info]
|
418 |
)
|
419 |
+
# video_transcribed_progress_info.change(
|
420 |
+
# fn=concatenate_transcriptions,
|
421 |
+
# outputs=[transcriptions_concatenated_progress_info, original_audio_transcribed_path]
|
422 |
+
# )
|
423 |
+
# transcriptions_concatenated_progress_info.change(
|
424 |
+
# fn=translate_transcription,
|
425 |
+
# inputs=[original_audio_transcribed_path, source_languaje, target_languaje],
|
426 |
+
# outputs=[video_translated_progress_info, original_audio_translated_path]
|
427 |
+
# )
|
428 |
+
# video_translated_progress_info.change(
|
429 |
+
# fn=add_translated_subtitles_to_video,
|
430 |
+
# inputs=[original_video_path, original_audio_path, original_audio_translated_path],
|
431 |
+
# outputs=[subtitled_video, video_subtitled_progress_info, auxiliar_block3]
|
432 |
+
# )
|
433 |
+
# auxiliar_block3.change(
|
434 |
+
# fn=hide_textbobes_progress_info,
|
435 |
+
# outputs=[video_sliced_progress_info, video_transcribed_progress_info, transcriptions_concatenated_progress_info, video_translated_progress_info, video_subtitled_progress_info]
|
436 |
+
# )
|
|
|
|
|
437 |
|
438 |
demo.launch()
|
439 |
|
audio.py
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import ffmpeg
|
3 |
+
|
4 |
+
DEBUG=True
|
5 |
+
|
6 |
+
def get_audio_from_video(video_path: str, output_folder: str) -> str:
    """
    Extract audio from video and save it as mp3.

    Args:
        video_path (str): Path to the video file
        output_folder (str): Path to folder where audio will be saved

    Returns:
        str: Path to the saved audio file

    Raises:
        FileNotFoundError: If video file doesn't exist
        Exception: If there's an error extracting the audio (the original
            error is chained as ``__cause__``)
    """
    # Validate video exists
    if not os.path.exists(video_path):
        raise FileNotFoundError(f"Video file not found: {video_path}")

    # Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # The output always has the same name inside output_folder
    audio_path = os.path.join(output_folder, "download_audio.mp3")

    # In DEBUG mode an already extracted file is reused instead of re-encoding
    if DEBUG and os.path.exists(audio_path):
        return audio_path

    try:
        # Extract audio using ffmpeg
        stream = ffmpeg.input(video_path)
        stream = ffmpeg.output(stream, audio_path, acodec='libmp3lame')
        ffmpeg.run(stream, overwrite_output=True)
    except Exception as e:
        # Keep the original traceback attached to the wrapped error
        raise Exception(f"Error extracting audio from video: {str(e)}") from e

    return audio_path
|
requirements.txt
CHANGED
@@ -1,20 +1,31 @@
|
|
1 |
# gradio
|
2 |
gradio
|
3 |
|
4 |
-
#
|
5 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
|
7 |
# Download youtube and twitch videos
|
8 |
-
pytube
|
9 |
-
yt-dlp
|
10 |
-
twitch-dl
|
11 |
|
12 |
# Transcribe audios
|
13 |
-
git+https://github.com/m-bain/whisperx.git
|
14 |
pyannote.audio
|
15 |
|
16 |
-
# Translate
|
17 |
-
protobuf
|
18 |
|
19 |
-
# Add subtitles to videos
|
20 |
-
opencv-python
|
|
|
1 |
# gradio
|
2 |
gradio
|
3 |
|
4 |
+
# spaces
|
5 |
+
spaces
|
6 |
+
|
7 |
+
# pytorch
|
8 |
+
torch
|
9 |
+
torchvision
|
10 |
+
torchaudio
|
11 |
+
|
12 |
+
# Transformers
|
13 |
+
transformers accelerate
|
14 |
+
|
15 |
+
# ffmpeg
|
16 |
+
ffmpeg-python
|
17 |
|
18 |
# Download youtube and twitch videos
|
19 |
+
# pytube
|
20 |
+
# yt-dlp
|
21 |
+
# twitch-dl
|
22 |
|
23 |
# Transcribe audios
|
24 |
+
# git+https://github.com/m-bain/whisperx.git
|
25 |
pyannote.audio
|
26 |
|
27 |
+
# # Translate
|
28 |
+
# protobuf
|
29 |
|
30 |
+
# # Add subtitles to videos
|
31 |
+
# opencv-python
|
slice_audio.py
CHANGED
@@ -4,6 +4,7 @@ from tqdm import tqdm
|
|
4 |
|
5 |
START = 00
|
6 |
FOLDER = "chunks"
|
|
|
7 |
|
8 |
def seconds_to_hms(seconds):
|
9 |
hour = 00
|
@@ -22,53 +23,72 @@ def seconds_to_hms(seconds):
|
|
22 |
def hms_to_seconds(hour, minute, second):
|
23 |
return hour*3600 + minute*60 + second
|
24 |
|
25 |
-
def
|
26 |
-
|
27 |
-
|
28 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
name, extension = os.path.splitext(filename)
|
30 |
-
seconds = int(args.seconds)
|
31 |
|
32 |
# Get audio duration in seconds
|
33 |
-
duration = float(os.popen(f'ffprobe -v error -show_entries format=duration -of default=noprint_wrappers=1:nokey=1 {
|
34 |
hour, minute, second = seconds_to_hms(int(duration))
|
|
|
35 |
|
36 |
-
#
|
37 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
|
39 |
-
# Slice audio into
|
40 |
-
hour, minute, second = seconds_to_hms(seconds) # Duration of each chunk
|
41 |
output_files = []
|
42 |
progress_bar = tqdm(total=num_chunks, desc="Slice audio into chunks progress")
|
|
|
43 |
for chunk in range(num_chunks):
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
|
|
|
|
|
|
|
|
|
|
56 |
else:
|
57 |
-
command = f'ffmpeg -i {
|
|
|
|
|
58 |
os.system(command)
|
59 |
-
|
60 |
output_files.append(output)
|
61 |
-
|
62 |
progress_bar.update(1)
|
63 |
|
64 |
-
|
65 |
-
|
|
|
|
|
66 |
for output_file in output_files:
|
67 |
f.write(f"{output_file}\n")
|
68 |
-
|
69 |
-
if __name__ == "__main__":
|
70 |
-
argparser = argparse.ArgumentParser(description='Slice audio into smaller chunks')
|
71 |
-
argparser.add_argument('input', help='Input audio file')
|
72 |
-
argparser.add_argument('seconds', help='Duration of each chunk in seconds')
|
73 |
-
args = argparser.parse_args()
|
74 |
-
main(args)
|
|
|
4 |
|
5 |
START = 00
|
6 |
FOLDER = "chunks"
|
7 |
+
DEBUG = True
|
8 |
|
9 |
def seconds_to_hms(seconds):
|
10 |
hour = 00
|
|
|
23 |
def hms_to_seconds(hour, minute, second):
|
24 |
return hour*3600 + minute*60 + second
|
25 |
|
26 |
+
def slice_audio(input_audio_path, output_folder, chunks_seconds, chunk_overlap_seconds):
    """
    Slice audio into chunks with specified duration and overlap.

    The chunk paths are also written, one per line, to
    ``<output_folder>/output_files.txt``.

    Args:
        input_audio_path (str): Path to input audio file
        output_folder (str): Path to output folder
        chunks_seconds (int): Duration of each chunk in seconds
        chunk_overlap_seconds (int): Overlap between chunks in seconds

    Raises:
        ValueError: If the overlap is not smaller than the chunk duration, or
            if the duration reported by ffprobe cannot be parsed
    """
    # Local import: commands are run as argument lists (no shell), so paths
    # containing spaces or shell metacharacters are passed through untouched.
    import subprocess

    _, filename = os.path.split(input_audio_path)
    name, extension = os.path.splitext(filename)

    # Get audio duration in seconds
    probe = subprocess.run(
        [
            "ffprobe", "-v", "error",
            "-show_entries", "format=duration",
            "-of", "default=noprint_wrappers=1:nokey=1",
            input_audio_path,
        ],
        stdout=subprocess.PIPE,
        text=True,
    )
    duration = float(probe.stdout)
    hour, minute, second = seconds_to_hms(int(duration))
    print(f"\tDuration ({duration} seconds): {hour:02d}:{minute:02d}:{second:02d}")

    # Calculate effective chunk duration considering overlap
    effective_chunk = chunks_seconds - chunk_overlap_seconds
    if effective_chunk <= 0:
        raise ValueError("Overlap duration must be less than chunk duration")

    # Calculate number of chunks needed (ceiling division). Audio shorter than
    # or about as long as the overlap would otherwise yield zero chunks, so any
    # non-empty audio gets at least one.
    num_chunks = -(-int(duration - chunk_overlap_seconds) // effective_chunk)
    if duration > 0:
        num_chunks = max(1, num_chunks)

    # Slice audio into chunks with overlap
    output_files = []
    progress_bar = tqdm(total=num_chunks, desc="Slice audio into chunks progress")

    for chunk in range(num_chunks):
        # Calculate start and end times for this chunk
        start_time = chunk * effective_chunk
        end_time = min(start_time + chunks_seconds, duration)

        # Convert times to HH:MM:SS format
        hour_start, minute_start, second_start = seconds_to_hms(start_time)

        # Calculate chunk duration
        chunk_duration = end_time - start_time
        hour_duration, minute_duration, second_duration = seconds_to_hms(chunk_duration)

        # Generate output filename
        output = f"{output_folder}/{name}_chunk{chunk:003d}{extension}"

        # In DEBUG mode chunks that already exist on disk are reused
        if DEBUG:
            if os.path.exists(output):
                output_files.append(output)
                progress_bar.update(1)
                continue

        # Build ffmpeg command with -y flag to overwrite without asking
        command = [
            "ffmpeg", "-y", "-i", input_audio_path,
            "-ss", f"{hour_start:02d}:{minute_start:02d}:{second_start:02d}",
        ]
        if chunk != num_chunks - 1:
            # Every chunk but the last is limited to the chunk duration; the
            # last one runs to the end of the file.
            command += ["-t", f"{hour_duration:02d}:{minute_duration:02d}:{second_duration:02d}"]
        command += ["-loglevel", "error", output]

        # Execute command (a non-zero exit status is not treated as an error)
        subprocess.run(command)
        output_files.append(output)
        progress_bar.update(1)

    progress_bar.close()

    # Write output files to a txt file (with overwrite)
    with open(f"{output_folder}/output_files.txt", "w") as f:
        for output_file in output_files:
            f.write(f"{output_file}\n")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
transcribe.py
CHANGED
@@ -2,50 +2,86 @@ import os
|
|
2 |
import argparse
|
3 |
from lang_list import LANGUAGE_NAME_TO_CODE, WHISPER_LANGUAGES
|
4 |
from tqdm import tqdm
|
|
|
|
|
5 |
|
6 |
-
# For pyannote.audio diarize
|
7 |
-
from pyannote.audio import Model
|
8 |
-
model = Model.from_pretrained("pyannote/segmentation-3.0", use_auth_token="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")
|
9 |
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
|
|
|
|
23 |
|
24 |
-
def transcribe(audio_file, language,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
output_folder = "transcriptions"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
|
27 |
-
|
28 |
-
model = "large-v2"
|
29 |
-
# word_timestamps = True
|
30 |
-
print_progress = False
|
31 |
-
if device == "cpu":
|
32 |
-
# I supose that I am on huggingface server
|
33 |
-
compute_type = "float32"
|
34 |
-
else:
|
35 |
-
compute_type = "float16"
|
36 |
-
fp16 = True
|
37 |
-
batch_size = 8
|
38 |
-
verbose = False
|
39 |
-
min_speakers = 1
|
40 |
-
max_speakers = num_speakers
|
41 |
-
threads = 4
|
42 |
-
output_format = "srt"
|
43 |
-
hf_token = "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"
|
44 |
-
command = f'whisperx {audio_file} --model {model} --batch_size {batch_size} --compute_type {compute_type} \
|
45 |
-
--output_dir {output_folder} --output_format {output_format} --verbose {verbose} --language {language} \
|
46 |
-
--fp16 {fp16} --threads {threads} --print_progress {print_progress} --device {device} \
|
47 |
-
--diarize --max_speakers {max_speakers} --min_speakers {min_speakers} --hf_token {hf_token}'
|
48 |
-
os.system(command)
|
49 |
|
50 |
if __name__ == "__main__":
|
51 |
parser = argparse.ArgumentParser(description='Transcribe audio files')
|
@@ -66,5 +102,6 @@ if __name__ == "__main__":
|
|
66 |
_, input_name = input_file.split('/')
|
67 |
extension = "mp3"
|
68 |
file = f'{chunks_folder}/{input_name}.{extension}'
|
|
|
69 |
transcribe(file, language_dict[args.language]["transcriber"], args.num_speakers, args.device)
|
70 |
-
progress_bar.update(1)
|
|
|
2 |
import argparse
|
3 |
from lang_list import LANGUAGE_NAME_TO_CODE, WHISPER_LANGUAGES
|
4 |
from tqdm import tqdm
|
5 |
+
import torch
|
6 |
+
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
|
7 |
|
|
|
|
|
|
|
8 |
|
9 |
+
def get_language_dict():
    """Map each language name to its transcriber and translator codes.

    Only languages whose short code (the lower-cased part of the full code
    before the first underscore) appears in ``WHISPER_LANGUAGES`` are kept.

    Returns:
        dict: ``{name: {"transcriber": short_code, "translator": full_code}}``
    """
    candidates = (
        (name, full_code, full_code.split('_')[0].lower())
        for name, full_code in LANGUAGE_NAME_TO_CODE.items()
    )
    return {
        name: {"transcriber": short_code, "translator": full_code}
        for name, full_code, short_code in candidates
        if short_code in WHISPER_LANGUAGES
    }
|
24 |
|
25 |
+
def transcribe(audio_file, language, device, chunk_length_s=30, stride_length_s=5):
    """
    Transcribe audio file using Whisper model.

    Args:
        audio_file (str): Path to audio file
        language (str): Language code for transcription
        device (str): Device to use for inference ('cuda' or 'cpu')
        chunk_length_s (int): Length of audio chunks in seconds
        stride_length_s (int): Stride length between chunks in seconds

    Returns:
        The object returned by the speech-recognition pipeline (it is also
        printed).
    """
    output_folder = "transcriptions"
    os.makedirs(output_folder, exist_ok=True)

    # Get output filename
    # NOTE: the .srt path is computed here but nothing is written to it yet.
    audio_filename = os.path.basename(audio_file)
    filename_without_ext = os.path.splitext(audio_filename)[0]
    output_file = os.path.join(output_folder, f"{filename_without_ext}.srt")

    # Half precision only when the model really goes to a CUDA device: a "cpu"
    # device keeps float32 even on a machine where CUDA is available.
    use_half = torch.cuda.is_available() and not str(device).startswith("cpu")
    torch_dtype = torch.float16 if use_half else torch.float32

    # Load model and processor
    model_id = "openai/whisper-large-v3-turbo"
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id,
        torch_dtype=torch_dtype,
        low_cpu_mem_usage=True,
        use_safetensors=True
    )
    model.to(device)

    processor = AutoProcessor.from_pretrained(model_id)

    # Create pipeline with timestamp generation
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        torch_dtype=torch_dtype,
        device=device,
        chunk_length_s=chunk_length_s,
        stride_length_s=stride_length_s,
        return_timestamps=True
    )

    # Transcribe with timestamps
    result = pipe(
        audio_file,
        return_timestamps=True,
        generate_kwargs={
            "language": language,
            "task": "transcribe",
            "use_cache": True,
            "num_beams": 1
        }
    )

    print(result)
    return result
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
85 |
|
86 |
if __name__ == "__main__":
|
87 |
parser = argparse.ArgumentParser(description='Transcribe audio files')
|
|
|
102 |
_, input_name = input_file.split('/')
|
103 |
extension = "mp3"
|
104 |
file = f'{chunks_folder}/{input_name}.{extension}'
|
105 |
+
language_dict = get_language_dict()
|
106 |
transcribe(file, language_dict[args.language]["transcriber"], args.num_speakers, args.device)
|
107 |
+
progress_bar.update(1)
|
ui_config.py
ADDED
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Color definitions
|
2 |
+
BACKGROUND_COLOR = "#0b0f19"
|
3 |
+
BUTTON_COLOR = "#47515f"
|
4 |
+
SVG_COLOR = "#f3f4f6"
|
5 |
+
PANEL_COLOR = "#101827"
|
6 |
+
PRIMARY_TEXT_COLOR = "#f3f4f6"
|
7 |
+
SUBDUED_TEXT_COLOR = "#59616f"
|
8 |
+
BACKGROUND_PRIMARY_COLOR = "#1f2937"
|
9 |
+
BACKGROUND_SECONDARY_COLOR = "#101827"
|
10 |
+
PRIMARY_BODER_COLOR = "#323c4c"
|
11 |
+
BLOCK_TITLE_TEXT_COLOR = "#dfe2e6"
|
12 |
+
INPUT_BACKGROUND_COLOR = "#2f3947"
|
13 |
+
INPUT_BORDER_COLOR = "#313b4b"
|
14 |
+
INPUT_PLACEHOLDER_COLOR = "#616977"
|
15 |
+
ERROR_BACKGROUND_COLOR = "#101827"
|
16 |
+
ERROR_TEXT_COLOR = "#f7f2f2"
|
17 |
+
ERROR_BORDER_COLOR = "#9b3339"
|
18 |
+
BUTTON_SECONDARY_BACKGROUND_COLOR = "#434d5c"
|
19 |
+
BUTTON_SECONDARY_BORDER_COLOR = "#444d5b"
|
20 |
+
BUTTON_SECONDARY_TEXT_COLOR = "#c5c9cc"
|
21 |
+
RED = "#ff0000"
|
22 |
+
GREEN = "#00ff00"
|
23 |
+
BLUE = "#0000ff"
|
24 |
+
|
25 |
+
# HTML for the social media links
|
26 |
+
html_social_media = f'''
|
27 |
+
<div style="float: right;">
|
28 |
+
<a href="https://maximofn.com/" rel="noopener noreferrer" aria-disabled="false" class="sm secondary svelte-cmf5ev" id="component-1" style="flex-grow: 100;" target="_blank">
|
29 |
+
<svg xmlns="http://www.w3.org/2000/svg" height="1em" viewBox="0 0 576 512">
|
30 |
+
<style>
|
31 |
+
svg {"{"}
|
32 |
+
fill: {SVG_COLOR}
|
33 |
+
{"}"}
|
34 |
+
</style>
|
35 |
+
<path d="M208 80c0-26.5 21.5-48 48-48h64c26.5 0 48 21.5 48 48v64c0 26.5-21.5 48-48 48h-8v40H464c30.9 0 56 25.1 56 56v32h8c26.5 0 48 21.5 48 48v64c0 26.5-21.5 48-48 48H464c-26.5 0-48-21.5-48-48V368c0-26.5 21.5-48 48-48h8V288c0-4.4-3.6-8-8-8H312v40h8c26.5 0 48 21.5 48 48v64c0 26.5-21.5 48-48 48H256c-26.5 0-48-21.5-48-48V368c0-26.5 21.5-48 48-48h8V280H112c-4.4 0-8 3.6-8 8v32h8c26.5 0 48 21.5 48 48v64c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V368c0-26.5 21.5-48 48-48h8V288c0-30.9 25.1-56 56-56H264V192h-8c-26.5 0-48-21.5-48-48V80z"/>
|
36 |
+
</svg>
|
37 |
+
</a>
|
38 |
+
<!-- Resto de los enlaces de redes sociales... -->
|
39 |
+
</div>
|
40 |
+
'''
|
41 |
+
|
42 |
+
# HTML for the logo
|
43 |
+
def get_html_subtify_logo(new_width, new_height):
    """Return the HTML snippet that shows the centered Subtify logo.

    Args:
        new_width: Width of the image in pixels.
        new_height: Height of the image in pixels.

    Returns:
        str: A ``<div>`` containing the logo ``<img>`` at the given size.
    """
    # The width/height attributes take plain quoted integers (no "px" suffix).
    return f"""
    <div style="display: flex; justify-content: center; align-items: center;">
        <img src='https://pub-fb664c455eca46a2ba762a065ac900f7.r2.dev/subtify_logo-scaled.webp' width="{new_width}" height="{new_height}">
    </div>
    """
|
49 |
+
|
50 |
+
# HTML for the Buy Me a Coffee button
|
51 |
+
html_buy_me_a_coffe = '''
|
52 |
+
<div style="float: right;">
|
53 |
+
<a href="https://www.buymeacoffee.com/maximofn" target="_blank">
|
54 |
+
<img src="https://img.shields.io/badge/Buy_Me_A_Coffee-support_my_work-FFDD00?style=for-the-badge&logo=buy-me-a-coffee&logoColor=white&labelColor=101010" alt="buy me a coffe">
|
55 |
+
</a>
|
56 |
+
</div>
|
57 |
+
'''
|
url_manager.py
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import gradio as gr
|
3 |
+
from pytube import YouTube
|
4 |
+
from lang_list import union_language_dict
|
5 |
+
|
6 |
+
language_dict = union_language_dict()
|
7 |
+
|
8 |
+
def get_youtube_thumbnail(url):
    """Return the thumbnail URL that pytube reports for the video at ``url``."""
    return YouTube(url).thumbnail_url
|
12 |
+
|
13 |
+
def is_valid_youtube_url(url):
    """Tell whether ``url`` starts like a YouTube video URL.

    Accepted formats (scheme and ``www.`` are optional):
    - https://youtube.com/watch?v=video_id
    - https://www.youtube.com/watch?v=video_id
    - https://youtu.be/video_id

    Args:
        url: Text to check. Anything that is not a string is rejected.

    Returns:
        bool: True when the beginning of ``url`` matches one of the formats.
    """
    # A missing or non-text value is simply not a valid URL (no TypeError).
    if not isinstance(url, str):
        return False
    youtube_pattern = r'(https?://)?(www\.)?(youtube\.com/watch\?v=|youtu\.be/)[\w-]+'
    return bool(re.match(youtube_pattern, url))
|
20 |
+
|
21 |
+
def is_valid_twitch_url(url):
    """Return whether *url* starts with a recognized Twitch URL.

    Accepted forms (scheme and ``www.`` are optional):
        - https://twitch.tv/channel_name
        - https://www.twitch.tv/channel_name
        - https://twitch.tv/videos/video_id

    Args:
        url: Candidate URL. Non-string values are rejected.

    Returns:
        ``True`` if the beginning of *url* matches one of the forms above,
        ``False`` otherwise.
    """
    if not isinstance(url, str):
        return False
    # Scheme and host are matched case-insensitively (scoped ``(?i:...)``
    # group), in line with the case-insensitive pre-check in is_valid_url().
    twitch_pattern = r'(?i:(?:https?://)?(?:www\.)?twitch\.tv)/(?:videos/\d+|\w+)'
    return re.match(twitch_pattern, url) is not None
|
28 |
+
|
29 |
+
def _build_preview_image(value, visible=True):
    """Build the preview image component shown above the language controls."""
    return gr.Image(value=value, visible=visible, show_download_button=False, container=False)


def _build_url_controls(visible):
    """Build the five controls that follow the preview image, all shown or all hidden."""
    speaker_choices = [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
    return (
        gr.Dropdown(visible=visible, label="Source languaje", show_label=True, value="English", choices=language_dict, scale=1, interactive=True),
        gr.Dropdown(visible=visible, label="Target languaje", show_label=True, value="Español", choices=language_dict, scale=1, interactive=True),
        gr.Accordion(visible=visible),
        gr.Dropdown(visible=visible, label="Number of speakers", show_label=True, value=10, choices=speaker_choices, scale=1, interactive=True),
        # Gradio documents `min_width` as an integer number of pixels, so the
        # value is passed as 10 rather than the string "10px".
        gr.Button(size="lg", value="subtify", min_width=10, scale=0, visible=visible),
    )


def is_valid_url(url):
    """Validate a video URL and build the matching UI component updates.

    Args:
        url: URL entered by the user. ``None`` or an empty string is treated
            as an invalid URL.

    Returns:
        A 6-tuple ``(image, source_language, target_language,
        advanced_settings, number_of_speakers, subtify_button)`` of Gradio
        components. For a valid YouTube or Twitch URL every component is
        visible and the image is the video thumbnail (YouTube; a placeholder
        if no thumbnail is available) or the Twitch placeholder. For any other
        input every component is hidden.
    """
    # Lower-case once; the coarse platform checks are case-insensitive.
    lowered = (url or "").lower()

    # YouTube
    if "youtube" in lowered or "youtu.be" in lowered:
        if is_valid_youtube_url(url):
            # Use the placeholder when no thumbnail URL could be obtained.
            thumbnail = get_youtube_thumbnail(url) or "assets/youtube-no-thumbnails.webp"
            return (_build_preview_image(thumbnail), *_build_url_controls(True))

    # Twitch ("twitch.tv" always contains "twitch", so one check suffices)
    elif "twitch" in lowered:
        if is_valid_twitch_url(url):
            return (_build_preview_image("assets/twitch.webp"), *_build_url_controls(True))

    # Error: unknown platform or malformed URL, so hide everything.
    return (
        _build_preview_image("assets/youtube_error.webp", visible=False),
        *_build_url_controls(False),
    )
|