Spaces:
Build error
Build error
Ruslan Magana Vsevolodovna
commited on
Commit
·
a216bdd
1
Parent(s):
3162e54
Add application file
Browse files- README.md +2 -1
- app.py +229 -0
- demo/tryagain.mp4 +0 -0
- requirements.txt +8 -0
- utils.py +37 -0
README.md
CHANGED
@@ -1,8 +1,9 @@
|
|
1 |
---
|
2 |
title: Youtube Video Translator
|
3 |
-
emoji:
|
4 |
colorFrom: yellow
|
5 |
colorTo: purple
|
|
|
6 |
sdk: gradio
|
7 |
sdk_version: 3.2
|
8 |
app_file: app.py
|
|
|
1 |
---
|
2 |
title: Youtube Video Translator
|
3 |
+
emoji: 🐨
|
4 |
colorFrom: yellow
|
5 |
colorTo: purple
|
6 |
+
python_version: 3.8.9
|
7 |
sdk: gradio
|
8 |
sdk_version: 3.2
|
9 |
app_file: app.py
|
app.py
ADDED
@@ -0,0 +1,229 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# coding=utf8
|
2 |
+
# Youtube Video Translator
|
3 |
+
# Developed by Ruslan Magana Vsevolodovna
|
4 |
+
# https://ruslanmv.com/
|
5 |
+
|
6 |
+
# importing all necessary libraries
|
7 |
+
import pathlib
|
8 |
+
import sys, os
|
9 |
+
from gtts import gTTS
|
10 |
+
import gradio as gr
|
11 |
+
import os
|
12 |
+
import speech_recognition as sr
|
13 |
+
from googletrans import Translator, constants
|
14 |
+
from pprint import pprint
|
15 |
+
from moviepy.editor import *
|
16 |
+
from pytube import YouTube
|
17 |
+
from youtube_transcript_api import YouTubeTranscriptApi
|
18 |
+
from utils import *
|
19 |
+
|
20 |
+
def download_video(url):
    """Download the first progressive MP4 stream of a YouTube video.

    Args:
        url: Address of the YouTube video, passed straight to ``YouTube``.

    Returns:
        Whatever ``Stream.download()`` returns for the saved file
        (presumably its local path; confirm against pytube).
    """
    print("Downloading...")
    # Only progressive MP4 streams are considered; the first match is used.
    candidates = YouTube(url).streams.filter(progressive=True, file_extension="mp4")
    chosen_stream = candidates.first()
    local_file = chosen_stream.download()
    print("Downloaded")
    return local_file
|
30 |
+
|
31 |
+
def validate_url(url):
    """Check whether ``url`` is accepted by ``validators.url``.

    A message is printed when the URL is rejected, and the outcome is
    returned so that callers can actually act on it.

    Args:
        url: The string to validate.

    Returns:
        ``True`` when ``validators.url`` accepts the value, ``False`` otherwise.
    """
    # NOTE(review): ``validators`` is a third-party package; confirm it is
    # installed wherever this function is called.
    import validators

    # validators.url() yields a falsy failure object for invalid input, so the
    # result is normalised to a plain bool.
    is_valid = bool(validators.url(url))
    if not is_valid:
        print("Hi there URL seems invalid ")
    return is_valid
|
35 |
+
|
36 |
+
|
37 |
+
def cleanup():
    """Delete every ``*.mp4`` and ``*.wav`` entry in the current directory.

    Deletion is best-effort and handled per file: an entry that cannot be
    removed (for example because it is locked or is a directory) is reported
    and skipped, and the remaining files are still deleted.
    """
    import glob
    import pathlib

    patterns = ('*.mp4', '*.wav')  # file types produced while processing
    for pattern in patterns:
        for junk in glob.glob(pattern):
            print("Deleting", junk)
            try:
                pathlib.Path(junk).unlink()
            except OSError as err:
                # Keep going: one undeletable file must not block the rest.
                print("I cannot delete", junk, "-", err)
|
55 |
+
|
56 |
+
def getSize(filename):
    """Return the size of ``filename`` in bytes, as reported by the file system."""
    return os.path.getsize(filename)
|
59 |
+
|
60 |
+
|
61 |
+
def extract_video_id(url):
    """Return the YouTube video id contained in ``url``.

    Supports ``watch?v=<id>`` links (any extra query parameters such as
    ``&list`` or ``&ab_channel=...`` are ignored) and ``youtu.be/<id>`` links.

    Raises:
        ValueError: If no video id can be found in ``url``.
    """
    from urllib.parse import parse_qs, urlparse

    parts = urlparse(url.strip())
    ids = parse_qs(parts.query).get("v")
    if ids:
        return ids[0]
    if (parts.hostname or "").lower() in ("youtu.be", "www.youtu.be"):
        short_id = parts.path.strip("/").split("/")[0]
        if short_id:
            return short_id
    raise ValueError("No YouTube video id found in URL: " + url)


def generate_transcript(url, lang_api):
    """Fetch the transcript of a YouTube video and flatten it into one string.

    Args:
        url: Address of the YouTube video.
        lang_api: Language code requested from ``YouTubeTranscriptApi``.

    Returns:
        The transcript text: every entry followed by a single space, with
        ``[Music]`` placeholder entries left out.

    Raises:
        ValueError: If ``url`` carries no video id. Errors raised by
            ``YouTubeTranscriptApi.get_transcript`` propagate unchanged.
    """
    video_id = extract_video_id(url)
    transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=[lang_api])
    return "".join(
        entry["text"] + " " for entry in transcript if entry["text"] != '[Music]'
    )
|
70 |
+
|
71 |
+
|
72 |
+
# Clip returned to the UI whenever recognition or translation fails.
_FALLBACK_VIDEO = "./demo/tryagain.mp4"

# WAV files larger than this many bytes are split before recognition.
# NOTE(review): the original comment mentioned a 10 MB per-request API limit
# while the code used this 50,000,000-byte threshold; confirm the real limit.
_MAX_SINGLE_REQUEST_BYTES = 50000000

# UI language name -> (speech-recognition locale, transcript language code).
_SOURCE_LANGUAGES = {
    "English": ("en-US", "en"),
    "Italian": ("it-IT", "it"),
    "Spanish": ("es-MX", "es"),
    "Russian": ("ru-RU", "ru"),
    "German": ("de-DE", "de"),
    "Japanese": ("ja-JP", "ja"),
}

# UI language name -> language code used for translation and text-to-speech.
_TARGET_LANGUAGES = {
    "English": "en",
    "Italian": "it",
    "Spanish": "es",
    "Russian": "ru",
    "German": "de",
    "Japanese": "ja",
}


def _recognize_speech(wav_path, lang_in):
    """Transcribe ``wav_path`` with Google speech recognition, splitting large files."""
    recognizer = sr.Recognizer()
    if getSize(wav_path) > _MAX_SINGLE_REQUEST_BYTES:
        print("The wav is too large")
        wav_files = split_audio_wav(wav_path)
    else:
        wav_files = [wav_path]

    pieces = []
    for wav_file in wav_files:
        print("Converting audio to text", wav_file)
        # Load each file on its own so every chunk is recognised separately
        # instead of re-sending the whole recording for every chunk.
        with sr.AudioFile(wav_file) as source:
            audio_data = recognizer.record(source)
        pieces.append(recognizer.recognize_google(audio_data, language=lang_in))
    return " ".join(pieces)


def video_to_translate(url, initial_language, final_language):
    """Download a YouTube video and dub it with a translated synthetic voice.

    The text comes from the video's transcript when one is available and from
    speech recognition of the audio track otherwise. It is translated,
    converted to speech with gTTS and set as the audio of the downloaded
    video, which is written to the directory the function was called from.

    Args:
        url: Address of the YouTube video.
        initial_language: Spoken language of the video; one of the keys of
            ``_SOURCE_LANGUAGES``.
        final_language: Language to translate into; one of the keys of
            ``_TARGET_LANGUAGES``.

    Returns:
        The file name of the translated video, or ``"./demo/tryagain.mp4"``
        when the speech cannot be recognised or the text cannot be translated.

    Raises:
        ValueError: If either language name is not supported.
    """
    try:
        lang_in, lang_api = _SOURCE_LANGUAGES[initial_language]
        lang = _TARGET_LANGUAGES[final_language]
    except KeyError as err:
        raise ValueError("Unsupported language: {0}".format(err)) from None

    # Initial directory
    home_dir = os.getcwd()
    print('Initial directory:', home_dir)
    cleanup()
    # Temporal directory
    temp_dir = os.path.join(home_dir, "temp")
    print('Temporal directory:', temp_dir)
    pathlib.Path(temp_dir).mkdir(parents=True, exist_ok=True)
    os.chdir(temp_dir)

    videoclip = None
    try:
        print('Changing temporal directory', os.getcwd())
        # Cleaning previous files
        cleanup()
        file_obj = download_video(url)
        print(file_obj)
        videoclip = VideoFileClip(file_obj)

        try:
            # Trying to get transcripts
            text = generate_transcript(url, lang_api)
            print("Transcript Found")
        except Exception:
            print("No Transcript Found")
            try:
                # Fall back to recognising the extracted audio track.
                videoclip.audio.write_audiofile("audio.wav", codec='pcm_s16le')
                print("Recognize from ", lang_in)
                text = _recognize_speech("audio.wav", lang_in)
            except Exception as err:
                print("This video cannot be recognized")
                print(err)
                cleanup()
                return _FALLBACK_VIDEO

        print("Destination language ", lang)
        try:
            translation = Translator().translate(text, dest=lang)
        except Exception as err:
            print("This text cannot be translated")
            print(err)
            cleanup()
            return _FALLBACK_VIDEO

        # The synthetic speech reuses the "audio.wav" name so that cleanup()
        # removes it on the next run.
        gTTS(text=translation.text, lang=lang, slow=False).save("audio.wav")
        # Absolute path: the clip is read after the working directory changes.
        audioclip = AudioFileClip(os.path.join(temp_dir, "audio.wav"))
        videoclip.audio = CompositeAudioClip([audioclip])
        new_video = "video_translated_" + lang + ".mp4"

        # Return back to main directory so the result is written there.
        os.chdir(home_dir)
        print('Final directory', os.getcwd())
        videoclip.write_videofile(new_video)
        audioclip.close()
        return new_video
    finally:
        # Always release the clip and restore the working directory, even when
        # a step above fails, so later requests start from a clean state.
        if videoclip is not None:
            videoclip.close()
        os.chdir(home_dir)
|
202 |
+
|
203 |
+
# Input widgets: source language, target language and the video address.
initial_language = gr.inputs.Dropdown(["English", "Italian", "Japanese", "Russian", "Spanish", "German"])
final_language = gr.inputs.Dropdown(["Russian", "Italian", "Spanish", "German", "English", "Japanese"])
url = gr.inputs.Textbox(label="Enter the YouTube URL below:")


# Build the web UI around video_to_translate and start serving it.
gr.Interface(
    fn=video_to_translate,
    inputs=[url, initial_language, final_language],
    outputs='video',
    verbose=True,
    title='Video Youtube Translator',
    # The description lists every language offered by the two dropdowns.
    description='A simple application that translates Youtube videos from English, Italian, Japanese, Russian, Spanish, and German to Italian, Spanish, Russian, German, English and Japanese. Wait one minute to process.',
    article='''<div>
    <p style="text-align: center"> All you need to do is to paste the Youtube link and hit submit, then wait for compiling. After that click on Play/Pause for listening to the video. The video is saved in an mp4 format.
    For more information visit <a href="https://ruslanmv.com/">ruslanmv.com</a>
    </p>
    </div>''',
    examples=[
        ["https://www.youtube.com/watch?v=Cu3R5it4cQs&list", "English", "Italian"],
        ["https://www.youtube.com/watch?v=fkGCLIQx1MI", "English", "Spanish"],
        ["https://www.youtube.com/watch?v=fkGCLIQx1MI", "English", "Russian"],
        ["https://www.youtube.com/watch?v=_5YeX8eCLgA&ab_channel=TheTelegraph", "Russian", "English"],
        ["https://www.youtube.com/watch?v=qzzweIQoIOU", "Japanese", "English"],
        ["https://www.youtube.com/watch?v=eo17uDr2_XA", "German", "Spanish"],
    ],
).launch()
|
demo/tryagain.mp4
ADDED
Binary file (307 kB). View file
|
|
requirements.txt
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
pip==22.2.2
|
2 |
+
gradio==3.0.24
|
3 |
+
googletrans==4.0.0rc1
|
4 |
+
moviepy
|
5 |
+
SpeechRecognition
|
6 |
+
gTTS
|
7 |
+
youtube_transcript_api
|
8 |
+
pytube
|
utils.py
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pydub import AudioSegment
|
2 |
+
#from pydub.utils import mediainfo
|
3 |
+
from pydub.utils import make_chunks
|
4 |
+
import math
|
5 |
+
#flac_audio = AudioSegment.from_file("sample.flac", "flac")
|
6 |
+
#flac_audio.export("audio.wav", format="wav")
|
7 |
+
def split_audio_wav(filename, file_split_size=40000000):
    """Split a WAV file into consecutive chunks of roughly ``file_split_size`` bytes.

    Each chunk is exported to the current directory as ``chunk<i>.wav``.

    Args:
        filename: Path of the WAV file to split.
        file_split_size: Target size of one chunk in bytes (default 40 MB).
            The chunk duration is rounded up to whole seconds, so a chunk may
            slightly exceed this size.

    Returns:
        The list of exported chunk file names, in playback order. The list is
        empty when the audio has zero length.

    Raises:
        ValueError: If ``file_split_size`` is not positive.
    """
    if file_split_size <= 0:
        raise ValueError("file_split_size must be a positive number of bytes")

    myaudio = AudioSegment.from_file(filename, "wav")
    channel_count = myaudio.channels  # Get channels
    sample_width = myaudio.sample_width  # Get sample width
    duration_in_sec = len(myaudio) / 1000  # Length of audio in sec
    sample_rate = myaudio.frame_rate
    print("sample_width=", sample_width)
    print("channel_count=", channel_count)
    print("duration_in_sec=", duration_in_sec)
    print("frame_rate=", sample_rate)

    # Use the measured sample width instead of assuming 16-bit audio.
    # NOTE(review): assumes pydub reports sample_width in bytes - confirm.
    bytes_per_second = sample_rate * sample_width * channel_count
    wav_file_size = bytes_per_second * duration_in_sec
    print("wav_file_size = ", wav_file_size)

    # Seconds of audio that fit in one chunk. Computing this from the byte
    # rate (not from the total duration) avoids dividing by zero on empty audio.
    chunk_length_in_sec = math.ceil(file_split_size / bytes_per_second)
    chunk_length_ms = chunk_length_in_sec * 1000

    # Export all of the individual chunks as wav files
    chunks_list = []
    for i, chunk in enumerate(make_chunks(myaudio, chunk_length_ms)):
        chunk_name = "chunk{0}.wav".format(i)
        print("exporting", chunk_name)
        chunk.export(chunk_name, format="wav")
        chunks_list.append(chunk_name)
    return chunks_list
|