wendel pereira committed on
Commit
1f4d75d
·
1 Parent(s): 0d7283b

adicionado português na lista

Browse files
Files changed (1) hide show
  1. app.py +309 -341
app.py CHANGED
@@ -1,342 +1,310 @@
1
- # coding=utf8
2
- # Youtube Video Translator
3
- # Developed by Ruslan Magana Vsevolodovna
4
- # https://ruslanmv.com/
5
-
6
- # importing all necessary libraries
7
- import pathlib
8
- import sys, os
9
- from gtts import gTTS
10
- import gradio as gr
11
- import os
12
- import speech_recognition as sr
13
- from googletrans import Translator, constants
14
- from pprint import pprint
15
- from moviepy.editor import *
16
- from pytube import YouTube
17
- from youtube_transcript_api import YouTubeTranscriptApi
18
- from utils import *
19
-
20
- def download_video(url):
21
- print("Downloading...")
22
- local_file = (
23
- YouTube(url)
24
- .streams.filter(progressive=True, file_extension="mp4")
25
- .first()
26
- .download()
27
- )
28
- print("Downloaded")
29
- return local_file
30
-
31
- def validate_youtube(url):
32
- #This creates a youtube objet
33
- try:
34
- yt = YouTube(url)
35
- except Exception:
36
- print("Hi there URL seems invalid")
37
- return True
38
- #This will return the length of the video in sec as an int
39
- video_length = yt.length
40
- if video_length > 600:
41
- print("Your video is larger than 10 minutes")
42
- return True
43
- else:
44
- print("Your video is less than 10 minutes")
45
- return False
46
-
47
- def validate_url(url):
48
- import validators
49
- if not validators.url(url):
50
- print("Hi there URL seems invalid ")
51
- return True
52
- else:
53
- return False
54
-
55
-
56
- def cleanup():
57
- import pathlib
58
- import glob
59
- types = ('*.mp4', '*.wav') # the tuple of file types
60
- #Finding mp4 and wave files
61
- junks = []
62
- for files in types:
63
- junks.extend(glob.glob(files))
64
- try:
65
- # Deleting those files
66
- for junk in junks:
67
- print("Deleting",junk)
68
- # Setting the path for the file to delete
69
- file = pathlib.Path(junk)
70
- # Calling the unlink method on the path
71
- file.unlink()
72
- except Exception:
73
- print("I cannot delete the file because it is being used by another process")
74
-
75
- def getSize(filename):
76
- st = os.stat(filename)
77
- return st.st_size
78
-
79
-
80
- def clean_transcript(transcript_list):
81
- script = ""
82
- for text in transcript_list:
83
- t = text["text"]
84
- if( (t != '[music]') and \
85
- (t != '[Music]') and \
86
- (t != '[музыка]') and \
87
- (t != '[Музыка]') and \
88
- (t != '[musik]') and \
89
- (t != '[Musik]') and \
90
- (t != '[musica]') and \
91
- (t != '[Musica]') and \
92
- (t != '[música]') and \
93
- (t != '[Música]') and \
94
- (t != '[音楽]') and \
95
- (t != '[音乐]')
96
- ) :
97
- script += t + " "
98
- return script
99
-
100
-
101
- def get_transcript(url,desired_language):
102
- id_you= url[url.index("=")+1:]
103
- try:
104
- # retrieve the available transcripts
105
- transcript_list = YouTubeTranscriptApi.list_transcripts(id_you)
106
-
107
- except Exception:
108
- print('TranscriptsDisabled:')
109
- is_translated = False
110
- return " ", " ", is_translated
111
-
112
- lista=[]
113
- transcript_translation_languages=[]
114
- # iterate over all available transcripts
115
- for transcript in transcript_list:
116
- lista.extend([
117
- transcript.language_code,
118
- transcript.is_generated,
119
- transcript.is_translatable,
120
- transcript_translation_languages.append(transcript.translation_languages),
121
- ])
122
- print(lista)
123
- n_size=int(len(lista)/4)
124
- print("There are {} avialable scripts".format(n_size))
125
- import numpy as np
126
- matrix = np.array(lista)
127
- shape = (n_size,4)
128
- matrix=matrix.reshape(shape)
129
- matrix=matrix.tolist()
130
- is_manually=False
131
- is_automatic=False
132
- for lista in matrix:
133
- #print(lista)
134
- language_code=lista[0]
135
- is_generated=lista[1]
136
- is_translatable=lista[2]
137
- if not is_generated and is_translatable :
138
- print("Script found manually generated")
139
- is_manually=True
140
- language_code_man=language_code
141
- if is_generated and is_translatable :
142
- print("Script found automatic generated")
143
- is_automatic=True
144
- language_code_au=language_code
145
-
146
- if is_manually:
147
- # we try filter for manually created transcripts
148
- print('We extract manually created transcripts')
149
- transcript = transcript_list.find_manually_created_transcript([language_code])
150
-
151
- elif is_automatic:
152
- print('We extract generated transcript')
153
- # or automatically generated ones, but not translated
154
- transcript = transcript_list.find_generated_transcript([language_code])
155
- else:
156
- print('We try find the transcript')
157
- # we directly filter for the language you are looking for, using the transcript list
158
- transcript = transcript_list.find_transcript([language_code])
159
-
160
- is_translated = False
161
- if is_translatable :
162
- for available_trad in transcript_translation_languages[0]:
163
- if available_trad['language_code']==desired_language:
164
- print("It was found the translation for lang:",desired_language)
165
- print('We translate directly the transcript')
166
- transcript_translated = transcript.translate(desired_language)
167
- transcript_translated=transcript_translated.fetch()
168
- translated=clean_transcript(transcript_translated)
169
- is_translated = True
170
- script_translated = ""
171
- if is_translated :
172
- script_translated = translated
173
-
174
- transcript=transcript.fetch()
175
- script = clean_transcript(transcript)
176
-
177
- return script, script_translated, is_translated
178
-
179
- # Set environment variables
180
- home_dir = os.getcwd()
181
- temp_dir=os.path.join(home_dir, "temp")
182
- #Create temp directory
183
- pathlib.Path(temp_dir).mkdir(parents=True, exist_ok=True)
184
- os.environ['home_dir'] = home_dir
185
- os.environ['temp_dir'] = temp_dir
186
-
187
- def video_to_translate(url,initial_language,final_language):
188
- print('Checking the url')
189
- check =validate_youtube(url)
190
- if check is True: return "./demo/tryagain2.mp4"
191
-
192
- #Internal definitions
193
- if initial_language == "English":
194
- lang_in='en-US'
195
- lang_api='en'
196
- elif initial_language == "Italian":
197
- lang_in='it-IT'
198
- lang_api='it'
199
- elif initial_language == "Spanish":
200
- lang_in='es-MX'
201
- lang_api='es'
202
- elif initial_language == "Russian":
203
- lang_in='ru-RU'
204
- lang_api='rus'
205
- elif initial_language == "German":
206
- lang_in='de-DE'
207
- lang_api='de'
208
- elif initial_language == "Japanese":
209
- lang_in='ja-JP'
210
- lang_api='ja'
211
- if final_language == "English":
212
- lang='en'
213
- elif final_language == "Italian":
214
- lang='it'
215
- elif final_language == "Spanish":
216
- lang='es'
217
- elif final_language == "Russian":
218
- lang='ru'
219
- elif final_language == "German":
220
- lang='de'
221
- elif final_language == "Japanese":
222
- lang='ja'
223
- # Initial directory
224
- home_dir= os.getenv('home_dir')
225
- print('Initial directory:',home_dir)
226
- # Cleaning previous files
227
- cleanup()
228
- file_obj=download_video(url)
229
- print(file_obj)
230
- # Insert Local Video File Path
231
- videoclip = VideoFileClip(file_obj)
232
- is_traduc=False
233
- # Trying to get transcripts
234
-
235
- text, trans, is_traduc = get_transcript(url,desired_language=lang)
236
- print("Transcript Found")
237
-
238
- if not is_traduc:
239
- print("No Transcript Found")
240
- # Trying to recognize audio
241
- # Insert Local Audio File Path
242
- videoclip.audio.write_audiofile("audio.wav",codec='pcm_s16le')
243
- # initialize the recognizer
244
- r = sr.Recognizer()
245
- # open the file
246
- with sr.AudioFile("audio.wav") as source:
247
- # listen for the data (load audio to memory)
248
- audio_data = r.record(source)
249
- # recognize (convert from speech to text)
250
- print("Recognize from ",lang_in)
251
- #There is a limit of 10 MB on all single requests sent to the API using local file
252
- size_wav=getSize("audio.wav")
253
- if size_wav > 50000000:
254
- print("The wav is too large")
255
- audio_chunks=split_audio_wav("audio.wav")
256
- text=""
257
- for chunk in audio_chunks:
258
- print("Converting audio to text",chunk)
259
- try:
260
- text_chunk= r.recognize_google(audio_data, language = lang_in)
261
- except Exception:
262
- print("This video cannot be recognized")
263
- cleanup()
264
- return "./demo/tryagain.mp4"
265
- text=text+text_chunk+" "
266
- text=str(text)
267
- print(type(text))
268
-
269
- else:
270
- try:
271
- text = r.recognize_google(audio_data, language = lang_in)
272
- except Exception:
273
- print("This video cannot be recognized")
274
- cleanup()
275
- return "./demo/tryagain.mp4"
276
-
277
- #print(text)
278
- print("Destination language ",lang)
279
-
280
- # init the Google API translator
281
- translator = Translator()
282
-
283
-
284
- try:
285
- translation = translator.translate(text, dest=lang)
286
- except Exception:
287
- print("This text cannot be translated")
288
- cleanup()
289
- return "./demo/tryagain.mp4"
290
-
291
- #translation.text
292
- trans=translation.text
293
-
294
- myobj = gTTS(text=trans, lang=lang, slow=False)
295
- myobj.save("audio.wav")
296
- # loading audio file
297
- audioclip = AudioFileClip("audio.wav")
298
-
299
- # adding audio to the video clip
300
- new_audioclip = CompositeAudioClip([audioclip])
301
- videoclip.audio = new_audioclip
302
- new_video="video_translated_"+lang+".mp4"
303
-
304
- # Return back to main directory
305
- os.chdir(home_dir)
306
- print('Final directory',os.getcwd())
307
-
308
- videoclip.write_videofile(new_video)
309
-
310
- videoclip.close()
311
- del file_obj
312
-
313
- return new_video
314
-
315
- initial_language = gr.inputs.Dropdown(["English","Italian","Japanese","Russian","Spanish","German"])
316
- final_language = gr.inputs.Dropdown([ "Russian","Italian","Spanish","German","English","Japanese"])
317
- url =gr.inputs.Textbox(label = "Enter the YouTube URL below:")
318
-
319
-
320
- gr.Interface(fn = video_to_translate,
321
- inputs = [url,initial_language,final_language],
322
- outputs = 'video',
323
- verbose = True,
324
- title = 'Video Youtube Translator',
325
- description = 'A simple application that translates Youtube small videos from English, Italian, Japanese, Russian, Spanish, and German to Italian, Spanish, Russian, English and Japanese. Wait one minute to process.',
326
- article =
327
- '''<div>
328
- <p style="text-align: center"> All you need to do is to paste the Youtube link and hit submit,, then wait for compiling. After that click on Play/Pause for listing to the video. The video is saved in an mp4 format.
329
- The lenght video limit is 10 minutes. For more information visit <a href="https://ruslanmv.com/">ruslanmv.com</a>
330
- </p>
331
- </div>''',
332
-
333
- examples = [
334
- ["https://www.youtube.com/watch?v=uLVRZE8OAI4", "English","Spanish"],
335
- ["https://www.youtube.com/watch?v=fkGCLIQx1MI", "English","Russian"],
336
- ["https://www.youtube.com/watch?v=6Q6hFtitthQ", "Italian","English"],
337
- ["https://www.youtube.com/watch?v=s5XvjAC7ai8", "Russian","English"],
338
- ["https://www.youtube.com/watch?v=qzzweIQoIOU", "Japanese","English"],
339
- ["https://www.youtube.com/watch?v=nOGZvu6tJFE", "German","Spanish"]
340
-
341
- ]
342
  ).launch()
 
1
+ # coding=utf8
2
+ # Youtube Video Translator
3
+ # Developed by Ruslan Magana Vsevolodovna
4
+ # https://ruslanmv.com/
5
+
6
+ # importing all necessary libraries
7
+ import pathlib
8
+ import sys, os
9
+ from gtts import gTTS
10
+ import gradio as gr
11
+ import os
12
+ import speech_recognition as sr
13
+ from googletrans import Translator, constants
14
+ from pprint import pprint
15
+ from moviepy.editor import *
16
+ from pytube import YouTube
17
+ from youtube_transcript_api import YouTubeTranscriptApi
18
+ from utils import *
19
+
20
def download_video(url):
    """Download the first progressive MP4 stream of a YouTube video.

    Args:
        url: Address of the YouTube video.

    Returns:
        Whatever the selected stream's ``download()`` returns (used by the
        caller as the local file path).

    Raises:
        ValueError: If the video offers no progressive MP4 stream.
    """
    print("Downloading...")
    candidates = YouTube(url).streams.filter(
        progressive=True, file_extension="mp4"
    )
    stream = candidates.first()
    # An empty query gives None; report that clearly instead of failing with
    # an AttributeError on ``None.download()``.
    if stream is None:
        raise ValueError("No progressive mp4 stream available for: " + str(url))
    local_file = stream.download()
    print("Downloaded")
    return local_file
30
+
31
def validate_youtube(url):
    """Tell whether *url* has to be rejected.

    Args:
        url: Address of the YouTube video.

    Returns:
        True when the video cannot be inspected or is longer than 600
        seconds, False when it can be processed.
    """
    try:
        # The length lookup is kept inside the try as well: a failure while
        # reading it should reject the URL, not crash the request.
        video_length = YouTube(url).length
    except Exception:
        print("Hi there URL seems invalid")
        return True

    # ``length`` is the duration in seconds; 600 s is the 10 minute limit.
    if video_length > 600:
        print("Your video is larger than 10 minutes")
        return True

    print("Your video is less than 10 minutes")
    return False
46
+
47
def validate_url(url):
    """Return True when *url* fails the ``validators.url`` check, else False."""
    import validators

    is_invalid = not validators.url(url)
    if is_invalid:
        print("Hi there URL seems invalid ")
    return is_invalid
54
+
55
+
56
def cleanup():
    """Delete every ``*.mp4`` and ``*.wav`` entry of the current directory.

    Deletion is best effort: an entry that cannot be removed is reported and
    skipped, and the remaining entries are still processed.
    """
    import glob
    import pathlib

    for pattern in ('*.mp4', '*.wav'):
        for junk in glob.glob(pattern):
            print("Deleting", junk)
            try:
                pathlib.Path(junk).unlink()
            except OSError:
                # Guard each file separately so that one failure does not
                # leave all the other leftovers behind.
                print("I cannot delete the file because it is being used by another process")
74
+
75
def getSize(filename):
    """Return the size of *filename* in bytes."""
    return os.path.getsize(filename)
78
+
79
+
80
def clean_transcript(transcript_list):
    """Join the ``"text"`` fields of a transcript into a single string.

    Entries whose text is exactly one of the bracketed "music" placeholders
    are left out. Every kept text is followed by one space, so a non-empty
    result ends with a trailing space.
    """
    music_markers = (
        '[music]', '[Music]',
        '[музыка]', '[Музыка]',
        '[musik]', '[Musik]',
        '[musica]', '[Musica]',
        '[música]', '[Música]',
        '[音楽]', '[音乐]',
    )
    spoken = [
        entry["text"]
        for entry in transcript_list
        if entry["text"] not in music_markers
    ]
    return "".join(piece + " " for piece in spoken)
99
+
100
+
101
def _extract_video_id(url):
    """Return the video id carried by *url*, or None when nothing is found.

    The ``v`` query parameter is preferred; otherwise the last path segment
    is used, which covers links of the form ``.../<id>``.
    """
    from urllib.parse import parse_qs, urlparse

    parsed = urlparse(url)
    from_query = parse_qs(parsed.query).get("v")
    if from_query:
        return from_query[0]
    tail = parsed.path.rstrip("/").rsplit("/", 1)[-1]
    return tail or None


def get_transcript(url, desired_language):
    """Fetch the transcript of a video and, when offered, its translation.

    A manually created translatable transcript is preferred, then an
    automatically generated translatable one, then whatever transcript is
    listed for the last language seen.

    Args:
        url: Address of the YouTube video.
        desired_language: Language code the transcript should be translated
            to (compared against the ``'language_code'`` entries of the
            transcript's ``translation_languages``).

    Returns:
        A tuple ``(script, script_translated, is_translated)``. ``script`` is
        the cleaned original text, ``script_translated`` the cleaned
        translation (empty string when none was produced) and
        ``is_translated`` tells whether a translation was produced. When the
        transcripts cannot be listed the tuple is ``(" ", " ", False)``.
    """
    # Parse the id instead of slicing after the first "=": the slice kept
    # trailing "&..." parameters and raised on URLs without any "=".
    video_id = _extract_video_id(url)
    try:
        # retrieve the available transcripts
        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
    except Exception:
        print('TranscriptsDisabled:')
        return " ", " ", False

    manual_code = None
    generated_code = None
    last_code = None
    n_size = 0
    for candidate in transcript_list:
        n_size += 1
        last_code = candidate.language_code
        if not candidate.is_translatable:
            continue
        if candidate.is_generated:
            print("Script found automatic generated")
            generated_code = candidate.language_code
        else:
            print("Script found manually generated")
            manual_code = candidate.language_code
    print("There are {} avialable scripts".format(n_size))

    if last_code is None:
        # Nothing was listed, so there is no transcript to work with.
        return " ", " ", False

    # Look each kind up with the language code recorded for that kind; using
    # the code of the last transcript seen could request a transcript that
    # does not exist.
    if manual_code is not None:
        print('We extract manually created transcripts')
        transcript = transcript_list.find_manually_created_transcript([manual_code])
    elif generated_code is not None:
        print('We extract generated transcript')
        transcript = transcript_list.find_generated_transcript([generated_code])
    else:
        print('We try find the transcript')
        transcript = transcript_list.find_transcript([last_code])

    is_translated = False
    script_translated = ""
    if transcript.is_translatable:
        # Consult the languages offered by the transcript that was actually
        # selected, not those of the first transcript in the listing.
        offered = [entry['language_code'] for entry in transcript.translation_languages]
        if desired_language in offered:
            print("It was found the translation for lang:", desired_language)
            print('We translate directly the transcript')
            translated_entries = transcript.translate(desired_language).fetch()
            script_translated = clean_transcript(translated_entries)
            is_translated = True

    script = clean_transcript(transcript.fetch())

    return script, script_translated, is_translated
178
+
179
# Remember the directory the app was started from and make sure a "temp"
# folder exists inside it; both paths are also published as environment
# variables.
home_dir = os.getcwd()
temp_dir = os.path.join(home_dir, "temp")
os.makedirs(temp_dir, exist_ok=True)
os.environ.update(home_dir=home_dir, temp_dir=temp_dir)
186
+
187
# Dropdown label -> locale handed to the speech recogniser.
_RECOGNITION_LOCALES = {
    "English": "en-US",
    "Português": "pt-BR",
    "Italian": "it-IT",
    "Spanish": "es-MX",
    "Russian": "ru-RU",
    "German": "de-DE",
    "Japanese": "ja-JP",
}

# Dropdown label -> language code handed to the transcript lookup, the
# translator and the text-to-speech engine.
_TARGET_LANGUAGE_CODES = {
    "English": "en",
    "Português": "pt",
    "Italian": "it",
    "Spanish": "es",
    "Russian": "ru",
    "German": "de",
    "Japanese": "ja",
}


def _transcribe_audio(videoclip, lang_in):
    """Return the recognised speech of *videoclip*, or None on failure."""
    # The sound track is written to disk so the recogniser can read it.
    videoclip.audio.write_audiofile("audio.wav", codec='pcm_s16le')
    r = sr.Recognizer()
    with sr.AudioFile("audio.wav") as source:
        # load the audio into memory
        audio_data = r.record(source)
    print("Recognize from ", lang_in)

    try:
        if getSize("audio.wav") > 50000000:
            print("The wav is too large")
            audio_chunks = split_audio_wav("audio.wav")
            text = ""
            for chunk in audio_chunks:
                print("Converting audio to text", chunk)
                # NOTE(review): every iteration submits the complete
                # recording (audio_data), not the chunk. What
                # split_audio_wav returns is not visible here, so this is
                # left unchanged - confirm and recognise each chunk instead.
                text += r.recognize_google(audio_data, language=lang_in) + " "
            return text
        return r.recognize_google(audio_data, language=lang_in)
    except Exception:
        # Best effort: any recognition failure sends the user to the
        # "try again" clip instead of crashing the request.
        print("This video cannot be recognized")
        return None


def _dub_video(videoclip, url, lang_in, lang, home_dir):
    """Write the dubbed copy of *videoclip*; return its path or None."""
    text, trans, is_traduc = get_transcript(url, desired_language=lang)
    if is_traduc:
        # The transcript came back already translated; use it as is.
        print("Transcript Found")
    else:
        print("No Transcript Found")
        text = _transcribe_audio(videoclip, lang_in)
        if text is None:
            return None

        print("Destination language ", lang)
        translator = Translator()
        try:
            translation = translator.translate(text, dest=lang)
        except Exception:
            print("This text cannot be translated")
            return None
        trans = translation.text

    # Synthesize the translated speech and use it as the only audio track.
    gTTS(text=trans, lang=lang, slow=False).save("audio.wav")
    audioclip = AudioFileClip("audio.wav")
    try:
        videoclip.audio = CompositeAudioClip([audioclip])
        new_video = "video_translated_" + lang + ".mp4"

        # Return back to main directory
        os.chdir(home_dir)
        print('Final directory', os.getcwd())

        videoclip.write_videofile(new_video)
    finally:
        audioclip.close()
    return new_video


def video_to_translate(url, initial_language, final_language):
    """Produce a copy of a YouTube video dubbed in another language.

    Args:
        url: Address of the YouTube video.
        initial_language: Dropdown label of the language spoken in the video.
        final_language: Dropdown label of the language to dub into.

    Returns:
        Path of the generated ``video_translated_<code>.mp4`` file, or the
        path of a demo "try again" clip when the URL is rejected, a language
        is not supported, or recognition/translation fails.
    """
    print('Checking the url')
    if validate_youtube(url) is True:
        return "./demo/tryagain2.mp4"

    # Resolve both languages up front. Previously the target code was never
    # assigned, so every request stopped with a NameError.
    lang_in = _RECOGNITION_LOCALES.get(initial_language)
    lang = _TARGET_LANGUAGE_CODES.get(final_language)
    if lang_in is None or lang is None:
        print("Unsupported language selection:", initial_language, final_language)
        return "./demo/tryagain.mp4"

    # Initial directory
    home_dir = os.getenv('home_dir')
    print('Initial directory:', home_dir)
    # Cleaning previous files
    cleanup()
    file_obj = download_video(url)
    print(file_obj)

    videoclip = VideoFileClip(file_obj)
    try:
        new_video = _dub_video(videoclip, url, lang_in, lang, home_dir)
    finally:
        # Close the clip on every path so the downloaded file is released
        # before any cleanup tries to delete it.
        videoclip.close()

    if new_video is None:
        cleanup()
        return "./demo/tryagain.mp4"
    return new_video
290
+
291
# Input widgets of the page. The dropdown labels are passed verbatim to
# video_to_translate, so they must be labels that function recognises.
initial_language = gr.inputs.Dropdown(["English", "Português"])
final_language = gr.inputs.Dropdown(["Russian", "Português"])
url = gr.inputs.Textbox(label="Enter the YouTube URL below:")


# Build the page and start serving it. The description names the languages
# the two dropdowns above actually offer.
gr.Interface(
    fn=video_to_translate,
    inputs=[url, initial_language, final_language],
    outputs='video',
    verbose=True,
    title='Video Youtube Translator',
    description='A simple application that translates Youtube small videos from English and Português to Russian and Português. Wait one minute to process.',
    article='''<div>
<p style="text-align: center"> All you need to do is to paste the Youtube link and hit submit,, then wait for compiling. After that click on Play/Pause for listing to the video. The video is saved in an mp4 format.
The lenght video limit is 10 minutes. For more information visit <a href="https://ruslanmv.com/">ruslanmv.com</a>
</p>
</div>''',
    examples=[],
).launch()