thun888 committed
Commit cb0791d · 1 Parent(s): 01d6def

Add application file

.gitignore ADDED
@@ -0,0 +1,16 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.bin
+ *.srt
+ *.mp4
+ *.wav
+ *.mb
+ *.exe
+ .locks
+ venv/
+ ckpt-0.data-00000-of-00001
+ ckpt-0.meta
+ tokenizer.json
+ *.model
+
+
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 RVC-Boss
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
app.py ADDED
@@ -0,0 +1,372 @@
+ import argparse
+ import os
+ ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
+
+ # ffmpeg_path = f"{ROOT_DIR}/bin" # 替换成你的 FFmpeg bin 目录
+ # os.environ["PATH"] = os.environ.get("PATH", "") + os.pathsep + ffmpeg_path
+
+ import gradio as gr
+
+ from utils import movie2audio,make_srt,make_tran,merge_sub,make_tran_zh2en,make_tran_ja2zh,make_tran_ko2zh,make_srt_sv,make_tran_qwen2,make_tran_deep
+
+ from subtitle_to_audio import generate_audio
+ import pyttsx3
+
+ engine = pyttsx3.init()
+ voices = engine.getProperty('voices') # getting details of current voice
+ vlist = []
+ num = 0
+ for voice in voices:
+     print(" - Name: %s" % voice.name)
+     vlist.append((voice.name,num))
+     num += 1
+
+
+ initial_md = """
+
+ 项目地址:https://github.com/v3ucn/Modelscope_Faster_Whisper_Multi_Subtitle
+
+ 作者:刘悦的技术博客 https://space.bilibili.com/3031494
+
+ """
+
+ def do_pyttsx3(srt,speed,voice):
+
+     print(srt,speed,voice)
+
+     voice = int(voice)
+
+     generate_audio(path=srt,rate=int(speed),voice_idx=voice)
+
+     return f"output/{vlist[voice][0]}.wav"
+
+ def do_speech(video):
+
+     res = movie2audio(video)
+
+     return res
+
+
+ def do_trans_video(model_type,video_path):
+
+     srt_text = make_srt(video_path,model_type)
+
+     return srt_text
+
+ def do_trans_video_sv(video_path):
+
+     srt_text = make_srt_sv(video_path)
+
+     return srt_text
+
+ def do_trans_audio(model_type):
+
+     srt_text = make_srt(f'{ROOT_DIR}/audio.wav',model_type)
+
+     return srt_text
+
+ def do_trans_en2zh(srt_path):
+
+     return make_tran(srt_path)
+
+
+ def do_trans_en2zh_deep(srt_path):
+
+     return make_tran_deep(srt_path,"EN","ZH")
+
+ def do_trans_zh2en_deep(srt_path):
+
+     return make_tran_deep(srt_path,"ZH","EN")
+
+ def do_trans_zh2ja_deep(srt_path):
+
+     return make_tran_deep(srt_path,"ZH","JA")
+
+ def do_trans_zh2ko_deep(srt_path):
+
+     return make_tran_deep(srt_path,"ZH","KO")
+
+ def do_trans_ja2zh_deep(srt_path):
+
+     return make_tran_deep(srt_path,"JA","ZH")
+
+ def do_trans_ko2zh_deep(srt_path):
+
+     return make_tran_deep(srt_path,"KO","ZH")
+
+
+
+ def do_trans_en2zh_qwen2(model_path_qwen2,srt_path):
+
+     return make_tran_qwen2(model_path_qwen2,srt_path,"zh")
+
+ def do_trans_zh2en_qwen2(model_path_qwen2,srt_path):
+
+     return make_tran_qwen2(model_path_qwen2,srt_path,"en")
+
+ def do_trans_ja2zh_qwen2(model_path_qwen2,srt_path):
+
+     return make_tran_qwen2(model_path_qwen2,srt_path,"zh")
+
+ def do_trans_ko2zh_qwen2(model_path_qwen2,srt_path):
+
+     return make_tran_qwen2(model_path_qwen2,srt_path,"zh")
+
+ def do_trans_zh2en(srt_path):
+
+     return make_tran_zh2en(srt_path)
+
+ def do_trans_ja2zh(srt_path):
+
+     return make_tran_ja2zh(srt_path)
+
+ def do_trans_ko2zh(srt_path):
+
+     return make_tran_ko2zh(srt_path)
+
+ def do_srt_sin(video_path):
+
+     return merge_sub(video_path,f"{ROOT_DIR}/output/video.srt")
+
+ def do_srt_two(video_path):
+
+     return merge_sub(video_path,f"{ROOT_DIR}/output/two.srt")
+
+
+ def do_srt_two_single(video_path):
+
+     return merge_sub(video_path,f"{ROOT_DIR}/output/two_single.srt")
+
+
+ def save_srt(text):
+
+     with open(rf'{ROOT_DIR}/output/video.srt','w',encoding='utf-8') as f:
+         f.write(text + "\n")
+
+     gr.Info('字幕文件修改成功,字幕保存在output目录')
+
+
+ def save_two(text,text_2):
+
+     with open(rf'{ROOT_DIR}/output/two.srt','w',encoding='utf-8') as f:
+         f.write(text + "\n")
+
+     with open(rf'{ROOT_DIR}/output/two_single.srt','w',encoding='utf-8') as f:
+         f.write(text_2 + "\n")
+
+     gr.Info('字幕文件修改成功,字幕保存在output目录')
+
+
+
+
+ with gr.Blocks() as app:
+     gr.Markdown(initial_md)
+
+     with gr.Accordion("视频处理(Video)"):
+         with gr.Row():
+
+             ori_video = gr.Video(label="请上传视频(Upload Video)")
+
+             speech_button = gr.Button("提取人声(如果视频没有背景音也可以不做)Extract human voice (you don't have to do it if the video has no background sound)")
+
+             speech_audio = gr.Audio(label="提取的人声(Extract voice)")
+
+
+         speech_button.click(do_speech,inputs=[ori_video],outputs=[speech_audio])
+
+     with gr.Accordion("转写字幕"):
+
+         with gr.Row():
+             with gr.Column():
+
+                 # model_type = gr.Dropdown(choices=["small","medium","large-v3","large-v2"], value="small", label="选择faster_Whisper模型/Select faster_Whisper model",interactive=True)
+
+                 model_type = gr.Textbox(label="填写faster_Whisper模型/Fill in the faster_Whisper model,也可以填写small,medium,large,large-v2,large-v3,faster-whisper-large-v3-turbo-ct2,模型越大,速度越慢,但字幕的准确度越高,酌情填写,用文本框是因为你可以填写其他huggingface上的开源模型地址",value="faster-whisper-large-v3-turbo-ct2")
+
+                 # with gr.Row():
+                 #     with gr.Column():
+
+                 #         language = gr.Dropdown(["ja", "en", "zh","ko","yue"], value="zh", label="选择转写的语言",interactive=True)
+
+
+         with gr.Row():
+
+             transcribe_button_whisper = gr.Button("Whisper视频直接转写字幕(Video direct rewriting subtitles)")
+
+             transcribe_button_audio = gr.Button("Whisper提取人声转写字幕(Extract voice transliteration subtitles)")
+
+
+             # transcribe_button_video_sv = gr.Button("阿里SenseVoice视频直接转写字幕")
+
+             result1 = gr.Textbox(label="字幕結果(会在项目目录生成video.srt/video.srt is generated in the current directory)",value=" ",interactive=True)
+
+             transcribe_button_audio_save = gr.Button("保存字幕修改结果")
+
+         transcribe_button_whisper.click(do_trans_video,inputs=[model_type,ori_video],outputs=[result1])
+
+         transcribe_button_audio_save.click(save_srt,inputs=[result1],outputs=[])
+
+         # transcribe_button_video_sv.click(do_trans_video_sv,inputs=[ori_video],outputs=[result1])
+
+         transcribe_button_audio.click(do_trans_audio,inputs=[model_type],outputs=[result1])
+
+
+     # with gr.Accordion("HuggingFace大模型字幕翻译"):
+     #     with gr.Row():
+
+
+     #         srt_path = gr.Textbox(label="原始字幕地址,默认为项目目录中的video.srt,也可以输入其他路径",value="./video.srt")
+
+     #         trans_button_en2zh = gr.Button("翻译英语字幕为中文/Translate English subtitles into Chinese")
+
+     #         trans_button_zh2en = gr.Button("翻译中文字幕为英文/Translate Chinese subtitles into English")
+
+     #         trans_button_ja2zh = gr.Button("翻译日文字幕为中文/Translate Japanese subtitles into Chinese")
+
+     #         trans_button_ko2zh = gr.Button("翻译韩文字幕为中文/Translate Korea subtitles into Chinese")
+
+     #         result2 = gr.Textbox(label="翻译结果(会在项目目录生成two.srt/two.srt is generated in the current directory)")
+
+     #         trans_button_en2zh.click(do_trans_en2zh,[srt_path],outputs=[result2])
+
+     #         trans_button_zh2en.click(do_trans_zh2en,[srt_path],outputs=[result2])
+
+     #         trans_button_ja2zh.click(do_trans_ja2zh,[srt_path],outputs=[result2])
+
+     #         trans_button_ko2zh.click(do_trans_ko2zh,[srt_path],outputs=[result2])
+
+     with gr.Accordion("Qwen2大模型字幕翻译"):
+         with gr.Row():
+
+
+             srt_path_qwen2 = gr.Textbox(label="原始字幕地址,默认为项目目录中的output/video.srt,也可以输入其他路径",value=f"{ROOT_DIR}/output/video.srt")
+
+             model_path_qwen2 = gr.Textbox(label="ollama中模型名称",value="qwen2:7b")
+
+             trans_button_en2zh_qwen2 = gr.Button("翻译英语字幕为中文/Translate English subtitles into Chinese")
+
+             trans_button_zh2en_qwen2 = gr.Button("翻译中文字幕为英文/Translate Chinese subtitles into English")
+
+             trans_button_ja2zh_qwen2 = gr.Button("翻译日文字幕为中文/Translate Japanese subtitles into Chinese")
+
+             trans_button_ko2zh_qwen2 = gr.Button("翻译韩文字幕为中文/Translate Korea subtitles into Chinese")
+
+         with gr.Row():
+
+             result2 = gr.Textbox(label="翻译结果(会在项目目录生成two.srt/two.srt is generated in the current directory)",value=" ",interactive=True)
+
+             result3 = gr.Textbox(label="翻译结果(会在项目目录生成two_single.srt)",value=" ",interactive=True)
+
+             trans_button_ko2zh_qwen2_save = gr.Button("保存修改结果")
+
+         trans_button_en2zh_qwen2.click(do_trans_en2zh_qwen2,[model_path_qwen2,srt_path_qwen2],outputs=[result2,result3])
+
+         trans_button_zh2en_qwen2.click(do_trans_zh2en_qwen2,[model_path_qwen2,srt_path_qwen2],outputs=[result2,result3])
+
+         trans_button_ja2zh_qwen2.click(do_trans_ja2zh_qwen2,[model_path_qwen2,srt_path_qwen2],outputs=[result2,result3])
+
+         trans_button_ko2zh_qwen2.click(do_trans_ko2zh_qwen2,[model_path_qwen2,srt_path_qwen2],outputs=[result2,result3])
+
+         trans_button_ko2zh_qwen2_save.click(save_two,[result2,result3],outputs=[])
+
+
+     with gr.Accordion("Deepl字幕翻译"):
+         with gr.Row():
+
+
+             srt_path_deep = gr.Textbox(label="原始字幕地址,默认为项目目录中的output/video.srt,也可以输入其他路径",value=f"{ROOT_DIR}/output/video.srt")
+
+             trans_button_en2zh_deep = gr.Button("翻译英语字幕为中文/Translate English subtitles into Chinese")
+
+             trans_button_zh2en_deep = gr.Button("翻译中文字幕为英文/Translate Chinese subtitles into English")
+
+             trans_button_zh2ja_deep = gr.Button("翻译中文字幕为日文/Translate Chinese subtitles into Japanese")
+
+             trans_button_zh2ko_deep = gr.Button("翻译中文字幕为韩文/Translate Chinese subtitles into Korea")
+
+             trans_button_ja2zh_deep = gr.Button("翻译日文字幕为中文/Translate Japanese subtitles into Chinese")
+
+             trans_button_ko2zh_deep = gr.Button("翻译韩文字幕为中文/Translate Korea subtitles into Chinese")
+
+         with gr.Row():
+
+             result2_deep = gr.Textbox(label="翻译结果(会在项目目录生成two.srt/two.srt is generated in the current directory)",value=" ",interactive=True)
+
+             result3_deep = gr.Textbox(label="翻译结果(会在项目目录生成two_single.srt)",value=" ",interactive=True)
+
+             trans_button_ko2zh_deep_save = gr.Button("保存修改结果")
+
+
+
+         trans_button_ko2zh_deep_save.click(save_two,[result2_deep,result3_deep],outputs=[])
+
+
+     with gr.Accordion("字幕配音(pyttsx3)"):
+         with gr.Row():
+
+             srt_path_pyttsx3 = gr.Textbox(label="字幕地址,也可以输入其他路径",value=f"{ROOT_DIR}/output/eng.srt",interactive=True)
+
+             speed_pyttsx3 = gr.Textbox(label="配音语速(很重要,否则会引起时间轴错乱的问题)",value="240")
+
+             voice_pyttsx3 = gr.Dropdown(choices=vlist,value=3,label="配音的音色选择",interactive=True)
+
+             button_pyttsx3 = gr.Button("生成配音")
+
+             pyttsx3_audio = gr.Audio(label="配音的结果")
+
+
+         trans_button_en2zh_deep.click(do_trans_en2zh_deep,[srt_path_deep],outputs=[result2_deep,result3_deep,srt_path_pyttsx3])
+
+         trans_button_zh2ja_deep.click(do_trans_zh2ja_deep,[srt_path_deep],outputs=[result2_deep,result3_deep,srt_path_pyttsx3])
+
+         trans_button_zh2en_deep.click(do_trans_zh2en_deep,[srt_path_deep],outputs=[result2_deep,result3_deep,srt_path_pyttsx3])
+
+         trans_button_zh2ko_deep.click(do_trans_zh2ko_deep,[srt_path_deep],outputs=[result2_deep,result3_deep,srt_path_pyttsx3])
+
+         trans_button_ja2zh_deep.click(do_trans_ja2zh_deep,[srt_path_deep],outputs=[result2_deep,result3_deep,srt_path_pyttsx3])
+
+         trans_button_ko2zh_deep.click(do_trans_ko2zh_deep,[srt_path_deep],outputs=[result2_deep,result3_deep,srt_path_pyttsx3])
+
+
+         button_pyttsx3.click(do_pyttsx3,inputs=[srt_path_pyttsx3,speed_pyttsx3,voice_pyttsx3],outputs=[pyttsx3_audio])
+
+
+
+     with gr.Accordion("字幕合并"):
+         with gr.Row():
+
+
+             srt_button_sin = gr.Button("将单语字幕合并到视频/Merge monolingual subtitles into video")
+
+             srt_button_two = gr.Button("将双语字幕合并到视频/Merge bilingual subtitles into video")
+
+             srt_button_two_single = gr.Button("将翻译的单语字幕合并到视频")
+
+             result3 = gr.Video(label="带字幕视频")
+
+         srt_button_sin.click(do_srt_sin,inputs=[ori_video],outputs=[result3])
+         srt_button_two.click(do_srt_two,inputs=[ori_video],outputs=[result3])
+         srt_button_two_single.click(do_srt_two_single,inputs=[ori_video],outputs=[result3])
+
+
+
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+     "--server-name",
+     type=str,
+     default=None,
+     help="Server name for Gradio app",
+ )
+ parser.add_argument(
+     "--no-autolaunch",
+     action="store_true",
+     default=False,
+     help="Do not launch app automatically",
+ )
+ args = parser.parse_args()
+
+ app.queue()
+ app.launch(inbrowser=not args.no_autolaunch, server_name=args.server_name)
app.spec ADDED
@@ -0,0 +1,71 @@
+ # -*- mode: python ; coding: utf-8 -*-
+ import sys
+ sys.setrecursionlimit(5000)
+ from PyInstaller.utils.hooks import collect_data_files
+
+ datas = []
+ datas += collect_data_files('gradio_client')
+ datas += collect_data_files('gradio')
+
+ # datas += [('./utils.py',".")]
+ # datas += [('./slicer2.py',".")]
+
+
+
+ a = Analysis(
+     ['app.py',
+     ],
+     pathex=['/Users/liuyue/Downloads/FunAsr_Faster_Whisper_Multi_Subs'],
+     binaries=[],
+     datas=datas,
+     hiddenimports=[],
+     hookspath=[],
+     hooksconfig={},
+     runtime_hooks=[],
+     excludes=[],
+     noarchive=False,
+     optimize=0,
+     module_collection_mode={ 'gradio': 'py'}
+ )
+ pyz = PYZ(a.pure)
+
+ exe = EXE(
+     pyz,
+     a.scripts,
+     [],
+     exclude_binaries=True,
+     name='whisper_turbo',
+     # icon='AnyConv.com__paints_logo.icns',
+     debug=True,
+     bootloader_ignore_signals=False,
+     strip=False,
+     upx=True,
+     console=True,
+     disable_windowed_traceback=False,
+     argv_emulation=False,
+     target_arch=None,
+     codesign_identity=None,
+     entitlements_file=None,
+ )
+
+ a.datas += Tree('./faster-whisper-large-v3-turbo-ct2', prefix='faster-whisper-large-v3-turbo-ct2')
+ a.datas += Tree('./models_from_modelscope', prefix='models_from_modelscope')
+ a.datas += Tree('./output', prefix='output')
+ # a.datas += Tree('./bin', prefix='bin')
+
+
+ # a.datas += Tree('./output', prefix='output')
+
+
+
+ coll = COLLECT(
+     exe,
+     a.binaries,
+     a.datas,
+     strip=False,
+     upx=True,
+     upx_exclude=[],
+     name='whisper_turbo',
+ )
+
+
app_video_src.py ADDED
@@ -0,0 +1,155 @@
+ import argparse
+ import os
+
+ import gradio as gr
+
+ from utils import movie2audio,make_srt,make_tran,merge_sub,make_tran_zh2en,make_tran_ja2zh,make_tran_ko2zh
+
+
+
+
+ initial_md = """
+
+
+ 作者:刘悦的技术博客 https://space.bilibili.com/3031494
+
+ """
+
+
+ def do_speech(video):
+
+     res = movie2audio(video)
+
+     return res
+
+
+ def do_trans_video(model_type,video_path):
+
+     srt_text = make_srt(video_path,model_type)
+
+     return srt_text
+
+ def do_trans_audio(model_type):
+
+     srt_text = make_srt('./audio.wav',model_type)
+
+     return srt_text
+
+ def do_trans_en2zh(srt_path):
+
+     return make_tran(srt_path)
+
+ def do_trans_zh2en(srt_path):
+
+     return make_tran_zh2en(srt_path)
+
+ def do_trans_ja2zh(srt_path):
+
+     return make_tran_ja2zh(srt_path)
+
+ def do_trans_ko2zh(srt_path):
+
+     return make_tran_ko2zh(srt_path)
+
+ def do_srt_sin(video_path):
+
+     return merge_sub(video_path,"./video.srt")
+
+ def do_srt_two(video_path):
+
+     return merge_sub(video_path,"./two.srt")
+
+
+
+ with gr.Blocks() as app:
+     gr.Markdown(initial_md)
+
+     with gr.Accordion("视频处理(Video)"):
+         with gr.Row():
+
+             ori_video = gr.Textbox(label="请输入视频的路径地址,如:d:/123.mp4")
+
+             speech_button = gr.Button("提取人声(如果视频没有背景音也可以不做)Extract human voice (you don't have to do it if the video has no background sound)")
+
+             speech_audio = gr.Audio(label="提取的人声(Extract voice)")
+
+
+         speech_button.click(do_speech,inputs=[ori_video],outputs=[speech_audio])
+
+     with gr.Accordion("转写字幕"):
+         with gr.Row():
+             with gr.Column():
+
+
+                 # model_type = gr.Dropdown(choices=["small","medium","large-v3","large-v2"], value="small", label="选择faster_Whisper模型/Select faster_Whisper model",interactive=True)
+
+                 model_type = gr.Textbox(label="填写faster_Whisper模型/Fill in the faster_Whisper model,也可以填写small,medium,large,large-v2,large-v3,模型越大,速度越慢,但字幕的准确度越高,酌情填写,用文本框是因为你可以填写其他huggingface上的开源模型地址",value="medium")
+
+
+                 transcribe_button_whisper = gr.Button("视频直接转写字幕(Video direct rewriting subtitles)")
+
+                 transcribe_button_audio = gr.Button("提取人声转写字幕(Extract voice transliteration subtitles)")
+
+                 result1 = gr.Textbox(label="字幕結果(会在项目目录生成video.srt/video.srt is generated in the current directory)")
+
+                 transcribe_button_whisper.click(do_trans_video,inputs=[model_type,ori_video],outputs=[result1])
+
+                 transcribe_button_audio.click(do_trans_audio,inputs=[model_type],outputs=[result1])
+
+
+     with gr.Accordion("字幕翻译"):
+         with gr.Row():
+
+
+             srt_path = gr.Textbox(label="原始字幕地址,默认为项目目录中的video.srt,也可以输入其他路径",value="./video.srt")
+
+             trans_button_en2zh = gr.Button("翻译英语字幕为中文/Translate English subtitles into Chinese")
+
+             trans_button_zh2en = gr.Button("翻译中文字幕为英文/Translate Chinese subtitles into English")
+
+             trans_button_ja2zh = gr.Button("翻译日文字幕为中文/Translate Japanese subtitles into Chinese")
+
+             trans_button_ko2zh = gr.Button("翻译韩文字幕为中文/Translate Korea subtitles into Chinese")
+
+             result2 = gr.Textbox(label="翻译结果(会在项目目录生成two.srt/two.srt is generated in the current directory)")
+
+             trans_button_en2zh.click(do_trans_en2zh,[srt_path],outputs=[result2])
+
+             trans_button_zh2en.click(do_trans_zh2en,[srt_path],outputs=[result2])
+
+             trans_button_ja2zh.click(do_trans_ja2zh,[srt_path],outputs=[result2])
+
+             trans_button_ko2zh.click(do_trans_ko2zh,[srt_path],outputs=[result2])
+
+     with gr.Accordion("字幕合并"):
+         with gr.Row():
+
+
+             srt_button_sin = gr.Button("将单语字幕合并到视频/Merge monolingual subtitles into video")
+
+             srt_button_two = gr.Button("将双语字幕合并到视频/Merge bilingual subtitles into video")
+
+             result3 = gr.Textbox(label="合成字幕后的视频路径地址")
+
+             srt_button_sin.click(do_srt_sin,inputs=[ori_video],outputs=[result3])
+             srt_button_two.click(do_srt_two,inputs=[ori_video],outputs=[result3])
+
+
+
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+     "--server-name",
+     type=str,
+     default=None,
+     help="Server name for Gradio app",
+ )
+ parser.add_argument(
+     "--no-autolaunch",
+     action="store_true",
+     default=False,
+     help="Do not launch app automatically",
+ )
+ args = parser.parse_args()
+
+ app.launch(inbrowser=not args.no_autolaunch, server_name=args.server_name)
download_model.py ADDED
@@ -0,0 +1,5 @@
+ from huggingface_hub import snapshot_download
+
+ repo_id = "deepdml/faster-whisper-large-v3-turbo-ct2"
+ local_dir = "faster-whisper-large-v3-turbo-ct2"
+ snapshot_download(repo_id=repo_id, local_dir=local_dir, repo_type="model")
faster-whisper-large-v3-turbo-ct2/.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
faster-whisper-large-v3-turbo-ct2/README.md ADDED
@@ -0,0 +1,141 @@
+ ---
+ language:
+ - en
+ - zh
+ - de
+ - es
+ - ru
+ - ko
+ - fr
+ - ja
+ - pt
+ - tr
+ - pl
+ - ca
+ - nl
+ - ar
+ - sv
+ - it
+ - id
+ - hi
+ - fi
+ - vi
+ - he
+ - uk
+ - el
+ - ms
+ - cs
+ - ro
+ - da
+ - hu
+ - ta
+ - 'no'
+ - th
+ - ur
+ - hr
+ - bg
+ - lt
+ - la
+ - mi
+ - ml
+ - cy
+ - sk
+ - te
+ - fa
+ - lv
+ - bn
+ - sr
+ - az
+ - sl
+ - kn
+ - et
+ - mk
+ - br
+ - eu
+ - is
+ - hy
+ - ne
+ - mn
+ - bs
+ - kk
+ - sq
+ - sw
+ - gl
+ - mr
+ - pa
+ - si
+ - km
+ - sn
+ - yo
+ - so
+ - af
+ - oc
+ - ka
+ - be
+ - tg
+ - sd
+ - gu
+ - am
+ - yi
+ - lo
+ - uz
+ - fo
+ - ht
+ - ps
+ - tk
+ - nn
+ - mt
+ - sa
+ - lb
+ - my
+ - bo
+ - tl
+ - mg
+ - as
+ - tt
+ - haw
+ - ln
+ - ha
+ - ba
+ - jw
+ - su
+ - yue
+ tags:
+ - audio
+ - automatic-speech-recognition
+ license: mit
+ library_name: ctranslate2
+ ---
+
+ # Whisper large-v3-turbo model for CTranslate2
+
+ This repository contains the conversion of [deepdml/whisper-large-v3-turbo](https://huggingface.co/deepdml/whisper-large-v3-turbo) to the [CTranslate2](https://github.com/OpenNMT/CTranslate2) model format.
+
+ This model can be used in CTranslate2 or projects based on CTranslate2 such as [faster-whisper](https://github.com/systran/faster-whisper).
+
+ ## Example
+
+ ```python
+ from faster_whisper import WhisperModel
+
+ model = WhisperModel("faster-whisper-large-v3-turbo-ct2")
+
+ segments, info = model.transcribe("audio.mp3")
+ for segment in segments:
+     print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
+ ```
+
+ ## Conversion details
+
+ The original model was converted with the following command:
+
+ ```
+ ct2-transformers-converter --model deepdml/whisper-large-v3-turbo --output_dir faster-whisper-large-v3-turbo \
+     --copy_files tokenizer.json preprocessor_config.json --quantization float16
+ ```
+
+ Note that the model weights are saved in FP16. This type can be changed when the model is loaded using the [`compute_type` option in CTranslate2](https://opennmt.net/CTranslate2/quantization.html).
+
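+ For example, the stored FP16 weights can be loaded with a different compute type (a minimal sketch; whether a given type such as `int8` is available depends on your hardware and CTranslate2 build):
+
+ ```python
+ from faster_whisper import WhisperModel
+
+ # load the converted model with int8 quantization instead of the stored FP16 weights
+ model = WhisperModel("faster-whisper-large-v3-turbo-ct2", device="cpu", compute_type="int8")
+ segments, info = model.transcribe("audio.mp3")
+ ```
+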
+ ## More information
+
+ **For more information about the original model, see its [model card](https://huggingface.co/openai/whisper-large-v3).**
faster-whisper-large-v3-turbo-ct2/config.json ADDED
@@ -0,0 +1,360 @@
+ {
+   "alignment_heads": [
+     [2, 0], [2, 1], [2, 2], [2, 3], [2, 4], [2, 5], [2, 6], [2, 7], [2, 8], [2, 9],
+     [2, 10], [2, 11], [2, 12], [2, 13], [2, 14], [2, 15], [2, 16], [2, 17], [2, 18], [2, 19],
+     [3, 0], [3, 1], [3, 2], [3, 3], [3, 4], [3, 5], [3, 6], [3, 7], [3, 8], [3, 9],
+     [3, 10], [3, 11], [3, 12], [3, 13], [3, 14], [3, 15], [3, 16], [3, 17], [3, 18], [3, 19]
+   ],
+   "lang_ids": [
+     50259, 50260, 50261, 50262, 50263, 50264, 50265, 50266, 50267, 50268,
+     50269, 50270, 50271, 50272, 50273, 50274, 50275, 50276, 50277, 50278,
+     50279, 50280, 50281, 50282, 50283, 50284, 50285, 50286, 50287, 50288,
+     50289, 50290, 50291, 50292, 50293, 50294, 50295, 50296, 50297, 50298,
+     50299, 50300, 50301, 50302, 50303, 50304, 50305, 50306, 50307, 50308,
+     50309, 50310, 50311, 50312, 50313, 50314, 50315, 50316, 50317, 50318,
+     50319, 50320, 50321, 50322, 50323, 50324, 50325, 50326, 50327, 50328,
+     50329, 50330, 50331, 50332, 50333, 50334, 50335, 50336, 50337, 50338,
+     50339, 50340, 50341, 50342, 50343, 50344, 50345, 50346, 50347, 50348,
+     50349, 50350, 50351, 50352, 50353, 50354, 50355, 50356, 50357, 50358
+   ],
+   "suppress_ids": [
+     1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63,
+     90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350,
+     1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667,
+     6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562,
+     13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075,
+     21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470,
+     36865, 42863, 47425, 49870, 50254, 50258, 50359, 50360, 50361, 50362, 50363
+   ],
+   "suppress_ids_begin": [
+     220, 50257
+   ]
+ }
faster-whisper-large-v3-turbo-ct2/preprocessor_config.json ADDED
@@ -0,0 +1,14 @@
+ {
+   "chunk_length": 30,
+   "feature_extractor_type": "WhisperFeatureExtractor",
+   "feature_size": 128,
+   "hop_length": 160,
+   "n_fft": 400,
+   "n_samples": 480000,
+   "nb_max_frames": 3000,
+   "padding_side": "right",
+   "padding_value": 0.0,
+   "processor_class": "WhisperProcessor",
+   "return_attention_mask": false,
+   "sampling_rate": 16000
+ }
faster-whisper-large-v3-turbo-ct2/vocabulary.json ADDED
The diff for this file is too large to render.
 
gen_english.py ADDED
@@ -0,0 +1,39 @@
+ import argparse
+ import os
+ ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
+
+ # ffmpeg_path = f"{ROOT_DIR}/bin" # 替换成你的 FFmpeg bin 目录
+ # os.environ["PATH"] = os.environ.get("PATH", "") + os.pathsep + ffmpeg_path
+
+ import gradio as gr
+
+ from utils import movie2audio,make_srt,make_tran,merge_sub,make_tran_zh2en,make_tran_ja2zh,make_tran_ko2zh,make_srt_sv,make_tran_qwen2,make_tran_deep
+
+ from subtitle_to_audio import generate_audio
+ import pyttsx3
+
+ engine = pyttsx3.init()
+ voices = engine.getProperty('voices') # getting details of current voice
+ vlist = []
+ num = 0
+ for voice in voices:
+     print(" - Name: %s" % voice.name)
+     vlist.append((voice.name,num))
+     num += 1
+
+ def do_pyttsx3(srt,speed,voice):
+
+     print(srt,speed,voice)
+
+     voice = int(voice)
+
+     generate_audio(path=srt,rate=int(speed),voice_idx=voice)
+
+     return f"output/{vlist[voice][0]}.wav"
+
+
+ if __name__ == '__main__':
+
+     do_pyttsx3("./output/eng.srt",240,3)
+
+
img/sample.png ADDED
model_from_hg/model here ADDED
@@ -0,0 +1,26 @@
+ from modelscope.pipelines import pipeline
+ from modelscope.utils.constant import Tasks
+ from moviepy.editor import VideoFileClip
+
+ model_dir_cirm = './models_from_modelscope/damo/speech_frcrn_ans_cirm_16k'
+
+
+ # 提取人声
+ def movie2audio(video_path):
+
+     # 读取视频文件
+     video = VideoFileClip(video_path)
+
+     # 提取视频文件中的声音
+     audio = video.audio
+
+     # 将声音保存为WAV格式
+     audio.write_audiofile("./audio.wav")
+
+     ans = pipeline(
+         Tasks.acoustic_noise_suppression,
+         model=model_dir_cirm)
+
+     ans('./audio.wav',output_path='./output.wav')
+
+     return "./output.wav"
models_from_modelscope/damo/speech_frcrn_ans_cirm_16k/.mdl ADDED
Binary file (53 Bytes).
 
models_from_modelscope/damo/speech_frcrn_ans_cirm_16k/.msc ADDED
Binary file (616 Bytes).
 
models_from_modelscope/damo/speech_frcrn_ans_cirm_16k/README.md ADDED
@@ -0,0 +1,285 @@
+ ---
+ tasks:
+ - acoustic-noise-suppression
+ widgets:
+ - task: acoustic-noise-suppression
+   inputs:
+   - type: audio
+     name: input
+     title: 带噪音的原始音频
+     validator:
+       max_size: 10M
+   examples:
+   - name: 1
+     title: 示例1
+     inputs:
+     - name: input
+       data: git://examples/speech_with_noise1.wav
+   - name: 2
+     title: 示例2
+     inputs:
+     - name: input
+       data: git://examples/speech_with_noise.wav
+   inferencespec:
+     cpu: 1
+     memory: 1000
+     gpu: 0
+     gpu_memory: 1000
+ model_type:
+ - complex-nn
+ domain:
+ - audio
+ frameworks:
+ - pytorch
+ model-backbone:
+ - frcrn
+ customized-quickstart: True
+ finetune-support: True
+ license: Apache License 2.0
+ tags:
+ - Alibaba
+ - Mind DNS
+ - ANS
+ - AI降噪
+ - 语音增强
+ - 音频前处理
+ - 3A
+ datasets:
+   train:
+   - modelscope/ICASSP_2021_DNS_Challenge
+   evaluation:
+   - modelscope/ICASSP_2021_DNS_Challenge
+
+ ---
+
+
+ # FRCRN语音降噪模型介绍
+
+ 我们日常可能会碰到一些录音质量不佳的场景。比如,想录制一段干净的语音却发现周围都很吵,录制的语音里往往混杂着噪声。当我们在噪杂的地铁或者巴士上通电话,为了让对方听清楚,不得不提高嗓门和音量。这都是因为环境噪声的影响,使我们在使用语音应用时出现障碍。这是语音通讯中一个普遍存在且又非常棘手的问题。语音质量(quality)和可懂度(intelligibility)容易受到环境噪声、拾音设备、混响及回声的干扰,使通话质量和交流效率大幅降低,如何在嘈杂的环境中保持较高的语音质量和可懂度一直以来是众多企业和学者追求的目标。
+
+ 语音降噪问题通过多年研发积累,已经取得一定的突破,尤其针对复杂环境中的语音降噪问题,通过融入复数域深度学习算法,在性能上获得大幅度的提升,在保障更小语音失真度的情况下,最大限度地消除背景噪声,还原目标语音的清晰度,因而语音降噪模型也通常被叫做语音增强模型。
+
+ 语音降噪模型的作用是从污染的语音中提取目标语音,还原目标语音质量和可懂度,同时提升语音识别的效果和性能。我们的语音降噪模型只需要输入单麦克风的录音音频,便能够输出降噪后的干净语音音频,即保持音频的格式不变,仅消除音频中的噪声和混响部分,最大限度地保留原始语音。
+
+ ## 模型描述
+
+ FRCRN语音降噪模型是基于频率循环 CRN (FRCRN) 新框架开发出来的。该框架是在卷积编-解码(Convolutional Encoder-Decoder)架构的基础上,通过进一步增加循环层获得的卷积循环编-解码(Convolutional Recurrent Encoder-Decoder)新型架构,可以明显改善卷积核的视野局限性,提升降噪模型对频率维度的特征表达,尤其是在频率长距离相关性表达上获得提升,可以在消除噪声的同时,对语音进行更针对性的辨识和保护。
+
+ 另外,我们引入前馈序列记忆网络(Feedforward Sequential Memory Network: FSMN)来降低循环网络的复杂性,以及结合复数域网络运算,实现全复数深度网络模型算法,不仅更有效地对长序列语音进行建模,同时对语音的幅度和相位进行同时增强,相关模型在IEEE/INTERSpeech DNS Challenge上有较好的表现。本次开放的模型在参赛版本基础上做了进一步优化,使用了两个Unet级联和SE layer,可以获得更为稳定的效果。如果用户需要因果模型,也可以自行修改代码,把模型中的SElayer替换成卷积层或者加上掩蔽即可。
+
+ 该模型神经网络结构如下图所示。
+
+ ![model.png](description/model.png)
+
+ 模型输入和输出均为16kHz采样率单通道语音时域波形信号,输入信号可由单通道麦克风直接进行录制,输出为噪声抑制后的语音音频信号[1]。模型输入信号通过STFT变换转换成复数频谱特征作为输入,并采用Complex FSMN在频域上进行关联性处理和在时序特征上进行长序处理,预测中间输出目标Complex ideal ratio mask, 然后使用预测的mask和输入频谱相乘后得到增强后的频谱,最后通过STFT逆变换得到增强后语音波形信号。
+
+ ## 期望模型使用方式以及适用范围
+
+
+ ### 如何使用
+
+ 在安装ModelScope完成之后即可使用```speech_frcrn_ans_cirm_16k```进行推理。模型输入和输出均为16kHz采样率单通道语音时域波形信号,输入信号可由单通道麦克风直接进行录制,输出为噪声抑制后的语音音频信号。为了方便使用,pipeline在模型处理前后增加了wav文件处理逻辑,可以直接读取一个wav文件,并把输出结果保存在指定的wav文件中。
+
+ #### 环境准备:
+
+ * 本模型支持Linux,Windows和MacOS平台。
+ * 本模型已经在PyTorch 1.8~1.11和1.13下测试通过,由于PyTorch v1.12的[BUG](https://github.com/pytorch/pytorch/issues/80837),无法在v1.12上运行,请升级到新版或运行以下命令回退到v1.11
+
+ ```
+ conda install pytorch==1.11 torchaudio torchvision -c pytorch
+ ```
+
+ * 本模型的pipeline中使用了三方库SoundFile进行wav文件处理,**在Linux系统上用户需要手动安装SoundFile的底层依赖库libsndfile**,在Windows和MacOS上会自动安装不需要用户操作。详细信息可参考[SoundFile官网](https://github.com/bastibe/python-soundfile#installation)。以Ubuntu系统为例,用户需要执行如下命令:
+
+ ```shell
+ sudo apt-get update
+ sudo apt-get install libsndfile1
+ ```
+
+ #### 代码范例
+
+ ```python
+ from modelscope.pipelines import pipeline
+ from modelscope.utils.constant import Tasks
+
+
+ ans = pipeline(
+     Tasks.acoustic_noise_suppression,
+     model='damo/speech_frcrn_ans_cirm_16k')
+ result = ans(
+     'https://modelscope.oss-cn-beijing.aliyuncs.com/test/audios/speech_with_noise1.wav',
+     output_path='output.wav')
+ ```
+
+ ### 模型局限性以及可能的偏差
+
+ 模型在存在多说话人干扰声的场景噪声抑制性能有不同程度的下降。
+
+ ## 训练数据介绍
+
+ 模型的训练数据来自DNS-Challenge开源数据集,是Microsoft团队为ICASSP相关挑战赛提供的,[官方网址](https://github.com/microsoft/DNS-Challenge)[2]。我们这个模型是用来处理16k音频,因此只使用了其中的fullband数据,并做了少量调整。为便于大家使用,我们把DNS Challenge 2020的数据集迁移在modelscope的[DatasetHub](https://modelscope.cn/datasets/modelscope/ICASSP_2021_DNS_Challenge/summary)上,用户可参照数据集说明文档下载使用。
+
+ ## 模型训练流程
+
+ ### 复制官方模型
+ 要训练您自己的降噪模型,首先需要一份官方模型的副本。ModelScope 框架默认把官方模型保存在本地缓存中,可以把本地缓存的模型目录copy一份到您的工作目录。
+
+ 检查目录./speech_frcrn_ans_cirm_16k,其中的 pytorch_model.bin 就是模型文件。如果想从头开始训练一个全新的模型,请删除掉这里的 pytorch_model.bin,避免程序运行时加载;如果想基于官方模型继续训练则不要删除。
+
+ ```bash
+ cp -r ~/.cache/modelscope/hub/damo/speech_frcrn_ans_cirm_16k ./
+ cd ./speech_frcrn_ans_cirm_16k
+ rm pytorch_model.bin
+ ```
+
+ 目录中的configuration.json文件中是模型和训练的配置项,建议用户对代码逻辑非常熟悉以后再尝试修改。
+
+ ### 运行训练代码
+
+ 以下列出的为训练示例代码,其中有两个地方需要替换成您的本地路径:
+
+ 1. 用您前面下载的本地数据集路径替换`/your_local_path/ICASSP_2021_DNS_Challenge`
+ 2. 用您复制的官方模型路径替换模型路径
+
+ ```python
+ import os
+
+ from datasets import load_dataset
+
+ from modelscope.metainfo import Trainers
+ from modelscope.msdatasets import MsDataset
+ from modelscope.trainers import build_trainer
+ from modelscope.utils.audio.audio_utils import to_segment
+
+ tmp_dir = './checkpoint'
+ if not os.path.exists(tmp_dir):
+     os.makedirs(tmp_dir)
+
+ hf_ds = load_dataset(
+     '/your_local_path/ICASSP_2021_DNS_Challenge',
+     'train',
+     split='train')
+ mapped_ds = hf_ds.map(
+     to_segment,
+     remove_columns=['duration'],
+     num_proc=8,
+     batched=True,
+     batch_size=36)
+ mapped_ds = mapped_ds.train_test_split(test_size=3000)
+ mapped_ds = mapped_ds.shuffle()
+ dataset = MsDataset.from_hf_dataset(mapped_ds)
+
+ kwargs = dict(
+     model='your_local_path/speech_frcrn_ans_cirm_16k',
+     train_dataset=dataset['train'],
+     eval_dataset=dataset['test'],
+     work_dir=tmp_dir)
+ trainer = build_trainer(
+     Trainers.speech_frcrn_ans_cirm_16k, default_args=kwargs)
+ trainer.train()
+ ```
+
+ 训练按照默认配置共200轮,每轮2000个batch,训练出的模型文件会保存在代码中tmp_dir = './checkpoint'指定的目录。目录下还有一个log文件,记录了每个模型的训练和测试loss数据。
+
+ ### 使用您的模型
+
+ 从您训练出的模型中选择效果最好的,把模型文件copy到 `/your_local_path/speech_frcrn_ans_cirm_16k` ,重命名为 `pytorch_model.bin` 。
+ 把以下代码中模型路径 `/your_local_path/speech_frcrn_ans_cirm_16k` 替换为您复制的模型目录,就可以测试您的模型效果了。
+
+ ```python
+ from modelscope.pipelines import pipeline
+ from modelscope.utils.constant import Tasks
+
+
+ ans = pipeline(
+     Tasks.acoustic_noise_suppression,
+     model='/your_local_path/speech_frcrn_ans_cirm_16k')
+ result = ans(
+     'https://modelscope.oss-cn-beijing.aliyuncs.com/test/audios/speech_with_noise.wav',
+     output_path='output.wav')
+ ```
+
+ 代码中的http地址也可以换成您的本地音频文件路径,注意模型支持的音频格式是采样率16000,16bit的单通道wav文件。如果您有多个文件需要处理,只需要循环调用ans()方法即可。如果要多线程处理则需要在每个线程内运行pipeline()初始化一个ans对象。
+
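+ 例如,批量处理多个本地文件时可以这样循环调用(示意代码,其中 `noisy/`、`denoised/` 目录名仅为示例,需替换为您自己的路径):
+
+ ```python
+ import glob
+ import os
+
+ from modelscope.pipelines import pipeline
+ from modelscope.utils.constant import Tasks
+
+ ans = pipeline(
+     Tasks.acoustic_noise_suppression,
+     model='damo/speech_frcrn_ans_cirm_16k')
+
+ os.makedirs('denoised', exist_ok=True)
+ # 逐个处理 16k 采样率、16bit 单通道的 wav 文件
+ for wav_path in glob.glob('noisy/*.wav'):
+     out_path = os.path.join('denoised', os.path.basename(wav_path))
+     ans(wav_path, output_path=out_path)
+ ```
+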
+ ## 数据评估及结果
+
+ 与其他SOTA模型在DNS Challenge 2020官方测试集上对比效果如下:
+
+ ![matrix.png](description/matrix.png)
+
+ 指标说明:
+
+ * PESQ (Perceptual Evaluation Of Speech Quality) 语音质量感知评估,是一种客观的、全参考的语音质量评估方法,得分范围在-0.5--4.5之间,得分越高表示语音质量越好。
+ * STOI (Short-Time Objective Intelligibility) 短时客观可懂度,反映人类的听觉感知系统对语音可懂度的客观评价,STOI 值介于0~1 之间,值越大代表语音可懂度越高,越清晰。
+ * SI-SNR (Scale Invariant Signal-to-Noise Ratio) 尺度不变的信噪比,是在普通信噪比基础上通过正则化消减信号变化导致的影响,是针对宽带噪声失真的语音增强算法的常规衡量方法。
+
+ DNS Challenge的结果列表在[这里](https://www.microsoft.com/en-us/research/academic-program/deep-noise-suppression-challenge-icassp-2022/results/)。
+
+ ### 模型评估代码
+ 可通过如下代码对模型进行评估验证,我们在modelscope的[DatasetHub](https://modelscope.cn/datasets/modelscope/ICASSP_2021_DNS_Challenge/summary)上存储了DNS Challenge 2020的验证集,方便用户下载调用。
+
+ ```python
+ import os
+ import tempfile
+
+ from modelscope.metainfo import Trainers
+ from modelscope.msdatasets import MsDataset
+ from modelscope.trainers import build_trainer
+ from modelscope.utils.audio.audio_utils import to_segment
+
+ tmp_dir = tempfile.TemporaryDirectory().name
+ if not os.path.exists(tmp_dir):
+     os.makedirs(tmp_dir)
+
+ hf_ds = MsDataset.load(
+     'ICASSP_2021_DNS_Challenge', split='test').to_hf_dataset()
+ mapped_ds = hf_ds.map(
+     to_segment,
+     remove_columns=['duration'],
+     # num_proc=5,  # Comment this line to avoid error in Jupyter notebook
+     batched=True,
+     batch_size=36)
+ dataset = MsDataset.from_hf_dataset(mapped_ds)
+ kwargs = dict(
+     model='damo/speech_frcrn_ans_cirm_16k',
+     model_revision='beta',
+     train_dataset=None,
+     eval_dataset=dataset,
+     val_iters_per_epoch=125,
+     work_dir=tmp_dir)
+
+ trainer = build_trainer(
+     Trainers.speech_frcrn_ans_cirm_16k, default_args=kwargs)
+
+ eval_res = trainer.evaluate()
+ print(eval_res['avg_sisnr'])
+
+ ```
+
+ 更多详情请参考下面相关论文。
+
+ ### 相关论文以及引用信息
+
+ [1]
+
+ ```BibTeX
+ @INPROCEEDINGS{9747578,
+   author={Zhao, Shengkui and Ma, Bin and Watcharasupat, Karn N. and Gan, Woon-Seng},
+   booktitle={ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
+   title={FRCRN: Boosting Feature Representation Using Frequency Recurrence for Monaural Speech Enhancement},
+   year={2022},
+   pages={9281-9285},
+   doi={10.1109/ICASSP43922.2022.9747578}}
+ ```
+
+ [2]
+
+ ```BibTeX
+ @INPROCEEDINGS{9747230,
+   author={Dubey, Harishchandra and Gopal, Vishak and Cutler, Ross and Aazami, Ashkan and Matusevych, Sergiy and Braun, Sebastian and Eskimez, Sefik Emre and Thakker, Manthan and Yoshioka, Takuya and Gamper, Hannes and Aichner, Robert},
+   booktitle={ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
+   title={Icassp 2022 Deep Noise Suppression Challenge},
+   year={2022},
+   pages={9271-9275},
+   doi={10.1109/ICASSP43922.2022.9747230}}
+ ```
models_from_modelscope/damo/speech_frcrn_ans_cirm_16k/configuration.json ADDED
@@ -0,0 +1,65 @@
+ {
+     "framework": "pytorch",
+     "task": "acoustic-noise-suppression",
+     "framework": "pytorch",
+     "pipeline": {
+         "type": "speech_frcrn_ans_cirm_16k"
+     },
+     "model": {
+         "type": "speech_frcrn_ans_cirm_16k",
+         "complex": true,
+         "model_complexity": 45,
+         "model_depth": 14,
+         "log_amp": false,
+         "padding_mode": "zeros",
+         "win_len": 640,
+         "win_inc": 320,
+         "fft_len": 640,
+         "win_type": "hann"
+     },
+     "preprocessor": {},
+     "train": {
+         "max_epochs": 200,
+         "train_iters_per_epoch": 2000,
+         "dataloader": {
+             "batch_size_per_gpu": 12,
+             "workers_per_gpu": 0
+         },
+         "seed": 20,
+         "optimizer": {
+             "type": "Adam",
+             "lr": 0.001,
+             "weight_decay": 0.00001,
+             "options": {
+                 "grad_clip": {
+                     "max_norm": 10.0
+                 }
+             }
+         },
+         "lr_scheduler": {
+             "type": "ReduceLROnPlateau",
+             "mode": "min",
+             "factor": 0.98,
+             "patience": 2,
+             "verbose": true
+         },
+         "lr_scheduler_hook": {
+             "type": "PlateauLrSchedulerHook",
+             "metric_key": "avg_loss"
+         },
+         "hooks": [
+             {
+                 "type": "EvaluationHook",
+                 "interval": 1
+             }
+         ]
+     },
+     "evaluation": {
+         "val_iters_per_epoch": 200,
+         "dataloader": {
+             "batch_size_per_gpu": 12,
+             "workers_per_gpu": 0
+         },
+         "metrics": ["audio-noise-metric"]
+     }
+ }
models_from_modelscope/damo/speech_frcrn_ans_cirm_16k/description/matrix.png ADDED
models_from_modelscope/damo/speech_frcrn_ans_cirm_16k/description/model.png ADDED
models_from_modelscope/damo/speech_frcrn_ans_cirm_16k/faq.md ADDED
@@ -0,0 +1,11 @@
+ ## Q: 模型处理后的音频听起来有问题?
+ A: 建议先确认一下音频格式是否16KHz采样率单通道wav音频,音频内容是否带噪音的语音。
+
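+ 例如,可以先用 librosa 和 soundfile 把音频重采样为 16kHz 单通道 wav 再送入模型(示意代码,文件名仅为示例):
+
+ ```python
+ import librosa
+ import soundfile as sf
+
+ # 读取任意采样率/声道数的音频,统一转成 16kHz 单通道
+ audio, sr = librosa.load('input.mp3', sr=16000, mono=True)
+ sf.write('input_16k.wav', audio, 16000, subtype='PCM_16')
+ ```
+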
+ ## Q: 这个模型 cpu 推理较慢怎么办?
+ A: FRCRN语音降噪这一版模型的运算量是比较大的,特别是在CPU上处理耗时相对比较长,在模型不变的情况下没有什么很好的优化方案。建议使用GPU来提升速度,通常能够比CPU提升几倍到几十倍,不过GPU第一次使用需要初始化CUDA所以会比第二次调用耗时长一些。
+
+ ## Q: 模型是否支持导出为ONNX格式?
+ A: 不支持导出。
+
+ ## Q: 模型训练速度很慢,一个epoch要跑10个小时左右,请问这是正常的吗?
+ A: 这种情况不正常,目前训练流程默认使用单卡,通常V100单卡跑一个epoch约40分钟。您训练的时候可以观察一下cpu和gpu的占用情况。
requirements.txt ADDED
@@ -0,0 +1,16 @@
+ faster-whisper
+ ffmpeg-python
+ gradio
+ modelscope==1.10.0
+ moviepy==1.0.3
+ transformers==4.36.2
+ sentencepiece
+ librosa
+ tensorflow
+ sacremoses
+ subword_nmt
+ jieba
+ funasr>=1.1.1
+ ollama
+ pysub-parser==1.7.0
+ pyttsx3==2.90
slicer2.py ADDED
@@ -0,0 +1,186 @@
+ import numpy as np
+
+
+ # This function is obtained from librosa.
+ def get_rms(
+     y,
+     *,
+     frame_length=2048,
+     hop_length=512,
+     pad_mode="constant",
+ ):
+     padding = (int(frame_length // 2), int(frame_length // 2))
+     y = np.pad(y, padding, mode=pad_mode)
+
+     axis = -1
+     # put our new within-frame axis at the end for now
+     out_strides = y.strides + tuple([y.strides[axis]])
+     # Reduce the shape on the framing axis
+     x_shape_trimmed = list(y.shape)
+     x_shape_trimmed[axis] -= frame_length - 1
+     out_shape = tuple(x_shape_trimmed) + tuple([frame_length])
+     xw = np.lib.stride_tricks.as_strided(
+         y, shape=out_shape, strides=out_strides
+     )
+     if axis < 0:
+         target_axis = axis - 1
+     else:
+         target_axis = axis + 1
+     xw = np.moveaxis(xw, -1, target_axis)
+     # Downsample along the target axis
+     slices = [slice(None)] * xw.ndim
+     slices[axis] = slice(0, None, hop_length)
+     x = xw[tuple(slices)]
+
+     # Calculate power
+     power = np.mean(np.abs(x) ** 2, axis=-2, keepdims=True)
+
+     return np.sqrt(power)
+
+
+ class Slicer:
+     def __init__(self,
+                  sr: int,
+                  threshold: float = -40.,
+                  min_length: int = 5000,
+                  min_interval: int = 300,
+                  hop_size: int = 20,
+                  max_sil_kept: int = 5000):
+         if not min_length >= min_interval >= hop_size:
+             raise ValueError('The following condition must be satisfied: min_length >= min_interval >= hop_size')
+         if not max_sil_kept >= hop_size:
+             raise ValueError('The following condition must be satisfied: max_sil_kept >= hop_size')
+         min_interval = sr * min_interval / 1000
+         self.threshold = 10 ** (threshold / 20.)
+         self.hop_size = round(sr * hop_size / 1000)
+         self.win_size = min(round(min_interval), 4 * self.hop_size)
+         self.min_length = round(sr * min_length / 1000 / self.hop_size)
+         self.min_interval = round(min_interval / self.hop_size)
+         self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size)
+
+     def _apply_slice(self, waveform, begin, end):
+         if len(waveform.shape) > 1:
+             return waveform[:, begin * self.hop_size: min(waveform.shape[1], end * self.hop_size)]
+         else:
+             return waveform[begin * self.hop_size: min(waveform.shape[0], end * self.hop_size)]
+
+     # @timeit
+     def slice(self, waveform):
+         if len(waveform.shape) > 1:
+             samples = waveform.mean(axis=0)
+         else:
+             samples = waveform
+         if (samples.shape[0] + self.hop_size - 1) // self.hop_size <= self.min_length:
+             return [waveform]
+         rms_list = get_rms(y=samples, frame_length=self.win_size, hop_length=self.hop_size).squeeze(0)
+         sil_tags = []
+         silence_start = None
+         clip_start = 0
+         for i, rms in enumerate(rms_list):
+             # Keep looping while frame is silent.
+             if rms < self.threshold:
+                 # Record start of silent frames.
+                 if silence_start is None:
+                     silence_start = i
+                 continue
+             # Keep looping while frame is not silent and silence start has not been recorded.
+             if silence_start is None:
+                 continue
+             # Clear recorded silence start if interval is not enough or clip is too short
+             is_leading_silence = silence_start == 0 and i > self.max_sil_kept
+             need_slice_middle = i - silence_start >= self.min_interval and i - clip_start >= self.min_length
+             if not is_leading_silence and not need_slice_middle:
+                 silence_start = None
+                 continue
+             # Need slicing. Record the range of silent frames to be removed.
+             if i - silence_start <= self.max_sil_kept:
+                 pos = rms_list[silence_start: i + 1].argmin() + silence_start
+                 if silence_start == 0:
+                     sil_tags.append((0, pos))
+                 else:
+                     sil_tags.append((pos, pos))
+                 clip_start = pos
+             elif i - silence_start <= self.max_sil_kept * 2:
+                 pos = rms_list[i - self.max_sil_kept: silence_start + self.max_sil_kept + 1].argmin()
+                 pos += i - self.max_sil_kept
+                 pos_l = rms_list[silence_start: silence_start + self.max_sil_kept + 1].argmin() + silence_start
+                 pos_r = rms_list[i - self.max_sil_kept: i + 1].argmin() + i - self.max_sil_kept
+                 if silence_start == 0:
+                     sil_tags.append((0, pos_r))
+                     clip_start = pos_r
+                 else:
+                     sil_tags.append((min(pos_l, pos), max(pos_r, pos)))
+                     clip_start = max(pos_r, pos)
+             else:
+                 pos_l = rms_list[silence_start: silence_start + self.max_sil_kept + 1].argmin() + silence_start
+                 pos_r = rms_list[i - self.max_sil_kept: i + 1].argmin() + i - self.max_sil_kept
+                 if silence_start == 0:
+                     sil_tags.append((0, pos_r))
+                 else:
+                     sil_tags.append((pos_l, pos_r))
+                 clip_start = pos_r
+             silence_start = None
+         # Deal with trailing silence.
+         total_frames = rms_list.shape[0]
+         if silence_start is not None and total_frames - silence_start >= self.min_interval:
+             silence_end = min(total_frames, silence_start + self.max_sil_kept)
+             pos = rms_list[silence_start: silence_end + 1].argmin() + silence_start
+             sil_tags.append((pos, total_frames + 1))
+         # Apply and return slices.
+         if len(sil_tags) == 0:
+             return [waveform]
+         else:
+             chunks = []
+             if sil_tags[0][0] > 0:
+                 chunks.append(self._apply_slice(waveform, 0, sil_tags[0][0]))
+             for i in range(len(sil_tags) - 1):
+                 chunks.append(self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0]))
+             if sil_tags[-1][1] < total_frames:
+                 chunks.append(self._apply_slice(waveform, sil_tags[-1][1], total_frames))
+             return chunks
+
+
+ def main():
+     import os.path
+     from argparse import ArgumentParser
+
+     import librosa
+     import soundfile
+
+     parser = ArgumentParser()
+     parser.add_argument('audio', type=str, help='The audio to be sliced')
+     parser.add_argument('--out', type=str, help='Output directory of the sliced audio clips')
+     parser.add_argument('--db_thresh', type=float, required=False, default=-40,
+                         help='The dB threshold for silence detection')
+     parser.add_argument('--min_length', type=int, required=False, default=1500,
+                         help='The minimum milliseconds required for each sliced audio clip')
+     parser.add_argument('--min_interval', type=int, required=False, default=300,
+                         help='The minimum milliseconds for a silence part to be sliced')
+     parser.add_argument('--hop_size', type=int, required=False, default=10,
+                         help='Frame length in milliseconds')
+     parser.add_argument('--max_sil_kept', type=int, required=False, default=500,
+                         help='The maximum silence length kept around the sliced clip, presented in milliseconds')
+     args = parser.parse_args()
+     out = args.out
+     if out is None:
+         out = os.path.dirname(os.path.abspath(args.audio))
+     audio, sr = librosa.load(args.audio, sr=None, mono=False)
+     slicer = Slicer(
+         sr=sr,
+         threshold=args.db_thresh,
+         min_length=args.min_length,
+         min_interval=args.min_interval,
+         hop_size=args.hop_size,
+         max_sil_kept=args.max_sil_kept
+     )
+     chunks = slicer.slice(audio)
+     if not os.path.exists(out):
+         os.makedirs(out)
+     for i, chunk in enumerate(chunks):
+         if len(chunk.shape) > 1:
+             chunk = chunk.T
+         soundfile.write(os.path.join(out, f'%s_%d.wav' % (os.path.basename(args.audio).rsplit('.', maxsplit=1)[0], i)), chunk, sr)
+
+
+ if __name__ == '__main__':
+     main()
subtitle_to_audio.py ADDED
@@ -0,0 +1,65 @@
1
+ import os
2
+ import tempfile
3
+ import argparse
4
+ from pysubparser import parser
5
+ from pydub import AudioSegment
6
+ import pyttsx3
7
+
8
+
9
+ engine = pyttsx3.init()
10
+ voices = engine.getProperty('voices') # getting details of current voice
11
+ vlist = []
12
+ for voice in voices:
13
+ vlist.append(voice.name)
14
+
15
+ def time_to_ms(time):
16
+ return ((time.hour * 60 + time.minute) * 60 + time.second) * 1000 + time.microsecond / 1000
17
+
18
+ def generate_audio(path, rate=200, voice_idx=0):
19
+ print("Generating audio file for {} with {}".format(path, "pyttsx3"))
20
+
21
+ subtitles = parser.parse(path)
22
+
23
+ tts_engine = pyttsx3.init()
24
+ tts_engine.setProperty('rate', rate)
25
+ tts_engine.setProperty('voice', tts_engine.getProperty('voices')[voice_idx].id)
26
+
27
+ audio_sum = AudioSegment.empty()
28
+
29
+ with tempfile.TemporaryDirectory() as tmpdirname:
30
+ print('created temporary directory', tmpdirname)
31
+
32
+ temp_file_path = os.path.join(tmpdirname, "temp.wav")
33
+ prev_subtitle = None
34
+ prev_audio_duration_ms = 0
35
+ for subtitle in subtitles:
36
+ tts_engine.save_to_file(subtitle.text, temp_file_path)
37
+ tts_engine.runAndWait()
38
+
39
+ audio_segment = AudioSegment.from_wav(temp_file_path)
40
+
41
+ print(subtitle.start, subtitle.text)
42
+
43
+ if prev_subtitle is None:
44
+ silence_duration_ms = time_to_ms(subtitle.start)
45
+ else:
46
+ silence_duration_ms = time_to_ms(subtitle.start) - time_to_ms(prev_subtitle.start) - prev_audio_duration_ms
47
+
48
+ audio_sum = audio_sum + AudioSegment.silent(duration=silence_duration_ms) + audio_segment
49
+
50
+ prev_subtitle = subtitle
51
+ prev_audio_duration_ms = len(audio_segment)
52
+
53
+ with open(f'output/{vlist[voice_idx]}.wav', 'wb') as out_f:
54
+ audio_sum.export(out_f, format='wav')
55
+
56
+ if __name__ == "__main__":
57
+ arg_parser = argparse.ArgumentParser()
58
+ arg_parser.add_argument("-p", "--path", help="subtitle file path",default="two_single.srt")
59
+ arg_parser.add_argument("-r", "--rate", help="speech rate(words per minute)", type=int, default=240)
60
+ arg_parser.add_argument("-v", "--voice-idx", help="voice selection", type=int, default=1, choices=[0, 1])
61
+
62
+ args = arg_parser.parse_args()
63
+
64
+ generate_audio(path=args.path, rate=args.rate, voice_idx=args.voice_idx)
65
+
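
generate_audio above keeps each line aligned to its SRT timestamp by padding the running track with silence before every synthesized clip; note that it writes to output/<voice name>.wav, so the output directory has to exist beforehand. A small sketch of the gap computation, using the same time_to_ms helper; the max(0, ...) guard is an addition here, since the script above can produce a negative silence duration when a clip runs past the next timestamp:

from datetime import time
from typing import Optional

def time_to_ms(t: time) -> float:
    return ((t.hour * 60 + t.minute) * 60 + t.second) * 1000 + t.microsecond / 1000

def gap_before(cur_start: time, prev_start: Optional[time], prev_audio_ms: float) -> float:
    # silence to insert before the current subtitle's clip
    if prev_start is None:
        return time_to_ms(cur_start)
    return max(0.0, time_to_ms(cur_start) - time_to_ms(prev_start) - prev_audio_ms)

# the previous line started at 00:00:01.000 and its clip lasted 1800 ms;
# the next line starts at 00:00:04.500, so 1700 ms of silence is inserted
print(gap_before(time(0, 0, 4, 500000), time(0, 0, 1), 1800))  # 1700.0
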
test_deep.py ADDED
@@ -0,0 +1,15 @@
1
+ import httpx, json
2
+
3
+ deeplx_api = "http://127.0.0.1:1188/translate"
4
+
5
+ data = {
6
+ "text": "Hello World",
7
+ "source_lang": "EN",
8
+ "target_lang": "ZH"
9
+ }
10
+
11
+ # JA KO
12
+
13
+ post_data = json.dumps(data)
14
+ r = httpx.post(url = deeplx_api, data = post_data).json()
15
+ print(r["data"])
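
The same request, sketched with a timeout and a fallback to the untranslated text when the local DeepLX service is unreachable; the helper name and the fallback behaviour are illustrative, not part of the repo:

import httpx

DEEPLX_API = "http://127.0.0.1:1188/translate"

def deeplx_translate(text: str, source_lang: str = "EN", target_lang: str = "ZH") -> str:
    payload = {"text": text, "source_lang": source_lang, "target_lang": target_lang}
    try:
        r = httpx.post(DEEPLX_API, json=payload, timeout=10)
        r.raise_for_status()
        return r.json()["data"]
    except (httpx.HTTPError, KeyError, ValueError):
        # fall back to the original line so subtitle numbering stays intact
        return text

print(deeplx_translate("Hello World"))
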
test_ollama.py ADDED
@@ -0,0 +1,9 @@
1
+ import ollama
2
+
3
+
4
+ response = ollama.chat(model='qwen2:7b',messages=[
5
+ {
6
+ 'role':'user',
7
+ 'content':'"you fucked up , bitch" 翻译为中文,只给我文本的翻译,别添加其他的内容,因为我要做字幕,谢谢'
8
+ }])
9
+ print(response['message']['content'])
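
utils.make_tran_qwen2 further below applies this prompt pattern to every subtitle line; a condensed sketch of that loop, with the function name and the sample lines as placeholders:

import ollama

def translate_lines(lines, model="qwen2:7b", lang="中文"):
    out = []
    for text in lines:
        prompt = f'"{text}" 翻译为{lang},只给我文本的翻译,别添加其他的内容,因为我要做字幕,谢谢'
        resp = ollama.chat(model=model, messages=[{'role': 'user', 'content': prompt}])
        out.append(resp['message']['content'].strip())
    return out

print(translate_lines(["Hello World", "See you tomorrow"]))
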
test_turbo.py ADDED
@@ -0,0 +1,7 @@
1
+ from faster_whisper import WhisperModel
2
+
3
+ model = WhisperModel("faster-whisper-large-v3-turbo-ct2")
4
+
5
+ segments, info = model.transcribe("audio.wav")
6
+ for segment in segments:
7
+ print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
utils.py ADDED
@@ -0,0 +1,802 @@
1
+ from modelscope.pipelines import pipeline as pipeline_ali
2
+ from modelscope.utils.constant import Tasks
3
+ from moviepy.editor import VideoFileClip
4
+
5
+ import httpx, json
6
+
7
+ import os
8
+ ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
9
+
10
+
11
+ import ffmpeg
12
+
13
+ from faster_whisper import WhisperModel
14
+ import math
15
+
16
+ import torch
17
+
18
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM,pipeline
19
+
20
+ from slicer2 import Slicer
21
+
22
+ import librosa
23
+
24
+ import soundfile
25
+
26
+ from funasr import AutoModel
27
+
28
+ from funasr.utils.postprocess_utils import rich_transcription_postprocess
29
+
30
+ # local directory for the downloaded ModelScope models
31
+ local_dir_root = "./models_from_modelscope"
32
+
33
+ # model_dir_cirm = snapshot_download('damo/speech_frcrn_ans_cirm_16k', cache_dir=local_dir_root)
34
+
35
+ # model_dir_ins = snapshot_download('damo/nlp_csanmt_translation_en2zh', cache_dir=local_dir_root)
36
+
37
+
38
+ model_dir_cirm = f'{ROOT_DIR}/models_from_modelscope/damo/speech_frcrn_ans_cirm_16k'
39
+
40
+ model_dir_ins = f'{ROOT_DIR}/models_from_modelscope/damo/nlp_csanmt_translation_en2zh'
41
+
42
+
43
+ device = "cuda" if torch.cuda.is_available() else "cpu"
44
+
45
+ import ollama
46
+
47
+
48
+ def deep_tran(text,_s,_t):
49
+
50
+ deeplx_api = "http://127.0.0.1:1188/translate"
51
+
52
+ data = {
53
+ "text": text,
54
+ "source_lang": _s,
55
+ "target_lang": _t
56
+ }
57
+
58
+ post_data = json.dumps(data)
59
+ r = httpx.post(url = deeplx_api, data = post_data).json()
60
+ print(r["data"])
61
+ return r["data"]
62
+
63
+ # merge subtitles into the video
64
+ def merge_sub(video_path,srt_path):
65
+
66
+ if os.path.exists("test_srt.mp4"):
67
+ os.remove("test_srt.mp4")
68
+
69
+ ffmpeg.input(video_path).output("test_srt.mp4", vf="subtitles=" + srt_path).run()
70
+
71
+ return "test_srt.mp4"
72
+
73
+
74
+ def make_tran_ja2zh_neverLife(srt_path):
75
+
76
+ model_path = "neverLife/nllb-200-distilled-600M-ja-zh"
77
+
78
+ model = AutoModelForSeq2SeqLM.from_pretrained(model_path, from_pt=True)
79
+ tokenizer = AutoTokenizer.from_pretrained(model_path, src_lang="jpn_Jpan", tgt_lang="zho_Hans", from_pt=True)
80
+
81
+ # pipe = pipeline(model="larryvrh/mt5-translation-ja_zh")
82
+
83
+ with open(srt_path, 'r',encoding="utf-8") as file:
84
+ gweight_data = file.read()
85
+
86
+ result = gweight_data.split("\n\n")
87
+
88
+ if os.path.exists("./two.srt"):
89
+ os.remove("./two.srt")
90
+
91
+ for res in result:
92
+
93
+ line_srt = res.split("\n")
94
+
95
+ try:
96
+ # translated_text = pipe(f'<-ja2zh-> {line_srt[2]}')[0]['translation_text']
97
+ # print(translated_text)
98
+ input_ids = tokenizer.encode(line_srt[2], max_length=128, padding=True, return_tensors='pt')
99
+ outputs = model.generate(input_ids, num_beams=4, max_new_tokens=128)
100
+ translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
101
+ print(translated_text)
102
+
103
+ except IndexError as e:
104
+ # index out of range: no more subtitle blocks
105
+ print(f"翻译完毕")
106
+ break
107
+ except Exception as e:
108
+ print(str(e))
109
+
110
+
111
+ with open("./two.srt","a",encoding="utf-8")as f:f.write(f"{line_srt[0]}\n{line_srt[1]}\n{line_srt[2]}\n{translated_text}\n\n")
112
+
113
+ with open("./two.srt","r",encoding="utf-8") as f:
114
+ content = f.read()
115
+
116
+ return content
117
+
118
+
119
+
120
+ def make_tran_ko2zh(srt_path):
121
+
122
+ # pipe = pipeline(model="yesj1234/mbart_cycle1_ko-zh",device=device,from_pt=True)
123
+
124
+ model_path = "./model_from_hg/ko-zh/"
125
+
126
+ tokenizer = AutoTokenizer.from_pretrained(model_path,local_files_only=True)
127
+ model = AutoModelForSeq2SeqLM.from_pretrained(model_path,local_files_only=True)
128
+
129
+ with open(srt_path, 'r',encoding="utf-8") as file:
130
+ gweight_data = file.read()
131
+
132
+ result = gweight_data.split("\n\n")
133
+
134
+ if os.path.exists("./two.srt"):
135
+ os.remove("./two.srt")
136
+
137
+ for res in result:
138
+
139
+ line_srt = res.split("\n")
140
+
141
+ try:
142
+
143
+ # translated_text = pipe(f'<-ja2zh-> {line_srt[2]}')[0]['translation_text']
144
+ # print(translated_text)
145
+
146
+ input_ids = tokenizer.encode(line_srt[2], max_length=128, padding=True, return_tensors='pt')
147
+ outputs = model.generate(input_ids, num_beams=4, max_new_tokens=128)
148
+ translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
149
+ print(translated_text)
150
+
151
+ except IndexError as e:
152
+ # index out of range: no more subtitle blocks
153
+ print(f"翻译完毕")
154
+ break
155
+ except Exception as e:
156
+ print(str(e))
157
+
158
+
159
+ with open("./two.srt","a",encoding="utf-8")as f:f.write(f"{line_srt[0]}\n{line_srt[1]}\n{line_srt[2]}\n{translated_text}\n\n")
160
+
161
+ with open("./two.srt","r",encoding="utf-8") as f:
162
+ content = f.read()
163
+
164
+ return content
165
+
166
+ def make_tran_ja2zh(srt_path):
167
+
168
+ # pipe = pipeline(model="larryvrh/mt5-translation-ja_zh",device=device)
169
+
170
+
171
+ model_path = "./model_from_hg/ja-zh/"
172
+
173
+ tokenizer = AutoTokenizer.from_pretrained(model_path,local_files_only=True)
174
+ model = AutoModelForSeq2SeqLM.from_pretrained(model_path,local_files_only=True)
175
+
176
+ with open(srt_path, 'r',encoding="utf-8") as file:
177
+ gweight_data = file.read()
178
+
179
+ result = gweight_data.split("\n\n")
180
+
181
+ if os.path.exists("./two.srt"):
182
+ os.remove("./two.srt")
183
+
184
+ for res in result:
185
+
186
+ line_srt = res.split("\n")
187
+
188
+ try:
189
+
190
+ # translated_text = pipe(f'<-ja2zh-> {line_srt[2]}')[0]['translation_text']
191
+ # print(translated_text)
192
+
193
+ input_ids = tokenizer.encode(f'<-ja2zh-> {line_srt[2]}', max_length=128, padding=True, return_tensors='pt')
194
+ outputs = model.generate(input_ids, num_beams=4, max_new_tokens=128)
195
+ translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
196
+ print(translated_text)
197
+
198
+
199
+
200
+ except IndexError as e:
201
+ # index out of range: no more subtitle blocks
202
+ print(f"翻译完毕")
203
+ break
204
+ except Exception as e:
205
+ print(str(e))
206
+
207
+
208
+ with open("./two.srt","a",encoding="utf-8")as f:f.write(f"{line_srt[0]}\n{line_srt[1]}\n{line_srt[2]}\n{translated_text}\n\n")
209
+
210
+ with open("./two.srt","r",encoding="utf-8") as f:
211
+ content = f.read()
212
+
213
+ return content
214
+
215
+
216
+ def make_tran_zh2en(srt_path):
217
+
218
+ model_path = "./model_from_hg/zh-en/"
219
+
220
+ tokenizer = AutoTokenizer.from_pretrained(model_path,local_files_only=True)
221
+
222
+ model = AutoModelForSeq2SeqLM.from_pretrained(model_path,local_files_only=True)
223
+
224
+ with open(srt_path, 'r',encoding="utf-8") as file:
225
+ gweight_data = file.read()
226
+
227
+ result = gweight_data.split("\n\n")
228
+
229
+ if os.path.exists("./two.srt"):
230
+ os.remove("./two.srt")
231
+
232
+ for res in result:
233
+
234
+ line_srt = res.split("\n")
235
+ try:
236
+
237
+ tokenized_text = tokenizer.prepare_seq2seq_batch([line_srt[2]], return_tensors='pt')
238
+ translation = model.generate(**tokenized_text)
239
+ translated_text = tokenizer.batch_decode(translation, skip_special_tokens=False)[0]
240
+ translated_text = translated_text.replace("<pad>","").replace("</s>","").strip()
241
+ print(translated_text)
242
+
243
+ except IndexError as e:
244
+ # index out of range: no more subtitle blocks
245
+ print(f"翻译完毕")
246
+ break
247
+ except Exception as e:
248
+ print(str(e))
249
+
250
+
251
+ with open("./two.srt","a",encoding="utf-8")as f:f.write(f"{line_srt[0]}\n{line_srt[1]}\n{line_srt[2]}\n{translated_text}\n\n")
252
+
253
+ with open("./two.srt","r",encoding="utf-8") as f:
254
+ content = f.read()
255
+
256
+ return content
257
+
258
+
259
+ # translate subtitles, English to Chinese
260
+ def make_tran(srt_path):
261
+
262
+
263
+ model_path = "./model_from_hg/en-zh/"
264
+
265
+ tokenizer = AutoTokenizer.from_pretrained(model_path,local_files_only=True)
266
+
267
+ model = AutoModelForSeq2SeqLM.from_pretrained(model_path,local_files_only=True)
268
+
269
+ with open(srt_path, 'r',encoding="utf-8") as file:
270
+ gweight_data = file.read()
271
+
272
+ result = gweight_data.split("\n\n")
273
+
274
+ if os.path.exists("./two.srt"):
275
+ os.remove("./two.srt")
276
+
277
+ for res in result:
278
+
279
+ line_srt = res.split("\n")
280
+ try:
281
+
282
+ tokenized_text = tokenizer.prepare_seq2seq_batch([line_srt[2]], return_tensors='pt')
283
+ translation = model.generate(**tokenized_text)
284
+ translated_text = tokenizer.batch_decode(translation, skip_special_tokens=False)[0]
285
+ translated_text = translated_text.replace("<pad>","").replace("</s>","").strip()
286
+ print(translated_text)
287
+
288
+ except IndexError as e:
289
+ # index out of range: no more subtitle blocks
290
+ print(f"翻译完毕")
291
+ break
292
+ except Exception as e:
293
+ print(str(e))
294
+
295
+
296
+ with open("./two.srt","a",encoding="utf-8")as f:f.write(f"{line_srt[0]}\n{line_srt[1]}\n{line_srt[2]}\n{translated_text}\n\n")
297
+
298
+ with open("./two.srt","r",encoding="utf-8") as f:
299
+ content = f.read()
300
+
301
+ return content
302
+
303
+
304
+ # translate subtitles via DeepLX
305
+ def make_tran_deep(srt_path,_s,_t):
306
+
307
+
308
+
309
+ with open(srt_path, 'r',encoding="utf-8") as file:
310
+ gweight_data = file.read()
311
+
312
+ result = gweight_data.split("\n\n")
313
+
314
+
315
+ if os.path.exists(f"{ROOT_DIR}/output/two.srt"):
316
+ os.remove(f"{ROOT_DIR}/output/two.srt")
317
+
318
+ if os.path.exists(f"{ROOT_DIR}/output/t_sin_{_t}.srt"):
319
+ os.remove(f"{ROOT_DIR}/output/t_sin_{_t}.srt")
320
+
321
+ for res in result:
322
+
323
+ line_srt = res.split("\n")
324
+
325
+ try:
326
+ text = line_srt[2]
327
+ translated_text = deep_tran(text,_s,_t)
328
+
329
+
330
+ with open(f"{ROOT_DIR}/output/two.srt","a",encoding="utf-8")as f:f.write(f"{line_srt[0]}\n{line_srt[1]}\n{line_srt[2]}\n{translated_text}\n\n")
331
+ with open(f"{ROOT_DIR}/output/t_sin_{_t}.srt","a",encoding="utf-8")as f:f.write(f"{line_srt[0]}\n{line_srt[1]}\n{translated_text}\n\n")
332
+
333
+
334
+ except IndexError as e:
335
+ print(str(e))
336
+ # index out of range: no more subtitle blocks
337
+ print(f"翻译完毕")
338
+ break
339
+ except Exception as e:
340
+ print(str(e))
341
+
342
+
343
+
344
+
345
+ with open(f"{ROOT_DIR}/output/two.srt","r",encoding="utf-8") as f:
346
+ content = f.read()
347
+
348
+ with open(f"{ROOT_DIR}/output/t_sin_{_t}.srt","r",encoding="utf-8") as f:
349
+ content_2 = f.read()
350
+
351
+ return content,content_2,f"{ROOT_DIR}/output/t_sin_{_t}.srt"
352
+ # translate subtitles with a local qwen2 model via ollama
353
+ def make_tran_qwen2(model_name,srt_path,lang):
354
+
355
+ with open(srt_path, 'r',encoding="utf-8") as file:
356
+ gweight_data = file.read()
357
+
358
+ result = gweight_data.split("\n\n")
359
+
360
+ if os.path.exists(f"{ROOT_DIR}/output/two.srt"):
361
+ os.remove(f"{ROOT_DIR}/output/two.srt")
362
+
363
+ if os.path.exists(f"{ROOT_DIR}/output/two_single.srt"):
364
+ os.remove(f"{ROOT_DIR}/output/two_single.srt")
365
+
366
+ for res in result:
367
+
368
+ line_srt = res.split("\n")
369
+ try:
370
+
371
+ if lang == "zh":
372
+ lang = "中文"
373
+ elif lang == "en":
374
+ lang = "英文"
375
+ elif lang == "ja":
376
+ lang = "日文"
377
+ elif lang == "ko":
378
+ lang = "韩文"
379
+
380
+ text = line_srt[2]
381
+
382
+ content = f'"{text}" 翻译为{lang},只给我文本的翻译,别添加其他的内容,因为我要做字幕,谢谢'
383
+
384
+ response = ollama.chat(model=model_name,messages=[
385
+ {
386
+ 'role':'user',
387
+ 'content':content
388
+ }])
389
+ translated_text = response['message']['content']
390
+ print(translated_text)
391
+
392
+ except IndexError as e:
393
+ # index out of range: no more subtitle blocks
394
+ print(f"翻译完毕")
395
+ break
396
+ except Exception as e:
397
+ print(str(e))
398
+
399
+
400
+ with open(f"{ROOT_DIR}/output/two.srt","a",encoding="utf-8")as f:f.write(f"{line_srt[0]}\n{line_srt[1]}\n{line_srt[2]}\n{translated_text}\n\n")
401
+ with open(f"{ROOT_DIR}/output/two_single.srt","a",encoding="utf-8")as f:f.write(f"{line_srt[0]}\n{line_srt[1]}\n{translated_text}\n\n")
402
+
403
+ with open(f"{ROOT_DIR}/output/two.srt","r",encoding="utf-8") as f:
404
+ content = f.read()
405
+
406
+ with open(f"{ROOT_DIR}/output/two_single.srt","r",encoding="utf-8") as f:
407
+ content_2 = f.read()
408
+
409
+ return content,content_2
410
+
411
+ # # 翻译字幕
412
+ # def make_tran_ali():
413
+
414
+ # pipeline_ins = pipeline(task=Tasks.translation, model=model_dir_ins)
415
+
416
+ # with open("./video.srt", 'r',encoding="utf-8") as file:
417
+ # gweight_data = file.read()
418
+
419
+ # result = gweight_data.split("\n\n")
420
+
421
+ # if os.path.exists("./two.srt"):
422
+ # os.remove("./two.srt")
423
+
424
+ # for res in result:
425
+
426
+ # line_srt = res.split("\n")
427
+ # try:
428
+ # outputs = pipeline_ins(input=line_srt[2])
429
+ # print(outputs['translation'])
430
+ # except IndexError as e:
431
+ # # 处理下标越界异常
432
+ # print(f"翻译完毕")
433
+ # break
434
+ # except Exception as e:
435
+ # print(str(e))
436
+
437
+
438
+ # with open("./two.srt","a",encoding="utf-8")as f:f.write(f"{line_srt[0]}\n{line_srt[1]}\n{line_srt[2]}\n{outputs['translation']}\n\n")
439
+
440
+ # return "翻译完毕"
441
+
442
+
443
+
444
+ def convert_seconds_to_hms(seconds):
445
+ hours, remainder = divmod(seconds, 3600)
446
+ minutes, seconds = divmod(remainder, 60)
447
+ milliseconds = math.floor((seconds % 1) * 1000)
448
+ output = f"{int(hours):02}:{int(minutes):02}:{int(seconds):02},{milliseconds:03}"
449
+ return output
450
+
451
+
452
+ emo_dict = {
453
+ "<|HAPPY|>": "😊",
454
+ "<|SAD|>": "😔",
455
+ "<|ANGRY|>": "😡",
456
+ "<|NEUTRAL|>": "",
457
+ "<|FEARFUL|>": "😰",
458
+ "<|DISGUSTED|>": "🤢",
459
+ "<|SURPRISED|>": "😮",
460
+ }
461
+
462
+ event_dict = {
463
+ "<|BGM|>": "🎼",
464
+ "<|Speech|>": "",
465
+ "<|Applause|>": "👏",
466
+ "<|Laughter|>": "😀",
467
+ "<|Cry|>": "😭",
468
+ "<|Sneeze|>": "🤧",
469
+ "<|Breath|>": "",
470
+ "<|Cough|>": "🤧",
471
+ }
472
+
473
+ emoji_dict = {
474
+ "<|nospeech|><|Event_UNK|>": "",
475
+ "<|zh|>": "",
476
+ "<|en|>": "",
477
+ "<|yue|>": "",
478
+ "<|ja|>": "",
479
+ "<|ko|>": "",
480
+ "<|nospeech|>": "",
481
+ "<|HAPPY|>": "",
482
+ "<|SAD|>": "",
483
+ "<|ANGRY|>": "",
484
+ "<|NEUTRAL|>": "",
485
+ "<|BGM|>": "",
486
+ "<|Speech|>": "",
487
+ "<|Applause|>": "",
488
+ "<|Laughter|>": "",
489
+ "<|FEARFUL|>": "",
490
+ "<|DISGUSTED|>": "",
491
+ "<|SURPRISED|>": "",
492
+ "<|Cry|>": "",
493
+ "<|EMO_UNKNOWN|>": "",
494
+ "<|Sneeze|>": "",
495
+ "<|Breath|>": "",
496
+ "<|Cough|>": "",
497
+ "<|Sing|>": "",
498
+ "<|Speech_Noise|>": "",
499
+ "<|withitn|>": "",
500
+ "<|woitn|>": "",
501
+ "<|GBG|>": "",
502
+ "<|Event_UNK|>": "",
503
+ }
504
+
505
+ lang_dict = {
506
+ "<|zh|>": "<|lang|>",
507
+ "<|en|>": "<|lang|>",
508
+ "<|yue|>": "<|lang|>",
509
+ "<|ja|>": "<|lang|>",
510
+ "<|ko|>": "<|lang|>",
511
+ "<|nospeech|>": "<|lang|>",
512
+ }
513
+
514
+ emo_set = {"😊", "😔", "😡", "😰", "🤢", "😮"}
515
+ event_set = {"🎼", "👏", "😀", "😭", "🤧", "😷",}
516
+
517
+ lang2token = {
518
+ 'zh': "ZH|",
519
+ 'ja': "JP|",
520
+ "en": "EN|",
521
+ "ko": "KO|",
522
+ "yue": "YUE|",
523
+ }
524
+
525
+ def format_str(s):
526
+ for sptk in emoji_dict:
527
+ s = s.replace(sptk, emoji_dict[sptk])
528
+ return s
529
+
530
+
531
+ def format_str_v2(s):
532
+ sptk_dict = {}
533
+ for sptk in emoji_dict:
534
+ sptk_dict[sptk] = s.count(sptk)
535
+ s = s.replace(sptk, "")
536
+ emo = "<|NEUTRAL|>"
537
+ for e in emo_dict:
538
+ if sptk_dict[e] > sptk_dict[emo]:
539
+ emo = e
540
+ for e in event_dict:
541
+ if sptk_dict[e] > 0:
542
+ s = event_dict[e] + s
543
+ s = s + emo_dict[emo]
544
+
545
+ for emoji in emo_set.union(event_set):
546
+ s = s.replace(" " + emoji, emoji)
547
+ s = s.replace(emoji + " ", emoji)
548
+ return s.strip()
549
+
550
+ def format_str_v3(s):
551
+ def get_emo(s):
552
+ return s[-1] if s[-1] in emo_set else None
553
+ def get_event(s):
554
+ return s[0] if s[0] in event_set else None
555
+
556
+ s = s.replace("<|nospeech|><|Event_UNK|>", "❓")
557
+ for lang in lang_dict:
558
+ s = s.replace(lang, "<|lang|>")
559
+ s_list = [format_str_v2(s_i).strip(" ") for s_i in s.split("<|lang|>")]
560
+ new_s = " " + s_list[0]
561
+ cur_ent_event = get_event(new_s)
562
+ for i in range(1, len(s_list)):
563
+ if len(s_list[i]) == 0:
564
+ continue
565
+ if get_event(s_list[i]) == cur_ent_event and get_event(s_list[i]) != None:
566
+ s_list[i] = s_list[i][1:]
567
+ #else:
568
+ cur_ent_event = get_event(s_list[i])
569
+ if get_emo(s_list[i]) != None and get_emo(s_list[i]) == get_emo(new_s):
570
+ new_s = new_s[:-1]
571
+ new_s += s_list[i].strip().lstrip()
572
+ new_s = new_s.replace("The.", " ")
573
+ return new_s.strip()
574
+
575
+ def ms_to_srt_time(ms):
576
+ N = int(ms)
577
+ hours, remainder = divmod(N, 3600000)
578
+ minutes, remainder = divmod(remainder, 60000)
579
+ seconds, milliseconds = divmod(remainder, 1000)
580
+ timesrt = f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"
581
+ # print(timesrt)
582
+ return timesrt
583
+
584
+ def time_to_srt(time_in_seconds):
585
+ """
586
+ Convert a number of seconds to an SRT timestamp string.
587
+
588
+ Args:
589
+ time_in_seconds: time in seconds.
590
+
591
+ Returns:
592
+ An SRT timestamp string.
593
+ """
594
+ milliseconds = int(time_in_seconds * 1000)
595
+ hours = milliseconds // 3600000
596
+ minutes = (milliseconds % 3600000) // 60000
597
+ seconds = (milliseconds % 60000) // 1000
598
+ milliseconds = milliseconds % 1000
599
+ return f"{hours:02}:{minutes:02}:{seconds:02},{milliseconds:03}"
600
+ # build the subtitle file with Alibaba SenseVoice
601
+ def make_srt_sv(file_path):
602
+
603
+
604
+ model_dir = "iic/SenseVoiceSmall"
605
+ input_file = (file_path)
606
+
607
+ model = AutoModel(model=model_dir,
608
+ vad_model="fsmn-vad",
609
+ vad_kwargs={"max_single_segment_time": 30000},
610
+ trust_remote_code=True, device="cuda:0")
611
+
612
+ res = model.generate(
613
+ input=input_file,
614
+ cache={},
615
+ language="auto", # "zn", "en", "yue", "ja", "ko", "nospeech"
616
+ use_itn=False,
617
+ batch_size_s=0,
618
+ )
619
+
620
+ print(res)
621
+ text = res[0]["text"]
622
+ # text = format_str_v3(text)
623
+ text = rich_transcription_postprocess(text)
624
+
625
+ print(text)
626
+
627
+ return text
628
+
629
+
630
+ # for filename in os.listdir("./wavs"):
631
+ # if filename.endswith(".wav"):
632
+ # filepath = os.path.join("./wavs/", filename)
633
+ # try:
634
+ # if os.path.isfile(filepath):
635
+ # os.remove(filepath)
636
+ # print(f"已删除文件: {filepath}")
637
+ # except Exception as e:
638
+ # print(f"删除文件时出错: {filepath} - {e}")
639
+
640
+ # # 第一步,先切片
641
+
642
+ # audio, sr = librosa.load(file_path, sr=None, mono=False)
643
+
644
+ # # 创建Slicer对象
645
+ # slicer = Slicer(
646
+ # sr=sr,
647
+ # threshold=-40,
648
+ # min_length=1500,
649
+ # min_interval=300,
650
+ # hop_size=1,
651
+ # max_sil_kept=150000
652
+ # )
653
+
654
+ # # 切割音频
655
+ # chunks = slicer.slice(audio)
656
+ # for i, chunk in enumerate(chunks):
657
+ # if len(chunk.shape) > 1:
658
+ # chunk = chunk.T # Swap axes if the audio is stereo.
659
+ # soundfile.write(f'./wavs/chunk_{i}.wav', chunk, sr)
660
+
661
+
662
+ # srtlines = []
663
+ # audio_samples = 0
664
+ # audio_opt = []
665
+ # for filename in os.listdir("./wavs"):
666
+ # if filename.endswith(".wav"):
667
+ # filepath = os.path.join("./wavs/", filename)
668
+ # print(filepath)
669
+
670
+ # model_dir = "iic/SenseVoiceSmall"
671
+ # input_file = (filepath)
672
+
673
+ # model = AutoModel(model=model_dir,
674
+ # vad_model="fsmn-vad",
675
+ # vad_kwargs={"max_single_segment_time": 30000},
676
+ # trust_remote_code=True, device="cuda:0")
677
+
678
+ # res = model.generate(
679
+ # input=input_file,
680
+ # cache={},
681
+ # language="auto", # "zn", "en", "yue", "ja", "ko", "nospeech"
682
+ # use_itn=False,
683
+ # batch_size_s=0,
684
+ # )
685
+
686
+ # # print(res)
687
+ # text = res[0]["text"]
688
+ # # text = format_str_v3(text)
689
+ # text = rich_transcription_postprocess(text)
690
+
691
+ # print(text)
692
+
693
+ # audio, sampling_rate = soundfile.read(filepath)
694
+
695
+ # audio_opt.append(audio)
696
+
697
+ # srtline_begin=ms_to_srt_time(audio_samples*1000.0 / sampling_rate)
698
+ # audio_samples += audio.size
699
+ # srtline_end=ms_to_srt_time(audio_samples*1000.0 / sampling_rate)
700
+
701
+ # srtlines.append(f"{len(audio_opt)}\n")
702
+ # srtlines.append(srtline_begin+' --> '+srtline_end+"\n")
703
+
704
+ # srtlines.append(text+"\n\n")
705
+
706
+ # exit(-1)
707
+
708
+ # NOTE: the lines below are unreachable (the function already returned `text` above)
+ # and reference `srtlines`, which is only built inside the commented-out block,
+ # so they are kept commented out here as well
+ # with open('./video.srt', 'w', encoding='utf-8') as f:
709
+ # f.writelines(srtlines)
710
+
711
+ # with open("./video.srt","r",encoding="utf-8") as f:
712
+ # content = f.read()
713
+
714
+
715
+
716
+ # return content
717
+ # build the subtitle file with faster-whisper
718
+ def make_srt(file_path,model_name="small"):
719
+
720
+
721
+ # if device == "cuda":
722
+ # model = WhisperModel(model_name, device="cuda", compute_type="float16",download_root="./model_from_whisper",local_files_only=False)
723
+ # else:
724
+ # model = WhisperModel(model_name, device="cpu", compute_type="int8",download_root="./model_from_whisper",local_files_only=False)
725
+ # or run on GPU with INT8
726
+ # model = WhisperModel(model_size, device="cuda", compute_type="int8_float16")
727
+
728
+
729
+ if model_name != "faster-whisper-large-v3-turbo-ct2":
730
+
731
+ if device == "cuda":
732
+ try:
733
+ model = WhisperModel(model_name, device="cuda", compute_type="float16",download_root="./model_from_whisper",local_files_only=False)
734
+ except Exception as e:
735
+ model = WhisperModel(model_name, device="cuda", compute_type="int8_float16",download_root="./model_from_whisper",local_files_only=False)
736
+ else:
737
+ model = WhisperModel(model_name, device="cpu", compute_type="int8",download_root="./model_from_whisper",local_files_only=False)
738
+ else:
739
+
740
+ model_name = f"{ROOT_DIR}/faster-whisper-large-v3-turbo-ct2"
741
+ print(model_name)
742
+
743
+ if device == "cuda":
744
+ try:
745
+ model = WhisperModel(model_name, device="cuda", compute_type="float16")
746
+ except Exception as e:
747
+ model = WhisperModel(model_name, device="cuda", compute_type="int8_float16")
748
+ else:
749
+ model = WhisperModel(model_name, device="cpu", compute_type="int8")
750
+
751
+
752
+
753
+
754
+ segments, info = model.transcribe(file_path, beam_size=5,vad_filter=True,vad_parameters=dict(min_silence_duration_ms=500))
755
+
756
+ print("Detected language '%s' with probability %f" % (info.language, info.language_probability))
757
+ count = 0
758
+ with open(f'{ROOT_DIR}/output/video.srt', 'w',encoding="utf-8") as f: # Open file for writing
759
+ for segment in segments:
760
+ count +=1
761
+ duration = f"{convert_seconds_to_hms(segment.start)} --> {convert_seconds_to_hms(segment.end)}\n"
762
+ text = f"{segment.text.lstrip()}\n\n"
763
+
764
+ f.write(f"{count}\n{duration}{text}") # Write formatted string to the file
765
+ print(f"{duration}{text}",end='')
766
+
767
+ with open(f"{ROOT_DIR}/output/video.srt","r",encoding="utf-8") as f:
768
+ content = f.read()
769
+
770
+ return content
771
+
772
+
773
+
774
+ # extract the vocal track
775
+ def movie2audio(video_path):
776
+
777
+ # load the video file
778
+ video = VideoFileClip(video_path)
779
+
780
+ # pull the audio track out of the video
781
+ audio = video.audio
782
+
783
+ # save the audio as WAV
784
+ audio.write_audiofile(f"{ROOT_DIR}/audio.wav")
785
+
786
+ ans = pipeline_ali(
787
+ Tasks.acoustic_noise_suppression,
788
+ model=model_dir_cirm)
789
+
790
+ ans(f'{ROOT_DIR}/audio.wav',output_path=f'{ROOT_DIR}/output.wav')
791
+
792
+ return f"{ROOT_DIR}/output.wav"
793
+
794
+
795
+
796
+
797
+
798
+
799
+
800
+
801
+
802
+
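
Taken together these helpers cover the whole workflow: extract and denoise the audio, transcribe it to an SRT, translate the SRT, then burn it back into the video. A minimal end-to-end sketch using only the functions defined above; the input file name is a placeholder and the local models referenced above are assumed to be downloaded already:

from utils import movie2audio, make_srt, make_tran, merge_sub

vocal_wav = movie2audio("input.mp4")                # extract the audio and denoise it with FRCRN
srt_text = make_srt(vocal_wav, model_name="small")  # faster-whisper -> output/video.srt
bilingual = make_tran("output/video.srt")           # EN -> ZH bilingual subtitles -> ./two.srt
result = merge_sub("input.mp4", "two.srt")          # ffmpeg burns the subtitles into test_srt.mp4
print(result)
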
生成英文配音.bat ADDED
@@ -0,0 +1,17 @@
1
+ SET PYTHON_PATH=%cd%\venv\
2
+ rem overriding default python env vars in order not to interfere with any system python installation
3
+ SET PYTHONHOME=
4
+ SET PYTHONPATH=
5
+ SET PYTHONEXECUTABLE=%PYTHON_PATH%\python.exe
6
+ SET PYTHONWEXECUTABLE=%PYTHON_PATH%pythonw.exe
7
+ SET PYTHON_EXECUTABLE=%PYTHON_PATH%\python.exe
8
+ SET PYTHONW_EXECUTABLE=%PYTHON_PATH%pythonw.exe
9
+ SET PYTHON_BIN_PATH=%PYTHON_EXECUTABLE%
10
+ SET PYTHON_LIB_PATH=%PYTHON_PATH%\Lib\site-packages
11
+ SET FFMPEG_PATH=%cd%\venv\ffmpeg\bin
12
+ SET PATH=%PYTHON_PATH%;%PYTHON_PATH%\Scripts;%FFMPEG_PATH%;%PATH%
13
+ @REM set HF_ENDPOINT=https://hf-mirror.com
14
+ @REM set HF_HOME=%CD%\hf_download
15
+ @REM set PYTHONPATH=third_party/AcademiCodec;third_party/Matcha-TTS
16
+ "%PYTHON_EXECUTABLE%" gen_english.py
17
+ pause
运行.bat ADDED
@@ -0,0 +1,17 @@
1
+ SET PYTHON_PATH=%cd%\venv\
2
+ rem overriding default python env vars in order not to interfere with any system python installation
3
+ SET PYTHONHOME=
4
+ SET PYTHONPATH=
5
+ SET PYTHONEXECUTABLE=%PYTHON_PATH%\python.exe
6
+ SET PYTHONWEXECUTABLE=%PYTHON_PATH%pythonw.exe
7
+ SET PYTHON_EXECUTABLE=%PYTHON_PATH%\python.exe
8
+ SET PYTHONW_EXECUTABLE=%PYTHON_PATH%pythonw.exe
9
+ SET PYTHON_BIN_PATH=%PYTHON_EXECUTABLE%
10
+ SET PYTHON_LIB_PATH=%PYTHON_PATH%\Lib\site-packages
11
+ SET FFMPEG_PATH=%cd%\venv\ffmpeg\bin
12
+ SET PATH=%PYTHON_PATH%;%PYTHON_PATH%\Scripts;%FFMPEG_PATH%;%PATH%
13
+ @REM set HF_ENDPOINT=https://hf-mirror.com
14
+ @REM set HF_HOME=%CD%\hf_download
15
+ @REM set PYTHONPATH=third_party/AcademiCodec;third_party/Matcha-TTS
16
+ "%PYTHON_EXECUTABLE%" app.py
17
+ pause