Commit cb0791d by thun888
Parent(s): 01d6def

Add application file
Files changed:
- .gitignore +16 -0
- LICENSE +21 -0
- app.py +372 -0
- app.spec +71 -0
- app_video_src.py +155 -0
- download_model.py +5 -0
- faster-whisper-large-v3-turbo-ct2/.gitattributes +35 -0
- faster-whisper-large-v3-turbo-ct2/README.md +141 -0
- faster-whisper-large-v3-turbo-ct2/config.json +360 -0
- faster-whisper-large-v3-turbo-ct2/preprocessor_config.json +14 -0
- faster-whisper-large-v3-turbo-ct2/vocabulary.json +0 -0
- gen_english.py +39 -0
- img/sample.png +0 -0
- model_from_hg/model here +26 -0
- models_from_modelscope/damo/speech_frcrn_ans_cirm_16k/.mdl +0 -0
- models_from_modelscope/damo/speech_frcrn_ans_cirm_16k/.msc +0 -0
- models_from_modelscope/damo/speech_frcrn_ans_cirm_16k/README.md +285 -0
- models_from_modelscope/damo/speech_frcrn_ans_cirm_16k/configuration.json +65 -0
- models_from_modelscope/damo/speech_frcrn_ans_cirm_16k/description/matrix.png +0 -0
- models_from_modelscope/damo/speech_frcrn_ans_cirm_16k/description/model.png +0 -0
- models_from_modelscope/damo/speech_frcrn_ans_cirm_16k/faq.md +11 -0
- requirements.txt +16 -0
- slicer2.py +186 -0
- subtitle_to_audio.py +65 -0
- test_deep.py +15 -0
- test_ollama.py +9 -0
- test_turbo.py +7 -0
- utils.py +802 -0
- 生成英文配音.bat +17 -0
- 运行.bat +17 -0
.gitignore
ADDED
@@ -0,0 +1,16 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.bin
*.srt
*.mp4
*.wav
*.mb
*.exe
.locks
venv/
ckpt-0.data-00000-of-00001
ckpt-0.meta
tokenizer.json
*.model
LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2024 RVC-Boss

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
app.py
ADDED
@@ -0,0 +1,372 @@
import argparse
import os

ROOT_DIR = os.path.dirname(os.path.abspath(__file__))

# ffmpeg_path = f"{ROOT_DIR}/bin"  # replace with your FFmpeg bin directory
# os.environ["PATH"] = os.environ.get("PATH", "") + os.pathsep + ffmpeg_path

import gradio as gr

from utils import movie2audio, make_srt, make_tran, merge_sub, make_tran_zh2en, make_tran_ja2zh, make_tran_ko2zh, make_srt_sv, make_tran_qwen2, make_tran_deep
from subtitle_to_audio import generate_audio
import pyttsx3

engine = pyttsx3.init()
voices = engine.getProperty('voices')  # getting details of current voice
vlist = []
num = 0
for voice in voices:
    print(" - Name: %s" % voice.name)
    vlist.append((voice.name, num))
    num += 1

initial_md = """
项目地址:https://github.com/v3ucn/Modelscope_Faster_Whisper_Multi_Subtitle

作者:刘悦的技术博客 https://space.bilibili.com/3031494
"""

def do_pyttsx3(srt, speed, voice):
    print(srt, speed, voice)
    voice = int(voice)
    generate_audio(path=srt, rate=int(speed), voice_idx=voice)
    return f"output/{vlist[voice][0]}.wav"

def do_speech(video):
    res = movie2audio(video)
    return res

def do_trans_video(model_type, video_path):
    srt_text = make_srt(video_path, model_type)
    return srt_text

def do_trans_video_sv(video_path):
    srt_text = make_srt_sv(video_path)
    return srt_text

def do_trans_audio(model_type):
    srt_text = make_srt(f'{ROOT_DIR}/audio.wav', model_type)
    return srt_text

def do_trans_en2zh(srt_path):
    return make_tran(srt_path)

def do_trans_en2zh_deep(srt_path):
    return make_tran_deep(srt_path, "EN", "ZH")

def do_trans_zh2en_deep(srt_path):
    return make_tran_deep(srt_path, "ZH", "EN")

def do_trans_zh2ja_deep(srt_path):
    return make_tran_deep(srt_path, "ZH", "JA")

def do_trans_zh2ko_deep(srt_path):
    return make_tran_deep(srt_path, "ZH", "KO")

def do_trans_ja2zh_deep(srt_path):
    return make_tran_deep(srt_path, "JA", "ZH")

def do_trans_ko2zh_deep(srt_path):
    return make_tran_deep(srt_path, "KO", "ZH")

def do_trans_en2zh_qwen2(model_path_qwen2, srt_path):
    return make_tran_qwen2(model_path_qwen2, srt_path, "zh")

def do_trans_zh2en_qwen2(model_path_qwen2, srt_path):
    return make_tran_qwen2(model_path_qwen2, srt_path, "en")

def do_trans_ja2zh_qwen2(model_path_qwen2, srt_path):
    return make_tran_qwen2(model_path_qwen2, srt_path, "zh")

def do_trans_ko2zh_qwen2(model_path_qwen2, srt_path):
    return make_tran_qwen2(model_path_qwen2, srt_path, "zh")

def do_trans_zh2en(srt_path):
    return make_tran_zh2en(srt_path)

def do_trans_ja2zh(srt_path):
    return make_tran_ja2zh(srt_path)

def do_trans_ko2zh(srt_path):
    return make_tran_ko2zh(srt_path)

def do_srt_sin(video_path):
    return merge_sub(video_path, f"{ROOT_DIR}/output/video.srt")

def do_srt_two(video_path):
    return merge_sub(video_path, f"{ROOT_DIR}/output/two.srt")

def do_srt_two_single(video_path):
    return merge_sub(video_path, f"{ROOT_DIR}/output/two_single.srt")

def save_srt(text):
    with open(rf'{ROOT_DIR}/output/video.srt', 'w', encoding='utf-8') as f:
        f.write(text + "\n")
    gr.Info('字幕文件修改成功,字幕保存在output目录')

def save_two(text, text_2):
    with open(rf'{ROOT_DIR}/output/two.srt', 'w', encoding='utf-8') as f:
        f.write(text + "\n")
    with open(rf'{ROOT_DIR}/output/two_single.srt', 'w', encoding='utf-8') as f:
        f.write(text_2 + "\n")
    gr.Info('字幕文件修改成功,字幕保存在output目录')

with gr.Blocks() as app:
    gr.Markdown(initial_md)

    with gr.Accordion("视频处理(Video)"):
        with gr.Row():
            ori_video = gr.Video(label="请上传视频(Upload Video)")
            speech_button = gr.Button("提取人声(如果视频没有背景音也可以不做)Extract human voice (you don't have to do it if the video has no background sound)")
            speech_audio = gr.Audio(label="提取的人声(Extract voice)")
        speech_button.click(do_speech, inputs=[ori_video], outputs=[speech_audio])

    with gr.Accordion("转写字幕"):
        with gr.Row():
            with gr.Column():
                # model_type = gr.Dropdown(choices=["small","medium","large-v3","large-v2"], value="small", label="选择faster_Whisper模型/Select faster_Whisper model",interactive=True)
                model_type = gr.Textbox(label="填写faster_Whisper模型/Fill in the faster_Whisper model,也可以填写small,medium,large,large-v2,large-v3,faster-whisper-large-v3-turbo-ct2,模型越大,速度越慢,但字幕的准确度越高,酌情填写,用文本框是因为你可以填写其他huggingface上的开源模型地址", value="faster-whisper-large-v3-turbo-ct2")

        # with gr.Row():
        #     with gr.Column():
        #         language = gr.Dropdown(["ja", "en", "zh", "ko", "yue"], value="zh", label="选择转写的语言", interactive=True)

        with gr.Row():
            transcribe_button_whisper = gr.Button("Whisper视频直接转写字幕(Video direct rewriting subtitles)")
            transcribe_button_audio = gr.Button("Whisper提取人声转写字幕(Extract voice transliteration subtitles)")
            # transcribe_button_video_sv = gr.Button("阿里SenseVoice视频直接转写字幕")
            result1 = gr.Textbox(label="字幕結果(会在项目目录生成video.srt/video.srt is generated in the current directory)", value=" ", interactive=True)
            transcribe_button_audio_save = gr.Button("保存字幕修改结果")

        transcribe_button_whisper.click(do_trans_video, inputs=[model_type, ori_video], outputs=[result1])
        transcribe_button_audio_save.click(save_srt, inputs=[result1], outputs=[])
        # transcribe_button_video_sv.click(do_trans_video_sv, inputs=[ori_video], outputs=[result1])
        transcribe_button_audio.click(do_trans_audio, inputs=[model_type], outputs=[result1])

    # with gr.Accordion("HuggingFace大模型字幕翻译"):
    #     with gr.Row():
    #         srt_path = gr.Textbox(label="原始字幕地址,默认为项目目录中的video.srt,也可以输入其他路径", value="./video.srt")
    #         trans_button_en2zh = gr.Button("翻译英语字幕为中文/Translate English subtitles into Chinese")
    #         trans_button_zh2en = gr.Button("翻译中文字幕为英文/Translate Chinese subtitles into English")
    #         trans_button_ja2zh = gr.Button("翻译日文字幕为中文/Translate Japanese subtitles into Chinese")
    #         trans_button_ko2zh = gr.Button("翻译韩文字幕为中文/Translate Korea subtitles into Chinese")
    #         result2 = gr.Textbox(label="翻译结果(会在项目目录生成two.srt/two.srt is generated in the current directory)")
    #     trans_button_en2zh.click(do_trans_en2zh, [srt_path], outputs=[result2])
    #     trans_button_zh2en.click(do_trans_zh2en, [srt_path], outputs=[result2])
    #     trans_button_ja2zh.click(do_trans_ja2zh, [srt_path], outputs=[result2])
    #     trans_button_ko2zh.click(do_trans_ko2zh, [srt_path], outputs=[result2])

    with gr.Accordion("Qwen2大模型字幕翻译"):
        with gr.Row():
            srt_path_qwen2 = gr.Textbox(label="原始字幕地址,默认为项目目录中的output/video.srt,也可以输入其他路径", value=f"{ROOT_DIR}/output/video.srt")
            model_path_qwen2 = gr.Textbox(label="ollama中模型名称", value="qwen2:7b")
            trans_button_en2zh_qwen2 = gr.Button("翻译英语字幕为中文/Translate English subtitles into Chinese")
            trans_button_zh2en_qwen2 = gr.Button("翻译中文字幕为英文/Translate Chinese subtitles into English")
            trans_button_ja2zh_qwen2 = gr.Button("翻译日文字幕为中文/Translate Japanese subtitles into Chinese")
            trans_button_ko2zh_qwen2 = gr.Button("翻译韩文字幕为中文/Translate Korea subtitles into Chinese")
        with gr.Row():
            result2 = gr.Textbox(label="翻译结果(会在项目目录生成two.srt/two.srt is generated in the current directory)", value=" ", interactive=True)
            result3 = gr.Textbox(label="翻译结果(会在项目目录生成two_single.srt)", value=" ", interactive=True)
            trans_button_ko2zh_qwen2_save = gr.Button("保存修改结果")

        trans_button_en2zh_qwen2.click(do_trans_en2zh_qwen2, [model_path_qwen2, srt_path_qwen2], outputs=[result2, result3])
        trans_button_zh2en_qwen2.click(do_trans_zh2en_qwen2, [model_path_qwen2, srt_path_qwen2], outputs=[result2, result3])
        trans_button_ja2zh_qwen2.click(do_trans_ja2zh_qwen2, [model_path_qwen2, srt_path_qwen2], outputs=[result2, result3])
        trans_button_ko2zh_qwen2.click(do_trans_ko2zh_qwen2, [model_path_qwen2, srt_path_qwen2], outputs=[result2, result3])
        trans_button_ko2zh_qwen2_save.click(save_two, [result2, result3], outputs=[])

    with gr.Accordion("Deepl字幕翻译"):
        with gr.Row():
            srt_path_deep = gr.Textbox(label="原始字幕地址,默认为项目目录中的output/video.srt,也可以输入其他路径", value=f"{ROOT_DIR}/output/video.srt")
            trans_button_en2zh_deep = gr.Button("翻译英语字幕为中文/Translate English subtitles into Chinese")
            trans_button_zh2en_deep = gr.Button("翻译中文字幕为英文/Translate Chinese subtitles into English")
            trans_button_zh2ja_deep = gr.Button("翻译中文字幕为日文/Translate Chinese subtitles into Japanese")
            trans_button_zh2ko_deep = gr.Button("翻译中文字幕为韩文/Translate Chinese subtitles into Korea")
            trans_button_ja2zh_deep = gr.Button("翻译日文字幕为中文/Translate Japanese subtitles into Chinese")
            trans_button_ko2zh_deep = gr.Button("翻译韩文字幕为中文/Translate Korea subtitles into Chinese")
        with gr.Row():
            result2_deep = gr.Textbox(label="翻译结果(会在项目目录生成two.srt/two.srt is generated in the current directory)", value=" ", interactive=True)
            result3_deep = gr.Textbox(label="翻译结果(会在项目目录生成two_single.srt)", value=" ", interactive=True)
            trans_button_ko2zh_deep_save = gr.Button("保存修改结果")

        trans_button_ko2zh_deep_save.click(save_two, [result2_deep, result3_deep], outputs=[])

    with gr.Accordion("字幕配音(pyttsx3)"):
        with gr.Row():
            srt_path_pyttsx3 = gr.Textbox(label="字幕地址,也可以输入其他路径", value=f"{ROOT_DIR}/output/eng.srt", interactive=True)
            speed_pyttsx3 = gr.Textbox(label="配音语速(很重要,否则会引起时间轴错乱的问题)", value="240")
            voice_pyttsx3 = gr.Dropdown(choices=vlist, value=3, label="配音的音色选择", interactive=True)
            button_pyttsx3 = gr.Button("生成配音")
            pyttsx3_audio = gr.Audio(label="配音的结果")

        trans_button_en2zh_deep.click(do_trans_en2zh_deep, [srt_path_deep], outputs=[result2_deep, result3_deep, srt_path_pyttsx3])
        trans_button_zh2ja_deep.click(do_trans_zh2ja_deep, [srt_path_deep], outputs=[result2_deep, result3_deep, srt_path_pyttsx3])
        trans_button_zh2en_deep.click(do_trans_zh2en_deep, [srt_path_deep], outputs=[result2_deep, result3_deep, srt_path_pyttsx3])
        trans_button_zh2ko_deep.click(do_trans_zh2ko_deep, [srt_path_deep], outputs=[result2_deep, result3_deep, srt_path_pyttsx3])
        trans_button_ja2zh_deep.click(do_trans_ja2zh_deep, [srt_path_deep], outputs=[result2_deep, result3_deep, srt_path_pyttsx3])
        trans_button_ko2zh_deep.click(do_trans_ko2zh_deep, [srt_path_deep], outputs=[result2_deep, result3_deep, srt_path_pyttsx3])
        button_pyttsx3.click(do_pyttsx3, inputs=[srt_path_pyttsx3, speed_pyttsx3, voice_pyttsx3], outputs=[pyttsx3_audio])

    with gr.Accordion("字幕合并"):
        with gr.Row():
            srt_button_sin = gr.Button("将单语字幕合并到视频/Merge monolingual subtitles into video")
            srt_button_two = gr.Button("将双语字幕合并到视频/Merge bilingual subtitles into video")
            srt_button_two_single = gr.Button("将翻译的单语字幕合并到视频")
            result3 = gr.Video(label="带字幕视频")

        srt_button_sin.click(do_srt_sin, inputs=[ori_video], outputs=[result3])
        srt_button_two.click(do_srt_two, inputs=[ori_video], outputs=[result3])
        srt_button_two_single.click(do_srt_two_single, inputs=[ori_video], outputs=[result3])

parser = argparse.ArgumentParser()
parser.add_argument(
    "--server-name",
    type=str,
    default=None,
    help="Server name for Gradio app",
)
parser.add_argument(
    "--no-autolaunch",
    action="store_true",
    default=False,
    help="Do not launch app automatically",
)
args = parser.parse_args()

app.queue()
app.launch(inbrowser=True, server_name=args.server_name)
app.spec
ADDED
@@ -0,0 +1,71 @@
# -*- mode: python ; coding: utf-8 -*-
import sys
sys.setrecursionlimit(5000)
from PyInstaller.utils.hooks import collect_data_files

datas = []
datas += collect_data_files('gradio_client')
datas += collect_data_files('gradio')

# datas += [('./utils.py',".")]
# datas += [('./slicer2.py',".")]

a = Analysis(
    ['app.py',
    ],
    pathex=['/Users/liuyue/Downloads/FunAsr_Faster_Whisper_Multi_Subs'],
    binaries=[],
    datas=datas,
    hiddenimports=[],
    hookspath=[],
    hooksconfig={},
    runtime_hooks=[],
    excludes=[],
    noarchive=False,
    optimize=0,
    module_collection_mode={ 'gradio': 'py'}
)
pyz = PYZ(a.pure)

exe = EXE(
    pyz,
    a.scripts,
    [],
    exclude_binaries=True,
    name='whisper_turbo',
    # icon='AnyConv.com__paints_logo.icns',
    debug=True,
    bootloader_ignore_signals=False,
    strip=False,
    upx=True,
    console=True,
    disable_windowed_traceback=False,
    argv_emulation=False,
    target_arch=None,
    codesign_identity=None,
    entitlements_file=None,
)

a.datas += Tree('./faster-whisper-large-v3-turbo-ct2', prefix='faster-whisper-large-v3-turbo-ct2')
a.datas += Tree('./models_from_modelscope', prefix='models_from_modelscope')
a.datas += Tree('./output', prefix='output')
# a.datas += Tree('./bin', prefix='bin')

# a.datas += Tree('./output', prefix='output')

coll = COLLECT(
    exe,
    a.binaries,
    a.datas,
    strip=False,
    upx=True,
    upx_exclude=[],
    name='whisper_turbo',
)
app_video_src.py
ADDED
@@ -0,0 +1,155 @@
import argparse
import os

import gradio as gr

from utils import movie2audio, make_srt, make_tran, merge_sub, make_tran_zh2en, make_tran_ja2zh, make_tran_ko2zh

initial_md = """
作者:刘悦的技术博客 https://space.bilibili.com/3031494
"""

def do_speech(video):
    res = movie2audio(video)
    return res

def do_trans_video(model_type, video_path):
    srt_text = make_srt(video_path, model_type)
    return srt_text

def do_trans_audio(model_type):
    srt_text = make_srt('./audio.wav', model_type)
    return srt_text

def do_trans_en2zh(srt_path):
    return make_tran(srt_path)

def do_trans_zh2en(srt_path):
    return make_tran_zh2en(srt_path)

def do_trans_ja2zh(srt_path):
    return make_tran_ja2zh(srt_path)

def do_trans_ko2zh(srt_path):
    return make_tran_ko2zh(srt_path)

def do_srt_sin(video_path):
    return merge_sub(video_path, "./video.srt")

def do_srt_two(video_path):
    return merge_sub(video_path, "./two.srt")

with gr.Blocks() as app:
    gr.Markdown(initial_md)

    with gr.Accordion("视频处理(Video)"):
        with gr.Row():
            ori_video = gr.Textbox(label="请输入视频的路径地址,如:d:/123.mp4")
            speech_button = gr.Button("提取人声(如果视频没有背景音也可以不做)Extract human voice (you don't have to do it if the video has no background sound)")
            speech_audio = gr.Audio(label="提取的人声(Extract voice)")
        speech_button.click(do_speech, inputs=[ori_video], outputs=[speech_audio])

    with gr.Accordion("转写字幕"):
        with gr.Row():
            with gr.Column():
                # model_type = gr.Dropdown(choices=["small","medium","large-v3","large-v2"], value="small", label="选择faster_Whisper模型/Select faster_Whisper model",interactive=True)
                model_type = gr.Textbox(label="填写faster_Whisper模型/Fill in the faster_Whisper model,也可以填写small,medium,large,large-v2,large-v3,模型越大,速度越慢,但字幕的准确度越高,酌情填写,用文本框是因为你可以填写其他huggingface上的开源模型地址", value="medium")

                transcribe_button_whisper = gr.Button("视频直接转写字幕(Video direct rewriting subtitles)")
                transcribe_button_audio = gr.Button("提取人声转写字幕(Extract voice transliteration subtitles)")
                result1 = gr.Textbox(label="字幕結果(会在项目目录生成video.srt/video.srt is generated in the current directory)")

        transcribe_button_whisper.click(do_trans_video, inputs=[model_type, ori_video], outputs=[result1])
        transcribe_button_audio.click(do_trans_audio, inputs=[model_type], outputs=[result1])

    with gr.Accordion("字幕翻译"):
        with gr.Row():
            srt_path = gr.Textbox(label="原始字幕地址,默认为项目目录中的video.srt,也可以输入其他路径", value="./video.srt")
            trans_button_en2zh = gr.Button("翻译英语字幕为中文/Translate English subtitles into Chinese")
            trans_button_zh2en = gr.Button("翻译中文字幕为英文/Translate Chinese subtitles into English")
            trans_button_ja2zh = gr.Button("翻译日文字幕为中文/Translate Japanese subtitles into Chinese")
            trans_button_ko2zh = gr.Button("翻译韩文字幕为中文/Translate Korea subtitles into Chinese")
            result2 = gr.Textbox(label="翻译结果(会在项目目录生成two.srt/two.srt is generated in the current directory)")

        trans_button_en2zh.click(do_trans_en2zh, [srt_path], outputs=[result2])
        trans_button_zh2en.click(do_trans_zh2en, [srt_path], outputs=[result2])
        trans_button_ja2zh.click(do_trans_ja2zh, [srt_path], outputs=[result2])
        trans_button_ko2zh.click(do_trans_ko2zh, [srt_path], outputs=[result2])

    with gr.Accordion("字幕合并"):
        with gr.Row():
            srt_button_sin = gr.Button("将单语字幕合并到视频/Merge monolingual subtitles into video")
            srt_button_two = gr.Button("将双语字幕合并到视频/Merge bilingual subtitles into video")
            result3 = gr.Textbox(label="合成字幕后的视频路径地址")

        srt_button_sin.click(do_srt_sin, inputs=[ori_video], outputs=[result3])
        srt_button_two.click(do_srt_two, inputs=[ori_video], outputs=[result3])

parser = argparse.ArgumentParser()
parser.add_argument(
    "--server-name",
    type=str,
    default=None,
    help="Server name for Gradio app",
)
parser.add_argument(
    "--no-autolaunch",
    action="store_true",
    default=False,
    help="Do not launch app automatically",
)
args = parser.parse_args()

app.launch(inbrowser=not args.no_autolaunch, server_name=args.server_name)
download_model.py
ADDED
@@ -0,0 +1,5 @@
from huggingface_hub import snapshot_download

repo_id = "deepdml/faster-whisper-large-v3-turbo-ct2"
local_dir = "faster-whisper-large-v3-turbo-ct2"
snapshot_download(repo_id=repo_id, local_dir=local_dir, repo_type="model")
faster-whisper-large-v3-turbo-ct2/.gitattributes
ADDED
@@ -0,0 +1,35 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
faster-whisper-large-v3-turbo-ct2/README.md
ADDED
@@ -0,0 +1,141 @@
---
language: [en, zh, de, es, ru, ko, fr, ja, pt, tr, pl, ca, nl, ar, sv, it, id, hi, fi, vi,
           he, uk, el, ms, cs, ro, da, hu, ta, 'no', th, ur, hr, bg, lt, la, mi, ml, cy, sk,
           te, fa, lv, bn, sr, az, sl, kn, et, mk, br, eu, is, hy, ne, mn, bs, kk, sq, sw,
           gl, mr, pa, si, km, sn, yo, so, af, oc, ka, be, tg, sd, gu, am, yi, lo, uz, fo,
           ht, ps, tk, nn, mt, sa, lb, my, bo, tl, mg, as, tt, haw, ln, ha, ba, jw, su, yue]
tags:
- audio
- automatic-speech-recognition
license: mit
library_name: ctranslate2
---

# Whisper large-v3-turbo model for CTranslate2

This repository contains the conversion of [deepdml/whisper-large-v3-turbo](https://huggingface.co/deepdml/whisper-large-v3-turbo) to the [CTranslate2](https://github.com/OpenNMT/CTranslate2) model format.

This model can be used in CTranslate2 or projects based on CTranslate2 such as [faster-whisper](https://github.com/systran/faster-whisper).

## Example

```python
from faster_whisper import WhisperModel

model = WhisperModel("faster-whisper-large-v3-turbo-ct2")

segments, info = model.transcribe("audio.mp3")
for segment in segments:
    print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
```

## Conversion details

The original model was converted with the following command:

```
ct2-transformers-converter --model deepdml/whisper-large-v3-turbo --output_dir faster-whisper-large-v3-turbo \
    --copy_files tokenizer.json preprocessor_config.json --quantization float16
```

Note that the model weights are saved in FP16. This type can be changed when the model is loaded using the [`compute_type` option in CTranslate2](https://opennmt.net/CTranslate2/quantization.html).
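For illustration, here is a minimal sketch (not part of the upstream model card) of loading the locally converted model with an explicit device and a lighter `compute_type`; the directory name matches the one created by `download_model.py`, while `audio.wav` and the chosen compute type are placeholder assumptions:

```python
from faster_whisper import WhisperModel

# Load the local CTranslate2 conversion; compute_type overrides the stored FP16 weights.
# "int8" is CPU-friendly, "float16" is the usual GPU choice; support depends on your hardware.
model = WhisperModel(
    "faster-whisper-large-v3-turbo-ct2",  # local directory created by download_model.py
    device="cpu",                         # or "cuda"
    compute_type="int8",
)

segments, _info = model.transcribe("audio.wav", beam_size=5)
for segment in segments:
    print(f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}")
```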
## More information

**For more information about the original model, see its [model card](https://huggingface.co/openai/whisper-large-v3).**
faster-whisper-large-v3-turbo-ct2/config.json
ADDED
@@ -0,0 +1,360 @@
{
  "alignment_heads": [
    [2, 0], [2, 1], [2, 2], [2, 3], [2, 4], [2, 5], [2, 6], [2, 7], [2, 8], [2, 9],
    [2, 10], [2, 11], [2, 12], [2, 13], [2, 14], [2, 15], [2, 16], [2, 17], [2, 18], [2, 19],
    [3, 0], [3, 1], [3, 2], [3, 3], [3, 4], [3, 5], [3, 6], [3, 7], [3, 8], [3, 9],
    [3, 10], [3, 11], [3, 12], [3, 13], [3, 14], [3, 15], [3, 16], [3, 17], [3, 18], [3, 19]
  ],
  "lang_ids": [
    50259, 50260, 50261, 50262, 50263, 50264, 50265, 50266, 50267, 50268,
    50269, 50270, 50271, 50272, 50273, 50274, 50275, 50276, 50277, 50278,
    50279, 50280, 50281, 50282, 50283, 50284, 50285, 50286, 50287, 50288,
    50289, 50290, 50291, 50292, 50293, 50294, 50295, 50296, 50297, 50298,
    50299, 50300, 50301, 50302, 50303, 50304, 50305, 50306, 50307, 50308,
    50309, 50310, 50311, 50312, 50313, 50314, 50315, 50316, 50317, 50318,
    50319, 50320, 50321, 50322, 50323, 50324, 50325, 50326, 50327, 50328,
    50329, 50330, 50331, 50332, 50333, 50334, 50335, 50336, 50337, 50338,
    50339, 50340, 50341, 50342, 50343, 50344, 50345, 50346, 50347, 50348,
    50349, 50350, 50351, 50352, 50353, 50354, 50355, 50356, 50357, 50358
  ],
  "suppress_ids": [
    1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63,
    90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350,
    1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667,
    6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562,
    13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075,
    21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470,
    36865, 42863, 47425, 49870, 50254, 50258, 50359, 50360, 50361, 50362, 50363
  ],
  "suppress_ids_begin": [220, 50257]
}
faster-whisper-large-v3-turbo-ct2/preprocessor_config.json
ADDED
@@ -0,0 +1,14 @@
{
  "chunk_length": 30,
  "feature_extractor_type": "WhisperFeatureExtractor",
  "feature_size": 128,
  "hop_length": 160,
  "n_fft": 400,
  "n_samples": 480000,
  "nb_max_frames": 3000,
  "padding_side": "right",
  "padding_value": 0.0,
  "processor_class": "WhisperProcessor",
  "return_attention_mask": false,
  "sampling_rate": 16000
}
faster-whisper-large-v3-turbo-ct2/vocabulary.json
ADDED
The diff for this file is too large to render.
gen_english.py
ADDED
@@ -0,0 +1,39 @@
import argparse
import os

ROOT_DIR = os.path.dirname(os.path.abspath(__file__))

# ffmpeg_path = f"{ROOT_DIR}/bin"  # replace with your FFmpeg bin directory
# os.environ["PATH"] = os.environ.get("PATH", "") + os.pathsep + ffmpeg_path

import gradio as gr

from utils import movie2audio, make_srt, make_tran, merge_sub, make_tran_zh2en, make_tran_ja2zh, make_tran_ko2zh, make_srt_sv, make_tran_qwen2, make_tran_deep

from subtitle_to_audio import generate_audio
import pyttsx3

engine = pyttsx3.init()
voices = engine.getProperty('voices')  # getting details of current voice
vlist = []
num = 0
for voice in voices:
    print(" - Name: %s" % voice.name)
    vlist.append((voice.name, num))
    num += 1

def do_pyttsx3(srt, speed, voice):
    print(srt, speed, voice)
    voice = int(voice)
    generate_audio(path=srt, rate=int(speed), voice_idx=voice)
    return f"output/{vlist[voice][0]}.wav"

if __name__ == '__main__':
    do_pyttsx3("./output/eng.srt", 240, 3)
img/sample.png
ADDED
model_from_hg/model here
ADDED
@@ -0,0 +1,26 @@
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from moviepy.editor import VideoFileClip

model_dir_cirm = './models_from_modelscope/damo/speech_frcrn_ans_cirm_16k'


# Extract the human voice from a video
def movie2audio(video_path):

    # Read the video file
    video = VideoFileClip(video_path)

    # Extract the audio track from the video
    audio = video.audio

    # Save the audio as a WAV file
    audio.write_audiofile("./audio.wav")

    ans = pipeline(
        Tasks.acoustic_noise_suppression,
        model=model_dir_cirm)

    ans('./audio.wav', output_path='./output.wav')

    return "./output.wav"
models_from_modelscope/damo/speech_frcrn_ans_cirm_16k/.mdl
ADDED
Binary file (53 Bytes).
models_from_modelscope/damo/speech_frcrn_ans_cirm_16k/.msc
ADDED
Binary file (616 Bytes).
models_from_modelscope/damo/speech_frcrn_ans_cirm_16k/README.md
ADDED
@@ -0,0 +1,285 @@
---
tasks:
- acoustic-noise-suppression
widgets:
- task: acoustic-noise-suppression
  inputs:
  - type: audio
    name: input
    title: 带噪音的原始音频
    validator:
      max_size: 10M
  examples:
  - name: 1
    title: 示例1
    inputs:
    - name: input
      data: git://examples/speech_with_noise1.wav
  - name: 2
    title: 示例2
    inputs:
    - name: input
      data: git://examples/speech_with_noise.wav
  inferencespec:
    cpu: 1
    memory: 1000
    gpu: 0
    gpu_memory: 1000
model_type:
- complex-nn
domain:
- audio
frameworks:
- pytorch
model-backbone:
- frcrn
customized-quickstart: True
finetune-support: True
license: Apache License 2.0
tags:
- Alibaba
- Mind DNS
- ANS
- AI降噪
- 语音增强
- 音频前处理
- 3A
datasets:
  train:
  - modelscope/ICASSP_2021_DNS_Challenge
  evaluation:
  - modelscope/ICASSP_2021_DNS_Challenge
---

# FRCRN speech-denoising model

We often run into poor recording conditions: you want clean speech, but the surroundings are noisy and the recording picks up that noise; on a loud subway or bus you have to raise your voice just to be heard. Environmental noise gets in the way of voice applications and is a common, stubborn problem in speech communication. Speech quality and intelligibility are easily degraded by ambient noise, recording devices, reverberation and echo, which lowers call quality and communication efficiency; keeping both high in noisy environments has long been a goal for industry and academia.

After years of research, speech denoising has made real progress. For complex environments in particular, complex-domain deep learning has brought large performance gains: background noise can be suppressed as much as possible while keeping speech distortion low and restoring the clarity of the target speech. For this reason, denoising models are also commonly called speech-enhancement models.

A denoising model extracts the target speech from a corrupted mixture, restores its quality and intelligibility, and also improves downstream speech recognition. This model only needs single-microphone audio as input and outputs the denoised speech: the audio format is unchanged, only noise and reverberation are removed, and the original speech is preserved as much as possible.

## Model description

The FRCRN denoising model is built on the frequency-recurrent CRN (FRCRN) framework. Starting from a convolutional encoder-decoder architecture, it adds recurrent layers to form a convolutional recurrent encoder-decoder, which clearly mitigates the limited receptive field of convolution kernels and improves feature representation along the frequency axis, especially for long-range correlations across frequencies, so noise can be removed while the speech itself is identified and protected more precisely.

In addition, a feedforward sequential memory network (FSMN) is introduced to reduce the complexity of the recurrent part, and fully complex-valued network operations are used, so long speech sequences are modeled more effectively and both magnitude and phase are enhanced at the same time; related models performed well in the IEEE/INTERSpeech DNS Challenge. The released model is further optimized over the challenge version, using two cascaded U-Nets plus SE layers for more stable results. If you need a causal model, you can modify the code yourself and replace the SE layers with convolutional layers or add masking.

The network structure of the model is shown below.



Both input and output are 16 kHz, single-channel time-domain waveforms; the input can be recorded directly with a single microphone, and the output is the noise-suppressed speech signal [1]. The input is converted to complex spectral features via STFT; a complex FSMN models correlations along frequency and long-range dependencies along time and predicts a complex ideal ratio mask; the predicted mask is multiplied with the input spectrum, and the enhanced waveform is recovered with the inverse STFT.

## Intended usage and scope

### How to use

Once ModelScope is installed, you can run inference with ```speech_frcrn_ans_cirm_16k```. Input and output are 16 kHz single-channel waveforms; the input can be recorded directly with a single microphone and the output is the noise-suppressed speech. For convenience, the pipeline wraps the model with wav-file handling, so it can read a wav file directly and save the result to a specified wav file.

#### Environment setup

* The model works on Linux, Windows and macOS.
* It has been tested with PyTorch 1.8-1.11 and 1.13. Because of a PyTorch v1.12 [bug](https://github.com/pytorch/pytorch/issues/80837) it cannot run on v1.12; upgrade, or roll back to v1.11 with the command below.

```
conda install pytorch==1.11 torchaudio torchvision -c pytorch
```

* The pipeline uses the third-party library SoundFile for wav handling. **On Linux you need to install SoundFile's underlying dependency libsndfile manually**; on Windows and macOS it is installed automatically. See the [SoundFile documentation](https://github.com/bastibe/python-soundfile#installation). On Ubuntu, for example:

```shell
sudo apt-get update
sudo apt-get install libsndfile1
```

#### Code example

```python
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks


ans = pipeline(
    Tasks.acoustic_noise_suppression,
    model='damo/speech_frcrn_ans_cirm_16k')
result = ans(
    'https://modelscope.oss-cn-beijing.aliyuncs.com/test/audios/speech_with_noise1.wav',
    output_path='output.wav')
```

### Limitations and possible bias

Noise suppression degrades to varying degrees in scenes with interfering speech from multiple speakers.

## Training data

The training data comes from the open DNS-Challenge dataset provided by the Microsoft team for the ICASSP challenges ([official site](https://github.com/microsoft/DNS-Challenge)) [2]. Since this model targets 16 kHz audio, only the fullband portion is used, with minor adjustments. For convenience, the DNS Challenge 2020 data has been mirrored on ModelScope's [DatasetHub](https://modelscope.cn/datasets/modelscope/ICASSP_2021_DNS_Challenge/summary); see the dataset documentation for download instructions.

## Training procedure

### Copy the official model
To train your own denoising model you first need a copy of the official model. The ModelScope framework caches official models locally by default; copy the cached model directory into your working directory.

Check the directory ./speech_frcrn_ans_cirm_16k; pytorch_model.bin is the model file. If you want to train a brand-new model from scratch, delete pytorch_model.bin so it is not loaded at run time; keep it if you want to continue training from the official weights.

```bash
cp -r ~/.cache/modelscope/hub/damo/speech_frcrn_ans_cirm_16k ./
cd ./speech_frcrn_ans_cirm_16k
rm pytorch_model.bin
```

configuration.json in that directory holds the model and training settings; only modify it once you are thoroughly familiar with the code.

### Run the training code

The example training code is listed below; two paths must be replaced with your local ones:

1. Replace `/your_local_path/ICASSP_2021_DNS_Challenge` with the local path of the dataset you downloaded earlier
2. Replace the model path with the path of your copied official model

```python
import os

from datasets import load_dataset

from modelscope.metainfo import Trainers
from modelscope.msdatasets import MsDataset
from modelscope.trainers import build_trainer
from modelscope.utils.audio.audio_utils import to_segment

tmp_dir = './checkpoint'
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir)

hf_ds = load_dataset(
    '/your_local_path/ICASSP_2021_DNS_Challenge',
    'train',
    split='train')
mapped_ds = hf_ds.map(
    to_segment,
    remove_columns=['duration'],
    num_proc=8,
    batched=True,
    batch_size=36)
mapped_ds = mapped_ds.train_test_split(test_size=3000)
mapped_ds = mapped_ds.shuffle()
dataset = MsDataset.from_hf_dataset(mapped_ds)

kwargs = dict(
    model='your_local_path/speech_frcrn_ans_cirm_16k',
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    work_dir=tmp_dir)
trainer = build_trainer(
    Trainers.speech_frcrn_ans_cirm_16k, default_args=kwargs)
trainer.train()
```

Training runs for 200 epochs by default, with 2000 batches per epoch; the trained models are saved to the directory given by tmp_dir = './checkpoint' in the code. That directory also contains a log file recording each model's training and test loss.

### Use your model

Pick the best model from your training run, copy it to `/your_local_path/speech_frcrn_ans_cirm_16k`, and rename it `pytorch_model.bin`.
Replace the model path `/your_local_path/speech_frcrn_ans_cirm_16k` in the code below with your copied model directory and you can test your own model.

```python
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks


ans = pipeline(
    Tasks.acoustic_noise_suppression,
    model='/your_local_path/speech_frcrn_ans_cirm_16k')
result = ans(
    'https://modelscope.oss-cn-beijing.aliyuncs.com/test/audios/speech_with_noise.wav',
    output_path='output.wav')
```

The http address in the code can also be replaced with a local audio file path; note that the supported format is 16000 Hz, 16-bit, single-channel wav. If you have several files to process, simply call ans() in a loop. For multithreaded processing, call pipeline() inside each thread to initialize its own ans object, as sketched below.
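A minimal sketch of that batch loop, reusing one pipeline object for every file; the `noisy/` input directory and `denoised/` output directory are hypothetical names for this example:

```python
import glob
import os

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# Initialize the pipeline once and reuse it for every file
# (with multithreading, create one per thread instead).
ans = pipeline(
    Tasks.acoustic_noise_suppression,
    model='damo/speech_frcrn_ans_cirm_16k')

os.makedirs('denoised', exist_ok=True)
for wav_path in glob.glob('noisy/*.wav'):
    out_path = os.path.join('denoised', os.path.basename(wav_path))
    ans(wav_path, output_path=out_path)  # writes the denoised audio under the same file name
```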
## Evaluation and results

Comparison with other SOTA models on the official DNS Challenge 2020 test set:



Metric notes:

* PESQ (Perceptual Evaluation Of Speech Quality): an objective, full-reference measure of speech quality; scores range from -0.5 to 4.5, and higher scores mean better quality.
* STOI (Short-Time Objective Intelligibility): an objective measure of how intelligible the speech is to human listeners; values lie between 0 and 1, and higher values mean clearer, more intelligible speech.
* SI-SNR (Scale-Invariant Signal-to-Noise Ratio): a signal-to-noise ratio normalized to remove the effect of signal scaling, commonly used to evaluate speech-enhancement algorithms against wideband noise distortion.
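As a concrete reference for the SI-SNR definition above, here is a small NumPy sketch (not part of the original model card) that computes SI-SNR between a clean reference and an enhanced estimate:

```python
import numpy as np

def si_snr(reference: np.ndarray, estimate: np.ndarray, eps: float = 1e-8) -> float:
    """Scale-invariant SNR in dB between a clean reference and an enhanced estimate."""
    reference = reference - reference.mean()
    estimate = estimate - estimate.mean()
    # Project the estimate onto the reference to get the scaled target component.
    scale = np.dot(estimate, reference) / (np.dot(reference, reference) + eps)
    target = scale * reference
    noise = estimate - target
    return 10.0 * np.log10((np.sum(target ** 2) + eps) / (np.sum(noise ** 2) + eps))
```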
The DNS Challenge results list is available [here](https://www.microsoft.com/en-us/research/academic-program/deep-noise-suppression-challenge-icassp-2022/results/).

### Evaluation code
The model can be evaluated with the code below. The DNS Challenge 2020 validation set is hosted on ModelScope's [DatasetHub](https://modelscope.cn/datasets/modelscope/ICASSP_2021_DNS_Challenge/summary) for easy download.

```python
import os
import tempfile

from modelscope.metainfo import Trainers
from modelscope.msdatasets import MsDataset
from modelscope.trainers import build_trainer
from modelscope.utils.audio.audio_utils import to_segment

tmp_dir = tempfile.TemporaryDirectory().name
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir)

hf_ds = MsDataset.load(
    'ICASSP_2021_DNS_Challenge', split='test').to_hf_dataset()
mapped_ds = hf_ds.map(
    to_segment,
    remove_columns=['duration'],
    # num_proc=5, # Comment this line to avoid error in Jupyter notebook
    batched=True,
    batch_size=36)
dataset = MsDataset.from_hf_dataset(mapped_ds)
kwargs = dict(
    model='damo/speech_frcrn_ans_cirm_16k',
    model_revision='beta',
    train_dataset=None,
    eval_dataset=dataset,
    val_iters_per_epoch=125,
    work_dir=tmp_dir)

trainer = build_trainer(
    Trainers.speech_frcrn_ans_cirm_16k, default_args=kwargs)

eval_res = trainer.evaluate()
print(eval_res['avg_sisnr'])
```

See the related papers below for more details.

### Related papers and citations

[1]

```BibTeX
@INPROCEEDINGS{9747578,
  author={Zhao, Shengkui and Ma, Bin and Watcharasupat, Karn N. and Gan, Woon-Seng},
  booktitle={ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  title={FRCRN: Boosting Feature Representation Using Frequency Recurrence for Monaural Speech Enhancement},
  year={2022},
  pages={9281-9285},
  doi={10.1109/ICASSP43922.2022.9747578}}
```

[2]

```BibTeX
@INPROCEEDINGS{9747230,
  author={Dubey, Harishchandra and Gopal, Vishak and Cutler, Ross and Aazami, Ashkan and Matusevych, Sergiy and Braun, Sebastian and Eskimez, Sefik Emre and Thakker, Manthan and Yoshioka, Takuya and Gamper, Hannes and Aichner, Robert},
  booktitle={ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  title={Icassp 2022 Deep Noise Suppression Challenge},
  year={2022},
  pages={9271-9275},
  doi={10.1109/ICASSP43922.2022.9747230}}
```
models_from_modelscope/damo/speech_frcrn_ans_cirm_16k/configuration.json
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
{
    "framework": "pytorch",
    "task": "acoustic-noise-suppression",
    "pipeline": {
        "type": "speech_frcrn_ans_cirm_16k"
    },
    "model": {
        "type": "speech_frcrn_ans_cirm_16k",
        "complex": true,
        "model_complexity": 45,
        "model_depth": 14,
        "log_amp": false,
        "padding_mode": "zeros",
        "win_len": 640,
        "win_inc": 320,
        "fft_len": 640,
        "win_type": "hann"
    },
    "preprocessor": {},
    "train": {
        "max_epochs": 200,
        "train_iters_per_epoch": 2000,
        "dataloader": {
            "batch_size_per_gpu": 12,
            "workers_per_gpu": 0
        },
        "seed": 20,
        "optimizer": {
            "type": "Adam",
            "lr": 0.001,
            "weight_decay": 0.00001,
            "options": {
                "grad_clip": {
                    "max_norm": 10.0
                }
            }
        },
        "lr_scheduler": {
            "type": "ReduceLROnPlateau",
            "mode": "min",
            "factor": 0.98,
            "patience": 2,
            "verbose": true
        },
        "lr_scheduler_hook": {
            "type": "PlateauLrSchedulerHook",
            "metric_key": "avg_loss"
        },
        "hooks": [
            {
                "type": "EvaluationHook",
                "interval": 1
            }
        ]
    },
    "evaluation": {
        "val_iters_per_epoch": 200,
        "dataloader": {
            "batch_size_per_gpu": 12,
            "workers_per_gpu": 0
        },
        "metrics": ["audio-noise-metric"]
    }
}
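For orientation, the `win_len`, `win_inc`, and `fft_len` values above are sample counts at the model's 16 kHz rate, i.e. a 40 ms analysis window with a 20 ms hop; a quick check (illustrative only):

```python
sr = 16000                   # model sample rate (16 kHz)
win_len, win_inc = 640, 320  # values from the config above
print(win_len / sr * 1000)   # 40.0 ms window
print(win_inc / sr * 1000)   # 20.0 ms hop (50% overlap)
```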
models_from_modelscope/damo/speech_frcrn_ans_cirm_16k/description/matrix.png
ADDED
models_from_modelscope/damo/speech_frcrn_ans_cirm_16k/description/model.png
ADDED
models_from_modelscope/damo/speech_frcrn_ans_cirm_16k/faq.md
ADDED
@@ -0,0 +1,11 @@
## Q: The audio produced by the model sounds wrong?
A: First confirm that the input is a 16 kHz, single-channel WAV file and that its content is noisy speech.

## Q: CPU inference with this model is slow. What can I do?
A: This version of the FRCRN denoising model is computationally heavy, so processing on CPU takes a relatively long time, and with the model unchanged there is no good way to optimize it. Use a GPU instead; it is usually several to tens of times faster than CPU, although the first GPU call has to initialize CUDA and therefore takes longer than subsequent calls.

## Q: Can the model be exported to ONNX?
A: Export is not supported.

## Q: Training is very slow — one epoch takes about 10 hours. Is that normal?
A: No, that is not normal. The training pipeline currently defaults to a single GPU, and a single V100 normally finishes one epoch in about 40 minutes. Check CPU and GPU utilization while training.
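As a companion to the GPU advice above, a minimal sketch of running the denoiser through the ModelScope pipeline (assumes a CUDA-enabled install and a 16 kHz mono `noisy.wav`; the `device` argument follows the ModelScope pipeline API):

```python
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# Build the denoising pipeline on GPU; pass device='cpu' if no CUDA device is present.
ans = pipeline(Tasks.acoustic_noise_suppression,
               model='damo/speech_frcrn_ans_cirm_16k',
               device='gpu')
ans('noisy.wav', output_path='denoised.wav')  # writes the enhanced audio to denoised.wav
```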
requirements.txt
ADDED
@@ -0,0 +1,16 @@
faster-whisper
ffmpeg-python
gradio
modelscope==1.10.0
moviepy==1.0.3
transformers==4.36.2
sentencepiece
librosa
tensorflow
sacremoses
subword_nmt
jieba
funasr>=1.1.1
ollama
pysub-parser==1.7.0
pyttsx3==2.90
slicer2.py
ADDED
@@ -0,0 +1,186 @@
import numpy as np


# This function is obtained from librosa.
def get_rms(
    y,
    *,
    frame_length=2048,
    hop_length=512,
    pad_mode="constant",
):
    padding = (int(frame_length // 2), int(frame_length // 2))
    y = np.pad(y, padding, mode=pad_mode)

    axis = -1
    # put our new within-frame axis at the end for now
    out_strides = y.strides + tuple([y.strides[axis]])
    # Reduce the shape on the framing axis
    x_shape_trimmed = list(y.shape)
    x_shape_trimmed[axis] -= frame_length - 1
    out_shape = tuple(x_shape_trimmed) + tuple([frame_length])
    xw = np.lib.stride_tricks.as_strided(
        y, shape=out_shape, strides=out_strides
    )
    if axis < 0:
        target_axis = axis - 1
    else:
        target_axis = axis + 1
    xw = np.moveaxis(xw, -1, target_axis)
    # Downsample along the target axis
    slices = [slice(None)] * xw.ndim
    slices[axis] = slice(0, None, hop_length)
    x = xw[tuple(slices)]

    # Calculate power
    power = np.mean(np.abs(x) ** 2, axis=-2, keepdims=True)

    return np.sqrt(power)


class Slicer:
    def __init__(self,
                 sr: int,
                 threshold: float = -40.,
                 min_length: int = 5000,
                 min_interval: int = 300,
                 hop_size: int = 20,
                 max_sil_kept: int = 5000):
        if not min_length >= min_interval >= hop_size:
            raise ValueError('The following condition must be satisfied: min_length >= min_interval >= hop_size')
        if not max_sil_kept >= hop_size:
            raise ValueError('The following condition must be satisfied: max_sil_kept >= hop_size')
        min_interval = sr * min_interval / 1000
        self.threshold = 10 ** (threshold / 20.)
        self.hop_size = round(sr * hop_size / 1000)
        self.win_size = min(round(min_interval), 4 * self.hop_size)
        self.min_length = round(sr * min_length / 1000 / self.hop_size)
        self.min_interval = round(min_interval / self.hop_size)
        self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size)

    def _apply_slice(self, waveform, begin, end):
        if len(waveform.shape) > 1:
            return waveform[:, begin * self.hop_size: min(waveform.shape[1], end * self.hop_size)]
        else:
            return waveform[begin * self.hop_size: min(waveform.shape[0], end * self.hop_size)]

    # @timeit
    def slice(self, waveform):
        if len(waveform.shape) > 1:
            samples = waveform.mean(axis=0)
        else:
            samples = waveform
        if (samples.shape[0] + self.hop_size - 1) // self.hop_size <= self.min_length:
            return [waveform]
        rms_list = get_rms(y=samples, frame_length=self.win_size, hop_length=self.hop_size).squeeze(0)
        sil_tags = []
        silence_start = None
        clip_start = 0
        for i, rms in enumerate(rms_list):
            # Keep looping while frame is silent.
            if rms < self.threshold:
                # Record start of silent frames.
                if silence_start is None:
                    silence_start = i
                continue
            # Keep looping while frame is not silent and silence start has not been recorded.
            if silence_start is None:
                continue
            # Clear recorded silence start if interval is not enough or clip is too short
            is_leading_silence = silence_start == 0 and i > self.max_sil_kept
            need_slice_middle = i - silence_start >= self.min_interval and i - clip_start >= self.min_length
            if not is_leading_silence and not need_slice_middle:
                silence_start = None
                continue
            # Need slicing. Record the range of silent frames to be removed.
            if i - silence_start <= self.max_sil_kept:
                pos = rms_list[silence_start: i + 1].argmin() + silence_start
                if silence_start == 0:
                    sil_tags.append((0, pos))
                else:
                    sil_tags.append((pos, pos))
                clip_start = pos
            elif i - silence_start <= self.max_sil_kept * 2:
                pos = rms_list[i - self.max_sil_kept: silence_start + self.max_sil_kept + 1].argmin()
                pos += i - self.max_sil_kept
                pos_l = rms_list[silence_start: silence_start + self.max_sil_kept + 1].argmin() + silence_start
                pos_r = rms_list[i - self.max_sil_kept: i + 1].argmin() + i - self.max_sil_kept
                if silence_start == 0:
                    sil_tags.append((0, pos_r))
                    clip_start = pos_r
                else:
                    sil_tags.append((min(pos_l, pos), max(pos_r, pos)))
                    clip_start = max(pos_r, pos)
            else:
                pos_l = rms_list[silence_start: silence_start + self.max_sil_kept + 1].argmin() + silence_start
                pos_r = rms_list[i - self.max_sil_kept: i + 1].argmin() + i - self.max_sil_kept
                if silence_start == 0:
                    sil_tags.append((0, pos_r))
                else:
                    sil_tags.append((pos_l, pos_r))
                clip_start = pos_r
            silence_start = None
        # Deal with trailing silence.
        total_frames = rms_list.shape[0]
        if silence_start is not None and total_frames - silence_start >= self.min_interval:
            silence_end = min(total_frames, silence_start + self.max_sil_kept)
            pos = rms_list[silence_start: silence_end + 1].argmin() + silence_start
            sil_tags.append((pos, total_frames + 1))
        # Apply and return slices.
        if len(sil_tags) == 0:
            return [waveform]
        else:
            chunks = []
            if sil_tags[0][0] > 0:
                chunks.append(self._apply_slice(waveform, 0, sil_tags[0][0]))
            for i in range(len(sil_tags) - 1):
                chunks.append(self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0]))
            if sil_tags[-1][1] < total_frames:
                chunks.append(self._apply_slice(waveform, sil_tags[-1][1], total_frames))
            return chunks


def main():
    import os.path
    from argparse import ArgumentParser

    import librosa
    import soundfile

    parser = ArgumentParser()
    parser.add_argument('audio', type=str, help='The audio to be sliced')
    parser.add_argument('--out', type=str, help='Output directory of the sliced audio clips')
    parser.add_argument('--db_thresh', type=float, required=False, default=-40,
                        help='The dB threshold for silence detection')
    parser.add_argument('--min_length', type=int, required=False, default=1500,
                        help='The minimum milliseconds required for each sliced audio clip')
    parser.add_argument('--min_interval', type=int, required=False, default=300,
                        help='The minimum milliseconds for a silence part to be sliced')
    parser.add_argument('--hop_size', type=int, required=False, default=10,
                        help='Frame length in milliseconds')
    parser.add_argument('--max_sil_kept', type=int, required=False, default=500,
                        help='The maximum silence length kept around the sliced clip, presented in milliseconds')
    args = parser.parse_args()
    out = args.out
    if out is None:
        out = os.path.dirname(os.path.abspath(args.audio))
    audio, sr = librosa.load(args.audio, sr=None, mono=False)
    slicer = Slicer(
        sr=sr,
        threshold=args.db_thresh,
        min_length=args.min_length,
        min_interval=args.min_interval,
        hop_size=args.hop_size,
        max_sil_kept=args.max_sil_kept
    )
    chunks = slicer.slice(audio)
    if not os.path.exists(out):
        os.makedirs(out)
    for i, chunk in enumerate(chunks):
        if len(chunk.shape) > 1:
            chunk = chunk.T
        soundfile.write(os.path.join(out, f'%s_%d.wav' % (os.path.basename(args.audio).rsplit('.', maxsplit=1)[0], i)), chunk, sr)


if __name__ == '__main__':
    main()
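Besides the CLI entry point above, the Slicer class can be driven directly from Python; a short sketch (assumes an `input.wav` next to the script):

```python
import librosa
import soundfile
from slicer2 import Slicer

audio, sr = librosa.load('input.wav', sr=None, mono=False)  # keep original rate/channels
slicer = Slicer(sr=sr, threshold=-40, min_length=1500,
                min_interval=300, hop_size=10, max_sil_kept=500)
for i, chunk in enumerate(slicer.slice(audio)):
    if len(chunk.shape) > 1:
        chunk = chunk.T                      # soundfile expects (frames, channels)
    soundfile.write(f'chunk_{i}.wav', chunk, sr)
```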
subtitle_to_audio.py
ADDED
@@ -0,0 +1,65 @@
import os
import tempfile
import argparse
from pysubparser import parser
from pydub import AudioSegment
import pyttsx3


engine = pyttsx3.init()
voices = engine.getProperty('voices')  # getting details of current voice
vlist = []
for voice in voices:
    vlist.append(voice.name)

def time_to_ms(time):
    return ((time.hour * 60 + time.minute) * 60 + time.second) * 1000 + time.microsecond / 1000

def generate_audio(path, rate=200, voice_idx=0):
    print("Generating audio file for {} with {}".format(path, "pyttsx3"))

    subtitles = parser.parse(path)

    tts_engine = pyttsx3.init()
    tts_engine.setProperty('rate', rate)
    tts_engine.setProperty('voice', tts_engine.getProperty('voices')[voice_idx].id)

    audio_sum = AudioSegment.empty()

    with tempfile.TemporaryDirectory() as tmpdirname:
        print('created temporary directory', tmpdirname)

        temp_file_path = os.path.join(tmpdirname, "temp.wav")
        prev_subtitle = None
        prev_audio_duration_ms = 0
        for subtitle in subtitles:
            tts_engine.save_to_file(subtitle.text, temp_file_path)
            tts_engine.runAndWait()

            audio_segment = AudioSegment.from_wav(temp_file_path)

            print(subtitle.start, subtitle.text)

            if prev_subtitle is None:
                silence_duration_ms = time_to_ms(subtitle.start)
            else:
                silence_duration_ms = time_to_ms(subtitle.start) - time_to_ms(prev_subtitle.start) - prev_audio_duration_ms

            audio_sum = audio_sum + AudioSegment.silent(duration=silence_duration_ms) + audio_segment

            prev_subtitle = subtitle
            prev_audio_duration_ms = len(audio_segment)

        with open(f'output/{vlist[voice_idx]}.wav', 'wb') as out_f:
            audio_sum.export(out_f, format='wav')

if __name__ == "__main__":
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("-p", "--path", help="subtitle file path", default="two_single.srt")
    arg_parser.add_argument("-r", "--rate", help="speech rate(words per minute)", type=int, default=240)
    arg_parser.add_argument("-v", "--voice-idx", help="voice selection", type=int, default=1, choices=[0, 1])

    args = arg_parser.parse_args()

    generate_audio(path=args.path, rate=args.rate, voice_idx=args.voice_idx)
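The silence padding above is what keeps each synthesized clip anchored to its subtitle's start time; a small worked example of the arithmetic with made-up numbers:

```python
# Subtitle A starts at 1.0 s and its TTS clip lasts 2.5 s; subtitle B starts at 4.0 s.
prev_start_ms, prev_audio_ms, next_start_ms = 1000, 2500, 4000
gap_ms = next_start_ms - prev_start_ms - prev_audio_ms
print(gap_ms)  # 500 ms of silence is inserted before clip B
```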
test_deep.py
ADDED
@@ -0,0 +1,15 @@
import httpx, json

deeplx_api = "http://127.0.0.1:1188/translate"

data = {
    "text": "Hello World",
    "source_lang": "EN",
    "target_lang": "ZH"
}

# JA KO

post_data = json.dumps(data)
r = httpx.post(url = deeplx_api, data = post_data).json()
print(r["data"])
test_ollama.py
ADDED
@@ -0,0 +1,9 @@
import ollama


response = ollama.chat(model='qwen2:7b',messages=[
    {
        'role':'user',
        'content':'"you fucked up , bitch" 翻译为中文,只给我文本的翻译,别添加其他的内容,因为我要做字幕,谢谢'
    }])
print(response['message']['content'])
test_turbo.py
ADDED
@@ -0,0 +1,7 @@
from faster_whisper import WhisperModel

model = WhisperModel("faster-whisper-large-v3-turbo-ct2")

segments, info = model.transcribe("audio.wav")
for segment in segments:
    print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
utils.py
ADDED
@@ -0,0 +1,802 @@
from modelscope.pipelines import pipeline as pipeline_ali
from modelscope.utils.constant import Tasks
from moviepy.editor import VideoFileClip

import httpx, json

import os
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))

import ffmpeg

from faster_whisper import WhisperModel
import math

import torch

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

from slicer2 import Slicer

import librosa

import soundfile

from funasr import AutoModel

from funasr.utils.postprocess_utils import rich_transcription_postprocess

# Local model directory
local_dir_root = "./models_from_modelscope"

# model_dir_cirm = snapshot_download('damo/speech_frcrn_ans_cirm_16k', cache_dir=local_dir_root)

# model_dir_ins = snapshot_download('damo/nlp_csanmt_translation_en2zh', cache_dir=local_dir_root)

model_dir_cirm = f'{ROOT_DIR}/models_from_modelscope/damo/speech_frcrn_ans_cirm_16k'

model_dir_ins = f'{ROOT_DIR}/models_from_modelscope/damo/nlp_csanmt_translation_en2zh'

device = "cuda" if torch.cuda.is_available() else "cpu"

import ollama


def deep_tran(text, _s, _t):

    deeplx_api = "http://127.0.0.1:1188/translate"

    data = {
        "text": text,
        "source_lang": _s,
        "target_lang": _t
    }

    post_data = json.dumps(data)
    r = httpx.post(url=deeplx_api, data=post_data).json()
    print(r["data"])
    return r["data"]

# Merge subtitles into the video
def merge_sub(video_path, srt_path):

    if os.path.exists("test_srt.mp4"):
        os.remove("test_srt.mp4")

    ffmpeg.input(video_path).output("test_srt.mp4", vf="subtitles=" + srt_path).run()

    return "test_srt.mp4"


def make_tran_ja2zh_neverLife(srt_path):

    model_path = "neverLife/nllb-200-distilled-600M-ja-zh"

    model = AutoModelForSeq2SeqLM.from_pretrained(model_path, from_pt=True)
    tokenizer = AutoTokenizer.from_pretrained(model_path, src_lang="jpn_Jpan", tgt_lang="zho_Hans", from_pt=True)

    # pipe = pipeline(model="larryvrh/mt5-translation-ja_zh")

    with open(srt_path, 'r', encoding="utf-8") as file:
        gweight_data = file.read()

    result = gweight_data.split("\n\n")

    if os.path.exists("./two.srt"):
        os.remove("./two.srt")

    for res in result:

        line_srt = res.split("\n")

        try:
            # translated_text = pipe(f'<-ja2zh-> {line_srt[2]}')[0]['translation_text']
            # print(translated_text)
            input_ids = tokenizer.encode(line_srt[2], max_length=128, padding=True, return_tensors='pt')
            outputs = model.generate(input_ids, num_beams=4, max_new_tokens=128)
            translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
            print(translated_text)

        except IndexError as e:
            # Index out of range: reached the end of the SRT file
            print(f"翻译完毕")
            break
        except Exception as e:
            print(str(e))

        with open("./two.srt", "a", encoding="utf-8")as f:f.write(f"{line_srt[0]}\n{line_srt[1]}\n{line_srt[2]}\n{translated_text}\n\n")

    with open("./two.srt", "r", encoding="utf-8") as f:
        content = f.read()

    return content


def make_tran_ko2zh(srt_path):

    # pipe = pipeline(model="yesj1234/mbart_cycle1_ko-zh",device=device,from_pt=True)

    model_path = "./model_from_hg/ko-zh/"

    tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path, local_files_only=True)

    with open(srt_path, 'r', encoding="utf-8") as file:
        gweight_data = file.read()

    result = gweight_data.split("\n\n")

    if os.path.exists("./two.srt"):
        os.remove("./two.srt")

    for res in result:

        line_srt = res.split("\n")

        try:
            # translated_text = pipe(f'<-ja2zh-> {line_srt[2]}')[0]['translation_text']
            # print(translated_text)

            input_ids = tokenizer.encode(line_srt[2], max_length=128, padding=True, return_tensors='pt')
            outputs = model.generate(input_ids, num_beams=4, max_new_tokens=128)
            translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
            print(translated_text)

        except IndexError as e:
            # Index out of range: reached the end of the SRT file
            print(f"翻译完毕")
            break
        except Exception as e:
            print(str(e))

        with open("./two.srt", "a", encoding="utf-8")as f:f.write(f"{line_srt[0]}\n{line_srt[1]}\n{line_srt[2]}\n{translated_text}\n\n")

    with open("./two.srt", "r", encoding="utf-8") as f:
        content = f.read()

    return content

def make_tran_ja2zh(srt_path):

    # pipe = pipeline(model="larryvrh/mt5-translation-ja_zh",device=device)

    model_path = "./model_from_hg/ja-zh/"

    tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path, local_files_only=True)

    with open(srt_path, 'r', encoding="utf-8") as file:
        gweight_data = file.read()

    result = gweight_data.split("\n\n")

    if os.path.exists("./two.srt"):
        os.remove("./two.srt")

    for res in result:

        line_srt = res.split("\n")

        try:
            # translated_text = pipe(f'<-ja2zh-> {line_srt[2]}')[0]['translation_text']
            # print(translated_text)

            input_ids = tokenizer.encode(f'<-ja2zh-> {line_srt[2]}', max_length=128, padding=True, return_tensors='pt')
            outputs = model.generate(input_ids, num_beams=4, max_new_tokens=128)
            translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
            print(translated_text)

        except IndexError as e:
            # Index out of range: reached the end of the SRT file
            print(f"翻译完毕")
            break
        except Exception as e:
            print(str(e))

        with open("./two.srt", "a", encoding="utf-8")as f:f.write(f"{line_srt[0]}\n{line_srt[1]}\n{line_srt[2]}\n{translated_text}\n\n")

    with open("./two.srt", "r", encoding="utf-8") as f:
        content = f.read()

    return content


def make_tran_zh2en(srt_path):

    model_path = "./model_from_hg/zh-en/"

    tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)

    model = AutoModelForSeq2SeqLM.from_pretrained(model_path, local_files_only=True)

    with open(srt_path, 'r', encoding="utf-8") as file:
        gweight_data = file.read()

    result = gweight_data.split("\n\n")

    if os.path.exists("./two.srt"):
        os.remove("./two.srt")

    for res in result:

        line_srt = res.split("\n")
        try:

            tokenized_text = tokenizer.prepare_seq2seq_batch([line_srt[2]], return_tensors='pt')
            translation = model.generate(**tokenized_text)
            translated_text = tokenizer.batch_decode(translation, skip_special_tokens=False)[0]
            translated_text = translated_text.replace("<pad>", "").replace("</s>", "").strip()
            print(translated_text)

        except IndexError as e:
            # Index out of range: reached the end of the SRT file
            print(f"翻译完毕")
            break
        except Exception as e:
            print(str(e))

        with open("./two.srt", "a", encoding="utf-8")as f:f.write(f"{line_srt[0]}\n{line_srt[1]}\n{line_srt[2]}\n{translated_text}\n\n")

    with open("./two.srt", "r", encoding="utf-8") as f:
        content = f.read()

    return content


# Translate subtitles: English -> Chinese
def make_tran(srt_path):

    model_path = "./model_from_hg/en-zh/"

    tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)

    model = AutoModelForSeq2SeqLM.from_pretrained(model_path, local_files_only=True)

    with open(srt_path, 'r', encoding="utf-8") as file:
        gweight_data = file.read()

    result = gweight_data.split("\n\n")

    if os.path.exists("./two.srt"):
        os.remove("./two.srt")

    for res in result:

        line_srt = res.split("\n")
        try:

            tokenized_text = tokenizer.prepare_seq2seq_batch([line_srt[2]], return_tensors='pt')
            translation = model.generate(**tokenized_text)
            translated_text = tokenizer.batch_decode(translation, skip_special_tokens=False)[0]
            translated_text = translated_text.replace("<pad>", "").replace("</s>", "").strip()
            print(translated_text)

        except IndexError as e:
            # Index out of range: reached the end of the SRT file
            print(f"翻译完毕")
            break
        except Exception as e:
            print(str(e))

        with open("./two.srt", "a", encoding="utf-8")as f:f.write(f"{line_srt[0]}\n{line_srt[1]}\n{line_srt[2]}\n{translated_text}\n\n")

    with open("./two.srt", "r", encoding="utf-8") as f:
        content = f.read()

    return content


# Translate subtitles via DeepL (DeepLX)
def make_tran_deep(srt_path, _s, _t):

    with open(srt_path, 'r', encoding="utf-8") as file:
        gweight_data = file.read()

    result = gweight_data.split("\n\n")

    if os.path.exists(f"{ROOT_DIR}/output/two.srt"):
        os.remove(f"{ROOT_DIR}/output/two.srt")

    if os.path.exists(f"{ROOT_DIR}/output/t_sin_{_t}.srt"):
        os.remove(f"{ROOT_DIR}/output/t_sin_{_t}.srt")

    for res in result:

        line_srt = res.split("\n")

        try:
            text = line_srt[2]
            translated_text = deep_tran(text, _s, _t)

            with open(f"{ROOT_DIR}/output/two.srt", "a", encoding="utf-8")as f:f.write(f"{line_srt[0]}\n{line_srt[1]}\n{line_srt[2]}\n{translated_text}\n\n")
            with open(f"{ROOT_DIR}/output/t_sin_{_t}.srt", "a", encoding="utf-8")as f:f.write(f"{line_srt[0]}\n{line_srt[1]}\n{translated_text}\n\n")

        except IndexError as e:
            print(str(e))
            # Index out of range: reached the end of the SRT file
            print(f"翻译完毕")
            break
        except Exception as e:
            print(str(e))

    with open(f"{ROOT_DIR}/output/two.srt", "r", encoding="utf-8") as f:
        content = f.read()

    with open(f"{ROOT_DIR}/output/t_sin_{_t}.srt", "r", encoding="utf-8") as f:
        content_2 = f.read()

    return content, content_2, f"{ROOT_DIR}/output/t_sin_{_t}.srt"

# Translate subtitles with qwen2 via ollama (e.g. English -> Chinese)
def make_tran_qwen2(model_name, srt_path, lang):

    with open(srt_path, 'r', encoding="utf-8") as file:
        gweight_data = file.read()

    result = gweight_data.split("\n\n")

    if os.path.exists(f"{ROOT_DIR}/output/two.srt"):
        os.remove(f"{ROOT_DIR}/output/two.srt")

    if os.path.exists(f"{ROOT_DIR}/output/two_single.srt"):
        os.remove(f"{ROOT_DIR}/output/two_single.srt")

    for res in result:

        line_srt = res.split("\n")
        try:

            if lang == "zh":
                lang = "中文"
            elif lang == "en":
                lang = "英文"
            elif lang == "ja":
                lang = "日文"
            elif lang == "ko":
                lang = "韩文"

            text = line_srt[2]

            content = f'"{text}" 翻译为{lang},只给我文本的翻译,别添加其他的内容,因为我要做字幕,谢谢'

            response = ollama.chat(model=model_name, messages=[
                {
                    'role': 'user',
                    'content': content
                }])
            translated_text = response['message']['content']
            print(translated_text)

        except IndexError as e:
            # Index out of range: reached the end of the SRT file
            print(f"翻译完毕")
            break
        except Exception as e:
            print(str(e))

        with open(f"{ROOT_DIR}/output/two.srt", "a", encoding="utf-8")as f:f.write(f"{line_srt[0]}\n{line_srt[1]}\n{line_srt[2]}\n{translated_text}\n\n")
        with open(f"{ROOT_DIR}/output/two_single.srt", "a", encoding="utf-8")as f:f.write(f"{line_srt[0]}\n{line_srt[1]}\n{translated_text}\n\n")

    with open(f"{ROOT_DIR}/output/two.srt", "r", encoding="utf-8") as f:
        content = f.read()

    with open(f"{ROOT_DIR}/output/two_single.srt", "r", encoding="utf-8") as f:
        content_2 = f.read()

    return content, content_2

# # Translate subtitles (ModelScope CSANMT, currently disabled)
# def make_tran_ali():

#     pipeline_ins = pipeline(task=Tasks.translation, model=model_dir_ins)

#     with open("./video.srt", 'r',encoding="utf-8") as file:
#         gweight_data = file.read()

#     result = gweight_data.split("\n\n")

#     if os.path.exists("./two.srt"):
#         os.remove("./two.srt")

#     for res in result:

#         line_srt = res.split("\n")
#         try:
#             outputs = pipeline_ins(input=line_srt[2])
#             print(outputs['translation'])
#         except IndexError as e:
#             # Index out of range: reached the end of the SRT file
#             print(f"翻译完毕")
#             break
#         except Exception as e:
#             print(str(e))

#         with open("./two.srt","a",encoding="utf-8")as f:f.write(f"{line_srt[0]}\n{line_srt[1]}\n{line_srt[2]}\n{outputs['translation']}\n\n")

#     return "翻译完毕"


def convert_seconds_to_hms(seconds):
    hours, remainder = divmod(seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    milliseconds = math.floor((seconds % 1) * 1000)
    output = f"{int(hours):02}:{int(minutes):02}:{int(seconds):02},{milliseconds:03}"
    return output


emo_dict = {
    "<|HAPPY|>": "😊",
    "<|SAD|>": "😔",
    "<|ANGRY|>": "😡",
    "<|NEUTRAL|>": "",
    "<|FEARFUL|>": "😰",
    "<|DISGUSTED|>": "🤢",
    "<|SURPRISED|>": "😮",
}

event_dict = {
    "<|BGM|>": "🎼",
    "<|Speech|>": "",
    "<|Applause|>": "👏",
    "<|Laughter|>": "😀",
    "<|Cry|>": "😭",
    "<|Sneeze|>": "🤧",
    "<|Breath|>": "",
    "<|Cough|>": "🤧",
}

emoji_dict = {
    "<|nospeech|><|Event_UNK|>": "",
    "<|zh|>": "",
    "<|en|>": "",
    "<|yue|>": "",
    "<|ja|>": "",
    "<|ko|>": "",
    "<|nospeech|>": "",
    "<|HAPPY|>": "",
    "<|SAD|>": "",
    "<|ANGRY|>": "",
    "<|NEUTRAL|>": "",
    "<|BGM|>": "",
    "<|Speech|>": "",
    "<|Applause|>": "",
    "<|Laughter|>": "",
    "<|FEARFUL|>": "",
    "<|DISGUSTED|>": "",
    "<|SURPRISED|>": "",
    "<|Cry|>": "",
    "<|EMO_UNKNOWN|>": "",
    "<|Sneeze|>": "",
    "<|Breath|>": "",
    "<|Cough|>": "",
    "<|Sing|>": "",
    "<|Speech_Noise|>": "",
    "<|withitn|>": "",
    "<|woitn|>": "",
    "<|GBG|>": "",
    "<|Event_UNK|>": "",
}

lang_dict = {
    "<|zh|>": "<|lang|>",
    "<|en|>": "<|lang|>",
    "<|yue|>": "<|lang|>",
    "<|ja|>": "<|lang|>",
    "<|ko|>": "<|lang|>",
    "<|nospeech|>": "<|lang|>",
}

emo_set = {"😊", "😔", "😡", "😰", "🤢", "😮"}
event_set = {"🎼", "👏", "😀", "😭", "🤧", "😷",}

lang2token = {
    'zh': "ZH|",
    'ja': "JP|",
    "en": "EN|",
    "ko": "KO|",
    "yue": "YUE|",
}

def format_str(s):
    for sptk in emoji_dict:
        s = s.replace(sptk, emoji_dict[sptk])
    return s


def format_str_v2(s):
    sptk_dict = {}
    for sptk in emoji_dict:
        sptk_dict[sptk] = s.count(sptk)
        s = s.replace(sptk, "")
    emo = "<|NEUTRAL|>"
    for e in emo_dict:
        if sptk_dict[e] > sptk_dict[emo]:
            emo = e
    for e in event_dict:
        if sptk_dict[e] > 0:
            s = event_dict[e] + s
    s = s + emo_dict[emo]

    for emoji in emo_set.union(event_set):
        s = s.replace(" " + emoji, emoji)
        s = s.replace(emoji + " ", emoji)
    return s.strip()

def format_str_v3(s):
    def get_emo(s):
        return s[-1] if s[-1] in emo_set else None
    def get_event(s):
        return s[0] if s[0] in event_set else None

    s = s.replace("<|nospeech|><|Event_UNK|>", "❓")
    for lang in lang_dict:
        s = s.replace(lang, "<|lang|>")
    s_list = [format_str_v2(s_i).strip(" ") for s_i in s.split("<|lang|>")]
    new_s = " " + s_list[0]
    cur_ent_event = get_event(new_s)
    for i in range(1, len(s_list)):
        if len(s_list[i]) == 0:
            continue
        if get_event(s_list[i]) == cur_ent_event and get_event(s_list[i]) != None:
            s_list[i] = s_list[i][1:]
        #else:
        cur_ent_event = get_event(s_list[i])
        if get_emo(s_list[i]) != None and get_emo(s_list[i]) == get_emo(new_s):
            new_s = new_s[:-1]
        new_s += s_list[i].strip().lstrip()
    new_s = new_s.replace("The.", " ")
    return new_s.strip()

def ms_to_srt_time(ms):
    N = int(ms)
    hours, remainder = divmod(N, 3600000)
    minutes, remainder = divmod(remainder, 60000)
    seconds, milliseconds = divmod(remainder, 1000)
    timesrt = f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"
    # print(timesrt)
    return timesrt

def time_to_srt(time_in_seconds):
    """
    Convert a number of seconds into SRT timestamp format.

    Args:
        time_in_seconds: time in seconds.

    Returns:
        An SRT timestamp string.
    """
    milliseconds = int(time_in_seconds * 1000)
    hours = milliseconds // 3600000
    minutes = (milliseconds % 3600000) // 60000
    seconds = (milliseconds % 60000) // 1000
    milliseconds = milliseconds % 1000
    return f"{hours:02}:{minutes:02}:{seconds:02},{milliseconds:03}"

# Build the subtitle file with the Ali SenseVoice model
def make_srt_sv(file_path):

    model_dir = "iic/SenseVoiceSmall"
    input_file = (file_path)

    model = AutoModel(model=model_dir,
                      vad_model="fsmn-vad",
                      vad_kwargs={"max_single_segment_time": 30000},
                      trust_remote_code=True, device="cuda:0")

    res = model.generate(
        input=input_file,
        cache={},
        language="auto",  # "zn", "en", "yue", "ja", "ko", "nospeech"
        use_itn=False,
        batch_size_s=0,
    )

    print(res)
    text = res[0]["text"]
    # text = format_str_v3(text)
    text = rich_transcription_postprocess(text)

    print(text)

    return text

    # for filename in os.listdir("./wavs"):
    #     if filename.endswith(".wav"):
    #         filepath = os.path.join("./wavs/", filename)
    #         try:
    #             if os.path.isfile(filepath):
    #                 os.remove(filepath)
    #                 print(f"已删除文件: {filepath}")
    #         except Exception as e:
    #             print(f"删除文件时出错: {filepath} - {e}")

    # # Step 1: slice the audio first

    # audio, sr = librosa.load(file_path, sr=None, mono=False)

    # # Create the Slicer object
    # slicer = Slicer(
    #     sr=sr,
    #     threshold=-40,
    #     min_length=1500,
    #     min_interval=300,
    #     hop_size=1,
    #     max_sil_kept=150000
    # )

    # # Slice the audio
    # chunks = slicer.slice(audio)
    # for i, chunk in enumerate(chunks):
    #     if len(chunk.shape) > 1:
    #         chunk = chunk.T  # Swap axes if the audio is stereo.
    #     soundfile.write(f'./wavs/chunk_{i}.wav', chunk, sr)

    # srtlines = []
    # audio_samples = 0
    # audio_opt = []
    # for filename in os.listdir("./wavs"):
    #     if filename.endswith(".wav"):
    #         filepath = os.path.join("./wavs/", filename)
    #         print(filepath)

    #         model_dir = "iic/SenseVoiceSmall"
    #         input_file = (filepath)

    #         model = AutoModel(model=model_dir,
    #                           vad_model="fsmn-vad",
    #                           vad_kwargs={"max_single_segment_time": 30000},
    #                           trust_remote_code=True, device="cuda:0")

    #         res = model.generate(
    #             input=input_file,
    #             cache={},
    #             language="auto",  # "zn", "en", "yue", "ja", "ko", "nospeech"
    #             use_itn=False,
    #             batch_size_s=0,
    #         )

    #         # print(res)
    #         text = res[0]["text"]
    #         # text = format_str_v3(text)
    #         text = rich_transcription_postprocess(text)

    #         print(text)

    #         audio, sampling_rate = soundfile.read(filepath)

    #         audio_opt.append(audio)

    #         srtline_begin = ms_to_srt_time(audio_samples*1000.0 / sampling_rate)
    #         audio_samples += audio.size
    #         srtline_end = ms_to_srt_time(audio_samples*1000.0 / sampling_rate)

    #         srtlines.append(f"{len(audio_opt)}\n")
    #         srtlines.append(srtline_begin+' --> '+srtline_end+"\n")

    #         srtlines.append(text+"\n\n")

    # exit(-1)

    with open('./video.srt', 'w', encoding='utf-8') as f:
        f.writelines(srtlines)

    with open("./video.srt", "r", encoding="utf-8") as f:
        content = f.read()

    return content

# Build the subtitle file with faster-whisper
def make_srt(file_path, model_name="small"):

    # if device == "cuda":
    #     model = WhisperModel(model_name, device="cuda", compute_type="float16",download_root="./model_from_whisper",local_files_only=False)
    # else:
    #     model = WhisperModel(model_name, device="cpu", compute_type="int8",download_root="./model_from_whisper",local_files_only=False)
    # or run on GPU with INT8
    # model = WhisperModel(model_size, device="cuda", compute_type="int8_float16")

    if model_name != "faster-whisper-large-v3-turbo-ct2":

        if device == "cuda":
            try:
                model = WhisperModel(model_name, device="cuda", compute_type="float16", download_root="./model_from_whisper", local_files_only=False)
            except Exception as e:
                model = WhisperModel(model_name, device="cuda", compute_type="int8_float16", download_root="./model_from_whisper", local_files_only=False)
        else:
            model = WhisperModel(model_name, device="cpu", compute_type="int8", download_root="./model_from_whisper", local_files_only=False)
    else:

        model_name = f"{ROOT_DIR}/faster-whisper-large-v3-turbo-ct2"
        print(model_name)

        if device == "cuda":
            try:
                model = WhisperModel(model_name, device="cuda", compute_type="float16")
            except Exception as e:
                model = WhisperModel(model_name, device="cuda", compute_type="int8_float16")
        else:
            model = WhisperModel(model_name, device="cpu", compute_type="int8")

    segments, info = model.transcribe(file_path, beam_size=5, vad_filter=True, vad_parameters=dict(min_silence_duration_ms=500))

    print("Detected language '%s' with probability %f" % (info.language, info.language_probability))
    count = 0
    with open(f'{ROOT_DIR}/output/video.srt', 'w', encoding="utf-8") as f:  # Open file for writing
        for segment in segments:
            count += 1
            duration = f"{convert_seconds_to_hms(segment.start)} --> {convert_seconds_to_hms(segment.end)}\n"
            text = f"{segment.text.lstrip()}\n\n"

            f.write(f"{count}\n{duration}{text}")  # Write formatted string to the file
            print(f"{duration}{text}", end='')

    with open(f"{ROOT_DIR}/output/video.srt", "r", encoding="utf-8") as f:
        content = f.read()

    return content


# Extract the vocal track
def movie2audio(video_path):

    # Read the video file
    video = VideoFileClip(video_path)

    # Extract the audio from the video
    audio = video.audio

    # Save the audio as WAV
    audio.write_audiofile(f"{ROOT_DIR}/audio.wav")

    ans = pipeline_ali(
        Tasks.acoustic_noise_suppression,
        model=model_dir_cirm)

    ans(f'{ROOT_DIR}/audio.wav', output_path=f'{ROOT_DIR}/output.wav')

    return f"{ROOT_DIR}/output.wav"
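A minimal sketch of how the helpers above chain together for the English-to-Chinese flow (my own illustration, not part of the repo; assumes `input.mp4` exists, the local `model_from_hg/en-zh` weights are in place, and the `output/` directory has been created):

```python
from utils import movie2audio, make_srt, make_tran, merge_sub

vocal_wav = movie2audio("input.mp4")     # extract the audio track and denoise it with FRCRN
make_srt(vocal_wav, model_name="small")  # transcribe -> output/video.srt
make_tran("output/video.srt")            # translate -> bilingual ./two.srt
merge_sub("input.mp4", "two.srt")        # burn subtitles -> test_srt.mp4
```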
生成英文配音.bat
ADDED
@@ -0,0 +1,17 @@
SET PYTHON_PATH=%cd%\venv\
rem overriding default python env vars in order not to interfere with any system python installation
SET PYTHONHOME=
SET PYTHONPATH=
SET PYTHONEXECUTABLE=%PYTHON_PATH%\python.exe
SET PYTHONWEXECUTABLE=%PYTHON_PATH%pythonw.exe
SET PYTHON_EXECUTABLE=%PYTHON_PATH%\python.exe
SET PYTHONW_EXECUTABLE=%PYTHON_PATH%pythonw.exe
SET PYTHON_BIN_PATH=%PYTHON_EXECUTABLE%
SET PYTHON_LIB_PATH=%PYTHON_PATH%\Lib\site-packages
SET FFMPEG_PATH=%cd%\venv\ffmpeg\bin
SET PATH=%PYTHON_PATH%;%PYTHON_PATH%\Scripts;%FFMPEG_PATH%;%PATH%
@REM set HF_ENDPOINT=https://hf-mirror.com
@REM set HF_HOME=%CD%\hf_download
@REM set PYTHONPATH=third_party/AcademiCodec;third_party/Matcha-TTS
"%PYTHON_EXECUTABLE%" gen_english.py
pause
运行.bat
ADDED
@@ -0,0 +1,17 @@
SET PYTHON_PATH=%cd%\venv\
rem overriding default python env vars in order not to interfere with any system python installation
SET PYTHONHOME=
SET PYTHONPATH=
SET PYTHONEXECUTABLE=%PYTHON_PATH%\python.exe
SET PYTHONWEXECUTABLE=%PYTHON_PATH%pythonw.exe
SET PYTHON_EXECUTABLE=%PYTHON_PATH%\python.exe
SET PYTHONW_EXECUTABLE=%PYTHON_PATH%pythonw.exe
SET PYTHON_BIN_PATH=%PYTHON_EXECUTABLE%
SET PYTHON_LIB_PATH=%PYTHON_PATH%\Lib\site-packages
SET FFMPEG_PATH=%cd%\venv\ffmpeg\bin
SET PATH=%PYTHON_PATH%;%PYTHON_PATH%\Scripts;%FFMPEG_PATH%;%PATH%
@REM set HF_ENDPOINT=https://hf-mirror.com
@REM set HF_HOME=%CD%\hf_download
@REM set PYTHONPATH=third_party/AcademiCodec;third_party/Matcha-TTS
"%PYTHON_EXECUTABLE%" app.py
pause