import gradio as gr
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

from videoclipper import VideoClipper

if __name__ == "__main__":
    # Paraformer-large ASR pipeline with auxiliary VAD and punctuation models,
    # providing recognition, timestamp prediction and punctuation restoration.
    inference_pipeline = pipeline(
        task=Tasks.auto_speech_recognition,
        model='damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch',
        vad_model='damo/speech_fsmn_vad_zh-cn-16k-common-pytorch',
        punc_model='damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch',
    )
    audio_clipper = VideoClipper(inference_pipeline)

    # Thin wrappers so the Gradio callbacks match the VideoClipper API.
    def audio_recog(audio_input):
        return audio_clipper.recog(audio_input)

    def audio_clip(dest_text, start_ost, end_ost, state):
        return audio_clipper.clip(dest_text, start_ost, end_ost, state)

    def video_recog(video_input):
        return audio_clipper.video_recog(video_input)

    def video_clip(dest_text, start_ost, end_ost, state):
        return audio_clipper.video_clip(dest_text, start_ost, end_ost, state)

    def video_clip_addsub(dest_text, start_ost, end_ost, state, font_size, font_color):
        return audio_clipper.video_clip(dest_text, start_ost, end_ost, state,
                                        font_size, font_color, add_sub=True)

    top_md_1 = ("""
    A video clipping tool built on Paraformer-long's VAD, ASR, timestamp prediction,
    and punctuation restoration abilities. Clip a video in three steps:
    * Step 1: Upload a video file (or try the examples below), then click the **Recognize** button
    * Step 2: Copy the text segments you need into 'Text to Clip' and adjust the subtitle settings if needed
    * Step 3: Click the **Clip** button or the **Clip and Generate Subtitles** button
    """)

    top_md_2 = ("""
    Videos should be under 40 MB. For larger videos, extract the audio track and use
    'Audio Clipping', or **set up your own Gradio service from the source code (recommended)**:
    FunASR_APP: 🌟 Support us:
    """)

    top_md_3 = ("""
    You can learn more about FunASR from its source code and paper:
    FunASR: FunASR Paper: 🌟 Star FunASR:
    """)
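    # Illustrative only: a rough sketch of driving the clipper from Python
    # without the UI. The tuple shapes are assumptions read off the Gradio
    # bindings below (recog -> (text, srt, state); clip -> (audio, message,
    # srt_clip)); VideoClipper's actual return values may differ, so the
    # calls are left commented out.
    #
    # text, srt, state = audio_clipper.recog('examples/鲁肃采访片段1.wav')
    # clipped_audio, log, srt_clip = audio_clipper.clip('要裁剪的文本', 0, 100, state)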
""") # gradio interface with gr.Blocks() as demo: #gr.Image("./examples/guide.png", show_label=False) gr.Markdown(top_md_1) gr.Markdown(top_md_2) gr.Markdown(top_md_3) video_state = gr.State() audio_state = gr.State() with gr.Tab("🎥✂️视频裁剪 Video Clipping"): with gr.Row(): with gr.Column(): video_input = gr.Video(label="🎥视频输入 Video Input") gr.Examples(['examples/2022云栖大会_片段2.mp4', 'examples/2022云栖大会_片段.mp4', 'examples/为什么要多读书?这是我听过最好的答案-片段.mp4', 'examples/使用chatgpt_片段.mp4'], [video_input]) recog_button2 = gr.Button("👂识别 Recognize") video_text_output = gr.Textbox(label="✏️识别结果 Recognition Result") video_srt_output = gr.Textbox(label="📖SRT字幕内容 RST Subtitles") with gr.Column(): video_text_input = gr.Textbox(label="✏️待裁剪文本 Text to Clip (多段文本使用'#'连接)") with gr.Row(): video_start_ost = gr.Slider(minimum=-500, maximum=1000, value=0, step=50, label="⏪开始位置偏移 Start Offset (ms)") video_end_ost = gr.Slider(minimum=-500, maximum=1000, value=100, step=50, label="⏩结束位置偏移 End Offset (ms)") with gr.Row(): font_size = gr.Slider(minimum=10, maximum=100, value=32, step=2, label="🔠字幕字体大小 Subtitle Font Size") font_color = gr.Radio(["black", "white", "green", "red"], label="🌈字幕颜色 Subtitle Color", value='white') # font = gr.Radio(["黑体", "Alibaba Sans"], label="字体 Font") with gr.Row(): clip_button2 = gr.Button("✂️裁剪\nClip") clip_button3 = gr.Button("✂️裁剪并添加字幕\nClip and Generate Subtitles") video_output = gr.Video(label="🎥裁剪结果 Audio Clipped") video_mess_output = gr.Textbox(label="ℹ️裁剪信息 Clipping Log") video_srt_clip_output = gr.Textbox(label="📖裁剪部分SRT字幕内容 Clipped RST Subtitles") with gr.Tab("🔊✂️音频裁剪 Audio Clipping"): with gr.Row(): with gr.Column(): audio_input = gr.Audio(label="🔊音频输入 Audio Input") gr.Examples(['examples/鲁肃采访片段1.wav'], [audio_input]) recog_button1 = gr.Button("👂识别 Recognize") audio_text_output = gr.Textbox(label="✏️识别结果 Recognition Result") audio_srt_output = gr.Textbox(label="📖SRT字幕内容 RST Subtitles") with gr.Column(): audio_text_input = gr.Textbox(label="✏️待裁剪文本 Text to Clip (多段文本使用'#'连接)") with gr.Row(): audio_start_ost = gr.Slider(minimum=-500, maximum=1000, value=0, step=50, label="⏪开始位置偏移 Start Offset (ms)") audio_end_ost = gr.Slider(minimum=-500, maximum=1000, value=100, step=50, label="⏩结束位置偏移 End Offset (ms)") with gr.Row(): clip_button1 = gr.Button("✂️裁剪 Clip") audio_output = gr.Audio(label="🔊裁剪结果 Audio Clipped") audio_mess_output = gr.Textbox(label="ℹ️裁剪信息 Clipping Log") audio_srt_clip_output = gr.Textbox(label="📖裁剪部分SRT字幕内容 Clipped RST Subtitles") recog_button1.click(audio_recog, inputs=audio_input, outputs=[audio_text_output, audio_srt_output, audio_state]) clip_button1.click(audio_clip, inputs=[audio_text_input, audio_start_ost, audio_end_ost, audio_state], outputs=[audio_output, audio_mess_output, audio_srt_clip_output]) recog_button2.click(video_recog, inputs=video_input, outputs=[video_text_output, video_srt_output, video_state]) clip_button2.click(video_clip, inputs=[video_text_input, video_start_ost, video_end_ost, video_state], outputs=[video_output, video_mess_output, video_srt_clip_output]) clip_button3.click(video_clip_addsub, inputs=[video_text_input, video_start_ost, video_end_ost, video_state, font_size, font_color], outputs=[video_output, video_mess_output, video_srt_clip_output]) # start gradio service in local demo.queue(concurrency_count=3).launch()