import gradio as gr
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from videoclipper import VideoClipper


if __name__ == "__main__":
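    # Load the ModelScope inference pipeline: Paraformer-large ASR combined with an FSMN VAD
    # model and a CT-Transformer punctuation model, all for 16 kHz Mandarin speech.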
    inference_pipeline = pipeline(
        task=Tasks.auto_speech_recognition,
        model='damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch',
        vad_model='damo/speech_fsmn_vad_zh-cn-16k-common-pytorch',
        punc_model='damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch',
    )
    audio_clipper = VideoClipper(inference_pipeline)
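
    # Thin wrappers adapting VideoClipper methods to Gradio callback signatures: recognition
    # returns (text, SRT, state) and clipping consumes that state together with the offsets.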

    def audio_recog(audio_input):
        return audio_clipper.recog(audio_input)

    def audio_clip(dest_text, start_ost, end_ost, state):
        return audio_clipper.clip(dest_text, start_ost, end_ost, state)

    def video_recog(video_input):
        return audio_clipper.video_recog(video_input)

    def video_clip(dest_text, start_ost, end_ost, state):
        return audio_clipper.video_clip(dest_text, start_ost, end_ost, state)

    def video_clip_addsub(dest_text, start_ost, end_ost, state, font_size, font_color):
        return audio_clipper.video_clip(dest_text, start_ost, end_ost, state, font_size, font_color, add_sub=True)

    
    top_md_1 = ("""
    A video clipping tool based on Paraformer-long's VAD, ASR, timestamp prediction, and punctuation restoration capabilities.

    Get your video clip by following these steps:

    * Step 1: Upload a video file (or try the examples below), then click the **<font color="#f7802b">Recognize</font>** button
    * Step 2: Copy the text segments you need into 'Text to Clip' and adjust the subtitle settings if needed
    * Step 3: Click the **<font color="#f7802b">Clip</font>** button or the **<font color="#f7802b">Clip and Generate Subtitles</font>** button
    """)
    

    top_md_2 = ("""
    The video should preferably be under 40 MB.
    For larger videos, you can extract the audio track and use 'Audio Clipping' instead,
    or **<font color="#1785c4">set up your own Gradio service with the source code (recommended)</font>**:
    <div align="center">
    <div style="display:flex; gap: 0.25rem;" align="center">
    FunASR_APP: <a href='https://github.com/alibaba/funasr-app'><img src='https://img.shields.io/badge/Github-Code-blue'></a> 
    🌟Support Us: <a href='https://github.com/alibaba/funasr-app/stargazers'><img src='https://img.shields.io/github/stars/alibaba/funasr-app.svg?style=social'></a>
    </div>
    </div>
    """)

    top_md_3 = ("""You may understand FunASR futher with source code and paper:
    <div align="center">
    <div style="display:flex; gap: 0.25rem;" align="center">
        FunASR: <a href='https://github.com/alibaba-damo-academy/FunASR'><img src='https://img.shields.io/badge/Github-Code-blue'></a> 
        FunASR Paper: <a href="https://arxiv.org/abs/2305.11013"><img src="https://img.shields.io/badge/Arxiv-2305.11013-orange"></a> 
        🌟Star FunASR: <a href='https://github.com/alibaba-damo-academy/FunASR/stargazers'><img src='https://img.shields.io/github/stars/alibaba-damo-academy/FunASR.svg?style=social'></a>
    </div>
    </div>
    """)

    # gradio interface
    with gr.Blocks() as demo:
        #gr.Image("./examples/guide.png", show_label=False)
        gr.Markdown(top_md_1)
        gr.Markdown(top_md_2)
        gr.Markdown(top_md_3)
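        # Session state carries the recognition result (text and timestamps) from the
        # Recognize step to the Clip step within a user session.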
        video_state = gr.State()
        audio_state = gr.State()
        with gr.Tab("🎥✂️视频裁剪 Video Clipping"):
            with gr.Row():
                with gr.Column():
                    video_input = gr.Video(label="🎥视频输入 Video Input")
                    gr.Examples(['examples/2022云栖大会_片段2.mp4', 
                                 'examples/2022云栖大会_片段.mp4', 
                                 'examples/为什么要多读书?这是我听过最好的答案-片段.mp4', 
                                 'examples/使用chatgpt_片段.mp4'],
                                [video_input])
                    recog_button2 = gr.Button("👂识别 Recognize")
                    video_text_output = gr.Textbox(label="✏️识别结果 Recognition Result")
                    video_srt_output = gr.Textbox(label="📖SRT字幕内容 SRT Subtitles")
                with gr.Column():
                    video_text_input = gr.Textbox(label="✏️待裁剪文本 Text to Clip (多段文本使用'#'连接 Use '#' to join multiple segments)")
                    with gr.Row():
                        video_start_ost = gr.Slider(minimum=-500, maximum=1000, value=0, step=50, label="⏪开始位置偏移 Start Offset (ms)")
                        video_end_ost = gr.Slider(minimum=-500, maximum=1000, value=100, step=50, label="⏩结束位置偏移 End Offset (ms)")
                    with gr.Row():
                        font_size = gr.Slider(minimum=10, maximum=100, value=32, step=2, label="🔠字幕字体大小 Subtitle Font Size")
                        font_color = gr.Radio(["black", "white", "green", "red"], label="🌈字幕颜色 Subtitle Color", value='white')
                        # font = gr.Radio(["黑体", "Alibaba Sans"], label="字体 Font")
                    with gr.Row():
                        clip_button2 = gr.Button("✂️裁剪\nClip")
                        clip_button3 = gr.Button("✂️裁剪并添加字幕\nClip and Generate Subtitles")
                    video_output = gr.Video(label="🎥裁剪结果 Video Clipped")
                    video_mess_output = gr.Textbox(label="ℹ️裁剪信息 Clipping Log")
                    video_srt_clip_output = gr.Textbox(label="📖裁剪部分SRT字幕内容 Clipped SRT Subtitles")

        with gr.Tab("🔊✂️音频裁剪 Audio Clipping"):
            with gr.Row():
                with gr.Column():
                    audio_input = gr.Audio(label="🔊音频输入 Audio Input")
                    gr.Examples(['examples/鲁肃采访片段1.wav'], [audio_input])
                    recog_button1 = gr.Button("👂识别 Recognize")
                    audio_text_output = gr.Textbox(label="✏️识别结果 Recognition Result")
                    audio_srt_output = gr.Textbox(label="📖SRT字幕内容 SRT Subtitles")
                with gr.Column():
                    audio_text_input = gr.Textbox(label="✏️待裁剪文本 Text to Clip (多段文本使用'#'连接 Use '#' to join multiple segments)")
                    with gr.Row():
                        audio_start_ost = gr.Slider(minimum=-500, maximum=1000, value=0, step=50, label="⏪开始位置偏移 Start Offset (ms)")
                        audio_end_ost = gr.Slider(minimum=-500, maximum=1000, value=100, step=50, label="⏩结束位置偏移 End Offset (ms)")
                    with gr.Row():
                        clip_button1 = gr.Button("✂️裁剪 Clip")
                    audio_output = gr.Audio(label="🔊裁剪结果 Audio Clipped")
                    audio_mess_output = gr.Textbox(label="ℹ️裁剪信息 Clipping Log")
                    audio_srt_clip_output = gr.Textbox(label="📖裁剪部分SRT字幕内容 Clipped SRT Subtitles")
        
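        # Wire the buttons to the callbacks: each Recognize button fills the text/SRT boxes and
        # the session state; each Clip button reads the selected text, offsets, and state.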
        recog_button1.click(audio_recog, 
                            inputs=audio_input, 
                            outputs=[audio_text_output, audio_srt_output, audio_state])
        clip_button1.click(audio_clip, 
                           inputs=[audio_text_input, audio_start_ost, audio_end_ost, audio_state], 
                           outputs=[audio_output, audio_mess_output, audio_srt_clip_output])

        recog_button2.click(video_recog, 
                            inputs=video_input, 
                            outputs=[video_text_output, video_srt_output, video_state])
        clip_button2.click(video_clip, 
                           inputs=[video_text_input, video_start_ost, video_end_ost, video_state], 
                           outputs=[video_output, video_mess_output, video_srt_clip_output])
        clip_button3.click(video_clip_addsub, 
                           inputs=[video_text_input, video_start_ost, video_end_ost, video_state, font_size, font_color], 
                           outputs=[video_output, video_mess_output, video_srt_clip_output])
    
    # Start the Gradio service locally; queue(concurrency_count=3) allows up to 3 requests
    # to be processed concurrently.
    demo.queue(concurrency_count=3).launch()