import gradio as gr
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from videoclipper import VideoClipper
if __name__ == "__main__":
    inference_pipeline = pipeline(
        task=Tasks.auto_speech_recognition,
        model='damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch',
        vad_model='damo/speech_fsmn_vad_zh-cn-16k-common-pytorch',
        punc_model='damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch',
    )
    audio_clipper = VideoClipper(inference_pipeline)
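
    # The composed ModelScope pipeline chains VAD -> ASR -> punctuation
    # restoration in a single call. A minimal standalone usage sketch
    # (assuming a local 16 kHz WAV file 'example.wav'; exact result keys
    # may vary across modelscope versions):
    #   rec_result = inference_pipeline(audio_in='example.wav')
    #   print(rec_result['text'])  # punctuated transcript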

    def audio_recog(audio_input):
        return audio_clipper.recog(audio_input)

    def audio_clip(dest_text, start_ost, end_ost, state):
        return audio_clipper.clip(dest_text, start_ost, end_ost, state)

    def video_recog(video_input):
        return audio_clipper.video_recog(video_input)

    def video_clip(dest_text, start_ost, end_ost, state):
        return audio_clipper.video_clip(dest_text, start_ost, end_ost, state)

    def video_clip_addsub(dest_text, start_ost, end_ost, state, font_size, font_color):
        return audio_clipper.video_clip(dest_text, start_ost, end_ost, state, font_size, font_color, add_sub=True)
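
    # Note: judging from the .click() wiring at the bottom of this file, the
    # recog wrappers are expected to return (text, srt, state) and the clip
    # wrappers (clipped_media, message, srt_clip).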
top_md_1 = ("""
A video clip tool based on Paraformer-long's VAD, ASR, timestamp prediction, punctuation restoration abilities.
Get the video clip simply following steps:
* Step1: Upload video file (or try examples below), click **<font color="#f7802b">Recognize</font>** button
* Step2: Copy text segments you need to 'Text to Clip', set the subtitle settings (if you need)
* Step3: Click **<font color="#f7802b">Clip</font>** button or **<font color="#f7802b">Clip and Generate Subtitles</font>** button
""")
top_md_2 = ("""
The video had better to have size under 40Mb,
For video in large size, you can split the audio from it and use 'Audio Clip',
or **<font color="#1785c4">establish your own gradio service with the source code (recommanded)</font>** :
<div align="center">
<div style="display:flex; gap: 0.25rem;" align="center">
FunASR_APP: <a href='https://github.com/alibaba/funasr-app'><img src='https://img.shields.io/badge/Github-Code-blue'></a>
🌟Support Us: <a href='https://github.com/alibaba/funasr-app/stargazers'><img src='https://img.shields.io/github/stars/alibaba/funasr-app.svg?style=social'></a>
</div>
</div>
""")
top_md_3 = ("""You may understand FunASR futher with source code and paper:
<div align="center">
<div style="display:flex; gap: 0.25rem;" align="center">
FunASR: <a href='https://github.com/alibaba-damo-academy/FunASR'><img src='https://img.shields.io/badge/Github-Code-blue'></a>
FunASR Paper: <a href="https://arxiv.org/abs/2305.11013"><img src="https://img.shields.io/badge/Arxiv-2305.11013-orange"></a>
🌟Star FunASR: <a href='https://github.com/alibaba-damo-academy/FunASR/stargazers'><img src='https://img.shields.io/github/stars/alibaba-damo-academy/FunASR.svg?style=social'></a>
</div>
</div>
""")

    # gradio interface
    with gr.Blocks() as demo:
        # gr.Image("./examples/guide.png", show_label=False)
        gr.Markdown(top_md_1)
        gr.Markdown(top_md_2)
        gr.Markdown(top_md_3)
        video_state = gr.State()
        audio_state = gr.State()
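        # The gr.State components carry the recognition result (including
        # timestamps) from the Recognize step to the Clip step, so clipping
        # does not re-run ASR; see the .click() wiring below.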
with gr.Tab("🎥✂️视频裁剪 Video Clipping"):
with gr.Row():
with gr.Column():
video_input = gr.Video(label="🎥视频输入 Video Input")
gr.Examples(['examples/2022云栖大会_片段2.mp4',
'examples/2022云栖大会_片段.mp4',
'examples/为什么要多读书?这是我听过最好的答案-片段.mp4',
'examples/使用chatgpt_片段.mp4'],
[video_input])
recog_button2 = gr.Button("👂识别 Recognize")
video_text_output = gr.Textbox(label="✏️识别结果 Recognition Result")
video_srt_output = gr.Textbox(label="📖SRT字幕内容 RST Subtitles")
                with gr.Column():
                    video_text_input = gr.Textbox(label="✏️待裁剪文本 Text to Clip (multiple segments joined with '#')")
                    with gr.Row():
                        video_start_ost = gr.Slider(minimum=-500, maximum=1000, value=0, step=50, label="⏪开始位置偏移 Start Offset (ms)")
                        video_end_ost = gr.Slider(minimum=-500, maximum=1000, value=100, step=50, label="⏩结束位置偏移 End Offset (ms)")
                    with gr.Row():
                        font_size = gr.Slider(minimum=10, maximum=100, value=32, step=2, label="🔠字幕字体大小 Subtitle Font Size")
                        font_color = gr.Radio(["black", "white", "green", "red"], label="🌈字幕颜色 Subtitle Color", value='white')
                        # font = gr.Radio(["黑体", "Alibaba Sans"], label="字体 Font")
                    with gr.Row():
                        clip_button2 = gr.Button("✂️裁剪\nClip")
                        clip_button3 = gr.Button("✂️裁剪并添加字幕\nClip and Generate Subtitles")
                    video_output = gr.Video(label="🎥裁剪结果 Video Clipped")
                    video_mess_output = gr.Textbox(label="ℹ️裁剪信息 Clipping Log")
                    video_srt_clip_output = gr.Textbox(label="📖裁剪部分SRT字幕内容 Clipped SRT Subtitles")
with gr.Tab("🔊✂️音频裁剪 Audio Clipping"):
with gr.Row():
with gr.Column():
audio_input = gr.Audio(label="🔊音频输入 Audio Input")
gr.Examples(['examples/鲁肃采访片段1.wav'], [audio_input])
recog_button1 = gr.Button("👂识别 Recognize")
audio_text_output = gr.Textbox(label="✏️识别结果 Recognition Result")
audio_srt_output = gr.Textbox(label="📖SRT字幕内容 RST Subtitles")
with gr.Column():
audio_text_input = gr.Textbox(label="✏️待裁剪文本 Text to Clip (多段文本使用'#'连接)")
with gr.Row():
audio_start_ost = gr.Slider(minimum=-500, maximum=1000, value=0, step=50, label="⏪开始位置偏移 Start Offset (ms)")
audio_end_ost = gr.Slider(minimum=-500, maximum=1000, value=100, step=50, label="⏩结束位置偏移 End Offset (ms)")
with gr.Row():
clip_button1 = gr.Button("✂️裁剪 Clip")
audio_output = gr.Audio(label="🔊裁剪结果 Audio Clipped")
audio_mess_output = gr.Textbox(label="ℹ️裁剪信息 Clipping Log")
audio_srt_clip_output = gr.Textbox(label="📖裁剪部分SRT字幕内容 Clipped RST Subtitles")

        recog_button1.click(audio_recog,
                            inputs=audio_input,
                            outputs=[audio_text_output, audio_srt_output, audio_state])
        clip_button1.click(audio_clip,
                           inputs=[audio_text_input, audio_start_ost, audio_end_ost, audio_state],
                           outputs=[audio_output, audio_mess_output, audio_srt_clip_output])
        recog_button2.click(video_recog,
                            inputs=video_input,
                            outputs=[video_text_output, video_srt_output, video_state])
        clip_button2.click(video_clip,
                           inputs=[video_text_input, video_start_ost, video_end_ost, video_state],
                           outputs=[video_output, video_mess_output, video_srt_clip_output])
        clip_button3.click(video_clip_addsub,
                           inputs=[video_text_input, video_start_ost, video_end_ost, video_state, font_size, font_color],
                           outputs=[video_output, video_mess_output, video_srt_clip_output])

    # start the gradio service locally
    demo.queue(concurrency_count=3).launch()
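    # To expose the demo beyond localhost, Gradio's launch() also accepts
    # server_name and share (a deployment sketch, not part of the original app):
    #   demo.queue(concurrency_count=3).launch(server_name="0.0.0.0", share=True)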