01du committed on
Commit cdbb2b2 · 1 Parent(s): 23cc4a2

Add application file
README.md CHANGED
@@ -7,6 +7,9 @@ sdk: gradio
  sdk_version: 5.5.0
  app_file: app.py
  pinned: false
+ hf_oauth: true
+ hf_oauth_scopes:
+ - email
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
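
Enabling hf_oauth adds a Hugging Face login flow to the Space. For reference, a minimal sketch of how the resulting token surfaces in app code (this mirrors the pattern app.py below relies on; gradio injects the gr.OAuthToken argument automatically):

import gradio as gr
from huggingface_hub import whoami

def greet(oauth_token: gr.OAuthToken | None) -> str:
    # None until the visitor has logged in via gr.LoginButton.
    if oauth_token is None:
        return "Not logged in."
    return f"Hello, {whoami(token=oauth_token.token)['name']}!"

with gr.Blocks() as demo:
    gr.LoginButton()
    out = gr.Textbox()
    demo.load(greet, inputs=None, outputs=out)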
__pycache__/videoclipper.cpython-311.pyc ADDED
Binary file (24.8 kB)

app.py ADDED
@@ -0,0 +1,184 @@
+ import logging
+ import os
+ from videoclipper import VideoClipper
+ import gradio as gr
+ import requests
+ from huggingface_hub import whoami
+ import whisper
+ from llm.openai_api import openai_call
+ from llm.yi_moe_api import yi_moe
+ from utils.trans_utils import extract_timestamps
+
+ API_URL_TEMPLATE = "https://api-yidong.lingyiwanwu.com/v1/ops/api_key?user_email={user_email}&user_source=huggingface"
+ model = whisper.load_model("tiny")
+ audio_clipper = VideoClipper(model)
+
+ def get_user_email(oauth_token: gr.OAuthToken | None) -> str | None:
+     def call_api(user_email):
+         url = API_URL_TEMPLATE.format(user_email=user_email)
+         headers = {"Authorization": f'Basic {os.getenv("AUTH")}'}
+         response = requests.post(url, headers=headers)
+         return response.json()["data"]["display_api_key"]
+
+     if oauth_token is None:
+         return None
+
+     user_info = whoami(token=oauth_token.token)
+     email = user_info.get("email")
+     return call_api(email)
+
+ def audio_recog(audio_input, output_dir):
+     return audio_clipper.recog(audio_input, None, output_dir=output_dir)
+
+ def video_recog(video_input, output_dir, ASR):
+     return audio_clipper.video_recog(video_input, output_dir=output_dir, ASR=ASR)
+
+ def video_clip(dest_text, video_spk_input, start_ost, end_ost, state, output_dir):
+     return audio_clipper.video_clip(
+         dest_text, start_ost, end_ost, state, dest_spk=video_spk_input, output_dir=output_dir
+     )
+
+ def mix_recog(video_input, audio_input, output_dir, ASR="whisper"):
+     '''
+     Recognize the video or audio input; returns the recognized text,
+     the SRT subtitles and the recognition state.
+     '''
+     output_dir = output_dir.strip()
+     if not len(output_dir):
+         output_dir = None
+     else:
+         output_dir = os.path.abspath(output_dir)
+     audio_state, video_state = None, None
+     if video_input is not None:
+         res_text, res_srt, video_state = video_recog(
+             video_input, output_dir=output_dir, ASR=ASR)
+         return res_text, res_srt, video_state, None
+
+     if audio_input is not None:
+         res_text, res_srt, audio_state = audio_recog(
+             audio_input, output_dir=output_dir)
+         return res_text, res_srt, None, audio_state
+
+ def llm_inference(system_content, user_content, srt_text, model, apikey):
+     SUPPORT_LLM_PREFIX = ['gpt', 'moonshot', '22A']
+     if model.startswith('gpt') or model.startswith('moonshot'):
+         return openai_call(apikey, model, system_content=system_content, user_content=user_content+'\n'+srt_text)
+     elif model.startswith('22A'):
+         return yi_moe(apikey, model, user_content+'\n'+srt_text, system_content)
+     else:
+         logging.error("LLM name error, only {} are supported as LLM name prefix."
+                       .format(SUPPORT_LLM_PREFIX))
+
+ def AI_clip(LLM_res, dest_text, video_spk_input, start_ost, end_ost, video_state, audio_state, output_dir):
+     timestamp_list = extract_timestamps(LLM_res)
+     output_dir = output_dir.strip()
+     if not len(output_dir):
+         output_dir = None
+     else:
+         output_dir = os.path.abspath(output_dir)
+     if video_state is not None:
+         clip_video_file, message, clip_srt = audio_clipper.video_clip(
+             dest_text, start_ost, end_ost, video_state,
+             dest_spk=video_spk_input, output_dir=output_dir, timestamp_list=timestamp_list, add_sub=False)
+         return clip_video_file, None, message, clip_srt
+     if audio_state is not None:
+         (sr, res_audio), message, clip_srt = audio_clipper.clip(
+             dest_text, start_ost, end_ost, audio_state,
+             dest_spk=video_spk_input, output_dir=output_dir, timestamp_list=timestamp_list, add_sub=False)
+         return None, (sr, res_audio), message, clip_srt
+
+ with gr.Blocks() as clip_service:
+     video_state, audio_state = gr.State(), gr.State()
+     with gr.Row():
+         login_button = gr.LoginButton()
+         user_email_display = gr.Textbox(
+             label="To get your user key, click the Hugging Face login button. The full key is shown only on your first login, so save it; afterwards it will be hidden.",
+             interactive=True,
+         )
+     clip_service.load(get_user_email, inputs=None, outputs=user_email_display)
+     logging.info(f"The value of the current variable is: {user_email_display}")
+     video_input = gr.Video(label="视频输入 | Video Input")
+     audio_input = gr.Audio(label="音频输入 | Audio Input")
+     with gr.Column():
+         gr.Examples(['https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ClipVideo/%E4%B8%BA%E4%BB%80%E4%B9%88%E8%A6%81%E5%A4%9A%E8%AF%BB%E4%B9%A6%EF%BC%9F%E8%BF%99%E6%98%AF%E6%88%91%E5%90%AC%E8%BF%87%E6%9C%80%E5%A5%BD%E7%9A%84%E7%AD%94%E6%A1%88-%E7%89%87%E6%AE%B5.mp4',
+                      'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ClipVideo/2022%E4%BA%91%E6%A0%96%E5%A4%A7%E4%BC%9A_%E7%89%87%E6%AE%B52.mp4',
+                      'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ClipVideo/%E4%BD%BF%E7%94%A8chatgpt_%E7%89%87%E6%AE%B5.mp4'],
+                     [video_input],
+                     label='示例视频 | Demo Video')
+         gr.Examples(['https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ClipVideo/%E8%AE%BF%E8%B0%88.mp4'],
+                     [video_input],
+                     label='多说话人示例视频 | Multi-speaker Demo Video')
+         gr.Examples(['https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ClipVideo/%E9%B2%81%E8%82%83%E9%87%87%E8%AE%BF%E7%89%87%E6%AE%B51.wav'],
+                     [audio_input],
+                     label="示例音频 | Demo Audio", visible=False)
+     ASR_model = gr.Dropdown(
+         choices=["whisper"],
+         value="whisper",
+         label="ASR Model Name",
+         allow_custom_value=True)
+     recog_button = gr.Button("👂 识别 | ASR", variant="primary")
+     output_dir = gr.Textbox(label="📁 文件输出路径 | File Output Dir (可以为空,Linux, mac系统可以稳定使用)")
+     video_text_output = gr.Textbox(label="✏️ 识别结果 | Recognition Result")
+     video_srt_output = gr.Textbox(label="📖 SRT字幕内容 | SRT Subtitles")
+     prompt_head = gr.Textbox(label="Prompt System (按需更改,最好不要变动主体和要求)", value=("你是一个视频srt字幕分析剪辑器,输入视频的srt字幕和用户命令,请你结合用户指令选出符合要求的片段并输出。注意:要谨慎分析用户的问题,找出符合用户提问的srt字幕片段。"
+                              "尽可能将连续的片段并裁剪出来,将片段中在时间上连续的多个句子及它们的时间戳合并为一条,保证合并后的片段有着相同的主题"
+                              "注意确保文字与时间戳的正确匹配。你应该按照以下顺序进行处理:"
+                              "1.将srt字幕合并成数段主题的内容。2.将用户命令和查询进行匹配。"
+                              "输出需严格按照如下格式:1. [开始时间-结束时间] 文本,注意其中的连接符是“-”"))
+
+     prompt_head2 = gr.Textbox(label="Prompt User(请输入用户指令)")
+     with gr.Column():
+         with gr.Row():
+             llm_model = gr.Dropdown(
+                 choices=["gpt-4o", "22A"],
+                 value="22A",
+                 label="LLM Model Name",
+                 allow_custom_value=True)
+             apikey_input = gr.Textbox(label="APIKEY")
+         llm_button = gr.Button("LLM推理 | LLM Inference(首先进行识别,非g4f需配置对应apikey)", variant="primary")
+         llm_result = gr.Textbox(label="LLM Clipper Result")
+     llm_clip_button = gr.Button("🧠 LLM智能裁剪 | AI Clip", variant="primary")
+     video_text_input = gr.Textbox(label="✏️ 待裁剪文本 | Text to Clip (多段文本使用'#'连接)", value="这个不需要", visible=False)
+     video_spk_input = gr.Textbox(label="✏️ 待裁剪说话人 | Speaker to Clip (多个说话人使用'#'连接)", value="这个不需要", visible=False)
+     with gr.Row():
+         video_start_ost = gr.Slider(minimum=-500, maximum=1000, value=0, step=50, label="⏪ 开始位置偏移 | Start Offset (ms)", visible=False)
+         video_end_ost = gr.Slider(minimum=-500, maximum=1000, value=100, step=50, label="⏩ 结束位置偏移 | End Offset (ms)", visible=False)
+     video_output = gr.Video(label="裁剪结果 | Video Clipped")
+     audio_output = gr.Audio(label="裁剪结果 | Audio Clipped")
+     clip_message = gr.Textbox(label="⚠️ 裁剪信息 | Clipping Log")
+     srt_clipped = gr.Textbox(label="📖 裁剪部分SRT字幕内容 | Clipped SRT Subtitles")
+     recog_button.click(mix_recog,
+                        inputs=[video_input,
+                                audio_input,
+                                output_dir,
+                                ASR_model],
+                        outputs=[video_text_output, video_srt_output, video_state, audio_state])
+
+     llm_button.click(llm_inference,
+                      inputs=[prompt_head, prompt_head2, video_srt_output, llm_model, apikey_input],
+                      outputs=[llm_result])
+     llm_clip_button.click(AI_clip,
+                           inputs=[llm_result,
+                                   video_text_input,
+                                   video_spk_input,
+                                   video_start_ost,
+                                   video_end_ost,
+                                   video_state,
+                                   audio_state,
+                                   output_dir],
+                           outputs=[video_output, audio_output, clip_message, srt_clipped])
+
+
+ if __name__ == "__main__":
+     clip_service.queue(
+         max_size=10,
+         default_concurrency_limit=10,
+     )
+     clip_service.launch(ssr_mode=False)
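
The AI-clip path depends on the LLM echoing timestamps in the [HH:MM:SS,mmm-HH:MM:SS,mmm] form demanded by the system prompt; a small illustration of that contract (the LLM output line is made up):

from utils.trans_utils import extract_timestamps

llm_res = "1. [00:00:05,000-00:00:12,500] 关于读书的片段"  # hypothetical LLM output
print(extract_timestamps(llm_res))  # -> [[5000, 12500]]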
llm/__pycache__/demo_prompt.cpython-311.pyc ADDED
Binary file (6.17 kB)

llm/__pycache__/g4f_openai_api.cpython-310.pyc ADDED
Binary file (904 Bytes)

llm/__pycache__/g4f_openai_api.cpython-311.pyc ADDED
Binary file (1.49 kB)

llm/__pycache__/openai_api.cpython-310.pyc ADDED
Binary file (1.06 kB)

llm/__pycache__/openai_api.cpython-311.pyc ADDED
Binary file (1.83 kB)

llm/__pycache__/qwen_api.cpython-310.pyc ADDED
Binary file (813 Bytes)

llm/__pycache__/qwen_api.cpython-311.pyc ADDED
Binary file (1.24 kB)

llm/__pycache__/yi_moe_api.cpython-311.pyc ADDED
Binary file (2.13 kB)

llm/openai_api.py ADDED
@@ -0,0 +1,54 @@
+ import os
+ import logging
+ from openai import OpenAI
+
+
+ if __name__ == '__main__':
+     from llm.demo_prompt import demo_prompt
+     client = OpenAI(
+         # This is the default and can be omitted
+         api_key=os.environ.get("OPENAI_API_KEY"),
+     )
+
+     chat_completion = client.chat.completions.create(
+         messages=[
+             {
+                 "role": "user",
+                 "content": demo_prompt,
+             }
+         ],
+         model="gpt-3.5-turbo-0125",
+     )
+     print(chat_completion.choices[0].message.content)
+
+
+ def openai_call(apikey,
+                 model="gpt-3.5-turbo",
+                 user_content="如何做西红柿炖牛腩?",
+                 system_content=None):
+     client = OpenAI(
+         api_key=apikey,
+         base_url="https://api.lingyiwanwu.com/v1"
+     )
+
+     if system_content is not None and len(system_content.strip()):
+         messages = [
+             {'role': 'system', 'content': system_content},
+             {'role': 'user', 'content': user_content}
+         ]
+     else:
+         messages = [
+             {'role': 'user', 'content': user_content}
+         ]
+
+     chat_completion = client.chat.completions.create(
+         messages=messages,
+         model=model,
+     )
+     logging.info("OpenAI model inference done.")
+     return chat_completion.choices[0].message.content
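
For reference, a hedged usage sketch of openai_call (the key is a placeholder; the client is pointed at the Lingyiwanwu OpenAI-compatible endpoint configured above):

from llm.openai_api import openai_call

srt_text = "1\n00:00:00,000 --> 00:00:05,000\n为什么要多读书\n"
reply = openai_call("sk-placeholder",  # hypothetical key; app.py takes it from the APIKEY textbox
                    model="gpt-4o",
                    user_content="找出与读书相关的片段\n" + srt_text,
                    system_content="你是一个视频srt字幕分析剪辑器")
print(reply)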
llm/yi_moe_api.py ADDED
@@ -0,0 +1,52 @@
+ import requests
+ import logging
+ import time
+
+ logger = logging.getLogger(__name__)
+
+ def yi_moe(api_key,
+            model="moyi-chat-v03-sglang",
+            user_content="如何下载github上的项目?",
+            system_content=None):
+     url = "http://10.2.5.29:30869/v1/chat/completions"
+     headers = {
+         # NOTE: a fixed Basic credential is concatenated with the caller's api_key.
+         "Authorization": "Basic ZjIyMDIwYjRkMTIyM2UyNGI4NjVlMWIxZWI0YzAzZTM6NWJjS01PbENMTDRTV1MxaERkSHlTRzViSTJCd3psR1A=" + api_key,
+         "Content-Type": "application/json"
+     }
+     data = {
+         "model": model,
+         "messages": [
+             {
+                 "role": "system",
+                 "content": system_content
+             },
+             {
+                 "role": "user",
+                 "content": user_content
+             }
+         ],
+         "temperature": 0.7,
+         "stream": False,
+         "max_tokens": 4096
+     }
+     max_retries = 5
+     retry_count = 0
+     while retry_count < max_retries:
+         try:
+             response = requests.post(url, json=data, headers=headers, timeout=60)
+             if response.status_code == 200:
+                 response = response.json()
+                 text = response["choices"][0]["message"]["content"]
+                 return text
+             else:
+                 logger.warning(f"{response.status_code}, {response.text}")
+                 retry_count += 1
+                 time.sleep(2)
+                 logger.info(f"Retrying... attempt {retry_count}")
+         except Exception as e:
+             logger.error(e)
+             retry_count += 1
+             time.sleep(2)
+             logger.info(f"Retrying... attempt {retry_count}")
+     return "error"
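
yi_moe retries up to five times with a fixed 2 s sleep before giving up. app.py calls it positionally as (api_key, model, user_content, system_content); a matching sketch (the key is hypothetical, the endpoint is the internal one hardcoded above):

from llm.yi_moe_api import yi_moe

text = yi_moe("user-key",  # hypothetical key from the APIKEY textbox
              "22A",       # model name forwarded to the backend
              "找出与读书相关的片段\n" + "1\n00:00:00,000 --> 00:00:05,000\n...",
              "你是一个视频srt字幕分析剪辑器")
print(text)  # model reply, or "error" after five failed attempts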
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ librosa
+ soundfile
+ moviepy
+ numpy==1.26.4
+ openai
+ openai-whisper
utils/__pycache__/argparse_tools.cpython-310.pyc ADDED
Binary file (2.44 kB)

utils/__pycache__/argparse_tools.cpython-311.pyc ADDED
Binary file (4.08 kB)

utils/__pycache__/subtitle_utils.cpython-310.pyc ADDED
Binary file (3.7 kB)

utils/__pycache__/subtitle_utils.cpython-311.pyc ADDED
Binary file (11.4 kB)

utils/__pycache__/trans_utils.cpython-310.pyc ADDED
Binary file (4.33 kB)

utils/__pycache__/trans_utils.cpython-311.pyc ADDED
Binary file (8.45 kB)

utils/argparse_tools.py ADDED
@@ -0,0 +1,88 @@
+ #!/usr/bin/env python3
+ # -*- encoding: utf-8 -*-
+ # Copyright FunASR (https://github.com/alibaba-damo-academy/FunClip). All Rights Reserved.
+ # MIT License (https://opensource.org/licenses/MIT)
+
+ import argparse
+ from pathlib import Path
+
+ import yaml
+ import sys
+
+
+ class ArgumentParser(argparse.ArgumentParser):
+     """Simple implementation of ArgumentParser supporting config files.
+
+     This class originated from https://github.com/bw2/ConfigArgParse,
+     but it lacks some of the features that library has:
+
+     - Not supporting multiple config files
+     - Automatically adding "--config" as an option.
+     - Not supporting any formats other than yaml
+     - Not checking argument type
+
+     """
+
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+         self.add_argument("--config", help="Give config file in yaml format")
+
+     def parse_known_args(self, args=None, namespace=None):
+         # Parse once just to pick up the "--config" setting
+         _args, _ = super().parse_known_args(args, namespace)
+         if _args.config is not None:
+             if not Path(_args.config).exists():
+                 self.error(f"No such file: {_args.config}")
+
+             with open(_args.config, "r", encoding="utf-8") as f:
+                 d = yaml.safe_load(f)
+             if not isinstance(d, dict):
+                 self.error(f"Config file has non dict value: {_args.config}")
+
+             for key in d:
+                 for action in self._actions:
+                     if key == action.dest:
+                         break
+                 else:
+                     self.error(f"unrecognized arguments: {key} (from {_args.config})")
+
+             # NOTE(kamo): Ignore "--config" from a config file
+             # NOTE(kamo): Unlike "configargparse", this module doesn't check type.
+             # i.e. We can set any type value regardless of argument type.
+             self.set_defaults(**d)
+         return super().parse_known_args(args, namespace)
+
+
+ def get_commandline_args():
+     extra_chars = [
+         " ",
+         ";",
+         "&",
+         "(",
+         ")",
+         "|",
+         "^",
+         "<",
+         ">",
+         "?",
+         "*",
+         "[",
+         "]",
+         "$",
+         "`",
+         '"',
+         "\\",
+         "!",
+         "{",
+         "}",
+     ]
+
+     # Escape the extra characters for shell
+     argv = [
+         arg.replace("'", "'\\''")
+         if all(char not in arg for char in extra_chars)
+         else "'" + arg.replace("'", "'\\''") + "'"
+         for arg in sys.argv
+     ]
+
+     return sys.executable + " " + " ".join(argv)
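
A hedged example of the --config flow (file name hypothetical): YAML values become argument defaults, and explicit command-line flags still win:

# config.yaml contains:
#   stage: 1
#   file: input.mp4
from utils.argparse_tools import ArgumentParser

parser = ArgumentParser()
parser.add_argument("--stage", type=int, default=0)
parser.add_argument("--file", type=str, default=None)
args = parser.parse_args(["--config", "config.yaml", "--stage", "2"])
# args.stage == 2 (CLI overrides YAML), args.file == "input.mp4" (YAML default)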
utils/subtitle_utils.py ADDED
@@ -0,0 +1,189 @@
+ #!/usr/bin/env python3
+ # -*- encoding: utf-8 -*-
+ # Copyright FunASR (https://github.com/alibaba-damo-academy/FunClip). All Rights Reserved.
+ # MIT License (https://opensource.org/licenses/MIT)
+ import re
+
+ def time_convert(ms):
+     ms = int(ms)
+     tail = ms % 1000
+     s = ms // 1000
+     mi = s // 60
+     s = s % 60
+     h = mi // 60
+     mi = mi % 60
+     h = "00" if h == 0 else str(h)
+     mi = "00" if mi == 0 else str(mi)
+     s = "00" if s == 0 else str(s)
+     tail = str(tail)
+     if len(h) == 1: h = '0' + h
+     if len(mi) == 1: mi = '0' + mi
+     if len(s) == 1: s = '0' + s
+     return "{}:{}:{},{}".format(h, mi, s, tail)
+
+ def str2list(text):
+     pattern = re.compile(r'[\u4e00-\u9fff]|[\w-]+', re.UNICODE)
+     elements = pattern.findall(text)
+     return elements
+
+ class Text2SRT():
+     def __init__(self, text, timestamp, offset=0):
+         self.token_list = text
+         self.timestamp = timestamp
+         start, end = timestamp[0][0] - offset, timestamp[-1][1] - offset
+         self.start_sec, self.end_sec = start, end
+         self.start_time = time_convert(start)
+         self.end_time = time_convert(end)
+     def text(self):
+         if isinstance(self.token_list, str):
+             return self.token_list
+         else:
+             res = ""
+             for word in self.token_list:
+                 if '\u4e00' <= word <= '\u9fff':
+                     res += word
+                 else:
+                     res += " " + word
+             return res.lstrip()
+     def srt(self, acc_ost=0.0):
+         return "{} --> {}\n{}\n".format(
+             time_convert(self.start_sec+acc_ost*1000),
+             time_convert(self.end_sec+acc_ost*1000),
+             self.text())
+     def time(self, acc_ost=0.0):
+         return (self.start_sec/1000+acc_ost, self.end_sec/1000+acc_ost)
+
+ class Text2SRT_audio():
+     def __init__(self, text, start, end, offset=0):
+         self.token_list = text
+         start, end = start*1000 - offset, end*1000 - offset
+         self.start_sec, self.end_sec = start, end
+         self.start_time = time_convert(start)
+         self.end_time = time_convert(end)
+     def text(self):
+         if isinstance(self.token_list, str):
+             return self.token_list
+         else:
+             res = ""
+             for word in self.token_list:
+                 if '\u4e00' <= word <= '\u9fff':
+                     res += word
+                 else:
+                     res += " " + word
+             return res.lstrip()
+     def srt(self, acc_ost=0.0):
+         return "{} --> {}\n{}\n".format(
+             time_convert(self.start_sec+acc_ost*1000),
+             time_convert(self.end_sec+acc_ost*1000),
+             self.text())
+     def time(self, acc_ost=0.0):
+         return (self.start_sec/1000+acc_ost, self.end_sec/1000+acc_ost)
+
+ def generate_srt(sentence_list):
+     srt_total = ''
+     for i, sent in enumerate(sentence_list):
+         t2s = Text2SRT(sent['text'], sent['timestamp'])
+         if 'spk' in sent:
+             srt_total += "{} spk{}\n{}".format(i, sent['spk'], t2s.srt())
+         else:
+             srt_total += "{}\n{}".format(i, t2s.srt())
+     return srt_total
+
+ def trans_format(text):
+     # Convert whisper's recognition result into the data format used downstream.
+     total_list = []
+     timestamp_list = []
+     sentence_info = []
+     for segment in text["segments"]:
+         timestamp_list.append([int(segment["start"]*1000), int(segment["end"]*1000)])
+         if segment["words"] != []:
+             sentence_info.append({
+                 "text": segment["text"],
+                 "start": int(segment["start"]*1000),
+                 "end": int(segment["end"]*1000),
+                 "timestamp": [[int(item['start']*1000), int(item['end']*1000)] for item in segment["words"]],
+                 "raw_text": segment["text"]
+             })
+     raw_text = text["text"]
+     total_list.append({"text": text["text"], "raw_text": raw_text, "timestamp": timestamp_list, "sentence_info": sentence_info})
+     return total_list
+
+
+ def generate_audio_srt(sentence_list):
+     '''Generate SRT-format subtitles from audio transcription sentences.'''
+     srt_total = ''
+     for i, sent in enumerate(sentence_list):
+         t2s = Text2SRT_audio(sent['text'], sent['start'], sent['end'])
+         if 'spk' in sent:
+             srt_total += "{} spk{}\n{}".format(i, sent['spk'], t2s.srt())
+         else:
+             srt_total += "{}\n{}".format(i, t2s.srt())
+     return srt_total
+
+ def generate_srt_clip(sentence_list, start, end, begin_index=0, time_acc_ost=0.0):
+     '''
+     Generate the subtitle snippet for a clip.
+     return:
+         srt_total: the generated SRT-format subtitle text.
+         subs: the subtitle time ranges and texts, as [(time, text), ...].
+         cc: the final subtitle index.
+     '''
+     start, end = int(start * 1000), int(end * 1000)
+     srt_total = ''
+     cc = 1 + begin_index
+     subs = []
+     for _, sent in enumerate(sentence_list):
+         if isinstance(sent['text'], str):
+             sent['text'] = str2list(sent['text'])
+         if sent['timestamp'][-1][1] <= start:
+             # CASE0: sentence ends before the clip window
+             continue
+         if sent['timestamp'][0][0] >= end:
+             # CASE4: sentence starts after the clip window
+             break
+         # parts in between
+         if (sent['timestamp'][-1][1] <= end and sent['timestamp'][0][0] > start) or (sent['timestamp'][-1][1] == end and sent['timestamp'][0][0] == start):
+             # CASE1: sentence fully inside the clip window
+             t2s = Text2SRT(sent['text'], sent['timestamp'], offset=start)
+             srt_total += "{}\n{}".format(cc, t2s.srt(time_acc_ost))
+             subs.append((t2s.time(time_acc_ost), t2s.text()))
+             cc += 1
+             continue
+         if sent['timestamp'][0][0] <= start:
+             # CASE2: sentence overlaps the start of the clip window
+             if not sent['timestamp'][-1][1] > end:
+                 for j, ts in enumerate(sent['timestamp']):
+                     if ts[1] > start:
+                         break
+                 _text = sent['text'][j:]
+                 _ts = sent['timestamp'][j:]
+             else:
+                 for j, ts in enumerate(sent['timestamp']):
+                     if ts[1] > start:
+                         _start = j
+                         break
+                 for j, ts in enumerate(sent['timestamp']):
+                     if ts[1] > end:
+                         _end = j
+                         break
+                 _text = sent['text'][_start:_end]
+                 _ts = sent['timestamp'][_start:_end]
+             if len(_ts):
+                 t2s = Text2SRT(_text, _ts, offset=start)
+                 srt_total += "{}\n{}".format(cc, t2s.srt(time_acc_ost))
+                 subs.append((t2s.time(time_acc_ost), t2s.text()))
+                 cc += 1
+             continue
+         if sent['timestamp'][-1][1] > end:
+             # CASE3: sentence overlaps the end of the clip window
+             for j, ts in enumerate(sent['timestamp']):
+                 if ts[1] > end:
+                     break
+             _text = sent['text'][:j]
+             _ts = sent['timestamp'][:j]
+             if len(_ts):
+                 t2s = Text2SRT(_text, _ts, offset=start)
+                 srt_total += "{}\n{}".format(cc, t2s.srt(time_acc_ost))
+                 subs.append(
+                     (t2s.time(time_acc_ost), t2s.text())
+                 )
+                 cc += 1
+             continue
+     return srt_total, subs, cc
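
A small illustration of the shapes generate_srt expects (millisecond timestamps; values made up):

from utils.subtitle_utils import generate_srt

sentences = [
    {"text": "大家好", "timestamp": [[0, 400], [400, 800], [800, 1300]]},
    {"text": "hello world", "timestamp": [[1500, 2100]], "spk": 0},
]
print(generate_srt(sentences))
# 0
# 00:00:00,0 --> 00:00:01,300
# 大家好
# 1 spk0
# 00:00:01,500 --> 00:00:02,100
# hello world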
utils/theme.json ADDED
@@ -0,0 +1,333 @@
+ {
+   "theme": {
+     "_font": [
+       {
+         "__gradio_font__": true,
+         "name": "Montserrat",
+         "class": "google"
+       },
+       {
+         "__gradio_font__": true,
+         "name": "ui-sans-serif",
+         "class": "font"
+       },
+       {
+         "__gradio_font__": true,
+         "name": "system-ui",
+         "class": "font"
+       },
+       {
+         "__gradio_font__": true,
+         "name": "sans-serif",
+         "class": "font"
+       }
+     ],
+     "_font_mono": [
+       {
+         "__gradio_font__": true,
+         "name": "IBM Plex Mono",
+         "class": "google"
+       },
+       {
+         "__gradio_font__": true,
+         "name": "ui-monospace",
+         "class": "font"
+       },
+       {
+         "__gradio_font__": true,
+         "name": "Consolas",
+         "class": "font"
+       },
+       {
+         "__gradio_font__": true,
+         "name": "monospace",
+         "class": "font"
+       }
+     ],
+     "background_fill_primary": "*neutral_50",
+     "background_fill_primary_dark": "*neutral_950",
+     "background_fill_secondary": "*neutral_50",
+     "background_fill_secondary_dark": "*neutral_900",
+     "block_background_fill": "white",
+     "block_background_fill_dark": "*neutral_800",
+     "block_border_color": "*border_color_primary",
+     "block_border_color_dark": "*border_color_primary",
+     "block_border_width": "0px",
+     "block_border_width_dark": "0px",
+     "block_info_text_color": "*body_text_color_subdued",
+     "block_info_text_color_dark": "*body_text_color_subdued",
+     "block_info_text_size": "*text_sm",
+     "block_info_text_weight": "400",
+     "block_label_background_fill": "*primary_100",
+     "block_label_background_fill_dark": "*primary_600",
+     "block_label_border_color": "*border_color_primary",
+     "block_label_border_color_dark": "*border_color_primary",
+     "block_label_border_width": "1px",
+     "block_label_border_width_dark": "1px",
+     "block_label_margin": "*spacing_md",
+     "block_label_padding": "*spacing_sm *spacing_md",
+     "block_label_radius": "*radius_md",
+     "block_label_right_radius": "0 calc(*radius_lg - 1px) 0 calc(*radius_lg - 1px)",
+     "block_label_text_color": "*primary_500",
+     "block_label_text_color_dark": "*white",
+     "block_label_text_size": "*text_md",
+     "block_label_text_weight": "600",
+     "block_padding": "*spacing_xl calc(*spacing_xl + 2px)",
+     "block_radius": "*radius_lg",
+     "block_shadow": "none",
+     "block_shadow_dark": "none",
+     "block_title_background_fill": "*block_label_background_fill",
+     "block_title_background_fill_dark": "*block_label_background_fill",
+     "block_title_border_color": "none",
+     "block_title_border_color_dark": "none",
+     "block_title_border_width": "0px",
+     "block_title_border_width_dark": "0px",
+     "block_title_padding": "*block_label_padding",
+     "block_title_radius": "*block_label_radius",
+     "block_title_text_color": "*primary_500",
+     "block_title_text_color_dark": "*white",
+     "block_title_text_size": "*text_md",
+     "block_title_text_weight": "600",
+     "body_background_fill": "*background_fill_primary",
+     "body_background_fill_dark": "*background_fill_primary",
+     "body_text_color": "*neutral_800",
+     "body_text_color_dark": "*neutral_100",
+     "body_text_color_subdued": "*neutral_400",
+     "body_text_color_subdued_dark": "*neutral_400",
+     "body_text_size": "*text_md",
+     "body_text_weight": "400",
+     "border_color_accent": "*primary_300",
+     "border_color_accent_dark": "*neutral_600",
+     "border_color_primary": "*neutral_200",
+     "border_color_primary_dark": "*neutral_700",
+     "button_border_width": "*input_border_width",
+     "button_border_width_dark": "*input_border_width",
+     "button_cancel_background_fill": "*button_secondary_background_fill",
+     "button_cancel_background_fill_dark": "*button_secondary_background_fill",
+     "button_cancel_background_fill_hover": "*button_secondary_background_fill_hover",
+     "button_cancel_background_fill_hover_dark": "*button_secondary_background_fill_hover",
+     "button_cancel_border_color": "*button_secondary_border_color",
+     "button_cancel_border_color_dark": "*button_secondary_border_color",
+     "button_cancel_border_color_hover": "*button_cancel_border_color",
+     "button_cancel_border_color_hover_dark": "*button_cancel_border_color",
+     "button_cancel_text_color": "*button_secondary_text_color",
+     "button_cancel_text_color_dark": "*button_secondary_text_color",
+     "button_cancel_text_color_hover": "*button_cancel_text_color",
+     "button_cancel_text_color_hover_dark": "*button_cancel_text_color",
+     "button_large_padding": "*spacing_lg calc(2 * *spacing_lg)",
+     "button_large_radius": "*radius_lg",
+     "button_large_text_size": "*text_lg",
+     "button_large_text_weight": "600",
+     "button_primary_background_fill": "*primary_500",
+     "button_primary_background_fill_dark": "*primary_700",
+     "button_primary_background_fill_hover": "*primary_400",
+     "button_primary_background_fill_hover_dark": "*primary_500",
+     "button_primary_border_color": "*primary_200",
+     "button_primary_border_color_dark": "*primary_600",
+     "button_primary_border_color_hover": "*button_primary_border_color",
+     "button_primary_border_color_hover_dark": "*button_primary_border_color",
+     "button_primary_text_color": "white",
+     "button_primary_text_color_dark": "white",
+     "button_primary_text_color_hover": "*button_primary_text_color",
+     "button_primary_text_color_hover_dark": "*button_primary_text_color",
+     "button_secondary_background_fill": "white",
+     "button_secondary_background_fill_dark": "*neutral_600",
+     "button_secondary_background_fill_hover": "*neutral_100",
+     "button_secondary_background_fill_hover_dark": "*primary_500",
+     "button_secondary_border_color": "*neutral_200",
+     "button_secondary_border_color_dark": "*neutral_600",
+     "button_secondary_border_color_hover": "*button_secondary_border_color",
+     "button_secondary_border_color_hover_dark": "*button_secondary_border_color",
+     "button_secondary_text_color": "*neutral_800",
+     "button_secondary_text_color_dark": "white",
+     "button_secondary_text_color_hover": "*button_secondary_text_color",
+     "button_secondary_text_color_hover_dark": "*button_secondary_text_color",
+     "button_shadow": "*shadow_drop_lg",
+     "button_shadow_active": "*shadow_inset",
+     "button_shadow_hover": "*shadow_drop_lg",
+     "button_small_padding": "*spacing_sm calc(2 * *spacing_sm)",
+     "button_small_radius": "*radius_lg",
+     "button_small_text_size": "*text_md",
+     "button_small_text_weight": "400",
+     "button_transition": "background-color 0.2s ease",
+     "checkbox_background_color": "*background_fill_primary",
+     "checkbox_background_color_dark": "*neutral_800",
+     "checkbox_background_color_focus": "*checkbox_background_color",
+     "checkbox_background_color_focus_dark": "*checkbox_background_color",
+     "checkbox_background_color_hover": "*checkbox_background_color",
+     "checkbox_background_color_hover_dark": "*checkbox_background_color",
+     "checkbox_background_color_selected": "*primary_600",
+     "checkbox_background_color_selected_dark": "*primary_700",
+     "checkbox_border_color": "*neutral_100",
+     "checkbox_border_color_dark": "*neutral_600",
+     "checkbox_border_color_focus": "*primary_500",
+     "checkbox_border_color_focus_dark": "*primary_600",
+     "checkbox_border_color_hover": "*neutral_300",
+     "checkbox_border_color_hover_dark": "*neutral_600",
+     "checkbox_border_color_selected": "*primary_600",
+     "checkbox_border_color_selected_dark": "*primary_700",
+     "checkbox_border_radius": "*radius_sm",
+     "checkbox_border_width": "1px",
+     "checkbox_border_width_dark": "*input_border_width",
+     "checkbox_check": "url(\"data:image/svg+xml,%3csvg viewBox='0 0 16 16' fill='white' xmlns='http://www.w3.org/2000/svg'%3e%3cpath d='M12.207 4.793a1 1 0 010 1.414l-5 5a1 1 0 01-1.414 0l-2-2a1 1 0 011.414-1.414L6.5 9.086l4.293-4.293a1 1 0 011.414 0z'/%3e%3c/svg%3e\")",
+     "checkbox_label_background_fill": "*button_secondary_background_fill",
+     "checkbox_label_background_fill_dark": "*button_secondary_background_fill",
+     "checkbox_label_background_fill_hover": "*button_secondary_background_fill_hover",
+     "checkbox_label_background_fill_hover_dark": "*button_secondary_background_fill_hover",
+     "checkbox_label_background_fill_selected": "*primary_500",
+     "checkbox_label_background_fill_selected_dark": "*primary_600",
+     "checkbox_label_border_color": "*border_color_primary",
+     "checkbox_label_border_color_dark": "*border_color_primary",
+     "checkbox_label_border_color_hover": "*checkbox_label_border_color",
+     "checkbox_label_border_color_hover_dark": "*checkbox_label_border_color",
+     "checkbox_label_border_width": "*input_border_width",
+     "checkbox_label_border_width_dark": "*input_border_width",
+     "checkbox_label_gap": "*spacing_lg",
+     "checkbox_label_padding": "*spacing_md calc(2 * *spacing_md)",
+     "checkbox_label_shadow": "*shadow_drop_lg",
+     "checkbox_label_text_color": "*body_text_color",
+     "checkbox_label_text_color_dark": "*body_text_color",
+     "checkbox_label_text_color_selected": "white",
+     "checkbox_label_text_color_selected_dark": "*checkbox_label_text_color",
+     "checkbox_label_text_size": "*text_md",
+     "checkbox_label_text_weight": "400",
+     "checkbox_shadow": "none",
+     "color_accent": "*primary_500",
+     "color_accent_soft": "*primary_50",
+     "color_accent_soft_dark": "*neutral_700",
+     "container_radius": "*radius_lg",
+     "embed_radius": "*radius_lg",
+     "error_background_fill": "#fee2e2",
+     "error_background_fill_dark": "*background_fill_primary",
+     "error_border_color": "#fecaca",
+     "error_border_color_dark": "*border_color_primary",
+     "error_border_width": "1px",
+     "error_border_width_dark": "1px",
+     "error_text_color": "#ef4444",
+     "error_text_color_dark": "#ef4444",
+     "font": "'Montserrat', 'ui-sans-serif', 'system-ui', sans-serif",
+     "font_mono": "'IBM Plex Mono', 'ui-monospace', 'Consolas', monospace",
+     "form_gap_width": "0px",
+     "input_background_fill": "white",
+     "input_background_fill_dark": "*neutral_700",
+     "input_background_fill_focus": "*secondary_500",
+     "input_background_fill_focus_dark": "*secondary_600",
+     "input_background_fill_hover": "*input_background_fill",
+     "input_background_fill_hover_dark": "*input_background_fill",
+     "input_border_color": "*neutral_50",
+     "input_border_color_dark": "*border_color_primary",
+     "input_border_color_focus": "*secondary_300",
+     "input_border_color_focus_dark": "*neutral_700",
+     "input_border_color_hover": "*input_border_color",
+     "input_border_color_hover_dark": "*input_border_color",
+     "input_border_width": "0px",
+     "input_border_width_dark": "0px",
+     "input_padding": "*spacing_xl",
+     "input_placeholder_color": "*neutral_400",
+     "input_placeholder_color_dark": "*neutral_500",
+     "input_radius": "*radius_lg",
+     "input_shadow": "*shadow_drop",
+     "input_shadow_dark": "*shadow_drop",
+     "input_shadow_focus": "*shadow_drop_lg",
+     "input_shadow_focus_dark": "*shadow_drop_lg",
+     "input_text_size": "*text_md",
+     "input_text_weight": "400",
+     "layout_gap": "*spacing_xxl",
+     "link_text_color": "*secondary_600",
+     "link_text_color_active": "*secondary_600",
+     "link_text_color_active_dark": "*secondary_500",
+     "link_text_color_dark": "*secondary_500",
+     "link_text_color_hover": "*secondary_700",
+     "link_text_color_hover_dark": "*secondary_400",
+     "link_text_color_visited": "*secondary_500",
+     "link_text_color_visited_dark": "*secondary_600",
+     "loader_color": "*color_accent",
+     "loader_color_dark": "*color_accent",
+     "name": "base",
+     "neutral_100": "#f3f4f6",
+     "neutral_200": "#e5e7eb",
+     "neutral_300": "#d1d5db",
+     "neutral_400": "#9ca3af",
+     "neutral_50": "#f9fafb",
+     "neutral_500": "#6b7280",
+     "neutral_600": "#4b5563",
+     "neutral_700": "#374151",
+     "neutral_800": "#1f2937",
+     "neutral_900": "#111827",
+     "neutral_950": "#0b0f19",
+     "panel_background_fill": "*background_fill_secondary",
+     "panel_background_fill_dark": "*background_fill_secondary",
+     "panel_border_color": "*border_color_primary",
+     "panel_border_color_dark": "*border_color_primary",
+     "panel_border_width": "1px",
+     "panel_border_width_dark": "1px",
+     "primary_100": "#e0e7ff",
+     "primary_200": "#c7d2fe",
+     "primary_300": "#a5b4fc",
+     "primary_400": "#818cf8",
+     "primary_50": "#eef2ff",
+     "primary_500": "#6366f1",
+     "primary_600": "#4f46e5",
+     "primary_700": "#4338ca",
+     "primary_800": "#3730a3",
+     "primary_900": "#312e81",
+     "primary_950": "#2b2c5e",
+     "prose_header_text_weight": "600",
+     "prose_text_size": "*text_md",
+     "prose_text_weight": "400",
+     "radio_circle": "url(\"data:image/svg+xml,%3csvg viewBox='0 0 16 16' fill='white' xmlns='http://www.w3.org/2000/svg'%3e%3ccircle cx='8' cy='8' r='3'/%3e%3c/svg%3e\")",
+     "radius_lg": "6px",
+     "radius_md": "4px",
+     "radius_sm": "2px",
+     "radius_xl": "8px",
+     "radius_xs": "1px",
+     "radius_xxl": "12px",
+     "radius_xxs": "1px",
+     "secondary_100": "#ecfccb",
+     "secondary_200": "#d9f99d",
+     "secondary_300": "#bef264",
+     "secondary_400": "#a3e635",
+     "secondary_50": "#f7fee7",
+     "secondary_500": "#84cc16",
+     "secondary_600": "#65a30d",
+     "secondary_700": "#4d7c0f",
+     "secondary_800": "#3f6212",
+     "secondary_900": "#365314",
+     "secondary_950": "#2f4e14",
+     "section_header_text_size": "*text_md",
+     "section_header_text_weight": "400",
+     "shadow_drop": "0 1px 4px 0 rgb(0 0 0 / 0.1)",
+     "shadow_drop_lg": "0 2px 5px 0 rgb(0 0 0 / 0.1)",
+     "shadow_inset": "rgba(0,0,0,0.05) 0px 2px 4px 0px inset",
+     "shadow_spread": "6px",
+     "shadow_spread_dark": "1px",
+     "slider_color": "*primary_500",
+     "slider_color_dark": "*primary_600",
+     "spacing_lg": "6px",
+     "spacing_md": "4px",
+     "spacing_sm": "2px",
+     "spacing_xl": "9px",
+     "spacing_xs": "1px",
+     "spacing_xxl": "12px",
+     "spacing_xxs": "1px",
+     "stat_background_fill": "*primary_300",
+     "stat_background_fill_dark": "*primary_500",
+     "table_border_color": "*neutral_300",
+     "table_border_color_dark": "*neutral_700",
+     "table_even_background_fill": "white",
+     "table_even_background_fill_dark": "*neutral_950",
+     "table_odd_background_fill": "*neutral_50",
+     "table_odd_background_fill_dark": "*neutral_900",
+     "table_radius": "*radius_lg",
+     "table_row_focus": "*color_accent_soft",
+     "table_row_focus_dark": "*color_accent_soft",
+     "text_lg": "16px",
+     "text_md": "14px",
+     "text_sm": "12px",
+     "text_xl": "22px",
+     "text_xs": "10px",
+     "text_xxl": "26px",
+     "text_xxs": "9px"
+   },
+   "version": "0.0.1"
+ }
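
app.py does not reference this file yet; if it were wired in, loading would look roughly like the sketch below (assuming gradio's ThemeClass.load, which reads a dumped theme JSON; verify against the installed gradio version):

import gradio as gr

theme = gr.themes.ThemeClass.load("utils/theme.json")
with gr.Blocks(theme=theme) as demo:
    ...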
utils/trans_utils.py ADDED
@@ -0,0 +1,132 @@
+ #!/usr/bin/env python3
+ # -*- encoding: utf-8 -*-
+ # Copyright FunASR (https://github.com/alibaba-damo-academy/FunClip). All Rights Reserved.
+ # MIT License (https://opensource.org/licenses/MIT)
+
+ import os
+ import re
+ import numpy as np
+
+ PUNC_LIST = [',', '。', '!', '?', '、', ',', '.', '?', '!']
+
+ def pre_proc(text):
+     res = ''
+     for i in range(len(text)):
+         if text[i] in PUNC_LIST:
+             continue
+         if '\u4e00' <= text[i] <= '\u9fff':
+             if len(res) and res[-1] != " ":
+                 res += ' ' + text[i] + ' '
+             else:
+                 res += text[i] + ' '
+         else:
+             res += text[i]
+     if len(res) and res[-1] == ' ':
+         res = res[:-1]
+     return res
+
+ def proc(raw_text, timestamp, dest_text, lang='zh'):
+     # simple matching
+     ld = len(dest_text.split())
+     mi, ts = [], []
+     offset = 0
+     while True:
+         fi = raw_text.find(dest_text, offset, len(raw_text))
+         if fi == -1:
+             break
+         ti = raw_text[:fi].count(' ')
+         offset = fi + ld
+         mi.append(fi)
+         ts.append([timestamp[ti][0]*16, timestamp[ti+ld-1][1]*16])
+     return ts
+
+
+ def proc_spk(dest_spk, sd_sentences):
+     ts = []
+     for d in sd_sentences:
+         d_start = d['timestamp'][0][0]
+         d_end = d['timestamp'][-1][1]
+         spkid = dest_spk[3:]
+         if str(d['spk']) == spkid and d_end - d_start > 999:
+             ts.append([d_start*16, d_end*16])
+     return ts
+
+ def generate_vad_data(data, sd_sentences, sr=16000):
+     assert len(data.shape) == 1
+     vad_data = []
+     for d in sd_sentences:
+         d_start = round(d['ts_list'][0][0]/1000, 2)
+         d_end = round(d['ts_list'][-1][1]/1000, 2)
+         vad_data.append([d_start, d_end, data[int(d_start * sr):int(d_end * sr)]])
+     return vad_data
+
+ def write_state(output_dir, state):
+     for key in ['/recog_res_raw', '/timestamp', '/sentences']:
+         with open(output_dir+key, 'w') as fout:
+             fout.write(str(state[key[1:]]))
+     if 'sd_sentences' in state:
+         with open(output_dir+'/sd_sentences', 'w') as fout:
+             fout.write(str(state['sd_sentences']))
+
+ def load_state(output_dir):
+     # State files are written with str(); eval() restores the Python literals.
+     state = {}
+     with open(output_dir+'/recog_res_raw') as fin:
+         line = fin.read()
+         state['recog_res_raw'] = line
+     with open(output_dir+'/timestamp') as fin:
+         line = fin.read()
+         state['timestamp'] = eval(line)
+     with open(output_dir+'/sentences') as fin:
+         line = fin.read()
+         state['sentences'] = eval(line)
+     if os.path.exists(output_dir+'/sd_sentences'):
+         with open(output_dir+'/sd_sentences') as fin:
+             line = fin.read()
+             state['sd_sentences'] = eval(line)
+     return state
+
+ def convert_pcm_to_float(data):
+     if data.dtype == np.float64:
+         return data
+     elif data.dtype == np.float32:
+         return data.astype(np.float64)
+     elif data.dtype == np.int16:
+         bit_depth = 16
+     elif data.dtype == np.int32:
+         bit_depth = 32
+     elif data.dtype == np.int8:
+         bit_depth = 8
+     else:
+         raise ValueError("Unsupported audio data type")
+
+     # Now handle the integer types
+     max_int_value = float(2 ** (bit_depth - 1))
+     if bit_depth == 8:
+         data = data - 128
+     return (data.astype(np.float64) / max_int_value)
+
+ def convert_time_to_millis(time_str):
+     # Format: hours:minutes:seconds,milliseconds
+     hours, minutes, seconds, milliseconds = map(int, re.split('[:,]', time_str))
+     return (hours * 3600 + minutes * 60 + seconds) * 1000 + milliseconds
+
+ def extract_timestamps(input_text):
+     # Find all [start - end] timestamp pairs with a regex.
+     timestamps = re.findall(r'\[(\d{2}:\d{2}:\d{2},\d{2,3})\s*-\s*(\d{2}:\d{2}:\d{2},\d{2,3})\]', input_text)
+     times_list = []
+     # Convert every matched pair to milliseconds.
+     for start_time, end_time in timestamps:
+         start_millis = convert_time_to_millis(start_time)
+         end_millis = convert_time_to_millis(end_time)
+         times_list.append([start_millis, end_millis])
+
+     return times_list
+
+
+ if __name__ == '__main__':
+     text = ("1. [00:00:00,500-00:00:05,850] 在我们的设计普惠当中,有一个我经常津津乐道的项目叫寻找远方的美好。"
+             "2. [00:00:07,120-00:00:12,940] 啊,在这样一个我们叫寻美在这样的一个项目当中,我们把它跟乡村振兴去结合起来,利用我们的设计的能力。"
+             "3. [00:00:13,240-00:00:25,620] 问我们自身员工的设设计能力,我们设计生态伙伴的能力,帮助乡村振兴当中,要希望把他的产品推向市场,把他的农产品把他加工产品推向市场的这样的伙伴做一件事情,")
+
+     print(extract_timestamps(text))
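
For reference, convert_pcm_to_float maps integer PCM onto the float range [-1, 1):

import numpy as np
from utils.trans_utils import convert_pcm_to_float

pcm = np.array([-32768, 0, 32767], dtype=np.int16)
print(convert_pcm_to_float(pcm))  # [-1.0, 0.0, 0.99996948...] as float64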
videoclipper.py ADDED
@@ -0,0 +1,348 @@
+ #!/usr/bin/env python3
+ # -*- encoding: utf-8 -*-
+ # Copyright FunASR (https://github.com/alibaba-damo-academy/FunClip). All Rights Reserved.
+ # MIT License (https://opensource.org/licenses/MIT)
+
+ import re
+ import os
+ import copy
+ import librosa
+ import logging
+ import argparse
+ import numpy as np
+ import moviepy.editor as mpy
+ from moviepy.editor import VideoFileClip, concatenate_videoclips, CompositeVideoClip
+ from moviepy.video.tools.subtitles import SubtitlesClip, TextClip
+ from utils.subtitle_utils import generate_srt, generate_srt_clip, generate_audio_srt, trans_format
+ from utils.argparse_tools import ArgumentParser, get_commandline_args
+ from utils.trans_utils import pre_proc, proc, write_state, load_state, proc_spk, convert_pcm_to_float
+
+ class VideoClipper():
+     def __init__(self, model):
+         logging.warning("Initializing VideoClipper.")
+         self.GLOBAL_COUNT = 0
+         self.model = model
+
+     def recog(self, audio_input, state=None, output_dir=None, text=None):
+         '''
+         Convert the audio input to text, optionally with speaker diarization (SD)
+         and SRT subtitle generation.
+         return:
+             res_text: the recognized text.
+             res_srt: the recognition result in SRT subtitle format.
+             state: a state dict holding the raw result, timestamps and sentence info.
+         '''
+         if state is None:
+             state = {}
+         sr, data = audio_input
+
+         # Convert to float64 consistently (includes data type checking)
+         data = convert_pcm_to_float(data)
+
+         if sr != 16000:  # resample with librosa
+             data = librosa.resample(data, orig_sr=sr, target_sr=16000)
+         if len(data.shape) == 2:  # multi-channel wav input
+             logging.warning("Input wav shape: {}, only first channel reserved.".format(data.shape))
+             data = data[:, 0]
+         state['audio_input'] = (sr, data)
+         rec_result = trans_format(text)
+         res_srt = generate_srt(rec_result[0]['sentence_info'])
+         state['recog_res_raw'] = rec_result[0]['raw_text']
+         state['timestamp'] = rec_result[0]['timestamp']
+         state['sentences'] = rec_result[0]['sentence_info']
+         res_text = rec_result[0]['text']
+         return res_text, res_srt, state
+
+     def clip(self, dest_text, start_ost, end_ost, state, dest_spk=None, output_dir=None, timestamp_list=None, add_sub=False):
+         '''
+         dest_text: target text; the matching audio periods are located from it.
+         start_ost / end_ost: start and end offsets in ms to fine-tune each period.
+         state: the data the function needs (audio data, recognition result, timestamps, ...).
+         dest_spk: target speaker; if given, periods are extracted by speaker instead.
+         output_dir: output directory for results.
+         timestamp_list: if given, periods are extracted directly at these timestamps.
+         add_sub: accepted (and ignored) so AI_clip in app.py can call clip() and video_clip() uniformly.
+         '''
+         audio_input = state['audio_input']
+         recog_res_raw = state['recog_res_raw']
+         timestamp = state['timestamp']
+         sentences = state['sentences']
+         sr, data = audio_input
+         data = data.astype(np.float64)
+
+         log_append = ""
+         if timestamp_list is None:
+             all_ts = []
+             match = None
+             if dest_spk is None or dest_spk == '' or 'sd_sentences' not in state:
+                 for _dest_text in dest_text.split('#'):
+                     if '[' in _dest_text:
+                         match = re.search(r'\[(\d+),\s*(\d+)\]', _dest_text)
+                         if match:
+                             offset_b, offset_e = map(int, match.groups())
+                             log_append = ""
+                         else:
+                             offset_b, offset_e = 0, 0
+                             log_append = "(Bracket detected in dest_text but offset time matching failed)"
+                         _dest_text = _dest_text[:_dest_text.find('[')]
+                     else:
+                         log_append = ""
+                         offset_b, offset_e = 0, 0
+                     _dest_text = pre_proc(_dest_text)
+                     ts = proc(recog_res_raw, timestamp, _dest_text)  # get the matching timestamps
+                     for _ts in ts: all_ts.append([_ts[0]+offset_b*16, _ts[1]+offset_e*16])
+                     if len(ts) > 1 and match:
+                         log_append += '(offsets detected but sub-sentence matched multiple periods in audio, offsets are applied to all periods)'
+             else:
+                 for _dest_spk in dest_spk.split('#'):
+                     ts = proc_spk(_dest_spk, state['sd_sentences'])
+                     for _ts in ts: all_ts.append(_ts)
+         else:
+             all_ts = timestamp_list
+         ts = all_ts
+         srt_index = 0
+         clip_srt = ""
+         if len(ts):
+             start, end = ts[0]
+             start = min(max(0, start+start_ost*16), len(data))
+             end = min(max(0, end+end_ost*16), len(data))
+             res_audio = data[start:end]
+             start_end_info = "from {} to {}".format(start/16000, end/16000)
+             srt_clip, _, srt_index = generate_srt_clip(sentences, start/16000.0, end/16000.0, begin_index=srt_index)
+             clip_srt += srt_clip
+             for _ts in ts[1:]:  # multiple sentence input or multiple output matched
+                 start, end = _ts
+                 start = min(max(0, start+start_ost*16), len(data))
+                 end = min(max(0, end+end_ost*16), len(data))
+                 start_end_info += ", from {} to {}".format(start/16000, end/16000)
+                 res_audio = np.concatenate([res_audio, data[start:end]], -1)
+                 srt_clip, _, srt_index = generate_srt_clip(sentences, start/16000.0, end/16000.0, begin_index=srt_index-1)
+                 clip_srt += srt_clip
+         if len(ts):
+             message = "{} periods found in the speech: ".format(len(ts)) + start_end_info + log_append
+         else:
+             message = "No period found in the speech, return raw speech. You may check the recognition result and try other destination text."
+             res_audio = data
+         return (sr, res_audio), message, clip_srt  # audio data, log message and the clipped SRT subtitles
+
+     def video_recog(self, video_filename, output_dir=None, ASR="whisper"):
+         '''Process the video to obtain its audio track, transcription and related state.'''
+         video = mpy.VideoFileClip(video_filename)
+         # Extract the base name, add '_clip.mp4', and 'wav'
+         if output_dir is not None:
+             os.makedirs(output_dir, exist_ok=True)
+             _, base_name = os.path.split(video_filename)
+             base_name, _ = os.path.splitext(base_name)
+             clip_video_file = base_name + '_clip.mp4'
+             audio_file = base_name + '.wav'
+             audio_file = os.path.join(output_dir, audio_file)
+         else:
+             base_name, _ = os.path.splitext(video_filename)
+             clip_video_file = base_name + '_clip.mp4'
+             audio_file = base_name + '.wav'
+         video.audio.write_audiofile(audio_file)
+         # Transcribe the audio file with whisper
+         result_audio = self.model.transcribe(audio_file, language="zh", word_timestamps=True)
+         wav = librosa.load(audio_file, sr=16000)[0]
+         # delete the audio file after processing
+         if os.path.exists(audio_file):
+             os.remove(audio_file)
+         state = {
+             'video_filename': video_filename,
+             'clip_video_file': clip_video_file,
+             'video': video,
+         }
+         return self.recog((16000, wav), state, output_dir, text=result_audio)
+
+     def video_clip(self,
+                    dest_text,
+                    start_ost,
+                    end_ost,
+                    state,
+                    font_size=32,
+                    font_color='white',
+                    add_sub=False,
+                    dest_spk=None,
+                    output_dir=None,
+                    timestamp_list=None):
+         # get from state
+         recog_res_raw = state['recog_res_raw']
+         timestamp = state['timestamp']
+         sentences = state['sentences']
+         video = state['video']
+         clip_video_file = state['clip_video_file']
+         video_filename = state['video_filename']
+
+         if timestamp_list is None:
+             all_ts = []
+             match = None
+             if dest_spk is None or dest_spk == '' or 'sd_sentences' not in state:
+                 for _dest_text in dest_text.split('#'):
+                     if '[' in _dest_text:
+                         match = re.search(r'\[(\d+),\s*(\d+)\]', _dest_text)
+                         if match:
+                             offset_b, offset_e = map(int, match.groups())
+                             log_append = ""
+                         else:
+                             offset_b, offset_e = 0, 0
+                             log_append = "(Bracket detected in dest_text but offset time matching failed)"
+                         _dest_text = _dest_text[:_dest_text.find('[')]
+                     else:
+                         offset_b, offset_e = 0, 0
+                         log_append = ""
+                     _dest_text = pre_proc(_dest_text)
+                     ts = proc(recog_res_raw, timestamp, _dest_text.lower())
+                     for _ts in ts: all_ts.append([_ts[0]+offset_b*16, _ts[1]+offset_e*16])
+                     if len(ts) > 1 and match:
+                         log_append += '(offsets detected but sub-sentence matched multiple periods in audio, offsets are applied to all periods)'
+             else:
+                 for _dest_spk in dest_spk.split('#'):
+                     ts = proc_spk(_dest_spk, state['sd_sentences'])
+                     for _ts in ts: all_ts.append(_ts)
+         else:  # AI clip passes timestamps as input directly
+             all_ts = [[i[0]*16.0, i[1]*16.0] for i in timestamp_list]
+
+         srt_index = 0
+         time_acc_ost = 0.0
+         ts = all_ts
+         clip_srt = ""
+         if len(ts):
+             start, end = ts[0][0] / 16000, ts[0][1] / 16000
+             srt_clip, subs, srt_index = generate_srt_clip(sentences, start, end, begin_index=srt_index, time_acc_ost=time_acc_ost)
+             start, end = start+start_ost/1000.0, end+end_ost/1000.0
+             video_clip = video.subclip(start, end)
+             start_end_info = "from {} to {}".format(start, end)
+             clip_srt += srt_clip
+             if add_sub:  # overlay subtitles
+                 generator = lambda txt: TextClip(txt, font='./font/STHeitiMedium.ttc', fontsize=font_size, color=font_color)
+                 subtitles = SubtitlesClip(subs, generator)
+                 video_clip = CompositeVideoClip([video_clip, subtitles.set_pos(('center', 'bottom'))])
+             concate_clip = [video_clip]
+             time_acc_ost += end+end_ost/1000.0 - (start+start_ost/1000.0)
+             for _ts in ts[1:]:
+                 start, end = _ts[0] / 16000, _ts[1] / 16000
+                 srt_clip, subs, srt_index = generate_srt_clip(sentences, start, end, begin_index=srt_index-1, time_acc_ost=time_acc_ost)
+                 if not len(subs):
+                     continue
+                 chi_subs = []
+                 sub_starts = subs[0][0][0]
+                 for sub in subs:
+                     chi_subs.append(((sub[0][0]-sub_starts, sub[0][1]-sub_starts), sub[1]))
+                 start, end = start+start_ost/1000.0, end+end_ost/1000.0
+                 _video_clip = video.subclip(start, end)
+                 start_end_info += ", from {} to {}".format(str(start)[:5], str(end)[:5])
+                 clip_srt += srt_clip
+                 if add_sub:
+                     generator = lambda txt: TextClip(txt, font='./font/STHeitiMedium.ttc', fontsize=font_size, color=font_color)
+                     subtitles = SubtitlesClip(chi_subs, generator)
+                     _video_clip = CompositeVideoClip([_video_clip, subtitles.set_pos(('center', 'bottom'))])
+                 concate_clip.append(copy.copy(_video_clip))
+                 time_acc_ost += end+end_ost/1000.0 - (start+start_ost/1000.0)
+             message = "{} periods found in the audio: ".format(len(ts)) + start_end_info
+             logging.warning("Concating...")
+             if len(concate_clip) > 1:  # concatenate the video clips
+                 video_clip = concatenate_videoclips(concate_clip)
+             if output_dir is not None:
+                 os.makedirs(output_dir, exist_ok=True)
+                 _, file_with_extension = os.path.split(clip_video_file)
+                 clip_video_file_name, _ = os.path.splitext(file_with_extension)
+                 clip_video_file = os.path.join(output_dir, "{}_no{}.mp4".format(clip_video_file_name, self.GLOBAL_COUNT))
+                 temp_audio_file = os.path.join(output_dir, "{}_tempaudio_no{}.mp4".format(clip_video_file_name, self.GLOBAL_COUNT))
+             else:
+                 clip_video_file = clip_video_file[:-4] + '_no{}.mp4'.format(self.GLOBAL_COUNT)
+                 temp_audio_file = clip_video_file[:-4] + '_tempaudio_no{}.mp4'.format(self.GLOBAL_COUNT)
+             video_clip.write_videofile(clip_video_file, audio_codec="aac", temp_audiofile=temp_audio_file, fps=25)  # write to the target path
+             self.GLOBAL_COUNT += 1
+         else:
+             clip_video_file = video_filename
+             message = "No period found in the audio, return raw speech. You may check the recognition result and try other destination text."
+             srt_clip = ''
+         return clip_video_file, message, clip_srt
+
+
+ def get_parser():
+     parser = ArgumentParser(
+         description="ClipVideo Argument",
+         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+     )
+     parser.add_argument(
+         "--stage",
+         type=int,
+         choices=(1, 2),
+         help="Stage, 1 for recognizing and 2 for clipping",
+         required=True
+     )
+     parser.add_argument(
+         "--file",
+         type=str,
+         default=None,
+         help="Input file path",
+         required=True
+     )
+     parser.add_argument(
+         "--sd_switch",
+         type=str,
+         choices=("no", "yes"),
+         default="no",
+         help="Turn on the speaker diarization or not",
+     )
+     parser.add_argument(
+         "--output_dir",
+         type=str,
+         default='./output',
+         help="Output files path",
+     )
+     parser.add_argument(
+         "--dest_text",
+         type=str,
+         default=None,
+         help="Destination text string for clipping",
+     )
+     parser.add_argument(
+         "--dest_spk",
+         type=str,
+         default=None,
+         help="Destination spk id for clipping",
+     )
+     parser.add_argument(
+         "--start_ost",
+         type=int,
+         default=0,
+         help="Offset time in ms at beginning for clipping"
+     )
+     parser.add_argument(
+         "--end_ost",
+         type=int,
+         default=0,
+         help="Offset time in ms at ending for clipping"
+     )
+     parser.add_argument(
+         "--output_file",
+         type=str,
+         default=None,
+         help="Output file path"
+     )
+     parser.add_argument(
+         "--lang",
+         type=str,
+         default='zh',
+         help="language"
+     )
+     return parser
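
get_parser is defined here but the file has no __main__ entry; a hedged sketch of how stage 1 (recognition) might be driven from the CLI (flow assumed from the parser's help text, not part of the commit):

# hypothetical driver script
import whisper
from videoclipper import VideoClipper, get_parser

if __name__ == "__main__":
    args = get_parser().parse_args()
    clipper = VideoClipper(whisper.load_model("tiny"))
    if args.stage == 1:
        res_text, res_srt, state = clipper.video_recog(args.file, output_dir=args.output_dir)
        print(res_srt)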