01du committed on
Commit cdbb2b2 · 1 Parent(s): 23cc4a2

Add application file
README.md CHANGED
@@ -7,6 +7,9 @@ sdk: gradio
  sdk_version: 5.5.0
  app_file: app.py
  pinned: false
+ hf_oauth: true
+ hf_oauth_scopes:
+ - email
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
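
Enabling hf_oauth adds a Hugging Face login flow to the Space. For reference, a minimal sketch of how the resulting token surfaces in app code (this mirrors the pattern app.py below relies on; gradio injects the gr.OAuthToken argument automatically):

import gradio as gr
from huggingface_hub import whoami

def greet(oauth_token: gr.OAuthToken | None) -> str:
    # None until the visitor has logged in via gr.LoginButton.
    if oauth_token is None:
        return "Not logged in."
    return f"Hello, {whoami(token=oauth_token.token)['name']}!"

with gr.Blocks() as demo:
    gr.LoginButton()
    out = gr.Textbox()
    demo.load(greet, inputs=None, outputs=out)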
__pycache__/videoclipper.cpython-311.pyc ADDED
Binary file (24.8 kB)

app.py ADDED
@@ -0,0 +1,184 @@
+ import logging
+ import os
+ from videoclipper import VideoClipper
+ import gradio as gr
+ import requests
+ from huggingface_hub import whoami
+ import whisper
+ from llm.openai_api import openai_call
+ from llm.yi_moe_api import yi_moe
+ from utils.trans_utils import extract_timestamps
+
+ API_URL_TEMPLATE = "https://api-yidong.lingyiwanwu.com/v1/ops/api_key?user_email={user_email}&user_source=huggingface"
+ model = whisper.load_model("tiny")
+ audio_clipper = VideoClipper(model)
+
+ def get_user_email(oauth_token: gr.OAuthToken | None) -> str | None:
+     def call_api(user_email):
+         url = API_URL_TEMPLATE.format(user_email=user_email)
+         headers = {"Authorization": f'Basic {os.getenv("AUTH")}'}
+         response = requests.post(url, headers=headers)
+         return response.json()["data"]["display_api_key"]
+
+     if oauth_token is None:
+         return None
+
+     user_info = whoami(token=oauth_token.token)
+     email = user_info.get("email")
+     return call_api(email)
+
+ def audio_recog(audio_input, output_dir):
+     return audio_clipper.recog(audio_input, None, output_dir=output_dir)
+
+ def video_recog(video_input, output_dir, ASR):
+     return audio_clipper.video_recog(video_input, output_dir=output_dir, ASR=ASR)
+
+ def video_clip(dest_text, video_spk_input, start_ost, end_ost, state, output_dir):
+     return audio_clipper.video_clip(
+         dest_text, start_ost, end_ost, state, dest_spk=video_spk_input, output_dir=output_dir
+     )
+
+ def mix_recog(video_input, audio_input, output_dir, ASR="whisper"):
+     '''
+     Recognize the video or audio input; returns the recognized text,
+     the SRT subtitles and the recognition state.
+     '''
+     output_dir = output_dir.strip()
+     if not len(output_dir):
+         output_dir = None
+     else:
+         output_dir = os.path.abspath(output_dir)
+     audio_state, video_state = None, None
+     if video_input is not None:
+         res_text, res_srt, video_state = video_recog(
+             video_input, output_dir=output_dir, ASR=ASR)
+         return res_text, res_srt, video_state, None
+
+     if audio_input is not None:
+         res_text, res_srt, audio_state = audio_recog(
+             audio_input, output_dir=output_dir)
+         return res_text, res_srt, None, audio_state
+
+ def llm_inference(system_content, user_content, srt_text, model, apikey):
+     SUPPORT_LLM_PREFIX = ['gpt', 'moonshot', '22A']
+     if model.startswith('gpt') or model.startswith('moonshot'):
+         return openai_call(apikey, model, system_content=system_content, user_content=user_content+'\n'+srt_text)
+     elif model.startswith('22A'):
+         return yi_moe(apikey, model, user_content+'\n'+srt_text, system_content)
+     else:
+         logging.error("LLM name error, only {} are supported as LLM name prefix."
+                       .format(SUPPORT_LLM_PREFIX))
+
+ def AI_clip(LLM_res, dest_text, video_spk_input, start_ost, end_ost, video_state, audio_state, output_dir):
+     timestamp_list = extract_timestamps(LLM_res)
+     output_dir = output_dir.strip()
+     if not len(output_dir):
+         output_dir = None
+     else:
+         output_dir = os.path.abspath(output_dir)
+     if video_state is not None:
+         clip_video_file, message, clip_srt = audio_clipper.video_clip(
+             dest_text, start_ost, end_ost, video_state,
+             dest_spk=video_spk_input, output_dir=output_dir, timestamp_list=timestamp_list, add_sub=False)
+         return clip_video_file, None, message, clip_srt
+     if audio_state is not None:
+         (sr, res_audio), message, clip_srt = audio_clipper.clip(
+             dest_text, start_ost, end_ost, audio_state,
+             dest_spk=video_spk_input, output_dir=output_dir, timestamp_list=timestamp_list, add_sub=False)
+         return None, (sr, res_audio), message, clip_srt
+
+ with gr.Blocks() as clip_service:
+     video_state, audio_state = gr.State(), gr.State()
+     with gr.Row():
+         login_button = gr.LoginButton()
+         user_email_display = gr.Textbox(
+             label="To get your user key, click the Hugging Face login button. The full key is shown only on your first login, so save it; afterwards it will be hidden.",
+             interactive=True,
+         )
+     clip_service.load(get_user_email, inputs=None, outputs=user_email_display)
+     logging.info(f"The value of the current variable is: {user_email_display}")
+     video_input = gr.Video(label="视频输入 | Video Input")
+     audio_input = gr.Audio(label="音频输入 | Audio Input")
+     with gr.Column():
+         gr.Examples(['https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ClipVideo/%E4%B8%BA%E4%BB%80%E4%B9%88%E8%A6%81%E5%A4%9A%E8%AF%BB%E4%B9%A6%EF%BC%9F%E8%BF%99%E6%98%AF%E6%88%91%E5%90%AC%E8%BF%87%E6%9C%80%E5%A5%BD%E7%9A%84%E7%AD%94%E6%A1%88-%E7%89%87%E6%AE%B5.mp4',
+                      'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ClipVideo/2022%E4%BA%91%E6%A0%96%E5%A4%A7%E4%BC%9A_%E7%89%87%E6%AE%B52.mp4',
+                      'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ClipVideo/%E4%BD%BF%E7%94%A8chatgpt_%E7%89%87%E6%AE%B5.mp4'],
+                     [video_input],
+                     label='示例视频 | Demo Video')
+         gr.Examples(['https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ClipVideo/%E8%AE%BF%E8%B0%88.mp4'],
+                     [video_input],
+                     label='多说话人示例视频 | Multi-speaker Demo Video')
+         gr.Examples(['https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ClipVideo/%E9%B2%81%E8%82%83%E9%87%87%E8%AE%BF%E7%89%87%E6%AE%B51.wav'],
+                     [audio_input],
+                     label="示例音频 | Demo Audio", visible=False)
+     ASR_model = gr.Dropdown(
+         choices=["whisper"],
+         value="whisper",
+         label="ASR Model Name",
+         allow_custom_value=True)
+     recog_button = gr.Button("👂 识别 | ASR", variant="primary")
+     output_dir = gr.Textbox(label="📁 文件输出路径 | File Output Dir (可以为空,Linux, mac系统可以稳定使用)")
+     video_text_output = gr.Textbox(label="✏️ 识别结果 | Recognition Result")
+     video_srt_output = gr.Textbox(label="📖 SRT字幕内容 | SRT Subtitles")
+     prompt_head = gr.Textbox(label="Prompt System (按需更改,最好不要变动主体和要求)", value=("你是一个视频srt字幕分析剪辑器,输入视频的srt字幕和用户命令,请你结合用户指令选出符合要求的片段并输出。注意:要谨慎分析用户的问题,找出符合用户提问的srt字幕片段。"
+                              "尽可能将连续的片段并裁剪出来,将片段中在时间上连续的多个句子及它们的时间戳合并为一条,保证合并后的片段有着相同的主题"
+                              "注意确保文字与时间戳的正确匹配。你应该按照以下顺序进行处理:"
+                              "1.将srt字幕合并成数段主题的内容。2.将用户命令和查询进行匹配。"
+                              "输出需严格按照如下格式:1. [开始时间-结束时间] 文本,注意其中的连接符是“-”"))
+
+     prompt_head2 = gr.Textbox(label="Prompt User(请输入用户指令)")
+     with gr.Column():
+         with gr.Row():
+             llm_model = gr.Dropdown(
+                 choices=["gpt-4o", "22A"],
+                 value="22A",
+                 label="LLM Model Name",
+                 allow_custom_value=True)
+             apikey_input = gr.Textbox(label="APIKEY")
+         llm_button = gr.Button("LLM推理 | LLM Inference(首先进行识别,非g4f需配置对应apikey)", variant="primary")
+         llm_result = gr.Textbox(label="LLM Clipper Result")
+     llm_clip_button = gr.Button("🧠 LLM智能裁剪 | AI Clip", variant="primary")
+     video_text_input = gr.Textbox(label="✏️ 待裁剪文本 | Text to Clip (多段文本使用'#'连接)", value="这个不需要", visible=False)
+     video_spk_input = gr.Textbox(label="✏️ 待裁剪说话人 | Speaker to Clip (多个说话人使用'#'连接)", value="这个不需要", visible=False)
+     with gr.Row():
+         video_start_ost = gr.Slider(minimum=-500, maximum=1000, value=0, step=50, label="⏪ 开始位置偏移 | Start Offset (ms)", visible=False)
+         video_end_ost = gr.Slider(minimum=-500, maximum=1000, value=100, step=50, label="⏩ 结束位置偏移 | End Offset (ms)", visible=False)
+     video_output = gr.Video(label="裁剪结果 | Video Clipped")
+     audio_output = gr.Audio(label="裁剪结果 | Audio Clipped")
+     clip_message = gr.Textbox(label="⚠️ 裁剪信息 | Clipping Log")
+     srt_clipped = gr.Textbox(label="📖 裁剪部分SRT字幕内容 | Clipped SRT Subtitles")
+     recog_button.click(mix_recog,
+                        inputs=[video_input,
+                                audio_input,
+                                output_dir,
+                                ASR_model],
+                        outputs=[video_text_output, video_srt_output, video_state, audio_state])
+
+     llm_button.click(llm_inference,
+                      inputs=[prompt_head, prompt_head2, video_srt_output, llm_model, apikey_input],
+                      outputs=[llm_result])
+     llm_clip_button.click(AI_clip,
+                           inputs=[llm_result,
+                                   video_text_input,
+                                   video_spk_input,
+                                   video_start_ost,
+                                   video_end_ost,
+                                   video_state,
+                                   audio_state,
+                                   output_dir],
+                           outputs=[video_output, audio_output, clip_message, srt_clipped])
+
+
+ if __name__ == "__main__":
+     clip_service.queue(
+         max_size=10,
+         default_concurrency_limit=10,
+     )
+     clip_service.launch(ssr_mode=False)
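
The AI-clip path depends on the LLM echoing timestamps in the [HH:MM:SS,mmm-HH:MM:SS,mmm] form demanded by the system prompt; a small illustration of that contract (the LLM output line is made up):

from utils.trans_utils import extract_timestamps

llm_res = "1. [00:00:05,000-00:00:12,500] 关于读书的片段"  # hypothetical LLM output
print(extract_timestamps(llm_res))  # -> [[5000, 12500]]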
llm/__pycache__/demo_prompt.cpython-311.pyc ADDED
Binary file (6.17 kB)

llm/__pycache__/g4f_openai_api.cpython-310.pyc ADDED
Binary file (904 Bytes)

llm/__pycache__/g4f_openai_api.cpython-311.pyc ADDED
Binary file (1.49 kB)

llm/__pycache__/openai_api.cpython-310.pyc ADDED
Binary file (1.06 kB)

llm/__pycache__/openai_api.cpython-311.pyc ADDED
Binary file (1.83 kB)

llm/__pycache__/qwen_api.cpython-310.pyc ADDED
Binary file (813 Bytes)

llm/__pycache__/qwen_api.cpython-311.pyc ADDED
Binary file (1.24 kB)

llm/__pycache__/yi_moe_api.cpython-311.pyc ADDED
Binary file (2.13 kB)

llm/openai_api.py ADDED
@@ -0,0 +1,54 @@
+ import os
+ import logging
+ from openai import OpenAI
+
+
+ if __name__ == '__main__':
+     from llm.demo_prompt import demo_prompt
+     client = OpenAI(
+         # This is the default and can be omitted
+         api_key=os.environ.get("OPENAI_API_KEY"),
+     )
+
+     chat_completion = client.chat.completions.create(
+         messages=[
+             {
+                 "role": "user",
+                 "content": demo_prompt,
+             }
+         ],
+         model="gpt-3.5-turbo-0125",
+     )
+     print(chat_completion.choices[0].message.content)
+
+
+ def openai_call(apikey,
+                 model="gpt-3.5-turbo",
+                 user_content="如何做西红柿炖牛腩?",
+                 system_content=None):
+     client = OpenAI(
+         api_key=apikey,
+         base_url="https://api.lingyiwanwu.com/v1"
+     )
+
+     if system_content is not None and len(system_content.strip()):
+         messages = [
+             {'role': 'system', 'content': system_content},
+             {'role': 'user', 'content': user_content}
+         ]
+     else:
+         messages = [
+             {'role': 'user', 'content': user_content}
+         ]
+
+     chat_completion = client.chat.completions.create(
+         messages=messages,
+         model=model,
+     )
+     logging.info("OpenAI model inference done.")
+     return chat_completion.choices[0].message.content
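
For reference, a hedged usage sketch of openai_call (the key is a placeholder; the client is pointed at the Lingyiwanwu OpenAI-compatible endpoint configured above):

from llm.openai_api import openai_call

srt_text = "1\n00:00:00,000 --> 00:00:05,000\n为什么要多读书\n"
reply = openai_call("sk-placeholder",  # hypothetical key; app.py takes it from the APIKEY textbox
                    model="gpt-4o",
                    user_content="找出与读书相关的片段\n" + srt_text,
                    system_content="你是一个视频srt字幕分析剪辑器")
print(reply)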
llm/yi_moe_api.py ADDED
@@ -0,0 +1,52 @@
+ import requests
+ import logging
+ import time
+
+ logger = logging.getLogger(__name__)
+
+ def yi_moe(api_key,
+            model="moyi-chat-v03-sglang",
+            user_content="如何下载github上的项目?",
+            system_content=None):
+     url = "http://10.2.5.29:30869/v1/chat/completions"
+     headers = {
+         # NOTE: a fixed Basic credential is concatenated with the caller's api_key.
+         "Authorization": "Basic ZjIyMDIwYjRkMTIyM2UyNGI4NjVlMWIxZWI0YzAzZTM6NWJjS01PbENMTDRTV1MxaERkSHlTRzViSTJCd3psR1A=" + api_key,
+         "Content-Type": "application/json"
+     }
+     data = {
+         "model": model,
+         "messages": [
+             {
+                 "role": "system",
+                 "content": system_content
+             },
+             {
+                 "role": "user",
+                 "content": user_content
+             }
+         ],
+         "temperature": 0.7,
+         "stream": False,
+         "max_tokens": 4096
+     }
+     max_retries = 5
+     retry_count = 0
+     while retry_count < max_retries:
+         try:
+             response = requests.post(url, json=data, headers=headers, timeout=60)
+             if response.status_code == 200:
+                 response = response.json()
+                 text = response["choices"][0]["message"]["content"]
+                 return text
+             else:
+                 logger.warning(f"{response.status_code}, {response.text}")
+                 retry_count += 1
+                 time.sleep(2)
+                 logger.info(f"Retrying... attempt {retry_count}")
+         except Exception as e:
+             logger.error(e)
+             retry_count += 1
+             time.sleep(2)
+             logger.info(f"Retrying... attempt {retry_count}")
+     return "error"
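
yi_moe retries up to five times with a fixed 2 s sleep before giving up. app.py calls it positionally as (api_key, model, user_content, system_content); a matching sketch (the key is hypothetical, the endpoint is the internal one hardcoded above):

from llm.yi_moe_api import yi_moe

text = yi_moe("user-key",  # hypothetical key from the APIKEY textbox
              "22A",       # model name forwarded to the backend
              "找出与读书相关的片段\n" + "1\n00:00:00,000 --> 00:00:05,000\n...",
              "你是一个视频srt字幕分析剪辑器")
print(text)  # model reply, or "error" after five failed attempts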
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ librosa
+ soundfile
+ moviepy
+ numpy==1.26.4
+ openai
+ openai-whisper
utils/__pycache__/argparse_tools.cpython-310.pyc ADDED
Binary file (2.44 kB)

utils/__pycache__/argparse_tools.cpython-311.pyc ADDED
Binary file (4.08 kB)

utils/__pycache__/subtitle_utils.cpython-310.pyc ADDED
Binary file (3.7 kB)

utils/__pycache__/subtitle_utils.cpython-311.pyc ADDED
Binary file (11.4 kB)

utils/__pycache__/trans_utils.cpython-310.pyc ADDED
Binary file (4.33 kB)

utils/__pycache__/trans_utils.cpython-311.pyc ADDED
Binary file (8.45 kB)

utils/argparse_tools.py ADDED
@@ -0,0 +1,88 @@
+ #!/usr/bin/env python3
+ # -*- encoding: utf-8 -*-
+ # Copyright FunASR (https://github.com/alibaba-damo-academy/FunClip). All Rights Reserved.
+ # MIT License (https://opensource.org/licenses/MIT)
+
+ import argparse
+ from pathlib import Path
+
+ import yaml
+ import sys
+
+
+ class ArgumentParser(argparse.ArgumentParser):
+     """Simple implementation of ArgumentParser supporting config files.
+
+     This class originated from https://github.com/bw2/ConfigArgParse,
+     but it lacks some of the features that library has:
+
+     - Not supporting multiple config files
+     - Automatically adding "--config" as an option.
+     - Not supporting any formats other than yaml
+     - Not checking argument type
+
+     """
+
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+         self.add_argument("--config", help="Give config file in yaml format")
+
+     def parse_known_args(self, args=None, namespace=None):
+         # Parse once just to pick up the "--config" setting
+         _args, _ = super().parse_known_args(args, namespace)
+         if _args.config is not None:
+             if not Path(_args.config).exists():
+                 self.error(f"No such file: {_args.config}")
+
+             with open(_args.config, "r", encoding="utf-8") as f:
+                 d = yaml.safe_load(f)
+             if not isinstance(d, dict):
+                 self.error(f"Config file has non dict value: {_args.config}")
+
+             for key in d:
+                 for action in self._actions:
+                     if key == action.dest:
+                         break
+                 else:
+                     self.error(f"unrecognized arguments: {key} (from {_args.config})")
+
+             # NOTE(kamo): Ignore "--config" from a config file
+             # NOTE(kamo): Unlike "configargparse", this module doesn't check type.
+             # i.e. We can set any type value regardless of argument type.
+             self.set_defaults(**d)
+         return super().parse_known_args(args, namespace)
+
+
+ def get_commandline_args():
+     extra_chars = [
+         " ",
+         ";",
+         "&",
+         "(",
+         ")",
+         "|",
+         "^",
+         "<",
+         ">",
+         "?",
+         "*",
+         "[",
+         "]",
+         "$",
+         "`",
+         '"',
+         "\\",
+         "!",
+         "{",
+         "}",
+     ]
+
+     # Escape the extra characters for shell
+     argv = [
+         arg.replace("'", "'\\''")
+         if all(char not in arg for char in extra_chars)
+         else "'" + arg.replace("'", "'\\''") + "'"
+         for arg in sys.argv
+     ]
+
+     return sys.executable + " " + " ".join(argv)
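
A hedged example of the --config flow (file name hypothetical): YAML values become argument defaults, and explicit command-line flags still win:

# config.yaml contains:
#   stage: 1
#   file: input.mp4
from utils.argparse_tools import ArgumentParser

parser = ArgumentParser()
parser.add_argument("--stage", type=int, default=0)
parser.add_argument("--file", type=str, default=None)
args = parser.parse_args(["--config", "config.yaml", "--stage", "2"])
# args.stage == 2 (CLI overrides YAML), args.file == "input.mp4" (YAML default)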
utils/subtitle_utils.py ADDED
@@ -0,0 +1,189 @@
+ #!/usr/bin/env python3
+ # -*- encoding: utf-8 -*-
+ # Copyright FunASR (https://github.com/alibaba-damo-academy/FunClip). All Rights Reserved.
+ # MIT License (https://opensource.org/licenses/MIT)
+ import re
+
+ def time_convert(ms):
+     ms = int(ms)
+     tail = ms % 1000
+     s = ms // 1000
+     mi = s // 60
+     s = s % 60
+     h = mi // 60
+     mi = mi % 60
+     h = "00" if h == 0 else str(h)
+     mi = "00" if mi == 0 else str(mi)
+     s = "00" if s == 0 else str(s)
+     tail = str(tail)
+     if len(h) == 1: h = '0' + h
+     if len(mi) == 1: mi = '0' + mi
+     if len(s) == 1: s = '0' + s
+     return "{}:{}:{},{}".format(h, mi, s, tail)
+
+ def str2list(text):
+     pattern = re.compile(r'[\u4e00-\u9fff]|[\w-]+', re.UNICODE)
+     elements = pattern.findall(text)
+     return elements
+
+ class Text2SRT():
+     def __init__(self, text, timestamp, offset=0):
+         self.token_list = text
+         self.timestamp = timestamp
+         start, end = timestamp[0][0] - offset, timestamp[-1][1] - offset
+         self.start_sec, self.end_sec = start, end
+         self.start_time = time_convert(start)
+         self.end_time = time_convert(end)
+     def text(self):
+         if isinstance(self.token_list, str):
+             return self.token_list
+         else:
+             res = ""
+             for word in self.token_list:
+                 if '\u4e00' <= word <= '\u9fff':
+                     res += word
+                 else:
+                     res += " " + word
+             return res.lstrip()
+     def srt(self, acc_ost=0.0):
+         return "{} --> {}\n{}\n".format(
+             time_convert(self.start_sec+acc_ost*1000),
+             time_convert(self.end_sec+acc_ost*1000),
+             self.text())
+     def time(self, acc_ost=0.0):
+         return (self.start_sec/1000+acc_ost, self.end_sec/1000+acc_ost)
+
+ class Text2SRT_audio():
+     def __init__(self, text, start, end, offset=0):
+         self.token_list = text
+         start, end = start*1000 - offset, end*1000 - offset
+         self.start_sec, self.end_sec = start, end
+         self.start_time = time_convert(start)
+         self.end_time = time_convert(end)
+     def text(self):
+         if isinstance(self.token_list, str):
+             return self.token_list
+         else:
+             res = ""
+             for word in self.token_list:
+                 if '\u4e00' <= word <= '\u9fff':
+                     res += word
+                 else:
+                     res += " " + word
+             return res.lstrip()
+     def srt(self, acc_ost=0.0):
+         return "{} --> {}\n{}\n".format(
+             time_convert(self.start_sec+acc_ost*1000),
+             time_convert(self.end_sec+acc_ost*1000),
+             self.text())
+     def time(self, acc_ost=0.0):
+         return (self.start_sec/1000+acc_ost, self.end_sec/1000+acc_ost)
+
+ def generate_srt(sentence_list):
+     srt_total = ''
+     for i, sent in enumerate(sentence_list):
+         t2s = Text2SRT(sent['text'], sent['timestamp'])
+         if 'spk' in sent:
+             srt_total += "{} spk{}\n{}".format(i, sent['spk'], t2s.srt())
+         else:
+             srt_total += "{}\n{}".format(i, t2s.srt())
+     return srt_total
+
+ def trans_format(text):
+     # Convert whisper's recognition result into the data format used downstream.
+     total_list = []
+     timestamp_list = []
+     sentence_info = []
+     for segment in text["segments"]:
+         timestamp_list.append([int(segment["start"]*1000), int(segment["end"]*1000)])
+         if segment["words"] != []:
+             sentence_info.append({
+                 "text": segment["text"],
+                 "start": int(segment["start"]*1000),
+                 "end": int(segment["end"]*1000),
+                 "timestamp": [[int(item['start']*1000), int(item['end']*1000)] for item in segment["words"]],
+                 "raw_text": segment["text"]
+             })
+     raw_text = text["text"]
+     total_list.append({"text": text["text"], "raw_text": raw_text, "timestamp": timestamp_list, "sentence_info": sentence_info})
+     return total_list
+
+
+ def generate_audio_srt(sentence_list):
+     '''Generate SRT-format subtitles from audio transcription sentences.'''
+     srt_total = ''
+     for i, sent in enumerate(sentence_list):
+         t2s = Text2SRT_audio(sent['text'], sent['start'], sent['end'])
+         if 'spk' in sent:
+             srt_total += "{} spk{}\n{}".format(i, sent['spk'], t2s.srt())
+         else:
+             srt_total += "{}\n{}".format(i, t2s.srt())
+     return srt_total
+
+ def generate_srt_clip(sentence_list, start, end, begin_index=0, time_acc_ost=0.0):
+     '''
+     Generate the subtitle snippet for a clip.
+     return:
+         srt_total: the generated SRT-format subtitle text.
+         subs: the subtitle time ranges and texts, as [(time, text), ...].
+         cc: the final subtitle index.
+     '''
+     start, end = int(start * 1000), int(end * 1000)
+     srt_total = ''
+     cc = 1 + begin_index
+     subs = []
+     for _, sent in enumerate(sentence_list):
+         if isinstance(sent['text'], str):
+             sent['text'] = str2list(sent['text'])
+         if sent['timestamp'][-1][1] <= start:
+             # CASE0: sentence ends before the clip window
+             continue
+         if sent['timestamp'][0][0] >= end:
+             # CASE4: sentence starts after the clip window
+             break
+         # parts in between
+         if (sent['timestamp'][-1][1] <= end and sent['timestamp'][0][0] > start) or (sent['timestamp'][-1][1] == end and sent['timestamp'][0][0] == start):
+             # CASE1: sentence fully inside the clip window
+             t2s = Text2SRT(sent['text'], sent['timestamp'], offset=start)
+             srt_total += "{}\n{}".format(cc, t2s.srt(time_acc_ost))
+             subs.append((t2s.time(time_acc_ost), t2s.text()))
+             cc += 1
+             continue
+         if sent['timestamp'][0][0] <= start:
+             # CASE2: sentence overlaps the start of the clip window
+             if not sent['timestamp'][-1][1] > end:
+                 for j, ts in enumerate(sent['timestamp']):
+                     if ts[1] > start:
+                         break
+                 _text = sent['text'][j:]
+                 _ts = sent['timestamp'][j:]
+             else:
+                 for j, ts in enumerate(sent['timestamp']):
+                     if ts[1] > start:
+                         _start = j
+                         break
+                 for j, ts in enumerate(sent['timestamp']):
+                     if ts[1] > end:
+                         _end = j
+                         break
+                 _text = sent['text'][_start:_end]
+                 _ts = sent['timestamp'][_start:_end]
+             if len(_ts):
+                 t2s = Text2SRT(_text, _ts, offset=start)
+                 srt_total += "{}\n{}".format(cc, t2s.srt(time_acc_ost))
+                 subs.append((t2s.time(time_acc_ost), t2s.text()))
+                 cc += 1
+             continue
+         if sent['timestamp'][-1][1] > end:
+             # CASE3: sentence overlaps the end of the clip window
+             for j, ts in enumerate(sent['timestamp']):
+                 if ts[1] > end:
+                     break
+             _text = sent['text'][:j]
+             _ts = sent['timestamp'][:j]
+             if len(_ts):
+                 t2s = Text2SRT(_text, _ts, offset=start)
+                 srt_total += "{}\n{}".format(cc, t2s.srt(time_acc_ost))
+                 subs.append(
+                     (t2s.time(time_acc_ost), t2s.text())
+                 )
+                 cc += 1
+             continue
+     return srt_total, subs, cc
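
A small illustration of the shapes generate_srt expects (millisecond timestamps; values made up):

from utils.subtitle_utils import generate_srt

sentences = [
    {"text": "大家好", "timestamp": [[0, 400], [400, 800], [800, 1300]]},
    {"text": "hello world", "timestamp": [[1500, 2100]], "spk": 0},
]
print(generate_srt(sentences))
# 0
# 00:00:00,0 --> 00:00:01,300
# 大家好
# 1 spk0
# 00:00:01,500 --> 00:00:02,100
# hello world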
utils/theme.json ADDED
@@ -0,0 +1,333 @@
+ {
+   "theme": {
+     "_font": [
+       {
+         "__gradio_font__": true,
+         "name": "Montserrat",
+         "class": "google"
+       },
+       {
+         "__gradio_font__": true,
+         "name": "ui-sans-serif",
+         "class": "font"
+       },
+       {
+         "__gradio_font__": true,
+         "name": "system-ui",
+         "class": "font"
+       },
+       {
+         "__gradio_font__": true,
+         "name": "sans-serif",
+         "class": "font"
+       }
+     ],
+     "_font_mono": [
+       {
+         "__gradio_font__": true,
+         "name": "IBM Plex Mono",
+         "class": "google"
+       },
+       {
+         "__gradio_font__": true,
+         "name": "ui-monospace",
+         "class": "font"
+       },
+       {
+         "__gradio_font__": true,
+         "name": "Consolas",
+         "class": "font"
+       },
+       {
+         "__gradio_font__": true,
+         "name": "monospace",
+         "class": "font"
+       }
+     ],
+     "background_fill_primary": "*neutral_50",
+     "background_fill_primary_dark": "*neutral_950",
+     "background_fill_secondary": "*neutral_50",
+     "background_fill_secondary_dark": "*neutral_900",
+     "block_background_fill": "white",
+     "block_background_fill_dark": "*neutral_800",
+     "block_border_color": "*border_color_primary",
+     "block_border_color_dark": "*border_color_primary",
+     "block_border_width": "0px",
+     "block_border_width_dark": "0px",
+     "block_info_text_color": "*body_text_color_subdued",
+     "block_info_text_color_dark": "*body_text_color_subdued",
+     "block_info_text_size": "*text_sm",
+     "block_info_text_weight": "400",
+     "block_label_background_fill": "*primary_100",
+     "block_label_background_fill_dark": "*primary_600",
+     "block_label_border_color": "*border_color_primary",
+     "block_label_border_color_dark": "*border_color_primary",
+     "block_label_border_width": "1px",
+     "block_label_border_width_dark": "1px",
+     "block_label_margin": "*spacing_md",
+     "block_label_padding": "*spacing_sm *spacing_md",
+     "block_label_radius": "*radius_md",
+     "block_label_right_radius": "0 calc(*radius_lg - 1px) 0 calc(*radius_lg - 1px)",
+     "block_label_text_color": "*primary_500",
+     "block_label_text_color_dark": "*white",
+     "block_label_text_size": "*text_md",
+     "block_label_text_weight": "600",
+     "block_padding": "*spacing_xl calc(*spacing_xl + 2px)",
+     "block_radius": "*radius_lg",
+     "block_shadow": "none",
+     "block_shadow_dark": "none",
+     "block_title_background_fill": "*block_label_background_fill",
+     "block_title_background_fill_dark": "*block_label_background_fill",
+     "block_title_border_color": "none",
+     "block_title_border_color_dark": "none",
+     "block_title_border_width": "0px",
+     "block_title_border_width_dark": "0px",
+     "block_title_padding": "*block_label_padding",
+     "block_title_radius": "*block_label_radius",
+     "block_title_text_color": "*primary_500",
+     "block_title_text_color_dark": "*white",
+     "block_title_text_size": "*text_md",
+     "block_title_text_weight": "600",
+     "body_background_fill": "*background_fill_primary",
+     "body_background_fill_dark": "*background_fill_primary",
+     "body_text_color": "*neutral_800",
+     "body_text_color_dark": "*neutral_100",
+     "body_text_color_subdued": "*neutral_400",
+     "body_text_color_subdued_dark": "*neutral_400",
+     "body_text_size": "*text_md",
+     "body_text_weight": "400",
+     "border_color_accent": "*primary_300",
+     "border_color_accent_dark": "*neutral_600",
+     "border_color_primary": "*neutral_200",
+     "border_color_primary_dark": "*neutral_700",
+     "button_border_width": "*input_border_width",
+     "button_border_width_dark": "*input_border_width",
+     "button_cancel_background_fill": "*button_secondary_background_fill",
+     "button_cancel_background_fill_dark": "*button_secondary_background_fill",
+     "button_cancel_background_fill_hover": "*button_secondary_background_fill_hover",
+     "button_cancel_background_fill_hover_dark": "*button_secondary_background_fill_hover",
+     "button_cancel_border_color": "*button_secondary_border_color",
+     "button_cancel_border_color_dark": "*button_secondary_border_color",
+     "button_cancel_border_color_hover": "*button_cancel_border_color",
+     "button_cancel_border_color_hover_dark": "*button_cancel_border_color",
+     "button_cancel_text_color": "*button_secondary_text_color",
+     "button_cancel_text_color_dark": "*button_secondary_text_color",
+     "button_cancel_text_color_hover": "*button_cancel_text_color",
+     "button_cancel_text_color_hover_dark": "*button_cancel_text_color",
+     "button_large_padding": "*spacing_lg calc(2 * *spacing_lg)",
+     "button_large_radius": "*radius_lg",
+     "button_large_text_size": "*text_lg",
+     "button_large_text_weight": "600",
+     "button_primary_background_fill": "*primary_500",
+     "button_primary_background_fill_dark": "*primary_700",
+     "button_primary_background_fill_hover": "*primary_400",
+     "button_primary_background_fill_hover_dark": "*primary_500",
+     "button_primary_border_color": "*primary_200",
+     "button_primary_border_color_dark": "*primary_600",
+     "button_primary_border_color_hover": "*button_primary_border_color",
+     "button_primary_border_color_hover_dark": "*button_primary_border_color",
+     "button_primary_text_color": "white",
+     "button_primary_text_color_dark": "white",
+     "button_primary_text_color_hover": "*button_primary_text_color",
+     "button_primary_text_color_hover_dark": "*button_primary_text_color",
+     "button_secondary_background_fill": "white",
+     "button_secondary_background_fill_dark": "*neutral_600",
+     "button_secondary_background_fill_hover": "*neutral_100",
+     "button_secondary_background_fill_hover_dark": "*primary_500",
+     "button_secondary_border_color": "*neutral_200",
+     "button_secondary_border_color_dark": "*neutral_600",
+     "button_secondary_border_color_hover": "*button_secondary_border_color",
+     "button_secondary_border_color_hover_dark": "*button_secondary_border_color",
+     "button_secondary_text_color": "*neutral_800",
+     "button_secondary_text_color_dark": "white",
+     "button_secondary_text_color_hover": "*button_secondary_text_color",
+     "button_secondary_text_color_hover_dark": "*button_secondary_text_color",
+     "button_shadow": "*shadow_drop_lg",
+     "button_shadow_active": "*shadow_inset",
+     "button_shadow_hover": "*shadow_drop_lg",
+     "button_small_padding": "*spacing_sm calc(2 * *spacing_sm)",
+     "button_small_radius": "*radius_lg",
+     "button_small_text_size": "*text_md",
+     "button_small_text_weight": "400",
+     "button_transition": "background-color 0.2s ease",
+     "checkbox_background_color": "*background_fill_primary",
+     "checkbox_background_color_dark": "*neutral_800",
+     "checkbox_background_color_focus": "*checkbox_background_color",
+     "checkbox_background_color_focus_dark": "*checkbox_background_color",
+     "checkbox_background_color_hover": "*checkbox_background_color",
+     "checkbox_background_color_hover_dark": "*checkbox_background_color",
+     "checkbox_background_color_selected": "*primary_600",
+     "checkbox_background_color_selected_dark": "*primary_700",
+     "checkbox_border_color": "*neutral_100",
+     "checkbox_border_color_dark": "*neutral_600",
+     "checkbox_border_color_focus": "*primary_500",
+     "checkbox_border_color_focus_dark": "*primary_600",
+     "checkbox_border_color_hover": "*neutral_300",
+     "checkbox_border_color_hover_dark": "*neutral_600",
+     "checkbox_border_color_selected": "*primary_600",
+     "checkbox_border_color_selected_dark": "*primary_700",
+     "checkbox_border_radius": "*radius_sm",
+     "checkbox_border_width": "1px",
+     "checkbox_border_width_dark": "*input_border_width",
+     "checkbox_check": "url(\"data:image/svg+xml,%3csvg viewBox='0 0 16 16' fill='white' xmlns='http://www.w3.org/2000/svg'%3e%3cpath d='M12.207 4.793a1 1 0 010 1.414l-5 5a1 1 0 01-1.414 0l-2-2a1 1 0 011.414-1.414L6.5 9.086l4.293-4.293a1 1 0 011.414 0z'/%3e%3c/svg%3e\")",
+     "checkbox_label_background_fill": "*button_secondary_background_fill",
+     "checkbox_label_background_fill_dark": "*button_secondary_background_fill",
+     "checkbox_label_background_fill_hover": "*button_secondary_background_fill_hover",
+     "checkbox_label_background_fill_hover_dark": "*button_secondary_background_fill_hover",
+     "checkbox_label_background_fill_selected": "*primary_500",
+     "checkbox_label_background_fill_selected_dark": "*primary_600",
+     "checkbox_label_border_color": "*border_color_primary",
+     "checkbox_label_border_color_dark": "*border_color_primary",
+     "checkbox_label_border_color_hover": "*checkbox_label_border_color",
+     "checkbox_label_border_color_hover_dark": "*checkbox_label_border_color",
+     "checkbox_label_border_width": "*input_border_width",
+     "checkbox_label_border_width_dark": "*input_border_width",
+     "checkbox_label_gap": "*spacing_lg",
+     "checkbox_label_padding": "*spacing_md calc(2 * *spacing_md)",
+     "checkbox_label_shadow": "*shadow_drop_lg",
+     "checkbox_label_text_color": "*body_text_color",
+     "checkbox_label_text_color_dark": "*body_text_color",
+     "checkbox_label_text_color_selected": "white",
+     "checkbox_label_text_color_selected_dark": "*checkbox_label_text_color",
+     "checkbox_label_text_size": "*text_md",
+     "checkbox_label_text_weight": "400",
+     "checkbox_shadow": "none",
+     "color_accent": "*primary_500",
+     "color_accent_soft": "*primary_50",
+     "color_accent_soft_dark": "*neutral_700",
+     "container_radius": "*radius_lg",
+     "embed_radius": "*radius_lg",
+     "error_background_fill": "#fee2e2",
+     "error_background_fill_dark": "*background_fill_primary",
+     "error_border_color": "#fecaca",
+     "error_border_color_dark": "*border_color_primary",
+     "error_border_width": "1px",
+     "error_border_width_dark": "1px",
+     "error_text_color": "#ef4444",
+     "error_text_color_dark": "#ef4444",
+     "font": "'Montserrat', 'ui-sans-serif', 'system-ui', sans-serif",
+     "font_mono": "'IBM Plex Mono', 'ui-monospace', 'Consolas', monospace",
+     "form_gap_width": "0px",
+     "input_background_fill": "white",
+     "input_background_fill_dark": "*neutral_700",
+     "input_background_fill_focus": "*secondary_500",
+     "input_background_fill_focus_dark": "*secondary_600",
+     "input_background_fill_hover": "*input_background_fill",
+     "input_background_fill_hover_dark": "*input_background_fill",
+     "input_border_color": "*neutral_50",
+     "input_border_color_dark": "*border_color_primary",
+     "input_border_color_focus": "*secondary_300",
+     "input_border_color_focus_dark": "*neutral_700",
+     "input_border_color_hover": "*input_border_color",
+     "input_border_color_hover_dark": "*input_border_color",
+     "input_border_width": "0px",
+     "input_border_width_dark": "0px",
+     "input_padding": "*spacing_xl",
+     "input_placeholder_color": "*neutral_400",
+     "input_placeholder_color_dark": "*neutral_500",
+     "input_radius": "*radius_lg",
+     "input_shadow": "*shadow_drop",
+     "input_shadow_dark": "*shadow_drop",
+     "input_shadow_focus": "*shadow_drop_lg",
+     "input_shadow_focus_dark": "*shadow_drop_lg",
+     "input_text_size": "*text_md",
+     "input_text_weight": "400",
+     "layout_gap": "*spacing_xxl",
+     "link_text_color": "*secondary_600",
+     "link_text_color_active": "*secondary_600",
+     "link_text_color_active_dark": "*secondary_500",
+     "link_text_color_dark": "*secondary_500",
+     "link_text_color_hover": "*secondary_700",
+     "link_text_color_hover_dark": "*secondary_400",
+     "link_text_color_visited": "*secondary_500",
+     "link_text_color_visited_dark": "*secondary_600",
+     "loader_color": "*color_accent",
+     "loader_color_dark": "*color_accent",
+     "name": "base",
+     "neutral_100": "#f3f4f6",
+     "neutral_200": "#e5e7eb",
+     "neutral_300": "#d1d5db",
+     "neutral_400": "#9ca3af",
+     "neutral_50": "#f9fafb",
+     "neutral_500": "#6b7280",
+     "neutral_600": "#4b5563",
+     "neutral_700": "#374151",
+     "neutral_800": "#1f2937",
+     "neutral_900": "#111827",
+     "neutral_950": "#0b0f19",
+     "panel_background_fill": "*background_fill_secondary",
+     "panel_background_fill_dark": "*background_fill_secondary",
+     "panel_border_color": "*border_color_primary",
+     "panel_border_color_dark": "*border_color_primary",
+     "panel_border_width": "1px",
+     "panel_border_width_dark": "1px",
+     "primary_100": "#e0e7ff",
+     "primary_200": "#c7d2fe",
+     "primary_300": "#a5b4fc",
+     "primary_400": "#818cf8",
+     "primary_50": "#eef2ff",
+     "primary_500": "#6366f1",
+     "primary_600": "#4f46e5",
+     "primary_700": "#4338ca",
+     "primary_800": "#3730a3",
+     "primary_900": "#312e81",
+     "primary_950": "#2b2c5e",
+     "prose_header_text_weight": "600",
+     "prose_text_size": "*text_md",
+     "prose_text_weight": "400",
+     "radio_circle": "url(\"data:image/svg+xml,%3csvg viewBox='0 0 16 16' fill='white' xmlns='http://www.w3.org/2000/svg'%3e%3ccircle cx='8' cy='8' r='3'/%3e%3c/svg%3e\")",
+     "radius_lg": "6px",
+     "radius_md": "4px",
+     "radius_sm": "2px",
+     "radius_xl": "8px",
+     "radius_xs": "1px",
+     "radius_xxl": "12px",
+     "radius_xxs": "1px",
+     "secondary_100": "#ecfccb",
+     "secondary_200": "#d9f99d",
+     "secondary_300": "#bef264",
+     "secondary_400": "#a3e635",
+     "secondary_50": "#f7fee7",
+     "secondary_500": "#84cc16",
+     "secondary_600": "#65a30d",
+     "secondary_700": "#4d7c0f",
+     "secondary_800": "#3f6212",
+     "secondary_900": "#365314",
+     "secondary_950": "#2f4e14",
+     "section_header_text_size": "*text_md",
+     "section_header_text_weight": "400",
+     "shadow_drop": "0 1px 4px 0 rgb(0 0 0 / 0.1)",
+     "shadow_drop_lg": "0 2px 5px 0 rgb(0 0 0 / 0.1)",
+     "shadow_inset": "rgba(0,0,0,0.05) 0px 2px 4px 0px inset",
+     "shadow_spread": "6px",
+     "shadow_spread_dark": "1px",
+     "slider_color": "*primary_500",
+     "slider_color_dark": "*primary_600",
+     "spacing_lg": "6px",
+     "spacing_md": "4px",
+     "spacing_sm": "2px",
+     "spacing_xl": "9px",
+     "spacing_xs": "1px",
+     "spacing_xxl": "12px",
+     "spacing_xxs": "1px",
+     "stat_background_fill": "*primary_300",
+     "stat_background_fill_dark": "*primary_500",
+     "table_border_color": "*neutral_300",
+     "table_border_color_dark": "*neutral_700",
+     "table_even_background_fill": "white",
+     "table_even_background_fill_dark": "*neutral_950",
+     "table_odd_background_fill": "*neutral_50",
+     "table_odd_background_fill_dark": "*neutral_900",
+     "table_radius": "*radius_lg",
+     "table_row_focus": "*color_accent_soft",
+     "table_row_focus_dark": "*color_accent_soft",
+     "text_lg": "16px",
+     "text_md": "14px",
+     "text_sm": "12px",
+     "text_xl": "22px",
+     "text_xs": "10px",
+     "text_xxl": "26px",
+     "text_xxs": "9px"
+   },
+   "version": "0.0.1"
+ }
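
app.py does not reference this file yet; if it were wired in, loading would look roughly like the sketch below (assuming gradio's ThemeClass.load, which reads a dumped theme JSON; verify against the installed gradio version):

import gradio as gr

theme = gr.themes.ThemeClass.load("utils/theme.json")
with gr.Blocks(theme=theme) as demo:
    ...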
utils/trans_utils.py ADDED
@@ -0,0 +1,132 @@
+ #!/usr/bin/env python3
+ # -*- encoding: utf-8 -*-
+ # Copyright FunASR (https://github.com/alibaba-damo-academy/FunClip). All Rights Reserved.
+ # MIT License (https://opensource.org/licenses/MIT)
+
+ import os
+ import re
+ import numpy as np
+
+ PUNC_LIST = [',', '。', '!', '?', '、', ',', '.', '?', '!']
+
+ def pre_proc(text):
+     res = ''
+     for i in range(len(text)):
+         if text[i] in PUNC_LIST:
+             continue
+         if '\u4e00' <= text[i] <= '\u9fff':
+             if len(res) and res[-1] != " ":
+                 res += ' ' + text[i] + ' '
+             else:
+                 res += text[i] + ' '
+         else:
+             res += text[i]
+     if len(res) and res[-1] == ' ':
+         res = res[:-1]
+     return res
+
+ def proc(raw_text, timestamp, dest_text, lang='zh'):
+     # simple matching
+     ld = len(dest_text.split())
+     mi, ts = [], []
+     offset = 0
+     while True:
+         fi = raw_text.find(dest_text, offset, len(raw_text))
+         if fi == -1:
+             break
+         ti = raw_text[:fi].count(' ')
+         offset = fi + ld
+         mi.append(fi)
+         ts.append([timestamp[ti][0]*16, timestamp[ti+ld-1][1]*16])
+     return ts
+
+
+ def proc_spk(dest_spk, sd_sentences):
+     ts = []
+     for d in sd_sentences:
+         d_start = d['timestamp'][0][0]
+         d_end = d['timestamp'][-1][1]
+         spkid = dest_spk[3:]
+         if str(d['spk']) == spkid and d_end - d_start > 999:
+             ts.append([d_start*16, d_end*16])
+     return ts
+
+ def generate_vad_data(data, sd_sentences, sr=16000):
+     assert len(data.shape) == 1
+     vad_data = []
+     for d in sd_sentences:
+         d_start = round(d['ts_list'][0][0]/1000, 2)
+         d_end = round(d['ts_list'][-1][1]/1000, 2)
+         vad_data.append([d_start, d_end, data[int(d_start * sr):int(d_end * sr)]])
+     return vad_data
+
+ def write_state(output_dir, state):
+     for key in ['/recog_res_raw', '/timestamp', '/sentences']:
+         with open(output_dir+key, 'w') as fout:
+             fout.write(str(state[key[1:]]))
+     if 'sd_sentences' in state:
+         with open(output_dir+'/sd_sentences', 'w') as fout:
+             fout.write(str(state['sd_sentences']))
+
+ def load_state(output_dir):
+     # State files are written with str(); eval() restores the Python literals.
+     state = {}
+     with open(output_dir+'/recog_res_raw') as fin:
+         line = fin.read()
+         state['recog_res_raw'] = line
+     with open(output_dir+'/timestamp') as fin:
+         line = fin.read()
+         state['timestamp'] = eval(line)
+     with open(output_dir+'/sentences') as fin:
+         line = fin.read()
+         state['sentences'] = eval(line)
+     if os.path.exists(output_dir+'/sd_sentences'):
+         with open(output_dir+'/sd_sentences') as fin:
+             line = fin.read()
+             state['sd_sentences'] = eval(line)
+     return state
+
+ def convert_pcm_to_float(data):
+     if data.dtype == np.float64:
+         return data
+     elif data.dtype == np.float32:
+         return data.astype(np.float64)
+     elif data.dtype == np.int16:
+         bit_depth = 16
+     elif data.dtype == np.int32:
+         bit_depth = 32
+     elif data.dtype == np.int8:
+         bit_depth = 8
+     else:
+         raise ValueError("Unsupported audio data type")
+
+     # Now handle the integer types
+     max_int_value = float(2 ** (bit_depth - 1))
+     if bit_depth == 8:
+         data = data - 128
+     return (data.astype(np.float64) / max_int_value)
+
+ def convert_time_to_millis(time_str):
+     # Format: hours:minutes:seconds,milliseconds
+     hours, minutes, seconds, milliseconds = map(int, re.split('[:,]', time_str))
+     return (hours * 3600 + minutes * 60 + seconds) * 1000 + milliseconds
+
+ def extract_timestamps(input_text):
+     # Find all [start - end] timestamp pairs with a regex.
+     timestamps = re.findall(r'\[(\d{2}:\d{2}:\d{2},\d{2,3})\s*-\s*(\d{2}:\d{2}:\d{2},\d{2,3})\]', input_text)
+     times_list = []
+     # Convert every matched pair to milliseconds.
+     for start_time, end_time in timestamps:
+         start_millis = convert_time_to_millis(start_time)
+         end_millis = convert_time_to_millis(end_time)
+         times_list.append([start_millis, end_millis])
+
+     return times_list
+
+
+ if __name__ == '__main__':
+     text = ("1. [00:00:00,500-00:00:05,850] 在我们的设计普惠当中,有一个我经常津津乐道的项目叫寻找远方的美好。"
+             "2. [00:00:07,120-00:00:12,940] 啊,在这样一个我们叫寻美在这样的一个项目当中,我们把它跟乡村振兴去结合起来,利用我们的设计的能力。"
+             "3. [00:00:13,240-00:00:25,620] 问我们自身员工的设设计能力,我们设计生态伙伴的能力,帮助乡村振兴当中,要希望把他的产品推向市场,把他的农产品把他加工产品推向市场的这样的伙伴做一件事情,")
+
+     print(extract_timestamps(text))
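
For reference, convert_pcm_to_float maps integer PCM onto the float range [-1, 1):

import numpy as np
from utils.trans_utils import convert_pcm_to_float

pcm = np.array([-32768, 0, 32767], dtype=np.int16)
print(convert_pcm_to_float(pcm))  # [-1.0, 0.0, 0.99996948...] as float64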
videoclipper.py ADDED
@@ -0,0 +1,348 @@
+ #!/usr/bin/env python3
+ # -*- encoding: utf-8 -*-
+ # Copyright FunASR (https://github.com/alibaba-damo-academy/FunClip). All Rights Reserved.
+ # MIT License (https://opensource.org/licenses/MIT)
+
+ import re
+ import os
+ import copy
+ import librosa
+ import logging
+ import argparse
+ import numpy as np
+ import moviepy.editor as mpy
+ from moviepy.editor import VideoFileClip, concatenate_videoclips, CompositeVideoClip
+ from moviepy.video.tools.subtitles import SubtitlesClip, TextClip
+ from utils.subtitle_utils import generate_srt, generate_srt_clip, generate_audio_srt, trans_format
+ from utils.argparse_tools import ArgumentParser, get_commandline_args
+ from utils.trans_utils import pre_proc, proc, write_state, load_state, proc_spk, convert_pcm_to_float
+
+ class VideoClipper():
+     def __init__(self, model):
+         logging.warning("Initializing VideoClipper.")
+         self.GLOBAL_COUNT = 0
+         self.model = model
+
+     def recog(self, audio_input, state=None, output_dir=None, text=None):
+         '''
+         Convert the audio input to text, optionally with speaker diarization (SD)
+         and SRT subtitle generation.
+         return:
+             res_text: the recognized text.
+             res_srt: the recognition result in SRT subtitle format.
+             state: a state dict holding the raw result, timestamps and sentence info.
+         '''
+         if state is None:
+             state = {}
+         sr, data = audio_input
+
+         # Convert to float64 consistently (includes data type checking)
+         data = convert_pcm_to_float(data)
+
+         if sr != 16000:  # resample with librosa
+             data = librosa.resample(data, orig_sr=sr, target_sr=16000)
+         if len(data.shape) == 2:  # multi-channel wav input
+             logging.warning("Input wav shape: {}, only first channel reserved.".format(data.shape))
+             data = data[:, 0]
+         state['audio_input'] = (sr, data)
+         rec_result = trans_format(text)
+         res_srt = generate_srt(rec_result[0]['sentence_info'])
+         state['recog_res_raw'] = rec_result[0]['raw_text']
+         state['timestamp'] = rec_result[0]['timestamp']
+         state['sentences'] = rec_result[0]['sentence_info']
+         res_text = rec_result[0]['text']
+         return res_text, res_srt, state
+
+     def clip(self, dest_text, start_ost, end_ost, state, dest_spk=None, output_dir=None, timestamp_list=None, add_sub=False):
+         '''
+         dest_text: target text; the matching audio periods are located from it.
+         start_ost / end_ost: start and end offsets in ms to fine-tune each period.
+         state: the data the function needs (audio data, recognition result, timestamps, ...).
+         dest_spk: target speaker; if given, periods are extracted by speaker instead.
+         output_dir: output directory for results.
+         timestamp_list: if given, periods are extracted directly at these timestamps.
+         add_sub: accepted (and ignored) so AI_clip in app.py can call clip() and video_clip() uniformly.
+         '''
+         audio_input = state['audio_input']
+         recog_res_raw = state['recog_res_raw']
+         timestamp = state['timestamp']
+         sentences = state['sentences']
+         sr, data = audio_input
+         data = data.astype(np.float64)
+
+         log_append = ""
+         if timestamp_list is None:
+             all_ts = []
+             match = None
+             if dest_spk is None or dest_spk == '' or 'sd_sentences' not in state:
+                 for _dest_text in dest_text.split('#'):
+                     if '[' in _dest_text:
+                         match = re.search(r'\[(\d+),\s*(\d+)\]', _dest_text)
+                         if match:
+                             offset_b, offset_e = map(int, match.groups())
+                             log_append = ""
+                         else:
+                             offset_b, offset_e = 0, 0
+                             log_append = "(Bracket detected in dest_text but offset time matching failed)"
+                         _dest_text = _dest_text[:_dest_text.find('[')]
+                     else:
+                         log_append = ""
+                         offset_b, offset_e = 0, 0
+                     _dest_text = pre_proc(_dest_text)
+                     ts = proc(recog_res_raw, timestamp, _dest_text)  # get the matching timestamps
+                     for _ts in ts: all_ts.append([_ts[0]+offset_b*16, _ts[1]+offset_e*16])
+                     if len(ts) > 1 and match:
+                         log_append += '(offsets detected but sub-sentence matched multiple periods in audio, offsets are applied to all periods)'
+             else:
+                 for _dest_spk in dest_spk.split('#'):
+                     ts = proc_spk(_dest_spk, state['sd_sentences'])
+                     for _ts in ts: all_ts.append(_ts)
+         else:
+             all_ts = timestamp_list
+         ts = all_ts
+         srt_index = 0
+         clip_srt = ""
+         if len(ts):
+             start, end = ts[0]
+             start = min(max(0, start+start_ost*16), len(data))
+             end = min(max(0, end+end_ost*16), len(data))
+             res_audio = data[start:end]
+             start_end_info = "from {} to {}".format(start/16000, end/16000)
+             srt_clip, _, srt_index = generate_srt_clip(sentences, start/16000.0, end/16000.0, begin_index=srt_index)
+             clip_srt += srt_clip
+             for _ts in ts[1:]:  # multiple sentence input or multiple output matched
+                 start, end = _ts
+                 start = min(max(0, start+start_ost*16), len(data))
+                 end = min(max(0, end+end_ost*16), len(data))
+                 start_end_info += ", from {} to {}".format(start/16000, end/16000)
+                 res_audio = np.concatenate([res_audio, data[start:end]], -1)
+                 srt_clip, _, srt_index = generate_srt_clip(sentences, start/16000.0, end/16000.0, begin_index=srt_index-1)
+                 clip_srt += srt_clip
+         if len(ts):
+             message = "{} periods found in the speech: ".format(len(ts)) + start_end_info + log_append
+         else:
+             message = "No period found in the speech, return raw speech. You may check the recognition result and try other destination text."
+             res_audio = data
+         return (sr, res_audio), message, clip_srt  # audio data, log message and the clipped SRT subtitles
+
+     def video_recog(self, video_filename, output_dir=None, ASR="whisper"):
+         '''Process the video to obtain its audio track, transcription and related state.'''
+         video = mpy.VideoFileClip(video_filename)
+         # Extract the base name, add '_clip.mp4', and 'wav'
+         if output_dir is not None:
+             os.makedirs(output_dir, exist_ok=True)
+             _, base_name = os.path.split(video_filename)
+             base_name, _ = os.path.splitext(base_name)
+             clip_video_file = base_name + '_clip.mp4'
+             audio_file = base_name + '.wav'
+             audio_file = os.path.join(output_dir, audio_file)
+         else:
+             base_name, _ = os.path.splitext(video_filename)
+             clip_video_file = base_name + '_clip.mp4'
+             audio_file = base_name + '.wav'
+         video.audio.write_audiofile(audio_file)
+         # Transcribe the audio file with whisper
+         result_audio = self.model.transcribe(audio_file, language="zh", word_timestamps=True)
+         wav = librosa.load(audio_file, sr=16000)[0]
+         # delete the audio file after processing
+         if os.path.exists(audio_file):
+             os.remove(audio_file)
+         state = {
+             'video_filename': video_filename,
+             'clip_video_file': clip_video_file,
+             'video': video,
+         }
+         return self.recog((16000, wav), state, output_dir, text=result_audio)
+
+     def video_clip(self,
+                    dest_text,
+                    start_ost,
+                    end_ost,
+                    state,
+                    font_size=32,
+                    font_color='white',
+                    add_sub=False,
+                    dest_spk=None,
+                    output_dir=None,
+                    timestamp_list=None):
+         # get from state
+         recog_res_raw = state['recog_res_raw']
+         timestamp = state['timestamp']
+         sentences = state['sentences']
+         video = state['video']
+         clip_video_file = state['clip_video_file']
+         video_filename = state['video_filename']
+
+         if timestamp_list is None:
+             all_ts = []
+             match = None
+             if dest_spk is None or dest_spk == '' or 'sd_sentences' not in state:
+                 for _dest_text in dest_text.split('#'):
+                     if '[' in _dest_text:
+                         match = re.search(r'\[(\d+),\s*(\d+)\]', _dest_text)
+                         if match:
+                             offset_b, offset_e = map(int, match.groups())
+                             log_append = ""
+                         else:
+                             offset_b, offset_e = 0, 0
+                             log_append = "(Bracket detected in dest_text but offset time matching failed)"
+                         _dest_text = _dest_text[:_dest_text.find('[')]
+                     else:
+                         offset_b, offset_e = 0, 0
+                         log_append = ""
+                     _dest_text = pre_proc(_dest_text)
+                     ts = proc(recog_res_raw, timestamp, _dest_text.lower())
+                     for _ts in ts: all_ts.append([_ts[0]+offset_b*16, _ts[1]+offset_e*16])
+                     if len(ts) > 1 and match:
+                         log_append += '(offsets detected but sub-sentence matched multiple periods in audio, offsets are applied to all periods)'
+             else:
+                 for _dest_spk in dest_spk.split('#'):
+                     ts = proc_spk(_dest_spk, state['sd_sentences'])
+                     for _ts in ts: all_ts.append(_ts)
+         else:  # AI clip passes timestamps as input directly
+             all_ts = [[i[0]*16.0, i[1]*16.0] for i in timestamp_list]
+
+         srt_index = 0
+         time_acc_ost = 0.0
+         ts = all_ts
+         clip_srt = ""
+         if len(ts):
+             start, end = ts[0][0] / 16000, ts[0][1] / 16000
+             srt_clip, subs, srt_index = generate_srt_clip(sentences, start, end, begin_index=srt_index, time_acc_ost=time_acc_ost)
+             start, end = start+start_ost/1000.0, end+end_ost/1000.0
+             video_clip = video.subclip(start, end)
+             start_end_info = "from {} to {}".format(start, end)
+             clip_srt += srt_clip
+             if add_sub:  # overlay subtitles
+                 generator = lambda txt: TextClip(txt, font='./font/STHeitiMedium.ttc', fontsize=font_size, color=font_color)
+                 subtitles = SubtitlesClip(subs, generator)
+                 video_clip = CompositeVideoClip([video_clip, subtitles.set_pos(('center', 'bottom'))])
+             concate_clip = [video_clip]
+             time_acc_ost += end+end_ost/1000.0 - (start+start_ost/1000.0)
+             for _ts in ts[1:]:
+                 start, end = _ts[0] / 16000, _ts[1] / 16000
+                 srt_clip, subs, srt_index = generate_srt_clip(sentences, start, end, begin_index=srt_index-1, time_acc_ost=time_acc_ost)
+                 if not len(subs):
+                     continue
+                 chi_subs = []
+                 sub_starts = subs[0][0][0]
+                 for sub in subs:
+                     chi_subs.append(((sub[0][0]-sub_starts, sub[0][1]-sub_starts), sub[1]))
+                 start, end = start+start_ost/1000.0, end+end_ost/1000.0
+                 _video_clip = video.subclip(start, end)
+                 start_end_info += ", from {} to {}".format(str(start)[:5], str(end)[:5])
+                 clip_srt += srt_clip
+                 if add_sub:
+                     generator = lambda txt: TextClip(txt, font='./font/STHeitiMedium.ttc', fontsize=font_size, color=font_color)
+                     subtitles = SubtitlesClip(chi_subs, generator)
+                     _video_clip = CompositeVideoClip([_video_clip, subtitles.set_pos(('center', 'bottom'))])
+                 concate_clip.append(copy.copy(_video_clip))
+                 time_acc_ost += end+end_ost/1000.0 - (start+start_ost/1000.0)
+             message = "{} periods found in the audio: ".format(len(ts)) + start_end_info
+             logging.warning("Concating...")
+             if len(concate_clip) > 1:  # concatenate the video clips
+                 video_clip = concatenate_videoclips(concate_clip)
+             if output_dir is not None:
+                 os.makedirs(output_dir, exist_ok=True)
+                 _, file_with_extension = os.path.split(clip_video_file)
+                 clip_video_file_name, _ = os.path.splitext(file_with_extension)
+                 clip_video_file = os.path.join(output_dir, "{}_no{}.mp4".format(clip_video_file_name, self.GLOBAL_COUNT))
+                 temp_audio_file = os.path.join(output_dir, "{}_tempaudio_no{}.mp4".format(clip_video_file_name, self.GLOBAL_COUNT))
+             else:
+                 clip_video_file = clip_video_file[:-4] + '_no{}.mp4'.format(self.GLOBAL_COUNT)
+                 temp_audio_file = clip_video_file[:-4] + '_tempaudio_no{}.mp4'.format(self.GLOBAL_COUNT)
+             video_clip.write_videofile(clip_video_file, audio_codec="aac", temp_audiofile=temp_audio_file, fps=25)  # write to the target path
+             self.GLOBAL_COUNT += 1
+         else:
+             clip_video_file = video_filename
+             message = "No period found in the audio, return raw speech. You may check the recognition result and try other destination text."
+             srt_clip = ''
+         return clip_video_file, message, clip_srt
+
+
+ def get_parser():
+     parser = ArgumentParser(
+         description="ClipVideo Argument",
+         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+     )
+     parser.add_argument(
+         "--stage",
+         type=int,
+         choices=(1, 2),
+         help="Stage, 1 for recognizing and 2 for clipping",
+         required=True
+     )
+     parser.add_argument(
+         "--file",
+         type=str,
+         default=None,
+         help="Input file path",
+         required=True
+     )
+     parser.add_argument(
+         "--sd_switch",
+         type=str,
+         choices=("no", "yes"),
+         default="no",
+         help="Turn on the speaker diarization or not",
+     )
+     parser.add_argument(
+         "--output_dir",
+         type=str,
+         default='./output',
+         help="Output files path",
+     )
+     parser.add_argument(
+         "--dest_text",
+         type=str,
+         default=None,
+         help="Destination text string for clipping",
+     )
+     parser.add_argument(
+         "--dest_spk",
+         type=str,
+         default=None,
+         help="Destination spk id for clipping",
+     )
+     parser.add_argument(
+         "--start_ost",
+         type=int,
+         default=0,
+         help="Offset time in ms at beginning for clipping"
+     )
+     parser.add_argument(
+         "--end_ost",
+         type=int,
+         default=0,
+         help="Offset time in ms at ending for clipping"
+     )
+     parser.add_argument(
+         "--output_file",
+         type=str,
+         default=None,
+         help="Output file path"
+     )
+     parser.add_argument(
+         "--lang",
+         type=str,
+         default='zh',
+         help="language"
+     )
+     return parser
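
get_parser is defined here but the file has no __main__ entry; a hedged sketch of how stage 1 (recognition) might be driven from the CLI (flow assumed from the parser's help text, not part of the commit):

# hypothetical driver script
import whisper
from videoclipper import VideoClipper, get_parser

if __name__ == "__main__":
    args = get_parser().parse_args()
    clipper = VideoClipper(whisper.load_model("tiny"))
    if args.stage == 1:
        res_text, res_srt, state = clipper.video_recog(args.file, output_dir=args.output_dir)
        print(res_srt)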