File size: 13,619 Bytes
cdbb2b2
 
 
 
 
 
 
 
 
 
1b1a23a
 
 
 
cdbb2b2
 
03e8274
cdbb2b2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1b1a23a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cdbb2b2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1b1a23a
 
cdbb2b2
 
1b1a23a
cdbb2b2
1b1a23a
 
 
cdbb2b2
 
 
 
1b1a23a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cdbb2b2
 
 
 
 
 
 
 
 
 
 
1b1a23a
 
cdbb2b2
03e8274
cdbb2b2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1b1a23a
 
 
 
cdbb2b2
 
 
 
 
 
 
 
 
 
 
1b1a23a
cdbb2b2
 
 
 
 
 
 
 
1b1a23a
cdbb2b2
1b1a23a
cdbb2b2
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
import logging
import os
from videoclipper import VideoClipper
import gradio as gr
import requests
from huggingface_hub import whoami
import whisper
from llm.openai_api import openai_call
from llm.yi_moe_api import yi_moe
from utils.trans_utils import extract_timestamps
import os
from pytubefix import YouTube
import ffmpeg
from yidong import YiDong

# Endpoint that exchanges a Hugging Face user e-mail for a YiDong display API key.
API_URL_TEMPLATE = "https://api-yidong.lingyiwanwu.com/v1/ops/api_key?user_email={user_email}&user_source=huggingface"
# Load the Whisper ASR model once at import time and share it across all requests.
model = whisper.load_model("large-v3")
audio_clipper = VideoClipper(model)

def get_user_email(oauth_token: gr.OAuthToken | None) -> str | None:
    """Resolve the logged-in Hugging Face user's display API key.

    Looks up the user's e-mail via the Hugging Face ``whoami`` endpoint,
    then exchanges it for a display API key through the YiDong ops endpoint.

    Args:
        oauth_token: Gradio OAuth token from the login button, or ``None``
            when the user is not logged in.

    Returns:
        The display API key string, or ``None`` when no token is available
        or the account exposes no e-mail address.

    Raises:
        requests.HTTPError: if the key-exchange endpoint returns an error status.
    """
    def call_api(user_email: str) -> str:
        # Exchange the e-mail for a display API key; the AUTH env var holds
        # the basic credentials expected by the ops endpoint.
        url = API_URL_TEMPLATE.format(user_email=user_email)
        headers = {"Authorization":  f'Basic {os.getenv("AUTH")}'}
        response = requests.post(url, headers=headers, timeout=30)
        # Fail loudly on HTTP errors instead of a confusing KeyError below.
        response.raise_for_status()
        return response.json()["data"]["display_api_key"]

    if oauth_token is None:
        return None

    user_info = whoami(token=oauth_token.token)
    email = user_info.get("email")
    if not email:
        # Account has no e-mail attached; we cannot mint a key for it.
        return None
    return call_api(email)



def get_video_size(stream):
    """Return the size of a pytubefix stream in megabytes."""
    bytes_per_mb = 1024 * 1024
    return stream.filesize / bytes_per_mb

def download_youtube_video(url):
    """Download a YouTube video into the local ``videos`` folder.

    The highest-resolution video stream is selected automatically: the
    previous interactive ``input()`` prompt is unusable here because this
    function is invoked from a Gradio callback, where there is no console
    to answer it (it would block the server thread forever). Adaptive
    (video-only) streams are merged with the best audio stream via ffmpeg.

    Args:
        url: YouTube watch URL.

    Returns:
        Path of the downloaded (and possibly merged) file, or ``None``
        when the download fails.
    """
    try:
        yt = YouTube(url, use_po_token=True)

        # Best resolution first; audio fetched separately for adaptive streams.
        video_streams = yt.streams.filter(type="video").order_by('resolution').desc()
        audio_stream = yt.streams.filter(only_audio=True).first()

        print("Available video streams:")
        for i, stream in enumerate(video_streams):
            size = get_video_size(stream)
            stream_type = "Progressive" if stream.is_progressive else "Adaptive"
            print(f"{i}. Resolution: {stream.resolution}, Size: {size:.2f} MB, Type: {stream_type}")

        # Auto-select the top (highest-resolution) stream.
        selected_stream = video_streams.first()
        if selected_stream is None:
            print("No downloadable video stream found.")
            return None

        os.makedirs('videos', exist_ok=True)

        print(f"Downloading video: {yt.title}")
        video_file = selected_stream.download(output_path='videos', filename_prefix="video_")

        if not selected_stream.is_progressive:
            # Adaptive streams carry no audio track; download it and mux.
            print("Downloading audio...")
            audio_file = audio_stream.download(output_path='videos', filename_prefix="audio_")

            print("Merging video and audio...")
            # Strip characters that are illegal in file names on common OSes.
            safe_title = "".join("_" if c in '\\/:*?"<>|' else c for c in yt.title)
            output_file = os.path.join('videos', f"{safe_title}.mp4")
            video_in = ffmpeg.input(video_file)
            audio_in = ffmpeg.input(audio_file)
            merged = ffmpeg.output(video_in, audio_in, output_file, vcodec='libx264', acodec='aac', strict='experimental')
            ffmpeg.run(merged, overwrite_output=True)

            # Remove the intermediate single-track files.
            os.remove(video_file)
            os.remove(audio_file)
        else:
            output_file = video_file

        print(f"Downloaded: {yt.title} to 'videos' folder")
        print(f"File path: {output_file}")
        return output_file

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        print("Please make sure you have the latest versions of pytubefix and ffmpeg-python installed.")
        print("You can update them by running:")
        print("pip install --upgrade pytubefix ffmpeg-python")
        print("Also, ensure that ffmpeg is installed on your system and available in your PATH.")
        return None

def updata_video(url):
    """Gradio callback: download the given YouTube URL and return the local file path."""
    # NOTE: the name "updata" is kept as-is; it is wired into the UI below.
    return download_youtube_video(url)

def audio_recog(audio_input, output_dir):
    """Run ASR over a raw audio input using the shared clipper instance."""
    # The second positional argument (speaker hint) is intentionally None.
    recognized = audio_clipper.recog(audio_input, None, output_dir=output_dir)
    return recognized

def video_recog(video_input, output_dir, ASR):
    """Run ASR over a video file using the shared clipper instance."""
    recog_kwargs = {"output_dir": output_dir, "ASR": ASR}
    return audio_clipper.video_recog(video_input, **recog_kwargs)

def video_clip(dest_text, video_spk_input, start_ost, end_ost, state, output_dir):
    """Clip a recognized video by target text/speaker with start/end offsets (ms)."""
    return audio_clipper.video_clip(
        dest_text,
        start_ost,
        end_ost,
        state,
        dest_spk=video_spk_input,
        output_dir=output_dir,
    )

def mix_recog(video_input, audio_input, output_dir, ASR="whisper"):
    """Recognize speech from a video or an audio input.

    Video input takes precedence over audio input.

    Args:
        video_input: path of the uploaded video, or None.
        audio_input: uploaded audio, or None.
        output_dir: output directory; an empty/whitespace string means
            "use the default location".
        ASR: ASR backend name passed through to the clipper.

    Returns:
        ``(text, srt, video_state, audio_state)`` — the unused state slot
        is None. Returns ``(None, None, None, None)`` when neither input
        was provided, so Gradio always receives a 4-tuple (the original
        fell off the end and returned a bare None, which breaks the
        4-output wiring).
    """
    output_dir = output_dir.strip()
    output_dir = os.path.abspath(output_dir) if output_dir else None

    if video_input is not None:
        res_text, res_srt, video_state = video_recog(
            video_input, output_dir=output_dir, ASR=ASR)
        return res_text, res_srt, video_state, None

    if audio_input is not None:
        res_text, res_srt, audio_state = audio_recog(
            audio_input, output_dir=output_dir)
        return res_text, res_srt, None, audio_state

    # No input at all: explicit empty result instead of an implicit None.
    return None, None, None, None

def llm_inference(system_content, user_content, srt_text, model, apikey):
    """Dispatch an LLM call based on the model-name prefix.

    The user prompt and the SRT text are concatenated into a single user
    message before being sent to the selected backend.

    Args:
        system_content: system prompt.
        user_content: user instruction.
        srt_text: recognized SRT subtitles appended to the user message.
        model: model name; its prefix selects the backend.
        apikey: API key for the selected backend.

    Returns:
        The model's reply, or an explicit error message for unsupported
        model names (previously this returned None, leaving the UI blank).
    """
    SUPPORT_LLM_PREFIX = ['qwen', 'gpt', 'g4f', 'moonshot', "gpt-4o", "22A"]
    prompt = user_content + '\n' + srt_text
    if model.startswith('qwen'):
        # NOTE(review): call_qwen_model is not imported anywhere in this
        # file — this branch raises NameError if reached; confirm the
        # intended helper and import it.
        return call_qwen_model(apikey, model, prompt, system_content)
    if model.startswith('gpt') or model.startswith('moonshot'):
        return openai_call(apikey, model, system_content=system_content, user_content=prompt)
    if model.startswith('22A'):
        return yi_moe(apikey, model, prompt, system_content)
    if model.startswith('g4f'):
        # NOTE(review): g4f_openai_call is likewise not imported — verify.
        model = "-".join(model.split('-')[1:])
        return g4f_openai_call(model, system_content, prompt)
    # Surface the problem in the UI instead of silently returning None.
    message = ("LLM name error, only {} are supported as LLM name prefix."
               .format(SUPPORT_LLM_PREFIX))
    logging.error(message)
    return message


def clip_and_summary(LLM_res, dest_text, video_spk_input, start_ost, end_ost, video_state, audio_state, output_dir, apikey):
    """Clip the recognized media at LLM-selected timestamps, then summarize.

    Args:
        LLM_res: LLM output text containing ``[start-end]`` timestamp spans.
        dest_text: text filter passed to the clipper.
        video_spk_input: speaker filter passed to the clipper.
        start_ost: start offset in milliseconds.
        end_ost: end offset in milliseconds.
        video_state: recognition state from a video input, or None.
        audio_state: recognition state from an audio input, or None.
        output_dir: output directory ("" means the default location).
        apikey: YiDong API key used for the summary call.

    Returns:
        ``(clip_video_file, clip_audio, message, clip_srt, summary)`` —
        at most one of the first two is non-None.
    """
    def AI_clip(LLM_res, dest_text, video_spk_input, start_ost, end_ost, video_state, audio_state, output_dir):
        # Cut the media at the timestamps the LLM picked out of the SRT.
        timestamp_list = extract_timestamps(LLM_res)
        output_dir = output_dir.strip()
        if not len(output_dir):
            output_dir = None
        else:
            output_dir = os.path.abspath(output_dir)
        if video_state is not None:
            clip_video_file, message, clip_srt = audio_clipper.video_clip(
                dest_text, start_ost, end_ost, video_state,
                dest_spk=video_spk_input, output_dir=output_dir, timestamp_list=timestamp_list, add_sub=False)
            return clip_video_file, None, message, clip_srt
        # Caller guarantees at least one state is set, so this is the audio path.
        (sr, res_audio), message, clip_srt = audio_clipper.clip(
            dest_text, start_ost, end_ost, audio_state,
            dest_spk=video_spk_input, output_dir=output_dir, timestamp_list=timestamp_list, add_sub=False)
        return None, (sr, res_audio), message, clip_srt

    def get_summarizes(api_key, input_file):
        # Upload the clip to YiDong and poll the summary task for its result.
        yd = YiDong(api_key=api_key)
        rid = yd.add_resource(input_file)
        t = yd.video_summary(rid)
        data = t()
        return data.video_summary.summary

    if video_state is None and audio_state is None:
        # Nothing has been recognized yet. The original code fell into
        # AI_clip, returned None, and crashed while unpacking it.
        return None, None, "No recognition state - run ASR first.", None, None

    clip_video_file, clip_audio, message, clip_srt = AI_clip(
        LLM_res, dest_text, video_spk_input, start_ost, end_ost, video_state, audio_state, output_dir)
    # Only a video clip can be summarized; the audio-only path previously
    # passed None into the YiDong upload.
    summary = get_summarizes(apikey, clip_video_file) if clip_video_file is not None else None
    return clip_video_file, clip_audio, message, clip_srt, summary

# Gradio UI: download/upload media -> ASR -> LLM timestamp selection -> clip + summary.
with gr.Blocks() as clip_service:
    # Per-session recognition state produced by mix_recog and consumed by clipping.
    video_state, audio_state = gr.State(), gr.State()
    with gr.Row():
        # Hugging Face OAuth login; the resolved API key is shown in the textbox.
        login_button = gr.LoginButton()
        user_email_display = gr.Textbox(
            label="In order to get your user key, please click on huggingface login, the first time you login you will have the full key, please save it. After that your key will be hidden.",
            interactive=True,
        )
        # On page load, resolve the logged-in user's API key into the textbox.
        clip_service.load(get_user_email, inputs=None, outputs=user_email_display)
        logging.info(f"The value of the current variable is: {user_email_display}")
    # --- Source acquisition: YouTube download or direct upload ---
    youtube_url = gr.Textbox(label="🔗 Youtube视频链接|Youtube Video URL")
    download_button = gr.Button("📥 下载 | Download", variant="primary")    
    video_input = gr.Video(label="视频输入 | Video Input")
    audio_input = gr.Audio(label="音频输入 | Audio Input", visible=False)
    with gr.Column():
        # Demo media: single-speaker videos, a multi-speaker video, and audio.
        gr.Examples(['https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ClipVideo/%E4%B8%BA%E4%BB%80%E4%B9%88%E8%A6%81%E5%A4%9A%E8%AF%BB%E4%B9%A6%EF%BC%9F%E8%BF%99%E6%98%AF%E6%88%91%E5%90%AC%E8%BF%87%E6%9C%80%E5%A5%BD%E7%9A%84%E7%AD%94%E6%A1%88-%E7%89%87%E6%AE%B5.mp4', 
                        'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ClipVideo/2022%E4%BA%91%E6%A0%96%E5%A4%A7%E4%BC%9A_%E7%89%87%E6%AE%B52.mp4', 
                        'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ClipVideo/%E4%BD%BF%E7%94%A8chatgpt_%E7%89%87%E6%AE%B5.mp4'],
                    [video_input],
                    label='示例视频 | Demo Video')
        gr.Examples(['https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ClipVideo/%E8%AE%BF%E8%B0%88.mp4'],
                    [video_input],
                    label='多说话人示例视频 | Multi-speaker Demo Video')
        gr.Examples(['https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ClipVideo/%E9%B2%81%E8%82%83%E9%87%87%E8%AE%BF%E7%89%87%E6%AE%B51.wav'],
                    [audio_input],
                    label="示例音频 | Demo Audio",visible=False)
        # Only the whisper backend is wired up, but custom names are allowed.
        ASR_model = gr.Dropdown(
                choices=["whisper"], 
                value="whisper",
                label="ASR Model Name",
                allow_custom_value=True)
        recog_button = gr.Button("👂 识别 | ASR", variant="primary")
    # --- Recognition outputs ---
    output_dir = gr.Textbox(label="📁 文件输出路径 | File Output Dir (可以为空,Linux, mac系统可以稳定使用)")
    video_text_output = gr.Textbox(label="✏️ 识别结果 | Recognition Result")
    video_srt_output = gr.Textbox(label="📖 SRT字幕内容 | RST Subtitles")
    # System prompt instructing the LLM to select and merge SRT segments and
    # emit them in the "[start-end] text" format parsed by extract_timestamps.
    prompt_head = gr.Textbox(label="Prompt System (按需更改,最好不要变动主体和要求)", value=("你是一个视频srt字幕分析剪辑器,输入视频的srt字幕和用户命令,请你结合用户指令选出符合要求的片段并输出。注意:要谨慎分析用户的问题,找出符合用户提问的srt字幕片段。"
                        "尽可能将连续的片段并裁剪出来,将片段中在时间上连续的多个句子及它们的时间戳合并为一条,保证合并后的片段有着相同的主题"
                        "注意确保文字与时间戳的正确匹配。你应该按照以下顺序进行处理:"
                        "1.将srt字幕合并成数段主题的内容。2.将用户命令和查询进行匹配。"
                        "输出需严格按照如下格式:1. [开始时间-结束时间] 文本,注意其中的连接符是“-”"))

    prompt_head2 = gr.Textbox(label="Prompt User(请输入用户指令)")
    with gr.Column():
        with gr.Row():
            # LLM backend selection; the name prefix drives llm_inference dispatch.
            llm_model = gr.Dropdown(
                choices=["gpt-4o",
                        "22A"], 
                value="22A",
                label="LLM Model Name",
                allow_custom_value=True)
            apikey_input = gr.Textbox(label="APIKEY")
        llm_button =  gr.Button("LLM推理 | LLM Inference(首先进行识别,非g4f需配置对应apikey)", variant="primary")
    llm_result = gr.Textbox(label="LLM Clipper Result")
    llm_clip_button = gr.Button("🧠 LLM智能裁剪 | AI Clip", variant="primary")
    # Hidden legacy controls kept for clip_and_summary's signature.
    video_text_input = gr.Textbox(label="✏️ 待裁剪文本 | Text to Clip (多段文本使用'#'连接)",value ="这个不需要", visible=False)
    video_spk_input = gr.Textbox(label="✏️ 待裁剪说话人 | Speaker to Clip (多个说话人使用'#'连接)",value ="这个不需要", visible=False)
    with gr.Row():
        # Millisecond offsets applied to clip boundaries (hidden by default).
        video_start_ost = gr.Slider(minimum=-500, maximum=1000, value=0, step=50, label="⏪ 开始位置偏移 | Start Offset (ms)",visible=False)
        video_end_ost = gr.Slider(minimum=-500, maximum=1000, value=100, step=50, label="⏩ 结束位置偏移 | End Offset (ms)",visible=False)
    # --- Clipping outputs ---
    video_output = gr.Video(label="裁剪结果 | Video Clipped")
    audio_output = gr.Audio(label="裁剪结果 | Audio Clipped")
    clip_message = gr.Textbox(label="⚠️ 裁剪信息 | Clipping Log",visible=False)
    srt_clipped = gr.Textbox(label="📖 裁剪部分SRT字幕内容 | Clipped RST Subtitles",visible=False) 
    summary = gr.Textbox(label="📖 视频摘要 | Video Summary")
    # --- Event wiring ---
    download_button.click(updata_video, inputs=youtube_url, outputs=video_input)  
    recog_button.click(mix_recog, 
                    inputs=[video_input, 
                            audio_input, 
                            output_dir,
                            ASR_model
                            ], 
                    outputs=[video_text_output, video_srt_output, video_state, audio_state])

    llm_button.click(llm_inference,
                    inputs=[prompt_head, prompt_head2, video_srt_output, llm_model, apikey_input],
                    outputs=[llm_result])
    # NOTE(review): user_email_display is passed as the apikey argument —
    # presumably it holds the key minted by get_user_email; confirm.
    llm_clip_button.click(clip_and_summary, 
                    inputs=[llm_result,
                            video_text_input, 
                            video_spk_input, 
                            video_start_ost, 
                            video_end_ost, 
                            video_state, 
                            audio_state, 
                            output_dir,
                            user_email_display,
                            ],
                    outputs=[video_output, audio_output, clip_message, srt_clipped,summary])



if __name__ == "__main__":
    # Queue up to 10 requests with a matching concurrency limit, then serve.
    clip_service.queue(max_size=10, default_concurrency_limit=10)
    clip_service.launch(ssr_mode=False)