"""Gradio demo: describe a video by combining image and audio captions.

Pipeline sketch (from the original author's notes):
1. Extract and store 1 image every 5 frames from the video input.
2. Extract the audio track.
3. For each extracted image, get a caption from the caption model and
   collect the captions into a list.
4. For the audio, ask an audio-captioning model to describe the sound/scene.
5. Give everything to an LLM and ask it to summarize, combining the image
   caption list with the audio caption.

NOTE(review): the model-calling helpers below are unimplemented stubs in
SOURCE (they had no bodies at all); they now raise ``NotImplementedError``
so the missing work is explicit instead of a syntax error.
"""

import gradio as gr
from gradio_client import Client


def extract_image(video_path):
    """Extract and store 1 image every 5 frames from *video_path*.

    Returns a list of image file paths. (Stub — not yet implemented.)
    """
    raise NotImplementedError("frame extraction not implemented yet")


def get_moondream(image_path):
    """Return a caption for *image_path* from the Moondream caption model.

    (Stub — presumably calls a hosted model via ``gradio_client.Client``;
    not yet implemented.)
    """
    raise NotImplementedError("image captioning not implemented yet")


def get_salmonn(video_path):
    """Return a description of the audio/scene of *video_path* from SALMONN.

    (Stub — not yet implemented.)
    """
    raise NotImplementedError("audio captioning not implemented yet")


def llm_process(image_captions, audio_caption):
    """Ask an LLM to summarize *image_captions* combined with *audio_caption*.

    Returns the summary string. (Stub — not yet implemented.)
    """
    raise NotImplementedError("LLM summarization not implemented yet")


def infer(video_in):
    """Run the full pipeline on *video_in* and return the video description.

    Original code returned the module-level ``video_description`` Textbox
    component (a NameError-in-waiting / wrong object); now the pipeline
    steps are actually chained together.
    """
    frame_paths = extract_image(video_in)
    image_captions = [get_moondream(path) for path in frame_paths]
    audio_caption = get_salmonn(video_in)
    return llm_process(image_captions, audio_caption)


with gr.Blocks() as demo:
    with gr.Column(elem_id="col-container"):
        gr.HTML("""
        Video description
        """)
        video_in = gr.Video(label="Video input")
        # Fixed label typo: "SUbmit" -> "Submit".
        submit_btn = gr.Button("Submit")
        video_description = gr.Textbox(label="Video description")
        submit_btn.click(
            fn=infer,
            inputs=[video_in],
            outputs=[video_description],
        )

demo.queue().launch()