import gradio as gr from gradio_client import Client # 1. extract and store 1 image every 5 images from video input # 2. extract audio # 3. for each image from extracted_images, get caption from caption model and concatenate into list # 4. for audio, ask audio questioning model to describe sound/scene # 5. give all to LLM, and ask it to resume, according to image caption list combined to audio caption def extract_image() def get_moondream() def get_salmonn() def llm_process() def infer(video_in): return video_description with gr.Blocks() as demo : with gr.Column(elem_id="col-container"): gr.HTML("""