"""Gradio demo: describe a video by combining image and audio captions.

Pipeline sketch (from the original author's notes):
1. Extract and store 1 image every 5 frames from the video input.
2. Extract the audio track.
3. For each extracted image, get a caption from the caption model and
   collect the captions into a list.
4. For the audio, ask an audio-captioning model to describe the sound/scene.
5. Give everything to an LLM and ask it to summarize, combining the image
   caption list with the audio caption.

NOTE(review): the model-calling helpers below are unimplemented stubs in
SOURCE (they had no bodies at all); they now raise ``NotImplementedError``
so the missing work is explicit instead of a syntax error.
"""

import gradio as gr
from gradio_client import Client


def extract_image(video_path):
    """Extract and store 1 image every 5 frames from *video_path*.

    Returns a list of image file paths. (Stub — not yet implemented.)
    """
    raise NotImplementedError("frame extraction not implemented yet")


def get_moondream(image_path):
    """Return a caption for *image_path* from the Moondream caption model.

    (Stub — presumably calls a hosted model via ``gradio_client.Client``;
    not yet implemented.)
    """
    raise NotImplementedError("image captioning not implemented yet")


def get_salmonn(video_path):
    """Return a description of the audio/scene of *video_path* from SALMONN.

    (Stub — not yet implemented.)
    """
    raise NotImplementedError("audio captioning not implemented yet")


def llm_process(image_captions, audio_caption):
    """Ask an LLM to summarize *image_captions* combined with *audio_caption*.

    Returns the summary string. (Stub — not yet implemented.)
    """
    raise NotImplementedError("LLM summarization not implemented yet")


def infer(video_in):
    """Run the full pipeline on *video_in* and return the video description.

    Original code returned the module-level ``video_description`` Textbox
    component (a NameError-in-waiting / wrong object); now the pipeline
    steps are actually chained together.
    """
    frame_paths = extract_image(video_in)
    image_captions = [get_moondream(path) for path in frame_paths]
    audio_caption = get_salmonn(video_in)
    return llm_process(image_captions, audio_caption)


with gr.Blocks() as demo:
    with gr.Column(elem_id="col-container"):
        gr.HTML("""
        Video description
        """)
        video_in = gr.Video(label="Video input")
        # Fixed label typo: "SUbmit" -> "Submit".
        submit_btn = gr.Button("Submit")
        video_description = gr.Textbox(label="Video description")
        submit_btn.click(
            fn=infer,
            inputs=[video_in],
            outputs=[video_description],
        )

demo.queue().launch()