Spaces:

fffiloni
/

soft-video-understanding

Paused

File size: 6,927 Bytes

027e8a9
 
b345f1b
 
027e8a9
 
 
 
 
 
 
b345f1b
 
 
027e8a9
b345f1b
 
027e8a9
b345f1b
933471e
2aabdcd
95e144f
2aabdcd
 
7c8a760
 
 
 
 
 
 
6b5a175
2aabdcd
 
6b5a175
7c8a760
b345f1b
 
720f703
b345f1b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
042d40b
53b4fd0
 
042d40b
 
 
 
 
 
53b4fd0
b345f1b
53b4fd0
e71cfc7
b345f1b
 
 
 
 
 
 
 
53b4fd0
9da309a
042d40b
 
53b4fd0
 
b345f1b
 
f4675b5
 
 
 
b345f1b
 
933471e
b345f1b
 
 
 
 
 
 
 
 
 
 
 
3b6a87c
b345f1b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
027e8a9
 
b345f1b
 
027e8a9
b345f1b
 
 
 
 
 
 
 
 
 
 
 
720f703
b345f1b
 
70f5d8a
b345f1b
 
70f5d8a
b345f1b
 
 
 
 
933471e
70f5d8a
933471e
027e8a9
 
 
 
 
 
 
b345f1b
70f5d8a
027e8a9

import gradio as gr
from gradio_client import Client
import cv2
from moviepy.editor import *

# 1. extract and store 1 frame every `interval` frames from the video input (see extract_frames)
# 2. extract the audio track
# 3. for each extracted frame, get a caption from the caption model and collect the captions in a list
# 4. for the audio, ask the audio question-answering model to describe the sound/scene
# 5. give everything to the LLM and ask it to summarize the video from the frame captions combined with the audio caption

import re
import torch
from transformers import pipeline

# Chat LLM used by llm_process to turn the captions into a summary.
zephyr_model = "HuggingFaceH4/zephyr-7b-beta"

# The text-generation pipeline is created once, when the module is loaded,
# with bfloat16 weights and automatic device mapping.
pipe = pipeline(
    "text-generation",
    model=zephyr_model,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# System prompt handed to the LLM before the captions.
# It is a plain string: the text contains no placeholders, so no f-string
# prefix is needed.
# NOTE(review): the prompt tells the model that frames are sampled "every 12
# frames", while extract_frames in this file defaults to interval=24 —
# confirm which value is intended before editing the prompt text.
standard_sys = """
You will be provided a list of visual events, and an audio description. All these informations come from a single video. 

List of visual events are actually extracted from this video every 12 frames.
These visual infos are extracted from the video that is usually a short sequence.

As a smart assistant, you must understand that Repetitive visual element of the same person or group of subject means that it is the same person/subject, filmed without cut.
For example, if visual elements is like this: 
"An older man wearing a brown hat and glasses, looking off into the distance.
 An older man wearing a brown hat and glasses, with a beard and a mustache, is looking directly at the camera.
 An older man wearing a brown hat and glasses, with a beard and a beard on his chin, is looking at the camera."
It does not mean there are 3 older men, but this is the same man. Because we have extracted vere close frame from the video sequence.

Audio events are actually the entire scene description based on the audio of the video.

Your job is to use these informations to smartly deduce and provide a very short resume about what is happening in the video.
Provide a short resume about what you understood.

"""

def extract_frames(video_in, interval=24, output_format='.jpg'):
    """Extract every ``interval``-th frame of a video and save it as an image.

    Frames are written to the current working directory as
    ``frame_<n><output_format>``, where ``<n>`` numbers the saved frames
    starting at 0. Files with the same name from an earlier call are
    overwritten.

    Args:
    - video_in: string or path-like object pointing to the video file
    - interval: positive integer; one frame out of every ``interval`` frames is saved (default: 24)
    - output_format: file extension used for the saved images (default: '.jpg')

    Returns:
    A list of strings containing paths to the saved images, in video order.
    The list is empty when no frame could be read from the video.

    Raises:
    ValueError: if ``interval`` is smaller than 1.
    """
    # Reject a bad interval up front rather than failing with
    # ZeroDivisionError after the capture has already been opened.
    if interval < 1:
        raise ValueError(f"interval must be a positive integer, got {interval!r}")

    vidcap = cv2.VideoCapture(video_in)
    frames = []
    count = 0

    try:
        while True:
            success, image = vidcap.read()

            # A failed read means the end of the video (or an unreadable file).
            if not success:
                break

            if count % interval == 0:
                filename = f'frame_{count // interval}{output_format}'
                # Only report files that were really written, so callers
                # never receive a path that does not exist.
                if cv2.imwrite(filename, image):
                    frames.append(filename)
                    print(f'Saved {filename}')
                else:
                    print(f'Could not write {filename}, skipping it')

            count += 1
    finally:
        # Release the capture even if reading or writing raised.
        vidcap.release()

    print('Done extracting frames!')

    return frames

from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image

# Vision-language model used by process_image to caption single frames.
# Both the model and its tokenizer are loaded from the same pinned revision.
model_id = "vikhyatk/moondream2"
revision = "2024-03-06"

# NOTE: trust_remote_code=True lets the model repository supply its own
# Python code when the model is loaded.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    revision=revision,
)
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    revision=revision,
)

def process_image(image_in):
    """Caption one image file with the locally loaded vision model.

    Args:
    - image_in: path to the image file to describe

    Returns:
    The model's answer to the question "Describe the image in one sentence."
    """
    # Open the file through a context manager so its handle is closed as
    # soon as the image has been encoded.
    with Image.open(image_in) as image:
        enc_image = model.encode_image(image)

    result = model.answer_question(enc_image, "Describe the image in one sentence.", tokenizer)
    print(result)
    return result

def extract_audio(video_path):
    """Write the audio track of a video to ``output_audio.mp3``.

    Args:
    - video_path: path to the video file

    Returns:
    The path of the written audio file, "output_audio.mp3" (relative to the
    current working directory; an existing file is overwritten).

    Raises:
    ValueError: if the video has no audio track.
    """
    # The context manager closes the clip (and the readers it holds) even
    # when writing the audio fails.
    with VideoFileClip(video_path) as video_clip:
        audio_clip = video_clip.audio

        # A silent video has no audio clip; fail with a clear message
        # instead of an AttributeError on None.
        if audio_clip is None:
            raise ValueError(f"No audio track found in video: {video_path!r}")

        audio_clip.write_audiofile("output_audio.mp3")

    return "output_audio.mp3"

def get_salmonn(audio_in):
    """Ask the remote SALMONN Space to describe an audio file.

    Args:
    - audio_in: path to the audio file sent to the Space

    Returns:
    Whatever the Space's "/gradio_answer" endpoint returns for the request.
    """
    client = Client("fffiloni/SALMONN-7B-gradio")

    # Positional inputs, in the order the endpoint expects them.
    question = "Please describe the audio"  # 'User question' textbox
    beam_count = 4                          # 'beam search numbers' slider (1 to 10)
    temperature = 1                         # 'temperature' slider (0.8 to 2.0)
    top_p = 0.9                             # 'top p' slider (0.1 to 1.0)

    description = client.predict(
        audio_in,
        question,
        beam_count,
        temperature,
        top_p,
        api_name="/gradio_answer",
    )
    print(description)
    return description

def llm_process(user_prompt):
    """Summarize the collected captions with the Zephyr chat model.

    Args:
    - user_prompt: text holding the visual and audio descriptions

    Returns:
    The text generated by the model, with surrounding whitespace removed.
    """
    # Zephyr chat layout: a system turn, a user turn, then an open
    # assistant turn that the model completes.
    instruction = f"""
<|system|>
{standard_sys}</s>
<|user|>
"""

    # End the prompt with the assistant tag so the model starts its answer
    # right away instead of having to emit the tag itself.
    prompt = f"{instruction.strip()}\n{user_prompt}</s>\n<|assistant|>\n"

    # return_full_text=False makes the pipeline return only the newly
    # generated text, so the prompt never has to be stripped out afterwards.
    outputs = pipe(
        prompt,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
        return_full_text=False,
    )
    cleaned_text = outputs[0]["generated_text"].strip()

    print(f"SUGGESTED video description: {cleaned_text}")
    return cleaned_text

def infer(video_in):
    """Describe a video from its sampled frames and its audio track.

    Steps: sample frames, caption each frame, extract and describe the
    audio, then ask the LLM for a short description built from both.

    Args:
    - video_in: path to the video file provided by the Gradio video input

    Returns:
    The video description produced by the LLM.

    Raises:
    gr.Error: if no video was provided.
    """
    # The video component yields nothing when the user submits without a
    # file; report that in the UI instead of failing deeper in the pipeline.
    if not video_in:
        raise gr.Error("Please provide a video before submitting.")

    # Extract frames from the video
    frame_files = extract_frames(video_in)

    # Caption each extracted frame
    processed_texts = [process_image(frame_file) for frame_file in frame_files]
    print(processed_texts)

    # One caption per line
    string_list = '\n'.join(processed_texts)

    # Extract audio from video
    extracted_audio = extract_audio(video_in)
    print(extracted_audio)

    # Get description of audio content
    audio_content_described = get_salmonn(extracted_audio)

    # Assemble captions
    formatted_captions = f"""
### Visual events:\n{string_list}\n ### Audio events:\n{audio_content_described}
"""
    print(formatted_captions)

    # Send formatted captions to LLM
    return llm_process(formatted_captions)

# Gradio UI: a video input, a submit button and a text box for the result.
with gr.Blocks() as demo:
    with gr.Column(elem_id="col-container"):
        gr.HTML("""
        <h2 style="text-align: center;">Video description</h2>
        """)
        video_in = gr.Video(label="Video input")
        submit_btn = gr.Button("Submit")
        video_description = gr.Textbox(label="Video description")

    # Clicking the button runs the whole pipeline on the uploaded video.
    submit_btn.click(fn=infer, inputs=[video_in], outputs=[video_description])

# Enable the request queue, then start the app.
demo.queue().launch()