import spaces
import gradio as gr
from gradio_client import Client
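# The moondream1 Space is queried remotely through gradio_client for frame captioning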
client = Client("https://vikhyatk-moondream1.hf.space/")
import cv2
from moviepy.editor import VideoFileClip
# 1. extract and store 1 image every 24 frames from the video input
# 2. extract the audio track
# 3. for each extracted image, get a caption from the caption model and collect the results in a list
# 4. for the audio, ask an audio question-answering model to describe the sound/scene
# 5. give everything to an LLM and ask it to summarize, based on the image caption list combined with the audio caption
import re
import torch
from transformers import pipeline
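# Zephyr-7B-beta is loaded through a transformers text-generation pipeline and produces the final summary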
zephyr_model = "HuggingFaceH4/zephyr-7b-beta"
pipe = pipeline("text-generation", model=zephyr_model, torch_dtype=torch.bfloat16, device_map="auto")
standard_sys = """
You will be provided a list of visual details observed at regular intervals, along with an audio description. These pieces of information originate from a single video. The visual details are extracted from the video at fixed time intervals and represent consecutive frames. Typically, the video consists of a brief sequence showing one or more subjects...
Please note that the following list of image descriptions (visual details) was obtained by extracting individual frames from a continuous video featuring one or more subjects. Depending on the case, all depicted individuals may correspond to the same person(s), with minor variations due to changes in lighting, angle, and facial expressions over time. Regardless, assume temporal continuity among the frames unless otherwise specified.
The audio events are actually a description of the entire scene based only on the video's audio track. Your job is to integrate these multimodal inputs intelligently and provide a very short summary of what is happening in the original video. Provide a succinct overview of what you understood.
"""
def trim_video(input_path, max_duration=10):
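    """Trim the input video to at most `max_duration` seconds so downstream processing stays short.
    Returns the path to the trimmed clip, the original path if it is already short enough, or None when no video was provided."""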
if input_path is not None:
video_clip = VideoFileClip(input_path)
output_path = "video_cut_10.mp4"
if video_clip.duration > max_duration:
trimmed_clip = video_clip.subclip(0, max_duration)
trimmed_clip.write_videofile(output_path, audio_codec='aac')
return output_path
else:
return input_path
    else:
return None
def extract_frames(video_in, output_format='.jpg'):
    """Extract frames from a video at a regular interval and store them on disk.
    Args:
    - video_in: string or path-like object pointing to the video file
    - output_format: string indicating desired format for saved images (default: '.jpg')
    Returns:
    A list of strings containing paths to the saved images.
    """
    # Adjust the sampling interval to the video length: short clips are sampled more densely
    video_clip = VideoFileClip(video_in)
    if video_clip.duration <= 5:
        interval = 6
    else:
        interval = 24
# Initialize variables
vidcap = cv2.VideoCapture(video_in)
frames = []
count = 0
# Loop through frames until there are no more
while True:
success, image = vidcap.read()
# Check if successful read and not past end of video
if success:
#print('Read a new frame:', success)
# Save current frame if it meets criteria
if count % interval == 0:
filename = f'frame_{count // interval}{output_format}'
frames.append(filename)
cv2.imwrite(filename, image)
print(f'Saved {filename}')
# Increment counter
count += 1
# Break out of loop when done reading frames
else:
break
# Close video capture
vidcap.release()
print('Done extracting frames!')
return frames
'''
from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image
model_id = "vikhyatk/moondream2"
revision = "2024-03-06"
model = AutoModelForCausalLM.from_pretrained(
model_id, trust_remote_code=True, revision=revision
)
tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
'''
#@spaces.GPU()
def process_image(image_in):
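    """Caption a single extracted frame by querying the remote moondream1 Space."""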
result = client.predict(
image_in, # filepath in 'image' Image component
"Describe precisely the image in one sentence.", # str in 'Question' Textbox component
api_name="/answer_question"
#api_name="/predict"
)
print(result)
return result
'''
image = Image.open(image_in)
enc_image = model.encode_image(image)
result = model.answer_question(enc_image, "Describe the image in one sentence.", tokenizer)
print(result)
return result
'''
def extract_audio(video_path):
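    """Extract the audio track from the video as an mp3 file, or return None if the video has no audio."""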
video_clip = VideoFileClip(video_path)
# Check if the video has audio
if video_clip.audio is not None:
audio_clip = video_clip.audio
audio_clip.write_audiofile("output_audio.mp3")
return "output_audio.mp3"
else:
print("The video does not have any audio.")
return None
def get_salmonn(audio_in):
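    """Ask the SALMONN-7B Space to describe the scene based only on the extracted audio."""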
salmonn_prompt = "Please describe the audio"
client = Client("fffiloni/SALMONN-7B-gradio")
result = client.predict(
audio_in, # filepath in 'Audio' Audio component
salmonn_prompt, # str in 'User question' Textbox component
4, # float (numeric value between 1 and 10) in 'beam search numbers' Slider component
1, # float (numeric value between 0.8 and 2.0) in 'temperature' Slider component
0.9, # float (numeric value between 0.1 and 1.0) in 'top p' Slider component
api_name="/gradio_answer"
)
print(result)
return result
@spaces.GPU()
def llm_process(user_prompt):
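    """Build a Zephyr chat prompt from the system instructions and the combined captions, then generate a short video summary."""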
agent_maker_sys = standard_sys
instruction = f"""
<|system|>
{agent_maker_sys}</s>
<|user|>
"""
    # Zephyr chat format: close the user turn and open the assistant turn so the model answers as the assistant
    prompt = f"{instruction.strip()}\n{user_prompt}</s>\n<|assistant|>"
    outputs = pipe(prompt, max_new_tokens=256, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)
    # Strip the echoed prompt (everything from <|system|> up to <|assistant|>) from the generated text
    pattern = r'\<\|system\|\>(.*?)\<\|assistant\|\>'
    cleaned_text = re.sub(pattern, '', outputs[0]["generated_text"], flags=re.DOTALL)
print(f"SUGGESTED video description: {cleaned_text}")
return cleaned_text.lstrip("\n")
def infer(video_in):
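    """Full pipeline: extract frames, caption them, describe the audio, then ask the LLM for an overall video description."""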
# Extract frames from a video
gr.Info("Extracting frames...")
frame_files = extract_frames(video_in)
# Process each extracted frame and collect results in a list
gr.Info("Captioning frames ...")
processed_texts = []
for frame_file in frame_files:
text = process_image(frame_file)
processed_texts.append(text)
print(processed_texts)
    # Join the frame captions into a single string, one caption per line
string_list = '\n'.join(processed_texts)
# Extract audio from video
extracted_audio = extract_audio(video_in)
    if extracted_audio is not None:
print(extracted_audio)
# Get description of audio content
gr.Info("Getting audio description from extracted sound ...")
audio_content_described = get_salmonn(extracted_audio)
    else:
audio_content_described = "Video has no sound."
# Assemble captions
formatted_captions = f"""
### Visual events:\n{string_list}\n ### Audio events:\n{audio_content_described}
"""
print(formatted_captions)
# Send formatted captions to LLM
gr.Info("Try to provide a video understanding with provided elements ...")
video_description_from_llm = llm_process(formatted_captions)
return video_description_from_llm
css = """
div#col-container{
margin: 0 auto;
max-width: 1280px;
}
div#video-text textarea {
font-size: 20px;
line-height: 1.2em;
font-weight: 600;
}
"""
with gr.Blocks(css=css) as demo:
with gr.Column(elem_id="col-container"):
gr.HTML("""
<h2 style="text-align: center;">Soft Video Understanding</h2>
<p style="text-align: center;">
            An experiment in trying to achieve what I call "soft video understanding" with openly available models. <br />
            We use moondream1 to caption extracted frames and SALMONN to analyze the extracted audio, then send the visual and audio details to Zephyr, which is instructed to summarize what it understood.
            The instructions prompt is available for further discussion with the Community. Note that audio is crucial for a better overall understanding. Videos longer than 10 seconds will be trimmed.
</p>
""")
with gr.Row():
with gr.Column():
video_in = gr.Video(label="Video input")
with gr.Accordion("System Instructions", open=False):
system_instruction = gr.Markdown(
value = standard_sys
)
gr.Examples(
examples = ["examples/train.mp4"],
inputs = [video_in]
)
with gr.Column():
video_cut = gr.Video(label="Video cut to 10 seconds", interactive=False)
submit_btn = gr.Button("Submit")
video_description = gr.Textbox(label="Video description", elem_id="video-text")
video_in.change(
fn = trim_video,
inputs = [video_in],
outputs = [video_cut],
queue = False
)
submit_btn.click(
fn = infer,
inputs = [video_cut],
outputs = [video_description]
)
demo.queue().launch()