File size: 6,927 Bytes
027e8a9 b345f1b 027e8a9 b345f1b 027e8a9 b345f1b 027e8a9 b345f1b 933471e 2aabdcd 95e144f 2aabdcd 7c8a760 6b5a175 2aabdcd 6b5a175 7c8a760 b345f1b 720f703 b345f1b 042d40b 53b4fd0 042d40b 53b4fd0 b345f1b 53b4fd0 e71cfc7 b345f1b 53b4fd0 9da309a 042d40b 53b4fd0 b345f1b f4675b5 b345f1b 933471e b345f1b 3b6a87c b345f1b 027e8a9 b345f1b 027e8a9 b345f1b 720f703 b345f1b 70f5d8a b345f1b 70f5d8a b345f1b 933471e 70f5d8a 933471e 027e8a9 b345f1b 70f5d8a 027e8a9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 |
import gradio as gr
from gradio_client import Client
import cv2
from moviepy.editor import *
# 1. extract and store 1 frame every `interval` frames (default: 24) from the video input
# 2. extract the audio track
# 3. for each extracted frame, get a caption from the caption model and collect the captions into a list
# 4. for the audio, ask the audio question-answering model to describe the sound/scene
# 5. give everything to the LLM and ask it to summarize the video from the frame captions combined with the audio description
import re
import torch
from transformers import pipeline
# Text-generation pipeline for the Zephyr chat model. It is created once at
# import time and reused by llm_process for every request.
zephyr_model = "HuggingFaceH4/zephyr-7b-beta"
pipe = pipeline(
    "text-generation",
    model=zephyr_model,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
# System prompt given to the summarising LLM. It is a plain string (no
# placeholders to interpolate). The sampling rate stated in the prompt matches
# the default `interval=24` of extract_frames, which is what infer() uses.
standard_sys = """
You will be provided a list of visual events, and an audio description. All these informations come from a single video.
List of visual events are actually extracted from this video every 24 frames.
These visual infos are extracted from the video that is usually a short sequence.
As a smart assistant, you must understand that Repetitive visual element of the same person or group of subject means that it is the same person/subject, filmed without cut.
For example, if visual elements is like this:
"An older man wearing a brown hat and glasses, looking off into the distance.
An older man wearing a brown hat and glasses, with a beard and a mustache, is looking directly at the camera.
An older man wearing a brown hat and glasses, with a beard and a beard on his chin, is looking at the camera."
It does not mean there are 3 older men, but this is the same man. Because we have extracted vere close frame from the video sequence.
Audio events are actually the entire scene description based on the audio of the video.
Your job is to use these informations to smartly deduce and provide a very short resume about what is happening in the video.
Provide a short resume about what you understood.
"""
def extract_frames(video_in, interval=24, output_format='.jpg'):
    """Extract every ``interval``-th frame of a video and save it as an image.

    Saved files are named ``frame_<n><output_format>`` (relative paths), where
    ``n`` numbers the saved frames starting at 0.

    Args:
    - video_in: string or path-like object pointing to the video file
    - interval: integer specifying how many frames apart to extract images
      (default: 24); must be at least 1
    - output_format: string indicating desired format for saved images
      (default: '.jpg')

    Returns:
    A list of strings containing paths to the images that were actually
    written, in frame order. Empty if no frame could be read.

    Raises:
    ValueError: if ``interval`` is smaller than 1.
    """
    # Reject bad intervals up front: 0 would divide by zero on the first frame
    # and negative values would produce negative frame indices in file names.
    if interval < 1:
        raise ValueError(f"interval must be a positive integer, got {interval!r}")

    vidcap = cv2.VideoCapture(video_in)
    frames = []
    count = 0
    try:
        while True:
            success, image = vidcap.read()
            # A failed read means the end of the video (or an unreadable file).
            if not success:
                break
            if count % interval == 0:
                filename = f'frame_{count // interval}{output_format}'
                # Only report files that were really written to disk.
                if cv2.imwrite(filename, image):
                    frames.append(filename)
                    print(f'Saved {filename}')
                else:
                    print(f'Could not save {filename}')
            count += 1
    finally:
        # Always release the capture, even if reading or writing raised.
        vidcap.release()
    print('Done extracting frames!')
    return frames
from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image
# Moondream2 model used by process_image to caption frames. The model and
# its tokenizer are both loaded from the same pinned revision.
model_id = "vikhyatk/moondream2"
revision = "2024-03-06"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    revision=revision,
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
def process_image(image_in):
    """Caption one image with the module-level Moondream model.

    Args:
        image_in: path of the image file to describe.

    Returns:
        The answer produced by ``model.answer_question`` for the
        one-sentence description prompt.
    """
    # The context manager closes the image file once it has been encoded,
    # instead of leaving one open handle per processed frame.
    with Image.open(image_in) as image:
        enc_image = model.encode_image(image)
    result = model.answer_question(enc_image, "Describe the image in one sentence.", tokenizer)
    print(result)
    return result
def extract_audio(video_path):
    """Write the audio track of a video to ``output_audio.mp3``.

    Args:
        video_path: path of the video file to read.

    Returns:
        The path of the written audio file, ``"output_audio.mp3"``.

    Raises:
        ValueError: if the video has no audio track.
    """
    video_clip = VideoFileClip(video_path)
    try:
        audio_clip = video_clip.audio
        # A clip without sound exposes `audio` as None; fail with a clear
        # message instead of an AttributeError on None.
        if audio_clip is None:
            raise ValueError(f"No audio track found in {video_path!r}")
        audio_clip.write_audiofile("output_audio.mp3")
    finally:
        # Release the readers held by the clip even if writing fails.
        video_clip.close()
    return "output_audio.mp3"
def get_salmonn(audio_in):
    """Ask the remote SALMONN Space to describe an audio file.

    Args:
        audio_in: filepath sent to the Space's 'Audio' component.

    Returns:
        Whatever the Space's "/gradio_answer" endpoint returns.
    """
    question = "Please describe the audio"  # 'User question' Textbox
    beam_count = 4  # 'beam search numbers' Slider (between 1 and 10)
    temperature = 1  # 'temperature' Slider (between 0.8 and 2.0)
    top_p = 0.9  # 'top p' Slider (between 0.1 and 1.0)

    salmonn = Client("fffiloni/SALMONN-7B-gradio")
    description = salmonn.predict(
        audio_in,
        question,
        beam_count,
        temperature,
        top_p,
        api_name="/gradio_answer",
    )
    print(description)
    return description
def llm_process(user_prompt):
    """Ask the Zephyr pipeline for a short summary of the given captions.

    The prompt is laid out as a system turn (``standard_sys``), a user turn
    (``user_prompt``) and an opening assistant turn for the model to complete.

    Args:
        user_prompt: text of the user turn (the assembled captions).

    Returns:
        The generated reply with leading newlines removed.
    """
    # End the prompt with the assistant marker so the model answers directly.
    # Without it, the clean-up relied on the model emitting the marker itself
    # and leaked the whole prompt into the result when it did not.
    prompt = (
        f"<|system|>\n{standard_sys}</s>\n"
        f"<|user|>\n{user_prompt}</s>\n"
        "<|assistant|>\n"
    )
    outputs = pipe(
        prompt,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
        # Return only the newly generated text, not the prompt echoed back.
        return_full_text=False,
    )
    cleaned_text = outputs[0]["generated_text"]
    print(f"SUGGESTED video description: {cleaned_text}")
    return cleaned_text.lstrip("\n")
def infer(video_in):
    """Describe a video from its frame captions and an audio description.

    Steps: extract frames, caption each frame, extract the audio track, get
    an audio description, then ask the LLM to summarise both.

    Args:
        video_in: path of the uploaded video, as provided by the Gradio
            video component.

    Returns:
        The video description produced by ``llm_process``.

    Raises:
        gr.Error: if no video was provided.
    """
    # Submitting with an empty video component passes None; surface a clear
    # message in the UI instead of an opaque downstream failure.
    if not video_in:
        raise gr.Error("Please provide a video before submitting.")

    # Extract frames from the video and caption each of them
    frame_files = extract_frames(video_in)
    processed_texts = [process_image(frame_file) for frame_file in frame_files]
    print(processed_texts)

    # One caption per line
    string_list = '\n'.join(processed_texts)

    # Extract audio from video, then get a description of its content
    extracted_audio = extract_audio(video_in)
    print(extracted_audio)
    audio_content_described = get_salmonn(extracted_audio)

    # Assemble captions
    formatted_captions = (
        f"\n### Visual events:\n{string_list}\n ### Audio events:\n{audio_content_described}\n"
    )
    print(formatted_captions)

    # Send formatted captions to LLM
    return llm_process(formatted_captions)
# Gradio UI: a video input, a submit button, and a textbox that receives the
# description returned by `infer`.
with gr.Blocks() as demo:
    with gr.Column(elem_id="col-container"):
        gr.HTML("""
<h2 style="text-align: center;">Video description</h2>
""")
        video_in = gr.Video(label="Video input")
        submit_btn = gr.Button("Submit")
        video_description = gr.Textbox(label="Video description")
    # Run the full pipeline when the button is clicked.
    submit_btn.click(
        fn=infer,
        inputs=[video_in],
        outputs=[video_description],
    )

# Enable the request queue, then start the app.
demo.queue().launch()