fffiloni's picture
Update app.py
39ff4a4 verified
raw
history blame
7.59 kB
import spaces
import gradio as gr
from gradio_client import Client
client = Client("https://vikhyatk-moondream1.hf.space/")
import cv2
from moviepy.editor import *
# 1. extract and store one frame out of every `interval` frames of the video input
# 2. extract the audio track
# 3. for each extracted frame, get a caption from the captioning model and collect the captions in a list
# 4. for the audio, ask an audio question-answering model to describe the sound/scene
# 5. give everything to the LLM and ask it to summarize the video from the frame captions combined with the audio description
import re
import torch
from transformers import pipeline
# Identifier of the chat model used to merge the frame captions and the audio
# description into one short summary.
zephyr_model = "HuggingFaceH4/zephyr-7b-beta"
# Text-generation pipeline, created once at import time with bfloat16 weights;
# device placement is delegated to `device_map="auto"`.
pipe = pipeline("text-generation", model=zephyr_model, torch_dtype=torch.bfloat16, device_map="auto")
# System prompt used by `llm_process`. It is an f-string without placeholders,
# so the text below is passed to the model verbatim.
standard_sys = f"""
You will be provided a list of visual details observed at regular intervals, along with an audio description. These pieces of information originate from a single video. The visual details are extracted from the video at fixed time intervals and represent consecutive frames. Typically, the video consists of a brief sequence showing one or more subjects...
Please note that the following list of image descriptions (visual details) was obtained by extracting individual frames from a continuous video featuring one or more subjects. Depending on the case, all depicted individuals may correspond to the same person(s), with minor variations due to changes in lighting, angle, and facial expressions over time. Regardless, assume temporal continuity among the frames unless otherwise specified.
Audio events are actually the entire scene description based only on the audio of the video. Your job is to integrate these multimodal inputs intelligently and provide a very short resume about what is happening in the origin video. Provide a succinct overview of what you understood.
"""
def trim_video(input_path, max_duration=10):
    """Limit a video to its first ``max_duration`` seconds.

    Args:
    - input_path: path of the video file to inspect
    - max_duration: maximum length, in seconds, of the returned video (default: 10)

    Returns:
    ``input_path`` unchanged when the video is not longer than ``max_duration``,
    otherwise the path of a trimmed copy written to "video_cut_10.mp4".
    """
    output_path = "video_cut_10.mp4"
    video_clip = VideoFileClip(input_path)
    try:
        if video_clip.duration <= max_duration:
            return input_path
        trimmed_clip = video_clip.subclip(0, max_duration)
        trimmed_clip.write_videofile(output_path, audio_codec='aac')
        return output_path
    finally:
        # Always release the readers opened by the clip, on every exit path,
        # so repeated uploads do not accumulate open file handles/processes.
        video_clip.close()
def extract_frames(video_in, interval=24, output_format='.jpg'):
    """Extract frames from a video at a specified interval and save them as images.

    Args:
    - video_in: string or path-like object pointing to the video file
    - interval: integer specifying how many frames apart to extract images (default: 24)
    - output_format: string indicating desired format for saved images (default: '.jpg')

    Returns:
    A list of strings containing paths to saved images, named
    ``frame_<index><output_format>`` and written to the current working
    directory. The list is empty when no frame could be read.

    Raises:
    ValueError: if ``interval`` is smaller than 1.
    """
    # Reject invalid intervals up front instead of failing later with a bare
    # ZeroDivisionError (interval == 0) or producing negative frame indices.
    if interval < 1:
        raise ValueError(f"interval must be a positive integer, got {interval!r}")

    vidcap = cv2.VideoCapture(video_in)
    frames = []
    count = 0
    try:
        # Read frames sequentially until the capture reports no more data.
        while True:
            success, image = vidcap.read()
            if not success:
                break
            # Keep one frame out of every `interval` frames.
            if count % interval == 0:
                filename = f'frame_{count // interval}{output_format}'
                frames.append(filename)
                cv2.imwrite(filename, image)
                print(f'Saved {filename}')
            count += 1
    finally:
        # Release the capture even if writing a frame raised.
        vidcap.release()
    print('Done extracting frames!')
    return frames
# Disabled alternative: the string literal below holds code that would load the
# moondream2 model and tokenizer locally. As a bare string it is never executed.
'''
from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image
model_id = "vikhyatk/moondream2"
revision = "2024-03-06"
model = AutoModelForCausalLM.from_pretrained(
model_id, trust_remote_code=True, revision=revision
)
tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
'''
#@spaces.GPU()
def process_image(image_in):
    """Caption one image through the remote captioning Space.

    Args:
    - image_in: file path of the image to describe

    Returns:
    The answer returned by the module-level ``client`` for the
    "/answer_question" endpoint.
    """
    question = "Describe precisely the image in one sentence."
    # Positional arguments map to the 'image' and 'Question' inputs of the endpoint.
    caption = client.predict(image_in, question, api_name="/answer_question")
    print(caption)
    return caption
# Disabled alternative for `process_image` that would caption the image with a
# locally loaded `model`/`tokenizer`. As a bare string it is never executed.
'''
image = Image.open(image_in)
enc_image = model.encode_image(image)
result = model.answer_question(enc_image, "Describe the image in one sentence.", tokenizer)
print(result)
return result
'''
def extract_audio(video_path):
    """Write the audio track of a video to "output_audio.mp3".

    Args:
    - video_path: path of the video file to read

    Returns:
    The path of the written audio file ("output_audio.mp3").

    Raises:
    ValueError: if the video has no audio track.
    """
    output_path = "output_audio.mp3"
    video_clip = VideoFileClip(video_path)
    try:
        audio_clip = video_clip.audio
        # A silent video exposes no audio clip; fail with an explicit message
        # instead of an AttributeError on None.
        if audio_clip is None:
            raise ValueError(f"No audio track found in video: {video_path}")
        audio_clip.write_audiofile(output_path)
    finally:
        # Release the readers opened by the clip on every exit path.
        video_clip.close()
    return output_path
def get_salmonn(audio_in):
    """Ask the SALMONN Space to describe an audio file.

    Args:
    - audio_in: file path of the audio to describe

    Returns:
    The answer returned by the "/gradio_answer" endpoint.
    """
    # A dedicated client is created on each call; it is local to this function
    # and independent from the module-level captioning client.
    salmonn_client = Client("fffiloni/SALMONN-7B-gradio")
    question = "Please describe the audio"
    beam_search_numbers = 4  # 'beam search numbers' slider (1 to 10)
    temperature = 1          # 'temperature' slider (0.8 to 2.0)
    top_p = 0.9              # 'top p' slider (0.1 to 1.0)
    description = salmonn_client.predict(
        audio_in,
        question,
        beam_search_numbers,
        temperature,
        top_p,
        api_name="/gradio_answer",
    )
    print(description)
    return description
@spaces.GPU()
def llm_process(user_prompt):
    """Generate a short video description from the assembled captions.

    Args:
    - user_prompt: text holding the visual and audio captions to summarize

    Returns:
    The text generated by the model, without the prompt and without leading
    newlines.
    """
    # Zephyr chat format: every turn is closed by </s> and the prompt ends with
    # the assistant tag so that the model answers instead of continuing the
    # user turn.
    prompt = (
        f"<|system|>\n{standard_sys}</s>\n"
        f"<|user|>\n{user_prompt}</s>\n"
        "<|assistant|>\n"
    )
    # return_full_text=False yields only the newly generated text, so the
    # result no longer depends on finding an "<|assistant|>" marker in the
    # output to strip the echoed prompt.
    outputs = pipe(
        prompt,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
        return_full_text=False,
    )
    cleaned_text = outputs[0]["generated_text"]
    print(f"SUGGESTED video description: {cleaned_text}")
    return cleaned_text.lstrip("\n")
def infer(video_in):
    """Describe a video by combining frame captions with an audio description.

    Args:
    - video_in: path of the video file to analyze

    Returns:
    The description produced by ``llm_process`` from the assembled captions.

    Raises:
    gr.Error: if no video was provided.
    """
    # The Submit button can be clicked before any video is available; report
    # that to the user instead of failing deep inside the frame extraction.
    if not video_in:
        raise gr.Error("Please upload a video before submitting.")

    # Caption every extracted frame, in order.
    frame_files = extract_frames(video_in)
    processed_texts = [process_image(frame_file) for frame_file in frame_files]
    print(processed_texts)
    # One caption per line.
    string_list = '\n'.join(processed_texts)

    # Describe the audio track.
    extracted_audio = extract_audio(video_in)
    print(extracted_audio)
    audio_content_described = get_salmonn(extracted_audio)

    # Assemble both modalities into the prompt sent to the LLM.
    formatted_captions = (
        f"\n### Visual events:\n{string_list}\n"
        f" ### Audio events:\n{audio_content_described}\n"
    )
    print(formatted_captions)

    video_description_from_llm = llm_process(formatted_captions)
    return video_description_from_llm
# Custom CSS: centers the main column (max width 720px) and enlarges the text
# of the description textbox.
css = """
div#col-container{
margin: 0 auto;
max-width: 720px;
}
div#video-text textarea {
font-size: 20px;
line-height: 1.2em;
font-weight: 600;
}
"""
# Build the Gradio interface.
with gr.Blocks(css=css) as demo :
    with gr.Column(elem_id="col-container"):
        gr.HTML("""
<h2 style="text-align: center;">Soft video understanding</h2>
""")
        video_in = gr.Video(label="Video input")
        video_cut = gr.Video(label="Video cut")
        submit_btn = gr.Button("Submit")
        video_description = gr.Textbox(label="Video description", elem_id="video-text")
    # On upload, pass the video through `trim_video` and show the result in
    # the "Video cut" component.
    video_in.upload(
        fn = trim_video,
        inputs = [video_in],
        outputs = [video_cut]
    )
    # On submit, run the full pipeline on the "Video cut" value (not on the
    # raw upload) and display the resulting description.
    submit_btn.click(
        fn = infer,
        inputs = [video_cut],
        outputs = [video_description]
    )
# Enable the request queue and start the app.
demo.queue().launch()