import gradio as gr
from gradio_client import Client
import cv2
from moviepy.editor import VideoFileClip

import re
import torch
from transformers import pipeline

zephyr_model = "HuggingFaceH4/zephyr-7b-beta"
# Zephyr-7B is loaded locally; bfloat16 with device_map="auto" assumes a GPU with enough memory.
pipe = pipeline("text-generation", model=zephyr_model, torch_dtype=torch.bfloat16, device_map="auto")

standard_sys = """
You will be provided with a list of visual events and an audio description. All of this information comes from a single video.
The visual events are descriptions of images extracted from the video every 24 frames.
The audio description describes the audio track of the video.
Your job is to use this information to provide a short summary of what is happening in the video.
"""
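# Frames are written to the current working directory as frame_<n> images and their
# file paths are returned so each one can be captioned.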
def extract_frames(video_in, interval=24, output_format='.jpg'):
    """Extract frames from a video at a specified interval and store them in a list.

    Args:
    - video_in: string or path-like object pointing to the video file
    - interval: integer specifying how many frames apart to extract images (default: 24)
    - output_format: string indicating desired format for saved images (default: '.jpg')

    Returns:
    A list of strings containing paths to saved images.
    """
    vidcap = cv2.VideoCapture(video_in)
    frames = []
    count = 0

    while True:
        success, image = vidcap.read()

        if success:
            print('Read a new frame:', success)

            if count % interval == 0:
                filename = f'frame_{count // interval}{output_format}'
                frames.append(filename)
                cv2.imwrite(filename, image)
                print(f'Saved {filename}')

            count += 1
        else:
            break

    vidcap.release()
    print('Done extracting frames!')
    return frames
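# Each extracted frame is captioned remotely through the public moondream1 Space;
# this requires network access and depends on that Space being available.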
def process_image(image_in):
    client = Client("https://vikhyatk-moondream1.hf.space/")
    result = client.predict(
        image_in,
        "Describe precisely the image in one sentence.",
        api_name="/answer_question"
    )
    print(result)
    return result
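# The audio track is dumped to an mp3 in the working directory so it can be sent to SALMONN.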
def extract_audio(video_path):
    video_clip = VideoFileClip(video_path)
    audio_clip = video_clip.audio
    audio_clip.write_audiofile("output_audio.mp3")
    return "output_audio.mp3"
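# The positional arguments below follow the SALMONN-7B Space's /gradio_answer signature:
# audio path, text prompt, then what appear to be num_beams, temperature and top_p
# (the exact parameter names are an assumption based on that demo's interface).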
def get_salmonn(audio_in):
    salmonn_prompt = "Please describe the audio"
    client = Client("fffiloni/SALMONN-7B-gradio")
    result = client.predict(
        audio_in,
        salmonn_prompt,
        4,
        1,
        0.9,
        api_name="/gradio_answer"
    )
    print(result)
    return result
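# The collected captions are summarized locally with Zephyr, using its
# <|system|> / <|user|> / <|assistant|> chat format.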
def llm_process(user_prompt):
    agent_maker_sys = standard_sys

    instruction = f"""
<|system|>
{agent_maker_sys}</s>
<|user|>
"""

    # Close the user turn and open the assistant turn so the cleanup regex below
    # reliably strips everything up to the model's answer.
    prompt = f"{instruction.strip()}\n{user_prompt}</s>\n<|assistant|>"
    outputs = pipe(prompt, max_new_tokens=256, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)
    pattern = r'<\|system\|>(.*?)<\|assistant\|>'
    cleaned_text = re.sub(pattern, '', outputs[0]["generated_text"], flags=re.DOTALL)

    print(f"SUGGESTED video description: {cleaned_text}")
    return cleaned_text.lstrip("\n")
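# Full pipeline: sample frames -> caption each frame -> extract and describe the audio
# -> ask the LLM for a short summary of the whole video.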
def infer(video_in):
    frame_files = extract_frames(video_in)

    processed_texts = []
    for frame_file in frame_files:
        text = process_image(frame_file)
        processed_texts.append(text)
    print(processed_texts)

    string_list = '\n'.join(processed_texts)

    extracted_audio = extract_audio(video_in)
    print(extracted_audio)

    audio_content_described = get_salmonn(extracted_audio)

    formatted_captions = f"""
### Visual events:\n{string_list}\n### Audio events:\n{audio_content_described}
"""
    print(formatted_captions)

    video_description_from_llm = llm_process(formatted_captions)
    return video_description_from_llm
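# Quick local check without the UI (assumes a file named "sample.mp4" exists next to this script):
#   print(infer("sample.mp4"))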
with gr.Blocks() as demo:
    with gr.Column(elem_id="col-container"):
        gr.HTML("""
        <h2 style="text-align: center;">Video description</h2>
        """)
        video_in = gr.Video(label="Video input")
        submit_btn = gr.Button("Submit")
        video_description = gr.Textbox(label="Video description")

        submit_btn.click(
            fn=infer,
            inputs=[video_in],
            outputs=[video_description]
        )

demo.queue().launch()