Spaces:

Pome7o
/

OmAgent

Sleeping

OmAgent / omagent_core /services /handlers /video_scenes.py

韩宇

init

1b7e88c 4 months ago

6.41 kB

	from typing import Dict, List, Optional, Tuple, Union

	import cv2
	from omagent_core.utils.logger import logging
	from PIL import Image
	from pydantic import BaseModel
	from pydub import AudioSegment
	from pydub.effects import normalize
	from scenedetect import (ContentDetector, FrameTimecode, SceneManager,
	VideoStream, open_video)


	class Scene(BaseModel):
	    """One scene of a video: its boundaries plus optional analysis results."""

	    # Boundaries of the scene as scenedetect timecodes.
	    start: FrameTimecode
	    end: FrameTimecode
	    # Speech-to-text output for the scene.
	    # NOTE(review): annotated as a dict, but ``conversation`` also handles a
	    # list of dict-like items -- confirm the intended type with the producer.
	    stt_res: Optional[Dict] = None
	    # Summary data attached to the scene, if any.
	    summary: Optional[Dict] = None

	    class Config:
	        """Configuration for this pydantic object."""

	        arbitrary_types_allowed = True

	    @classmethod
	    def init(cls, start: FrameTimecode, end: FrameTimecode, summary: dict = None):
	        """Build a scene from its boundaries.

	        Args:
	            start (FrameTimecode): Start of the scene.
	            end (FrameTimecode): End of the scene.
	            summary (dict, optional): Summary to attach. Defaults to None.

	        Returns:
	            Scene: The new scene; ``stt_res`` keeps its default of None.
	        """
	        field_values = {"start": start, "end": end, "summary": summary}
	        return cls(**field_values)

	    @property
	    def conversation(self):
	        """Return the speech-to-text result in a printable form.

	        A list result is flattened to one line per item, using the ``text``
	        entry of each item; any other value is returned unchanged.
	        """
	        transcript = self.stt_res
	        if not isinstance(transcript, list):
	            return transcript
	        # for self deployed whisper
	        spoken_lines = [f"{item.get('text', None)}" for item in transcript]
	        return "\n".join(spoken_lines)


	class VideoScenes(BaseModel):
	    """A video together with its detected scenes and (optional) audio track.

	    Instances support ``len()``, indexing, item assignment and iteration
	    over the contained ``Scene`` objects.
	    """

	    # Opened video stream used for frame extraction.
	    stream: VideoStream
	    # Normalized audio of the file, or None when it could not be loaded.
	    audio: Union[AudioSegment, None]
	    # Scenes found by the detector, in the order it returned them.
	    scenes: List[Scene]
	    # Default sampling factor; multiplied by the stream frame rate in
	    # ``get_video_frames`` to obtain a frame interval.
	    frame_extraction_interval: int

	    class Config:
	        """Configuration for this pydantic object."""

	        # NOTE(review): presumably required so ``__iter__`` can store its
	        # ``index`` cursor on the instance -- confirm before tightening.
	        extra = "allow"
	        arbitrary_types_allowed = True

	    @classmethod
	    def load(
	        cls,
	        video_path: str,
	        threshold: int = 27,
	        min_scene_len: int = 1,
	        frame_extraction_interval: int = 5,
	        show_progress: bool = False,
	    ):
	        """Load a video file.

	        Runs content-based scene detection on the video and tries to load
	        and normalize its audio. If the audio cannot be loaded a warning is
	        logged and ``audio`` is set to None.

	        Args:
	            video_path (str): The path of the video file. Only support local file.
	            threshold (int): The scene detection threshold.
	            min_scene_len (int): Once a cut is detected, this long time must pass before a new one can
	                be added to the scene list. Count in seconds, defaults to 1.
	            frame_extraction_interval (int): Stored on the instance and used by
	                ``get_video_frames`` as the default sampling factor (multiplied
	                by the frame rate). Defaults to 5.
	            show_progress (bool, optional): Whether to display the progress bar when processing the video. Defaults to False.

	        Returns:
	            VideoScenes: The loaded video with its scenes and audio.
	        """
	        video = open_video(video_path)
	        scene_manager = SceneManager()
	        scene_manager.add_detector(
	            ContentDetector(
	                # min_scene_len is given in seconds; the detector gets frames.
	                threshold=threshold, min_scene_len=video.frame_rate * min_scene_len
	            )
	        )
	        scene_manager.detect_scenes(video, show_progress=show_progress)
	        scenes = scene_manager.get_scene_list(start_in_scene=True)

	        # Audio is best-effort: a video without a decodable audio track is
	        # still usable for frame extraction.
	        try:
	            audio = AudioSegment.from_file(video_path)
	            audio = normalize(audio)
	        except Exception as e:
	            logging.warning(f"Failed to load audio from {video_path}: {e}")
	            audio = None
	        return cls(
	            stream=video,
	            scenes=[Scene.init(*scene) for scene in scenes],
	            audio=audio,
	            frame_extraction_interval=frame_extraction_interval,
	        )

	    def _scene_bounds(
	        self, scene: Union[int, Scene, Tuple[FrameTimecode]]
	    ) -> Tuple[FrameTimecode, FrameTimecode]:
	        """Resolve a scene reference to its ``(start, end)`` timecodes.

	        Args:
	            scene (Union[int, Scene, Tuple[FrameTimecode]]): Index into
	                ``self.scenes``, a scene object, or a ``(start, end)`` tuple.

	        Raises:
	            ValueError: If the type of scene is not int, Scene or tuple.

	        Returns:
	            Tuple[FrameTimecode, FrameTimecode]: Start and end of the scene.
	        """
	        if isinstance(scene, int):
	            scene = self.scenes[scene]
	            return scene.start, scene.end
	        if isinstance(scene, Scene):
	            return scene.start, scene.end
	        if isinstance(scene, tuple):
	            start, end = scene
	            return start, end
	        raise ValueError(
	            f"scene should be int, Scene or tuple, not {type(scene).__name__}"
	        )

	    def get_video_frames(
	        self,
	        scene: Union[int, Scene, Tuple[FrameTimecode]],
	        interval: Optional[int] = None,
	    ) -> Tuple[List[Image.Image], List[float]]:
	        """Get the frames of a scene.

	        Every ``interval``-th frame of the scene is decoded, converted from
	        BGR to RGB and returned as a PIL image. For long scenes the interval
	        is enlarged so that at most ten frames are sampled. The stream is
	        rewound to position 0 afterwards.

	        Args:
	            scene (Union[int, Scene, Tuple[FrameTimecode]]): The scene to get frames. Can be the index of the scene, the scene object or a tuple of start and end frame timecode.
	            interval (int, optional): The interval, in frames, between sampled
	                frames. Defaults to None, meaning
	                ``frame_extraction_interval * stream.frame_rate``. The value is
	                rounded to a whole number of frames and clamped to at least 1.

	        Raises:
	            ValueError: If the type of scene is not int, Scene or tuple.

	        Returns:
	            Tuple[List[Image.Image], List[float]]: The sampled frames and, for
	            each of them, the stream position in seconds right after it was read.
	        """
	        start, end = self._scene_bounds(scene)
	        self.stream.seek(start)
	        frames = []
	        time_stamps = []
	        if interval is None:
	            interval = self.frame_extraction_interval * self.stream.frame_rate
	        scene_len = end.get_frames() - start.get_frames()
	        # Cap the number of sampled frames for long scenes.
	        if scene_len / 10 > interval:
	            interval = int(scene_len / 10) + 1
	        # The frame rate is generally a float (e.g. 29.97), which makes the
	        # default interval fractional; ``index % interval == 0`` would then
	        # only ever match index 0. Work with a positive whole frame count.
	        interval = max(1, int(round(interval)))
	        for index in range(scene_len):
	            if index % interval == 0:
	                f = self.stream.read()
	                if f is False:
	                    # No frame could be read at this position; skip it.
	                    continue
	                f = cv2.cvtColor(f, cv2.COLOR_BGR2RGB)
	                frames.append(Image.fromarray(f))
	                time_stamps.append(self.stream.position.get_seconds())
	            else:
	                # Advance past the frame without paying for a decode.
	                self.stream.read(decode=False)
	        self.stream.seek(0)
	        return frames, time_stamps

	    def get_audio_clip(
	        self, scene: Union[int, Scene, Tuple[FrameTimecode]]
	    ) -> Optional[AudioSegment]:
	        """Get the audio clip of a scene.

	        Args:
	            scene (Union[int, Scene, Tuple[FrameTimecode]]): The scene to get audio clip. Can be the index of the scene, the scene object or a tuple of start and end frame timecode.

	        Raises:
	            ValueError: If the type of scene is not int, Scene or tuple.

	        Returns:
	            Optional[AudioSegment]: The audio clip of the scene, or None when
	            no audio was loaded for this video.
	        """
	        if self.audio is None:
	            return None
	        start, end = self._scene_bounds(scene)

	        # The audio is sliced by millisecond offsets.
	        return self.audio[
	            int(start.get_seconds() * 1000) : int(end.get_seconds() * 1000)
	        ]

	    def __len__(self):
	        """Return the number of scenes."""
	        return len(self.scenes)

	    def __iter__(self):
	        """Start iterating over the scenes.

	        The cursor is stored on the instance itself, so starting a second
	        iteration over the same object resets the first one.
	        """
	        self.index = 0
	        return self

	    def __next__(self):
	        """Return the next scene, or raise StopIteration when exhausted."""
	        if self.index >= len(self.scenes):
	            raise StopIteration
	        scene = self.scenes[self.index]
	        self.index += 1
	        return scene

	    def __getitem__(self, index):
	        """Return the scene(s) at ``index``."""
	        return self.scenes[index]

	    def __setitem__(self, index, value):
	        """Replace the scene(s) at ``index`` with ``value``."""
	        self.scenes[index] = value