OmAgentVideoUnderstanding

Sleeping

File size: 6,412 Bytes

1b7e88c

from typing import Dict, List, Optional, Tuple, Union

import cv2
from omagent_core.utils.logger import logging
from PIL import Image
from pydantic import BaseModel
from pydub import AudioSegment
from pydub.effects import normalize
from scenedetect import (ContentDetector, FrameTimecode, SceneManager,
                         VideoStream, open_video)


class Scene(BaseModel):
    """A single scene of a video: its time span plus optional transcript and summary."""

    start: FrameTimecode
    end: FrameTimecode
    stt_res: Optional[Dict] = None
    summary: Optional[Dict] = None

    class Config:
        """Pydantic settings for this model."""

        arbitrary_types_allowed = True

    @classmethod
    def init(cls, start: FrameTimecode, end: FrameTimecode, summary: dict = None):
        """Build a scene from its boundaries.

        Args:
            start (FrameTimecode): Timecode at which the scene begins.
            end (FrameTimecode): Timecode at which the scene ends.
            summary (dict, optional): Summary to attach to the scene. Defaults to None.

        Returns:
            Scene: A new scene; ``stt_res`` keeps its default value.
        """
        field_values = {"start": start, "end": end, "summary": summary}
        return cls(**field_values)

    @property
    def conversation(self):
        """Return the speech-to-text result in a readable form.

        When ``stt_res`` is a list (the shape produced by a self deployed
        whisper), the ``text`` entry of every item is joined with newlines.
        Any other value is handed back unchanged.
        """
        transcript = self.stt_res
        if not isinstance(transcript, list):
            return transcript
        # List of segments: one line per segment, in order.
        segment_texts = (f"{segment.get('text')}" for segment in transcript)
        return "\n".join(segment_texts)


class VideoScenes(BaseModel):
    """A video together with its detected scenes and (optionally) its audio track.

    Instances are normally created through :meth:`load`. The object can be
    indexed, measured with ``len`` and iterated to access its scenes.
    """

    # Opened video stream used for frame extraction.
    stream: VideoStream
    # Normalized audio track, or None when it could not be loaded.
    audio: Union[AudioSegment, None]
    # Scenes in the order returned by the scene detector.
    scenes: List[Scene]
    # Default spacing between extracted frames, in seconds (it is multiplied
    # by the stream frame rate in ``get_video_frames``).
    frame_extraction_interval: int

    class Config:
        """Configuration for this pydantic object."""

        # ``extra = "allow"`` is what lets ``__iter__`` store its cursor
        # (``self.index``) on the model.
        extra = "allow"
        arbitrary_types_allowed = True

    @classmethod
    def load(
        cls,
        video_path: str,
        threshold: int = 27,
        min_scene_len: int = 1,
        frame_extraction_interval: int = 5,
        show_progress: bool = False,
    ):
        """Load a video file and split it into scenes.

        Args:
            video_path (str): The path of the video file. Only support local file.
            threshold (int): The scene detection threshold.
            min_scene_len (int): Once a cut is detected, this long time must pass before a new one can
                be added to the scene list. Count in seconds, defaults to 1.
            frame_extraction_interval (int): Default spacing, in seconds, between the frames
                returned by ``get_video_frames``. Defaults to 5.
            show_progress (bool, optional): Whether to display the progress bar when processing the video. Defaults to False.

        Returns:
            VideoScenes: The loaded video. ``audio`` is None when the audio
            track could not be read (a warning is logged in that case).
        """
        video = open_video(video_path)
        scene_manager = SceneManager()
        scene_manager.add_detector(
            ContentDetector(
                # The detector is given a length in frames: seconds * fps.
                threshold=threshold, min_scene_len=video.frame_rate * min_scene_len
            )
        )
        scene_manager.detect_scenes(video, show_progress=show_progress)
        scenes = scene_manager.get_scene_list(start_in_scene=True)

        # Audio is best-effort: a video without a readable audio track is
        # still usable for frame extraction.
        try:
            audio = AudioSegment.from_file(video_path)
            audio = normalize(audio)
        except Exception as e:
            logging.warning(f"Failed to load audio from {video_path}: {e}")
            audio = None
        return cls(
            stream=video,
            scenes=[Scene.init(*scene) for scene in scenes],
            audio=audio,
            frame_extraction_interval=frame_extraction_interval,
        )

    def _resolve_scene_range(
        self, scene: Union[int, Scene, Tuple[FrameTimecode, FrameTimecode]]
    ) -> Tuple[FrameTimecode, FrameTimecode]:
        """Translate a scene reference into its ``(start, end)`` timecodes.

        Args:
            scene: Index into ``self.scenes``, a ``Scene`` object, or a
                ``(start, end)`` tuple.

        Raises:
            ValueError: If the type of scene is not int, Scene or tuple.

        Returns:
            Tuple[FrameTimecode, FrameTimecode]: Start and end of the scene.
        """
        if isinstance(scene, int):
            scene = self.scenes[scene]
        if isinstance(scene, Scene):
            return scene.start, scene.end
        if isinstance(scene, tuple):
            start, end = scene
            return start, end
        raise ValueError(
            f"scene should be int, Scene or tuple, not {type(scene).__name__}"
        )

    def get_video_frames(
        self,
        scene: Union[int, Scene, Tuple[FrameTimecode, FrameTimecode]],
        interval: Optional[int] = None,
    ) -> Tuple[List[Image.Image], List[float]]:
        """Get the frames of a scene.

        Frames are sampled every ``interval`` frames. For long scenes the
        interval is widened to ``int(scene_len / 10) + 1`` so that roughly
        ten frames at most are returned.

        Args:
            scene (Union[int, Scene, Tuple[FrameTimecode]]): The scene to get frames. Can be the index of the scene, the scene object or a tuple of start and end frame timecode.
            interval (int, optional): The interval of the frames to get, counted in frames.
                Defaults to None, which means ``frame_extraction_interval`` seconds
                converted to frames with the stream frame rate.
        Raises:
            ValueError: If the type of scene is not int, Scene or tuple.

        Returns:
            Tuple[List[Image.Image], List[float]]: The sampled frames as RGB PIL
            images, and for each of them the stream position in seconds read
            right after the frame was decoded.
        """
        start, end = self._resolve_scene_range(scene)

        if interval is None:
            interval = self.frame_extraction_interval * self.stream.frame_rate
        # The frame rate may be fractional (e.g. 29.97), which would make the
        # interval a non-integer float; ``index % interval == 0`` would then
        # only ever match index 0. Use a whole number of frames, at least 1
        # (which also avoids a modulo by zero).
        step = max(1, int(round(interval)))

        scene_len = end.get_frames() - start.get_frames()
        if scene_len / 10 > step:
            step = int(scene_len / 10) + 1

        frames: List[Image.Image] = []
        time_stamps: List[float] = []
        self.stream.seek(start)
        try:
            for index in range(scene_len):
                if index % step != 0:
                    # Skipped frame: advance the stream without decoding it.
                    self.stream.read(decode=False)
                    continue
                frame = self.stream.read()
                if frame is False:
                    # Nothing could be read for this position.
                    continue
                rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frames.append(Image.fromarray(rgb))
                time_stamps.append(self.stream.position.get_seconds())
        finally:
            # Always rewind so that the next call starts from a known
            # position, even if decoding or conversion failed.
            self.stream.seek(0)
        return frames, time_stamps

    def get_audio_clip(
        self, scene: Union[int, Scene, Tuple[FrameTimecode, FrameTimecode]]
    ) -> Optional[AudioSegment]:
        """Get the audio clip of a scene.

        Args:
            scene (Union[int, Scene, Tuple[FrameTimecode]]): The scene to get audio clip. Can be the index of the scene, the scene object or a tuple of start and end frame timecode.

        Raises:
            ValueError: If the type of scene is not int, Scene or tuple.

        Returns:
            Optional[AudioSegment]: The audio clip of the scene, or None when
            the video has no loaded audio track.
        """
        if self.audio is None:
            return None
        start, end = self._resolve_scene_range(scene)

        # The audio segment is sliced in milliseconds.
        start_ms = int(start.get_seconds() * 1000)
        end_ms = int(end.get_seconds() * 1000)
        return self.audio[start_ms:end_ms]

    def __len__(self):
        """Return the number of scenes."""
        return len(self.scenes)

    def __iter__(self):
        """Start iterating over the scenes.

        The cursor is stored on the instance itself (``self.index``), so
        two loops over the same object share it and cannot be nested.
        """
        self.index = 0
        return self

    def __next__(self):
        """Return the next scene, or raise StopIteration after the last one."""
        if self.index >= len(self.scenes):
            raise StopIteration
        scene = self.scenes[self.index]
        self.index += 1
        return scene

    def __getitem__(self, index):
        """Return the scene(s) at ``index`` of the scene list."""
        return self.scenes[index]

    def __setitem__(self, index, value):
        """Replace the scene(s) at ``index`` of the scene list."""
        self.scenes[index] = value