# Importing the requirements
import uuid

import torch
from transformers import AutoModel, AutoTokenizer
import spaces

from src.utils import encode_video

# Device for the model
device = "cuda"

# Load the model and tokenizer
model = AutoModel.from_pretrained(
    "openbmb/MiniCPM-V-2_6",
    trust_remote_code=True,
    attn_implementation="sdpa",
    torch_dtype=torch.bfloat16,
)
model = model.to(device=device)
tokenizer = AutoTokenizer.from_pretrained(
    "openbmb/MiniCPM-V-2_6", trust_remote_code=True
)
model.eval()


class _GeneratorPickleHack:
    """Wraps a generator-producing callable so it is picklable in name only:
    __reduce__ stores just an id, and unpickling raises an AssertionError
    instead of reconstructing the wrapped generator."""

    def __init__(self, generator, generator_id=None):
        self.generator = generator
        self.generator_id = (
            generator_id if generator_id is not None else str(uuid.uuid4())
        )

    def __call__(self, *args, **kwargs):
        return self.generator(*args, **kwargs)

    def __reduce__(self):
        return (_GeneratorPickleHack_raise, (self.generator_id,))


def _GeneratorPickleHack_raise(*args, **kwargs):
    raise AssertionError("cannot actually unpickle _GeneratorPickleHack!")


@spaces.GPU()
def describe_video(video, question):
    """
    Describes a video by generating an answer to a given question.

    Args:
        video (str): The path to the video file.
        question (str): The question to be answered about the video.

    Returns:
        The generated answer to the question, yielded as a stream of text
        chunks (the model is called with stream=True).
    """
    # Encode the video frames
    frames = _GeneratorPickleHack(encode_video)(video)

    # Message format for the model
    msgs = [{"role": "user", "content": frames + [question]}]

    # Set decode params for video
    params = {
        "use_image_id": False,
        "max_slice_nums": 1,  # Use 1 if CUDA OOM and video resolution > 448*448
    }

    # Generate the answer
    answer = model.chat(
        image=None,
        msgs=msgs,
        tokenizer=tokenizer,
        sampling=True,
        temperature=0.7,
        stream=True,
        system_prompt=(
            "You are an AI assistant specialized in visual content analysis. "
            "Given a video and a related question, analyze the video thoroughly "
            "and provide a precise and informative answer based on the visible "
            "content. Ensure your response is clear, accurate, and directly "
            "addresses the question."
        ),
        **params,
    )

    # Return the answer
    return answer
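
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the original module): assumes a
# local video file "sample.mp4" exists. Because describe_video returns the
# stream produced by model.chat(stream=True), the answer is consumed chunk by
# chunk as it is generated.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    for chunk in describe_video("sample.mp4", "What happens in this video?"):
        print(chunk, end="", flush=True)
    print()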