# Importing the requirements
import uuid

import torch
from transformers import AutoModel, AutoTokenizer
import spaces

from src.utils import encode_video

# Device for the model
device = "cuda"

# Load the model and tokenizer
model = AutoModel.from_pretrained(
    "openbmb/MiniCPM-V-2_6",
    trust_remote_code=True,
    attn_implementation="sdpa",
    torch_dtype=torch.bfloat16,
)
model = model.to(device=device)
tokenizer = AutoTokenizer.from_pretrained(
    "openbmb/MiniCPM-V-2_6", trust_remote_code=True
)
model.eval()


class _GeneratorPickleHack:
    """Wraps a generator-producing callable so it is picklable in name only:
    __reduce__ stores just an id, and unpickling raises an AssertionError
    instead of reconstructing the wrapped generator."""

    def __init__(self, generator, generator_id=None):
        self.generator = generator
        self.generator_id = (
            generator_id if generator_id is not None else str(uuid.uuid4())
        )

    def __call__(self, *args, **kwargs):
        return self.generator(*args, **kwargs)

    def __reduce__(self):
        return (_GeneratorPickleHack_raise, (self.generator_id,))


def _GeneratorPickleHack_raise(*args, **kwargs):
    raise AssertionError("cannot actually unpickle _GeneratorPickleHack!")


@spaces.GPU()
def describe_video(video, question):
    """
    Describes a video by generating an answer to a given question.

    Args:
        video (str): The path to the video file.
        question (str): The question to be answered about the video.

    Returns:
        The generated answer to the question, yielded as a stream of text
        chunks (the model is called with stream=True).
    """
    # Encode the video frames
    frames = _GeneratorPickleHack(encode_video)(video)

    # Message format for the model
    msgs = [{"role": "user", "content": frames + [question]}]

    # Set decode params for video
    params = {
        "use_image_id": False,
        "max_slice_nums": 1,  # Use 1 if CUDA OOM and video resolution > 448*448
    }

    # Generate the answer
    answer = model.chat(
        image=None,
        msgs=msgs,
        tokenizer=tokenizer,
        sampling=True,
        temperature=0.7,
        stream=True,
        system_prompt=(
            "You are an AI assistant specialized in visual content analysis. "
            "Given a video and a related question, analyze the video thoroughly "
            "and provide a precise and informative answer based on the visible "
            "content. Ensure your response is clear, accurate, and directly "
            "addresses the question."
        ),
        **params,
    )

    # Return the answer
    return answer
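
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the original module): assumes a
# local video file "sample.mp4" exists. Because describe_video returns the
# stream produced by model.chat(stream=True), the answer is consumed chunk by
# chunk as it is generated.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    for chunk in describe_video("sample.mp4", "What happens in this video?"):
        print(chunk, end="", flush=True)
    print()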