from .payload_model import SingleInferencePayload, VideoInferencePayload

from transformers import AutoModelForVision2Seq, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
from pydantic import BaseModel
from typing import Optional


class Qwen2_5(BaseModel):
    """Wrapper around a Qwen2.5-VL checkpoint for single-image and video inference."""

    model: Optional[AutoModelForVision2Seq] = None
    tokenizer: Optional[AutoTokenizer] = None
    processor: Optional[AutoProcessor] = None

    # Hugging Face model/tokenizer/processor objects are not pydantic types,
    # so arbitrary types must be allowed on this model.
    model_config = {
        "arbitrary_types_allowed": True,
        "from_attributes": True,
    }

    def __init__(self, model_path: str):
        super().__init__()
        self.model = AutoModelForVision2Seq.from_pretrained(
            model_path,
            torch_dtype="auto",
            device_map="auto",
        )
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.processor = AutoProcessor.from_pretrained(model_path)

    def prepare_single_inference(self, image: str, question: str):
        """Build model inputs from a base64-encoded image and a question."""
        image = f"data:image;base64,{image}"
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "image": image,
                    },
                    {
                        "type": "text",
                        "text": question,
                    },
                ],
            }
        ]
        text = self.processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = self.processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        # Move the batch onto the device the model was loaded to by device_map.
        inputs = inputs.to(self.model.device)
        return inputs

    def prepare_video_inference(self, video: list[str], question: str):
        """Build model inputs from a list of base64-encoded frames and a question."""
        base64_frames = [f"data:image;base64,{frame}" for frame in video]
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "video",
                        "video": base64_frames,
                    },
                    {
                        "type": "text",
                        "text": question,
                    },
                ],
            }
        ]
        text = self.processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = self.processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            fps=1.0,
            padding=True,
            return_tensors="pt",
        )
        inputs = inputs.to(self.model.device)
        return inputs

    def _generate(self, inputs, max_new_tokens: int = 128) -> list[str]:
        """Run generation and decode only the newly generated tokens."""
        generated_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
        # Strip the prompt tokens from each sequence so only the completion
        # is decoded.
        generated_ids_trimmed = [
            out_ids[len(in_ids):]
            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        return self.processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )

    def get_single_inference(self, payload: SingleInferencePayload):
        """Answer a question about a single base64-encoded image."""
        try:
            inputs = self.prepare_single_inference(payload.image_path, payload.question)
            output_text = self._generate(inputs)
            print(f"Model generated text: {output_text}")
            return {"message": output_text, "status": 200}
        except Exception as e:
            return {"message": str(e), "status": 500}

    def get_video_inference(self, payload: VideoInferencePayload):
        """Answer a question about a video given as base64-encoded frames."""
        try:
            inputs = self.prepare_video_inference(payload.video_path, payload.question)
            output_text = self._generate(inputs)
            print(f"Model generated text: {output_text}")
            return {"message": output_text, "status": 200}
        except Exception as e:
            return {"message": str(e), "status": 500}
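

# --- Example usage (a minimal sketch, not part of the module's API) ----------
# Assumptions: the checkpoint ID, image file, and question below are
# placeholders; SingleInferencePayload is assumed to expose the `image_path`
# (base64 string) and `question` fields that get_single_inference reads.
# Kept as comments because the relative import above prevents running this
# file directly as a script.
#
#     import base64
#     from .payload_model import SingleInferencePayload
#
#     vlm = Qwen2_5("Qwen/Qwen2.5-VL-7B-Instruct")   # assumed checkpoint path
#     with open("example.jpg", "rb") as f:           # placeholder image file
#         image_b64 = base64.b64encode(f.read()).decode("utf-8")
#     payload = SingleInferencePayload(
#         image_path=image_b64,
#         question="What is in this picture?",
#     )
#     result = vlm.get_single_inference(payload)
#     print(result["status"], result["message"])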