from typing import Dict, Any

import base64
import io

import requests
import torch
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info


class EndpointHandler:
    def __init__(self, path=""):
        # device_map="auto" lets accelerate place the weights itself, so the
        # model must NOT be moved again with .to() afterwards; doing both can
        # raise a RuntimeError on dispatched models. We only remember which
        # device the *inputs* need to be sent to.
        self.model = Qwen2VLForConditionalGeneration.from_pretrained(
            path, torch_dtype="auto", device_map="auto"
        )
        self.device = self.model.device
        self.processor = AutoProcessor.from_pretrained(path)
        # Optionally, adjust min_pixels and max_pixels to trade visual detail
        # for memory:
        # min_pixels = 256 * 28 * 28
        # max_pixels = 1280 * 28 * 28
        # self.processor = AutoProcessor.from_pretrained(
        #     path, min_pixels=min_pixels, max_pixels=max_pixels
        # )

    def __call__(self, data: Any) -> Dict[str, Any]:
        """
        Args:
            data (Any): The input data, which can be:
                - Binary image data in the request body.
                - A dictionary with 'image' and 'text' keys:
                    - 'image': Base64-encoded image string or image URL.
                    - 'text': The text prompt.

        Returns:
            Dict[str, Any]: The generated text output from the model.
        """
        default_prompt = "Describe this image."

        # Accept either raw image bytes or a dict payload.
        if isinstance(data, (bytes, bytearray)):
            image = Image.open(io.BytesIO(data)).convert("RGB")
            text_input = default_prompt
        elif isinstance(data, dict):
            image_input = data.get("image")
            text_input = data.get("text", default_prompt)
            if image_input is None:
                return {"error": "No image provided."}
            if image_input.startswith("http"):
                # Fetch the image from a URL.
                response = requests.get(image_input, timeout=10)
                response.raise_for_status()
                image = Image.open(io.BytesIO(response.content)).convert("RGB")
            else:
                # Otherwise treat the value as a base64-encoded image.
                image_data = base64.b64decode(image_input)
                image = Image.open(io.BytesIO(image_data)).convert("RGB")
        else:
            return {
                "error": "Invalid input data. Expected binary image data "
                "or a dictionary with an 'image' key."
            }

        # Build a single-turn chat message in the Qwen2-VL format.
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": text_input},
                ],
            }
        ]

        # Render the chat template and extract the vision inputs.
        text = self.processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = self.processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        inputs = inputs.to(self.device)

        # Run generation without tracking gradients (inference only).
        with torch.no_grad():
            generated_ids = self.model.generate(**inputs, max_new_tokens=128)

        # Strip the prompt tokens so only newly generated tokens are decoded.
        generated_ids_trimmed = [
            out_ids[len(in_ids):]
            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = self.processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )
        return {"generated_text": output_text[0]}
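
# ---------------------------------------------------------------------------
# Usage sketch. This assumes the file is deployed as the custom `handler.py`
# of a Hugging Face Inference Endpoint and that `path` points at a Qwen2-VL
# checkpoint; "Qwen/Qwen2-VL-7B-Instruct" and "example.jpg" below are
# placeholder assumptions, not part of the handler itself.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    my_handler = EndpointHandler(path="Qwen/Qwen2-VL-7B-Instruct")

    # Dict payload: base64-encoded image plus a custom prompt.
    with open("example.jpg", "rb") as f:
        encoded = base64.b64encode(f.read()).decode("utf-8")
    print(my_handler({"image": encoded, "text": "What is in this picture?"}))

    # Raw bytes fall back to the default prompt ("Describe this image.").
    with open("example.jpg", "rb") as f:
        print(my_handler(f.read()))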