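"""Qwen2.5-VL wrapper: builds image/video chat inputs and runs generation for
single-image and video inference payloads."""
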
from typing import Optional

from pydantic import BaseModel
from qwen_vl_utils import process_vision_info
from transformers import AutoModelForVision2Seq, AutoProcessor, AutoTokenizer

from .payload_model import SingleInferencePayload, VideoInferencePayload


class Qwen2_5(BaseModel):
    model: Optional[AutoModelForVision2Seq] = None
    tokenizer: Optional[AutoTokenizer] = None
    processor: Optional[AutoProcessor] = None

    model_config = {
        "arbitrary_types_allowed": True,
        "from_attributes": True
    }

    def __init__(self, model_path: str):
        super().__init__()
        self.model = AutoModelForVision2Seq.from_pretrained(
            model_path, torch_dtype="auto", device_map="auto"
        )
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.processor = AutoProcessor.from_pretrained(model_path)

    def prepare_single_inference(self, image: str, question: str):
        """Build processor inputs for a single base64-encoded image and a question."""
        image = f"data:image;base64,{image}"
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "image": image,
                    },
                    {
                        "type": "text",
                        "text": question
                    },
                ],
            }
        ]
        text = self.processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = self.processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        # Assumes a CUDA device is available for inference
        inputs = inputs.to("cuda")
        return inputs

    def prepare_video_inference(self, video: list[str], question: str):
        """Build processor inputs for a list of base64-encoded video frames and a question."""
        base64_videos = []
        for frame in video:
            base64_videos.append(f"data:image;base64,{frame}")
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "video",
                        "video": base64_videos,
                    },
                    {
                        "type": "text",
                        "text": question
                    },
                ],
            }
        ]
        text = self.processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = self.processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            fps=1.0,
            padding=True,
            return_tensors="pt",
        )
        inputs = inputs.to("cuda")
        return inputs

    def get_single_inference(self, payload: SingleInferencePayload):
        """Run generation for a single-image payload and return the decoded text."""
        try:
            processed_inputs = self.prepare_single_inference(payload.image_path, payload.question)
            generated_ids = self.model.generate(**processed_inputs, max_new_tokens=128)
            # Drop the prompt tokens so only newly generated tokens are decoded
            generated_ids_trimmed = [
                out_ids[len(in_ids):] for in_ids, out_ids in zip(processed_inputs.input_ids, generated_ids)
            ]
            output_text = self.processor.batch_decode(
                generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
            )
            print(f"Model generated text: {output_text}")
            return {
                "message": output_text,
                "status": 200
            }
        except Exception as e:
            return {
                "message": str(e),
                "status": 500
            }

    def get_video_inference(self, payload: VideoInferencePayload):
        """Run generation for a video payload and return the decoded text."""
        try:
            processed_inputs = self.prepare_video_inference(payload.video_path, payload.question)
            generated_ids = self.model.generate(**processed_inputs, max_new_tokens=128)
            # Drop the prompt tokens so only newly generated tokens are decoded
            generated_ids_trimmed = [
                out_ids[len(in_ids):] for in_ids, out_ids in zip(processed_inputs.input_ids, generated_ids)
            ]
            output_text = self.processor.batch_decode(
                generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
            )
            print(f"Model generated text: {output_text}")
            return {
                "message": output_text,
                "status": 200
            }
        except Exception as e:
            return {
                "message": str(e),
                "status": 500
            }
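

# Usage sketch for a single-image query. The checkpoint name below is an
# assumption (substitute the model path you actually serve), and
# SingleInferencePayload is assumed to be constructible with the image_path /
# question fields that get_single_inference reads above.
if __name__ == "__main__":
    import base64

    qwen = Qwen2_5("Qwen/Qwen2.5-VL-7B-Instruct")  # assumed checkpoint name
    with open("example.jpg", "rb") as f:
        image_b64 = base64.b64encode(f.read()).decode("utf-8")
    payload = SingleInferencePayload(
        image_path=image_b64,
        question="What is shown in this image?",
    )
    print(qwen.get_single_inference(payload))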