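"""Qwen2.5-VL wrapper exposing single-image and frame-list video inference."""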
from .payload_model import SingleInferencePayload, VideoInferencePayload
from transformers import AutoModelForVision2Seq, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
from pydantic import BaseModel
from typing import Optional


class Qwen2_5(BaseModel):
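    """Thin wrapper around a Qwen2.5-VL checkpoint for image and video Q&A.

    Holds Hugging Face objects as plain attributes, so pydantic is configured
    with arbitrary_types_allowed.
    """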
    model: Optional[AutoModelForVision2Seq] = None
    tokenizer: Optional[AutoTokenizer] = None
    processor: Optional[AutoProcessor] = None

    model_config = {
        "arbitrary_types_allowed": True,
        "from_attributes": True
    }

    def __init__(self, model_path: str):
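        """Load model weights, tokenizer, and processor from `model_path`.

        device_map="auto" places the weights on whatever accelerators are
        available.
        """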
        super().__init__()
        self.model = AutoModelForVision2Seq.from_pretrained(
            model_path, torch_dtype="auto", device_map="auto"
        )
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.processor = AutoProcessor.from_pretrained(model_path)

    def prepare_single_inference(self, image: str, question: str):
        """Build processor inputs for a single base64-encoded image and a question."""
        # Wrap the base64 payload in a data URI so qwen_vl_utils can decode it
        image = f"data:image;base64,{image}"
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "image": image,
                    },
                    {
                        "type": "text",
                        "text": question
                    },
                ],
            }
        ]
        text = self.processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
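        # qwen_vl_utils resolves the image/video entries in `messages`
        # (URLs, file paths, or data URIs) into objects the processor accepts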
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = self.processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        inputs = inputs.to("cuda")

        return inputs
    
    def prepare_video_inference(self, video: list[str], question: str):
        """Build processor inputs for a video supplied as a list of base64-encoded frames."""
        # Wrap each base64-encoded frame in a data URI so qwen_vl_utils can decode it
        base64_videos = [f"data:image;base64,{frame}" for frame in video]
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "video",
                        "video": base64_videos,
                    },
                    {
                        "type": "text",
                        "text": question
                    },
                ],
            }
        ]
        text = self.processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = self.processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            fps=1.0,
            padding=True,
            return_tensors="pt",
        )
        inputs = inputs.to("cuda")
        return inputs

    def get_single_inference(self, payload: SingleInferencePayload):
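        """Generate an answer for one image and return an HTTP-style dict.

        Note: payload.image_path is expected to contain base64-encoded image
        data, not a filesystem path (see prepare_single_inference).
        """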
        try:
            processed_inputs = self.prepare_single_inference(payload.image_path, payload.question)
            generated_ids = self.model.generate(**processed_inputs, max_new_tokens=128)
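            # Slice off the prompt tokens so only the newly generated tokens are decoded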
            generated_ids_trimmed = [
                out_ids[len(in_ids) :] for in_ids, out_ids in zip(processed_inputs.input_ids, generated_ids)
            ]
            output_text = self.processor.batch_decode(
                generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
            )
            print(f"Model generated text: {output_text}")
            return {
                "message": output_text,
                "status": 200
            }
        except Exception as e:
            return {
                "message": str(e),
                "status": 500
            }

    def get_video_inference(self, payload: VideoInferencePayload):
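        """Generate an answer for a video and return an HTTP-style dict.

        Note: payload.video_path is expected to be a list of base64-encoded
        frames (see prepare_video_inference), not a filesystem path.
        """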
        try:
            processed_inputs = self.prepare_video_inference(payload.video_path, payload.question)
            generated_ids = self.model.generate(**processed_inputs, max_new_tokens=128)
            generated_ids_trimmed = [
                out_ids[len(in_ids) :] for in_ids, out_ids in zip(processed_inputs.input_ids, generated_ids)
            ]
            output_text = self.processor.batch_decode(
                generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
            )
            print(f"Model generated text: {output_text}")
            return {
                "message": output_text,
                "status": 200
            }
        except Exception as e:
            return {
                "message": str(e),
                "status": 500
            }
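

# A minimal usage sketch, not part of the service itself. The checkpoint name
# and image file below are illustrative placeholders, and the payload fields
# mirror how get_single_inference consumes them above (base64 data in
# `image_path`).
if __name__ == "__main__":
    import base64

    qwen = Qwen2_5("Qwen/Qwen2.5-VL-7B-Instruct")  # assumed checkpoint name
    with open("example.jpg", "rb") as f:  # placeholder image file
        image_b64 = base64.b64encode(f.read()).decode("utf-8")
    payload = SingleInferencePayload(
        image_path=image_b64,  # base64 image data, as the class expects
        question="What is in this image?",
    )
    print(qwen.get_single_inference(payload))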