sreejith8100 committed · verified · Commit 9c2bfc6 · Parent(s): 7cfa330

Update endpoint_handler.py

Files changed (1):
  1. endpoint_handler.py +86 -86
endpoint_handler.py CHANGED
@@ -1,86 +1,86 @@
- import torch
- from PIL import Image
- from transformers import AutoModel, AutoTokenizer
- from io import BytesIO
- import base64
- from huggingface_hub import login
- import os
-
- class EndpointHandler:
-     def __init__(self, model_dir=None):
-         print("[Init] Initializing EndpointHandler...")
-         self.load_model()
-
-     def load_model(self):
-         hf_token = os.getenv("HF_TOKEN")
-         # Replace with the quantized version if available
-         model_path = "openbmb/MiniCPM-V-2_6-int4"  # Hypothetical quantized version
-
-         if hf_token:
-             print("[Auth] Logging into Hugging Face Hub with token...")
-             login(token=hf_token)
-
-         print(f"[Model Load] Loading quantized model from: {model_path}")
-         try:
-             self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
-             self.model = AutoModel.from_pretrained(
-                 model_path,
-                 trust_remote_code=True,
-             ).eval()
-             print("[Model Load] Quantized model successfully loaded.")
-         except Exception as e:
-             print(f"[Model Load Error] {e}")
-             raise RuntimeError(f"Failed to load quantized model: {e}")
-
-     def load_image(self, image_base64):
-         try:
-             print("[Image Load] Decoding base64 image...")
-             image_bytes = base64.b64decode(image_base64)
-             image = Image.open(BytesIO(image_bytes)).convert("RGB")
-             print("[Image Load] Image successfully decoded and converted to RGB.")
-             return image
-         except Exception as e:
-             print(f"[Image Load Error] {e}")
-             raise ValueError(f"Failed to open image from base64 string: {e}")
-
-     def predict(self, request):
-         print(f"[Predict] Received request: {request}")
-
-         image_base64 = request.get("inputs", {}).get("image")
-         question = request.get("inputs", {}).get("question")
-         stream = request.get("inputs", {}).get("stream", False)
-
-         if not image_base64 or not question:
-             print("[Predict Error] Missing 'image' or 'question' in the request.")
-             return {"error": "Missing 'image' or 'question' in inputs."}
-
-         try:
-             image = self.load_image(image_base64)
-             msgs = [{"role": "user", "content": [image, question]}]
-
-             print(f"[Predict] Asking model with question: {question}")
-             print("[Predict] Starting chat inference...")
-
-             res = self.model.chat(
-                 image=None,
-                 msgs=msgs,
-                 tokenizer=self.tokenizer,
-                 sampling=True,
-                 stream=stream
-             )
-
-             if stream:
-                 for new_text in res:
-                     yield {"output": new_text}
-             else:
-                 generated_text = "".join(res)
-                 print("[Predict] Inference complete.")
-                 return {"output": generated_text}
-
-         except Exception as e:
-             print(f"[Predict Error] {e}")
-             return {"error": str(e)}
-
-     def __call__(self, data):
-         print("[__call__] Invoked handler with data.")
-         return self.predict(data)
 
+ import torch
+ from PIL import Image
+ from transformers import AutoModel, AutoTokenizer
+ from io import BytesIO
+ import base64
+ from huggingface_hub import login
+ import os
+
+ class EndpointHandler:
+     def __init__(self, model_dir=None):
+         print("[Init] Initializing EndpointHandler...")
+         self.load_model()
+
+     def load_model(self):
+         hf_token = os.getenv("HF_TOKEN")
+         # Swap in a quantized build here if one becomes available
+         model_path = "openbmb/MiniCPM-V-4"
+
+         if hf_token:
+             print("[Auth] Logging into Hugging Face Hub with token...")
+             login(token=hf_token)
+
+         print(f"[Model Load] Loading model from: {model_path}")
+         try:
+             self.model = AutoModel.from_pretrained(model_path, trust_remote_code=True,
+                                                    attn_implementation='sdpa', torch_dtype=torch.bfloat16).eval().cuda()
+             print("[Model Load] Model loaded successfully.")
+             self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+             print("[Model Load] Tokenizer loaded successfully.")
+
+         except Exception as e:
+             print(f"[Model Load Error] {e}")
+             raise RuntimeError(f"Failed to load model: {e}")
+
+     def load_image(self, image_base64):
+         try:
+             print("[Image Load] Decoding base64 image...")
+             image_bytes = base64.b64decode(image_base64)
+             image = Image.open(BytesIO(image_bytes)).convert("RGB")
+             print("[Image Load] Image successfully decoded and converted to RGB.")
+             return image
+         except Exception as e:
+             print(f"[Image Load Error] {e}")
+             raise ValueError(f"Failed to open image from base64 string: {e}")
+
+     def predict(self, request):
+         print(f"[Predict] Received request: {request}")
+
+         image_base64 = request.get("inputs", {}).get("image")
+         question = request.get("inputs", {}).get("question")
+         stream = request.get("inputs", {}).get("stream", False)
+
+         if not image_base64 or not question:
+             print("[Predict Error] Missing 'image' or 'question' in the request.")
+             return {"error": "Missing 'image' or 'question' in inputs."}
+
+         try:
+             image = self.load_image(image_base64)
+             msgs = [{"role": "user", "content": [image, question]}]
+
+             print(f"[Predict] Asking model with question: {question}")
+             print("[Predict] Starting chat inference...")
+
+             res = self.model.chat(
+                 image=image,
+                 msgs=msgs,
+                 tokenizer=self.tokenizer,
+                 sampling=True,
+                 stream=stream
+             )
+
+             if stream:
+                 # Return a generator: a bare `yield` here would turn the whole method into a generator and break the non-streaming path below.
+                 return ({"output": new_text} for new_text in res)
+
+             generated_text = "".join(res)
+             print("[Predict] Inference complete.")
+             return {"output": generated_text}
+
+         except Exception as e:
+             print(f"[Predict Error] {e}")
+             return {"error": str(e)}
+
+     def __call__(self, data):
+         print("[__call__] Invoked handler with data.")
+         return self.predict(data)
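
For local verification, a minimal smoke test of the updated handler might look like the sketch below. The payload shape (inputs.image / inputs.question / inputs.stream) mirrors what predict reads above; the image path "test.jpg" is a placeholder and not part of the commit, and a CUDA-capable GPU is assumed because load_model moves the model with .cuda().

import base64
from endpoint_handler import EndpointHandler

# Encode a local test image as base64, matching what load_image() expects.
with open("test.jpg", "rb") as f:  # placeholder path
    image_b64 = base64.b64encode(f.read()).decode("utf-8")

handler = EndpointHandler()

# Non-streaming call: returns {"output": "..."} or {"error": "..."}.
result = handler({"inputs": {"image": image_b64, "question": "What is in this picture?", "stream": False}})
print(result)

# Streaming call: returns a generator of {"output": chunk} dicts.
for chunk in handler({"inputs": {"image": image_b64, "question": "Describe the image.", "stream": True}}):
    print(chunk["output"], end="", flush=True)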