Lemorra committed
Commit e6cc48e · 1 Parent(s): 674ecbb

🎨 Mimic GenAIOrchestration
app.py CHANGED
@@ -19,7 +19,9 @@ def healthcheck():
 async def inference(payload: PayloadModel, token: str = Depends(authenticate_token)):
     try:
         model_response = await internvl_inference(model, payload)
-        return JSONResponse(status_code=200, content={"status": "ok", "response": model_response})
+        model_response = "True" if model_response else "False"
+        final_response = {"1":{"query_status": model_response}}
+        return JSONResponse(status_code=200, content={"final_response": final_response})
     except Exception as e:
         print(f"Error: {e}")
         return JSONResponse(status_code=500, content={"status": "error", "message": str(e)})
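
For reference, the shape of the success body changes with this commit. A minimal sketch, assuming the model answered "Yes" (the boolean is stringified, and the hard-coded "1" key comes straight from the diff):

# before: {"status": "ok", "response": true}
# after:  {"final_response": {"1": {"query_status": "True"}}}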
internvl_utils.py CHANGED
@@ -6,7 +6,7 @@ from torchvision.transforms.functional import InterpolationMode
 from transformers import AutoConfig
 from models import InternVL3
 from payload_model import PayloadModel
-from models.misc_utils import convert_base64_to_pil
+from models.misc_utils import get_images_using_bbox, get_whole_image
 
 IMAGENET_MEAN = (0.485, 0.456, 0.406)
 IMAGENET_STD = (0.229, 0.224, 0.225)
@@ -75,9 +75,10 @@ def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnai
         processed_images.append(thumbnail_img)
     return processed_images
 
-def load_image(image, input_size=448, max_num=12):
-    # image = Image.open(image_file).convert('RGB')
-    pil_image = convert_base64_to_pil(image)
+def load_image(pil_image, input_size=448, max_num=12):
+    pil_image = pil_image.convert('RGB')
+
+    # pil_image = convert_base64_to_pil(image)
     transform = build_transform(input_size=input_size)
     images = dynamic_preprocess(pil_image, image_size=input_size, use_thumbnail=True, max_num=max_num)
     pixel_values = [transform(image) for image in images]
@@ -114,4 +115,15 @@ def split_model(model_name):
     return device_map
 
 async def internvl_inference(model: InternVL3, payload: PayloadModel):
-    return await model(payload)
+    try:
+        if payload.input_utilization_mode == "bbox":
+            images = get_images_using_bbox(payload)
+        else:
+            images = get_whole_image(payload)
+
+    except Exception as e:
+        raise Exception(f"Error: {e}")
+
+    prompt_keyword = payload.prompt
+    prompt_eval_mode = payload.prompt_eval_mode
+    return await model(images, prompt_keyword, prompt_eval_mode)
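
A minimal usage sketch of the new dispatch, assuming `model` is an initialized InternVL3 instance; the base64 string, prompt keyword, and box coordinates are placeholders:

# Hypothetical payload; field names match payload_model.py below.
payload = PayloadModel(
    input_data=["<base64-encoded image>"],
    prompt="person",
    prompt_eval_mode="or",
    input_utilization_mode="bbox",  # "bbox" crops regions; any other value uses the whole image
    bbox=[[[10, 20, 110, 220]]],    # one image, one [x1, y1, x2, y2] box
)
result = await internvl_inference(model, payload)  # -> bool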
models/InternVL3/intervl3.py CHANGED
@@ -4,7 +4,7 @@ from payload_model import PayloadModel
 from internvl_utils import load_image
 from pydantic import BaseModel, Field
 from typing import Optional
-
+import PIL
 class InternVL3(BaseModel):
     model_name: str
     model: Optional[AutoModel] = None
@@ -43,11 +43,11 @@ respond with "Yes" else respond with "No". Limit your response to either "Yes" o
            query_prompt = None
        return query_prompt
 
-    def predict(self, payload: PayloadModel):
-        pixel_values = load_image(payload.image)
-        query_prompt = self.get_query_prompt(payload.prompt_keyword)
+    def predict(self, pil_image: PIL.Image.Image, prompt_keyword: str):
+        pixel_values = load_image(pil_image)
+        query_prompt = self.get_query_prompt(prompt_keyword)
         if query_prompt is None:
-            model_response = f"Invalid prompt keyword: {payload.prompt_keyword}"
+            model_response = f"Invalid prompt keyword: {prompt_keyword}"
         else:
             model_response = self.model.chat(
                 self.tokenizer,
@@ -58,11 +58,37 @@ respond with "Yes" else respond with "No". Limit your response to either "Yes" o
 
         return model_response
 
+    def eval_or(self, images: list[PIL.Image.Image], prompt_keyword: str):
+        model_responses = []
+        for image in images:
+            model_response = self.predict(image, prompt_keyword)
+            model_responses.append(model_response)
+            if self.extract_model_response(model_response):
+                return True, model_responses
+        return False, model_responses
+
+    def eval_and(self, images: list[PIL.Image.Image], prompt_keyword: str):
+        model_responses = []
+        for image in images:
+            model_response = self.predict(image, prompt_keyword)
+            model_responses.append(model_response)
+            if not self.extract_model_response(model_response):
+                return False, model_responses
+        return True, model_responses
+
     def extract_model_response(self, model_response: str):
         return "Yes" in model_response
 
-    async def __call__(self, payload: PayloadModel):
-        model_response = self.predict(payload)
-        extracted_response = self.extract_model_response(model_response)
-        return extracted_response
+    async def __call__(self, images: list[PIL.Image.Image], prompt_keyword: str, prompt_eval_mode: str):
+        overall_response = False
+        if prompt_eval_mode == "or":
+            overall_response, model_responses = self.eval_or(images, prompt_keyword)
+        elif prompt_eval_mode == "and":
+            overall_response, model_responses = self.eval_and(images, prompt_keyword)
+        else:
+            raise ValueError(f"Invalid prompt eval mode: {prompt_eval_mode}")
+
+        print(f"Model responses: {model_responses}")
+
+        return overall_response
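
Both evaluation modes short-circuit: eval_or stops at the first image whose response contains "Yes", eval_and stops at the first one that does not. A small illustration with hypothetical per-crop responses:

# extract_model_response is a substring check: "Yes" in model_response
# responses ["No", "Yes"]  -> eval_or  returns (True,  ["No", "Yes"])
# responses ["Yes", "No"]  -> eval_and returns (False, ["Yes", "No"])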
 
models/misc_utils.py CHANGED
@@ -2,7 +2,7 @@ import cv2
 import numpy as np
 import base64
 from PIL import Image
-
+from payload_model import PayloadModel
 
 def convert_base64_to_cv2(base64_string: str):
     return cv2.imdecode(np.frombuffer(base64.b64decode(base64_string), np.uint8), cv2.IMREAD_COLOR)
@@ -13,3 +13,24 @@ def convert_cv2_to_pil(image: np.ndarray):
 def convert_base64_to_pil(base64_string: str):
     return convert_cv2_to_pil(convert_base64_to_cv2(base64_string))
 
+def get_images_using_bbox(payload: PayloadModel):
+    images = []
+    # Forcing that only a single image is received
+    cv2_image = convert_base64_to_cv2(payload.input_data[0])
+    print(f"Bbox: {payload.bbox}")
+    images_bboxes = payload.bbox
+    image_bboxes = images_bboxes[0]
+    for idx, bbox in enumerate(image_bboxes):
+        x1, y1, x2, y2 = bbox
+        image = cv2_image[y1:y2, x1:x2]
+        cv2.imwrite(f"image_{idx}.png", image)
+        pil_image = convert_cv2_to_pil(image)
+        images.append(pil_image)
+    return images
+
+def get_whole_image(payload: PayloadModel):
+    images = []
+    # Forcing that only a single image is received
+    pil_image = convert_base64_to_pil(payload.input_data[0])
+    images.append(pil_image)
+    return images
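
The bbox field is nested three levels deep: one list per input image, each holding [x1, y1, x2, y2] boxes, though only the first image is read. A sketch with hypothetical coordinates:

bbox = [                      # one entry per image; only bbox[0] is used
    [
        [10, 20, 110, 220],   # [x1, y1, x2, y2] -> cv2_image[20:220, 10:110]
        [300, 40, 420, 160],
    ],
]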
payload_model.py CHANGED
@@ -2,5 +2,8 @@ from pydantic import BaseModel
 
 class PayloadModel(BaseModel):
     """Type check for payload parameters"""
-    image: str
-    prompt_keyword: str
+    input_data: list[str]
+    prompt: str
+    prompt_eval_mode: str
+    input_utilization_mode: str
+    bbox: list[list[list[int]]]
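
A sketch of a request body matching the new schema; all values are hypothetical, and "whole" is a made-up mode name (anything other than "bbox" falls through to get_whole_image):

payload = {
    "input_data": ["<base64-encoded image>"],
    "prompt": "person",
    "prompt_eval_mode": "and",
    "input_utilization_mode": "whole",
    "bbox": [[]],  # still required by the model, but unused in this mode
}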