Mimic GenAIOrchestration
Files changed:
- app.py +3 -1
- internvl_utils.py +17 -5
- models/InternVL3/intervl3.py +35 -9
- models/misc_utils.py +22 -1
- payload_model.py +5 -2
app.py
CHANGED
@@ -19,7 +19,9 @@ def healthcheck():
 async def inference(payload: PayloadModel, token: str = Depends(authenticate_token)):
     try:
         model_response = await internvl_inference(model, payload)
-
+        model_response = "True" if model_response else "False"
+        final_response = {"1":{"query_status": model_response}}
+        return JSONResponse(status_code=200, content={"final_response": final_response})
     except Exception as e:
         print(f"Error: {e}")
         return JSONResponse(status_code=500, content={"status": "error", "message": str(e)})
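With this change, a successful call returns a body of the form {"final_response": {"1": {"query_status": "True"}}}. A minimal client sketch, assuming the route is mounted at /inference and that authenticate_token expects a bearer token (route path, port, and auth scheme are assumptions not shown in this diff):

# Hypothetical client call; host, port, route, and token handling are assumed.
import base64
import requests

with open("page.png", "rb") as f:                  # illustrative input file
    image_b64 = base64.b64encode(f.read()).decode("utf-8")

payload = {
    "input_data": [image_b64],
    "prompt": "signature",                         # hypothetical prompt keyword
    "prompt_eval_mode": "or",                      # "or" | "and"
    "input_utilization_mode": "bbox",              # anything else falls back to the whole image
    "bbox": [[[10, 20, 200, 120]]],                # one image, one [x1, y1, x2, y2] box
}

resp = requests.post(
    "http://localhost:7860/inference",
    json=payload,
    headers={"Authorization": "Bearer <token>"},
)
print(resp.json())  # e.g. {"final_response": {"1": {"query_status": "True"}}}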
internvl_utils.py
CHANGED
@@ -6,7 +6,7 @@ from torchvision.transforms.functional import InterpolationMode
 from transformers import AutoConfig
 from models import InternVL3
 from payload_model import PayloadModel
-from models.misc_utils import
+from models.misc_utils import get_images_using_bbox, get_whole_image

 IMAGENET_MEAN = (0.485, 0.456, 0.406)
 IMAGENET_STD = (0.229, 0.224, 0.225)
@@ -75,9 +75,10 @@ def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbna
         processed_images.append(thumbnail_img)
     return processed_images

-def load_image(
-
-
+def load_image(pil_image, input_size=448, max_num=12):
+    pil_image = pil_image.convert('RGB')
+
+    # pil_image = convert_base64_to_pil(image)
     transform = build_transform(input_size=input_size)
     images = dynamic_preprocess(pil_image, image_size=input_size, use_thumbnail=True, max_num=max_num)
     pixel_values = [transform(image) for image in images]
@@ -114,4 +115,15 @@ def split_model(model_name):
     return device_map

 async def internvl_inference(model: InternVL3, payload: PayloadModel):
-
+    try:
+        if payload.input_utilization_mode == "bbox":
+            images = get_images_using_bbox(payload)
+        else:
+            images = get_whole_image(payload)
+
+    except Exception as e:
+        raise Exception(f"Error: {e}")
+
+    prompt_keyword = payload.prompt
+    prompt_eval_mode = payload.prompt_eval_mode
+    return await model(images, prompt_keyword, prompt_eval_mode)
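load_image now takes a PIL image directly instead of a base64 string (the old conversion survives as a comment). A minimal usage sketch, assuming the elided tail of the function stacks the transformed crops into a single tensor, as in the standard InternVL preprocessing recipe:

# Usage sketch; the exact return shape depends on the elided tail of load_image.
from PIL import Image
from internvl_utils import load_image

pil_image = Image.open("document.png")   # illustrative input file
pixel_values = load_image(pil_image, input_size=448, max_num=12)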
models/InternVL3/intervl3.py
CHANGED
@@ -4,7 +4,7 @@ from payload_model import PayloadModel
 from internvl_utils import load_image
 from pydantic import BaseModel, Field
 from typing import Optional
-
+import PIL
 class InternVL3(BaseModel):
     model_name: str
     model: Optional[AutoModel] = None
@@ -43,11 +43,11 @@ respond with "Yes" else respond with "No". Limit your response to either "Yes" o
         query_prompt = None
         return query_prompt

-    def predict(self,
-        pixel_values = load_image(
-        query_prompt = self.get_query_prompt(
+    def predict(self, pil_image: PIL.Image.Image, prompt_keyword: str):
+        pixel_values = load_image(pil_image)
+        query_prompt = self.get_query_prompt(prompt_keyword)
         if query_prompt is None:
-            model_response = f"Invalid prompt keyword: {
+            model_response = f"Invalid prompt keyword: {prompt_keyword}"
         else:
             model_response = self.model.chat(
                 self.tokenizer,
@@ -58,11 +58,37 @@ respond with "Yes" else respond with "No". Limit your response to either "Yes" o

         return model_response

+    def eval_or(self, images: list[PIL.Image.Image], prompt_keyword: str):
+        model_responses = []
+        for image in images:
+            model_response = self.predict(image, prompt_keyword)
+            model_responses.append(model_response)
+            if self.extract_model_response(model_response):
+                return True, model_responses
+        return False, model_responses
+
+    def eval_and(self, images: list[PIL.Image.Image], prompt_keyword: str):
+        model_responses = []
+        for image in images:
+            model_response = self.predict(image, prompt_keyword)
+            model_responses.append(model_response)
+            if not self.extract_model_response(model_response):
+                return False, model_responses
+        return True, model_responses
+
     def extract_model_response(self, model_response: str):
         return "Yes" in model_response

-    async def __call__(self,
-
-
-
+    async def __call__(self, images: list[PIL.Image.Image], prompt_keyword: str, prompt_eval_mode: str):
+        overall_response = False
+        if prompt_eval_mode == "or":
+            overall_response, model_responses = self.eval_or(images, prompt_keyword)
+        elif prompt_eval_mode == "and":
+            overall_response, model_responses = self.eval_and(images, prompt_keyword)
+        else:
+            raise ValueError(f"Invalid prompt eval mode: {prompt_eval_mode}")
+
+        print(f"Model responses: {model_responses}")
+
+        return overall_response
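Both evaluation modes short-circuit: eval_or returns True at the first image the model answers "Yes" for, and eval_and returns False at the first "No", so later images are never sent to the model. A stand-alone toy illustration of the "or" path (predict is stubbed out here; the real method calls self.predict(image, prompt_keyword)):

# Toy illustration of the short-circuit logic, decoupled from the model.
def eval_or_demo(responses: list[str]):
    seen = []
    for r in responses:
        seen.append(r)
        if "Yes" in r:           # mirrors extract_model_response
            return True, seen    # stop at the first positive answer
    return False, seen

print(eval_or_demo(["No", "Yes", "No"]))  # (True, ['No', 'Yes']): third response never needed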
models/misc_utils.py
CHANGED
@@ -2,7 +2,7 @@ import cv2
 import numpy as np
 import base64
 from PIL import Image
-
+from payload_model import PayloadModel

 def convert_base64_to_cv2(base64_string: str):
     return cv2.imdecode(np.frombuffer(base64.b64decode(base64_string), np.uint8), cv2.IMREAD_COLOR)
@@ -13,3 +13,24 @@ def convert_cv2_to_pil(image: np.ndarray):
 def convert_base64_to_pil(base64_string: str):
     return convert_cv2_to_pil(convert_base64_to_cv2(base64_string))

+def get_images_using_bbox(payload: PayloadModel):
+    images = []
+    # Forcing that only a single image is received
+    cv2_image = convert_base64_to_cv2(payload.input_data[0])
+    print(f"Bbox: {payload.bbox}")
+    images_bboxes = payload.bbox
+    image_bboxes = images_bboxes[0]
+    for idx, bbox in enumerate(image_bboxes):
+        x1, y1, x2, y2 = bbox
+        image = cv2_image[y1:y2, x1:x2]
+        cv2.imwrite(f"image_{idx}.png", image)
+        pil_image = convert_cv2_to_pil(image)
+        images.append(pil_image)
+    return images
+
+def get_whole_image(payload: PayloadModel):
+    images = []
+    # Forcing that only a single image is received
+    pil_image = convert_base64_to_pil(payload.input_data[0])
+    images.append(pil_image)
+    return images
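bbox is nested three levels deep: one entry per input image, a list of boxes per image, and each box as [x1, y1, x2, y2] in pixel coordinates; since only input_data[0] is consumed, only bbox[0] is read. A minimal sketch of the crop path (file name and box values are illustrative):

# Illustrative: crop two regions out of a single base64-encoded image.
import base64

with open("document.png", "rb") as f:
    b64 = base64.b64encode(f.read()).decode("utf-8")

payload = PayloadModel(
    input_data=[b64],
    prompt="stamp",                                 # hypothetical prompt keyword
    prompt_eval_mode="or",
    input_utilization_mode="bbox",
    bbox=[[[0, 0, 100, 100], [50, 50, 300, 200]]],  # one image, two boxes
)
crops = get_images_using_bbox(payload)              # -> list of two PIL crops

Note the side effect: each crop is also written to image_{idx}.png in the working directory, presumably for debugging.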
payload_model.py
CHANGED
@@ -2,5 +2,8 @@ from pydantic import BaseModel

 class PayloadModel(BaseModel):
     """Type check for payload parameters"""
-
-
+    input_data: list[str]
+    prompt: str
+    prompt_eval_mode: str
+    input_utilization_mode: str
+    bbox: list[list[list[int]]]
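All five fields are required, so bbox must be supplied even when input_utilization_mode is not "bbox". A quick validation sketch (values illustrative; "whole" is just a stand-in for any non-"bbox" mode, which falls through to get_whole_image):

# Illustrative round-trip through the pydantic model.
p = PayloadModel(
    input_data=["<base64 image>"],
    prompt="signature",            # hypothetical prompt keyword
    prompt_eval_mode="and",
    input_utilization_mode="whole",
    bbox=[[]],                     # unused on the whole-image path
)
print(p.model_dump())              # pydantic v2; use p.dict() on v1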