# ChatVID/model/vision/grit_src/image_dense_captions.py
# Repository-viewer residue kept for provenance: "Yiqin's picture",
# commit message "init", commit 6ef31de.
import sys
from detectron2.config import get_cfg
sys.path.insert(
0, 'model/vision/grit_src/third_party/CenterNet2/projects/CenterNet2/')
from model.vision.grit_src.third_party.CenterNet2.projects.CenterNet2.centernet.config import add_centernet_config
from model.vision.grit_src.grit.config import add_grit_config
from model.vision.grit_src.grit.predictor import VisualizationDemo
# Module-level constants.
# NOTE(review): WINDOW_NAME is not referenced by any function visible in this
# file; presumably a display-window title carried over from a demo script --
# confirm against other modules before removing.
WINDOW_NAME = "GRiT"
def dense_pred_to_caption_no_bbox(predictions):
    """Join the predicted object descriptions into a single caption.

    Args:
        predictions: Mapping whose ``"instances"`` entry exposes
            ``pred_object_descriptions.data``, a sequence of description
            strings.

    Returns:
        The descriptions separated by ``", "`` and terminated by ``"."``.
        An empty string is returned when there are no descriptions.
    """
    descriptions = predictions["instances"].pred_object_descriptions.data
    # Without this guard, taking the last element of an empty sequence
    # raises IndexError whenever the model detects nothing.
    if len(descriptions) == 0:
        return ""
    return ", ".join(descriptions) + "."
def dense_pred_to_caption(predictions):
    """Format each predicted description together with its bounding box.

    Args:
        predictions: Mapping whose ``"instances"`` entry exposes
            ``has(name)``, ``pred_object_descriptions.data`` (a sequence of
            description strings) and, optionally, ``pred_boxes``. Indexing
            ``pred_boxes`` must yield an object whose
            ``tensor.cpu().detach().numpy()[0]`` is the row of coordinates
            for that detection.

    Returns:
        A string of ``"<description>: [c0, c1, ...]; "`` entries, one per
        description, with every coordinate truncated to ``int``. When the
        instances carry no ``pred_boxes`` field, each entry is just
        ``"<description>; "``. An empty string is returned when there are
        no descriptions.
    """
    instances = predictions["instances"]
    boxes = instances.pred_boxes if instances.has("pred_boxes") else None
    descriptions = instances.pred_object_descriptions.data

    entries = []
    for index, description in enumerate(descriptions):
        if boxes is None:
            # No box field on the instances: indexing None would raise
            # TypeError, so emit the description without coordinates.
            entries.append(description + "; ")
            continue
        row = boxes[index].tensor.cpu().detach().numpy()[0]
        coords = [int(value) for value in row]
        entries.append(description + ": " + str(coords) + "; ")
    return "".join(entries)
def setup_cfg(args):
    """Create the frozen detectron2 config used to build the GRiT predictor.

    Args:
        args: Dictionary with the keys ``"cpu"``, ``"config_file"``,
            ``"opts"``, ``"confidence_threshold"`` and ``"test_task"``.

    Returns:
        The config object after ``freeze()`` has been called on it.
    """
    config = get_cfg()
    if args["cpu"]:
        config.MODEL.DEVICE = "cpu"

    # Register the CenterNet2 and GRiT config keys before merging, so the
    # YAML file and the override list can refer to them.
    for register_keys in (add_centernet_config, add_grit_config):
        register_keys(config)
    config.merge_from_file(args["config_file"])
    config.merge_from_list(args["opts"])

    # The same score threshold is applied to both builtin heads.
    threshold = args["confidence_threshold"]
    config.MODEL.ROI_HEADS.SCORE_THRESH_TEST = threshold
    config.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH = threshold

    task = args["test_task"]
    if task:
        config.MODEL.TEST_TASK = task

    # Fixed inference settings, applied regardless of the arguments.
    config.MODEL.BEAM_SIZE = 1
    config.MODEL.ROI_HEADS.SOFT_NMS_ENABLED = False
    config.USE_ACT_CHECKPOINT = False

    config.freeze()
    return config
def get_parser(device):
    """Return the argument dictionary consumed by ``setup_cfg``.

    Args:
        device: Device selector. Only the exact value ``"cpu"`` turns the
            ``'cpu'`` flag on; any other value leaves it ``False``.

    Returns:
        A new dictionary with the config file path, the ``'cpu'`` flag, the
        confidence threshold, the test task and the config overrides.
    """
    run_on_cpu = True if device == "cpu" else False
    return {
        'config_file':
        "model/vision/grit_src/configs/GRiT_B_DenseCap_ObjectDet.yaml",
        'cpu': run_on_cpu,
        'confidence_threshold': 0.5,
        'test_task': 'DenseCap',
        'opts': [
            "MODEL.WEIGHTS",
            "pretrained_models/grit_b_densecap_objectdet.pth",
        ],
    }
def image_caption_api(cv2_img, device='cuda'):
    """Produce a dense caption (descriptions only, no boxes) for one image.

    A new ``VisualizationDemo`` is constructed from a freshly built config
    on every call.

    Args:
        cv2_img: Image passed unchanged to ``VisualizationDemo.run_on_image``
            (presumably an OpenCV image array -- confirm against the
            predictor).
        device: Device selector forwarded to ``get_parser``; defaults to
            ``'cuda'``.

    Returns:
        The caption string built by ``dense_pred_to_caption_no_bbox``.
    """
    predictor = VisualizationDemo(setup_cfg(get_parser(device)))
    # run_on_image returns a pair; only the first element is used here.
    outputs, _unused = predictor.run_on_image(cv2_img)
    return dense_pred_to_caption_no_bbox(outputs)