# coding=utf-8 | |
# Copyright 2024 The Google Research Authors. | |
# | |
# Licensed under the Apache License, Version 2.0 (the "License"); | |
# you may not use this file except in compliance with the License. | |
# You may obtain a copy of the License at | |
# | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# | |
# Unless required by applicable law or agreed to in writing, software | |
# distributed under the License is distributed on an "AS IS" BASIS, | |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
# See the License for the specific language governing permissions and | |
# limitations under the License. | |
"""CAM utils.""" | |
# pylint: disable=g-importing-member | |
import os | |
import cv2 | |
import numpy as np | |
from PIL import Image | |
from scipy.ndimage import binary_fill_holes | |
import torch | |
from torchvision.transforms import Compose | |
from torchvision.transforms import Normalize | |
from torchvision.transforms import Resize | |
from torchvision.transforms import ToTensor | |
# pylint: disable=g-import-not-at-top | |
try: | |
from torchvision.transforms import InterpolationMode | |
BICUBIC = InterpolationMode.BICUBIC | |
except ImportError: | |
BICUBIC = Image.BICUBIC | |
_CONTOUR_INDEX = 1 if cv2.__version__.split('.')[0] == '3' else 0 | |
def _convert_image_to_rgb(image): | |
return image.convert('RGB') | |
def _transform_resize(h, w):
  """Builds the CLIP-style preprocessing pipeline at a fixed (h, w) size.

  Resizes with bicubic interpolation, forces RGB, converts to a tensor and
  normalizes with the CLIP mean/std constants.
  """
  steps = [
      Resize((h, w), interpolation=BICUBIC),
      _convert_image_to_rgb,
      ToTensor(),
      Normalize(
          (0.48145466, 0.4578275, 0.40821073),
          (0.26862954, 0.26130258, 0.27577711),
      ),
  ]
  return Compose(steps)
def img_ms_and_flip(image, ori_height, ori_width, scales=1.0, patch_size=16):
  """Resizes the image at one or more scales and adds horizontal flips.

  Args:
    image: Input PIL image.
    ori_height: Original image height.
    ori_width: Original image width.
    scales: A float or a list of floats; one resized copy is produced per
      scale. Target sides are rounded up to a multiple of `patch_size`.
    patch_size: Patch size the output dimensions must be divisible by.

  Returns:
    A list of tensors: for each scale, the preprocessed image followed by
    its horizontal (last-dimension) flip.
  """
  if isinstance(scales, float):
    scales = [scales]
  all_imgs = []
  for scale in scales:
    preprocess = _transform_resize(
        int(np.ceil(scale * int(ori_height) / patch_size) * patch_size),
        int(np.ceil(scale * int(ori_width) / patch_size) * patch_size),
    )
    # Bug fix: do NOT rebind `image` here. The original wrote
    # `image = preprocess(image)`, so on the second scale the pipeline
    # received an already-converted tensor instead of the PIL image,
    # breaking multi-scale processing (Resize/convert/ToTensor expect PIL).
    image_ori = preprocess(image)
    image_flip = torch.flip(image_ori, [-1])
    all_imgs.append(image_ori)
    all_imgs.append(image_flip)
  return all_imgs
def reshape_transform(tensor, height=28, width=28):
  """Reshapes a (seq, batch, dim) activation into a (batch, dim, h, w) map.

  The first token (class token) is dropped; the remaining height*width
  patch tokens are laid out on the spatial grid with channels first,
  matching the CNN layout expected by CAM tooling.
  """
  batch_first = tensor.permute(1, 0, 2)
  patches = batch_first[:, 1:, :]
  grid = patches.reshape(
      batch_first.size(0), height, width, batch_first.size(2)
  )
  # (batch, h, w, c) -> (batch, c, h, w); equivalent to the original's
  # transpose(2, 3).transpose(1, 2) chain.
  return grid.permute(0, 3, 1, 2)
def vis_mask(image, mask, mask_color):
  """Blends `mask_color` over the foreground (mask > 0.5) region of `image`.

  The mask is resized to the image's spatial size if needed. Foreground
  pixels become 30% image + 70% mask color; the result is a PIL image.
  """
  img_h, img_w = image.shape[0], image.shape[1]
  if mask.shape[0] != img_h or mask.shape[1] != img_w:
    # cv2.resize takes (width, height).
    mask = cv2.resize(mask, (img_w, img_h))
  overlay = np.copy(image)
  foreground = mask > 0.5
  blended = overlay[foreground] * 0.3 + np.array(mask_color) * 0.7
  overlay[foreground] = blended.astype(np.uint8)
  return Image.fromarray(overlay)
def scoremap2bbox(scoremap, threshold, multi_contour_eval=False):
  """Get bounding boxes from scoremap.

  Args:
    scoremap: 2-D float array of activation scores (scaled by 255 below, so
      values are presumably in [0, 1] — TODO confirm with callers).
    threshold: Fraction of the scoremap maximum used to binarize. It is
      relaxed in 0.1 steps until at least one pixel survives (or it drops
      to <= 0).
    multi_contour_eval: If True, one box per detected contour; otherwise
      only the largest-area contour is boxed.

  Returns:
    A tuple (boxes, n) where boxes is an (N, 4) int array of
    [x0, y0, x1, y1] (x1/y1 clamped to width-1/height-1) and n is the
    number of contours used. Returns ([[0, 0, 0, 0]], 1) if no contour
    is found.
  """
  height, width = scoremap.shape
  # Scale to 8-bit and add a trailing channel axis for OpenCV.
  scoremap_image = np.expand_dims((scoremap * 255).astype(np.uint8), 2)
  # Binarize; if the threshold keeps no pixel, relax it and retry.
  while True:
    _, thr_gray_heatmap = cv2.threshold(
        src=scoremap_image,
        thresh=int(threshold * np.max(scoremap_image)),
        maxval=255,
        type=cv2.THRESH_BINARY,
    )
    if thr_gray_heatmap.max() > 0 or threshold <= 0:
      break
    threshold -= 0.1
  # _CONTOUR_INDEX handles the OpenCV 3 vs 4 findContours return layout.
  contours = cv2.findContours(
      image=thr_gray_heatmap, mode=cv2.RETR_TREE, method=cv2.CHAIN_APPROX_SIMPLE
  )[_CONTOUR_INDEX]
  if not contours:
    return np.asarray([[0, 0, 0, 0]]), 1
  if not multi_contour_eval:
    contours = [max(contours, key=cv2.contourArea)]
  estimated_boxes = []
  for contour in contours:
    x, y, w, h = cv2.boundingRect(contour)
    x0, y0, x1, y1 = x, y, x + w, y + h
    # Clamp the exclusive corner into the image bounds.
    x1 = min(x1, width - 1)
    y1 = min(y1, height - 1)
    estimated_boxes.append([x0, y0, x1, y1])
  return np.asarray(estimated_boxes), len(contours)
def mask2chw(arr):
  """Returns ((center_y, center_x), height, width) of the region arr == 1.

  Center is the integer-truncated mean of the foreground coordinates;
  height/width span the foreground's bounding box. Assumes the mask has at
  least one foreground element.
  """
  ys, xs = np.where(arr == 1)
  center = (int(np.mean(ys)), int(np.mean(xs)))
  bbox_h = ys.max() - ys.min() + 1
  bbox_w = xs.max() - xs.min() + 1
  return center, bbox_h, bbox_w
def unpad(image_array, pad=None):
  """Crops (left, top, width, height) padding from an HWC array.

  Returns the array unchanged when `pad` is None.
  """
  if pad is None:
    return image_array
  left, top, width, height = pad
  return image_array[top : top + height, left : left + width, :]
def apply_visual_prompts(
    image_array,
    mask,
    visual_prompt_type=('circle',),
    visualize=False,
    color=(255, 0, 0),
    thickness=1,
    blur_strength=(15, 15),
):
  """Applies visual prompts to the image.

  Args:
    image_array: HWC uint8 image array.
    mask: 2-D foreground mask; assumed binary (0/1) and the same spatial
      size as `image_array` — TODO confirm with callers.
    visual_prompt_type: Iterable of prompt names; any combination of
      'blur', 'gray', 'black', 'circle', 'rectangle', 'contour', applied
      in that fixed order.
    visualize: If True, also writes the result to 'masked_img.png'.
    color: Color tuple for drawn shapes and contours.
    thickness: Line thickness for drawn shapes and contours.
    blur_strength: Gaussian kernel (width, height) for the 'blur' prompt.

  Returns:
    A PIL image with the requested prompts applied.
  """
  prompted_image = image_array.copy()
  # 8-bit mask for cv2.bitwise_and; hoisted — the original recomputed this
  # identically in the blur/gray/black branches.
  mask_u8 = np.clip(mask, 0, 255).astype(np.uint8)
  if 'blur' in visual_prompt_type:
    # Blur the whole image, keep the masked region sharp, then recombine.
    blurred = cv2.GaussianBlur(prompted_image.copy(), blur_strength, 0)
    sharp_region = cv2.bitwise_and(
        prompted_image.copy(), prompted_image.copy(), mask=mask_u8
    )
    inv_mask = 1 - mask
    blurred_region = (blurred * inv_mask[:, :, None]).astype(np.uint8)
    prompted_image = cv2.add(sharp_region, blurred_region)
  if 'gray' in visual_prompt_type:
    # Desaturate the background while keeping the masked region in color.
    gray = cv2.cvtColor(prompted_image.copy(), cv2.COLOR_BGR2GRAY)
    gray = np.stack([gray, gray, gray], axis=-1)  # back to 3 channels
    color_region = cv2.bitwise_and(
        prompted_image.copy(), prompted_image.copy(), mask=mask_u8
    )
    inv_mask = 1 - mask
    gray_region = (gray * inv_mask[:, :, None]).astype(np.uint8)
    prompted_image = cv2.add(color_region, gray_region)
  if 'black' in visual_prompt_type:
    # Zero out everything outside the mask.
    prompted_image = cv2.bitwise_and(
        prompted_image.copy(), prompted_image.copy(), mask=mask_u8
    )
  if 'circle' in visual_prompt_type:
    # Ellipse centered on the mask, spanning its bounding box.
    mask_center, mask_height, mask_width = mask2chw(mask)
    center_coordinates = (mask_center[1], mask_center[0])  # cv2 wants (x, y)
    axes_length = (mask_width // 2, mask_height // 2)
    prompted_image = cv2.ellipse(
        prompted_image,
        center_coordinates,
        axes_length,
        0,
        0,
        360,
        color,
        thickness,
    )
  if 'rectangle' in visual_prompt_type:
    # Axis-aligned box around the mask's bounding region.
    mask_center, mask_height, mask_width = mask2chw(mask)
    start_point = (
        mask_center[1] - mask_width // 2,
        mask_center[0] - mask_height // 2,
    )
    end_point = (
        mask_center[1] + mask_width // 2,
        mask_center[0] + mask_height // 2,
    )
    prompted_image = cv2.rectangle(
        prompted_image, start_point, end_point, color, thickness
    )
  if 'contour' in visual_prompt_type:
    # Fill holes first so the contour traces only the outer boundary.
    mask = binary_fill_holes(mask)
    contours, _ = cv2.findContours(
        mask.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE
    )
    prompted_image = cv2.drawContours(
        prompted_image.copy(), contours, -1, color, thickness
    )
  if visualize:
    # Fixed: the original wrapped the filename in a no-op single-argument
    # os.path.join(); the written path is unchanged.
    cv2.imwrite('masked_img.png', prompted_image)
  return Image.fromarray(prompted_image.astype(np.uint8))