Spaces:

pantatwiai
/

Newspapers-OCR-Demo

Sleeping

Newspapers-OCR-Demo / run_yolo.py

Devesh Pant

1b870f4 over 1 year ago

10.3 kB

	import cv2
	import time
	import requests
	import random
	import numpy as np
	from PIL import Image
	from pathlib import Path
	from collections import OrderedDict,namedtuple
	import onnxruntime as ort
	import torch
	import torchvision
	import math

	def bbox_iou(box1, box2, x1y1x2y2=True, GIoU=False, DIoU=False, CIoU=False, eps=1e-7):
	    """Return the IoU (or a GIoU / DIoU / CIoU variant) of ``box1`` against ``box2``.

	    Args:
	        box1: Tensor holding one box as 4 values (read as ``box1[0..3]``).
	        box2: Tensor of boxes laid out as n x 4; it is transposed here so
	            that ``box2[k]`` is the k-th coordinate of every box.
	        x1y1x2y2: True if boxes are corners ``(x1, y1, x2, y2)``; False if
	            they are ``(cx, cy, w, h)`` and must be converted first.
	        GIoU: Return Generalized IoU.
	        DIoU: Return Distance IoU (takes precedence over CIoU and GIoU).
	        CIoU: Return Complete IoU (takes precedence over GIoU).
	        eps: Small constant guarding the divisions.

	    Returns:
	        Tensor with one score per box in ``box2``.
	    """
	    box2 = box2.T

	    # Corner coordinates of both boxes.
	    if x1y1x2y2:
	        b1_x1, b1_y1, b1_x2, b1_y2 = box1[0], box1[1], box1[2], box1[3]
	        b2_x1, b2_y1, b2_x2, b2_y2 = box2[0], box2[1], box2[2], box2[3]
	    else:  # centre/size -> corners
	        b1_x1, b1_x2 = box1[0] - box1[2] / 2, box1[0] + box1[2] / 2
	        b1_y1, b1_y2 = box1[1] - box1[3] / 2, box1[1] + box1[3] / 2
	        b2_x1, b2_x2 = box2[0] - box2[2] / 2, box2[0] + box2[2] / 2
	        b2_y1, b2_y2 = box2[1] - box2[3] / 2, box2[1] + box2[3] / 2

	    # Intersection area (clamped so disjoint boxes give 0, not a negative).
	    inter = (torch.min(b1_x2, b2_x2) - torch.max(b1_x1, b2_x1)).clamp(0) * \
	            (torch.min(b1_y2, b2_y2) - torch.max(b1_y1, b2_y1)).clamp(0)

	    # Union area.
	    w1, h1 = b1_x2 - b1_x1, b1_y2 - b1_y1 + eps
	    w2, h2 = b2_x2 - b2_x1, b2_y2 - b2_y1 + eps
	    union = w1 * h1 + w2 * h2 - inter + eps

	    iou = inter / union
	    if not (GIoU or DIoU or CIoU):
	        return iou  # plain IoU

	    # Width / height of the smallest box enclosing both boxes.
	    cw = torch.max(b1_x2, b2_x2) - torch.min(b1_x1, b2_x1)
	    ch = torch.max(b1_y2, b2_y2) - torch.min(b1_y1, b2_y1)

	    if CIoU or DIoU:  # https://arxiv.org/abs/1911.08287v1
	        c2 = cw ** 2 + ch ** 2 + eps  # enclosing-box diagonal, squared
	        rho2 = ((b2_x1 + b2_x2 - b1_x1 - b1_x2) ** 2 +
	                (b2_y1 + b2_y2 - b1_y1 - b1_y2) ** 2) / 4  # centre distance, squared
	        if DIoU:
	            return iou - rho2 / c2
	        # CIoU adds an aspect-ratio consistency term.
	        # https://github.com/Zzh-tju/DIoU-SSD-pytorch/blob/master/utils/box/box_utils.py#L47
	        v = (4 / math.pi ** 2) * torch.pow(torch.atan(w2 / (h2 + eps)) - torch.atan(w1 / (h1 + eps)), 2)
	        with torch.no_grad():  # alpha is treated as a constant weight
	            alpha = v / (v - iou + (1 + eps))
	        return iou - (rho2 / c2 + v * alpha)

	    # GIoU https://arxiv.org/pdf/1902.09630.pdf
	    c_area = cw * ch + eps  # enclosing-box area
	    return iou - (c_area - union) / c_area


	def xywh2xyxy(x):
	    """Convert boxes from ``[cx, cy, w, h]`` to ``[x1, y1, x2, y2]``.

	    ``(x1, y1)`` is the top-left corner and ``(x2, y2)`` the bottom-right.
	    The last axis must hold the 4 box values; any leading shape is accepted
	    (a single box, n x 4, or batched boxes).

	    Args:
	        x: ``torch.Tensor`` or NumPy array of boxes.

	    Returns:
	        A new tensor/array of the same type and shape; ``x`` is not modified.
	    """
	    y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
	    y[..., 0] = x[..., 0] - x[..., 2] / 2  # top-left x
	    y[..., 1] = x[..., 1] - x[..., 3] / 2  # top-left y
	    y[..., 2] = x[..., 0] + x[..., 2] / 2  # bottom-right x
	    y[..., 3] = x[..., 1] + x[..., 3] / 2  # bottom-right y
	    return y


	def non_max_suppression(prediction, conf_thres=0.25, iou_thres=0.45, classes=None, agnostic=False, multi_label=False,
	                        labels=()):
	    """Run Non-Maximum Suppression (NMS) on raw detection output.

	    Args:
	        prediction: Tensor indexed as (image, box, 5 + nc); per box the first
	            4 values are ``cx, cy, w, h``, index 4 is the objectness score and
	            the remaining ``nc`` values are per-class scores.
	        conf_thres: Minimum confidence for a box to be kept.
	        iou_thres: IoU threshold passed to ``torchvision.ops.nms``.
	        classes: Optional collection of class ids to keep; ``None`` keeps all.
	        agnostic: If True, NMS ignores class (boxes of different classes
	            can suppress each other).
	        multi_label: If True (and there is more than one class), a box may
	            yield one detection per class above ``conf_thres``.
	        labels: Optional per-image a priori labels with rows
	            ``[cls, cx, cy, w, h]``, appended as confidence-1.0 candidates.

	    Returns:
	        List with one ``(n, 6)`` tensor per image, rows being
	        ``[x1, y1, x2, y2, conf, cls]``. Images that are skipped (no
	        candidates, or time limit hit earlier) keep an empty ``(0, 6)`` tensor.
	    """
	    nc = prediction.shape[2] - 5  # number of classes
	    xc = prediction[..., 4] > conf_thres  # candidate mask

	    # Settings
	    max_wh = 4096  # (pixels) also used as the per-class box offset below
	    max_det = 300  # maximum number of detections per image
	    max_nms = 30000  # maximum number of boxes into torchvision.ops.nms()
	    time_limit = 10.0  # seconds to quit after
	    redundant = True  # require redundant detections (merge-NMS only)
	    multi_label &= nc > 1  # multiple labels per box only make sense with >1 class
	    merge = False  # use merge-NMS

	    t = time.time()
	    # One independent empty tensor per image (no aliasing between entries).
	    output = [torch.zeros((0, 6), device=prediction.device) for _ in range(prediction.shape[0])]
	    for xi, x in enumerate(prediction):  # image index, image inference
	        x = x[xc[xi]]  # keep confident candidates (boolean indexing copies)

	        # Append a priori labels when auto-labelling.
	        if labels and len(labels[xi]):
	            l = labels[xi]
	            v = torch.zeros((len(l), nc + 5), device=x.device)
	            v[:, :4] = l[:, 1:5]  # box
	            v[:, 4] = 1.0  # conf
	            v[range(len(l)), l[:, 0].long() + 5] = 1.0  # cls
	            x = torch.cat((x, v), 0)

	        # Nothing left for this image.
	        if not x.shape[0]:
	            continue

	        # Compute conf
	        if nc == 1:
	            # For models with one class, cls_loss is 0 and cls_conf is always 0.5,
	            # so the objectness is used directly instead of multiplying.
	            x[:, 5:] = x[:, 4:5]
	        else:
	            x[:, 5:] *= x[:, 4:5]  # conf = obj_conf * cls_conf

	        # Box (center x, center y, width, height) to (x1, y1, x2, y2)
	        box = xywh2xyxy(x[:, :4])

	        # Detections matrix nx6 (xyxy, conf, cls)
	        if multi_label:
	            i, j = (x[:, 5:] > conf_thres).nonzero(as_tuple=False).T
	            x = torch.cat((box[i], x[i, j + 5, None], j[:, None].float()), 1)
	        else:  # best class only
	            conf, j = x[:, 5:].max(1, keepdim=True)
	            x = torch.cat((box, conf, j.float()), 1)[conf.view(-1) > conf_thres]

	        # Filter by class
	        if classes is not None:
	            x = x[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)]

	        # Check shape
	        n = x.shape[0]  # number of boxes
	        if not n:  # no boxes
	            continue
	        elif n > max_nms:  # excess boxes: keep the most confident ones
	            x = x[x[:, 4].argsort(descending=True)[:max_nms]]

	        # Batched NMS: shifting each class by a large offset keeps boxes of
	        # different classes from overlapping, so one NMS call handles all classes.
	        c = x[:, 5:6] * (0 if agnostic else max_wh)  # classes
	        boxes, scores = x[:, :4] + c, x[:, 4]  # boxes (offset by class), scores
	        i = torchvision.ops.nms(boxes, scores, iou_thres)  # NMS
	        if i.shape[0] > max_det:  # limit detections
	            i = i[:max_det]
	        if merge and (1 < n < 3E3):  # Merge NMS (boxes merged using weighted mean)
	            # update boxes as boxes(i,4) = weights(i,n) * boxes(n,4)
	            iou = torchvision.ops.box_iou(boxes[i], boxes) > iou_thres  # iou matrix
	            weights = iou * scores[None]  # box weights
	            x[i, :4] = torch.mm(weights, x[:, :4]).float() / weights.sum(1, keepdim=True)  # merged boxes
	            if redundant:
	                i = i[iou.sum(1) > 1]  # require redundancy

	        output[xi] = x[i]
	        if (time.time() - t) > time_limit:
	            print(f'WARNING: NMS time limit {time_limit}s exceeded')
	            break  # time limit exceeded; remaining images stay empty

	    return output


	def letterbox(im, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleup=True, stride=32):
	    """Resize ``im`` preserving aspect ratio and pad it towards ``new_shape``.

	    Args:
	        im: Image array whose first two dimensions are (height, width).
	        new_shape: Target ``(height, width)``, or a single int for a square.
	        color: Constant border value used for the padding.
	        auto: If True, pad only up to the next multiple of ``stride``
	            (minimum rectangle) instead of the full ``new_shape``.
	        scaleup: If False, the image is only ever scaled down.
	        stride: Stride multiple used when ``auto`` is True.

	    Returns:
	        Tuple ``(im, r, (dw, dh))``: the padded image, the scale ratio that
	        was applied, and the padding added on each side along width/height
	        (may be fractional; the actual split is rounded per side).
	    """
	    shape = im.shape[:2]  # current shape [height, width]
	    if isinstance(new_shape, int):
	        new_shape = (new_shape, new_shape)

	    # Scale ratio (new / old)
	    r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
	    if not scaleup:  # only scale down, do not scale up (for better val mAP)
	        r = min(r, 1.0)

	    # Compute padding
	    new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))  # (width, height)
	    dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1]  # wh padding

	    if auto:  # minimum rectangle
	        dw, dh = np.mod(dw, stride), np.mod(dh, stride)  # wh padding

	    dw /= 2  # divide padding into 2 sides
	    dh /= 2

	    if shape[::-1] != new_unpad:  # resize only when the size actually changes
	        im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR)
	    # The -0.1 / +0.1 nudges send an odd pixel of padding to the bottom/right.
	    top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
	    left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
	    im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)  # add border
	    return im, r, (dw, dh)


	def get_layout_results(img, onnx_path):
	    """Detect newspaper layout regions in ``img`` with an ONNX detection model.

	    Args:
	        img: Image array; it is converted with ``cv2.COLOR_BGR2RGB``, so BGR
	            channel order (as returned by ``cv2.imread``) is expected.
	        onnx_path: Path of the ONNX model loaded with onnxruntime on CPU.

	    Returns:
	        List with one entry ``[box, score, cls_id, color]`` per detection:
	        ``box`` is ``[x1, y1, x2, y2]`` as ints in the coordinates of the
	        input image, ``score`` a float rounded to 3 decimals, ``cls_id`` the
	        int class index and ``color`` the 3-value colour list of that class.
	    """
	    providers = ['CPUExecutionProvider']
	    session = ort.InferenceSession(onnx_path, providers=providers)
	    names = ['Articles', 'Advertisement', 'Headlines', 'Sub-headlines', 'Graphics', 'Images', 'Tables', 'Text Block', 'Header']
	    # Fixed, easily distinguishable colour per class (instead of random ones).
	    # Colour names below assume RGB channel order.
	    colors = {
	        'Articles': [255, 0, 0],  # Red
	        'Advertisement': [0, 255, 0],  # Green
	        'Headlines': [0, 0, 255],  # Blue
	        'Sub-headlines': [255, 255, 0],  # Yellow
	        'Graphics': [255, 0, 255],  # Magenta
	        'Images': [128, 0, 128],  # Purple
	        'Tables': [0, 255, 255],  # Cyan
	        'Text Block': [0, 128, 128],  # Teal
	        'Header': [0, 0, 0]  # Black
	    }

	    # Pre-process: RGB, letterbox, HWC -> 1xCxHxW float32 in [0, 1].
	    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
	    image = img.copy()
	    image, ratio, dwdh = letterbox(image, auto=False)
	    image = image.transpose((2, 0, 1))
	    image = np.expand_dims(image, 0)
	    image = np.ascontiguousarray(image)
	    im = image.astype(np.float32)
	    im /= 255.0
	    outname = [i.name for i in session.get_outputs()]
	    inname = [i.name for i in session.get_inputs()]
	    inp = {inname[0]: im}

	    # ONNX inference; only the first output is used.
	    outputs = session.run(outname, inp)[0]
	    outputs = torch.from_numpy(outputs)
	    det = non_max_suppression(outputs, 0.25, 0.45, classes=None, agnostic=False)[0]  # conf_thres=0.25, iou_thres=0.45

	    # Post-process: undo the letterbox (remove padding, then rescale) for all
	    # boxes at once. The copy keeps the in-place math from touching ``det``.
	    boxes = det[:, :4].numpy().copy()
	    boxes -= np.array(dwdh * 2)  # (dw, dh, dw, dh)
	    boxes /= ratio
	    boxes = boxes.round().astype(np.int32).tolist()

	    results = []
	    for box, (score, cls_id) in zip(boxes, det[:, 4:6].tolist()):
	        cls_id = int(cls_id)
	        score = round(float(score), 3)
	        name = names[cls_id]
	        color = colors[name]
	        results.append([box, score, cls_id, color])

	    return results

	if __name__ == '__main__':
	    # Usage: run_yolo.py [onnx_path] [image_path]
	    # Both arguments are optional and fall back to the original paths.
	    import sys

	    onnx_path = sys.argv[1] if len(sys.argv) > 1 else \
	        "/home/ubuntu/devesh/yolov7/runs/train/yolov7-custom9/weights/best.onnx"
	    img_path = sys.argv[2] if len(sys.argv) > 2 else \
	        '/home/ubuntu/devesh/yolov7/Language_wise_imgs/Hindi/_Dainik_Navajyoti_-_04-11-2023_3.png'

	    img_ori = cv2.imread(img_path)
	    if img_ori is None:  # cv2.imread returns None instead of raising
	        raise SystemExit(f'Could not read image: {img_path}')

	    lines = get_layout_results(img_ori, onnx_path)
	    if lines:
	        print(lines[0])
	    else:
	        print('No layout regions detected')