# alps/ocr_component1.py
import time
from typing import Any, Dict, List

import numpy as np
from numpy.typing import NDArray
from PIL import Image

from deepdoc import RagFlow
from doctrfiles import DoctrWordDetector, DoctrTextRecognizer, Wordboxes
from utils import (
    LineAnnotation,
    WordAnnotation,
    getlogger,
    cropImageExtraMargin,
    crop_an_Image,
    get_new_coord,
)

# Numpy image type: H x W (x C) array of uint8 pixel values
ImageType = NDArray[np.uint8]
MARGIN_FACTOR = 1.4
class OCRComponent1:
    """
    OCR component that uses RagFlow as the text line detector and
    DocTR's word detector and text recognizer for word-level OCR.
    """
    def __init__(self, englishflag: bool = False):
        logger = getlogger("1")
        start_time = time.time()
        self.textlineDetector = RagFlow()
        end_time = time.time()
        execution_time = end_time - start_time
        logger.info(f"time to initialize RagFlow: {execution_time} seconds")
        start_time = time.time()
        self.wordDetector = DoctrWordDetector(
            architecture="db_resnet50",
            path_weights="doctrfiles/models/db_resnet50-79bd7d70.pt",
            path_config_json="doctrfiles/models/db_resnet50_config.json",
        )
        end_time = time.time()
        execution_time = end_time - start_time
        logger.info(f"time to initialize DoctrWordDetector: {execution_time} seconds")
        start_time = time.time()
        if not englishflag:
            # Multilingual recognizer (PARSeq)
            self.textRecognizer = DoctrTextRecognizer(
                architecture="parseq",
                path_weights="doctrfiles/models/doctr-multilingual-parseq.bin",
                path_config_json="doctrfiles/models/multilingual-parseq-config.json",
            )
        else:
            # English-only recognizer (MASTER)
            self.textRecognizer = DoctrTextRecognizer(
                architecture="master",
                path_weights="doctrfiles/models/master-fde31e4a.pt",
                path_config_json="doctrfiles/models/master.json",
            )
        end_time = time.time()
        execution_time = end_time - start_time
        logger.info(f"time to initialize DoctrTextRecognizer: {execution_time} seconds")
    @staticmethod
    def save_detection(detected_lines_images: List[ImageType], prefix: str = "./res/test1/res_"):
        """Save each cropped line image to disk for visual inspection."""
        for i, img in enumerate(detected_lines_images):
            pilimg = Image.fromarray(img)
            pilimg.save(prefix + str(i) + ".png")
    @staticmethod
    def convert_coordinates(
        original_coord: NDArray[np.float32],
        detection_res: NDArray[np.float32],
    ) -> NDArray[np.float32]:
        """
        Map a word box detected inside a padded line crop back to page
        coordinates.

        original_coord is the line box in page coordinates:
            np.array([
                [xmin, ymin],
                [xmax, ymin],
                [xmax, ymax],
                [xmin, ymax],
            ])
        detection_res is the 4x2 word box inside the padded line crop.
        Returns a length-4 array [xmin, ymin, xmax, ymax] in page coordinates.
        """
        height = original_coord[3][1] - original_coord[0][1]
        width = original_coord[1][0] - original_coord[0][0]
        # Mirror the padding applied by cropImageExtraMargin: near-square
        # lines are padded to 3x the longer side, wider lines by MARGIN_FACTOR.
        bigger = max(height, width)
        if width / height < 1.6:
            new_height = int(bigger * 3)
            new_width = int(bigger * 3)
        else:
            new_height = int(bigger * MARGIN_FACTOR)
            new_width = int(bigger * MARGIN_FACTOR)
        # The line crop is centered in the padded image, so these offsets are
        # where the unpadded crop starts inside it.
        y_offset = (new_height - height) // 2
        x_offset = (new_width - width) // 2
        # Word box relative to the unpadded line crop
        rel = np.array(
            [
                [detection_res[0][0] - x_offset, detection_res[0][1] - y_offset],
                [detection_res[1][0] - x_offset, detection_res[1][1] - y_offset],
                [detection_res[2][0] - x_offset, detection_res[2][1] - y_offset],
                [detection_res[3][0] - x_offset, detection_res[3][1] - y_offset],
            ]
        )
        xmin = original_coord[0][0]
        ymin = original_coord[0][1]
        # Translate into page coordinates; this used to return a 4x2 array of
        # corners and now returns a flat [xmin, ymin, xmax, ymax].
        rel_in_page = np.array([xmin + rel[0][0], ymin + rel[0][1],
                                xmin + rel[1][0], ymin + rel[2][1]])
        return rel_in_page
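    # Worked example with hypothetical numbers: a line box at page coordinates
    # xmin=100, ymin=50 with width=200 and height=20 has width/height = 10,
    # so the crop is padded by MARGIN_FACTOR to int(200 * 1.4) = 280 per side,
    # giving x_offset = (280 - 200) // 2 = 40 and y_offset = (280 - 20) // 2 = 130.
    # A word detected at [[45, 132], [95, 132], [95, 148], [45, 148]] in the
    # padded crop becomes rel = [[5, 2], [55, 2], [55, 18], [5, 18]], which
    # maps to page coordinates [105, 52, 155, 68].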
    def predict(self, img: ImageType) -> Dict[Any, LineAnnotation]:
        """
        Run the full OCR pipeline on a page image.

        Returns a dict mapping annotation ids to LineAnnotation instances;
        the recognized words are attached to each line's .words list.
        """
        logger = getlogger("1")
        start_time = time.time()
        # bxs: text line detection results as 4x2 bounding boxes, e.g.
        #   np.array([[ 90.,  98.],
        #             [313., 100.],
        #             [312., 129.],
        #             [ 90., 127.]], dtype=np.float32)
        # with corners ordered [left_lower, right_lower, right_upper, left_upper].
        bxs: List[NDArray[np.float32]] = self.textlineDetector.predict(img=np.array(img))
        end_time = time.time()
        execution_time = end_time - start_time
        logger.info(f"time to detect text lines: {execution_time} seconds")
        line_annotations: Dict[Any, LineAnnotation] = {}
        straightboxs: List[NDArray[np.float32]] = []
        for points in bxs:
            # Axis-align the detected quadrilateral within the image bounds
            xmin, ymin, xmax, ymax = get_new_coord(img.shape[1], img.shape[0], points)
            b = np.array([
                [xmin, ymin],
                [xmax, ymin],
                [xmax, ymax],
                [xmin, ymax],
            ], dtype=np.float32)
            straightboxs.append(b)
            ann = LineAnnotation(box=[xmin, ymin, xmax, ymax])
            line_annotations[ann.index] = ann
"""
detected_lines_images : cropped images of detected lines
"""
# Double computation in line 117 - we calculate the straight lines again
#Straightboxes : 4x 2 array
detected_lines_images:List[ImageType] = cropImageExtraMargin(straightboxs, img,margin =MARGIN_FACTOR,straight=True)
#self.save_detection(detected_lines_images,prefix = './res/12June_two_Line_')
        start_time = time.time()
        word_annotations = []
        for line_id, lineimg in zip(line_annotations.keys(), detected_lines_images):
            original_coord = line_annotations[line_id].box
            xmin, ymin, xmax, ymax = original_coord
            original_coord_b = np.array([
                [xmin, ymin],
                [xmax, ymin],
                [xmax, ymax],
                [xmin, ymax],
            ], dtype=np.float32)
            # Word detection inside the line crop; each result carries a 4x2 box
            detection_results: List[Wordboxes] = self.wordDetector.predict(lineimg)
            input_Word_recog = {}
            for wordbox in detection_results:
                # crop_an_Image takes a 4x2 box and returns the word crop
                cropped_image = crop_an_Image(wordbox.box, lineimg)
                # Convert the word box from padded-crop coordinates back to
                # page coordinates as [xmin, ymin, xmax, ymax]
                coord_in_page = self.convert_coordinates(original_coord_b, wordbox.box)
                wordAnn = WordAnnotation(box=coord_in_page, text=None)
                word_uuid = wordAnn.index
                input_Word_recog[word_uuid] = [cropped_image, wordAnn]
                line_annotations[line_id].words.append(wordAnn)
            # input_Word_recog maps word annotation ids to
            # [cropped_image, WordAnnotation]; the recognizer fills in the text.
            word_annotations_in_line = self.textRecognizer.predict(input_Word_recog)
            word_annotations.append(word_annotations_in_line)
end_time = time.time()
execution_time = end_time - start_time
logger.info(f"Entire DocTR pipeline: {execution_time} seconds")
return line_annotations
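

# Minimal usage sketch (not part of the original module). Assumes a page image
# at the hypothetical path "sample_page.png", and that WordAnnotation.text is
# filled in by the recognizer, as the pipeline above suggests.
if __name__ == "__main__":
    page = np.array(Image.open("sample_page.png").convert("RGB"))
    ocr = OCRComponent1(englishflag=False)
    lines = ocr.predict(page)
    for line_id, line in lines.items():
        words = [w.text for w in line.words]
        print(line.box, words)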