from typing import Any, List, Literal, Mapping, Optional, Tuple
import time
from PIL import Image
# Numpy image type
import numpy.typing as npt
from numpy import uint8
ImageType = npt.NDArray[uint8]
import numpy as np
import uuid
from doctrfiles import DoctrWordDetector, DoctrTextRecognizer, Wordboxes
from deepdoc import RagFlow
from utils import LineAnnotation, WordAnnotation, getlogger, cropImageExtraMargin, crop_an_Image, cropImages, get_new_coord
from numpy.typing import NDArray

MARGIN_FACTOR = 1.4


class OCRComponent1:
    """
    This component uses RagFlow as the text line detector and
    DocTR's word detector and text recognizer.
    """

    def __init__(self, englishflag=False):
        logger = getlogger("1")

        start_time = time.time()
        self.textlineDetector = RagFlow()
        end_time = time.time()
        execution_time = end_time - start_time
        logger.info(f"time to initialize RagFlow: {execution_time} seconds")

        start_time = time.time()
        # Earlier initialization, kept for reference (no explicit config):
        # self.wordDetector = DoctrWordDetector(architecture="db_resnet50",
        #                                       path_weights="doctrfiles/models/db_resnet50-79bd7d70.pt")
        self.wordDetector = DoctrWordDetector(
            architecture="db_resnet50",
            path_weights="doctrfiles/models/db_resnet50-79bd7d70.pt",
            path_config_json="doctrfiles/models/db_resnet50_config.json")
        end_time = time.time()
        execution_time = end_time - start_time
        logger.info(f"time to initialize DoctrWordDetector: {execution_time} seconds")

        start_time = time.time()
        if not englishflag:
            self.textRecognizer = DoctrTextRecognizer(
                architecture="parseq",
                path_weights="doctrfiles/models/doctr-multilingual-parseq.bin",
                path_config_json="doctrfiles/models/multilingual-parseq-config.json")
        else:
            self.textRecognizer = DoctrTextRecognizer(
                architecture="master",
                path_weights="doctrfiles/models/master-fde31e4a.pt",
                path_config_json="doctrfiles/models/master.json")
        end_time = time.time()
        execution_time = end_time - start_time
        logger.info(f"time to initialize DoctrTextRecognizer: {execution_time} seconds")

    @staticmethod
    def save_detection(detected_lines_images: List[ImageType], prefix='./res/test1/res_'):
        for i, img in enumerate(detected_lines_images):
            pilimg = Image.fromarray(img)
            pilimg.save(prefix + str(i) + '.png')

    @staticmethod
    def convert_coordinates(original_coord: NDArray[np.float32],
                            detection_res: NDArray[np.float32]) -> NDArray[np.float32]:
        """
        Shape of original_coord (axis-aligned 4x2 box):
        np.array([
            [xmin, ymin],
            [xmax, ymin],
            [xmax, ymax],
            [xmin, ymax]
        ])
        detection_res holds the 4x2 corners of a word box detected inside the
        padded line crop; the result is that box in page coordinates.
        """
        height = original_coord[3][1] - original_coord[0][1]
        width = original_coord[1][0] - original_coord[0][0]
        # Mirror the padding applied when the line crop was created: nearly
        # square lines are padded to 3x the longer side, others to
        # MARGIN_FACTOR times the longer side.
        if width / height < 1.6:
            bigger = max(height, width)
            new_height = int(bigger * 3)
            new_width = int(bigger * 3)
        else:
            bigger = max(height, width)
            new_height = int(bigger * MARGIN_FACTOR)
            new_width = int(bigger * MARGIN_FACTOR)
        y_offset = (new_height - height) // 2
        x_offset = (new_width - width) // 2
        # The crop was placed into the padded image as
        # new_img[y_offset:y_offset + height, x_offset:x_offset + width] = dst_img,
        # so x_offset and y_offset are the min x and y of the crop inside it.
        # Coordinates relative to the unpadded line crop:
        rel = np.array(
            [
                [detection_res[0][0] - x_offset, detection_res[0][1] - y_offset],
                [detection_res[1][0] - x_offset, detection_res[1][1] - y_offset],
                [detection_res[2][0] - x_offset, detection_res[2][1] - y_offset],
                [detection_res[3][0] - x_offset, detection_res[3][1] - y_offset],
            ]
        )
        xmin = original_coord[0][0]
        ymin = original_coord[0][1]
        # This used to return a 4x2 array:
        # rel_in_page = [[xmin + b[0], ymin + b[1]] for b in rel]
        # Now it returns a flat [xmin, ymin, xmax, ymax] array.
        rel_in_page = np.array([xmin + rel[0][0], ymin + rel[0][1],
                                xmin + rel[1][0], ymin + rel[2][1]])
        return rel_in_page
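
    # Worked example of the arithmetic above (a sketch, assuming the line crop
    # is centered in the padded image exactly as the offsets imply): for a
    # line box [[10, 20], [110, 20], [110, 50], [10, 50]] we get width = 100
    # and height = 30, so width / height > 1.6 and the padded crop is
    # int(100 * 1.4) = 140 pixels square. Then x_offset = (140 - 100) // 2 = 20
    # and y_offset = (140 - 30) // 2 = 55, so a word corner detected at
    # (25, 60) in the padded crop maps to (25 - 20, 60 - 55) = (5, 5) in the
    # line crop and to (10 + 5, 20 + 5) = (15, 25) on the page.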
    def predict(self, img: ImageType) -> Mapping[Any, LineAnnotation]:
        logger = getlogger("1")

        start_time = time.time()
        # bxs: text line detection results (bounding boxes).
        # Each element is a 4x2 array such as:
        # array([[ 90.,  98.],
        #        [313., 100.],
        #        [312., 129.],
        #        [ 90., 127.]], dtype=float32)
        # i.e. [top-left, top-right, bottom-right, bottom-left] in image
        # coordinates (y increases downward).
        bxs: List[NDArray[np.float32]] = self.textlineDetector.predict(img=np.array(img))
        end_time = time.time()
        execution_time = end_time - start_time
        logger.info(f"time to detect text lines: {execution_time} seconds")

        line_annotations = {}
        straightboxs = []
        for points in bxs:
            xmin, ymin, xmax, ymax = get_new_coord(img.shape[1], img.shape[0], points)
            b = np.array([
                [xmin, ymin],
                [xmax, ymin],
                [xmax, ymax],
                [xmin, ymax]
            ], dtype=np.float32)
            straightboxs.append(b)
            ann = LineAnnotation(box=[xmin, ymin, xmax, ymax])
            line_annotations[ann.index] = ann

        # detected_lines_images: cropped images of the detected lines.
        # NOTE: duplicated work - cropImageExtraMargin recomputes the straight
        # boxes. straightboxs is a list of 4x2 arrays.
        detected_lines_images: List[ImageType] = cropImageExtraMargin(
            straightboxs, img, margin=MARGIN_FACTOR, straight=True)
        # self.save_detection(detected_lines_images, prefix='./res/12June_two_Line_')

        start_time = time.time()
        word_annotations = []
        # viz_word_detection = []
        for line_uuid, lineimg in zip(line_annotations.keys(), detected_lines_images):
            original_coord = line_annotations[line_uuid].box
            xmin, ymin, xmax, ymax = original_coord
            original_coord_b = np.array([
                [xmin, ymin],
                [xmax, ymin],
                [xmax, ymax],
                [xmin, ymax]
            ], dtype=np.float32)
            # Word detection results; each wordbox.box is a 4x2 array.
            detection_results: List[Wordboxes] = self.wordDetector.predict(lineimg)
            input_Word_recog = {}
            for wordbox in detection_results:
                # crop_an_Image expects a 4x2 box; verify this matches the
                # form the word detector returns.
                cropped_image = crop_an_Image(wordbox.box, lineimg)
                # Convert wordbox.box (relative to the padded line crop) back
                # to page coordinates.
                # original_coord_b: 4x2 array; coord_in_page: [xmin, ymin, xmax, ymax]
                coord_in_page = self.convert_coordinates(original_coord_b, wordbox.box)
                wordAnn = WordAnnotation(box=coord_in_page, text=None)
                word_uuid = wordAnn.index
                input_Word_recog[word_uuid] = [cropped_image, wordAnn]
                line_annotations[line_uuid].words.append(wordAnn)
                # viz_word_detection.append(cropped_image)
            # input_Word_recog holds only word detections: a dict mapping the
            # word annotation uuid to [cropped_image, WordAnnotation].
            word_annotations_in_line = self.textRecognizer.predict(input_Word_recog)
            word_annotations.append(word_annotations_in_line)
        # self.save_detection(viz_word_detection, prefix='./res/test4/rel_page_')
        end_time = time.time()
        execution_time = end_time - start_time
        logger.info(f"Entire DocTR pipeline: {execution_time} seconds")
        # The recognized words hang off each line's .words list, so returning
        # the line annotations alone carries the full result.
        return line_annotations
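

# Minimal usage sketch, not part of the pipeline: it assumes the weight files
# referenced above are present, that "page.png" is a hypothetical page image
# readable by PIL, and that DoctrTextRecognizer.predict fills in each
# WordAnnotation's text in place.
if __name__ == "__main__":
    component = OCRComponent1(englishflag=False)
    page = np.asarray(Image.open("page.png").convert("RGB"), dtype=uint8)
    lines = component.predict(page)
    for line in lines.values():
        # Each LineAnnotation carries its box and the recognized words.
        print(line.box, [word.text for word in line.words])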