Spaces:
Build error
Build error
from typing import Any, List, Literal, Mapping, Optional, Tuple | |
import time | |
from PIL import Image | |
# Numpy image type | |
import numpy.typing as npt | |
from numpy import uint8 | |
ImageType = npt.NDArray[uint8] | |
import numpy as np | |
import uuid | |
from doctrfiles import DoctrWordDetector,DoctrTextRecognizer,Wordboxes | |
from deepdoc import RagFlow | |
from utils import LineAnnotation,WordAnnotation,getlogger,cropImageExtraMargin,crop_an_Image,cropImages,get_new_coord | |
from numpy.typing import NDArray | |
# Padding factor handed to cropImageExtraMargin in predict() and reused by
# OCRComponent1.convert_coordinates() to undo that padding for wide lines.
MARGIN_FACTOR = 1.4


class OCRComponent1():
    """OCR pipeline built from three models.

    * ``RagFlow`` detects text lines on the page.
    * DocTR's word detector (``db_resnet50``) finds words inside each cropped line.
    * DocTR's text recognizer (``parseq`` or ``master``) is run on the word crops.
    """

    def __init__(self, englishflag=False):
        """Instantiate the three models and log how long each one takes to load.

        Args:
            englishflag: when False (default) the "parseq" recognizer is loaded
                with the multilingual weights; when True the "master"
                recognizer is loaded instead.
        """
        logger = getlogger("1")

        start_time = time.perf_counter()
        self.textlineDetector = RagFlow()
        execution_time = time.perf_counter() - start_time
        logger.info(f"time to initialize Ragflow: {execution_time} seconds")

        start_time = time.perf_counter()
        self.wordDetector = DoctrWordDetector(
            architecture="db_resnet50",
            path_weights="doctrfiles/models/db_resnet50-79bd7d70.pt",
            path_config_json="doctrfiles/models/db_resnet50_config.json",
        )
        execution_time = time.perf_counter() - start_time
        logger.info(f"time to initialize DoctrWordDetectorDebug: {execution_time} seconds")

        start_time = time.perf_counter()
        if not englishflag:
            self.textRecognizer = DoctrTextRecognizer(
                architecture="parseq",
                path_weights="doctrfiles/models/doctr-multilingual-parseq.bin",
                path_config_json="doctrfiles/models/multilingual-parseq-config.json",
            )
        else:
            self.textRecognizer = DoctrTextRecognizer(
                architecture="master",
                path_weights="doctrfiles/models/master-fde31e4a.pt",
                path_config_json="doctrfiles/models/master.json",
            )
        execution_time = time.perf_counter() - start_time
        logger.info(f"time to initialize DoctrTextRecognizer: {execution_time} seconds")

    @staticmethod
    def save_detection(detected_lines_images: "List[ImageType]", prefix='./res/test1/res_'):
        """Debug helper: write every image to ``<prefix><i>.png``.

        Declared static so it can be called as ``self.save_detection(...)``;
        without that, the instance would be bound to ``detected_lines_images``.

        Args:
            detected_lines_images: arrays accepted by ``PIL.Image.fromarray``.
            prefix: path prefix; the zero-based image index and ``.png`` are
                appended. The target directory is not created here.
        """
        for i, img in enumerate(detected_lines_images):
            Image.fromarray(img).save(prefix + str(i) + '.png')

    @staticmethod
    def _corners_from_bounds(xmin, ymin, xmax, ymax) -> NDArray[np.float32]:
        """Return the axis-aligned box as a 4x2 float32 array of corner points."""
        return np.array(
            [
                [xmin, ymin],
                [xmax, ymin],
                [xmax, ymax],
                [xmin, ymax],
            ],
            dtype=np.float32,
        )

    @staticmethod
    def convert_coordinates(original_coord: NDArray[np.float32],
                            detection_res: NDArray[np.float32]) -> NDArray[np.float32]:
        """Map a word box found in a padded line crop back to page coordinates.

        Declared static because it uses no instance state and predict() calls
        it as ``self.convert_coordinates(line_box, word_box)``.

        Args:
            original_coord: corners of the text line on the page, laid out as
                ``[[xmin, ymin], [xmax, ymin], [xmax, ymax], [xmin, ymax]]``.
            detection_res: four ``[x, y]`` corner points of the word box,
                expressed in the padded line crop.

        Returns:
            1-D array of four values: x and y of corner 0, x of corner 1 and
            y of corner 2 of ``detection_res``, shifted into page coordinates.
            This is ``[xmin, ymin, xmax, ymax]`` when ``detection_res`` uses
            the same corner order as ``original_coord``.
        """
        xmin = original_coord[0][0]
        ymin = original_coord[0][1]
        height = original_coord[3][1] - ymin
        width = original_coord[1][0] - xmin

        # The line is assumed to sit centred in a square canvas whose side is
        # a multiple of its longer edge: x3 for near-square lines, otherwise
        # MARGIN_FACTOR.
        # NOTE(review): presumably mirrors the padding in cropImageExtraMargin
        # -- confirm against utils.
        # A zero height takes the MARGIN_FACTOR branch (the same branch the
        # inf/nan ratio selects for NumPy scalars) instead of raising
        # ZeroDivisionError for plain Python numbers.
        if height != 0 and width / height < 1.6:
            scale = 3
        else:
            scale = MARGIN_FACTOR
        padded_side = int(max(height, width) * scale)

        # Position of the line's top-left corner inside the padded canvas.
        y_offset = (padded_side - height) // 2
        x_offset = (padded_side - width) // 2

        # Remove the padding offset, then translate by the line's page origin.
        return np.array([
            xmin + (detection_res[0][0] - x_offset),
            ymin + (detection_res[0][1] - y_offset),
            xmin + (detection_res[1][0] - x_offset),
            ymin + (detection_res[2][1] - y_offset),
        ])

    def predict(self, img: "ImageType") -> "Mapping[Any, LineAnnotation]":
        """Run line detection, word detection and text recognition on a page.

        Args:
            img: page image; ``img.shape[1]`` and ``img.shape[0]`` are passed
                to ``get_new_coord`` as the first two arguments.

        Returns:
            Mapping from each ``LineAnnotation.index`` to its annotation. Every
            word found in a line is appended to that annotation's ``words``.
            Only this mapping is returned; the recognizer's return value is
            not propagated.
        """
        logger = getlogger("1")

        start_time = time.perf_counter()
        # Per the original author's note, each detected line is a 4x2 float32
        # array of corner points.
        bxs = self.textlineDetector.predict(img=np.array(img))
        execution_time = time.perf_counter() - start_time
        logger.info(f"time to detecttextline: {execution_time} seconds")

        line_annotations = {}
        straightboxs = []
        for points in bxs:
            xmin, ymin, xmax, ymax = get_new_coord(img.shape[1], img.shape[0], points)
            straightboxs.append(self._corners_from_bounds(xmin, ymin, xmax, ymax))
            ann = LineAnnotation(box=[xmin, ymin, xmax, ymax])
            line_annotations[ann.index] = ann

        # One padded crop per detected line, in the same order as the boxes.
        detected_lines_images = cropImageExtraMargin(
            straightboxs, img, margin=MARGIN_FACTOR, straight=True)

        start_time = time.perf_counter()
        # Dict insertion order keeps annotations aligned with their crops.
        for line_ann, lineimg in zip(line_annotations.values(), detected_lines_images):
            xmin, ymin, xmax, ymax = line_ann.box
            original_coord_b = self._corners_from_bounds(xmin, ymin, xmax, ymax)

            detection_results = self.wordDetector.predict(lineimg)

            # annotation index -> [cropped word image, WordAnnotation]
            input_Word_recog = {}
            for wordbox in detection_results:
                cropped_image = crop_an_Image(wordbox.box, lineimg)
                # wordbox.box is relative to the padded crop; convert to page space.
                coord_in_page = self.convert_coordinates(original_coord_b, wordbox.box)
                wordAnn = WordAnnotation(box=coord_in_page, text=None)
                input_Word_recog[wordAnn.index] = [cropped_image, wordAnn]
                line_ann.words.append(wordAnn)

            # NOTE(review): the return value is unused; presumably the
            # recognizer fills in the text on the WordAnnotation objects it is
            # given -- confirm.
            self.textRecognizer.predict(input_Word_recog)

        execution_time = time.perf_counter() - start_time
        logger.info(f"Entire DocTR pipeline: {execution_time} seconds")
        return line_annotations