File size: 3,669 Bytes
03484ca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d8fc179
03484ca
 
 
 
d8fc179
03484ca
 
d8fc179
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
from paddleocr import PaddleOCR
from vietocr.tool.config import Cfg
from vietocr.tool.predictor import Predictor

import cv2
import requests
import unidecode
import numpy as np
from PIL import Image, ImageFont, ImageDraw

class OCRDetector:
    """Two-stage OCR helper combining PaddleOCR and VietOCR.

    PaddleOCR (English model, angle classifier disabled) locates text boxes
    and gives a first reading of each one. VietOCR (``vgg_transformer``
    config) then re-reads every box, and its reading is preferred whenever it
    contains characters that ``unidecode`` would change (i.e. non-ASCII text
    such as Vietnamese diacritics).
    """

    # Default asset locations, resolved relative to the working directory.
    DEFAULT_WEIGHTS_PATH = "./storage/ocr_model.pth"
    FONT_PATH = "./storage/Roboto-Black.ttf"
    # Seconds to wait for a remote image; without a timeout ``requests`` can
    # block forever on an unresponsive host.
    REQUEST_TIMEOUT = 30

    def __init__(self, weights_path=DEFAULT_WEIGHTS_PATH, device="cpu") -> None:
        """Load both OCR models.

        Args:
            weights_path: Path to the VietOCR weights file.
            device: Device string handed to VietOCR (``"cpu"`` by default).
        """
        self.paddle_ocr = PaddleOCR(lang='en', use_angle_cls=False)
        self.config = Cfg.load_config_from_name('vgg_transformer')
        self.config['weights'] = weights_path
        self.config['cnn']['pretrained'] = False
        self.config['device'] = device
        self.config['predictor']['beamsearch'] = False
        self.viet_ocr = Predictor(self.config)

    def find_box(self, image):
        """Detect text boxes in ``image`` with the PaddleOCR model.

        Args:
            image: Image passed straight through to ``PaddleOCR.ocr``.

        Returns:
            A ``(boxes, texts)`` pair of equal-length lists. ``boxes`` holds
            the first element of every detection; ``texts`` holds
            ``{"text": ..., "score": ...}`` dicts built from the second
            element. Both lists are empty when nothing was detected.
        """
        result = self.paddle_ocr.ocr(image, cls=False)
        # Only the first (single) input image is of interest. PaddleOCR
        # reports an image without any text as None, which must not be
        # iterated.
        detections = result[0] if result else None
        if not detections:
            return [], []

        boxes = [detection[0] for detection in detections]
        texts = [
            {"text": detection[1][0], "score": detection[1][1]}
            for detection in detections
        ]
        return boxes, texts

    def vietnamese_text(self, boxes, image):
        """Read the text inside each box with the VietOCR model.

        Args:
            boxes: Iterable of four-point boxes ``[A, B, C, D]`` with ``(x, y)``
                points -- presumably top-left, top-right, bottom-right,
                bottom-left as produced by ``find_box``; confirm if boxes
                come from elsewhere.
            image: Array indexable as ``image[y1:y2, x1:x2]``.

        Returns:
            One ``{"text": ..., "score": ...}`` dict per box, in the same
            order as ``boxes``. A box whose crop is empty yields
            ``{"text": "", "score": 0.0}`` so the result stays aligned with
            ``boxes``.
        """
        texts = []
        for box in boxes:
            A, B, C, D = box[0], box[1], box[2], box[3]
            # Vertical padding of up to 10 px, reduced by the y-offset
            # between the first two corners.
            pad = max(0, 10 - abs(A[1] - B[1]))
            y1 = int(max(0, min(A[1], B[1]) - pad))
            y2 = int(max(C[1], D[1]) + pad)
            x1 = int(max(0, min(A[0], D[0])))
            x2 = int(max(B[0], C[0]))

            # Inverted or negative extents would make the slice wrap around
            # or come out empty, which the recogniser cannot handle.
            if x2 <= x1 or y2 <= y1:
                texts.append({"text": "", "score": 0.0})
                continue
            cut_image = image[y1:y2, x1:x2]
            if cut_image.size == 0:
                # Box lies entirely outside the image.
                texts.append({"text": "", "score": 0.0})
                continue

            cut_image = Image.fromarray(np.uint8(cut_image))
            text, score = self.viet_ocr.predict(cut_image, return_prob=True)
            texts.append({"text": text, "score": score})
        return texts

    def text_detector(self, image_path, is_local=False):
        """Run the full pipeline on one image and merge both models' output.

        Args:
            image_path: Filesystem path when ``is_local`` is true, otherwise
                a URL fetched with ``requests``.
            is_local: Whether ``image_path`` refers to a local file.

        Returns:
            ``(image, texts, boxes)`` where ``image`` is the RGB image as a
            NumPy array. ``texts`` and ``boxes`` are both ``None`` when no
            text box was found.
        """
        if is_local:
            with Image.open(image_path) as pil_image:
                image = np.array(pil_image.convert("RGB"))
        else:
            with requests.get(
                image_path, stream=True, timeout=self.REQUEST_TIMEOUT
            ) as response:
                # Let urllib3 undo any Content-Encoding before PIL reads
                # the raw stream.
                response.raw.decode_content = True
                image = np.array(Image.open(response.raw).convert("RGB"))

        boxes, paddle_texts = self.find_box(image)
        if not boxes:
            return image, None, None

        viet_texts = self.vietnamese_text(boxes, image)
        # Keep the VietOCR reading only when it contains characters that
        # unidecode would alter; otherwise fall back to PaddleOCR's reading.
        results_texts = [
            viet if viet["text"] != unidecode.unidecode(viet["text"]) else paddle
            for viet, paddle in zip(viet_texts, paddle_texts)
        ]
        return image, results_texts, boxes

    def visualize_ocr(self, image, texts, boxes):
        """Draw every box and its recognised text onto a copy of ``image``.

        Args:
            image: Image array; returned unchanged when ``texts`` is empty
                or ``None``.
            texts: Sequence of dicts with a ``"text"`` key, parallel to
                ``boxes``.
            boxes: Sequence of four-point boxes; the first and third points
                are used as opposite rectangle corners.

        Returns:
            The annotated copy, or ``image`` itself when there is nothing to
            draw.
        """
        if not texts:
            return image

        img = image.copy()
        fonts = {}  # font size -> loaded font, so the TTF is read once per size
        for box, text in zip(boxes, texts):
            (x1, y1), _, (x3, y3), _ = box

            h = y3 - y1
            # int() keeps the size integral even when coordinates are floats.
            font_size = 22 * max(int(h // 1000), 1)
            font = fonts.get(font_size)
            if font is None:
                font = ImageFont.truetype(self.FONT_PATH, font_size)
                fonts[font_size] = font

            img = cv2.rectangle(
                img, (int(x1), int(y1)), (int(x3), int(y3)), (0, 255, 0), 1
            )

            # Text goes through PIL so the TrueType font (and therefore
            # non-ASCII glyphs) can be rendered.
            img_pil = Image.fromarray(img)
            draw = ImageDraw.Draw(img_pil)
            draw.text(
                (int(x1), int(y1 - h // 2)),
                text["text"],
                font=font,
                fill=(255, 51, 51),
            )
            img = np.array(img_pil)
        return img