# Hugging Face Space: aico/TrOCR-digit (status: Running). File size: 3,690 bytes.

import gradio as gr
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
import requests
from PIL import Image
import numpy as np
import cv2
# The pre/post-processing (image normalisation and token decoding) is taken
# from the base handwritten TrOCR checkpoint, while the encoder-decoder
# weights are loaded from the "aico/TrOCR-MNIST" checkpoint.
_PROCESSOR_CHECKPOINT = "microsoft/trocr-base-handwritten"
_MODEL_CHECKPOINT = "aico/TrOCR-MNIST"

processor = TrOCRProcessor.from_pretrained(_PROCESSOR_CHECKPOINT)
model = VisionEncoderDecoderModel.from_pretrained(_MODEL_CHECKPOINT)

def _group_rectangles(rec):
    """
    Uion intersecting rectangles.
    Args:
        rec - list of rectangles in form [x, y, w, h]
    Return:
        list of grouped ractangles 
    """
    tested = [False for i in range(len(rec))]
    final = []
    i = 0
    while i < len(rec):
        if not tested[i]:
            j = i+1
            while j < len(rec):
                if not tested[j] and intersect_area(rec[i], rec[j]):
                    rec[i] = union(rec[i], rec[j])
                    tested[j] = True
                    j = i
                j += 1
            final += [rec[i]]
        i += 1

    return final

def process_image(image):
    """
    Recognise the shapes drawn on the sketchpad, reading them left to right.

    The image is binarised with an inverted Otsu threshold, the bounding
    boxes of all contours are collected and sorted by their x coordinate,
    overlapping boxes are merged with _group_rectangles, and every remaining
    region is inverted, padded, resized to 28x28 and decoded on its own by
    the TrOCR model.

    Args:
        image - array received from the Gradio "sketchpad" input. It is
                passed to cv2.threshold with THRESH_OTSU, which OpenCV
                documents as accepting 8-bit single-channel input only.
                None is treated as an empty drawing.
    Return:
        string with the decoded text of all regions concatenated in
        left-to-right order ('' when there is nothing to decode)
    """
    if image is None:
        return ''

    roi_size = (28, 28)
    padding = 30

    thresh = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

    # Depending on the OpenCV version findContours returns either
    # (contours, hierarchy) or (image, contours, hierarchy).
    found = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    cnts = found[0] if len(found) == 2 else found[1]

    # Left-to-right reading order: sort the boxes by their x coordinate.
    bounding_boxes = sorted((cv2.boundingRect(c) for c in cnts), key=lambda box: box[0])

    # Drop the box that spans the whole frame; it would otherwise overlap,
    # and therefore swallow, every other box during grouping.
    frame_height, frame_width = image.shape[:2]
    whole_frame = (0, 0, frame_width, frame_height)
    candidates = [box for box in bounding_boxes if box != whole_frame]

    generated_text_list = []
    for (x, y, w, h) in _group_rectangles(candidates):
        roi = cv2.bitwise_not(thresh[y:y + h, x:x + w])
        padded = cv2.copyMakeBorder(roi, padding, padding, padding, padding,
                                    cv2.BORDER_CONSTANT, value=[0, 0, 0])
        resized = cv2.resize(padded, roi_size, interpolation=cv2.INTER_AREA)
        img = Image.fromarray(resized.astype('uint8')).convert("RGB")

        # Each region is decoded separately (default generation settings).
        pixel_values = processor(img, return_tensors="pt").pixel_values
        generated_ids = model.generate(pixel_values)
        generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        generated_text_list.append(generated_text)

    return ''.join(generated_text_list)

# Gradio front end: a sketchpad input whose drawing is passed to
# process_image, with the returned text shown in a label output.
title = "Interactive demo: Single Digits MNIST"
description = "Aico - University Utrecht"

iface = gr.Interface(
    fn=process_image,
    inputs="sketchpad",
    outputs="label",
    title=title,
    description=description,
)
iface.launch(debug=True)