# -*- coding: utf-8 -*-
"""OCR check.

Gradio demo: the first page of the uploaded document is rendered to an image,
the selected OCR engine is run over it, and the annotated image is shown next
to a table of the recognised text.
"""

# os.system('pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu')
import os
import glob

import fitz
import cv2
import PIL
import pandas as pd
import numpy as np
import gradio as gr
from tqdm import tqdm
from scipy import ndimage
from PIL import Image, ImageDraw, ImageFont
import paddleocr
from paddleocr import draw_ocr


def unnormalize_box(bbox, width, height):
    """Rescale a box given on a 0-1000 scale to a ``width`` x ``height`` image.

    Args:
        bbox: Indexable with at least four numbers; items 0 and 2 are scaled
            by ``width``, items 1 and 3 by ``height``.
        width: Target image width.
        height: Target image height.

    Returns:
        A list with the four rescaled values, in the same order as ``bbox``.
    """
    return [
        width * (bbox[0] / 1000),
        height * (bbox[1] / 1000),
        width * (bbox[2] / 1000),
        height * (bbox[3] / 1000),
    ]


def imageconversion(pdffile):
    """Render the first page of ``pdffile`` to an RGB PIL image.

    As a side effect the rendered page is also written to ``page.jpg`` in the
    current working directory.

    Args:
        pdffile: Whatever ``fitz.open`` accepts as a document source.

    Returns:
        The rendered first page as a ``PIL.Image.Image`` in RGB mode.
    """
    zoom = 2  # zoom factor
    mat = fitz.Matrix(zoom, zoom)

    # The context manager closes the document even if rendering fails.
    with fitz.open(pdffile) as doc:
        page = doc.load_page(0)
        # NOTE(review): PyMuPDF documents that `matrix` is ignored when `dpi`
        # is given, so the zoom factor likely has no effect - confirm before
        # dropping it.
        pix = page.get_pixmap(matrix=mat, dpi=300)
        image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        pix.save("page.jpg")
    return image


def process_image_pytesseract(image, width, height):
    """Placeholder OCR backend that recognises nothing.

    Args:
        image: The page image; returned unchanged.
        width: Unused, kept for signature parity with the PaddleOCR backend.
        height: Unused, kept for signature parity with the PaddleOCR backend.

    Returns:
        A ``(words, boxes, scores, image)`` tuple whose three lists are empty.
    """
    # TODO: real OCR is not wired in yet. The earlier draft used
    # LayoutLMv3FeatureExtractor(apply_ocr=True) to obtain words and boxes.
    return [], [], [], image


def process_image_PaddleOCR(image, width, height):
    """Run PaddleOCR on ``image`` and draw the detections.

    Args:
        image: PIL image to analyse.
        width: Unused, kept for signature parity with the other backend.
        height: Unused, kept for signature parity with the other backend.

    Returns:
        A ``(words, boxes, scores, output_image)`` tuple: the three lists are
        parallel (one entry per detection) and ``output_image`` is whatever
        ``paddleocr.draw_ocr`` returns for them.
    """
    ocr = paddleocr.PaddleOCR(lang='en', use_gpu=False, use_angle_cls=True)

    # Perform OCR on the image
    results = ocr.ocr(np.array(image))

    # Extract the words and bounding boxes from the OCR results
    words = []
    boxes = []
    scores = []
    for line in results or []:
        # A page without any detection can come back as None (or empty);
        # iterating it directly would raise TypeError.
        if not line:
            continue
        for detection in line:
            # Each detection is indexed as [box, (text, score)].
            boxes.append(detection[0])
            words.append(detection[1][0])
            scores.append(detection[1][1])

    output_image = draw_ocr(image, boxes, words, scores,
                            font_path='coolvetica rg.otf')
    return words, boxes, scores, output_image


def createDataframe(boxes, words, scores):
    """Build a table with one row per detection.

    Args:
        boxes: Bounding boxes, one per detection.
        words: Recognised strings, parallel to ``boxes``.
        scores: Recognition scores, parallel to ``boxes``.

    Returns:
        A ``pandas.DataFrame`` with columns ``bbox``, ``text`` and ``score``.
    """
    rows = list(zip(boxes, words, scores))
    return pd.DataFrame(rows, columns=['bbox', 'text', 'score'])


def completepreprocess(pdffile, ocr_type):
    """Gradio callback: render the upload, run OCR, return image and table.

    Args:
        pdffile: The uploaded file as delivered by the Gradio ``File`` input.
        ocr_type: ``"Pytesseract"`` selects the placeholder backend; any other
            value (including no selection) uses PaddleOCR.

    Returns:
        A ``(output_img, dataframe)`` tuple for the two Gradio outputs.

    Raises:
        ValueError: If no file was provided.
    """
    if pdffile is None:
        raise ValueError("No file was uploaded.")

    image = imageconversion(pdffile).convert("RGB")
    width, height = image.size

    # PaddleOCR is both an explicit choice and the fallback.
    if ocr_type == "Pytesseract":
        run_ocr = process_image_pytesseract
    else:
        run_ocr = process_image_PaddleOCR
    words, boxes, scores, output_img = run_ocr(image, width, height)

    dataframe = createDataframe(boxes, words, scores)
    return output_img, dataframe


title = "OCR outputs"
description = ""

css = """.output_image, .input_image {height: 600px !important}"""

# Earlier example sets, kept for reference:
# examples = [["461BHH69.PDF"],["AP-481-RF.PDF"],["DP-095-ML.PDF"],["DQ-231-LL.PDF"],["FK-941-ET.PDF"], ["FL-078-NH.PDF"]
#             ,["14ZZ69.PDF"],["74BCA69.PDF"],["254BEG69.PDF"],["761BJQ69.PDF"],["AB-486-EH.PDF"],["AZ-211-ZA.PDF"], ["CY-073-YV.PDF"]]
#             ["744BJQ69.PDF"], ['tarros_2.jpg'],
examples = [['3.jpg']]

iface = gr.Interface(
    fn=completepreprocess,
    # inputs=gr.inputs.Image(type="pil",optional=True,label="upload file"),
    inputs=[
        gr.inputs.File(label="PDF"),
        gr.inputs.Dropdown(label="Select the OCR",
                           choices=["PaddleOCR", "Pytesseract"]),
    ],
    # inputs=gr.inputs.Image(type="pil")
    outputs=[gr.outputs.Image(type="pil", label="annotated image"),
             "dataframe"],
    title=title,
    description=description,
    examples=examples,
    css=css,
    analytics_enabled=True,
    enable_queue=True,
)

# Launched at import time, as in the original script.
iface.launch(inline=False, debug=True)