# -*- coding: utf-8 -*-
"""Gradio app: OCR every page of an uploaded PDF, annotate the pages with
model predictions, and return the merged image plus a prediction dataframe.

Originally auto-generated by Colaboratory:
https://colab.research.google.com/drive/1J4fCr7TGzdFvkCeikMAQ5af5ml2Q83W0
"""

import os

# Install CPU-only torch at startup (kept from the original notebook export).
os.system('pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu')

import glob

import cv2
import fitz
import gradio as gr
import numpy as np
import pandas as pd
import PIL
import torch
from PIL import Image, ImageDraw, ImageFont
from scipy import ndimage
from tqdm import tqdm


def unnormalize_box(bbox, width, height):
    """Scale a 0-1000-normalized [x0, y0, x1, y1] box to pixel coordinates.

    Parameters
    ----------
    bbox : sequence of 4 numbers
        Box in the 0-1000 normalized space used by layout models.
    width, height : int
        Pixel dimensions of the target image.

    Returns
    -------
    list[float]
        [x0, y0, x1, y1] in pixels.
    """
    return [
        width * (bbox[0] / 1000),
        height * (bbox[1] / 1000),
        width * (bbox[2] / 1000),
        height * (bbox[3] / 1000),
    ]


def imageconversion(pdffile):
    """Render the first page of *pdffile* as an RGB PIL image.

    Side effect: also writes the rendered page to "page.jpg" in the
    working directory (kept from the original behavior).
    """
    doc = fitz.open(pdffile)
    page = doc.load_page(0)
    zoom = 2  # zoom factor
    mat = fitz.Matrix(zoom, zoom)
    pix = page.get_pixmap(matrix=mat, dpi=300)
    image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    pix.save("page.jpg")
    return image


def completepreprocess(pdffile, ocr_type):
    """Run OCR + model inference over every page of *pdffile*.

    Bug fix: the original signature was ``completepreprocess(pdffile)`` and
    read an undefined global ``ocr_type``, while the Gradio interface passes
    TWO inputs (file + dropdown) — every call would have raised TypeError.
    The OCR engine choice is now an explicit parameter.

    Parameters
    ----------
    pdffile : str or file-like
        The uploaded PDF (as handed over by ``gr.inputs.File``).
    ocr_type : str
        Either "PaddleOCR" or "Pytesseract" — which OCR engine to use for
        the initial word/box extraction.

    Returns
    -------
    (PIL.Image.Image, pandas.DataFrame)
        The vertically merged annotated page images and the combined
        per-page prediction dataframe.
    """
    first_page = imageconversion(pdffile).convert("RGB")
    width, height = first_page.size

    # Initial OCR pass on the first page. NOTE(review): these results are
    # overwritten per page by process_image() below — kept for parity with
    # the original flow; confirm whether the chosen engine is used downstream.
    if ocr_type == "PaddleOCR":
        words, boxes = process_image_PaddleOCR(first_page, width, height)
    elif ocr_type == "Pytesseract":
        words, boxes = process_image_pytesseract(first_page, width, height)

    page_files = []   # annotated page JPEGs, in page order
    page_frames = []  # one prediction DataFrame per page
    doc = fitz.open(pdffile)
    for i in range(len(doc)):
        page = doc.load_page(i)
        mat = fitz.Matrix(2, 2)  # zoom factor 2
        pix = page.get_pixmap(matrix=mat, dpi=200)
        page_name = "page" + str(i) + ".jpg"
        pix.save(page_name)
        image = Image.open(page_name).convert("RGB")

        # Model inference + visualization for this page.
        bbox, preds, words, image = process_image(image)
        im, df = visualize_image(bbox, preds, words, image)
        im.save(page_name)  # overwrite the raw render with the annotated one
        page_files.append(page_name)

        pred_list = [iob_to_label(number) for number in preds]
        _bbox, _preds, _words = process_form(pred_list, words, bbox)
        print('page: ' + str(i) + ' ' + str(len(_preds)) + ' ' + str(len(_words)))
        page_frames.append(createDataframe(_preds, _words))

    # DataFrame.append was removed in pandas 2.x; concatenate once instead of
    # re-copying the accumulator on every page.
    myDataFrame = pd.concat(page_frames) if page_frames else pd.DataFrame()

    im2 = mergeImageVertical(page_files)
    return im2, myDataFrame


title = "OCR outputs"
description = ""
css = """.output_image, .input_image {height: 600px !important}"""

iface = gr.Interface(
    fn=completepreprocess,
    inputs=[
        gr.inputs.File(label="PDF"),
        gr.inputs.Dropdown(
            label="Select the Open Source OCR",
            choices=["PaddleOCR", "Pytesseract"],
        ),
    ],
    outputs=[gr.outputs.Image(type="pil", label="annotated image"), "dataframe"],
    title=title,
    description=description,
    css=css,
    analytics_enabled=True,
    enable_queue=True,
)

iface.launch(inline=False, debug=True)