Spaces:

atatavana
/

OCR_check

Build error

App Files Files Community

OCR_check / app.py

atatavana

Update app.py

358bc3a over 2 years ago

raw

history blame

4.13 kB

	# -*- coding: utf-8 -*-
	"""Untitled1.ipynb
	Automatically generated by Colaboratory.
	Original file is located at
	https://colab.research.google.com/drive/1J4fCr7TGzdFvkCeikMAQ5af5ml2Q83W0
	"""

	# Start-up dependency bootstrap: the CPU-only PyTorch wheels are installed
	# with pip before `torch` is imported further down.
	import os
	os.system('pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu')
	# `fitz` is the import name of PyMuPDF. `os` is re-imported here and once
	# more below; the repeats are harmless.
	import os, glob, fitz
	import cv2
	import os
	import PIL
	import torch
	import pandas as pd
	import numpy as np
	import gradio as gr
	from tqdm import tqdm
	from scipy import ndimage
	from PIL import Image, ImageDraw, ImageFont
	# PaddleOCR supplies both the OCR engine and the `draw_ocr` visualiser.
	import paddleocr
	from paddleocr import draw_ocr



	def unnormalize_box(bbox, width, height):
	    """Scale a box from the 0-1000 normalised grid back to image units.

	    Args:
	        bbox: Indexable whose first four items are ``x0, y0, x1, y1`` on a
	            0-1000 scale.
	        width: Image width; applied to the two x coordinates.
	        height: Image height; applied to the two y coordinates.

	    Returns:
	        list: ``[x0, y0, x1, y1]`` as floats in the units of ``width`` and
	        ``height``.

	    Raises:
	        IndexError: If ``bbox`` has fewer than four items.
	    """
	    x0, y0, x1, y1 = bbox[0], bbox[1], bbox[2], bbox[3]
	    return [
	        width * (x0 / 1000),
	        height * (y0 / 1000),
	        width * (x1 / 1000),
	        height * (y1 / 1000),
	    ]

	def imageconversion(pdffile):
	    """Render the first page of a PDF as a PIL image.

	    As a side effect the rendered page is also written to ``page.jpg`` in the
	    current working directory.

	    Args:
	        pdffile: Path to the PDF, or an object that exposes the path as
	            ``.name`` (for example an uploaded temporary-file wrapper).

	    Returns:
	        PIL.Image.Image: The first page as an RGB image.
	    """
	    # Path-like values are passed through untouched; anything else (e.g. an
	    # upload wrapper) is asked for its on-disk name.
	    if isinstance(pdffile, (str, os.PathLike)):
	        pdf_path = pdffile
	    else:
	        pdf_path = getattr(pdffile, "name", pdffile)

	    doc = fitz.open(pdf_path)
	    try:
	        page = doc.load_page(0)
	        zoom = 2  # zoom factor
	        mat = fitz.Matrix(zoom, zoom)
	        # NOTE(review): PyMuPDF documents that `matrix` is ignored when `dpi`
	        # is supplied, so the zoom above likely has no effect - confirm and
	        # drop one of the two.
	        pix = page.get_pixmap(matrix=mat, dpi=300)
	        image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
	        pix.save("page.jpg")
	    finally:
	        # Release the document handle even if rendering fails.
	        doc.close()
	    return image

	def process_image_pytesseract(image, width, height):
	    """Placeholder for the Pytesseract backend; performs no OCR.

	    An earlier draft obtained words and boxes through
	    ``LayoutLMv3FeatureExtractor(apply_ocr=True)``; that path is disabled, so
	    this function currently reports no detections.

	    Args:
	        image: The page image; handed back unchanged.
	        width: Unused; kept so the signature matches the PaddleOCR backend.
	        height: Unused; kept so the signature matches the PaddleOCR backend.

	    Returns:
	        tuple: ``(words, boxes, scores, image)`` where the first three items
	        are fresh empty lists.
	    """
	    # TODO: plug in a real Tesseract-based extractor.
	    return [], [], [], image

	def _polygon_to_normalized_box(polygon, width_scale, height_scale):
	    """Collapse (x, y) corner points into a scaled ``[left, top, right, bottom]``."""
	    xs = [point[0] for point in polygon]
	    ys = [point[1] for point in polygon]
	    return [
	        int(min(xs) * width_scale),
	        int(min(ys) * height_scale),
	        int(max(xs) * width_scale),
	        int(max(ys) * height_scale),
	    ]


	def process_image_PaddleOCR(image, width, height):
	    """Run PaddleOCR on an image and collect words, boxes and confidences.

	    Args:
	        image: PIL image to recognise.
	        width: Ignored; the size is re-read from ``image``.
	        height: Ignored; the size is re-read from ``image``.

	    Returns:
	        tuple: ``(words, boxes, scores, output_image)``. ``boxes`` holds one
	        ``[left, top, right, bottom]`` list per word on a 0-1000 grid, and
	        ``output_image`` is whatever ``draw_ocr`` returns for the annotated
	        page.
	    """
	    # NOTE: the engine is rebuilt on every call; hoist it to module level if
	    # the start-up cost becomes a problem.
	    ocr = paddleocr.PaddleOCR(lang='en', use_gpu=False, use_angle_cls=True)
	    width, height = image.size
	    width_scale = 1000 / width
	    height_scale = 1000 / height

	    # Perform OCR on the image
	    results = ocr.ocr(np.array(image))

	    words = []
	    boxes = []
	    scores = []
	    polygons = []  # detector output in pixel coordinates, kept for drawing
	    for page in results or []:
	        # A page with no detected text can come back as None.
	        if not page:
	            continue
	        for entry in page:
	            polygon, recognition = entry[0], entry[1]
	            words.append(recognition[0])
	            scores.append(recognition[1])
	            polygons.append(polygon)
	            # The original called `create_bounding_box1`, which is not defined
	            # in this file; the private helper above stands in for it.
	            boxes.append(_polygon_to_normalized_box(polygon, width_scale, height_scale))

	    # draw_ocr outlines the detector polygons on the page, so it must receive
	    # the pixel-space polygons rather than the normalised boxes.
	    output_image = draw_ocr(image, polygons, words, scores, font_path='coolvetica rg.otf')
	    return words, boxes, scores, output_image

	def createDataframe(boxes, words, scores):
	    """Build a table with one row per recognised word.

	    The three inputs are parallel lists and each becomes a column. Passing
	    them to ``pd.DataFrame`` as a list of rows would instead produce three
	    rows of N values, which clashes with the three column names whenever N
	    is not 3.

	    Args:
	        boxes: Bounding box for each word.
	        words: Recognised text for each word.
	        scores: Confidence for each word.

	    Returns:
	        pandas.DataFrame: Columns ``bbox``, ``text`` and ``score``.

	    Raises:
	        ValueError: If the three lists differ in length.
	    """
	    return pd.DataFrame({'bbox': boxes, 'text': words, 'score': scores})


	def completepreprocess(pdffile, ocr_type="PaddleOCR"):
	    """Convert the first PDF page to an image, OCR it and tabulate the words.

	    Args:
	        pdffile: The uploaded PDF, forwarded to ``imageconversion``.
	        ocr_type: Backend to use, ``"PaddleOCR"`` or ``"Pytesseract"``.
	            Defaults to ``"PaddleOCR"`` so single-argument calls keep working.

	    Returns:
	        tuple: ``(output_img, dataframe)`` - the image returned by the chosen
	        backend and the table built by ``createDataframe``.

	    Raises:
	        ValueError: If ``ocr_type`` names no known backend.
	    """
	    page_image = imageconversion(pdffile).convert("RGB")
	    width, height = page_image.size

	    if ocr_type == "PaddleOCR":
	        words, boxes, scores, output_img = process_image_PaddleOCR(page_image, width, height)
	    elif ocr_type == "Pytesseract":
	        words, boxes, scores, output_img = process_image_pytesseract(page_image, width, height)
	    else:
	        # Fail with a clear message instead of an unbound-variable error below.
	        raise ValueError("Unsupported OCR backend: %r" % (ocr_type,))

	    dataframe = createDataframe(boxes, words, scores)
	    return output_img, dataframe


	# ---- Gradio front end -------------------------------------------------
	title = "OCR outputs"
	description = ""

	# Give the image panes a fixed height.
	css = """.output_image, .input_image {height: 600px !important}"""

	# Example files are currently disabled. Candidates that were listed:
	#   461BHH69.PDF, AP-481-RF.PDF, DP-095-ML.PDF, DQ-231-LL.PDF, FK-941-ET.PDF,
	#   FL-078-NH.PDF, 14ZZ69.PDF, 74BCA69.PDF, 254BEG69.PDF, 761BJQ69.PDF,
	#   AB-486-EH.PDF, AZ-211-ZA.PDF, CY-073-YV.PDF, 744BJQ69.PDF, tarros_2.jpg

	# Inputs: the PDF to process and the OCR backend to run on it.
	pdf_input = gr.inputs.File(label="PDF")
	ocr_choice = gr.inputs.Dropdown(
	    label="Select the Open Source OCR",
	    choices=["PaddleOCR", "Pytesseract"],
	)

	# Outputs: the annotated page followed by the table of recognised words.
	annotated_output = gr.outputs.Image(type="pil", label="annotated image")

	iface = gr.Interface(
	    fn=completepreprocess,
	    inputs=[pdf_input, ocr_choice],
	    outputs=[annotated_output, "dataframe"],
	    title=title,
	    description=description,
	    css=css,
	    analytics_enabled=True,
	    enable_queue=True,
	)

	iface.launch(inline=False, debug=True)