Spaces:

Raj-Master
/

PDF_DATA_EXTRACTOR

Runtime error

App Files Files Community

PDF_DATA_EXTRACTOR / app.py

Raj-Master

Upload 2 files

5a3dc06 verified over 1 year ago

raw

history blame

1.91 kB

	import gradio as gr
	from pdf2image import convert_from_path
	from paddleocr import PaddleOCR
	from numpy import asarray
	import gradio as gr
	from gradio_pdf import PDF
	from pdf2image import convert_from_path
	from transformers import pipeline
	from pathlib import Path

	# Module-level PaddleOCR engine, created once at import time and used by p().
	# NOTE(review): lang="ch" presumably selects PaddleOCR's Chinese model --
	# confirm it suits the documents being processed.
	ocr = PaddleOCR(
	    use_angle_cls=True,
	    lang="ch",
	    ocr_version="PP-OCRv3",
	)

	def p(image, question):
	    """Run OCR on one page image and return its text as a single string.

	    Args:
	        image: Page image; it is converted with ``numpy.asarray`` before
	            being handed to the module-level ``ocr`` engine.
	        question: Not used by this function; accepted so callers can keep
	            passing the user's question along.

	    Returns:
	        The recognized text fragments joined with single spaces, or an
	        empty string when OCR reports no text for the image.
	    """
	    result = ocr.ocr(asarray(image), cls=True)
	    # The engine may return an empty list, or a list holding None/[] for an
	    # image with no detected text; iterating that would raise TypeError.
	    if not result or not result[0]:
	        return ""
	    # Each entry is indexed as line[1][0] to pull out the recognized string.
	    return " ".join(line[1][0] for line in result[0])

	# up_file="/home/raj/Downloads/ICBC Aviation Leasing Company Limited_ND2A_220808.pdf"
	# images = convert_from_path(up_file, fmt="jpeg")

	# output = ""
	# for idx, image in enumerate(images, start=1):
	# # result = reader.readtext(image, detail = 0)
	# # ocr_text = " ".join(result)
	# result = ocr.ocr(asarray(image), cls=True)
	# ocr_text = " ".join([line[1][0] for line in result[0]])

	# new_prompt = f"""
	# {ocr_text}

	# Above is OCR'ed text from a form PDF file.

	# List out all the form key value which have data. Don't include fields that are empty.
	# """
	# # llm_output = llm(prompt=new_prompt)

	# # output += f"Page {idx}\n"
	# # output += llm_output
	# output += "\n\n "

	# output=new_prompt
	# print(output)

	# Directory containing this script; not referenced by the code visible in this file.
	dir_ = Path(__file__).parent

	# p = pipeline(
	# "document-question-answering",
	# model="impira/layoutlm-document-qa",
	# )

	def qa(question: str, doc: str) -> str:
	    """OCR every page of a PDF and return the concatenated page texts.

	    Args:
	        question: Forwarded to ``p`` for each page.
	        doc: Path of the PDF file to read.

	    Returns:
	        Each page's text preceded by a newline, in page order; an empty
	        string when no document was supplied.
	    """
	    # Nothing to convert when the caller supplies no file (None or "").
	    if not doc:
	        return ""
	    pages = convert_from_path(doc)
	    # Keep the original output shape: a newline before every page's text.
	    return "".join("\n" + p(page, question) for page in pages)


	# Gradio UI: a question textbox and a PDF upload are passed to qa(); its
	# return value is shown in a textbox.
	demo = gr.Interface(
	    fn=qa,
	    inputs=[gr.Textbox(label="Question"), PDF(label="Document")],
	    outputs=gr.Textbox(),
	)

	# Start the app as soon as this module is executed.
	demo.launch()
	# def greet(name):
	# return "Hello " + name + "!!"

	# iface = gr.Interface(fn=greet, inputs="text", outputs="text")
	# iface.launch()