Spaces:

Raj-Master
/

PDF_DATA_EXTRACTOR

Runtime error

File size: 1,913 Bytes

5a3dc06

import gradio as gr
from pdf2image import convert_from_path
from paddleocr import PaddleOCR
from numpy import asarray
import gradio as gr
from gradio_pdf import PDF
from pdf2image import convert_from_path
from transformers import pipeline
from pathlib import Path

# One OCR engine for the whole process: built once at import time and reused
# by every call to p().
ocr = PaddleOCR(
    use_angle_cls=True,
    lang="ch",
    ocr_version="PP-OCRv3",
)

def p(image, question):
    """Run OCR on a single page image and return its text as one string.

    Args:
        image: Page image; it is converted with ``numpy.asarray`` before
            being handed to the OCR engine.
        question: Not used by this function. Kept so existing callers that
            pass it keep working.

    Returns:
        The recognised text fragments joined by single spaces, or an empty
        string when the OCR engine found no text on the page.
    """
    result = ocr.ocr(asarray(image), cls=True)

    # Only one image is submitted, so only the first entry of the result is
    # relevant. PaddleOCR reports a page without any detected text as None
    # (an empty list in older releases); iterating that directly would raise
    # TypeError, so treat it as "no text".
    page_lines = result[0] if result else None
    if not page_lines:
        return ""

    # Each entry is indexed as entry[1][0] to obtain its text
    # (per the PaddleOCR docs an entry is [box, (text, score)]).
    return " ".join(entry[1][0] for entry in page_lines)

# up_file="/home/raj/Downloads/ICBC Aviation Leasing Company Limited_ND2A_220808.pdf"
# images = convert_from_path(up_file, fmt="jpeg")

# output = ""
# for idx, image in enumerate(images, start=1):
#     # result = reader.readtext(image, detail = 0)
#     # ocr_text = " ".join(result)
#     result = ocr.ocr(asarray(image), cls=True)
#     ocr_text = " ".join([line[1][0] for line in result[0]])

#     new_prompt = f"""
#     {ocr_text}

#     Above is OCR'ed text from a form PDF file.

#     List out all the form key value which have data. Don't include fields that are empty.
#     """
#     # llm_output = llm(prompt=new_prompt)

#     # output += f"Page {idx}\n"
#     # output += llm_output
#     output += "\n\n                                                                "

#     output=new_prompt
# print(output)

# Directory that contains this script.
# NOTE(review): nothing in the visible part of the file reads dir_ — confirm
# whether it is still needed.
dir_ = Path(__file__).parent

# p = pipeline(
#     "document-question-answering",
#     model="impira/layoutlm-document-qa",
# )

def qa(question: str, doc: str) -> str:
    """Run OCR over every page of a PDF and return the recognised text.

    Args:
        question: Text from the "Question" input. It is forwarded to ``p``
            for every page.
        doc: Filesystem path of the PDF to read.

    Returns:
        The text ``p`` produces for each page, in page order, with pages
        separated by a newline. An empty string when no document was
        supplied.
    """
    # No document supplied (e.g. the form was submitted without a PDF):
    # return early instead of letting convert_from_path fail on a missing
    # path.
    if not doc:
        return ""

    pages = convert_from_path(doc)
    # Join once rather than growing a string page by page; this also avoids
    # a stray blank line before the first page's text.
    return "\n".join(p(page, question) for page in pages)

 
# Web UI: a question box and a PDF upload are passed to qa, and the string it
# returns is shown in a plain text box.
demo = gr.Interface(
    fn=qa,
    inputs=[
        gr.Textbox(label="Question"),
        PDF(label="Document"),
    ],
    outputs=gr.Textbox(),
)

# Start the app as soon as the script is executed.
demo.launch()
# def greet(name):
#     return "Hello " + name + "!!"

# iface = gr.Interface(fn=greet, inputs="text", outputs="text")
# iface.launch()