Spaces:
Runtime error
Runtime error
File size: 1,913 Bytes
5a3dc06 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 |
import gradio as gr
from pdf2image import convert_from_path
from paddleocr import PaddleOCR
from numpy import asarray
import gradio as gr
from gradio_pdf import PDF
from pdf2image import convert_from_path
from transformers import pipeline
from pathlib import Path
# Shared OCR engine: constructed once when the module is imported and used by
# `p` for every page image.
ocr = PaddleOCR(
    use_angle_cls=True,
    lang="ch",
    ocr_version="PP-OCRv3",
)
def p(image, question):
    """Run OCR on a single page image and return its text as one string.

    Args:
        image: Page image; it is converted with ``numpy.asarray`` before
            being handed to the module-level ``ocr`` engine.
        question: Accepted so callers can pass it through, but not used by
            the OCR step.

    Returns:
        The recognized text fragments joined by single spaces, or ``""``
        when the engine reports no text for the image.
    """
    result = ocr.ocr(asarray(image), cls=True)
    # PaddleOCR returns one entry per input image, and that entry is None
    # when nothing was detected; iterating it would raise a TypeError.
    if not result or not result[0]:
        return ""
    # Per PaddleOCR's result format each detection is [box, (text, score)];
    # only the text part is kept.
    return " ".join(line[1][0] for line in result[0])
# up_file="/home/raj/Downloads/ICBC Aviation Leasing Company Limited_ND2A_220808.pdf"
# images = convert_from_path(up_file, fmt="jpeg")
# output = ""
# for idx, image in enumerate(images, start=1):
# # result = reader.readtext(image, detail = 0)
# # ocr_text = " ".join(result)
# result = ocr.ocr(asarray(image), cls=True)
# ocr_text = " ".join([line[1][0] for line in result[0]])
# new_prompt = f"""
# {ocr_text}
# Above is OCR'ed text from a form PDF file.
# List out all the form key value which have data. Don't include fields that are empty.
# """
# # llm_output = llm(prompt=new_prompt)
# # output += f"Page {idx}\n"
# # output += llm_output
# output += "\n\n "
# output=new_prompt
# print(output)
# Directory that contains this script.
# NOTE(review): not referenced by any code visible in this file — confirm
# whether it is still needed.
dir_ = Path(__file__).parent
# p = pipeline(
# "document-question-answering",
# model="impira/layoutlm-document-qa",
# )
def qa(question: str, doc: str) -> str:
    """OCR every page of a PDF and return the concatenated page text.

    Args:
        question: Forwarded unchanged to ``p`` for each page.
        doc: Filesystem path of the PDF to read. A falsy value (``None`` or
            ``""``, presumably what the UI sends when nothing was uploaded)
            yields an empty result instead of an error from the converter.

    Returns:
        The text of each page in page order, each preceded by a newline;
        ``""`` when no document was supplied or the PDF has no pages.
    """
    # Without a path there is nothing to convert; bail out before calling
    # the PDF converter with an invalid argument.
    if not doc:
        return ""
    pages = convert_from_path(doc)
    # Keep the leading "\n" before every page so the output format matches
    # what callers already receive.
    return "".join("\n" + p(page, question) for page in pages)
# Web UI: a question box and a PDF upload are passed to `qa`, and the string
# it returns is displayed in a plain textbox.
demo = gr.Interface(
    fn=qa,
    inputs=[gr.Textbox(label="Question"), PDF(label="Document")],
    outputs=gr.Textbox(),
)

# Start the Gradio server as soon as the module is executed.
demo.launch()
# def greet(name):
# return "Hello " + name + "!!"
# iface = gr.Interface(fn=greet, inputs="text", outputs="text")
# iface.launch()
|