Spaces:

Raj-Master
/

PDF_DATA_EXTRACTOR

Runtime error

App Files Files Community

Raj-Master committed on Jan 23, 2024

Commit

5a3dc06

verified ·

1 Parent(s): e7bba61

Upload 2 files

Browse files

Files changed (2) hide show

app.py +73 -0
requirements.txt +0 -0

app.py ADDED Viewed

	@@ -0,0 +1,73 @@

+import gradio as gr
+from pdf2image import convert_from_path
+from paddleocr import PaddleOCR
+from numpy import asarray
+import gradio as gr
+from gradio_pdf import PDF
+from pdf2image import convert_from_path
+from transformers import pipeline
+from pathlib import Path
+ocr = PaddleOCR(use_angle_cls=True, lang="ch", ocr_version="PP-OCRv3")  # shared OCR engine: built once at import time and reused by p() for every page
+def p(image,question):
+    result = ocr.ocr(asarray(image), cls=True)
+    ocr_text = " ".join([line[1][0] for line in result[0]])
+    return ocr_text
+# up_file="/home/raj/Downloads/ICBC Aviation Leasing Company Limited_ND2A_220808.pdf"
+# images = convert_from_path(up_file, fmt="jpeg")
+# output = ""
+# for idx, image in enumerate(images, start=1):
+#     # result = reader.readtext(image, detail = 0)
+#     # ocr_text = " ".join(result)
+#     result = ocr.ocr(asarray(image), cls=True)
+#     ocr_text = " ".join([line[1][0] for line in result[0]])
+#     new_prompt = f"""
+#     {ocr_text}
+#     Above is OCR'ed text from a form PDF file.
+#     List out all the form key value which have data. Don't include fields that are empty.
+#     """
+#     # llm_output = llm(prompt=new_prompt)
+#     # output += f"Page {idx}\n"
+#     # output += llm_output
+#     output += "\n\n                                                                "
+#     output=new_prompt
+# print(output)
+dir_ = Path(__file__).parent  # directory containing this script; NOTE(review): nothing in the visible code reads dir_ - confirm it is still needed
+# p = pipeline(
+#     "document-question-answering",
+#     model="impira/layoutlm-document-qa",
+# )
+def qa(question: str, doc: str) -> str:
+    output=""
+    img = convert_from_path(doc)
+    for i in range(len(img)):
+        result=p(img[i], question)
+        output+="\n"+result
+    return output
+demo = gr.Interface(
+    qa,
+    [gr.Textbox(label="Question"), PDF(label="Document")],
+    gr.Textbox(),
+)
+demo.launch()
+# def greet(name):
+#     return "Hello " + name + "!!"
+# iface = gr.Interface(fn=greet, inputs="text", outputs="text")
+# iface.launch()

requirements.txt ADDED Viewed

File without changes