File size: 4,523 Bytes
3dab771
7be4744
 
3dab771
7be4744
 
 
3dab771
 
 
7be4744
 
 
3dab771
 
7be4744
3dab771
 
 
7be4744
3dab771
 
 
7be4744
 
 
 
3dab771
 
 
 
 
 
7be4744
 
3dab771
7be4744
 
 
 
 
 
 
3dab771
 
7be4744
 
 
 
 
 
 
3dab771
 
 
 
7be4744
3dab771
 
7be4744
3dab771
 
 
 
7be4744
 
 
 
3dab771
 
 
 
 
 
 
 
 
7be4744
 
 
 
 
 
 
 
 
3dab771
 
 
 
 
 
 
7be4744
 
 
 
 
 
 
3dab771
 
 
7be4744
 
 
 
3dab771
 
7be4744
3dab771
 
 
 
7be4744
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import os

# Runtime dependency bootstrap. Each entry is handed to the shell, in order,
# before the third-party imports further down are executed.
_SETUP_COMMANDS = (
    "pip install pyyaml==5.1",
    # workaround: pin an older PyTorch since detectron2 hasn't released
    # packages for PyTorch 1.9
    # (issue: https://github.com/facebookresearch/detectron2/issues/3158)
    "pip install torch==1.8.0+cu101 torchvision==0.9.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html",
    # detectron2 build that matches PyTorch 1.8
    # See https://detectron2.readthedocs.io/tutorials/install.html for instructions
    "pip install -q detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu101/torch1.8/index.html",
    # PyTesseract
    "pip install -q pytesseract",
)

for _command in _SETUP_COMMANDS:
    # The exit status is intentionally not inspected.
    os.system(_command)

import gradio as gr
import numpy as np
from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification
from datasets import load_dataset
from PIL import Image, ImageDraw, ImageFont

# Pre-processing comes from the base checkpoint; the token-classification
# weights come from the checkpoint fine-tuned on FUNSD.
processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base")
model = LayoutLMv3ForTokenClassification.from_pretrained(
    "nielsr/layoutlmv3-finetuned-funsd"
)

# The FUNSD test split is needed for its label names (see `labels` below).
dataset = load_dataset("nielsr/funsd", split="test")

# Example image: the local invoice is re-saved as "document.png", the file
# name used for the demo's example input. (A FUNSD sample used to be decoded
# here first, but it was overwritten by this assignment before any use.)
image = Image.open("./invoice.png")
image.save("document.png")

labels = dataset.features["ner_tags"].feature.names
# Class index -> tag name, in the order the dataset declares its tags.
id2label = dict(enumerate(labels))
# Drawing colour for each label once the two-character tag prefix is stripped
# and the result lower-cased.
label2color = {
    "question": "blue",
    "answer": "green",
    "header": "orange",
    "other": "violet",
}


def unnormalize_box(bbox, width, height):
    """Scale a box from the 0-1000 normalised grid to pixel coordinates.

    Args:
        bbox: Indexable holding at least four numbers; positions 0 and 2 are
            scaled by ``width``, positions 1 and 3 by ``height``.
        width: Image width to scale the horizontal coordinates to.
        height: Image height to scale the vertical coordinates to.

    Returns:
        A new four-element list with the scaled coordinates.
    """
    # Extent that applies to each of the four coordinates, by position.
    extents = (width, height, width, height)
    return [extent * (bbox[pos] / 1000) for pos, extent in enumerate(extents)]


def iob_to_label(label):
    """Strip the two-character tag prefix from ``label``.

    Returns whatever follows the first two characters, or ``"other"`` when
    nothing is left (for example for the single-character tag ``"O"``).
    """
    return label[2:] or "other"


def process_image(image):
    """Draw the model's per-token label predictions onto a document image.

    The image is encoded with the module-level ``processor`` and classified
    with the module-level ``model``. Every kept token's box is then outlined
    and captioned with its label, in the colour given by ``label2color``.

    Args:
        image: PIL image of the document. It is drawn on in place.

    Returns:
        The same ``image`` object, now carrying the annotations.

    Raises:
        KeyError: If a predicted label has no entry in ``label2color``.
    """
    # Imported here because the module does not import torch at top level.
    import torch

    width, height = image.size

    # encode
    encoding = processor(
        image, truncation=True, return_offsets_mapping=True, return_tensors="pt"
    )
    # The offsets are only used for the filtering below, so they are removed
    # before the encoding is unpacked into the model call.
    offset_mapping = encoding.pop("offset_mapping")

    # forward pass: inference only, so autograd bookkeeping is switched off
    with torch.no_grad():
        outputs = model(**encoding)

    # get predictions (squeeze() presumably drops the batch dimension of the
    # single encoded image)
    predictions = outputs.logits.argmax(-1).squeeze().tolist()
    token_boxes = encoding.bbox.squeeze().tolist()
    offsets = offset_mapping.squeeze().tolist()

    # only keep non-subword predictions: a token whose offset does not start
    # at 0 is treated as the continuation of a word and skipped
    kept = [
        (id2label[pred], unnormalize_box(box, width, height))
        for pred, box, span in zip(predictions, token_boxes, offsets)
        if span[0] == 0
    ]

    # draw predictions over the image
    draw = ImageDraw.Draw(image)
    font = ImageFont.load_default()
    for prediction, box in kept:
        predicted_label = iob_to_label(prediction).lower()
        color = label2color[predicted_label]
        draw.rectangle(box, outline=color)
        # Caption sits slightly right of and above the box's top-left corner.
        draw.text(
            (box[0] + 10, box[1] - 10),
            text=predicted_label,
            fill=color,
            font=font,
        )

    return image


# Text shown around the Gradio interface.
title = "Interactive demo: LayoutLMv3"
description = "Demo for Microsoft's LayoutLMv3, a Transformer for state-of-the-art document image understanding tasks. This particular model is fine-tuned on FUNSD, a dataset of manually annotated forms. It annotates the words appearing in the image as QUESTION/ANSWER/HEADER/OTHER. To use it, simply upload an image or use the example image below and click 'Submit'. Results will show up in a few seconds. If you want to make the output bigger, right-click on it and select 'Open image in new tab'."
article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2204.08387' target='_blank'>LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking</a> | <a href='https://github.com/microsoft/unilm' target='_blank'>Github Repo</a></p>"
# Example input offered in the UI.
examples = [["document.png"]]

# Custom CSS handed to the interface. This is the only rule that was actually
# in effect: an earlier assignment to `css` was overwritten before use.
css = ".image-preview {height: auto !important;}"

iface = gr.Interface(
    fn=process_image,
    inputs=gr.inputs.Image(type="pil"),
    outputs=gr.outputs.Image(type="pil", label="annotated image"),
    title=title,
    description=description,
    article=article,
    examples=examples,
    css=css,
    enable_queue=True,
)
iface.launch(debug=True)