uartimcs commited on
Commit
7cd9ba4
·
verified ·
1 Parent(s): a4ccafe

Upload 22 files

Browse files
.gitignore ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ core.*
2
+ *.bin
3
+ .nfs*
4
+ .vscode/*
5
+ dataset/*
6
+ result/*
7
+ misc/*
8
+ !misc/*.png
9
+ !dataset/.gitkeep
10
+ !result/.gitkeep
11
+ # Byte-compiled / optimized / DLL files
12
+ __pycache__/
13
+ *.py[cod]
14
+ *$py.class
15
+
16
+ # C extensions
17
+ *.so
18
+
19
+ # Distribution / packaging
20
+ .Python
21
+ build/
22
+ develop-eggs/
23
+ dist/
24
+ downloads/
25
+ eggs/
26
+ .eggs/
27
+ lib/
28
+ lib64/
29
+ parts/
30
+ sdist/
31
+ var/
32
+ wheels/
33
+ pip-wheel-metadata/
34
+ share/python-wheels/
35
+ *.egg-info/
36
+ .installed.cfg
37
+ *.egg
38
+ MANIFEST
39
+
40
+ # PyInstaller
41
+ # Usually these files are written by a python script from a template
42
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
43
+ *.manifest
44
+ *.spec
45
+
46
+ # Installer logs
47
+ pip-log.txt
48
+ pip-delete-this-directory.txt
49
+
50
+ # Unit test / coverage reports
51
+ htmlcov/
52
+ .tox/
53
+ .nox/
54
+ .coverage
55
+ .coverage.*
56
+ .cache
57
+ nosetests.xml
58
+ coverage.xml
59
+ *.cover
60
+ *.py,cover
61
+ .hypothesis/
62
+ .pytest_cache/
63
+
64
+ # Translations
65
+ *.mo
66
+ *.pot
67
+
68
+ # Django stuff:
69
+ *.log
70
+ local_settings.py
71
+ db.sqlite3
72
+ db.sqlite3-journal
73
+
74
+ # Flask stuff:
75
+ instance/
76
+ .webassets-cache
77
+
78
+ # Scrapy stuff:
79
+ .scrapy
80
+
81
+ # Sphinx documentation
82
+ docs/_build/
83
+
84
+ # PyBuilder
85
+ target/
86
+
87
+ # Jupyter Notebook
88
+ .ipynb_checkpoints
89
+
90
+ # IPython
91
+ profile_default/
92
+ ipython_config.py
93
+
94
+ # pyenv
95
+ .python-version
96
+
97
+ # pipenv
98
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
99
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
100
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
101
+ # install all needed dependencies.
102
+ #Pipfile.lock
103
+
104
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
105
+ __pypackages__/
106
+
107
+ # Celery stuff
108
+ celerybeat-schedule
109
+ celerybeat.pid
110
+
111
+ # SageMath parsed files
112
+ *.sage.py
113
+
114
+ # Environments
115
+ .env
116
+ .venv
117
+ env/
118
+ venv/
119
+ ENV/
120
+ env.bak/
121
+ venv.bak/
122
+
123
+ # Spyder project settings
124
+ .spyderproject
125
+ .spyproject
126
+
127
+ # Rope project settings
128
+ .ropeproject
129
+
130
+ # mkdocs documentation
131
+ /site
132
+
133
+ # mypy
134
+ .mypy_cache/
135
+ .dmypy.json
136
+ dmypy.json
137
+
138
+ # Pyre type checker
139
+ .pyre/
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT license
2
+
3
+ Copyright (c) 2022-present NAVER Corp.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
NOTICE ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Donut
2
+ Copyright (c) 2022-present NAVER Corp.
3
+
4
+ Permission is hereby granted, free of charge, to any person obtaining a copy
5
+ of this software and associated documentation files (the "Software"), to deal
6
+ in the Software without restriction, including without limitation the rights
7
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8
+ copies of the Software, and to permit persons to whom the Software is
9
+ furnished to do so, subject to the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be included in
12
+ all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20
+ THE SOFTWARE.
21
+
22
+ --------------------------------------------------------------------------------------
23
+
24
+ This project contains subcomponents with separate copyright notices and license terms.
25
+ Your use of the source code for these subcomponents is subject to the terms and conditions of the following licenses.
26
+
27
+ =====
28
+
29
+ googlefonts/noto-fonts
30
+ https://fonts.google.com/specimen/Noto+Sans
31
+
32
+
33
+ Copyright 2018 The Noto Project Authors (github.com/googlei18n/noto-fonts)
34
+
35
+ This Font Software is licensed under the SIL Open Font License,
36
+ Version 1.1.
37
+
38
+ This license is copied below, and is also available with a FAQ at:
39
+ http://scripts.sil.org/OFL
40
+
41
+ -----------------------------------------------------------
42
+ SIL OPEN FONT LICENSE Version 1.1 - 26 February 2007
43
+ -----------------------------------------------------------
44
+
45
+ PREAMBLE
46
+ The goals of the Open Font License (OFL) are to stimulate worldwide
47
+ development of collaborative font projects, to support the font
48
+ creation efforts of academic and linguistic communities, and to
49
+ provide a free and open framework in which fonts may be shared and
50
+ improved in partnership with others.
51
+
52
+ The OFL allows the licensed fonts to be used, studied, modified and
53
+ redistributed freely as long as they are not sold by themselves. The
54
+ fonts, including any derivative works, can be bundled, embedded,
55
+ redistributed and/or sold with any software provided that any reserved
56
+ names are not used by derivative works. The fonts and derivatives,
57
+ however, cannot be released under any other type of license. The
58
+ requirement for fonts to remain under this license does not apply to
59
+ any document created using the fonts or their derivatives.
60
+
61
+ DEFINITIONS
62
+ "Font Software" refers to the set of files released by the Copyright
63
+ Holder(s) under this license and clearly marked as such. This may
64
+ include source files, build scripts and documentation.
65
+
66
+ "Reserved Font Name" refers to any names specified as such after the
67
+ copyright statement(s).
68
+
69
+ "Original Version" refers to the collection of Font Software
70
+ components as distributed by the Copyright Holder(s).
71
+
72
+ "Modified Version" refers to any derivative made by adding to,
73
+ deleting, or substituting -- in part or in whole -- any of the
74
+ components of the Original Version, by changing formats or by porting
75
+ the Font Software to a new environment.
76
+
77
+ "Author" refers to any designer, engineer, programmer, technical
78
+ writer or other person who contributed to the Font Software.
79
+
80
+ PERMISSION & CONDITIONS
81
+ Permission is hereby granted, free of charge, to any person obtaining
82
+ a copy of the Font Software, to use, study, copy, merge, embed,
83
+ modify, redistribute, and sell modified and unmodified copies of the
84
+ Font Software, subject to the following conditions:
85
+
86
+ 1) Neither the Font Software nor any of its individual components, in
87
+ Original or Modified Versions, may be sold by itself.
88
+
89
+ 2) Original or Modified Versions of the Font Software may be bundled,
90
+ redistributed and/or sold with any software, provided that each copy
91
+ contains the above copyright notice and this license. These can be
92
+ included either as stand-alone text files, human-readable headers or
93
+ in the appropriate machine-readable metadata fields within text or
94
+ binary files as long as those fields can be easily viewed by the user.
95
+
96
+ 3) No Modified Version of the Font Software may use the Reserved Font
97
+ Name(s) unless explicit written permission is granted by the
98
+ corresponding Copyright Holder. This restriction only applies to the
99
+ primary font name as presented to the users.
100
+
101
+ 4) The name(s) of the Copyright Holder(s) or the Author(s) of the Font
102
+ Software shall not be used to promote, endorse or advertise any
103
+ Modified Version, except to acknowledge the contribution(s) of the
104
+ Copyright Holder(s) and the Author(s) or with their explicit written
105
+ permission.
106
+
107
+ 5) The Font Software, modified or unmodified, in part or in whole,
108
+ must be distributed entirely under this license, and must not be
109
+ distributed under any other license. The requirement for fonts to
110
+ remain under this license does not apply to any document created using
111
+ the Font Software.
112
+
113
+ TERMINATION
114
+ This license becomes null and void if any of the above conditions are
115
+ not met.
116
+
117
+ DISCLAIMER
118
+ THE FONT SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
119
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OF
120
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
121
+ OF COPYRIGHT, PATENT, TRADEMARK, OR OTHER RIGHT. IN NO EVENT SHALL THE
122
+ COPYRIGHT HOLDER BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
123
+ INCLUDING ANY GENERAL, SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL
124
+ DAMAGES, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
125
+ FROM, OUT OF THE USE OR INABILITY TO USE THE FONT SOFTWARE OR FROM
126
+ OTHER DEALINGS IN THE FONT SOFTWARE.
127
+
128
+ =====
129
+
130
+ huggingface/transformers
131
+ https://github.com/huggingface/transformers
132
+
133
+
134
+ Copyright [yyyy] [name of copyright owner]
135
+
136
+ Licensed under the Apache License, Version 2.0 (the "License");
137
+ you may not use this file except in compliance with the License.
138
+ You may obtain a copy of the License at
139
+
140
+ http://www.apache.org/licenses/LICENSE-2.0
141
+
142
+ Unless required by applicable law or agreed to in writing, software
143
+ distributed under the License is distributed on an "AS IS" BASIS,
144
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
145
+ See the License for the specific language governing permissions and limitations under the License.
146
+
147
+ =====
148
+
149
+ clovaai/synthtiger
150
+ https://github.com/clovaai/synthtiger
151
+
152
+
153
+ Copyright (c) 2021-present NAVER Corp.
154
+
155
+ Permission is hereby granted, free of charge, to any person obtaining a copy
156
+ of this software and associated documentation files (the "Software"), to deal
157
+ in the Software without restriction, including without limitation the rights
158
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
159
+ copies of the Software, and to permit persons to whom the Software is
160
+ furnished to do so, subject to the following conditions:
161
+
162
+ The above copyright notice and this permission notice shall be included in
163
+ all copies or substantial portions of the Software.
164
+
165
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
166
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
167
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
168
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
169
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
170
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
171
+ THE SOFTWARE.
172
+
173
+ =====
174
+
175
+ rwightman/pytorch-image-models
176
+ https://github.com/rwightman/pytorch-image-models
177
+
178
+
179
+ Copyright 2019 Ross Wightman
180
+
181
+ Licensed under the Apache License, Version 2.0 (the "License");
182
+ you may not use this file except in compliance with the License.
183
+ You may obtain a copy of the License at
184
+
185
+ http://www.apache.org/licenses/LICENSE-2.0
186
+
187
+ Unless required by applicable law or agreed to in writing, software
188
+ distributed under the License is distributed on an "AS IS" BASIS,
189
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
190
+ See the License for the specific language governing permissions and
191
+ limitations under the License.
192
+
193
+ =====
194
+
195
+ ankush-me/SynthText
196
+ https://github.com/ankush-me/SynthText
197
+
198
+
199
+ Copyright 2017, Ankush Gupta.
200
+
201
+ Licensed under the Apache License, Version 2.0 (the "License");
202
+ you may not use this file except in compliance with the License.
203
+ You may obtain a copy of the License at
204
+
205
+ http://www.apache.org/licenses/LICENSE-2.0
206
+
207
+ Unless required by applicable law or agreed to in writing, software
208
+ distributed under the License is distributed on an "AS IS" BASIS,
209
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
210
+ See the License for the specific language governing permissions and
211
+ limitations under the License.
212
+
213
+ =====
README.md CHANGED
@@ -1,13 +1,248 @@
1
  ---
2
- title: Donut Company Invoice
3
- emoji: 🏆
4
- colorFrom: pink
5
- colorTo: yellow
6
  sdk: gradio
7
  sdk_version: 5.5.0
8
- app_file: app.py
9
- pinned: false
10
- short_description: Parser of company invoice details
11
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
1
  ---
2
+ title: donut-booking-gradio
3
+ app_file: app.py
 
 
4
  sdk: gradio
5
  sdk_version: 5.5.0
 
 
 
6
  ---
7
+ <div align="center">
8
+
9
+ # Donut 🍩 : Document Understanding Transformer
10
+
11
+ [![Paper](https://img.shields.io/badge/Paper-arxiv.2111.15664-red)](https://arxiv.org/abs/2111.15664)
12
+ [![Conference](https://img.shields.io/badge/ECCV-2022-blue)](#how-to-cite)
13
+ [![Demo](https://img.shields.io/badge/Demo-Gradio-brightgreen)](#demo)
14
+ [![Demo](https://img.shields.io/badge/Demo-Colab-orange)](#demo)
15
+ [![PyPI](https://img.shields.io/pypi/v/donut-python?color=green&label=pip%20install%20donut-python)](https://pypi.org/project/donut-python)
16
+ [![Downloads](https://static.pepy.tech/personalized-badge/donut-python?period=total&units=international_system&left_color=grey&right_color=brightgreen&left_text=Downloads)](https://pepy.tech/project/donut-python)
17
+
18
+ Official Implementation of Donut and SynthDoG | [Paper](https://arxiv.org/abs/2111.15664) | [Slide](https://docs.google.com/presentation/d/1gv3A7t4xpwwNdpxV_yeHzEOMy-exJCAz6AlAI9O5fS8/edit?usp=sharing) | [Poster](https://docs.google.com/presentation/d/1m1f8BbAm5vxPcqynn_MbFfmQAlHQIR5G72-hQUFS2sk/edit?usp=sharing)
19
+
20
+ </div>
21
+
22
+ ## Introduction
23
+
24
+ **Donut** 🍩, **Do**cume**n**t **u**nderstanding **t**ransformer, is a new method of document understanding that utilizes an OCR-free end-to-end Transformer model. Donut does not require off-the-shelf OCR engines/APIs, yet it shows state-of-the-art performances on various visual document understanding tasks, such as visual document classification or information extraction (a.k.a. document parsing).
25
+ In addition, we present **SynthDoG** 🐶, **Synth**etic **Do**cument **G**enerator, that helps the model pre-training to be flexible on various languages and domains.
26
+
27
+ Our academic paper, which describes our method in detail and provides full experimental results and analyses, can be found here:<br>
28
+ > [**OCR-free Document Understanding Transformer**](https://arxiv.org/abs/2111.15664).<br>
29
+ > [Geewook Kim](https://geewook.kim), [Teakgyu Hong](https://dblp.org/pid/183/0952.html), [Moonbin Yim](https://github.com/moonbings), [JeongYeon Nam](https://github.com/long8v), [Jinyoung Park](https://github.com/jyp1111), [Jinyeong Yim](https://jinyeong.github.io), [Wonseok Hwang](https://scholar.google.com/citations?user=M13_WdcAAAAJ), [Sangdoo Yun](https://sangdooyun.github.io), [Dongyoon Han](https://dongyoonhan.github.io), [Seunghyun Park](https://scholar.google.com/citations?user=iowjmTwAAAAJ). In ECCV 2022.
30
+
31
+ <img width="946" alt="image" src="misc/overview.png">
32
+
33
+ ## Pre-trained Models and Web Demos
34
+
35
+ Gradio web demos are available! [![Demo](https://img.shields.io/badge/Demo-Gradio-brightgreen)](#demo) [![Demo](https://img.shields.io/badge/Demo-Colab-orange)](#demo)
36
+ |:--:|
37
+ |![image](misc/screenshot_gradio_demos.png)|
38
+ - You can run the demo with `./app.py` file.
39
+ - Sample images are available at `./misc` and more receipt images are available at [CORD dataset link](https://huggingface.co/datasets/naver-clova-ix/cord-v2).
40
+ - Web demos are available from the links in the following table.
41
+ - Note: We have updated the Google Colab demo (as of June 15, 2023) to ensure its proper working.
42
+
43
+ |Task|Sec/Img|Score|Trained Model|<div id="demo">Demo</div>|
44
+ |---|---|---|---|---|
45
+ | [CORD](https://github.com/clovaai/cord) (Document Parsing) | 0.7 /<br> 0.7 /<br> 1.2 | 91.3 /<br> 91.1 /<br> 90.9 | [donut-base-finetuned-cord-v2](https://huggingface.co/naver-clova-ix/donut-base-finetuned-cord-v2/tree/official) (1280) /<br> [donut-base-finetuned-cord-v1](https://huggingface.co/naver-clova-ix/donut-base-finetuned-cord-v1/tree/official) (1280) /<br> [donut-base-finetuned-cord-v1-2560](https://huggingface.co/naver-clova-ix/donut-base-finetuned-cord-v1-2560/tree/official) | [gradio space web demo](https://huggingface.co/spaces/naver-clova-ix/donut-base-finetuned-cord-v2),<br>[google colab demo (updated at 23.06.15)](https://colab.research.google.com/drive/1NMSqoIZ_l39wyRD7yVjw2FIuU2aglzJi?usp=sharing) |
46
+ | [Train Ticket](https://github.com/beacandler/EATEN) (Document Parsing) | 0.6 | 98.7 | [donut-base-finetuned-zhtrainticket](https://huggingface.co/naver-clova-ix/donut-base-finetuned-zhtrainticket/tree/official) | [google colab demo (updated at 23.06.15)](https://colab.research.google.com/drive/1YJBjllahdqNktXaBlq5ugPh1BCm8OsxI?usp=sharing) |
47
+ | [RVL-CDIP](https://www.cs.cmu.edu/~aharley/rvl-cdip) (Document Classification) | 0.75 | 95.3 | [donut-base-finetuned-rvlcdip](https://huggingface.co/naver-clova-ix/donut-base-finetuned-rvlcdip/tree/official) | [gradio space web demo](https://huggingface.co/spaces/nielsr/donut-rvlcdip),<br>[google colab demo (updated at 23.06.15)](https://colab.research.google.com/drive/1iWOZHvao1W5xva53upcri5V6oaWT-P0O?usp=sharing) |
48
+ | [DocVQA Task1](https://rrc.cvc.uab.es/?ch=17) (Document VQA) | 0.78 | 67.5 | [donut-base-finetuned-docvqa](https://huggingface.co/naver-clova-ix/donut-base-finetuned-docvqa/tree/official) | [gradio space web demo](https://huggingface.co/spaces/nielsr/donut-docvqa),<br>[google colab demo (updated at 23.06.15)](https://colab.research.google.com/drive/1oKieslZCulFiquequ62eMGc-ZWgay4X3?usp=sharing) |
49
+
50
+ The links to the pre-trained backbones are here:
51
+ - [`donut-base`](https://huggingface.co/naver-clova-ix/donut-base/tree/official): trained with 64 A100 GPUs (~2.5 days), number of layers (encoder: {2,2,14,2}, decoder: 4), input size 2560x1920, swin window size 10, IIT-CDIP (11M) and SynthDoG (English, Chinese, Japanese, Korean, 0.5M x 4).
52
+ - [`donut-proto`](https://huggingface.co/naver-clova-ix/donut-proto/tree/official): (preliminary model) trained with 8 V100 GPUs (~5 days), number of layers (encoder: {2,2,18,2}, decoder: 4), input size 2048x1536, swin window size 8, and SynthDoG (English, Japanese, Korean, 0.4M x 3).
53
+
54
+ Please see [our paper](#how-to-cite) for more details.
55
+
56
+ ## SynthDoG datasets
57
+
58
+ ![image](misc/sample_synthdog.png)
59
+
60
+ The links to the SynthDoG-generated datasets are here:
61
+
62
+ - [`synthdog-en`](https://huggingface.co/datasets/naver-clova-ix/synthdog-en): English, 0.5M.
63
+ - [`synthdog-zh`](https://huggingface.co/datasets/naver-clova-ix/synthdog-zh): Chinese, 0.5M.
64
+ - [`synthdog-ja`](https://huggingface.co/datasets/naver-clova-ix/synthdog-ja): Japanese, 0.5M.
65
+ - [`synthdog-ko`](https://huggingface.co/datasets/naver-clova-ix/synthdog-ko): Korean, 0.5M.
66
+
67
+ To generate synthetic datasets with our SynthDoG, please see `./synthdog/README.md` and [our paper](#how-to-cite) for details.
68
+
69
+ ## Updates
70
+
71
+ **_2023-06-15_** We have updated all Google Colab demos to ensure its proper working.<br>
72
+ **_2022-11-14_** New version 1.0.9 is released (`pip install donut-python --upgrade`). See [1.0.9 Release Notes](https://github.com/clovaai/donut/releases/tag/1.0.9).<br>
73
+ **_2022-08-12_** Donut 🍩 is also available at [huggingface/transformers 🤗](https://huggingface.co/docs/transformers/main/en/model_doc/donut) (contributed by [@NielsRogge](https://github.com/NielsRogge)). `donut-python` loads the pre-trained weights from the `official` branch of the model repositories. See [1.0.5 Release Notes](https://github.com/clovaai/donut/releases/tag/1.0.5).<br>
74
+ **_2022-08-05_** A well-executed hands-on tutorial on donut 🍩 is published at [Towards Data Science](https://towardsdatascience.com/ocr-free-document-understanding-with-donut-1acfbdf099be) (written by [@estaudere](https://github.com/estaudere)).<br>
75
+ **_2022-07-20_** First Commit, We release our code, model weights, synthetic data and generator.
76
+
77
+ ## Software installation
78
+
79
+ [![PyPI](https://img.shields.io/pypi/v/donut-python?color=green&label=pip%20install%20donut-python)](https://pypi.org/project/donut-python)
80
+ [![Downloads](https://static.pepy.tech/personalized-badge/donut-python?period=total&units=international_system&left_color=grey&right_color=brightgreen&left_text=Downloads)](https://pepy.tech/project/donut-python)
81
+
82
+ ```bash
83
+ pip install donut-python
84
+ ```
85
+
86
+ or clone this repository and install the dependencies:
87
+ ```bash
88
+ git clone https://github.com/clovaai/donut.git
89
+ cd donut/
90
+ conda create -n donut_official python=3.7
91
+ conda activate donut_official
92
+ pip install .
93
+ ```
94
+
95
+ We tested [donut-python](https://pypi.org/project/donut-python/1.0.1) == 1.0.1 with:
96
+ - [torch](https://github.com/pytorch/pytorch) == 1.11.0+cu113
97
+ - [torchvision](https://github.com/pytorch/vision) == 0.12.0+cu113
98
+ - [pytorch-lightning](https://github.com/Lightning-AI/lightning) == 1.6.4
99
+ - [transformers](https://github.com/huggingface/transformers) == 4.11.3
100
+ - [timm](https://github.com/rwightman/pytorch-image-models) == 0.5.4
101
+
102
+ **Note**: From several reported issues, we have noticed increased challenges in configuring the testing environment for `donut-python` due to recent updates in key dependency libraries. While we are actively working on a solution, we have updated the Google Colab demo (as of June 15, 2023) to ensure its proper working. For assistance, we encourage you to refer to the following demo links: [CORD Colab Demo](https://colab.research.google.com/drive/1NMSqoIZ_l39wyRD7yVjw2FIuU2aglzJi?usp=sharing), [Train Ticket Colab Demo](https://colab.research.google.com/drive/1YJBjllahdqNktXaBlq5ugPh1BCm8OsxI?usp=sharing), [RVL-CDIP Colab Demo](https://colab.research.google.com/drive/1iWOZHvao1W5xva53upcri5V6oaWT-P0O?usp=sharing), [DocVQA Colab Demo](https://colab.research.google.com/drive/1oKieslZCulFiquequ62eMGc-ZWgay4X3?usp=sharing).
103
+
104
+ ## Getting Started
105
+
106
+ ### Data
107
+
108
+ This repository assumes the following structure of dataset:
109
+ ```bash
110
+ > tree dataset_name
111
+ dataset_name
112
+ ├── test
113
+ │ ├── metadata.jsonl
114
+ │ ├── {image_path0}
115
+ │ ├── {image_path1}
116
+ │ .
117
+ │ .
118
+ ├── train
119
+ │ ├── metadata.jsonl
120
+ │ ├── {image_path0}
121
+ │ ├── {image_path1}
122
+ │ .
123
+ │ .
124
+ └── validation
125
+ ├── metadata.jsonl
126
+ ├── {image_path0}
127
+ ├── {image_path1}
128
+ .
129
+ .
130
+
131
+ > cat dataset_name/test/metadata.jsonl
132
+ {"file_name": {image_path0}, "ground_truth": "{\"gt_parse\": {ground_truth_parse}, ... {other_metadata_not_used} ... }"}
133
+ {"file_name": {image_path1}, "ground_truth": "{\"gt_parse\": {ground_truth_parse}, ... {other_metadata_not_used} ... }"}
134
+ .
135
+ .
136
+ ```
137
+
138
+ - The structure of `metadata.jsonl` file is in [JSON Lines text format](https://jsonlines.org), i.e., `.jsonl`. Each line consists of
139
+ - `file_name` : relative path to the image file.
140
+ - `ground_truth` : string format (json dumped), the dictionary contains either `gt_parse` or `gt_parses`. Other fields (metadata) can be added to the dictionary but will not be used.
141
+ - `donut` interprets all tasks as a JSON prediction problem. As a result, all `donut` model training share a same pipeline. For training and inference, the only thing to do is preparing `gt_parse` or `gt_parses` for the task in format described below.
142
+
143
+ #### For Document Classification
144
+ The `gt_parse` follows the format of `{"class" : {class_name}}`, for example, `{"class" : "scientific_report"}` or `{"class" : "presentation"}`.
145
+ - Google colab demo is available [here](https://colab.research.google.com/drive/1xUDmLqlthx8A8rWKLMSLThZ7oeRJkDuU?usp=sharing).
146
+ - Gradio web demo is available [here](https://huggingface.co/spaces/nielsr/donut-rvlcdip).
147
+
148
+ #### For Document Information Extraction
149
+ The `gt_parse` is a JSON object that contains full information of the document image, for example, the JSON object for a receipt may look like `{"menu" : [{"nm": "ICE BLACKCOFFEE", "cnt": "2", ...}, ...], ...}`.
150
+ - More examples are available at [CORD dataset](https://huggingface.co/datasets/naver-clova-ix/cord-v2).
151
+ - Google colab demo is available [here](https://colab.research.google.com/drive/1o07hty-3OQTvGnc_7lgQFLvvKQuLjqiw?usp=sharing).
152
+ - Gradio web demo is available [here](https://huggingface.co/spaces/naver-clova-ix/donut-base-finetuned-cord-v2).
153
+
154
+ #### For Document Visual Question Answering
155
+ The `gt_parses` follows the format of `[{"question" : {question_sentence}, "answer" : {answer_candidate_1}}, {"question" : {question_sentence}, "answer" : {answer_candidate_2}}, ...]`, for example, `[{"question" : "what is the model name?", "answer" : "donut"}, {"question" : "what is the model name?", "answer" : "document understanding transformer"}]`.
156
+ - DocVQA Task1 has multiple answers, hence `gt_parses` should be a list of dictionary that contains a pair of question and answer.
157
+ - Google colab demo is available [here](https://colab.research.google.com/drive/1Z4WG8Wunj3HE0CERjt608ALSgSzRC9ig?usp=sharing).
158
+ - Gradio web demo is available [here](https://huggingface.co/spaces/nielsr/donut-docvqa).
159
+
160
+ #### For (Pseudo) Text Reading Task
161
+ The `gt_parse` looks like `{"text_sequence" : "word1 word2 word3 ... "}`
162
+ - This task is also a pre-training task of Donut model.
163
+ - You can use our **SynthDoG** 🐶 to generate synthetic images for the text reading task with proper `gt_parse`. See `./synthdog/README.md` for details.
164
+
165
+ ### Training
166
+
167
+ This is the configuration of Donut model training on [CORD](https://github.com/clovaai/cord) dataset used in our experiment.
168
+ We ran this with a single NVIDIA A100 GPU.
169
+
170
+ ```bash
171
+ python train.py --config config/train_cord.yaml \
172
+ --pretrained_model_name_or_path "naver-clova-ix/donut-base" \
173
+ --dataset_name_or_paths '["naver-clova-ix/cord-v2"]' \
174
+ --exp_version "test_experiment"
175
+ .
176
+ .
177
+ Prediction: <s_menu><s_nm>Lemon Tea (L)</s_nm><s_cnt>1</s_cnt><s_price>25.000</s_price></s_menu><s_total><s_total_price>25.000</s_total_price><s_cashprice>30.000</s_cashprice><s_changeprice>5.000</s_changeprice></s_total>
178
+ Answer: <s_menu><s_nm>Lemon Tea (L)</s_nm><s_cnt>1</s_cnt><s_price>25.000</s_price></s_menu><s_total><s_total_price>25.000</s_total_price><s_cashprice>30.000</s_cashprice><s_changeprice>5.000</s_changeprice></s_total>
179
+ Normed ED: 0.0
180
+ Prediction: <s_menu><s_nm>Hulk Topper Package</s_nm><s_cnt>1</s_cnt><s_price>100.000</s_price></s_menu><s_total><s_total_price>100.000</s_total_price><s_cashprice>100.000</s_cashprice><s_changeprice>0</s_changeprice></s_total>
181
+ Answer: <s_menu><s_nm>Hulk Topper Package</s_nm><s_cnt>1</s_cnt><s_price>100.000</s_price></s_menu><s_total><s_total_price>100.000</s_total_price><s_cashprice>100.000</s_cashprice><s_changeprice>0</s_changeprice></s_total>
182
+ Normed ED: 0.0
183
+ Prediction: <s_menu><s_nm>Giant Squid</s_nm><s_cnt>x 1</s_cnt><s_price>Rp. 39.000</s_price><s_sub><s_nm>C.Finishing - Cut</s_nm><s_price>Rp. 0</s_price><sep/><s_nm>B.Spicy Level - Extreme Hot Rp. 0</s_price></s_sub><sep/><s_nm>A.Flavour - Salt & Pepper</s_nm><s_price>Rp. 0</s_price></s_sub></s_menu><s_sub_total><s_subtotal_price>Rp. 39.000</s_subtotal_price></s_sub_total><s_total><s_total_price>Rp. 39.000</s_total_price><s_cashprice>Rp. 50.000</s_cashprice><s_changeprice>Rp. 11.000</s_changeprice></s_total>
184
+ Answer: <s_menu><s_nm>Giant Squid</s_nm><s_cnt>x1</s_cnt><s_price>Rp. 39.000</s_price><s_sub><s_nm>C.Finishing - Cut</s_nm><s_price>Rp. 0</s_price><sep/><s_nm>B.Spicy Level - Extreme Hot</s_nm><s_price>Rp. 0</s_price><sep/><s_nm>A.Flavour- Salt & Pepper</s_nm><s_price>Rp. 0</s_price></s_sub></s_menu><s_sub_total><s_subtotal_price>Rp. 39.000</s_subtotal_price></s_sub_total><s_total><s_total_price>Rp. 39.000</s_total_price><s_cashprice>Rp. 50.000</s_cashprice><s_changeprice>Rp. 11.000</s_changeprice></s_total>
185
+ Normed ED: 0.039603960396039604
186
+ Epoch 29: 100%|█████████████| 200/200 [01:49<00:00, 1.82it/s, loss=0.00327, exp_name=train_cord, exp_version=test_experiment]
187
+ ```
188
+
189
+ Some important arguments:
190
+
191
+ - `--config` : config file path for model training.
192
+ - `--pretrained_model_name_or_path` : string format, model name in Hugging Face modelhub or local path.
193
+ - `--dataset_name_or_paths` : string format (json dumped), list of dataset names in Hugging Face datasets or local paths.
194
+ - `--result_path` : file path to save model outputs/artifacts.
195
+ - `--exp_version` : used for experiment versioning. The output files are saved at `{result_path}/{exp_version}/*`
196
+
197
+ ### Test
198
+
199
+ With the trained model, test images and ground truth parses, you can get inference results and accuracy scores.
200
+
201
+ ```bash
202
+ python test.py --dataset_name_or_path naver-clova-ix/cord-v2 --pretrained_model_name_or_path ./result/train_cord/test_experiment --save_path ./result/output.json
203
+ 100%|█████████████| 100/100 [00:35<00:00, 2.80it/s]
204
+ Total number of samples: 100, Tree Edit Distance (TED) based accuracy score: 0.9129639764131697, F1 accuracy score: 0.8406020841373987
205
+ ```
206
+
207
+ Some important arguments:
208
+
209
+ - `--dataset_name_or_path` : string format, the target dataset name in Hugging Face datasets or local path.
210
+ - `--pretrained_model_name_or_path` : string format, the model name in Hugging Face modelhub or local path.
211
+ - `--save_path`: file path to save predictions and scores.
212
+
213
+ ## How to Cite
214
+ If you find this work useful to you, please cite:
215
+ ```bibtex
216
+ @inproceedings{kim2022donut,
217
+ title = {OCR-Free Document Understanding Transformer},
218
+ author = {Kim, Geewook and Hong, Teakgyu and Yim, Moonbin and Nam, JeongYeon and Park, Jinyoung and Yim, Jinyeong and Hwang, Wonseok and Yun, Sangdoo and Han, Dongyoon and Park, Seunghyun},
219
+ booktitle = {European Conference on Computer Vision (ECCV)},
220
+ year = {2022}
221
+ }
222
+ ```
223
+
224
+ ## License
225
+
226
+ ```
227
+ MIT license
228
+
229
+ Copyright (c) 2022-present NAVER Corp.
230
+
231
+ Permission is hereby granted, free of charge, to any person obtaining a copy
232
+ of this software and associated documentation files (the "Software"), to deal
233
+ in the Software without restriction, including without limitation the rights
234
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
235
+ copies of the Software, and to permit persons to whom the Software is
236
+ furnished to do so, subject to the following conditions:
237
+
238
+ The above copyright notice and this permission notice shall be included in
239
+ all copies or substantial portions of the Software.
240
 
241
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
242
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
243
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
244
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
245
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
246
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
247
+ THE SOFTWARE.
248
+ ```
app.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import argparse
3
+ import torch
4
+ from PIL import Image
5
+ from donut import DonutModel
6
+ def demo_process(input_img):
7
+ global model, task_prompt, task_name
8
+ input_img = Image.fromarray(input_img)
9
+ output = model.inference(image=input_img, prompt=task_prompt)["predictions"][0]
10
+ return output
11
+ parser = argparse.ArgumentParser()
12
+ parser.add_argument("--task", type=str, default="Booking")
13
+ parser.add_argument("--pretrained_path", type=str, default="result/train_booking/20241112_150925")
14
+ args, left_argv = parser.parse_known_args()
15
+ task_name = args.task
16
+ task_prompt = f"<s_{task_name}>"
17
+ model = DonutModel.from_pretrained("./result/train_booking/20241112_150925")
18
+ if torch.cuda.is_available():
19
+ model.half()
20
+ device = torch.device("cuda")
21
+ model.to(device)
22
+ else:
23
+ model.encoder.to(torch.bfloat16)
24
+ model.eval()
25
+ demo = gr.Interface(fn=demo_process,inputs="image",outputs="json", title=f"Donut 🍩 demonstration for `{task_name}` task",)
26
+ demo.launch(debug=True)
config/train_booking.yaml ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ resume_from_checkpoint_path: null # only used for resume_from_checkpoint option in PL
2
+ result_path: "./result"
3
+ pretrained_model_name_or_path: "naver-clova-ix/donut-base" # loading a pre-trained model (from moldehub or path)
4
+ dataset_name_or_paths: ["./dataset/Booking"] # loading datasets (from moldehub or path)
5
+ sort_json_key: False # cord dataset is preprocessed, and publicly available at https://huggingface.co/datasets/naver-clova-ix/cord-v2
6
+ train_batch_sizes: [2]
7
+ val_batch_sizes: [1]
8
+ input_size: [1280, 960] # when the input resolution differs from the pre-training setting, some weights will be newly initialized (but the model training would be okay)
9
+ max_length: 768
10
+ align_long_axis: False
11
+ num_nodes: 1
12
+ seed: 2022
13
+ lr: 3e-5
14
+ warmup_steps: 400 # 800/2*10/10, 10%
15
+ num_training_samples_per_epoch: 800
16
+ max_epochs: 10
17
+ max_steps: -1
18
+ num_workers: 8
19
+ val_check_interval: 1.0
20
+ check_val_every_n_epoch: 3
21
+ gradient_clip_val: 1.0
22
+ verbose: True
config/train_cord.yaml ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ resume_from_checkpoint_path: null # only used for resume_from_checkpoint option in PL
2
+ result_path: "./result"
3
+ pretrained_model_name_or_path: "naver-clova-ix/donut-base" # loading a pre-trained model (from moldehub or path)
4
+ dataset_name_or_paths: ["naver-clova-ix/cord-v2"] # loading datasets (from moldehub or path)
5
+ sort_json_key: False # cord dataset is preprocessed, and publicly available at https://huggingface.co/datasets/naver-clova-ix/cord-v2
6
+ train_batch_sizes: [8]
7
+ val_batch_sizes: [1]
8
+ input_size: [1280, 960] # when the input resolution differs from the pre-training setting, some weights will be newly initialized (but the model training would be okay)
9
+ max_length: 768
10
+ align_long_axis: False
11
+ num_nodes: 1
12
+ seed: 2022
13
+ lr: 3e-5
14
+ warmup_steps: 300 # 800/8*30/10, 10%
15
+ num_training_samples_per_epoch: 800
16
+ max_epochs: 30
17
+ max_steps: -1
18
+ num_workers: 8
19
+ val_check_interval: 1.0
20
+ check_val_every_n_epoch: 3
21
+ gradient_clip_val: 1.0
22
+ verbose: True
config/train_docvqa.yaml ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ resume_from_checkpoint_path: null
2
+ result_path: "./result"
3
+ pretrained_model_name_or_path: "naver-clova-ix/donut-base"
4
+ dataset_name_or_paths: ["./dataset/docvqa"] # should be prepared from https://rrc.cvc.uab.es/?ch=17
5
+ sort_json_key: True
6
+ train_batch_sizes: [2]
7
+ val_batch_sizes: [4]
8
+ input_size: [2560, 1920]
9
+ max_length: 128
10
+ align_long_axis: False
11
+ # num_nodes: 8 # memo: donut-base-finetuned-docvqa was trained with 8 nodes
12
+ num_nodes: 1
13
+ seed: 2022
14
+ lr: 3e-5
15
+ warmup_steps: 10000
16
+ num_training_samples_per_epoch: 39463
17
+ max_epochs: 300
18
+ max_steps: -1
19
+ num_workers: 8
20
+ val_check_interval: 1.0
21
+ check_val_every_n_epoch: 1
22
+ gradient_clip_val: 0.25
23
+ verbose: True
config/train_invoices.yaml ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ resume_from_checkpoint_path: null # only used for resume_from_checkpoint option in PL
2
+ result_path: "./result"
3
+ pretrained_model_name_or_path: "naver-clova-ix/donut-base" # loading a pre-trained model (from moldehub or path)
4
+ dataset_name_or_paths: ["./dataset/SGSInvoice"] # loading datasets (from moldehub or path)
5
+ sort_json_key: False # cord dataset is preprocessed, and publicly available at https://huggingface.co/datasets/naver-clova-ix/cord-v2
6
+ train_batch_sizes: [2]
7
+ val_batch_sizes: [1]
8
+ input_size: [1280, 960] # when the input resolution differs from the pre-training setting, some weights will be newly initialized (but the model training would be okay)
9
+ max_length: 768
10
+ align_long_axis: False
11
+ num_nodes: 1
12
+ seed: 2022
13
+ lr: 3e-5
14
+ warmup_steps: 60 # 800/8*30/10, 10%
15
+ num_training_samples_per_epoch: 800
16
+ max_epochs: 10
17
+ max_steps: -1
18
+ num_workers: 2
19
+ val_check_interval: 1.0
20
+ check_val_every_n_epoch: 3
21
+ gradient_clip_val: 1.0
22
+ verbose: True
config/train_rvlcdip.yaml ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ resume_from_checkpoint_path: null
2
+ result_path: "./result"
3
+ pretrained_model_name_or_path: "naver-clova-ix/donut-base"
4
+ dataset_name_or_paths: ["./dataset/rvlcdip"] # should be prepared from https://www.cs.cmu.edu/~aharley/rvl-cdip/
5
+ sort_json_key: True
6
+ train_batch_sizes: [2]
7
+ val_batch_sizes: [4]
8
+ input_size: [2560, 1920]
9
+ max_length: 8
10
+ align_long_axis: False
11
+ # num_nodes: 8 # memo: donut-base-finetuned-rvlcdip was trained with 8 nodes
12
+ num_nodes: 1
13
+ seed: 2022
14
+ lr: 2e-5
15
+ warmup_steps: 10000
16
+ num_training_samples_per_epoch: 320000
17
+ max_epochs: 100
18
+ max_steps: -1
19
+ num_workers: 8
20
+ val_check_interval: 1.0
21
+ check_val_every_n_epoch: 1
22
+ gradient_clip_val: 1.0
23
+ verbose: True
config/train_zhtrainticket.yaml ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ resume_from_checkpoint_path: null
2
+ result_path: "./result"
3
+ pretrained_model_name_or_path: "naver-clova-ix/donut-base"
4
+ dataset_name_or_paths: ["./dataset/zhtrainticket"] # should be prepared from https://github.com/beacandler/EATEN
5
+ sort_json_key: True
6
+ train_batch_sizes: [8]
7
+ val_batch_sizes: [1]
8
+ input_size: [960, 1280]
9
+ max_length: 256
10
+ align_long_axis: False
11
+ num_nodes: 1
12
+ seed: 2022
13
+ lr: 3e-5
14
+ warmup_steps: 300
15
+ num_training_samples_per_epoch: 1368
16
+ max_epochs: 10
17
+ max_steps: -1
18
+ num_workers: 8
19
+ val_check_interval: 1.0
20
+ check_val_every_n_epoch: 1
21
+ gradient_clip_val: 1.0
22
+ verbose: True
donut/__init__.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Donut
3
+ Copyright (c) 2022-present NAVER Corp.
4
+ MIT License
5
+ """
6
+ from .model import DonutConfig, DonutModel
7
+ from .util import DonutDataset, JSONParseEvaluator, load_json, save_json
8
+
9
+ __all__ = [
10
+ "DonutConfig",
11
+ "DonutModel",
12
+ "DonutDataset",
13
+ "JSONParseEvaluator",
14
+ "load_json",
15
+ "save_json",
16
+ ]
donut/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (459 Bytes). View file
 
donut/__pycache__/model.cpython-310.pyc ADDED
Binary file (18.6 kB). View file
 
donut/__pycache__/util.cpython-310.pyc ADDED
Binary file (11 kB). View file
 
donut/_version.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ """
2
+ Donut
3
+ Copyright (c) 2022-present NAVER Corp.
4
+ MIT License
5
+ """
6
+ __version__ = "1.0.9"
donut/model.py ADDED
@@ -0,0 +1,613 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Donut
3
+ Copyright (c) 2022-present NAVER Corp.
4
+ MIT License
5
+ """
6
+ import math
7
+ import os
8
+ import re
9
+ from typing import Any, List, Optional, Union
10
+
11
+ import numpy as np
12
+ import PIL
13
+ import timm
14
+ import torch
15
+ import torch.nn as nn
16
+ import torch.nn.functional as F
17
+ from PIL import ImageOps
18
+ from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
19
+ from timm.models.swin_transformer import SwinTransformer
20
+ from torchvision import transforms
21
+ from torchvision.transforms.functional import resize, rotate
22
+ from transformers import MBartConfig, MBartForCausalLM, XLMRobertaTokenizer
23
+ from transformers.file_utils import ModelOutput
24
+ from transformers.modeling_utils import PretrainedConfig, PreTrainedModel
25
+
26
+
27
+ class SwinEncoder(nn.Module):
28
+ r"""
29
+ Donut encoder based on SwinTransformer
30
+ Set the initial weights and configuration with a pretrained SwinTransformer and then
31
+ modify the detailed configurations as a Donut Encoder
32
+
33
+ Args:
34
+ input_size: Input image size (width, height)
35
+ align_long_axis: Whether to rotate image if height is greater than width
36
+ window_size: Window size(=patch size) of SwinTransformer
37
+ encoder_layer: Number of layers of SwinTransformer encoder
38
+ name_or_path: Name of a pretrained model name either registered in huggingface.co. or saved in local.
39
+ otherwise, `swin_base_patch4_window12_384` will be set (using `timm`).
40
+ """
41
+
42
+ def __init__(
43
+ self,
44
+ input_size: List[int],
45
+ align_long_axis: bool,
46
+ window_size: int,
47
+ encoder_layer: List[int],
48
+ name_or_path: Union[str, bytes, os.PathLike] = None,
49
+ ):
50
+ super().__init__()
51
+ self.input_size = input_size
52
+ self.align_long_axis = align_long_axis
53
+ self.window_size = window_size
54
+ self.encoder_layer = encoder_layer
55
+
56
+ self.to_tensor = transforms.Compose(
57
+ [
58
+ transforms.ToTensor(),
59
+ transforms.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD),
60
+ ]
61
+ )
62
+
63
+ self.model = SwinTransformer(
64
+ img_size=self.input_size,
65
+ depths=self.encoder_layer,
66
+ window_size=self.window_size,
67
+ patch_size=4,
68
+ embed_dim=128,
69
+ num_heads=[4, 8, 16, 32],
70
+ num_classes=0,
71
+ )
72
+ self.model.norm = None
73
+
74
+ # weight init with swin
75
+ if not name_or_path:
76
+ swin_state_dict = timm.create_model("swin_base_patch4_window12_384", pretrained=True).state_dict()
77
+ new_swin_state_dict = self.model.state_dict()
78
+ for x in new_swin_state_dict:
79
+ if x.endswith("relative_position_index") or x.endswith("attn_mask"):
80
+ pass
81
+ elif (
82
+ x.endswith("relative_position_bias_table")
83
+ and self.model.layers[0].blocks[0].attn.window_size[0] != 12
84
+ ):
85
+ pos_bias = swin_state_dict[x].unsqueeze(0)[0]
86
+ old_len = int(math.sqrt(len(pos_bias)))
87
+ new_len = int(2 * window_size - 1)
88
+ pos_bias = pos_bias.reshape(1, old_len, old_len, -1).permute(0, 3, 1, 2)
89
+ pos_bias = F.interpolate(pos_bias, size=(new_len, new_len), mode="bicubic", align_corners=False)
90
+ new_swin_state_dict[x] = pos_bias.permute(0, 2, 3, 1).reshape(1, new_len ** 2, -1).squeeze(0)
91
+ else:
92
+ new_swin_state_dict[x] = swin_state_dict[x]
93
+ self.model.load_state_dict(new_swin_state_dict)
94
+
95
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
96
+ """
97
+ Args:
98
+ x: (batch_size, num_channels, height, width)
99
+ """
100
+ x = self.model.patch_embed(x)
101
+ x = self.model.pos_drop(x)
102
+ x = self.model.layers(x)
103
+ return x
104
+
105
+ def prepare_input(self, img: PIL.Image.Image, random_padding: bool = False) -> torch.Tensor:
106
+ """
107
+ Convert PIL Image to tensor according to specified input_size after following steps below:
108
+ - resize
109
+ - rotate (if align_long_axis is True and image is not aligned longer axis with canvas)
110
+ - pad
111
+ """
112
+ img = img.convert("RGB")
113
+ if self.align_long_axis and (
114
+ (self.input_size[0] > self.input_size[1] and img.width > img.height)
115
+ or (self.input_size[0] < self.input_size[1] and img.width < img.height)
116
+ ):
117
+ img = rotate(img, angle=-90, expand=True)
118
+ img = resize(img, min(self.input_size))
119
+ img.thumbnail((self.input_size[1], self.input_size[0]))
120
+ delta_width = self.input_size[1] - img.width
121
+ delta_height = self.input_size[0] - img.height
122
+ if random_padding:
123
+ pad_width = np.random.randint(low=0, high=delta_width + 1)
124
+ pad_height = np.random.randint(low=0, high=delta_height + 1)
125
+ else:
126
+ pad_width = delta_width // 2
127
+ pad_height = delta_height // 2
128
+ padding = (
129
+ pad_width,
130
+ pad_height,
131
+ delta_width - pad_width,
132
+ delta_height - pad_height,
133
+ )
134
+ return self.to_tensor(ImageOps.expand(img, padding))
135
+
136
+
137
+ class BARTDecoder(nn.Module):
138
+ """
139
+ Donut Decoder based on Multilingual BART
140
+ Set the initial weights and configuration with a pretrained multilingual BART model,
141
+ and modify the detailed configurations as a Donut decoder
142
+
143
+ Args:
144
+ decoder_layer:
145
+ Number of layers of BARTDecoder
146
+ max_position_embeddings:
147
+ The maximum sequence length to be trained
148
+ name_or_path:
149
+ Name of a pretrained model name either registered in huggingface.co. or saved in local,
150
+ otherwise, `hyunwoongko/asian-bart-ecjk` will be set (using `transformers`)
151
+ """
152
+
153
+ def __init__(
154
+ self, decoder_layer: int, max_position_embeddings: int, name_or_path: Union[str, bytes, os.PathLike] = None
155
+ ):
156
+ super().__init__()
157
+ self.decoder_layer = decoder_layer
158
+ self.max_position_embeddings = max_position_embeddings
159
+
160
+ self.tokenizer = XLMRobertaTokenizer.from_pretrained(
161
+ "hyunwoongko/asian-bart-ecjk" if not name_or_path else name_or_path
162
+ )
163
+
164
+ self.model = MBartForCausalLM(
165
+ config=MBartConfig(
166
+ is_decoder=True,
167
+ is_encoder_decoder=False,
168
+ add_cross_attention=True,
169
+ decoder_layers=self.decoder_layer,
170
+ max_position_embeddings=self.max_position_embeddings,
171
+ vocab_size=len(self.tokenizer),
172
+ scale_embedding=True,
173
+ add_final_layer_norm=True,
174
+ )
175
+ )
176
+ self.model.forward = self.forward # to get cross attentions and utilize `generate` function
177
+
178
+ self.model.config.is_encoder_decoder = True # to get cross-attention
179
+ self.add_special_tokens(["<sep/>"]) # <sep/> is used for representing a list in a JSON
180
+ self.model.model.decoder.embed_tokens.padding_idx = self.tokenizer.pad_token_id
181
+ self.model.prepare_inputs_for_generation = self.prepare_inputs_for_inference
182
+
183
+ # weight init with asian-bart
184
+ if not name_or_path:
185
+ bart_state_dict = MBartForCausalLM.from_pretrained("hyunwoongko/asian-bart-ecjk").state_dict()
186
+ new_bart_state_dict = self.model.state_dict()
187
+ for x in new_bart_state_dict:
188
+ if x.endswith("embed_positions.weight") and self.max_position_embeddings != 1024:
189
+ new_bart_state_dict[x] = torch.nn.Parameter(
190
+ self.resize_bart_abs_pos_emb(
191
+ bart_state_dict[x],
192
+ self.max_position_embeddings
193
+ + 2, # https://github.com/huggingface/transformers/blob/v4.11.3/src/transformers/models/mbart/modeling_mbart.py#L118-L119
194
+ )
195
+ )
196
+ elif x.endswith("embed_tokens.weight") or x.endswith("lm_head.weight"):
197
+ new_bart_state_dict[x] = bart_state_dict[x][: len(self.tokenizer), :]
198
+ else:
199
+ new_bart_state_dict[x] = bart_state_dict[x]
200
+ self.model.load_state_dict(new_bart_state_dict)
201
+
202
+ def add_special_tokens(self, list_of_tokens: List[str]):
203
+ """
204
+ Add special tokens to tokenizer and resize the token embeddings
205
+ """
206
+ newly_added_num = self.tokenizer.add_special_tokens({"additional_special_tokens": sorted(set(list_of_tokens))})
207
+ if newly_added_num > 0:
208
+ self.model.resize_token_embeddings(len(self.tokenizer))
209
+
210
+ def prepare_inputs_for_inference(self, input_ids: torch.Tensor, encoder_outputs: torch.Tensor, past_key_values=None, past=None, use_cache: bool = None, attention_mask: torch.Tensor = None):
211
+ """
212
+ Args:
213
+ input_ids: (batch_size, sequence_lenth)
214
+ Returns:
215
+ input_ids: (batch_size, sequence_length)
216
+ attention_mask: (batch_size, sequence_length)
217
+ encoder_hidden_states: (batch_size, sequence_length, embedding_dim)
218
+ """
219
+ # for compatibility with transformers==4.11.x
220
+ if past is not None:
221
+ past_key_values = past
222
+ attention_mask = input_ids.ne(self.tokenizer.pad_token_id).long()
223
+ if past_key_values is not None:
224
+ input_ids = input_ids[:, -1:]
225
+ output = {
226
+ "input_ids": input_ids,
227
+ "attention_mask": attention_mask,
228
+ "past_key_values": past_key_values,
229
+ "use_cache": use_cache,
230
+ "encoder_hidden_states": encoder_outputs.last_hidden_state,
231
+ }
232
+ return output
233
+
234
+ def forward(
235
+ self,
236
+ input_ids,
237
+ attention_mask: Optional[torch.Tensor] = None,
238
+ encoder_hidden_states: Optional[torch.Tensor] = None,
239
+ past_key_values: Optional[torch.Tensor] = None,
240
+ labels: Optional[torch.Tensor] = None,
241
+ use_cache: bool = None,
242
+ output_attentions: Optional[torch.Tensor] = None,
243
+ output_hidden_states: Optional[torch.Tensor] = None,
244
+ return_dict: bool = None,
245
+ ):
246
+ """
247
+ A forward fucntion to get cross attentions and utilize `generate` function
248
+
249
+ Source:
250
+ https://github.com/huggingface/transformers/blob/v4.11.3/src/transformers/models/mbart/modeling_mbart.py#L1669-L1810
251
+
252
+ Args:
253
+ input_ids: (batch_size, sequence_length)
254
+ attention_mask: (batch_size, sequence_length)
255
+ encoder_hidden_states: (batch_size, sequence_length, hidden_size)
256
+
257
+ Returns:
258
+ loss: (1, )
259
+ logits: (batch_size, sequence_length, hidden_dim)
260
+ hidden_states: (batch_size, sequence_length, hidden_size)
261
+ decoder_attentions: (batch_size, num_heads, sequence_length, sequence_length)
262
+ cross_attentions: (batch_size, num_heads, sequence_length, sequence_length)
263
+ """
264
+ output_attentions = output_attentions if output_attentions is not None else self.model.config.output_attentions
265
+ output_hidden_states = (
266
+ output_hidden_states if output_hidden_states is not None else self.model.config.output_hidden_states
267
+ )
268
+ return_dict = return_dict if return_dict is not None else self.model.config.use_return_dict
269
+ outputs = self.model.model.decoder(
270
+ input_ids=input_ids,
271
+ attention_mask=attention_mask,
272
+ encoder_hidden_states=encoder_hidden_states,
273
+ past_key_values=past_key_values,
274
+ use_cache=use_cache,
275
+ output_attentions=output_attentions,
276
+ output_hidden_states=output_hidden_states,
277
+ return_dict=return_dict,
278
+ )
279
+
280
+ logits = self.model.lm_head(outputs[0])
281
+
282
+ loss = None
283
+ if labels is not None:
284
+ loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
285
+ loss = loss_fct(logits.view(-1, self.model.config.vocab_size), labels.view(-1))
286
+
287
+ if not return_dict:
288
+ output = (logits,) + outputs[1:]
289
+ return (loss,) + output if loss is not None else output
290
+
291
+ return ModelOutput(
292
+ loss=loss,
293
+ logits=logits,
294
+ past_key_values=outputs.past_key_values,
295
+ hidden_states=outputs.hidden_states,
296
+ decoder_attentions=outputs.attentions,
297
+ cross_attentions=outputs.cross_attentions,
298
+ )
299
+
300
+ @staticmethod
301
+ def resize_bart_abs_pos_emb(weight: torch.Tensor, max_length: int) -> torch.Tensor:
302
+ """
303
+ Resize position embeddings
304
+ Truncate if sequence length of Bart backbone is greater than given max_length,
305
+ else interpolate to max_length
306
+ """
307
+ if weight.shape[0] > max_length:
308
+ weight = weight[:max_length, ...]
309
+ else:
310
+ weight = (
311
+ F.interpolate(
312
+ weight.permute(1, 0).unsqueeze(0),
313
+ size=max_length,
314
+ mode="linear",
315
+ align_corners=False,
316
+ )
317
+ .squeeze(0)
318
+ .permute(1, 0)
319
+ )
320
+ return weight
321
+
322
+
323
+ class DonutConfig(PretrainedConfig):
324
+ r"""
325
+ This is the configuration class to store the configuration of a [`DonutModel`]. It is used to
326
+ instantiate a Donut model according to the specified arguments, defining the model architecture
327
+
328
+ Args:
329
+ input_size:
330
+ Input image size (canvas size) of Donut.encoder, SwinTransformer in this codebase
331
+ align_long_axis:
332
+ Whether to rotate image if height is greater than width
333
+ window_size:
334
+ Window size of Donut.encoder, SwinTransformer in this codebase
335
+ encoder_layer:
336
+ Depth of each Donut.encoder Encoder layer, SwinTransformer in this codebase
337
+ decoder_layer:
338
+ Number of hidden layers in the Donut.decoder, such as BART
339
+ max_position_embeddings:
340
+ Trained max position embeddings in the Donut decoder;
341
+ if not specified, it will have the same value as max_length
342
+ max_length:
343
+ Max position embeddings (= maximum sequence length) you want to train with
344
+ name_or_path:
345
+ Name of a pretrained model, either registered on huggingface.co or saved locally
346
+ """
347
+
348
+ model_type = "donut"
349
+
350
+ def __init__(
351
+ self,
352
+ input_size: List[int] = [2560, 1920],
353
+ align_long_axis: bool = False,
354
+ window_size: int = 10,
355
+ encoder_layer: List[int] = [2, 2, 14, 2],
356
+ decoder_layer: int = 4,
357
+ max_position_embeddings: int = None,
358
+ max_length: int = 1536,
359
+ name_or_path: Union[str, bytes, os.PathLike] = "",
360
+ **kwargs,
361
+ ):
362
+ super().__init__()
363
+ self.input_size = input_size
364
+ self.align_long_axis = align_long_axis
365
+ self.window_size = window_size
366
+ self.encoder_layer = encoder_layer
367
+ self.decoder_layer = decoder_layer
368
+ self.max_position_embeddings = max_length if max_position_embeddings is None else max_position_embeddings
369
+ self.max_length = max_length
370
+ self.name_or_path = name_or_path
371
+
372
+
373
+ class DonutModel(PreTrainedModel):
374
+ r"""
375
+ Donut: an E2E OCR-free Document Understanding Transformer.
376
+ The encoder maps an input document image into a set of embeddings,
377
+ the decoder predicts a desired token sequence that can be converted to a structured format,
378
+ given a prompt and the encoder output embeddings
379
+ """
380
+ config_class = DonutConfig
381
+ base_model_prefix = "donut"
382
+
383
+ def __init__(self, config: DonutConfig):
384
+ super().__init__(config)
385
+ self.config = config
386
+ self.encoder = SwinEncoder(
387
+ input_size=self.config.input_size,
388
+ align_long_axis=self.config.align_long_axis,
389
+ window_size=self.config.window_size,
390
+ encoder_layer=self.config.encoder_layer,
391
+ name_or_path=self.config.name_or_path,
392
+ )
393
+ self.decoder = BARTDecoder(
394
+ max_position_embeddings=self.config.max_position_embeddings,
395
+ decoder_layer=self.config.decoder_layer,
396
+ name_or_path=self.config.name_or_path,
397
+ )
398
+
399
+ def forward(self, image_tensors: torch.Tensor, decoder_input_ids: torch.Tensor, decoder_labels: torch.Tensor):
400
+ """
401
+ Calculate a loss given an input image and a desired token sequence;
402
+ the model is trained in a teacher-forcing manner
403
+
404
+ Args:
405
+ image_tensors: (batch_size, num_channels, height, width)
406
+ decoder_input_ids: (batch_size, sequence_length)
407
+ decoder_labels: (batch_size, sequence_length)
408
+ """
409
+ encoder_outputs = self.encoder(image_tensors)
410
+ decoder_outputs = self.decoder(
411
+ input_ids=decoder_input_ids,
412
+ encoder_hidden_states=encoder_outputs,
413
+ labels=decoder_labels,
414
+ )
415
+ return decoder_outputs
416
+
417
+ def inference(
418
+ self,
419
+ image: PIL.Image = None,
420
+ prompt: str = None,
421
+ image_tensors: Optional[torch.Tensor] = None,
422
+ prompt_tensors: Optional[torch.Tensor] = None,
423
+ return_json: bool = True,
424
+ return_attentions: bool = False,
425
+ ):
426
+ """
427
+ Generate a token sequence in an auto-regressive manner;
428
+ the generated token sequence is converted into an ordered JSON format
429
+
430
+ Args:
431
+ image: input document image (PIL.Image)
432
+ prompt: task prompt (string) to guide Donut Decoder generation
433
+ image_tensors: (1, num_channels, height, width)
434
+ if not given, `image` is converted to a tensor internally
435
+ prompt_tensors: (1, sequence_length)
436
+ if not given, `prompt` is tokenized internally
437
+ """
438
+ # prepare backbone inputs (image and prompt)
439
+ if image is None and image_tensors is None:
440
+ raise ValueError("Expected either image or image_tensors")
441
+ if all(v is None for v in {prompt, prompt_tensors}):
442
+ raise ValueError("Expected either prompt or prompt_tensors")
443
+
444
+ if image_tensors is None:
445
+ image_tensors = self.encoder.prepare_input(image).unsqueeze(0)
446
+
447
+ if self.device.type == "cuda": # half is not compatible in cpu implementation.
448
+ image_tensors = image_tensors.half()
449
+ image_tensors = image_tensors.to(self.device)
450
+
451
+ if prompt_tensors is None:
452
+ prompt_tensors = self.decoder.tokenizer(prompt, add_special_tokens=False, return_tensors="pt")["input_ids"]
453
+
454
+ prompt_tensors = prompt_tensors.to(self.device)
455
+
456
+ last_hidden_state = self.encoder(image_tensors)
457
+ if self.device.type != "cuda":
458
+ last_hidden_state = last_hidden_state.to(torch.float32)
459
+
460
+ encoder_outputs = ModelOutput(last_hidden_state=last_hidden_state, attentions=None)
461
+
462
+ if len(encoder_outputs.last_hidden_state.size()) == 1:
463
+ encoder_outputs.last_hidden_state = encoder_outputs.last_hidden_state.unsqueeze(0)
464
+ if len(prompt_tensors.size()) == 1:
465
+ prompt_tensors = prompt_tensors.unsqueeze(0)
466
+
467
+ # get decoder output
468
+ decoder_output = self.decoder.model.generate(
469
+ decoder_input_ids=prompt_tensors,
470
+ encoder_outputs=encoder_outputs,
471
+ max_length=self.config.max_length,
472
+ early_stopping=True,
473
+ pad_token_id=self.decoder.tokenizer.pad_token_id,
474
+ eos_token_id=self.decoder.tokenizer.eos_token_id,
475
+ use_cache=True,
476
+ num_beams=1,
477
+ bad_words_ids=[[self.decoder.tokenizer.unk_token_id]],
478
+ return_dict_in_generate=True,
479
+ output_attentions=return_attentions,
480
+ )
481
+
482
+ output = {"predictions": list()}
483
+ for seq in self.decoder.tokenizer.batch_decode(decoder_output.sequences):
484
+ seq = seq.replace(self.decoder.tokenizer.eos_token, "").replace(self.decoder.tokenizer.pad_token, "")
485
+ seq = re.sub(r"<.*?>", "", seq, count=1).strip() # remove first task start token
486
+ if return_json:
487
+ output["predictions"].append(self.token2json(seq))
488
+ else:
489
+ output["predictions"].append(seq)
490
+
491
+ if return_attentions:
492
+ output["attentions"] = {
493
+ "self_attentions": decoder_output.decoder_attentions,
494
+ "cross_attentions": decoder_output.cross_attentions,
495
+ }
496
+
497
+ return output
498
+
499
+ def json2token(self, obj: Any, update_special_tokens_for_json_key: bool = True, sort_json_key: bool = True):
500
+ """
501
+ Convert an ordered JSON object into a token sequence
502
+ """
503
+ if type(obj) == dict:
504
+ if len(obj) == 1 and "text_sequence" in obj:
505
+ return obj["text_sequence"]
506
+ else:
507
+ output = ""
508
+ if sort_json_key:
509
+ keys = sorted(obj.keys(), reverse=True)
510
+ else:
511
+ keys = obj.keys()
512
+ for k in keys:
513
+ if update_special_tokens_for_json_key:
514
+ self.decoder.add_special_tokens([fr"<s_{k}>", fr"</s_{k}>"])
515
+ output += (
516
+ fr"<s_{k}>"
517
+ + self.json2token(obj[k], update_special_tokens_for_json_key, sort_json_key)
518
+ + fr"</s_{k}>"
519
+ )
520
+ return output
521
+ elif type(obj) == list:
522
+ return r"<sep/>".join(
523
+ [self.json2token(item, update_special_tokens_for_json_key, sort_json_key) for item in obj]
524
+ )
525
+ else:
526
+ obj = str(obj)
527
+ if f"<{obj}/>" in self.decoder.tokenizer.all_special_tokens:
528
+ obj = f"<{obj}/>" # for categorical special tokens
529
+ return obj
530
+
531
+ def token2json(self, tokens, is_inner_value=False):
532
+ """
533
+ Convert a (generated) token sequence into an ordered JSON format
534
+ """
535
+ output = dict()
536
+
537
+ while tokens:
538
+ start_token = re.search(r"<s_(.*?)>", tokens, re.IGNORECASE)
539
+ if start_token is None:
540
+ break
541
+ key = start_token.group(1)
542
+ end_token = re.search(fr"</s_{key}>", tokens, re.IGNORECASE)
543
+ start_token = start_token.group()
544
+ if end_token is None:
545
+ tokens = tokens.replace(start_token, "")
546
+ else:
547
+ end_token = end_token.group()
548
+ start_token_escaped = re.escape(start_token)
549
+ end_token_escaped = re.escape(end_token)
550
+ content = re.search(f"{start_token_escaped}(.*?){end_token_escaped}", tokens, re.IGNORECASE)
551
+ if content is not None:
552
+ content = content.group(1).strip()
553
+ if r"<s_" in content and r"</s_" in content: # non-leaf node
554
+ value = self.token2json(content, is_inner_value=True)
555
+ if value:
556
+ if len(value) == 1:
557
+ value = value[0]
558
+ output[key] = value
559
+ else: # leaf nodes
560
+ output[key] = []
561
+ for leaf in content.split(r"<sep/>"):
562
+ leaf = leaf.strip()
563
+ if (
564
+ leaf in self.decoder.tokenizer.get_added_vocab()
565
+ and leaf[0] == "<"
566
+ and leaf[-2:] == "/>"
567
+ ):
568
+ leaf = leaf[1:-2] # for categorical special tokens
569
+ output[key].append(leaf)
570
+ if len(output[key]) == 1:
571
+ output[key] = output[key][0]
572
+
573
+ tokens = tokens[tokens.find(end_token) + len(end_token) :].strip()
574
+ if tokens[:6] == r"<sep/>": # non-leaf nodes
575
+ return [output] + self.token2json(tokens[6:], is_inner_value=True)
576
+
577
+ if len(output):
578
+ return [output] if is_inner_value else output
579
+ else:
580
+ return [] if is_inner_value else {"text_sequence": tokens}
581
+
582
+ @classmethod
583
+ def from_pretrained(
584
+ cls,
585
+ pretrained_model_name_or_path: Union[str, bytes, os.PathLike],
586
+ *model_args,
587
+ **kwargs,
588
+ ):
589
+ r"""
590
+ Instantiate a pretrained donut model from a pre-trained model configuration
591
+
592
+ Args:
593
+ pretrained_model_name_or_path:
594
+ Name of a pretrained model, either registered on huggingface.co or saved locally,
595
+ e.g., `naver-clova-ix/donut-base`, or `naver-clova-ix/donut-base-finetuned-rvlcdip`
596
+ """
597
+ model = super(DonutModel, cls).from_pretrained(pretrained_model_name_or_path, revision="official", *model_args, **kwargs)
598
+
599
+ # truncate or interpolate position embeddings of donut decoder
600
+ max_length = kwargs.get("max_length", model.config.max_position_embeddings)
601
+ if (
602
+ max_length != model.config.max_position_embeddings
603
+ ): # if the max_length of the trained model differs from the max_length you want to train with
604
+ model.decoder.model.model.decoder.embed_positions.weight = torch.nn.Parameter(
605
+ model.decoder.resize_bart_abs_pos_emb(
606
+ model.decoder.model.model.decoder.embed_positions.weight,
607
+ max_length
608
+ + 2, # https://github.com/huggingface/transformers/blob/v4.11.3/src/transformers/models/mbart/modeling_mbart.py#L118-L119
609
+ )
610
+ )
611
+ model.config.max_position_embeddings = max_length
612
+
613
+ return model
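The `from_pretrained` override above loads a checkpoint and, if needed, resizes the decoder's absolute position embeddings to the requested `max_length`. A minimal inference sketch against this API (the checkpoint name follows the docstring example, the task prompt follows the `<s_{task_name}>` convention used in test.py, and the image path is a placeholder):

    from PIL import Image
    from donut import DonutModel

    model = DonutModel.from_pretrained("naver-clova-ix/donut-base-finetuned-rvlcdip")
    model.eval()

    image = Image.open("sample_document.png")        # placeholder input image
    output = model.inference(image=image, prompt="<s_rvlcdip>")
    print(output["predictions"][0])                  # ordered JSON built by token2json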
donut/util.py ADDED
@@ -0,0 +1,340 @@
1
+ """
2
+ Donut
3
+ Copyright (c) 2022-present NAVER Corp.
4
+ MIT License
5
+ """
6
+ import json
7
+ import os
8
+ import random
9
+ from collections import defaultdict
10
+ from typing import Any, Dict, List, Tuple, Union
11
+
12
+ import torch
13
+ import zss
14
+ from datasets import load_dataset
15
+ from nltk import edit_distance
16
+ from torch.utils.data import Dataset
17
+ from transformers.modeling_utils import PreTrainedModel
18
+ from zss import Node
19
+
20
+
21
+ def save_json(write_path: Union[str, bytes, os.PathLike], save_obj: Any):
22
+ with open(write_path, "w") as f:
23
+ json.dump(save_obj, f)
24
+
25
+
26
+ def load_json(json_path: Union[str, bytes, os.PathLike]):
27
+ with open(json_path, "r") as f:
28
+ return json.load(f)
29
+
30
+
31
+ class DonutDataset(Dataset):
32
+ """
33
+ DonutDataset which is saved in huggingface datasets format. (see details in https://huggingface.co/docs/datasets)
34
+ Each row consists of an image path (png/jpg/jpeg) and ground-truth data (json/jsonl/txt),
35
+ and it will be converted into input_tensor(vectorized image) and input_ids(tokenized string)
36
+
37
+ Args:
38
+ dataset_name_or_path: name of dataset (available at huggingface.co/datasets) or the path containing image files and metadata.jsonl
39
+ ignore_id: ignore_index for torch.nn.CrossEntropyLoss
40
+ task_start_token: the special token to be fed to the decoder to conduct the target task
41
+ """
42
+
43
+ def __init__(
44
+ self,
45
+ dataset_name_or_path: str,
46
+ donut_model: PreTrainedModel,
47
+ max_length: int,
48
+ split: str = "train",
49
+ ignore_id: int = -100,
50
+ task_start_token: str = "<s>",
51
+ prompt_end_token: str = None,
52
+ sort_json_key: bool = True,
53
+ ):
54
+ super().__init__()
55
+
56
+ self.donut_model = donut_model
57
+ self.max_length = max_length
58
+ self.split = split
59
+ self.ignore_id = ignore_id
60
+ self.task_start_token = task_start_token
61
+ self.prompt_end_token = prompt_end_token if prompt_end_token else task_start_token
62
+ self.sort_json_key = sort_json_key
63
+
64
+ self.dataset = load_dataset(dataset_name_or_path, split=self.split)
65
+ self.dataset_length = len(self.dataset)
66
+
67
+ self.gt_token_sequences = []
68
+ for sample in self.dataset:
69
+ ground_truth = json.loads(sample["ground_truth"])
70
+ if "gt_parses" in ground_truth: # when multiple ground truths are available, e.g., docvqa
71
+ assert isinstance(ground_truth["gt_parses"], list)
72
+ gt_jsons = ground_truth["gt_parses"]
73
+ else:
74
+ assert "gt_parse" in ground_truth and isinstance(ground_truth["gt_parse"], dict)
75
+ gt_jsons = [ground_truth["gt_parse"]]
76
+
77
+ self.gt_token_sequences.append(
78
+ [
79
+ task_start_token
80
+ + self.donut_model.json2token(
81
+ gt_json,
82
+ update_special_tokens_for_json_key=self.split == "train",
83
+ sort_json_key=self.sort_json_key,
84
+ )
85
+ + self.donut_model.decoder.tokenizer.eos_token
86
+ for gt_json in gt_jsons # load json from list of json
87
+ ]
88
+ )
89
+
90
+ self.donut_model.decoder.add_special_tokens([self.task_start_token, self.prompt_end_token])
91
+ self.prompt_end_token_id = self.donut_model.decoder.tokenizer.convert_tokens_to_ids(self.prompt_end_token)
92
+
93
+ def __len__(self) -> int:
94
+ return self.dataset_length
95
+
96
+ def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
97
+ """
98
+ Load an image from the given dataset path and convert it into input_tensor and labels.
99
+ Convert gt data into input_ids (tokenized string)
100
+
101
+ Returns:
102
+ input_tensor : preprocessed image
103
+ input_ids : tokenized gt_data
104
+ labels : masked labels (model doesn't need to predict prompt and pad token)
105
+ """
106
+ sample = self.dataset[idx]
107
+
108
+ # input_tensor
109
+ input_tensor = self.donut_model.encoder.prepare_input(sample["image"], random_padding=self.split == "train")
110
+
111
+ # input_ids
112
+ processed_parse = random.choice(self.gt_token_sequences[idx]) # can be more than one, e.g., DocVQA Task 1
113
+ input_ids = self.donut_model.decoder.tokenizer(
114
+ processed_parse,
115
+ add_special_tokens=False,
116
+ max_length=self.max_length,
117
+ padding="max_length",
118
+ truncation=True,
119
+ return_tensors="pt",
120
+ )["input_ids"].squeeze(0)
121
+
122
+ if self.split == "train":
123
+ labels = input_ids.clone()
124
+ labels[
125
+ labels == self.donut_model.decoder.tokenizer.pad_token_id
126
+ ] = self.ignore_id # model doesn't need to predict pad token
127
+ labels[
128
+ : torch.nonzero(labels == self.prompt_end_token_id).sum() + 1
129
+ ] = self.ignore_id # model doesn't need to predict prompt (for VQA)
130
+ return input_tensor, input_ids, labels
131
+ else:
132
+ prompt_end_index = torch.nonzero(
133
+ input_ids == self.prompt_end_token_id
134
+ ).sum() # return prompt end index instead of target output labels
135
+ return input_tensor, input_ids, prompt_end_index, processed_parse
136
+
137
+
138
+ class JSONParseEvaluator:
139
+ """
140
+ Calculate n-TED (Normalized Tree Edit Distance) based accuracy and F1 accuracy score
141
+ """
142
+
143
+ @staticmethod
144
+ def flatten(data: dict):
145
+ """
146
+ Flatten a nested dictionary into a list of (key path, value) pairs
147
+ Example:
148
+ input(dict)
149
+ {
150
+ "menu": [
151
+ {"name" : ["cake"], "count" : ["2"]},
152
+ {"name" : ["juice"], "count" : ["1"]},
153
+ ]
154
+ }
155
+ output(list)
156
+ [
157
+ ("menu.name", "cake"),
158
+ ("menu.count", "2"),
159
+ ("menu.name", "juice"),
160
+ ("menu.count", "1"),
161
+ ]
162
+ """
163
+ flatten_data = list()
164
+
165
+ def _flatten(value, key=""):
166
+ if type(value) is dict:
167
+ for child_key, child_value in value.items():
168
+ _flatten(child_value, f"{key}.{child_key}" if key else child_key)
169
+ elif type(value) is list:
170
+ for value_item in value:
171
+ _flatten(value_item, key)
172
+ else:
173
+ flatten_data.append((key, value))
174
+
175
+ _flatten(data)
176
+ return flatten_data
177
+
178
+ @staticmethod
179
+ def update_cost(node1: Node, node2: Node):
180
+ """
181
+ Update cost for tree edit distance.
182
+ If both are leaf nodes, calculate the string edit distance between the two labels (the special token '<leaf>' is ignored).
184
+ If only one of them is a leaf node, cost is the length of the string in the leaf node + 1.
185
+ If neither is a leaf node, cost is 0 if label1 equals label2, otherwise 1
185
+ """
186
+ label1 = node1.label
187
+ label2 = node2.label
188
+ label1_leaf = "<leaf>" in label1
189
+ label2_leaf = "<leaf>" in label2
190
+ if label1_leaf == True and label2_leaf == True:
191
+ return edit_distance(label1.replace("<leaf>", ""), label2.replace("<leaf>", ""))
192
+ elif label1_leaf == False and label2_leaf == True:
193
+ return 1 + len(label2.replace("<leaf>", ""))
194
+ elif label1_leaf == True and label2_leaf == False:
195
+ return 1 + len(label1.replace("<leaf>", ""))
196
+ else:
197
+ return int(label1 != label2)
198
+
199
+ @staticmethod
200
+ def insert_and_remove_cost(node: Node):
201
+ """
202
+ Insert and remove cost for tree edit distance.
203
+ If leaf node, cost is length of label name.
204
+ Otherwise, 1
205
+ """
206
+ label = node.label
207
+ if "<leaf>" in label:
208
+ return len(label.replace("<leaf>", ""))
209
+ else:
210
+ return 1
211
+
212
+ def normalize_dict(self, data: Union[Dict, List, Any]):
213
+ """
214
+ Sort dictionary keys, and iterate over elements if data is a list
215
+ """
216
+ if not data:
217
+ return {}
218
+
219
+ if isinstance(data, dict):
220
+ new_data = dict()
221
+ for key in sorted(data.keys(), key=lambda k: (len(k), k)):
222
+ value = self.normalize_dict(data[key])
223
+ if value:
224
+ if not isinstance(value, list):
225
+ value = [value]
226
+ new_data[key] = value
227
+
228
+ elif isinstance(data, list):
229
+ if all(isinstance(item, dict) for item in data):
230
+ new_data = []
231
+ for item in data:
232
+ item = self.normalize_dict(item)
233
+ if item:
234
+ new_data.append(item)
235
+ else:
236
+ new_data = [str(item).strip() for item in data if type(item) in {str, int, float} and str(item).strip()]
237
+ else:
238
+ new_data = [str(data).strip()]
239
+
240
+ return new_data
241
+
242
+ def cal_f1(self, preds: List[dict], answers: List[dict]):
243
+ """
244
+ Calculate global F1 accuracy score (field-level, micro-averaged) by counting all true positives, false negatives and false positives
245
+ """
246
+ total_tp, total_fn_or_fp = 0, 0
247
+ for pred, answer in zip(preds, answers):
248
+ pred, answer = self.flatten(self.normalize_dict(pred)), self.flatten(self.normalize_dict(answer))
249
+ for field in pred:
250
+ if field in answer:
251
+ total_tp += 1
252
+ answer.remove(field)
253
+ else:
254
+ total_fn_or_fp += 1
255
+ total_fn_or_fp += len(answer)
256
+ return total_tp / (total_tp + total_fn_or_fp / 2)
257
+
258
+ def construct_tree_from_dict(self, data: Union[Dict, List], node_name: str = None):
259
+ """
260
+ Convert Dictionary into Tree
261
+
262
+ Example:
263
+ input(dict)
264
+
265
+ {
266
+ "menu": [
267
+ {"name" : ["cake"], "count" : ["2"]},
268
+ {"name" : ["juice"], "count" : ["1"]},
269
+ ]
270
+ }
271
+
272
+ output(tree)
273
+ <root>
274
+ |
275
+ menu
276
+ / \
277
+ <subtree> <subtree>
278
+ / | | \
279
+ name count name count
280
+ / | | \
281
+ <leaf>cake <leaf>2 <leaf>juice <leaf>1
282
+ """
283
+ if node_name is None:
284
+ node_name = "<root>"
285
+
286
+ node = Node(node_name)
287
+
288
+ if isinstance(data, dict):
289
+ for key, value in data.items():
290
+ kid_node = self.construct_tree_from_dict(value, key)
291
+ node.addkid(kid_node)
292
+ elif isinstance(data, list):
293
+ if all(isinstance(item, dict) for item in data):
294
+ for item in data:
295
+ kid_node = self.construct_tree_from_dict(
296
+ item,
297
+ "<subtree>",
298
+ )
299
+ node.addkid(kid_node)
300
+ else:
301
+ for item in data:
302
+ node.addkid(Node(f"<leaf>{item}"))
303
+ else:
304
+ raise Exception(data, node_name)
305
+ return node
306
+
307
+ def cal_acc(self, pred: dict, answer: dict):
308
+ """
309
+ Calculate normalized tree edit distance (nTED) based accuracy:
310
+ 1) Construct trees from the dicts,
311
+ 2) Get the tree edit distance with insert/remove/update costs,
312
+ 3) Divide the distance by the GT tree size (i.e., nTED),
313
+ 4) Compute the nTED-based accuracy, i.e., max(1 - nTED, 0).
314
+ """
315
+ pred = self.construct_tree_from_dict(self.normalize_dict(pred))
316
+ answer = self.construct_tree_from_dict(self.normalize_dict(answer))
317
+ return max(
318
+ 0,
319
+ 1
320
+ - (
321
+ zss.distance(
322
+ pred,
323
+ answer,
324
+ get_children=zss.Node.get_children,
325
+ insert_cost=self.insert_and_remove_cost,
326
+ remove_cost=self.insert_and_remove_cost,
327
+ update_cost=self.update_cost,
328
+ return_operations=False,
329
+ )
330
+ / zss.distance(
331
+ self.construct_tree_from_dict(self.normalize_dict({})),
332
+ answer,
333
+ get_children=zss.Node.get_children,
334
+ insert_cost=self.insert_and_remove_cost,
335
+ remove_cost=self.insert_and_remove_cost,
336
+ update_cost=self.update_cost,
337
+ return_operations=False,
338
+ )
339
+ ),
340
+ )
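`JSONParseEvaluator` above is what test.py uses for scoring. A tiny sketch of how it behaves on toy predictions (the dicts and values are illustrative only):

    from donut import JSONParseEvaluator

    evaluator = JSONParseEvaluator()
    pred   = {"menu": [{"name": ["cake"], "count": ["2"]}]}
    answer = {"menu": [{"name": ["cake"], "count": ["1"]}]}

    print(evaluator.cal_acc(pred, answer))       # nTED-based accuracy in [0, 1]
    print(evaluator.cal_f1([pred], [answer]))    # field-level, micro-averaged F1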
lightning_module.py ADDED
@@ -0,0 +1,198 @@
1
+ """
2
+ Donut
3
+ Copyright (c) 2022-present NAVER Corp.
4
+ MIT License
5
+ """
6
+ import math
7
+ import random
8
+ import re
9
+ from pathlib import Path
10
+
11
+ import numpy as np
12
+ import pytorch_lightning as pl
13
+ import torch
14
+ from nltk import edit_distance
15
+ from pytorch_lightning.utilities import rank_zero_only
16
+ from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
17
+ from torch.nn.utils.rnn import pad_sequence
18
+ from torch.optim.lr_scheduler import LambdaLR
19
+ from torch.utils.data import DataLoader
20
+
21
+ from donut import DonutConfig, DonutModel
22
+
23
+
24
+ class DonutModelPLModule(pl.LightningModule):
25
+ def __init__(self, config):
26
+ super().__init__()
27
+ self.config = config
28
+
29
+ if self.config.get("pretrained_model_name_or_path", False):
30
+ self.model = DonutModel.from_pretrained(
31
+ self.config.pretrained_model_name_or_path,
32
+ input_size=self.config.input_size,
33
+ max_length=self.config.max_length,
34
+ align_long_axis=self.config.align_long_axis,
35
+ ignore_mismatched_sizes=True,
36
+ )
37
+ else:
38
+ self.model = DonutModel(
39
+ config=DonutConfig(
40
+ input_size=self.config.input_size,
41
+ max_length=self.config.max_length,
42
+ align_long_axis=self.config.align_long_axis,
43
+ # with DonutConfig, the architecture customization is available, e.g.,
44
+ # encoder_layer=[2,2,14,2], decoder_layer=4, ...
45
+ )
46
+ )
47
+ self.pytorch_lightning_version_is_1 = int(pl.__version__[0]) < 2
48
+ self.num_of_loaders = len(self.config.dataset_name_or_paths)
49
+
50
+ def training_step(self, batch, batch_idx):
51
+ image_tensors, decoder_input_ids, decoder_labels = list(), list(), list()
52
+ for batch_data in batch:
53
+ image_tensors.append(batch_data[0])
54
+ decoder_input_ids.append(batch_data[1][:, :-1])
55
+ decoder_labels.append(batch_data[2][:, 1:])
56
+ image_tensors = torch.cat(image_tensors)
57
+ decoder_input_ids = torch.cat(decoder_input_ids)
58
+ decoder_labels = torch.cat(decoder_labels)
59
+ loss = self.model(image_tensors, decoder_input_ids, decoder_labels)[0]
60
+ self.log_dict({"train_loss": loss}, sync_dist=True)
61
+ if not self.pytorch_lightning_version_is_1:
62
+ self.log('loss', loss, prog_bar=True)
63
+ return loss
64
+
65
+ def on_validation_epoch_start(self) -> None:
66
+ super().on_validation_epoch_start()
67
+ self.validation_step_outputs = [[] for _ in range(self.num_of_loaders)]
68
+ return
69
+
70
+ def validation_step(self, batch, batch_idx, dataloader_idx=0):
71
+ image_tensors, decoder_input_ids, prompt_end_idxs, answers = batch
72
+ decoder_prompts = pad_sequence(
73
+ [input_id[: end_idx + 1] for input_id, end_idx in zip(decoder_input_ids, prompt_end_idxs)],
74
+ batch_first=True,
75
+ )
76
+
77
+ preds = self.model.inference(
78
+ image_tensors=image_tensors,
79
+ prompt_tensors=decoder_prompts,
80
+ return_json=False,
81
+ return_attentions=False,
82
+ )["predictions"]
83
+
84
+ scores = list()
85
+ for pred, answer in zip(preds, answers):
86
+ pred = re.sub(r"(?:(?<=>) | (?=</s_))", "", pred)
87
+ answer = re.sub(r"<.*?>", "", answer, count=1)
88
+ answer = answer.replace(self.model.decoder.tokenizer.eos_token, "")
89
+ scores.append(edit_distance(pred, answer) / max(len(pred), len(answer)))
90
+
91
+ if self.config.get("verbose", False) and len(scores) == 1:
92
+ self.print(f"Prediction: {pred}")
93
+ self.print(f" Answer: {answer}")
94
+ self.print(f" Normed ED: {scores[0]}")
95
+
96
+ self.validation_step_outputs[dataloader_idx].append(scores)
97
+
98
+ return scores
99
+
100
+ def on_validation_epoch_end(self):
101
+ assert len(self.validation_step_outputs) == self.num_of_loaders
102
+ cnt = [0] * self.num_of_loaders
103
+ total_metric = [0] * self.num_of_loaders
104
+ val_metric = [0] * self.num_of_loaders
105
+ for i, results in enumerate(self.validation_step_outputs):
106
+ for scores in results:
107
+ cnt[i] += len(scores)
108
+ total_metric[i] += np.sum(scores)
109
+ val_metric[i] = total_metric[i] / cnt[i]
110
+ val_metric_name = f"val_metric_{i}th_dataset"
111
+ self.log_dict({val_metric_name: val_metric[i]}, sync_dist=True)
112
+ self.log_dict({"val_metric": np.sum(total_metric) / np.sum(cnt)}, sync_dist=True)
113
+
114
+ def configure_optimizers(self):
115
+
116
+ max_iter = None
117
+
118
+ if int(self.config.get("max_epochs", -1)) > 0:
119
+ assert len(self.config.train_batch_sizes) == 1, "Set max_epochs only if the number of datasets is 1"
120
+ max_iter = (self.config.max_epochs * self.config.num_training_samples_per_epoch) / (
121
+ self.config.train_batch_sizes[0] * torch.cuda.device_count() * self.config.get("num_nodes", 1)
122
+ )
123
+
124
+ if int(self.config.get("max_steps", -1)) > 0:
125
+ max_iter = min(self.config.max_steps, max_iter) if max_iter is not None else self.config.max_steps
126
+
127
+ assert max_iter is not None
128
+ optimizer = torch.optim.Adam(self.parameters(), lr=self.config.lr)
129
+ scheduler = {
130
+ "scheduler": self.cosine_scheduler(optimizer, max_iter, self.config.warmup_steps),
131
+ "name": "learning_rate",
132
+ "interval": "step",
133
+ }
134
+ return [optimizer], [scheduler]
135
+
136
+ @staticmethod
137
+ def cosine_scheduler(optimizer, training_steps, warmup_steps):
138
+ def lr_lambda(current_step):
139
+ if current_step < warmup_steps:
140
+ return current_step / max(1, warmup_steps)
141
+ progress = current_step - warmup_steps
142
+ progress /= max(1, training_steps - warmup_steps)
143
+ return max(0.0, 0.5 * (1.0 + math.cos(math.pi * progress)))
144
+
145
+ return LambdaLR(optimizer, lr_lambda)
146
+
147
+ @rank_zero_only
148
+ def on_save_checkpoint(self, checkpoint):
149
+ save_path = Path(self.config.result_path) / self.config.exp_name / self.config.exp_version
150
+ self.model.save_pretrained(save_path)
151
+ self.model.decoder.tokenizer.save_pretrained(save_path)
152
+
153
+
154
+ class DonutDataPLModule(pl.LightningDataModule):
155
+ def __init__(self, config):
156
+ super().__init__()
157
+ self.config = config
158
+ self.train_batch_sizes = self.config.train_batch_sizes
159
+ self.val_batch_sizes = self.config.val_batch_sizes
160
+ self.train_datasets = []
161
+ self.val_datasets = []
162
+ self.g = torch.Generator()
163
+ self.g.manual_seed(self.config.seed)
164
+
165
+ def train_dataloader(self):
166
+ loaders = list()
167
+ for train_dataset, batch_size in zip(self.train_datasets, self.train_batch_sizes):
168
+ loaders.append(
169
+ DataLoader(
170
+ train_dataset,
171
+ batch_size=batch_size,
172
+ num_workers=self.config.num_workers,
173
+ pin_memory=True,
174
+ worker_init_fn=self.seed_worker,
175
+ generator=self.g,
176
+ shuffle=True,
177
+ )
178
+ )
179
+ return loaders
180
+
181
+ def val_dataloader(self):
182
+ loaders = list()
183
+ for val_dataset, batch_size in zip(self.val_datasets, self.val_batch_sizes):
184
+ loaders.append(
185
+ DataLoader(
186
+ val_dataset,
187
+ batch_size=batch_size,
188
+ pin_memory=True,
189
+ shuffle=False,
190
+ )
191
+ )
192
+ return loaders
193
+
194
+ @staticmethod
195
+ def seed_worker(worker_id):
196
+ worker_seed = torch.initial_seed() % 2 ** 32
197
+ np.random.seed(worker_seed)
198
+ random.seed(worker_seed)
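The `cosine_scheduler` above produces a multiplier on the base learning rate: a linear warm-up followed by a half-cosine decay to zero. A standalone sketch of the same lambda, with placeholder step counts:

    import math

    def lr_multiplier(step: int, training_steps: int = 10_000, warmup_steps: int = 300) -> float:
        # linear warm-up, then half-cosine decay (mirrors DonutModelPLModule.cosine_scheduler)
        if step < warmup_steps:
            return step / max(1, warmup_steps)
        progress = (step - warmup_steps) / max(1, training_steps - warmup_steps)
        return max(0.0, 0.5 * (1.0 + math.cos(math.pi * progress)))

    for step in (0, 150, 300, 5_000, 10_000):
        print(step, round(lr_multiplier(step), 4))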
setup.py ADDED
@@ -0,0 +1,77 @@
1
+ """
2
+ Donut
3
+ Copyright (c) 2022-present NAVER Corp.
4
+ MIT License
5
+ """
6
+ import os
7
+ from setuptools import find_packages, setup
8
+
9
+ ROOT = os.path.abspath(os.path.dirname(__file__))
10
+
11
+
12
+ def read_version():
13
+ data = {}
14
+ path = os.path.join(ROOT, "donut", "_version.py")
15
+ with open(path, "r", encoding="utf-8") as f:
16
+ exec(f.read(), data)
17
+ return data["__version__"]
18
+
19
+
20
+ def read_long_description():
21
+ path = os.path.join(ROOT, "README.md")
22
+ with open(path, "r", encoding="utf-8") as f:
23
+ text = f.read()
24
+ return text
25
+
26
+
27
+ setup(
28
+ name="donut-python",
29
+ version=read_version(),
30
+ description="OCR-free Document Understanding Transformer",
31
+ long_description=read_long_description(),
32
+ long_description_content_type="text/markdown",
33
+ author="Geewook Kim, Teakgyu Hong, Moonbin Yim, JeongYeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park",
34
+ author_email="[email protected]",
35
+ url="https://github.com/clovaai/donut",
36
+ license="MIT",
37
+ packages=find_packages(
38
+ exclude=[
39
+ "config",
40
+ "dataset",
41
+ "misc",
42
+ "result",
43
+ "synthdog",
44
+ "app.py",
45
+ "lightning_module.py",
46
+ "README.md",
47
+ "train.py",
48
+ "test.py",
49
+ ]
50
+ ),
51
+ python_requires=">=3.7",
52
+ install_requires=[
53
+ "transformers>=4.11.3",
54
+ "timm",
55
+ "datasets[vision]",
56
+ "pytorch-lightning>=1.6.4",
57
+ "nltk",
58
+ "sentencepiece",
59
+ "zss",
60
+ "sconf>=0.2.3",
61
+ ],
62
+ classifiers=[
63
+ "Intended Audience :: Developers",
64
+ "Intended Audience :: Information Technology",
65
+ "Intended Audience :: Science/Research",
66
+ "License :: OSI Approved :: MIT License",
67
+ "Programming Language :: Python",
68
+ "Programming Language :: Python :: 3",
69
+ "Programming Language :: Python :: 3.7",
70
+ "Programming Language :: Python :: 3.8",
71
+ "Programming Language :: Python :: 3.9",
72
+ "Programming Language :: Python :: 3.10",
73
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
74
+ "Topic :: Software Development :: Libraries",
75
+ "Topic :: Software Development :: Libraries :: Python Modules",
76
+ ],
77
+ )
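Given the packaging metadata above, a local install is simply `pip install .` from the repository root; the distribution name declared here is `donut-python`, and `python_requires` pins Python 3.7+.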
test.py ADDED
@@ -0,0 +1,98 @@
1
+ """
2
+ Donut
3
+ Copyright (c) 2022-present NAVER Corp.
4
+ MIT License
5
+ """
6
+ import argparse
7
+ import json
8
+ import os
9
+ import re
10
+ from pathlib import Path
11
+
12
+ import numpy as np
13
+ import torch
14
+ from datasets import load_dataset
15
+ from PIL import Image
16
+ from tqdm import tqdm
17
+
18
+ from donut import DonutModel, JSONParseEvaluator, load_json, save_json
19
+
20
+
21
+ def test(args):
22
+ pretrained_model = DonutModel.from_pretrained(args.pretrained_model_name_or_path)
23
+
24
+ if torch.cuda.is_available():
25
+ pretrained_model.half()
26
+ pretrained_model.to("cuda")
27
+
28
+ pretrained_model.eval()
29
+
30
+ if args.save_path:
31
+ os.makedirs(os.path.dirname(args.save_path), exist_ok=True)
32
+
33
+ predictions = []
34
+ ground_truths = []
35
+ accs = []
36
+
37
+ evaluator = JSONParseEvaluator()
38
+ dataset = load_dataset(args.dataset_name_or_path, split=args.split)
39
+
40
+ for idx, sample in tqdm(enumerate(dataset), total=len(dataset)):
41
+ ground_truth = json.loads(sample["ground_truth"])
42
+
43
+ if args.task_name == "docvqa":
44
+ output = pretrained_model.inference(
45
+ image=sample["image"],
46
+ prompt=f"<s_{args.task_name}><s_question>{ground_truth['gt_parses'][0]['question'].lower()}</s_question><s_answer>",
47
+ )["predictions"][0]
48
+ else:
49
+ output = pretrained_model.inference(image=sample["image"], prompt=f"<s_{args.task_name}>")["predictions"][0]
50
+
51
+ if args.task_name == "rvlcdip":
52
+ gt = ground_truth["gt_parse"]
53
+ score = float(output["class"] == gt["class"])
54
+ elif args.task_name == "docvqa":
55
+ # Note: for DocVQA, we evaluated the model on the official evaluation server.
56
+ # In this script, an exact-match based score will be returned instead
57
+ gt = ground_truth["gt_parses"]
58
+ answers = set([qa_parse["answer"] for qa_parse in gt])
59
+ score = float(output["answer"] in answers)
60
+ else:
61
+ gt = ground_truth["gt_parse"]
62
+ score = evaluator.cal_acc(output, gt)
63
+
64
+ accs.append(score)
65
+
66
+ predictions.append(output)
67
+ ground_truths.append(gt)
68
+
69
+ scores = {
70
+ "ted_accuracies": accs,
71
+ "ted_accuracy": np.mean(accs),
72
+ "f1_accuracy": evaluator.cal_f1(predictions, ground_truths),
73
+ }
74
+ print(
75
+ f"Total number of samples: {len(accs)}, Tree Edit Distance (TED) based accuracy score: {scores['ted_accuracy']}, F1 accuracy score: {scores['f1_accuracy']}"
76
+ )
77
+
78
+ if args.save_path:
79
+ scores["predictions"] = predictions
80
+ scores["ground_truths"] = ground_truths
81
+ save_json(args.save_path, scores)
82
+
83
+ return predictions
84
+
85
+
86
+ if __name__ == "__main__":
87
+ parser = argparse.ArgumentParser()
88
+ parser.add_argument("--pretrained_model_name_or_path", type=str)
89
+ parser.add_argument("--dataset_name_or_path", type=str)
90
+ parser.add_argument("--split", type=str, default="test")
91
+ parser.add_argument("--task_name", type=str, default=None)
92
+ parser.add_argument("--save_path", type=str, default=None)
93
+ args, left_argv = parser.parse_known_args()
94
+
95
+ if args.task_name is None:
96
+ args.task_name = os.path.basename(args.dataset_name_or_path)
97
+
98
+ predictions = test(args)
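test.py is driven by argparse, but the same entry point can be called programmatically; a sketch with placeholder model and dataset names, following the prompt convention used above:

    from argparse import Namespace
    from test import test  # the script above

    args = Namespace(
        pretrained_model_name_or_path="naver-clova-ix/donut-base-finetuned-cord-v2",  # placeholder checkpoint
        dataset_name_or_path="naver-clova-ix/cord-v2",                                # placeholder dataset
        split="test",
        task_name="cord-v2",          # -> prompt "<s_cord-v2>"
        save_path=None,
    )
    predictions = test(args)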
train.py ADDED
@@ -0,0 +1,176 @@
1
+ """
2
+ Donut
3
+ Copyright (c) 2022-present NAVER Corp.
4
+ MIT License
5
+ """
6
+ import argparse
7
+ import datetime
8
+ import json
9
+ import os
10
+ import random
11
+ from io import BytesIO
12
+ from os.path import basename
13
+ from pathlib import Path
14
+
15
+ import numpy as np
16
+ import pytorch_lightning as pl
17
+ import torch
18
+ from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint
19
+ from pytorch_lightning.loggers.tensorboard import TensorBoardLogger
20
+ from pytorch_lightning.plugins import CheckpointIO
21
+ from pytorch_lightning.utilities import rank_zero_only
22
+ from sconf import Config
23
+
24
+ from donut import DonutDataset
25
+ from lightning_module import DonutDataPLModule, DonutModelPLModule
26
+
27
+
28
+ class CustomCheckpointIO(CheckpointIO):
29
+ def save_checkpoint(self, checkpoint, path, storage_options=None):
30
+ del checkpoint["state_dict"]
31
+ torch.save(checkpoint, path)
32
+
33
+ def load_checkpoint(self, path, storage_options=None):
34
+ checkpoint = torch.load(path + "artifacts.ckpt")
35
+ state_dict = torch.load(path + "pytorch_model.bin")
36
+ checkpoint["state_dict"] = {"model." + key: value for key, value in state_dict.items()}
37
+ return checkpoint
38
+
39
+ def remove_checkpoint(self, path) -> None:
40
+ return super().remove_checkpoint(path)
41
+
42
+
43
+ @rank_zero_only
44
+ def save_config_file(config, path):
45
+ if not Path(path).exists():
46
+ os.makedirs(path)
47
+ save_path = Path(path) / "config.yaml"
48
+ print(config.dumps())
49
+ with open(save_path, "w") as f:
50
+ f.write(config.dumps(modified_color=None, quote_str=True))
51
+ print(f"Config is saved at {save_path}")
52
+
53
+
54
+ class ProgressBar(pl.callbacks.TQDMProgressBar):
55
+ def __init__(self, config):
56
+ super().__init__()
57
+ self.enable = True
58
+ self.config = config
59
+
60
+ def disable(self):
61
+ self.enable = False
62
+
63
+ def get_metrics(self, trainer, model):
64
+ items = super().get_metrics(trainer, model)
65
+ items.pop("v_num", None)
66
+ items["exp_name"] = f"{self.config.get('exp_name', '')}"
67
+ items["exp_version"] = f"{self.config.get('exp_version', '')}"
68
+ return items
69
+
70
+
71
+ def set_seed(seed):
72
+ pytorch_lightning_version = int(pl.__version__[0])
73
+ if pytorch_lightning_version < 2:
74
+ pl.utilities.seed.seed_everything(seed, workers=True)
75
+ else:
76
+ import lightning_fabric
77
+ lightning_fabric.utilities.seed.seed_everything(seed, workers=True)
78
+
79
+
80
+ def train(config):
81
+ set_seed(config.get("seed", 42))
82
+
83
+ model_module = DonutModelPLModule(config)
84
+ data_module = DonutDataPLModule(config)
85
+
86
+ # add datasets to data_module
87
+ datasets = {"train": [], "validation": []}
88
+ for i, dataset_name_or_path in enumerate(config.dataset_name_or_paths):
89
+ task_name = os.path.basename(dataset_name_or_path) # e.g., cord-v2, docvqa, rvlcdip, ...
90
+
91
+ # add categorical special tokens (optional)
92
+ if task_name == "rvlcdip":
93
+ model_module.model.decoder.add_special_tokens([
94
+ "<advertisement/>", "<budget/>", "<email/>", "<file_folder/>",
95
+ "<form/>", "<handwritten/>", "<invoice/>", "<letter/>",
96
+ "<memo/>", "<news_article/>", "<presentation/>", "<questionnaire/>",
97
+ "<resume/>", "<scientific_publication/>", "<scientific_report/>", "<specification/>"
98
+ ])
99
+ if task_name == "docvqa":
100
+ model_module.model.decoder.add_special_tokens(["<yes/>", "<no/>"])
101
+
102
+ for split in ["train", "validation"]:
103
+ datasets[split].append(
104
+ DonutDataset(
105
+ dataset_name_or_path=dataset_name_or_path,
106
+ donut_model=model_module.model,
107
+ max_length=config.max_length,
108
+ split=split,
109
+ task_start_token=config.task_start_tokens[i]
110
+ if config.get("task_start_tokens", None)
111
+ else f"<s_{task_name}>",
112
+ prompt_end_token="<s_answer>" if "docvqa" in dataset_name_or_path else f"<s_{task_name}>",
113
+ sort_json_key=config.sort_json_key,
114
+ )
115
+ )
116
+ # prompt_end_token is used for ignoring a given prompt in a loss function
117
+ # for docvqa task, i.e., {"question": {used as a prompt}, "answer": {prediction target}},
118
+ # set prompt_end_token to "<s_answer>"
119
+ data_module.train_datasets = datasets["train"]
120
+ data_module.val_datasets = datasets["validation"]
121
+
122
+ logger = TensorBoardLogger(
123
+ save_dir=config.result_path,
124
+ name=config.exp_name,
125
+ version=config.exp_version,
126
+ default_hp_metric=False,
127
+ )
128
+
129
+ lr_callback = LearningRateMonitor(logging_interval="step")
130
+
131
+ checkpoint_callback = ModelCheckpoint(
132
+ monitor="val_metric",
133
+ dirpath=Path(config.result_path) / config.exp_name / config.exp_version,
134
+ filename="artifacts",
135
+ save_top_k=1,
136
+ save_last=False,
137
+ mode="min",
138
+ )
139
+
140
+ bar = ProgressBar(config)
141
+
142
+ custom_ckpt = CustomCheckpointIO()
143
+ trainer = pl.Trainer(
144
+ num_nodes=config.get("num_nodes", 1),
145
+ devices=torch.cuda.device_count(),
146
+ strategy="ddp",
147
+ accelerator="gpu",
148
+ plugins=custom_ckpt,
149
+ max_epochs=config.max_epochs,
150
+ max_steps=config.max_steps,
151
+ val_check_interval=config.val_check_interval,
152
+ check_val_every_n_epoch=config.check_val_every_n_epoch,
153
+ gradient_clip_val=config.gradient_clip_val,
154
+ precision=16,
155
+ num_sanity_val_steps=0,
156
+ logger=logger,
157
+ callbacks=[lr_callback, checkpoint_callback, bar],
158
+ )
159
+
160
+ trainer.fit(model_module, data_module, ckpt_path=config.get("resume_from_checkpoint_path", None))
161
+
162
+
163
+ if __name__ == "__main__":
164
+ parser = argparse.ArgumentParser()
165
+ parser.add_argument("--config", type=str, required=True)
166
+ parser.add_argument("--exp_version", type=str, required=False)
167
+ args, left_argv = parser.parse_known_args()
168
+
169
+ config = Config(args.config)
170
+ config.argv_update(left_argv)
171
+
172
+ config.exp_name = basename(args.config).split(".")[0]
173
+ config.exp_version = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") if not args.exp_version else args.exp_version
174
+
175
+ save_config_file(config, Path(config.result_path) / config.exp_name / config.exp_version)
176
+ train(config)
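train.py is launched as `python train.py --config <path-to-yaml> [--exp_version <tag>]`, with any remaining CLI arguments passed to `config.argv_update` as overrides. For reference, these are the config keys the training code above reads; the real file is YAML parsed by sconf, and the values below are placeholders only:

    example_config = {
        "resume_from_checkpoint_path": None,
        "result_path": "./result",
        "pretrained_model_name_or_path": "naver-clova-ix/donut-base",   # optional; omit to train from scratch
        "dataset_name_or_paths": ["naver-clova-ix/cord-v2"],
        "sort_json_key": True,
        "train_batch_sizes": [2],
        "val_batch_sizes": [1],
        "input_size": [1280, 960],
        "max_length": 768,
        "align_long_axis": False,
        "num_nodes": 1,
        "seed": 2022,
        "lr": 3e-5,
        "warmup_steps": 300,
        "num_training_samples_per_epoch": 800,
        "max_epochs": 30,
        "max_steps": -1,
        "num_workers": 8,
        "val_check_interval": 1.0,
        "check_val_every_n_epoch": 1,
        "gradient_clip_val": 1.0,
        "verbose": True,
    }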