pierreguillou committed on
Commit
6dfa440
·
0 Parent(s):

Duplicate from pierreguillou/Inference-APP-Document-Understanding-at-linelevel-v2

.gitattributes ADDED
@@ -0,0 +1,34 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,15 @@
+ ---
+ title: Inference APP for Document Understanding at line level (v2)
+ emoji: 🐢
+ colorFrom: blue
+ colorTo: yellow
+ sdk: gradio
+ sdk_version: 3.18.0
+ app_file: app.py
+ pinned: false
+ models:
+ - pierreguillou/layout-xlm-base-finetuned-with-DocLayNet-base-at-linelevel-ml384
+ duplicated_from: pierreguillou/Inference-APP-Document-Understanding-at-linelevel-v2
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,192 @@
+ import os
+
+ # workaround: install an older version of PyTorch since detectron2 has not released packages for PyTorch 1.9 (issue: https://github.com/facebookresearch/detectron2/issues/3158)
+ # os.system('pip install torch==1.8.0+cu101 torchvision==0.9.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html')
+ os.system('pip install -q torch==1.10.0+cu111 torchvision==0.11+cu111 -f https://download.pytorch.org/whl/torch_stable.html')
+
+ # install detectron2 from source so that it is built against the PyTorch version installed above
+ # see https://detectron2.readthedocs.io/tutorials/install.html for instructions
+ # os.system('pip install -q detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu101/torch1.8/index.html')
+ os.system('pip install git+https://github.com/facebookresearch/detectron2.git')
+
+ import detectron2
+ from detectron2.utils.logger import setup_logger
+ setup_logger()
+
+ import gradio as gr
+ import re
+ import string
+
+ from operator import itemgetter
+ import collections
+
+ import pypdf
+ from pypdf import PdfReader
+ from pypdf.errors import PdfReadError
+
+ import pdf2image
+ from pdf2image import convert_from_path
+ import langdetect
+ from langdetect import detect_langs
+
+ import pandas as pd
+ import numpy as np
+ import random
+ import tempfile
+ import itertools
+
+ from matplotlib import font_manager
+ from PIL import Image, ImageDraw, ImageFont
+ import cv2
+
+ ## files
+
+ import sys
+ sys.path.insert(0, 'files/')
+
+ import functions
+ from functions import *
+
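+ # functions.py supplies the shared configuration (model_id, tokenizer_id, max_length, doc_stride,
+ # max_imgboxes, blank/example file paths) as well as all the processing helpers used below
+ # (pdf_to_images, extraction_data_from_image, prepare_inference_features, CustomDataset,
+ # predictions_token_level, predictions_line_level, get_labeled_images)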
+ # update pip
+ os.system('python -m pip install --upgrade pip')
+
+ ## model / feature extractor / tokenizer
+
+ from transformers import LayoutLMv2ForTokenClassification # LayoutXLMTokenizerFast,
+
+ import torch
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ # model
+ # tokenizer = LayoutXLMTokenizerFast.from_pretrained(model_id)
+ model = LayoutLMv2ForTokenClassification.from_pretrained(model_id)
+ model.to(device)
+
+ # feature extractor
+ from transformers import LayoutLMv2FeatureExtractor
+ feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)
+
+ # tokenizer
+ from transformers import AutoTokenizer
+ tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
+
+ # APP outputs
+ def app_outputs(uploaded_pdf):
+     filename, msg, images = pdf_to_images(uploaded_pdf)
+     num_images = len(images)
+
+     if not msg.startswith("Error with the PDF"):
+
+         # extraction of image data (text and bounding boxes)
+         dataset, lines, row_indexes, par_boxes, line_boxes = extraction_data_from_image(images)
+         # prepare our data in the format of the model
+         encoded_dataset = dataset.map(prepare_inference_features, batched=True, batch_size=64, remove_columns=dataset.column_names)
+         custom_encoded_dataset = CustomDataset(encoded_dataset, tokenizer)
+         # get predictions (token level)
+         outputs, images_ids_list, chunk_ids, input_ids, bboxes = predictions_token_level(images, custom_encoded_dataset)
+         # get predictions (line level)
+         probs_bbox, bboxes_list_dict, input_ids_dict_dict, probs_dict_dict, df = predictions_line_level(dataset, outputs, images_ids_list, chunk_ids, input_ids, bboxes)
+         # get labeled images with line bounding boxes
+         images = get_labeled_images(dataset, images_ids_list, bboxes_list_dict, probs_dict_dict)
+
+         img_files = list()
+         # save the (labeled) page images and collect their file paths
+         for i in range(num_images):
+             if filename != "files/blank.png": img_file = f"img_{i}_" + filename.replace(".pdf", ".png")
+             else: img_file = filename.replace(".pdf", ".png")
+             images[i].save(img_file)
+             img_files.append(img_file)
+
+         if num_images < max_imgboxes:
+             img_files += [image_blank]*(max_imgboxes - num_images)
+             images += [Image.open(image_blank)]*(max_imgboxes - num_images)
+             for count in range(max_imgboxes - num_images):
+                 df[num_images + count] = pd.DataFrame()
+         else:
+             img_files = img_files[:max_imgboxes]
+             images = images[:max_imgboxes]
+             df = dict(itertools.islice(df.items(), max_imgboxes))
+
+         # save the dataframes as CSV files
+         csv_files = list()
+         for i in range(max_imgboxes):
+             csv_file = f"csv_{i}_" + filename.replace(".pdf", ".csv")
+             csv_files.append(gr.File.update(value=csv_file, visible=True))
+             df[i].to_csv(csv_file, encoding="utf-8", index=False)
+
+     else:
+         img_files, images, csv_files = [""]*max_imgboxes, [""]*max_imgboxes, [""]*max_imgboxes
+         img_files[0], img_files[1] = image_blank, image_blank
+         images[0], images[1] = Image.open(image_blank), Image.open(image_blank)
+         csv_file = "csv_wo_content.csv"
+         csv_files[0], csv_files[1] = gr.File.update(value=csv_file, visible=True), gr.File.update(value=csv_file, visible=True)
+         df, df_empty = dict(), pd.DataFrame()
+         df_empty.to_csv(csv_file, encoding="utf-8", index=False)
+         df[0], df[1] = df_empty, df_empty
+
+     return msg, img_files[0], img_files[1], images[0], images[1], csv_files[0], csv_files[1], df[0], df[1]
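+
+ # the tuple returned by app_outputs must match the order of the "outputboxes" list defined below:
+ # output message, two image file paths, two images, two CSV files, two dataframes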
+
+ # gradio APP
+ with gr.Blocks(title="Inference APP for Document Understanding at line level (v2 - LayoutXLM base)", css=".gradio-container") as demo:
+     gr.HTML("""
+     <div style="font-family:'Times New Roman', 'Serif'; font-size:26pt; font-weight:bold; text-align:center;"><h1>Inference APP for Document Understanding at line level (v2 - LayoutXLM base)</h1></div>
+     <div style="margin-top: 40px"><p>(03/05/2023) This Inference APP uses the <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/pierreguillou/layout-xlm-base-finetuned-with-DocLayNet-base-at-linelevel-ml384" target="_blank">model LayoutXLM base combined with XLM-RoBERTa base and finetuned on the dataset DocLayNet base at line level</a> (chunk size of 384 tokens).</p></div>
+     <div><p><a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://arxiv.org/abs/2104.08836" target="_blank">LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding</a> is a Document Understanding model that uses both layout and text in order to detect labels of bounding boxes. Combined with the model <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/xlm-roberta-base" target="_blank">XLM-RoBERTa base</a>, this finetuned model has the capacity to <b>understand any language</b>. Finetuned on the dataset <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/datasets/pierreguillou/DocLayNet-base" target="_blank">DocLayNet base</a>, it can <b>classify any bounding box (and its OCR text) into 11 labels</b> (Caption, Footnote, Formula, List-item, Page-footer, Page-header, Picture, Section-header, Table, Text, Title).</p></div>
+     <div><p>It relies on an external OCR engine to get words and bounding boxes from the document image. Thus, this APP first runs an OCR engine (<a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://github.com/madmaze/pytesseract#python-tesseract" target="_blank">PyTesseract</a>) to get the bounding boxes, then runs LayoutXLM base (already fine-tuned on the dataset DocLayNet base at line level) on the individual tokens, and finally visualizes the result at line level!</p></div>
+     <div><p><b>It allows you to get all pages of any PDF (in any language) with bounding boxes labeled at line level, together with the associated dataframes of labeled data (bounding boxes, texts, labels) :-)</b></p></div>
+     <div><p>However, the inference time per page can be high when running the model on CPU due to the number of line predictions to be made. Therefore, to avoid running this APP for too long, <b>only the first 2 pages are processed by this APP</b>. If you want to increase this limit, you can either clone this APP in Hugging Face Space (or run its <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://github.com/piegu/language-models/blob/master/Gradio_inference_on_LayoutXLM_base_model_finetuned_on_DocLayNet_base_in_any_language_at_levellines_ml384.ipynb" target="_blank">notebook</a> on your own platform) and change the value of the parameter <code>max_imgboxes</code>, or run the inference notebook "<a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://github.com/piegu/language-models/blob/master/inference_on_LayoutXLM_base_model_finetuned_on_DocLayNet_base_in_any_language_at_levellines_ml384.ipynb" target="_blank">Document AI | Inference at line level with a Document Understanding model (LayoutXLM base fine-tuned on DocLayNet dataset)</a>" on your own platform, as it does not have this limit.</p></div>
+     <div style="margin-top: 20px"><p>More information about the DocLayNet datasets, the finetuning of the model and this APP in the following blog posts:</p>
+     <ul><li>(03/05/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="" target="_blank">Document AI | Inference APP and fine-tuning notebook for Document Understanding at line level with LayoutXLM base</a></li><li>(02/14/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-inference-app-for-document-understanding-at-line-level-a35bbfa98893" target="_blank">Document AI | Inference APP for Document Understanding at line level</a></li><li>(02/10/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-document-understanding-model-at-line-level-with-lilt-tesseract-and-doclaynet-dataset-347107a643b8" target="_blank">Document AI | Document Understanding model at line level with LiLT, Tesseract and DocLayNet dataset</a></li><li>(01/31/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-doclaynet-image-viewer-app-3ac54c19956" target="_blank">Document AI | DocLayNet image viewer APP</a></li><li>(01/27/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-processing-of-doclaynet-dataset-to-be-used-by-layout-models-of-the-hugging-face-hub-308d8bd81cdb" target="_blank">Document AI | Processing of DocLayNet dataset to be used by layout models of the Hugging Face hub (finetuning, inference)</a></li></ul></div>
+     """)
+     with gr.Row():
+         pdf_file = gr.File(label="PDF")
+     with gr.Row():
+         submit_btn = gr.Button(f"Display first {max_imgboxes} labeled PDF pages")
+         reset_btn = gr.Button(value="Clear")
+     with gr.Row():
+         output_msg = gr.Textbox(label="Output message")
+     with gr.Row():
+         fileboxes = []
+         for num_page in range(max_imgboxes):
+             file_path = gr.File(visible=True, label=f"Image file of the PDF page n°{num_page}")
+             fileboxes.append(file_path)
+     with gr.Row():
+         imgboxes = []
+         for num_page in range(max_imgboxes):
+             img = gr.Image(type="pil", label=f"Image of the PDF page n°{num_page}")
+             imgboxes.append(img)
+     with gr.Row():
+         csvboxes = []
+         for num_page in range(max_imgboxes):
+             csv = gr.File(visible=True, label=f"CSV file at line level (page {num_page})")
+             csvboxes.append(csv)
+     with gr.Row():
+         dfboxes = []
+         for num_page in range(max_imgboxes):
+             df = gr.Dataframe(
+                 headers=["bounding boxes", "texts", "labels"],
+                 datatype=["str", "str", "str"],
+                 col_count=(3, "fixed"),
+                 visible=True,
+                 label=f"Data of page {num_page}",
+                 type="pandas",
+                 wrap=True
+             )
+             dfboxes.append(df)
+
+     outputboxes = [output_msg] + fileboxes + imgboxes + csvboxes + dfboxes
+     submit_btn.click(app_outputs, inputs=[pdf_file], outputs=outputboxes)
+     reset_btn.click(
+         lambda: [pdf_file.update(value=None), output_msg.update(value=None)] + [filebox.update(value=None) for filebox in fileboxes] + [imgbox.update(value=None) for imgbox in imgboxes] + [csvbox.update(value=None) for csvbox in csvboxes] + [dfbox.update(value=None) for dfbox in dfboxes],
+         inputs=[],
+         outputs=[pdf_file, output_msg] + fileboxes + imgboxes + csvboxes + dfboxes,
+     )
+
+     gr.Examples(
+         [["files/example.pdf"]],
+         [pdf_file],
+         outputboxes,
+         fn=app_outputs,
+         cache_examples=True,
+     )
+
+ demo.launch()
files/README.md ADDED
File without changes
files/blank.pdf ADDED
Binary file (1.15 kB). View file
 
files/blank.png ADDED
files/example.pdf ADDED
Binary file (343 kB). View file
 
files/functions.py ADDED
@@ -0,0 +1,863 @@
+ import os
+
+ # workaround: install an older version of PyTorch since detectron2 has not released packages for PyTorch 1.9 (issue: https://github.com/facebookresearch/detectron2/issues/3158)
+ # os.system('pip install torch==1.8.0+cu101 torchvision==0.9.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html')
+ os.system('pip install -q torch==1.10.0+cu111 torchvision==0.11+cu111 -f https://download.pytorch.org/whl/torch_stable.html')
+
+ # install detectron2 from source so that it is built against the PyTorch version installed above
+ # see https://detectron2.readthedocs.io/tutorials/install.html for instructions
+ # os.system('pip install -q detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu101/torch1.8/index.html')
+ os.system('pip install git+https://github.com/facebookresearch/detectron2.git')
+
+ import detectron2
+ from detectron2.utils.logger import setup_logger
+ setup_logger()
+
+ import gradio as gr
+ import re
+ import string
+ import torch
+
+ from operator import itemgetter
+ import collections
+
+ import pypdf
+ from pypdf import PdfReader
+ from pypdf.errors import PdfReadError
+
+ import pdf2image
+ from pdf2image import convert_from_path
+ import langdetect
+ from langdetect import detect_langs
+
+ import pandas as pd
+ import numpy as np
+ import random
+ import tempfile
+ import itertools
+
+ from matplotlib import font_manager
+ from PIL import Image, ImageDraw, ImageFont
+ import cv2
+
+ import pathlib
+ from pathlib import Path
+ import shutil
+
+ # Tesseract environment checks (printed to the Space logs)
+ print(os.popen('cat /etc/debian_version').read())
+ print(os.popen('cat /etc/issue').read())
+ print(os.popen('apt search tesseract').read())
+ import pytesseract
+
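+ # pytesseract is only a wrapper: the tesseract binary itself (and its language packs)
+ # comes from the "tesseract-ocr-all" Debian package declared in packages.txt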
+ ## Key parameters
+
+ # model
+ model_id = "pierreguillou/layout-xlm-base-finetuned-with-DocLayNet-base-at-linelevel-ml384"
+
+ # tokenizer
+ tokenizer_id = "xlm-roberta-base"
+
+ # categories colors
+ label2color = {
+     'Caption': 'brown',
+     'Footnote': 'orange',
+     'Formula': 'gray',
+     'List-item': 'yellow',
+     'Page-footer': 'red',
+     'Page-header': 'red',
+     'Picture': 'violet',
+     'Section-header': 'orange',
+     'Table': 'green',
+     'Text': 'blue',
+     'Title': 'pink'
+ }
+
+ # bounding boxes of the start and end tokens of a sequence
+ cls_box = [0, 0, 0, 0]
+ sep_box = [1000, 1000, 1000, 1000]
+
+ # (tokenization) the maximum length of a feature (sequence)
+ if str(384) in model_id:
+     max_length = 384
+ elif str(512) in model_id:
+     max_length = 512
+ else:
+     print("Error with max_length of chunks!")
+
+ # (tokenization) the authorized overlap between two parts of the context when splitting is needed
+ doc_stride = 128
+
+ # max number of PDF page images that will be displayed
+ max_imgboxes = 2
+
+ ## model / feature extractor / tokenizer
+
+ from transformers import LayoutLMv2ForTokenClassification # LayoutXLMTokenizerFast,
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ # model
+ # tokenizer = LayoutXLMTokenizerFast.from_pretrained(model_id)
+ model = LayoutLMv2ForTokenClassification.from_pretrained(model_id)
+ model.to(device)
+
+ # id2label mapping (the 11 DocLayNet labels) read from the fine-tuned model's configuration
+ id2label = model.config.id2label
+
+ # feature extractor
+ from transformers import LayoutLMv2FeatureExtractor
+ feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)
+
+ # tokenizer
+ from transformers import AutoTokenizer
+ tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
+
+ # get files
+ examples_dir = 'files/'
+ Path(examples_dir).mkdir(parents=True, exist_ok=True)
+ from huggingface_hub import hf_hub_download
+ files = ["example.pdf", "blank.pdf", "blank.png", "languages_iso.csv", "languages_tesseract.csv", "wo_content.png"]
+ for file_name in files:
+     path_to_file = hf_hub_download(
+         repo_id = "pierreguillou/Inference-APP-Document-Understanding-at-linelevel-v2",
+         filename = "files/" + file_name,
+         repo_type = "space"
+     )
+     shutil.copy(path_to_file, examples_dir)
+
+ # path to files
+ image_wo_content = examples_dir + "wo_content.png" # image without content
+ pdf_blank = examples_dir + "blank.pdf" # blank PDF
+ image_blank = examples_dir + "blank.png" # blank image
+
+ ## get langdetect2Tesseract dictionary
+ t = "files/languages_tesseract.csv"
+ l = "files/languages_iso.csv"
+
+ df_t = pd.read_csv(t)
+ df_l = pd.read_csv(l)
+
+ langs_t = df_t["Language"].to_list()
+ langs_t = [lang_t.lower().strip().translate(str.maketrans('', '', string.punctuation)) for lang_t in langs_t]
+ langs_l = df_l["Language"].to_list()
+ langs_l = [lang_l.lower().strip().translate(str.maketrans('', '', string.punctuation)) for lang_l in langs_l]
+ langscode_t = df_t["LangCode"].to_list()
+ langscode_l = df_l["LangCode"].to_list()
+
+ Tesseract2langdetect, langdetect2Tesseract = dict(), dict()
+ for lang_t, langcode_t in zip(langs_t, langscode_t):
+     try:
+         if lang_t == "Chinese - Simplified".lower().strip().translate(str.maketrans('', '', string.punctuation)): lang_t = "chinese"
+         index = langs_l.index(lang_t)
+         langcode_l = langscode_l[index]
+         Tesseract2langdetect[langcode_t] = langcode_l
+     except:
+         continue
+
+ langdetect2Tesseract = {v:k for k,v in Tesseract2langdetect.items()}
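+ # the two CSV files pair language names with their codes, which yields a mapping between
+ # langdetect ISO 639-1 codes and Tesseract language codes (e.g. langdetect "fr" <-> Tesseract "fra")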
+
+ ## General
+
+ # get text and bounding boxes from an image
+ # https://stackoverflow.com/questions/61347755/how-can-i-get-line-coordinates-that-readed-by-tesseract
+ # https://medium.com/geekculture/tesseract-ocr-understanding-the-contents-of-documents-beyond-their-text-a98704b7c655
+ def get_data(results, factor, conf_min=0):
+
+     data = {}
+     for i in range(len(results['line_num'])):
+         level = results['level'][i]
+         block_num = results['block_num'][i]
+         par_num = results['par_num'][i]
+         line_num = results['line_num'][i]
+         top, left = results['top'][i], results['left'][i]
+         width, height = results['width'][i], results['height'][i]
+         conf = results['conf'][i]
+         text = results['text'][i]
+         if not (text == '' or text.isspace()):
+             if conf >= conf_min:
+                 tup = (text, left, top, width, height)
+                 if block_num in list(data.keys()):
+                     if par_num in list(data[block_num].keys()):
+                         if line_num in list(data[block_num][par_num].keys()):
+                             data[block_num][par_num][line_num].append(tup)
+                         else:
+                             data[block_num][par_num][line_num] = [tup]
+                     else:
+                         data[block_num][par_num] = {}
+                         data[block_num][par_num][line_num] = [tup]
+                 else:
+                     data[block_num] = {}
+                     data[block_num][par_num] = {}
+                     data[block_num][par_num][line_num] = [tup]
+
+     # get paragraphs dictionary with list of lines
+     par_data = {}
+     par_idx = 1
+     for _, b in data.items():
+         for _, p in b.items():
+             line_data = {}
+             line_idx = 1
+             for _, l in p.items():
+                 line_data[line_idx] = l
+                 line_idx += 1
+             par_data[par_idx] = line_data
+             par_idx += 1
+
+     # get lines of texts, grouped by paragraph
+     lines = list()
+     row_indexes = list()
+     row_index = 0
+     for _, par in par_data.items():
+         count_lines = 0
+         for _, line in par.items():
+             if count_lines == 0: row_indexes.append(row_index)
+             line_text = ' '.join([item[0] for item in line])
+             lines.append(line_text)
+             count_lines += 1
+             row_index += 1
+         # lines.append("\n")
+         row_index += 1
+     # lines = lines[:-1]
+
+     # get paragraph boxes (par_boxes)
+     # get line boxes (line_boxes)
+     par_boxes = list()
+     par_idx = 1
+     line_boxes = list()
+     line_idx = 1
+     for _, par in par_data.items():
+         xmins, ymins, xmaxs, ymaxs = list(), list(), list(), list()
+         for _, line in par.items():
+             xmin, ymin = line[0][1], line[0][2]
+             xmax, ymax = (line[-1][1] + line[-1][3]), (line[-1][2] + line[-1][4])
+             line_boxes.append([int(xmin/factor), int(ymin/factor), int(xmax/factor), int(ymax/factor)])
+             xmins.append(xmin)
+             ymins.append(ymin)
+             xmaxs.append(xmax)
+             ymaxs.append(ymax)
+             line_idx += 1
+         xmin, ymin, xmax, ymax = min(xmins), min(ymins), max(xmaxs), max(ymaxs)
+         par_boxes.append([int(xmin/factor), int(ymin/factor), int(xmax/factor), int(ymax/factor)])
+         par_idx += 1
+
+     return lines, row_indexes, par_boxes, line_boxes #data, par_data #
+
+ # rescale image and save it with a 300 dpi tag
+ def set_image_dpi_resize(image):
+     """
+     Resize an image (max width 1024 px) and save it with 300 dpi metadata.
+     :param image: a PIL image
+     :return: the resize factor and the path to the rescaled image
+     """
+     length_x, width_y = image.size
+     factor = min(1, float(1024.0 / length_x))
+     size = int(factor * length_x), int(factor * width_y)
+     # image_resize = image.resize(size, Image.Resampling.LANCZOS)
+     image_resize = image.resize(size, Image.LANCZOS)
+     temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='1.png')
+     temp_filename = temp_file.name
+     image_resize.save(temp_filename, dpi=(300, 300))
+     return factor, temp_filename
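+ # the "factor" returned above is reused by get_data to map the OCR coordinates
+ # (measured on the rescaled image) back to the original image size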
+
+ # it is important that each bounding box is in (upper left, lower right) format
+ # source: https://github.com/NielsRogge/Transformers-Tutorials/issues/129
+ def upperleft_to_lowerright(bbox):
+     x0, y0, x1, y1 = tuple(bbox)
+     if bbox[2] < bbox[0]:
+         x0 = bbox[2]
+         x1 = bbox[0]
+     if bbox[3] < bbox[1]:
+         y0 = bbox[3]
+         y1 = bbox[1]
+     return [x0, y0, x1, y1]
+
+ # convert bounding boxes from (left, top, width, height) format to (left, top, left+width, top+height) format
+ def convert_box(bbox):
+     x, y, w, h = tuple(bbox) # the row comes in (left, top, width, height) format
+     return [x, y, x+w, y+h] # we turn it into (left, top, left+width, top+height) to get the actual box
+
+ # the model expects bounding boxes normalized to a 0-1000 scale
+ def normalize_box(bbox, width, height):
+     return [
+         int(1000 * (bbox[0] / width)),
+         int(1000 * (bbox[1] / height)),
+         int(1000 * (bbox[2] / width)),
+         int(1000 * (bbox[3] / height)),
+     ]
+
+ # map a 0-1000 normalized bounding box back to pixel coordinates
+ def denormalize_box(bbox, width, height):
+     return [
+         int(width * (bbox[0] / 1000)),
+         int(height * (bbox[1] / 1000)),
+         int(width * (bbox[2] / 1000)),
+         int(height * (bbox[3] / 1000)),
+     ]
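+ # e.g. on a 1000x2000 px page, normalize_box([50, 100, 200, 300], 1000, 2000) -> [50, 50, 200, 150],
+ # and denormalize_box maps it back (up to integer rounding)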
+
+ # get back the original size
+ def original_box(box, original_width, original_height, coco_width, coco_height):
+     return [
+         int(original_width * (box[0] / coco_width)),
+         int(original_height * (box[1] / coco_height)),
+         int(original_width * (box[2] / coco_width)),
+         int(original_height * (box[3] / coco_height)),
+     ]
+
+ def get_blocks(bboxes_block, categories, texts):
+
+     # get list of unique block boxes
+     bbox_block_dict, bboxes_block_list, bbox_block_prec = dict(), list(), list()
+     for count_block, bbox_block in enumerate(bboxes_block):
+         if bbox_block != bbox_block_prec:
+             bbox_block_indexes = [i for i, bbox in enumerate(bboxes_block) if bbox == bbox_block]
+             bbox_block_dict[count_block] = bbox_block_indexes
+             bboxes_block_list.append(bbox_block)
+         bbox_block_prec = bbox_block
+
+     # get list of categories and texts by unique block boxes
+     category_block_list, text_block_list = list(), list()
+     for bbox_block in bboxes_block_list:
+         count_block = bboxes_block.index(bbox_block)
+         bbox_block_indexes = bbox_block_dict[count_block]
+         category_block = np.array(categories, dtype=object)[bbox_block_indexes].tolist()[0]
+         category_block_list.append(category_block)
+         text_block = np.array(texts, dtype=object)[bbox_block_indexes].tolist()
+         text_block = [text.replace("\n", "").strip() for text in text_block]
+         if id2label[category_block] == "Text" or id2label[category_block] == "Caption" or id2label[category_block] == "Footnote":
+             text_block = ' '.join(text_block)
+         else:
+             text_block = '\n'.join(text_block)
+         text_block_list.append(text_block)
+
+     return bboxes_block_list, category_block_list, text_block_list
+
+ # function to sort bounding boxes
+ def get_sorted_boxes(bboxes):
+
+     # sort by y from page top to bottom
+     sorted_bboxes = sorted(bboxes, key=itemgetter(1), reverse=False)
+     y_list = [bbox[1] for bbox in sorted_bboxes]
+
+     # sort by x from page left to right when boxes share the same y
+     if len(list(set(y_list))) != len(y_list):
+         y_list_duplicates_indexes = dict()
+         y_list_duplicates = [item for item, count in collections.Counter(y_list).items() if count > 1]
+         for item in y_list_duplicates:
+             y_list_duplicates_indexes[item] = [i for i, e in enumerate(y_list) if e == item]
+             bbox_list_y_duplicates = sorted(np.array(sorted_bboxes, dtype=object)[y_list_duplicates_indexes[item]].tolist(), key=itemgetter(0), reverse=False)
+             np_array_bboxes = np.array(sorted_bboxes)
+             np_array_bboxes[y_list_duplicates_indexes[item]] = np.array(bbox_list_y_duplicates)
+             sorted_bboxes = np_array_bboxes.tolist()
+
+     return sorted_bboxes
+
+ # sort data from y = 0 to the end of the page (and, when necessary, from x = 0 to the end of the page)
+ def sort_data(bboxes, categories, texts):
+
+     sorted_bboxes = get_sorted_boxes(bboxes)
+     sorted_bboxes_indexes = [bboxes.index(bbox) for bbox in sorted_bboxes]
+     sorted_categories = np.array(categories, dtype=object)[sorted_bboxes_indexes].tolist()
+     sorted_texts = np.array(texts, dtype=object)[sorted_bboxes_indexes].tolist()
+
+     return sorted_bboxes, sorted_categories, sorted_texts
+
+ # sort data from y = 0 to the end of the page (and, when necessary, from x = 0 to the end of the page)
+ def sort_data_wo_labels(bboxes, texts):
+
+     sorted_bboxes = get_sorted_boxes(bboxes)
+     sorted_bboxes_indexes = [bboxes.index(bbox) for bbox in sorted_bboxes]
+     sorted_texts = np.array(texts, dtype=object)[sorted_bboxes_indexes].tolist()
+
+     return sorted_bboxes, sorted_texts
+
+ ## PDF processing
+
+ # get filename and images of PDF pages
+ def pdf_to_images(uploaded_pdf):
+
+     # check for a missing upload
+     if uploaded_pdf is None:
+         path_to_file = pdf_blank
+         filename = path_to_file.replace(examples_dir, "")
+         msg = "Invalid PDF file."
+         images = [Image.open(image_blank)]
+     else:
+         # path to the uploaded PDF
+         path_to_file = uploaded_pdf.name
+         filename = path_to_file.replace("/tmp/", "")
+
+         try:
+             PdfReader(path_to_file)
+         except PdfReadError:
+             path_to_file = pdf_blank
+             filename = path_to_file.replace(examples_dir, "")
+             msg = "Invalid PDF file."
+             images = [Image.open(image_blank)]
+         else:
+             try:
+                 images = convert_from_path(path_to_file, last_page=max_imgboxes)
+                 num_imgs = len(images)
+                 msg = f'The PDF "{filename}" was converted into {num_imgs} images.'
+             except:
+                 msg = f'Error with the PDF "{filename}": it was not converted into images.'
+                 images = [Image.open(image_wo_content)]
+
+     return filename, msg, images
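+ # note: pdf2image's convert_from_path relies on the poppler utilities (pdftoppm),
+ # which are installed through the "poppler-utils" package declared in packages.txt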
+
+ # extraction of image data (text and bounding boxes)
+ def extraction_data_from_image(images):
+
+     num_imgs = len(images)
+
+     if num_imgs > 0:
+
+         # https://pyimagesearch.com/2021/11/15/tesseract-page-segmentation-modes-psms-explained-how-to-improve-your-ocr-accuracy/
+         custom_config = r'--oem 3 --psm 3 -l eng' # default config PyTesseract: --oem 3 --psm 3 -l eng+deu+fra+jpn+por+spa+rus+hin+chi_sim
+         results, lines, row_indexes, par_boxes, line_boxes, images_pixels = dict(), dict(), dict(), dict(), dict(), dict()
+         images_ids_list, lines_list, par_boxes_list, line_boxes_list, images_list, images_pixels_list, page_no_list, num_pages_list = list(), list(), list(), list(), list(), list(), list(), list()
+
+         try:
+             for i, image in enumerate(images):
+                 # image preprocessing
+                 # https://docs.opencv.org/3.0-beta/doc/py_tutorials/py_imgproc/py_thresholding/py_thresholding.html
+                 img = image.copy()
+                 factor, path_to_img = set_image_dpi_resize(img) # rescaling and 300 dpi tag
+                 img = Image.open(path_to_img)
+                 img = np.array(img, dtype='uint8') # convert PIL to cv2
+                 img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) # grayscale image
+                 ret, img = cv2.threshold(img, 127, 255, cv2.THRESH_BINARY)
+
+                 # first OCR pass (PyTesseract) | detect the languages of the page from its raw text
+                 txt = pytesseract.image_to_string(img, config=custom_config)
+                 txt = txt.strip().lower()
+                 txt = re.sub(r" +", " ", txt) # multiple spaces
+                 txt = re.sub(r"(\n\s*)+\n+", "\n", txt) # multiple line breaks
+                 # txt = os.popen(f'tesseract {img_filepath} - {custom_config}').read()
+                 try:
+                     langs = detect_langs(txt)
+                     langs = [langdetect2Tesseract[langs[i].lang] for i in range(len(langs))]
+                     langs_string = '+'.join(langs)
+                 except:
+                     langs_string = "eng"
+                 langs_string += '+osd'
+                 custom_config = f'--oem 3 --psm 3 -l {langs_string}' # default config PyTesseract: --oem 3 --psm 3
+
+                 # second OCR pass (PyTesseract) | get the data with the detected languages
+                 results[i] = pytesseract.image_to_data(img, config=custom_config, output_type=pytesseract.Output.DICT)
+                 # results[i] = os.popen(f'tesseract {img_filepath} - {custom_config}').read()
+
+                 # get image pixels
+                 images_pixels[i] = feature_extractor(images[i], return_tensors="pt").pixel_values
+
+                 lines[i], row_indexes[i], par_boxes[i], line_boxes[i] = get_data(results[i], factor, conf_min=0)
+                 lines_list.append(lines[i])
+                 par_boxes_list.append(par_boxes[i])
+                 line_boxes_list.append(line_boxes[i])
+                 images_ids_list.append(i)
+                 images_pixels_list.append(images_pixels[i])
+                 images_list.append(images[i])
+                 page_no_list.append(i)
+                 num_pages_list.append(num_imgs)
+
+         except:
+             print("There was an error within the extraction of PDF text by the OCR!")
+         else:
+             from datasets import Dataset
+             dataset = Dataset.from_dict({"images_ids": images_ids_list, "images": images_list, "images_pixels": images_pixels_list, "page_no": page_no_list, "num_pages": num_pages_list, "texts": lines_list, "bboxes_line": line_boxes_list})
+
+             # print("The text data was successfully extracted by the OCR!")
+
+     return dataset, lines, row_indexes, par_boxes, line_boxes
+
+ ## Inference
+
+ def prepare_inference_features(example, cls_box=cls_box, sep_box=sep_box):
+
+     images_ids_list, chunks_ids_list, input_ids_list, attention_mask_list, bb_list, images_pixels_list = list(), list(), list(), list(), list(), list()
+
+     # get batch
+     batch_images_ids = example["images_ids"]
+     batch_images = example["images"]
+     batch_images_pixels = example["images_pixels"]
+     batch_bboxes_line = example["bboxes_line"]
+     batch_texts = example["texts"]
+     batch_images_size = [image.size for image in batch_images]
+
+     batch_width, batch_height = [image_size[0] for image_size in batch_images_size], [image_size[1] for image_size in batch_images_size]
+
+     # add a dimension if it is not a batch but only one image
+     if not isinstance(batch_images_ids, list):
+         batch_images_ids = [batch_images_ids]
+         batch_images = [batch_images]
+         batch_images_pixels = [batch_images_pixels]
+         batch_bboxes_line = [batch_bboxes_line]
+         batch_texts = [batch_texts]
+         batch_width, batch_height = [batch_width], [batch_height]
+
+     # process all images of the batch
+     for num_batch, (image_id, image_pixels, boxes, texts, width, height) in enumerate(zip(batch_images_ids, batch_images_pixels, batch_bboxes_line, batch_texts, batch_width, batch_height)):
+         tokens_list = []
+         bboxes_list = []
+
+         # add a dimension if there is only one line
+         if not isinstance(texts, list):
+             texts, boxes = [texts], [boxes]
+
+         # normalize the line boxes (0-1000 scale, upper-left / lower-right format)
+         normalize_bboxes_line = [normalize_box(upperleft_to_lowerright(box), width, height) for box in boxes]
+
+         # sort boxes with texts
+         # we want sorted lists from top to bottom of the image
+         boxes, texts = sort_data_wo_labels(normalize_bboxes_line, texts)
+
+         count = 0
+         for box, text in zip(boxes, texts):
+             tokens = tokenizer.tokenize(text)
+             num_tokens = len(tokens) # get number of tokens
+             tokens_list.extend(tokens)
+
+             bboxes_list.extend([box] * num_tokens) # number of boxes must be the same as the number of tokens
+
+         # use of return_overflowing_tokens=True / stride=doc_stride
+         # to get parts of the image with overlap
+         # source: https://huggingface.co/course/chapter6/3b?fw=tf#handling-long-contexts
+         encodings = tokenizer(" ".join(texts),
+                               truncation=True,
+                               padding="max_length",
+                               max_length=max_length,
+                               stride=doc_stride,
+                               return_overflowing_tokens=True,
+                               return_offsets_mapping=True
+                               )
+
+         otsm = encodings.pop("overflow_to_sample_mapping")
+         offset_mapping = encodings.pop("offset_mapping")
+
+         # let's label those examples and get their boxes
+         sequence_length_prev = 0
+         for i, offsets in enumerate(offset_mapping):
+             # truncate tokens, boxes and labels based on the length of the chunk - 2 (special tokens <s> and </s>)
+             sequence_length = len(encodings.input_ids[i]) - 2
+             if i == 0: start = 0
+             else: start += sequence_length_prev - doc_stride
+             end = start + sequence_length
+             sequence_length_prev = sequence_length
+
+             # get tokens, boxes and labels of this image chunk
+             bb = [cls_box] + bboxes_list[start:end] + [sep_box]
+
+             # as the last chunk can have a length < max_length
+             # we must add [tokenizer.pad_token] (tokens), [sep_box] (boxes) and [-100] (labels)
+             if len(bb) < max_length:
+                 bb = bb + [sep_box] * (max_length - len(bb))
+
+             # append results
+             input_ids_list.append(encodings["input_ids"][i])
+             attention_mask_list.append(encodings["attention_mask"][i])
+             bb_list.append(bb)
+             images_ids_list.append(image_id)
+             chunks_ids_list.append(i)
+             images_pixels_list.append(image_pixels)
+
+     return {
+         "images_ids": images_ids_list,
+         "chunk_ids": chunks_ids_list,
+         "input_ids": input_ids_list,
+         "attention_mask": attention_mask_list,
+         "normalized_bboxes": bb_list,
+         "images_pixels": images_pixels_list
+     }
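+ # chunking example: with max_length = 384 and doc_stride = 128, each chunk holds up to 382 text tokens
+ # (384 minus the <s> and </s> special tokens) and two consecutive chunks share 128 tokens, so a page of,
+ # say, 600 tokens is split into chunks covering tokens 0-381 and 254-599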
+
+ from torch.utils.data import Dataset
+
+ class CustomDataset(Dataset):
+     def __init__(self, dataset, tokenizer):
+         self.dataset = dataset
+         self.tokenizer = tokenizer
+
+     def __len__(self):
+         return len(self.dataset)
+
+     def __getitem__(self, idx):
+         # get item
+         example = self.dataset[idx]
+         encoding = dict()
+         encoding["images_ids"] = example["images_ids"]
+         encoding["chunk_ids"] = example["chunk_ids"]
+         encoding["input_ids"] = example["input_ids"]
+         encoding["attention_mask"] = example["attention_mask"]
+         encoding["bbox"] = example["normalized_bboxes"]
+         encoding["images_pixels"] = example["images_pixels"]
+
+         return encoding
+
+ import torch.nn.functional as F
+
+ # get predictions at token level
+ def predictions_token_level(images, custom_encoded_dataset):
+
+     num_imgs = len(images)
+     if num_imgs > 0:
+
+         chunk_ids, input_ids, bboxes, pixels_values, outputs, token_predictions = dict(), dict(), dict(), dict(), dict(), dict()
+         images_ids_list = list()
+
+         for i, encoding in enumerate(custom_encoded_dataset):
+
+             # get custom encoded data
+             image_id = encoding['images_ids']
+             chunk_id = encoding['chunk_ids']
+             input_id = torch.tensor(encoding['input_ids'])[None]
+             attention_mask = torch.tensor(encoding['attention_mask'])[None]
+             bbox = torch.tensor(encoding['bbox'])[None]
+             pixel_values = torch.tensor(encoding["images_pixels"])
+
+             # save data in dictionaries
+             if image_id not in images_ids_list: images_ids_list.append(image_id)
+
+             if image_id in chunk_ids: chunk_ids[image_id].append(chunk_id)
+             else: chunk_ids[image_id] = [chunk_id]
+
+             if image_id in input_ids: input_ids[image_id].append(input_id)
+             else: input_ids[image_id] = [input_id]
+
+             if image_id in bboxes: bboxes[image_id].append(bbox)
+             else: bboxes[image_id] = [bbox]
+
+             if image_id in pixels_values: pixels_values[image_id].append(pixel_values)
+             else: pixels_values[image_id] = [pixel_values]
+
+             # get prediction with forward pass
+             with torch.no_grad():
+                 output = model(
+                     input_ids=input_id.to(device),
+                     attention_mask=attention_mask.to(device),
+                     bbox=bbox.to(device),
+                     image=pixel_values.to(device)
+                 )
+
+             # save probabilities of predictions in dictionary
+             if image_id in outputs: outputs[image_id].append(F.softmax(output.logits.squeeze(), dim=-1))
+             else: outputs[image_id] = [F.softmax(output.logits.squeeze(), dim=-1)]
+
+         return outputs, images_ids_list, chunk_ids, input_ids, bboxes
+
+     else:
+         print("An error occurred while getting predictions!")
+
+ from functools import reduce
+
+ # get predictions (line level)
+ def predictions_line_level(dataset, outputs, images_ids_list, chunk_ids, input_ids, bboxes):
+
+     ten_probs_dict, ten_input_ids_dict, ten_bboxes_dict = dict(), dict(), dict()
+     bboxes_list_dict, input_ids_dict_dict, probs_dict_dict, df = dict(), dict(), dict(), dict()
+
+     if len(images_ids_list) > 0:
+
+         for i, image_id in enumerate(images_ids_list):
+
+             # get image information
+             images_list = dataset.filter(lambda example: example["images_ids"] == image_id)["images"]
+             image = images_list[0]
+             width, height = image.size
+
+             # get data
+             chunk_ids_list = chunk_ids[image_id]
+             outputs_list = outputs[image_id]
+             input_ids_list = input_ids[image_id]
+             bboxes_list = bboxes[image_id]
+
+             # create zeros tensors
+             ten_probs = torch.zeros((outputs_list[0].shape[0] - 2)*len(outputs_list), outputs_list[0].shape[1])
+             ten_input_ids = torch.ones(size=(1, (outputs_list[0].shape[0] - 2)*len(outputs_list)), dtype=int)
+             ten_bboxes = torch.zeros(size=(1, (outputs_list[0].shape[0] - 2)*len(outputs_list), 4), dtype=int)
+
+             if len(outputs_list) > 1:
+
+                 # stitch the overlapping chunks back together: tokens in the overlap get the
+                 # average (sum then * 0.5) of the probabilities of the two chunks that predicted them
+                 for num_output, (output, input_id, bbox) in enumerate(zip(outputs_list, input_ids_list, bboxes_list)):
+                     start = num_output*(max_length - 2) - max(0, num_output)*doc_stride
+                     end = start + (max_length - 2)
+
+                     if num_output == 0:
+                         ten_probs[start:end, :] += output[1:-1]
+                         ten_input_ids[:, start:end] = input_id[:, 1:-1]
+                         ten_bboxes[:, start:end, :] = bbox[:, 1:-1, :]
+                     else:
+                         ten_probs[start:start + doc_stride, :] += output[1:1 + doc_stride]
+                         ten_probs[start:start + doc_stride, :] = ten_probs[start:start + doc_stride, :] * 0.5
+                         ten_probs[start + doc_stride:end, :] += output[1 + doc_stride:-1]
+
+                         ten_input_ids[:, start:start + doc_stride] = input_id[:, 1:1 + doc_stride]
+                         ten_input_ids[:, start + doc_stride:end] = input_id[:, 1 + doc_stride:-1]
+
+                         ten_bboxes[:, start:start + doc_stride, :] = bbox[:, 1:1 + doc_stride, :]
+                         ten_bboxes[:, start + doc_stride:end, :] = bbox[:, 1 + doc_stride:-1, :]
+
+             else:
+                 ten_probs += outputs_list[0][1:-1]
+                 ten_input_ids = input_ids_list[0][:, 1:-1]
+                 ten_bboxes = bboxes_list[0][:, 1:-1]
+
+             ten_probs_list, ten_input_ids_list, ten_bboxes_list = ten_probs.tolist(), ten_input_ids.tolist()[0], ten_bboxes.tolist()[0]
+             bboxes_list = list()
+             input_ids_dict, probs_dict = dict(), dict()
+             bbox_prev = [-100, -100, -100, -100]
+             for probs, input_id, bbox in zip(ten_probs_list, ten_input_ids_list, ten_bboxes_list):
+                 bbox = denormalize_box(bbox, width, height)
+                 if bbox != bbox_prev and bbox != cls_box:
+                     bboxes_list.append(bbox)
+                     input_ids_dict[str(bbox)] = [input_id]
+                     probs_dict[str(bbox)] = [probs]
+                 else:
+                     if bbox != cls_box:
+                         input_ids_dict[str(bbox)].append(input_id)
+                         probs_dict[str(bbox)].append(probs)
+                 bbox_prev = bbox
+
+             # the label of a line is the one that maximizes the product of the
+             # probabilities of its tokens (accumulated per label with reduce)
+             probs_bbox = dict()
+             for i, bbox in enumerate(bboxes_list):
+                 probs = probs_dict[str(bbox)]
+                 probs = np.array(probs).T.tolist()
+
+                 probs_label = list()
+                 for probs_list in probs:
+                     prob_label = reduce(lambda x, y: x*y, probs_list)
+                     probs_label.append(prob_label)
+                 max_value = max(probs_label)
+                 max_index = probs_label.index(max_value)
+                 probs_bbox[str(bbox)] = max_index
+
+             bboxes_list_dict[image_id] = bboxes_list
+             input_ids_dict_dict[image_id] = input_ids_dict
+             probs_dict_dict[image_id] = probs_bbox
+
+             df[image_id] = pd.DataFrame()
+             df[image_id]["bboxes"] = bboxes_list
+             df[image_id]["texts"] = [tokenizer.decode(input_ids_dict[str(bbox)]) for bbox in bboxes_list]
+             df[image_id]["labels"] = [id2label[probs_bbox[str(bbox)]] for bbox in bboxes_list]
+
+         return probs_bbox, bboxes_list_dict, input_ids_dict_dict, probs_dict_dict, df
+
+     else:
+         print("An error occurred while getting predictions!")
+
+ # get labeled images with line bounding boxes
+ def get_labeled_images(dataset, images_ids_list, bboxes_list_dict, probs_dict_dict):
+
+     labeled_images = list()
+
+     for i, image_id in enumerate(images_ids_list):
+
+         # get image
+         images_list = dataset.filter(lambda example: example["images_ids"] == image_id)["images"]
+         image = images_list[0]
+         width, height = image.size
+
+         # get predicted boxes and labels
+         bboxes_list = bboxes_list_dict[image_id]
+         probs_bbox = probs_dict_dict[image_id]
+
+         draw = ImageDraw.Draw(image)
+         # https://stackoverflow.com/questions/66274858/choosing-a-pil-imagefont-by-font-name-rather-than-filename-and-cross-platform-f
+         font = font_manager.FontProperties(family='sans-serif', weight='bold')
+         font_file = font_manager.findfont(font)
+         font_size = 30
+         font = ImageFont.truetype(font_file, font_size)
+
+         for bbox in bboxes_list:
+             predicted_label = id2label[probs_bbox[str(bbox)]]
+             draw.rectangle(bbox, outline=label2color[predicted_label])
+             draw.text((bbox[0] + 10, bbox[1] - font_size), text=predicted_label, fill=label2color[predicted_label], font=font)
+
+         labeled_images.append(image)
+
+     return labeled_images
+
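+ # the two helpers below come from the original notebook: they read the module-level variables
+ # "dataset" and "encoded_dataset" (which only exist in the notebook) and display results with
+ # cv2_imshow / display (Colab / IPython utilities); the Gradio APP does not call them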
+ # get data of an encoded chunk
+ def get_encoded_chunk_inference(index_chunk=None):
+
+     # get datasets
+     example = dataset
+     encoded_example = encoded_dataset
+
+     # pick a random chunk of the dataset if no index is given
+     if index_chunk is None: index_chunk = random.randint(0, len(encoded_example)-1)
+     encoded_example = encoded_example[index_chunk]
+     encoded_image_ids = encoded_example["images_ids"]
+
+     # get the image
+     example = example.filter(lambda example: example["images_ids"] == encoded_image_ids)[0]
+     image = example["images"] # original image
+     width, height = image.size
+     page_no = example["page_no"]
+     num_pages = example["num_pages"]
+
+     # get boxes, texts, categories
+     bboxes, input_ids = encoded_example["normalized_bboxes"][1:-1], encoded_example["input_ids"][1:-1]
+     bboxes = [denormalize_box(bbox, width, height) for bbox in bboxes]
+     num_tokens = len(input_ids) + 2
+
+     # get unique bboxes and corresponding labels
+     bboxes_list, input_ids_list = list(), list()
+     input_ids_dict = dict()
+     bbox_prev = [-100, -100, -100, -100]
+     for i, (bbox, input_id) in enumerate(zip(bboxes, input_ids)):
+         if bbox != bbox_prev:
+             bboxes_list.append(bbox)
+             input_ids_dict[str(bbox)] = [input_id]
+         else:
+             input_ids_dict[str(bbox)].append(input_id)
+
+         # start_indexes_list.append(i)
+         bbox_prev = bbox
+
+     # do not keep "</s><pad><pad>..."
+     if input_ids_dict[str(bboxes_list[-1])][0] == (tokenizer.convert_tokens_to_ids('</s>')):
+         del input_ids_dict[str(bboxes_list[-1])]
+         bboxes_list = bboxes_list[:-1]
+
+     # get texts by line
+     input_ids_list = input_ids_dict.values()
+     texts_list = [tokenizer.decode(input_ids) for input_ids in input_ids_list]
+
+     # display DataFrame
+     df = pd.DataFrame({"texts": texts_list, "input_ids": input_ids_list, "bboxes": bboxes_list})
+
+     return image, df, num_tokens, page_no, num_pages
+
+ # display a chunk of a PDF image and its data
+ def display_chunk_lines_inference(index_chunk=None):
+
+     # get image and image data
+     image, df, num_tokens, page_no, num_pages = get_encoded_chunk_inference(index_chunk=index_chunk)
+
+     # get data from dataframe
+     input_ids = df["input_ids"]
+     texts = df["texts"]
+     bboxes = df["bboxes"]
+
+     print(f'Chunk ({num_tokens} tokens) of the PDF (page: {page_no+1} / {num_pages})\n')
+
+     # display image with bounding boxes
+     print(">> PDF image with bounding boxes of lines\n")
+     draw = ImageDraw.Draw(image)
+
+     labels = list()
+     for box, text in zip(bboxes, texts):
+         color = "red"
+         draw.rectangle(box, outline=color)
+
+     # resize image to half of its original size
+     width, height = image.size
+     image = image.resize((int(0.5*width), int(0.5*height)))
+
+     # convert to cv2 and display
+     img = np.array(image, dtype='uint8') # PIL to cv2
+     cv2_imshow(img)
+     cv2.waitKey(0)
+
+     # display image dataframe
+     print("\n>> Dataframe of annotated lines\n")
+     cols = ["texts", "bboxes"]
+     df = df[cols]
+     display(df)
files/languages_iso.csv ADDED
@@ -0,0 +1,184 @@
+ Language,LangCode
+ Abkhazian,ab
+ Afar,aa
+ Afrikaans,af
+ Akan,ak
+ Albanian,sq
+ Amharic,am
+ Arabic,ar
+ Aragonese,an
+ Armenian,hy
+ Assamese,as
+ Avaric,av
+ Avestan,ae
+ Aymara,ay
+ Azerbaijani,az
+ Bambara,bm
+ Bashkir,ba
+ Basque,eu
+ Belarusian,be
+ Bengali,bn
+ Bislama,bi
+ Bosnian,bs
+ Breton,br
+ Bulgarian,bg
+ Burmese,my
+ "Catalan, Valencian",ca
+ Chamorro,ch
+ Chechen,ce
+ "Chichewa, Chewa, Nyanja",ny
+ Chinese,zh
+ "Church Slavonic, Old Slavonic, Old Church Slavonic",cu
+ Chuvash,cv
+ Cornish,kw
+ Corsican,co
+ Cree,cr
+ Croatian,hr
+ Czech,cs
+ Danish,da
+ "Divehi, Dhivehi, Maldivian",dv
+ "Dutch, Flemish",nl
+ Dzongkha,dz
+ English,en
+ Esperanto,eo
+ Estonian,et
+ Ewe,ee
+ Faroese,fo
+ Fijian,fj
+ Finnish,fi
+ French,fr
+ Western Frisian,fy
+ Fulah,ff
+ "Gaelic, Scottish Gaelic",gd
+ Galician,gl
+ Ganda,lg
+ Georgian,ka
+ German,de
+ "Greek, Modern (1453–)",el
+ "Kalaallisut, Greenlandic",kl
+ Guarani,gn
+ Gujarati,gu
+ "Haitian, Haitian Creole",ht
+ Hausa,ha
+ Hebrew,he
+ Herero,hz
+ Hindi,hi
+ Hiri Motu,ho
+ Hungarian,hu
+ Icelandic,is
+ Ido,io
+ Igbo,ig
+ Indonesian,id
+ Interlingua (International Auxiliary Language Association),ia
+ "Interlingue, Occidental",ie
+ Inuktitut,iu
+ Inupiaq,ik
+ Irish,ga
+ Italian,it
+ Japanese,ja
+ Javanese,jv
+ Kannada,kn
+ Kanuri,kr
+ Kashmiri,ks
+ Kazakh,kk
+ Central Khmer,km
+ "Kikuyu, Gikuyu",ki
+ Kinyarwanda,rw
+ "Kirghiz, Kyrgyz",ky
+ Komi,kv
+ Kongo,kg
+ Korean,ko
+ "Kuanyama, Kwanyama",kj
+ Kurdish,ku
+ Lao,lo
+ Latin,la
+ Latvian,lv
+ "Limburgan, Limburger, Limburgish",li
+ Lingala,ln
+ Lithuanian,lt
+ Luba-Katanga,lu
+ "Luxembourgish, Letzeburgesch",lb
+ Macedonian,mk
+ Malagasy,mg
+ Malay,ms
+ Malayalam,ml
+ Maltese,mt
+ Manx,gv
+ Maori,mi
+ Marathi,mr
+ Marshallese,mh
+ Mongolian,mn
+ Nauru,na
+ "Navajo, Navaho",nv
+ North Ndebele,nd
+ South Ndebele,nr
+ Ndonga,ng
+ Nepali,ne
+ Norwegian,no
+ Norwegian Bokmål,nb
+ Norwegian Nynorsk,nn
+ "Sichuan Yi, Nuosu",ii
+ Occitan,oc
+ Ojibwa,oj
+ Oriya,or
+ Oromo,om
+ "Ossetian, Ossetic",os
+ Pali,pi
+ "Pashto, Pushto",ps
+ Persian,fa
+ Polish,pl
+ Portuguese,pt
+ "Punjabi, Panjabi",pa
+ Quechua,qu
+ "Romanian, Moldavian, Moldovan",ro
+ Romansh,rm
+ Rundi,rn
+ Russian,ru
+ Northern Sami,se
+ Samoan,sm
+ Sango,sg
+ Sanskrit,sa
+ Sardinian,sc
+ Serbian,sr
+ Shona,sn
+ Sindhi,sd
+ "Sinhala, Sinhalese",si
+ Slovak,sk
+ Slovenian,sl
+ Somali,so
+ Southern Sotho,st
+ "Spanish, Castilian",es
+ Sundanese,su
+ Swahili,sw
+ Swati,ss
+ Swedish,sv
+ Tagalog,tl
+ Tahitian,ty
+ Tajik,tg
+ Tamil,ta
+ Tatar,tt
+ Telugu,te
+ Thai,th
+ Tibetan,bo
+ Tigrinya,ti
+ Tonga (Tonga Islands),to
+ Tsonga,ts
+ Tswana,tn
+ Turkish,tr
+ Turkmen,tk
+ Twi,tw
+ "Uighur, Uyghur",ug
+ Ukrainian,uk
+ Urdu,ur
+ Uzbek,uz
+ Venda,ve
+ Vietnamese,vi
+ Volapük,vo
+ Walloon,wa
+ Welsh,cy
+ Wolof,wo
+ Xhosa,xh
+ Yiddish,yi
+ Yoruba,yo
+ "Zhuang, Chuang",za
+ Zulu,zu
files/languages_tesseract.csv ADDED
@@ -0,0 +1,127 @@
+ Language,LangCode
+ Afrikaans,afr
+ Amharic,amh
+ Arabic,ara
+ Assamese,asm
+ Azerbaijani,aze
+ Azerbaijani - Cyrilic,aze_cyrl
+ Belarusian,bel
+ Bengali,ben
+ Tibetan,bod
+ Bosnian,bos
+ Breton,bre
+ Bulgarian,bul
+ Catalan; Valencian,cat
+ Cebuano,ceb
+ Czech,ces
+ Chinese - Simplified,chi_sim
+ Chinese - Traditional,chi_tra
+ Cherokee,chr
+ Corsican,cos
+ Welsh,cym
+ Danish,dan
+ Danish - Fraktur (contrib),dan_frak
+ German,deu
+ German - Fraktur (contrib),deu_frak
+ Dzongkha,dzo
+ "Greek, Modern (1453-)",ell
+ English,eng
+ "English, Middle (1100-1500)",enm
+ Esperanto,epo
+ Math / equation detection module,equ
+ Estonian,est
+ Basque,eus
+ Faroese,fao
+ Persian,fas
+ Filipino (old - Tagalog),fil
+ Finnish,fin
+ French,fra
+ German - Fraktur,frk
+ "French, Middle (ca.1400-1600)",frm
+ Western Frisian,fry
+ Scottish Gaelic,gla
+ Irish,gle
+ Galician,glg
+ "Greek, Ancient (to 1453) (contrib)",grc
+ Gujarati,guj
+ Haitian; Haitian Creole,hat
+ Hebrew,heb
+ Hindi,hin
+ Croatian,hrv
+ Hungarian,hun
+ Armenian,hye
+ Inuktitut,iku
+ Indonesian,ind
+ Icelandic,isl
+ Italian,ita
+ Italian - Old,ita_old
+ Javanese,jav
+ Japanese,jpn
+ Kannada,kan
+ Georgian,kat
+ Georgian - Old,kat_old
+ Kazakh,kaz
+ Central Khmer,khm
+ Kirghiz; Kyrgyz,kir
+ Kurmanji (Kurdish - Latin Script),kmr
+ Korean,kor
+ Korean (vertical),kor_vert
+ Kurdish (Arabic Script),kur
+ Lao,lao
+ Latin,lat
+ Latvian,lav
+ Lithuanian,lit
+ Luxembourgish,ltz
+ Malayalam,mal
+ Marathi,mar
+ Macedonian,mkd
+ Maltese,mlt
+ Mongolian,mon
+ Maori,mri
+ Malay,msa
+ Burmese,mya
+ Nepali,nep
+ Dutch; Flemish,nld
+ Norwegian,nor
+ Occitan (post 1500),oci
+ Oriya,ori
+ Orientation and script detection module,osd
+ Panjabi; Punjabi,pan
+ Polish,pol
+ Portuguese,por
+ Pushto; Pashto,pus
+ Quechua,que
+ Romanian; Moldavian; Moldovan,ron
+ Russian,rus
+ Sanskrit,san
+ Sinhala; Sinhalese,sin
+ Slovak,slk
+ Slovak - Fraktur (contrib),slk_frak
+ Slovenian,slv
+ Sindhi,snd
+ Spanish; Castilian,spa
+ Spanish; Castilian - Old,spa_old
+ Albanian,sqi
+ Serbian,srp
+ Serbian - Latin,srp_latn
+ Sundanese,sun
+ Swahili,swa
+ Swedish,swe
+ Syriac,syr
+ Tamil,tam
+ Tatar,tat
+ Telugu,tel
+ Tajik,tgk
+ Tagalog (new - Filipino),tgl
+ Thai,tha
+ Tigrinya,tir
+ Tonga,ton
+ Turkish,tur
+ Uighur; Uyghur,uig
+ Ukrainian,ukr
+ Urdu,urd
+ Uzbek,uzb
+ Uzbek - Cyrilic,uzb_cyrl
+ Vietnamese,vie
+ Yiddish,yid
+ Yoruba,yor
files/template.pdf ADDED
Binary file (29.4 kB). View file
 
files/wo_content.png ADDED
packages.txt ADDED
@@ -0,0 +1,2 @@
+ tesseract-ocr-all
+ poppler-utils
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ torch
+ transformers
+ datasets
+ pytesseract
+ opencv-python
+ pdf2image
+ pypdf
+ langdetect
+ gradio