alitavanaali committed
Commit 911803a · 1 Parent(s): c10fb98

Upload 6 files

Files changed (6)
  1. app.py +371 -0
  2. doc1.2.pdf +0 -0
  3. doc1.pdf +0 -0
  4. packages.txt +7 -0
  5. requirements.txt +11 -0
  6. test1.jpg +0 -0
app.py ADDED
@@ -0,0 +1,371 @@
# -*- coding: utf-8 -*-
"""Untitled1.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1J4fCr7TGzdFvkCeikMAQ5af5ml2Q83W0
"""

import os

# torch is installed at startup (CPU wheels), so it is not pinned in requirements.txt
os.system('pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu')

import fitz  # PyMuPDF, used to rasterize the PDF pages
import PIL
import torch
import numpy as np
import pandas as pd
import gradio as gr
from PIL import Image, ImageDraw, ImageFont
from transformers import (AutoProcessor, AutoModelForTokenClassification,
                          LayoutLMv3FeatureExtractor)

# label map for the token-classification head
id2label = {0: 'container id', 1: 'seal number', 2: 'container quantity', 3: 'container type',
            4: 'tare', 5: 'package quantity', 6: 'weight', 7: 'others'}
custom_config = r'--oem 3 --psm 6'  # Tesseract: LSTM engine, assume a uniform block of text
# lang='eng+deu+ita+chi_sim'
lang = 'spa'

# draw a random PIL color for every label (visualize_image overrides this with a fixed palette)
label_ints = np.random.randint(0, len(PIL.ImageColor.colormap.items()), 42)
label_color_pil = [k for k, _ in PIL.ImageColor.colormap.items()]
label_color = [label_color_pil[i] for i in label_ints]
# the labels here are plain strings (no IOB 'B-'/'I-' prefix), so map each one directly
label2color = {v: label_color[k] for k, v in id2label.items()}

processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=True, lang=lang)
model = AutoModelForTokenClassification.from_pretrained("atatavana/layoutlm_manifesto_bigdataset")


def unnormalize_box(bbox, width, height):
    # LayoutLMv3 boxes live on a 0-1000 grid; scale them back to pixel coordinates
    return [
        width * (bbox[0] / 1000),
        height * (bbox[1] / 1000),
        width * (bbox[2] / 1000),
        height * (bbox[3] / 1000),
    ]

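# Illustrative sketch (not called by the app): the inverse mapping, assuming the
# standard LayoutLM convention of normalizing pixel boxes to a 0-1000 grid.
def normalize_box_example(bbox, width, height):
    # pixel coordinates -> 0-1000 grid, e.g. [100, 50, 200, 80] on a
    # 1000x500 page becomes [100, 100, 200, 160]
    return [
        int(1000 * bbox[0] / width),
        int(1000 * bbox[1] / height),
        int(1000 * bbox[2] / width),
        int(1000 * bbox[3] / height),
    ]
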
def iob_to_label(label):
    # map a class index to its label string via the id2label table
    return id2label[label]

# detect whether two boxes intersect: returns the intersection box, or 0 if none
def intersect(w, z):
    x1 = max(w[0], z[0])
    y1 = max(w[1], z[1])
    x2 = min(w[2], z[2])
    y2 = min(w[3], z[3])
    if x1 > x2 or y1 > y2:
        return 0
    else:
        # annotating can overlap rows or columns by a few pixels by mistake;
        # a minimum-area threshold could be applied here to discard such slivers
        area = (x2 - x1) * (y2 - y1)
        if area > 0:
            return [int(x1), int(y1), int(x2), int(y2)]
        else:
            return 0

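# Quick illustration (not called by the app): two boxes that overlap in a
# 10x10 region, and two boxes that are disjoint.
def intersect_example():
    assert intersect([0, 0, 20, 20], [10, 10, 30, 30]) == [10, 10, 20, 20]
    assert intersect([0, 0, 10, 10], [20, 20, 30, 30]) == 0
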
def process_image(image):
    lang = 'spa'
    width, height = image.size

    # run OCR once with the feature extractor to get the raw words and boxes
    # (kept for reference; the processor below re-runs OCR internally)
    feature_extractor = LayoutLMv3FeatureExtractor(apply_ocr=True, lang=lang)
    encoding_feature_extractor = feature_extractor(image, return_tensors="pt", truncation=True)
    words, boxes = encoding_feature_extractor.words, encoding_feature_extractor.boxes

    # encode; long pages overflow into several 512-token windows with a 128-token stride
    inference_image = [image.convert("RGB")]
    encoding = processor(inference_image, truncation=True, return_offsets_mapping=True, return_tensors="pt",
                         padding="max_length", stride=128, max_length=512, return_overflowing_tokens=True)
    offset_mapping = encoding.pop('offset_mapping')
    overflow_to_sample_mapping = encoding.pop('overflow_to_sample_mapping')

    # stack the per-window pixel values into a single batch tensor
    x = []
    for i in range(0, len(encoding['pixel_values'])):
        x.append(encoding['pixel_values'][i])
    x = torch.stack(x)
    encoding['pixel_values'] = x

    # forward pass
    outputs = model(**encoding)

    # get predictions
    predictions = outputs.logits.argmax(-1).squeeze().tolist()
    token_boxes = encoding.bbox.squeeze().tolist()

    # only keep non-subword predictions
    preds = []
    l_words = []
    bboxes = []
    token_section_num = []

    # a single window squeezes down to an unbatched list; re-wrap it
    if len(token_boxes) == 512:
        predictions = [predictions]
        token_boxes = [token_boxes]

    for i in range(0, len(token_boxes)):
        for j in range(0, len(token_boxes[i])):
            unnormal_box = unnormalize_box(token_boxes[i][j], width, height)
            if np.asarray(token_boxes[i][j]).shape != (4,):
                continue
            elif token_boxes[i][j] == [0, 0, 0, 0] or token_boxes[i][j] == 0:
                # padding token, skip it
                continue
            # if the bbox is new, record the prediction, word and box
            elif unnormal_box not in bboxes:
                preds.append(predictions[i][j])
                l_words.append(processor.tokenizer.decode(encoding["input_ids"][i][j]))
                bboxes.append(unnormal_box)
                token_section_num.append(i)
            else:
                # the bbox was already seen: documents with more than 512 tokens are
                # split into separate windows, so the same word can appear in two
                # windows and those repetitive words must not be duplicated
                _index = bboxes.index(unnormal_box)
                if token_section_num[_index] == i:
                    # same window, so these are subword pieces: merge them safely
                    l_words[_index] = l_words[_index] + processor.tokenizer.decode(encoding["input_ids"][i][j])
                else:
                    continue

    return bboxes, preds, l_words, image

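# Minimal sketch (illustrative only, not called by the app) of the windowing that
# process_image relies on: with return_overflowing_tokens=True the processor splits
# a long page into overlapping 512-token windows (stride 128), and
# overflow_to_sample_mapping records which input image each window came from.
def windowing_example(image):
    enc = processor([image.convert("RGB")], truncation=True, padding="max_length",
                    stride=128, max_length=512, return_overflowing_tokens=True,
                    return_tensors="pt")
    # e.g. a page of roughly 900 tokens comes back as 3 windows, all mapped to sample 0
    return enc['input_ids'].shape[0], enc['overflow_to_sample_mapping'].tolist()
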
def visualize_image(final_bbox, final_preds, l_words, image):

    draw = ImageDraw.Draw(image)
    font = ImageFont.load_default()

    # fixed palette, one color per label
    label2color = {'container id': 'red', 'seal number': 'blue', 'container quantity': 'black',
                   'container type': 'green', 'tare': 'brown', 'package quantity': 'purple',
                   'weight': 'orange', 'others': 'white'}

    json_df = []

    for ix, (prediction, box) in enumerate(zip(final_preds, final_bbox)):
        predicted_label = iob_to_label(prediction).lower()
        draw.rectangle(box, outline=label2color[predicted_label])
        draw.text((box[0] + 10, box[1] - 10), text=predicted_label, fill=label2color[predicted_label], font=font)

        json_dict = {}
        json_dict['TEXT'] = l_words[ix]
        json_dict['LABEL'] = predicted_label

        json_df.append(json_dict)
    return image, json_df


def mergeCloseBoxes(pr, bb, wr, threshold):
    idx = 0
    final_bbox = []
    final_preds = []
    final_words = []

    for box, pred, word in zip(bb, pr, wr):
        if pred == 'others':
            continue
        else:
            final_bbox.append(box)
            final_preds.append(pred)
            final_words.append(word)
            for b, p, w in zip(bb, pr, wr):
                if p == 'others':
                    continue
                elif box == b:  # we shouldn't check each item against itself
                    continue
                else:
                    XMIN, YMIN, XMAX, YMAX = box
                    xmin, ymin, xmax, ymax = b
                    # widen the current box by `threshold` pixels to the right and the
                    # candidate by `threshold` to the left, then test for overlap
                    intsc = intersect([XMIN, YMIN, XMAX + threshold, YMAX], [xmin - threshold, ymin, xmax, ymax])
                    if intsc != 0 and pred == p:
                        if box in final_bbox:
                            # extend the merged box and concatenate the words
                            final_bbox[idx] = [XMIN, min(YMIN, ymin), xmax, max(YMAX, ymax)]
                            final_words[idx] = word + ' ' + w
                            continue
                        print('box: {}, label: {} is close to b: {} with this p: {} --> {}'.format(box, pred, b, p, word + ' ' + w))
            idx = idx + 1
    return final_bbox, final_preds, final_words

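# Worked example (not called by the app): two 'container id' fragments on the same
# text line, 60px apart, merge into one box and one string with threshold 70, the
# value process_form uses below. The leftover right-hand fragment is still in the
# output here; removeSimilarItems drops it afterwards.
def merge_example():
    preds = ['container id', 'container id']
    boxes = [[100, 50, 200, 80], [260, 50, 360, 80]]
    words = ['MSKU123', '4567']
    return mergeCloseBoxes(preds, boxes, words, 70)
    # -> first entry becomes box [100, 50, 360, 80] with word 'MSKU123 4567'
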
def createDataframe(preds, words):
    # one column per field; each row is one container line on the manifest
    df = pd.DataFrame(columns=['container id', 'seal number', 'container quantity',
                               'container type', 'package quantity', 'tare', 'weight'])
    flag_label = preds[0]
    row_number = -1
    for i in range(len(preds)):
        if preds[i] == flag_label:
            # seeing the first-seen label again starts a new row
            row_number = row_number + 1
            df.at[row_number, preds[i]] = words[i]
            continue
        else:
            if pd.isna(df[preds[i]].iloc[row_number]):
                df.at[row_number, preds[i]] = words[i]
            else:
                # the cell is already filled, so this value belongs to a new row
                row_number = row_number + 1
                df.at[row_number, preds[i]] = words[i]

    return df

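# Worked example (not called by the app): rows are keyed off the first label seen
# ('container id' here), so each repetition of it starts a new table row.
def dataframe_example():
    preds = ['container id', 'weight', 'container id', 'weight']
    words = ['MSKU1234567', '21500', 'TCNU7654321', '19800']
    return createDataframe(preds, words)
    # -> two rows: (MSKU1234567, 21500) and (TCNU7654321, 19800)
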
def isInside(w, z):
    # return True if box w is entirely inside box z (not the other way around)
    if w[0] >= z[0] and w[1] >= z[1] and w[2] <= z[2] and w[3] <= z[3]:
        return True
    return False

def removeSimilarItems(final_bbox, final_preds, final_words):
    _bb = []
    _pp = []
    _ww = []
    for i in range(len(final_bbox)):
        _bb.append(final_bbox[i])
        _pp.append(final_preds[i])
        _ww.append(final_words[i])
        for j in range(len(final_bbox)):
            if final_bbox[i] == final_bbox[j]:
                continue
            elif isInside(final_bbox[i], final_bbox[j]) and final_preds[i] == final_preds[j]:
                # box i lies inside box j with the same label, so drop the copy
                # just added; break so it is removed at most once
                _bb = _bb[:-1]
                _pp = _pp[:-1]
                _ww = _ww[:-1]
                break
    return _bb, _pp, _ww

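# Quick illustration (not called by the app), continuing merge_example above:
# the merged box swallows its right-hand fragment, which is dropped here.
def dedup_example():
    boxes = [[100, 50, 360, 80], [260, 50, 360, 80]]
    preds = ['container id', 'container id']
    words = ['MSKU123 4567', '4567']
    return removeSimilarItems(boxes, preds, words)
    # -> ([[100, 50, 360, 80]], ['container id'], ['MSKU123 4567'])
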
def process_form(preds, words, bboxes):
    # merge neighbouring same-label boxes, then drop boxes nested inside others
    final_bbox, final_preds, final_words = mergeCloseBoxes(preds, bboxes, words, 70)
    _bbox, _preds, _words = removeSimilarItems(final_bbox, final_preds, final_words)
    # convert float coordinates to int
    _bbox = [[int(x) for x in item] for item in _bbox]
    # create data objects for sorting
    data = []
    for index in range(len(_bbox)):
        data.append((_bbox[index], _preds[index], _words[index]))
    # sort top-to-bottom by the y coordinate of each box
    sorted_list = sorted(
        data,
        key=lambda x: x[0][1]
    )
    _bbox = [item[0] for item in sorted_list]
    _preds = [item[1] for item in sorted_list]
    _words = [item[2] for item in sorted_list]
    return _bbox, _preds, _words

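# Quick illustration (not called by the app): after merging and deduplication,
# entries are re-ordered top-to-bottom by box y coordinate, which lines the
# words up with the rows of the manifest table.
def sort_example():
    data = [([10, 300, 50, 320], 'weight', '19800'),
            ([10, 100, 50, 120], 'container id', 'MSKU1234567')]
    return sorted(data, key=lambda x: x[0][1])
    # -> the 'container id' entry (y=100) now comes before 'weight' (y=300)
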
def mergeImageVertical(a):
    list_im = a
    imgs = [Image.open(i) for i in list_im]
    # pick the smallest image and resize the others to match it (shapes can be arbitrary)
    min_shape = sorted([(np.sum(i.size), i.size) for i in imgs])[0][1]
    # stack all page images vertically into one tall image
    imgs_comb = np.vstack([i.resize(min_shape) for i in imgs])
    imgs_comb = Image.fromarray(imgs_comb)
    imgs_comb.save('Trifecta_vertical.jpg')
    return imgs_comb


def completepreprocess(pdffile):
    myDataFrame = pd.DataFrame()
    a = []
    doc = fitz.open(pdffile)
    for i in range(0, len(doc)):
        # rasterize the page at 2x zoom and save it as a JPEG
        page = doc.load_page(i)
        zoom = 2  # zoom factor
        mat = fitz.Matrix(zoom, zoom)
        pix = page.get_pixmap(matrix=mat, dpi=200)
        pix.save("page" + str(i) + ".jpg")
        images = Image.open("page" + str(i) + ".jpg")
        image = images.convert("RGB")
        bbox, preds, words, image = process_image(image)
        im, df = visualize_image(bbox, preds, words, image)
        im.save("page" + str(i) + ".jpg")
        a.append("page" + str(i) + ".jpg")
        # map integer class ids to label strings before building the table
        pred_list = []
        for number in preds:
            pred_list.append(iob_to_label(number))
        _bbox, _preds, _words = process_form(pred_list, words, bbox)
        print('page: ' + str(i) + ' ' + str(len(_preds)) + ' ' + str(len(_words)))
        df = createDataframe(_preds, _words)
        myDataFrame = pd.concat([myDataFrame, df], ignore_index=True)

    im2 = mergeImageVertical(a)
    return im2, myDataFrame

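# Usage sketch, assuming a local file named 'doc1.pdf' (shipped with this commit):
# running the full pipeline outside Gradio returns the stacked annotated page
# image and the extracted table.
#
#   annotated, table = completepreprocess('doc1.pdf')
#   annotated.save('annotated.jpg')
#   print(table)
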
title = "Interactive demo: Manifesto Information Extraction model"
description = "Manifesto Information Extraction - We use Microsoft's LayoutLMv3, fine-tuned on the Manifesto dataset (CSV annotations; train = 63, test = 15), to predict the labels. To use it, simply upload a PDF or pick one of the examples below and click 'Submit'. Results will show up in a few seconds. If you want to make the output bigger, right-click on it and select 'Open image in new tab'."

css = """.output_image, .input_image {height: 600px !important}"""
#examples = [["461BHH69.PDF"],["AP-481-RF.PDF"],["DP-095-ML.PDF"],["DQ-231-LL.PDF"],["FK-941-ET.PDF"], ["FL-078-NH.PDF"]
#            ,["14ZZ69.PDF"],["74BCA69.PDF"],["254BEG69.PDF"],["761BJQ69.PDF"],["AB-486-EH.PDF"],["AZ-211-ZA.PDF"], ["CY-073-YV.PDF"]]
#            ["744BJQ69.PDF"], ['tarros_2.jpg'],
examples = [['3pages_messina.pdf'], ['messina2.jpg'], ['arkas1.jpg'], ['brointermed1.jpg'], ['brointermed2.pdf'], ['tarros_1.jpg'], ['tarros_3.jpg'], ['tarros_4.jpg']]

iface = gr.Interface(fn=completepreprocess,
                     #inputs=gr.inputs.Image(type="pil",optional=True,label="upload file"),
                     inputs=gr.File(label="PDF"),
                     #inputs=gr.inputs.Image(type="pil")
                     outputs=[gr.outputs.Image(type="pil", label="annotated image"), "dataframe"],
                     title=title,
                     description=description,
                     examples=examples,
                     css=css,
                     analytics_enabled=True, enable_queue=True)

iface.launch(inline=False, debug=True)
doc1.2.pdf ADDED
Binary file (277 kB).
 
doc1.pdf ADDED
Binary file (276 kB).
 
packages.txt ADDED
@@ -0,0 +1,7 @@
ffmpeg
libsm6
libxext6
libgl1
libgl1-mesa-glx
tesseract-ocr
poppler-utils
requirements.txt ADDED
@@ -0,0 +1,11 @@
git+https://github.com/huggingface/transformers.git
PyYAML==6.0
pytesseract==0.3.9
datasets==2.2.2
seqeval==1.2.2
pdf2image==1.16.0
PyMuPDF==1.20.2
lxml==4.9.1
opencv-contrib-python==4.6.0.66
opencv-python==4.6.0.66
opencv-python-headless==4.6.0.66
test1.jpg ADDED