Spaces: Runtime error
Commit 07111e1 · Parent(s): 2ffed31
Update app.py

app.py CHANGED
@@ -78,6 +78,8 @@ def iob_to_label(label):
     if label == 7:
         return 'others'
 
+
+
 # this method will detect if there is any intersect between two boxes or not
 def intersect(w, z):
     x1 = max(w[0], z[0]) #190 | 881 | 10
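As a side note to this hunk, here is a minimal, self-contained sketch of the axis-aligned overlap test that intersect() performs (its area check appears in the next hunk). The helper name and the sample boxes are illustrative, and the explicit per-axis comparison is used instead of a raw (x2-x1)*(y2-y1) product to avoid the case where both factors are negative.

def boxes_intersect(w, z):
    # boxes are [xmin, ymin, xmax, ymax]
    x1, y1 = max(w[0], z[0]), max(w[1], z[1])   # top-left corner of the overlap
    x2, y2 = min(w[2], z[2]), min(w[3], z[3])   # bottom-right corner of the overlap
    if x2 > x1 and y2 > y1:                     # positive width and height means a real overlap
        return [int(x1), int(y1), int(x2), int(y2)]
    return 0

print(boxes_intersect([10, 10, 50, 30], [40, 15, 90, 35]))  # [40, 15, 50, 30]
print(boxes_intersect([10, 10, 50, 30], [60, 40, 90, 80]))  # 0 (no overlap)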
@@ -90,18 +92,16 @@ def intersect(w, z):
     # because sometimes in annotating, it is possible to overlap rows or columns by mistake
     # for very small pixels, we check a threshold to delete them
     area = (x2-x1) * (y2-y1)
-    if (area > 0):
+    if (area > 0):
         return [int(x1), int(y1), int(x2), int(y2)]
     else:
         return 0
 
-
 def process_image(image):
     custom_config = r'--oem 3 --psm 6'
     # lang='eng+deu+ita+chi_sim'
-    lang='
+    lang='eng'
     width, height = image.size
-    feature_extractor = LayoutLMv3FeatureExtractor(apply_ocr=True,lang=lang)
     encoding_feature_extractor = feature_extractor(image, return_tensors="pt",truncation=True)
     words, boxes = encoding_feature_extractor.words, encoding_feature_extractor.boxes
 
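For context on the lang='eng' change above: LayoutLMv3's feature extractor can run Tesseract itself when apply_ocr=True and return the recognized words together with their normalized boxes. A hedged, minimal sketch of that step follows; the file name is illustrative and the OCR language is left at Tesseract's default here, since the removed constructor argument is not reproduced.

from PIL import Image
from transformers import LayoutLMv3FeatureExtractor

feature_extractor = LayoutLMv3FeatureExtractor(apply_ocr=True)  # requires Tesseract on the system
image = Image.open("page1.jpg").convert("RGB")                  # illustrative file name
encoding = feature_extractor(image, return_tensors="pt")
words, boxes = encoding["words"], encoding["boxes"]             # one list per input image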
@@ -131,13 +131,12 @@ def process_image(image):
     preds = []
     l_words = []
     bboxes = []
-    token_section_num = []
+    token_section_num = [] # related to more than 512 tokens
 
     if (len(token_boxes) == 512):
         predictions = [predictions]
         token_boxes = [token_boxes]
 
-
     for i in range(0, len(token_boxes)):
         for j in range(0, len(token_boxes[i])):
             #print(np.asarray(token_boxes[i][j]).shape)
@@ -162,11 +161,8 @@ def process_image(image):
                 # parts, so it's possible to have a word in both of the pages and we have to control that repetetive words
                 # HERE: because they're in a same section, so we can merge them safely
                 l_words[_index] = l_words[_index] + processor.tokenizer.decode(encoding["input_ids"][i][j])
-
             else:
                 continue
-
-
     return bboxes, preds, l_words, image
 
 
@@ -175,24 +171,27 @@ def visualize_image(final_bbox, final_preds, l_words, image):
 
     draw = ImageDraw.Draw(image)
     font = ImageFont.load_default()
+    #{0: 'document number', 1: 'elemento pn', 2: 'nombre del responsabile', 3: 'fecha', 4: 'internal reference', 5: 'others'}
 
+    #id2label = {0: 'song name', 1: 'artist', 2: 'year', 3: 'album', 4: 'genres', 5: 'song writer', 6: 'lyrics', 7: 'others'}
     label2color = {'song name':'red', 'artist':'blue', 'year':'black', 'album':'green', 'genres':'brown', 'song writer':'blue', 'lyrics':'purple', 'others': 'white'}
     l2l = {'song name':'red', 'artist':'blue', 'year':'black', 'album':'green', 'genres':'brown', 'song writer':'blue','lyrics':'purple', 'others':'white'}
     f_labels = {'song name':'red', 'artist':'blue', 'year':'black', 'album':'green', 'genres':'brown', 'song writer':'blue','lyrics':'purple', 'others':'white'}
 
-
     json_df = []
 
+    # draw bboxes on image
     for ix, (prediction, box) in enumerate(zip(final_preds, final_bbox)):
         predicted_label = iob_to_label(prediction).lower()
-
-
+        if (predicted_label != 'others'):
+            draw.rectangle(box, outline=label2color[predicted_label])
+            draw.text((box[0]+10, box[1]-10), text=predicted_label, fill=label2color[predicted_label], font=font)
 
-
-
-
+        json_dict = {}
+        json_dict['TEXT'] = l_words[ix]
+        json_dict['LABEL'] = f_labels[predicted_label]
 
-
+        json_df.append(json_dict)
     return image, json_df
 
 
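A small self-contained sketch of the drawing/JSON step this hunk adds: boxes whose label is not 'others' get a colored rectangle and a text tag, and every word is collected as a {'TEXT': ..., 'LABEL': ...} record. The image, boxes, and labels below are illustrative stand-ins for the app's data.

from PIL import Image, ImageDraw, ImageFont

image = Image.new("RGB", (300, 100), "white")
draw = ImageDraw.Draw(image)
font = ImageFont.load_default()
label2color = {'song name': 'red', 'artist': 'blue', 'others': 'white'}

final_bbox = [[20, 40, 120, 60], [140, 40, 220, 60]]
final_preds = ['song name', 'others']
l_words = ['Skyfall', 'page 1']

json_df = []
for ix, (pred, box) in enumerate(zip(final_preds, final_bbox)):
    if pred != 'others':
        draw.rectangle(box, outline=label2color[pred])                # box outline in the label color
        draw.text((box[0] + 10, box[1] - 10), text=pred,
                  fill=label2color[pred], font=font)                  # label tag above the box
    json_df.append({'TEXT': l_words[ix], 'LABEL': pred})              # every word becomes a record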
@@ -206,55 +205,106 @@ def mergeCloseBoxes(pr, bb, wr, threshold):
         if (pred=='others'):
             continue
         else:
-
-            final_preds.append(pred)
-            final_words.append(word)
+            flag = False
             for b, p, w in zip(bb, pr, wr):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                if (p == 'others'):
+                    #print('others')
+                    #print('-------')
+                    continue
+                elif (box==b): # we shouldn't check each item with itself
+                    #print('itself')
+                    #print('--------')
+                    continue
+                else:
+                    XMIN, YMIN, XMAX, YMAX = box
+                    xmin, ymin, xmax, ymax = b
+                    #print('word: {} , w:{}'.format(word, w))
+                    intsc = intersect([XMIN, YMIN, XMAX+threshold, YMAX], [xmin-threshold, ymin, xmax, ymax])
+                    if (intsc != 0 and pred==p):
+                        flag = True
+                        #print('there is intersect')
+                        # if(abs(XMAX - xmin) < treshold and abs(YMIN - ymin) < 10):
+                        # we have to check if there is any intersection between box and all the values in final_bbox list
+                        # because if we have updated it before, now we have to update in final_bbox
+                        #print(final_bbox)
+                        print(*final_bbox, sep=",")
+                        merged_box = [
+                            min(XMIN, xmin),
+                            min(YMIN, ymin),
+                            max(XMAX, xmax),
+                            max(YMAX, ymax)
+                        ]
+                        merged_words = word + ' ' + w
+                        # add to final_bbox
+                        wasAvailable = False
+                        for id, fbox in enumerate(final_bbox):
+                            if (intersect(box, fbox) != 0 and pred==final_preds[id]):
+                                #print('added before!')
+                                # box is inside another processed box, so we have to update it
+                                wasAvailable = True
+                                merged_box = [
+                                    min(fbox[0], min(XMIN, xmin)),
+                                    min(fbox[1], min(YMIN, ymin)),
+                                    max(fbox[2], max(XMAX, xmax)),
+                                    max(fbox[3], max(YMAX, ymax))
+                                ]
+                                final_bbox[id] = merged_box
+                                final_words[id] = final_words[id] + ' ' + w
+                                break
+
+                        if (not wasAvailable):
+                            # there was no intersect, bbox is not added before
+                            #print('not added before, so we add merged box!')
+                            final_bbox.append(merged_box)
+                            final_preds.append(pred)
+                            final_words.append(merged_words)
+                        '''else:
+                            print()
+                            final_bbox.append(box)
+                            final_preds.append(pred)
+                            final_words.append(word)'''
+            if (flag == False):
+                #print('flag is false, word: {} added'.format(word))
+                # there is no intersect between word and the others
+                # we will check for last time if box is inside the others, because if the word is last word (like Juan + Mulian + Alexander) (Alexander)
+                # it is added before but it has not intersection with others, so we will check to prevent
+                for id, fbox in enumerate(final_bbox):
+                    if (intersect(box, fbox) != 0 and pred==final_preds[id]):
+                        flag = True
+
+                if (not flag):
+                    final_bbox.append(box)
+                    final_preds.append(pred)
+                    final_words.append(word)
+
     return final_bbox, final_preds, final_words
 
 def createDataframe(preds, words):
-    df = pd.DataFrame(columns = ['
-
-
-
-
-
-
-
-
-
-
-
-
-
+    df = pd.DataFrame(columns = ['song name', 'artist', 'year', 'album', 'genres', 'song writer', 'lyrics', 'others'])
+    if (len(preds) > 0):
+        flag_label = preds[0]
+        #print(preds)
+        #print(words)
+        #print('@@@@@')
+        #print(flag_label)
+        row_number = -1
+        for i in range(len(preds)):
+            #print('i is: {}'.format(i))
+            if (preds[i] == flag_label):
+                row_number = row_number + 1
+                df.at[row_number, preds[i]] = words[i]
+                #print('row number is: {}'.format(row_number))
+                continue
 
-
-
-
-
-
-
-
-
-
+            else:
+                #print('row_number {} is <= of df.shape {}'.format(row_number, df.shape[0]))
+                #print(pd.isna(df[preds[i]].iloc[row_number]))
+                #print(pd.isna(df[preds[i]].iloc[row_number]))
+                if(pd.isna(df[preds[i]].iloc[row_number])):
+                    df.at[row_number, preds[i]] = words[i]
+                else:
+                    row_number = row_number + 1
+                    df.at[row_number, preds[i]] = words[i]
 
     return df
 
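A toy walk-through of the merge rule this hunk introduces: two same-label word boxes are joined when one box, widened by threshold pixels on the right, still overlaps the other box widened by threshold pixels on the left; the result is the union box and the concatenated words. All values below are illustrative (the names come from the hunk's own comment).

threshold = 30
box_a, word_a = [100, 50, 160, 70], "Juan"
box_b, word_b = [170, 50, 260, 70], "Mulian"

# widen box_a to the right and box_b to the left, then test the overlap explicitly
XMIN, YMIN, XMAX, YMAX = box_a
xmin, ymin, xmax, ymax = box_b
ox1, oy1 = max(XMIN, xmin - threshold), max(YMIN, ymin)
ox2, oy2 = min(XMAX + threshold, xmax), min(YMAX, ymax)

if ox2 > ox1 and oy2 > oy1:                      # the widened boxes overlap, so merge
    merged_box = [min(XMIN, xmin), min(YMIN, ymin), max(XMAX, xmax), max(YMAX, ymax)]
    merged_words = word_a + ' ' + word_b
    print(merged_box, merged_words)              # [100, 50, 260, 70] Juan Mulian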
@@ -288,7 +338,7 @@ def removeSimilarItems(final_bbox, final_preds, final_words):
 
 def process_form(preds, words, bboxes):
 
-    final_bbox, final_preds, final_words = mergeCloseBoxes(preds, bboxes, words,
+    final_bbox, final_preds, final_words = mergeCloseBoxes(preds, bboxes, words, 30)
     _bbox, _preds, _words = removeSimilarItems(final_bbox, final_preds, final_words)
     # convert float list to int
     _bbox = [[int(x) for x in item ] for item in _bbox]
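The createDataframe logic added in the previous hunk follows a simple row-building rule: the first label seen marks the start of a record, so each time it reappears a new row begins, and other labels fill the current row unless their cell is already occupied. A compact sketch of that rule with illustrative data:

import pandas as pd

preds = ['song name', 'artist', 'song name', 'artist']
words = ['Hello', 'Adele', 'Skyfall', 'Adele']

df = pd.DataFrame(columns=['song name', 'artist', 'year', 'album',
                           'genres', 'song writer', 'lyrics', 'others'])
row = -1
for pred, word in zip(preds, words):
    if pred == preds[0]:
        row += 1                           # the first-seen label starts a new record
        df.at[row, pred] = word
    elif pd.isna(df[pred].iloc[row]):      # cell still empty, so it belongs to the same record
        df.at[row, pred] = word
    else:                                  # cell already taken, spill into a new row
        row += 1
        df.at[row, pred] = word
print(df[['song name', 'artist']])         # two rows, one per song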
@@ -319,8 +369,6 @@ def mergeImageVertical(a):
     imgs_comb.save( 'Trifecta_vertical.jpg' )
     return imgs_comb
 
-
-
 def completepreprocess(pdffile):
     myDataFrame = pd.DataFrame()
     a=[]
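The diff only shows mergeImageVertical saving 'Trifecta_vertical.jpg', not its body, so the following is a generic PIL recipe for stacking per-page renders into one tall image, not the app's actual implementation:

from PIL import Image

def stack_vertically(paths, out_path="Trifecta_vertical.jpg"):
    imgs = [Image.open(p).convert("RGB") for p in paths]
    width = max(im.width for im in imgs)
    canvas = Image.new("RGB", (width, sum(im.height for im in imgs)), "white")
    y = 0
    for im in imgs:
        canvas.paste(im, (0, y))   # paste each page below the previous one
        y += im.height
    canvas.save(out_path)
    return canvas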
@@ -334,6 +382,8 @@ def completepreprocess(pdffile):
             images = Image.open("page"+str(i)+".jpg")
             image = images.convert("RGB")
             bbox, preds, words, image = process_image(image)
+            print(preds)
+            print(words)
             im, df = visualize_image(bbox, preds, words, image)
             im1 = im.save("page"+str(i)+".jpg")
             a.append("page"+str(i)+".jpg")
@@ -358,6 +408,7 @@ css = """.output_image, .input_image {height: 600px !important}"""
 # ["744BJQ69.PDF"], ['tarros_2.jpg'],
 #examples = [['test1.jpg'], ['doc1.pdf'], ['doc1.2.pdf']]
 
+
 iface = gr.Interface(fn=completepreprocess,
                      #inputs=gr.inputs.Image(type="pil",optional=True,label="upload file"),
                      inputs=gr.File(label="PDF"),
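For reference, a hedged sketch of how a gr.Interface like this is typically completed and launched. The output components and the launch call are illustrative, since the diff truncates the call after inputs; it assumes the completepreprocess function and css string defined earlier in app.py.

import gradio as gr

iface = gr.Interface(
    fn=completepreprocess,                       # defined earlier in app.py
    inputs=gr.File(label="PDF"),
    outputs=[gr.Image(label="annotated page"),   # illustrative output components
             gr.Dataframe(label="extracted fields")],
    css=css,
)
iface.launch()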