Spaces:
Sleeping
Sleeping
feat: host list file detection
Browse files- .gitattributes +1 -0
- apis/layoutlm.py +17 -0
- apis/qc3/host_list.py +83 -0
- app.py +77 -23
- examples/ex-invoice-1.png +0 -0
- examples/ex-invoice-2.png +0 -0
- examples/host-list1.JPG +0 -0
- examples/host-list2.JPG +0 -0
- examples/host-list3.JPG +0 -0
- requirements.txt +1 -1
- tessdata/eng_slashed_zeros.traineddata +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
*.traineddata filter=lfs diff=lfs merge=lfs -text
|
apis/layoutlm.py
CHANGED
@@ -21,6 +21,23 @@ class LayoutLM:
|
|
21 |
|
22 |
self.pipeline = self.tf_pipeline(self.pipeline_category, model=model)
|
23 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
def answer_the_question(self, img, question: str, is_debug=False):
|
25 |
score = 0
|
26 |
answer = '-'
|
|
|
21 |
|
22 |
self.pipeline = self.tf_pipeline(self.pipeline_category, model=model)
|
23 |
|
24 |
+
def answer_the_question_without_filter(self, img, question: str, is_debug=False, **kwargs):
    """Ask the pipeline a question about ``img`` and return its raw answers.

    No score filtering or post-processing is applied to the pipeline output.

    Args:
        img: Image handed to the pipeline as-is.
        question: Question text handed to the pipeline as-is.
        is_debug: When true, print a separator and the raw answers.
        **kwargs: Optional tuning values.
            top_k: Number of answers to request; defaults to 1 when
                missing or ``None``.
            max_answer_len: Maximum answer length; defaults to 15 when
                missing or ``None``.

    Returns:
        Whatever ``self.pipeline`` returns, or ``None`` when no pipeline
        has been set up.
    """
    top_k = kwargs.get('top_k')
    max_answer_len = kwargs.get('max_answer_len')

    # Coerce to int: callers such as UI sliders may hand over floats
    # (e.g. 3.0), and these values are counts/lengths.
    top_k = 1 if top_k is None else int(top_k)
    max_answer_len = 15 if max_answer_len is None else int(max_answer_len)

    answers = None
    if self.pipeline is not None:
        answers = self.pipeline(img, question,
                                top_k=top_k,
                                max_answer_len=max_answer_len)

    if is_debug:
        print('--------------------')
        print(answers)

    return answers
|
40 |
+
|
41 |
def answer_the_question(self, img, question: str, is_debug=False):
|
42 |
score = 0
|
43 |
answer = '-'
|
apis/qc3/host_list.py
ADDED
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import json
|
3 |
+
from PIL import Image
|
4 |
+
import pytesseract
|
5 |
+
|
6 |
+
class HostList:
    """OCR a "host list" image and pull host / MID / TID records out of it.

    The OCR text is scanned line by line; every line that carries a
    ``label: value`` pair contributes, in order, the ``host``, ``mid`` and
    ``tid`` of a record.  After three values a record is complete and the
    next value starts a new one.
    """

    # Order in which colon-separated values are assigned to a record.
    _FIELD_ORDER = ('host', 'mid', 'tid')

    def __init__(self, is_debug=False) -> None:
        """Store the debug flag and the Tesseract configuration string.

        Args:
            is_debug: When true, intermediate values are printed to stdout.
        """
        self.is_debug = is_debug

        # Host List Style (hlstyle) configuration for pytesseract
        # - psm means page segmentation (Ref. https://pyimagesearch.com/2021/11/15/tesseract-page-segmentation-modes-psms-explained-how-to-improve-your-ocr-accuracy/)
        # - fixed slashed zero issue with custom traineddata: https://github.com/ReceiptManager/receipt-parser-server/tree/master/tessdata
        self.hlstyle_config = r'--psm 6 --tessdata-dir ./tessdata -l eng_slashed_zeros'

    @staticmethod
    def _parse_osd(osd: str):
        """Extract the ``Rotate:`` and ``Script:`` values from OSD text.

        Returns:
            ``(angle, script)`` as strings; either element is ``None`` when
            the corresponding field is absent.
        """
        rotate = re.search(r'(?<=Rotate: )\d+', osd)
        # The script field holds a name (e.g. "Latin"), not digits, so it is
        # matched with \w+.  group(0) is used because the pattern has no
        # capture group; group(1) would raise IndexError.
        script = re.search(r'(?<=Script: )\w+', osd)
        return (rotate.group(0) if rotate else None,
                script.group(0) if script else None)

    def get_orientation(self, image: "Image.Image"):
        """Run Tesseract orientation/script detection on ``image``.

        Args:
            image: Image passed straight to ``pytesseract.image_to_osd``.

        Returns:
            ``(angle, script)`` — both strings taken from the OSD report,
            or ``None`` for a field that was not reported.
        """
        osd = pytesseract.image_to_osd(image)
        angle, script = self._parse_osd(osd)

        if self.is_debug:
            print("---------------------------------")
            print(f"angle : {angle}")
            print(f"script : {script}")

        return (angle, script)

    def post_processes(self, result: str):
        """Group colon-separated values of OCR text into host/mid/tid records.

        Args:
            result: Raw OCR text, one entry per line.

        Returns:
            The raw text, a dashed separator line, and the extracted records
            as pretty-printed JSON of the form ``{"data": [{...}, ...]}``.
        """
        records = []
        current = {}

        if self.is_debug:
            print("---------------------------------")
            print("post-processes:\n")

        for line in result.splitlines():
            if ':' not in line:
                continue

            # The value is the text between the first and the second colon.
            infos = line.split(':')[1]
            if self.is_debug:
                print(infos)

            # Ignore values without any letter or digit (OCR noise / blanks).
            if not re.search(r'[a-zA-Z0-9]', infos):
                continue

            field = self._FIELD_ORDER[len(current)]
            if field == 'host':
                # Host names keep only word characters.
                current[field] = re.sub(r'\W', '', infos)
            else:
                # MID / TID: take the longest space-separated token.
                current[field] = max(infos.split(' '), key=len)

            if len(current) == len(self._FIELD_ORDER):
                if self.is_debug:
                    print(json.dumps(current))
                records.append(current)
                current = {}

        data = {'data': records}
        if self.is_debug:
            print(json.dumps(data))

        return f'{result}\n-------------------\n{json.dumps(data, indent=2)}'

    def process_image(self, image: "Image.Image"):
        """OCR ``image`` with the host-list configuration and post-process it.

        Args:
            image: Image to read; ``None`` (nothing supplied) yields ``''``.

        Returns:
            The string produced by :meth:`post_processes`.
        """
        if image is None:
            # Nothing to OCR; avoid failing inside pytesseract.
            return ''

        text = pytesseract.image_to_string(image, config=self.hlstyle_config)
        return self.post_processes(text)
|
app.py
CHANGED
@@ -1,9 +1,12 @@
|
|
1 |
from apis.layoutlm import LayoutLM
|
|
|
|
|
2 |
import pandas as pd
|
3 |
import gradio as gr
|
4 |
import os
|
5 |
|
6 |
layoutlm = None
|
|
|
7 |
|
8 |
def auth(username, password):
|
9 |
u = os.environ.get('USERNAME')
|
@@ -13,8 +16,31 @@ def auth(username, password):
|
|
13 |
def inference(img) -> pd.DataFrame:
|
14 |
return layoutlm.inference(img)
|
15 |
|
16 |
-
def
|
17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
|
19 |
if __name__ == '__main__':
|
20 |
|
@@ -24,26 +50,54 @@ if __name__ == '__main__':
|
|
24 |
|
25 |
with gr.Blocks() as demo:
|
26 |
|
27 |
-
with gr.
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
except Exception as e:
|
49 |
print(str(e))
|
|
|
1 |
from apis.layoutlm import LayoutLM
|
2 |
+
from apis.qc3.host_list import HostList
|
3 |
+
from PIL import Image
|
4 |
import pandas as pd
|
5 |
import gradio as gr
|
6 |
import os
|
7 |
|
8 |
layoutlm = None
|
9 |
+
hl = HostList(is_debug=True)
|
10 |
|
11 |
def auth(username, password):
|
12 |
u = os.environ.get('USERNAME')
|
|
|
16 |
def inference(img) -> pd.DataFrame:
    """Delegate to the module-level ``layoutlm`` object's ``inference``."""
    table = layoutlm.inference(img)
    return table
|
18 |
|
19 |
+
def filter_green_out(img: "Image.Image", threshold: int = 70):
    """Binarize ``img``: dark pixels become black, everything else white.

    A pixel counts as dark when its red, green and blue values are all
    below ``threshold``.  The result is also written to ``./temp.jpg``.

    Args:
        img: Image to process.  An RGB image is modified in place; any
            other mode is converted to RGB first, so a new image is
            processed and returned.
        threshold: Exclusive upper bound per channel for a dark pixel.

    Returns:
        The processed image.
    """
    # Pixel access below unpacks exactly three channels.
    if img.mode != 'RGB':
        img = img.convert('RGB')

    pixels = img.load()
    width, height = img.size  # Pillow reports size as (width, height)

    for x in range(width):
        for y in range(height):
            r, g, b = pixels[x, y]
            is_dark = r < threshold and g < threshold and b < threshold
            pixels[x, y] = (0, 0, 0) if is_dark else (255, 255, 255)

    img.save('./temp.jpg')
    return img
|
33 |
+
|
34 |
+
def ask(img: "Image.Image", question, top_k, max_answer_len, chk_is_remove_green):
    """Answer a free-form question about ``img`` with the layout model.

    Args:
        img: Image to query; ``None`` (no image supplied) yields ``''``.
        question: Question text forwarded to the model.
        top_k: Number of answers to request.
        max_answer_len: Maximum answer length to request.
        chk_is_remove_green: When true, a copy of the image is run through
            ``filter_green_out`` before being queried.

    Returns:
        The value returned by
        ``layoutlm.answer_the_question_without_filter`` (raw model output),
        or ``''`` when there is no image.
    """
    if img is None:
        # Nothing to analyse; img.copy() below would raise AttributeError.
        return ''

    if chk_is_remove_green:
        # Work on a copy so the caller's image is left untouched.
        img = filter_green_out(img.copy())

    return layoutlm.answer_the_question_without_filter(
        img,
        question,
        top_k=top_k,
        max_answer_len=max_answer_len,
        is_debug=True)
|
44 |
|
45 |
if __name__ == '__main__':
|
46 |
|
|
|
50 |
|
51 |
with gr.Blocks() as demo:
|
52 |
|
53 |
+
with gr.Tab('List'):
|
54 |
+
with gr.Row():
|
55 |
+
with gr.Column():
|
56 |
+
list_inp_img = gr.Image(type="pil")
|
57 |
+
# One row per example; each row holds one value per input component.
# There is a single input (list_inp_img), so every row has exactly one path.
gr.Examples(
    [
        ['./examples/host-list1.JPG'],
        ['./examples/host-list2.JPG'],
        ['./examples/host-list3.JPG'],
    ],
    list_inp_img
)
|
61 |
+
|
62 |
+
with gr.Column():
|
63 |
+
list_out_txt = gr.Textbox(label='Answer', interactive=False)
|
64 |
+
|
65 |
+
list_btn_ask = gr.Button('Ask me')
|
66 |
+
list_btn_ask.click(hl.process_image, [
|
67 |
+
list_inp_img
|
68 |
+
], list_out_txt)
|
69 |
+
|
70 |
+
with gr.Tab('Layout'):
|
71 |
+
with gr.Row():
|
72 |
+
inp_img = gr.Image(type='pil')
|
73 |
+
|
74 |
+
with gr.Column():
|
75 |
+
out = gr.Dataframe(
|
76 |
+
headers=['Data', 'Value'],
|
77 |
+
datatype=['str', 'str'],
|
78 |
+
row_count=8,
|
79 |
+
col_count=(2, 'fixed'),
|
80 |
+
interactive=False
|
81 |
+
)
|
82 |
+
|
83 |
+
txt_custom_question = gr.Textbox(label='Your question')
|
84 |
+
sld_max_answer = gr.Slider(1, 10, value=1, step=1, label="Max answer", info="Top-K between 1 and 10")
|
85 |
+
sld_max_answer_len = gr.Slider(1, 200, value=15, step=1, label="Max answer length", info="Length between 15 and 200")
|
86 |
+
chk_is_remove_green = gr.Checkbox(label="Remove green", info="Do you need clean context?")
|
87 |
+
btn_ask = gr.Button('Ask me')
|
88 |
+
txt_out_answer = gr.Textbox(label='Answer', interactive=False)
|
89 |
+
|
90 |
+
# event
|
91 |
+
inp_img.change(inference, inp_img, out)
|
92 |
+
btn_ask.click(ask, [
|
93 |
+
inp_img,
|
94 |
+
txt_custom_question,
|
95 |
+
sld_max_answer,
|
96 |
+
sld_max_answer_len,
|
97 |
+
chk_is_remove_green
|
98 |
+
], txt_out_answer)
|
99 |
+
|
100 |
+
#demo.launch(auth=auth)
|
101 |
+
demo.launch()
|
102 |
except Exception as e:
|
103 |
print(str(e))
|
examples/ex-invoice-1.png
ADDED
![]() |
examples/ex-invoice-2.png
ADDED
![]() |
examples/host-list1.JPG
ADDED
|
examples/host-list2.JPG
ADDED
|
examples/host-list3.JPG
ADDED
|
requirements.txt
CHANGED
@@ -4,4 +4,4 @@ torch==2.2.0
|
|
4 |
pytesseract==0.3.10
|
5 |
Pillow==10.0
|
6 |
gradio==4.19.0
|
7 |
-
pandas==2.2.0
|
|
|
4 |
pytesseract==0.3.10
|
5 |
Pillow==10.0
|
6 |
gradio==4.19.0
|
7 |
+
pandas==2.2.0
|
tessdata/eng_slashed_zeros.traineddata
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9d3edb3b6bddf3d15d80b36ad9c1203d1289ead1e6b9d4bbb006357a267a2b3a
|
3 |
+
size 15858079
|