KevinHuSh committed
Commit: 96a1a44
Parent(s): 04aba1b

add paper & manual parser (#46)

Files changed:
- rag/app/__init__.py +22 -0
- rag/app/laws.py +7 -21
- rag/app/manual.py +140 -0
- rag/app/paper.py +240 -0
- rag/app/presentation.py +8 -8
- rag/nlp/__init__.py +4 -1
- rag/parser/pdf_parser.py +96 -63
rag/app/__init__.py
CHANGED
@@ -1,5 +1,9 @@
 import re
 
+from nltk import word_tokenize
+
+from rag.nlp import stemmer, huqie
+
 
 def callback__(progress, msg, func):
     if not func :return
@@ -46,3 +50,21 @@ def bullets_category(sections):
             res = i
             maxium = h
     return res
+
+def is_english(texts):
+    eng = 0
+    for t in texts:
+        if re.match(r"[a-zA-Z]", t.strip()):
+            eng += 1
+    if eng / len(texts) > 0.8:
+        return True
+    return False
+
+def tokenize(d, t, eng):
+    d["content_with_weight"] = t
+    if eng:
+        t = re.sub(r"([a-z])-([a-z])", r"\1\2", t)
+        d["content_ltks"] = " ".join([stemmer.stem(w) for w in word_tokenize(t)])
+    else:
+        d["content_ltks"] = huqie.qie(t)
+    d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
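Review note: the two helpers above become the shared tokenization path for every app parser in this commit. A minimal smoke test of how they compose (the sample strings are invented; it assumes `rag.app` imports cleanly and huqie's dictionaries are available):

```python
# Hypothetical usage of the helpers added above; sample texts are made up.
from rag.app import is_english, tokenize

texts = ["Deep learning", "Attention is all you need", "卷积神经网络"]
print(is_english(texts))  # False: only 2/3 lines start with a Latin letter, below the 0.8 bar

d = {}
tokenize(d, "state-of-the-art results", eng=True)
# d now carries the raw text ("content_with_weight"), stemmed and
# de-hyphenated tokens ("content_ltks"), and huqie's fine-grained
# re-segmentation of those tokens ("content_sm_ltks").
print(d["content_ltks"])
```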
rag/app/laws.py
CHANGED
@@ -3,12 +3,13 @@ import re
 from io import BytesIO
 from docx import Document
 import numpy as np
-from rag.app import callback__, bullets_category, BULLET_PATTERN
+from rag.app import callback__, bullets_category, BULLET_PATTERN, is_english, tokenize
 from rag.nlp import huqie
+from rag.parser.docx_parser import HuDocxParser
 from rag.parser.pdf_parser import HuParser
 
 
-class Docx(
+class Docx(HuDocxParser):
     def __init__(self):
         pass
 
@@ -42,14 +43,7 @@ class Pdf(HuParser):
         print("paddle layouts:", timer()-start)
         bxs = self.sort_Y_firstly(self.boxes, np.median(self.mean_height) / 3)
         # is it English
-        eng = 0
-        for b in bxs:
-            if re.match(r"[a-zA-Z]", b["text"].strip()):
-                eng += 1
-        if eng / len(bxs) > 0.8:
-            eng = True
-        else:
-            eng = False
+        eng = is_english([b["text"] for b in bxs])
         # Merge vertically
         i = 0
         while i + 1 < len(bxs):
@@ -59,7 +53,7 @@ class Pdf(HuParser):
                 bxs.pop(i)
                 continue
             concatting_feats = [
-                b["text"].strip()[-1] in ",;:'\"
+                b["text"].strip()[-1] in ",;:'\",、‘“;:-",
                 len(b["text"].strip())>1 and b["text"].strip()[-2] in ",;:'\",‘“、;:",
                 b["text"].strip()[0] in "。;?!?”)),,、:",
             ]
@@ -118,14 +112,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
     sections = [l for l in sections if l]
 
     # is it English
-    eng = 0
-    for sec in sections:
-        if re.match(r"[a-zA-Z]", sec.strip()):
-            eng += 1
-    if eng / len(sections) > 0.8:
-        eng = True
-    else:
-        eng = False
+    eng = is_english(sections)
     # Remove 'Contents' part
     i = 0
     while i < len(sections):
@@ -181,8 +168,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
         if pdf_parser:
             d["image"] = pdf_parser.crop(ck)
             ck = pdf_parser.remove_tag(ck)
-        d["content_ltks"] = huqie.qie(ck)
-        d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
+        tokenize(d, ck, eng)
         res.append(d)
     return res
 
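Review note: laws.py is now just a consumer of the shared helpers. A hedged sketch of driving its `chunk()` with a progress callback (the filename is a placeholder, and the `(progress, msg)` callback signature is inferred from how `callback__` is invoked throughout this commit):

```python
# Hypothetical driver for the laws chunker; "statute.pdf" is a placeholder.
from rag.app import laws

def on_progress(prog, msg):
    # callback__ forwards (fraction, message) pairs such as
    # "Page 0~16: OCR finished" from inside the parser.
    print(f"{prog:.0%} {msg}")

for d in laws.chunk("statute.pdf", callback=on_progress)[:3]:
    print(d["content_with_weight"][:80])
```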
rag/app/manual.py
ADDED
@@ -0,0 +1,140 @@
+import copy
+import re
+from collections import Counter
+from rag.app import callback__, bullets_category, BULLET_PATTERN, is_english, tokenize
+from rag.nlp import huqie, stemmer
+from rag.parser.docx_parser import HuDocxParser
+from rag.parser.pdf_parser import HuParser
+from nltk.tokenize import word_tokenize
+import numpy as np
+from rag.utils import num_tokens_from_string
+
+
+class Pdf(HuParser):
+    def __call__(self, filename, binary=None, from_page=0,
+                 to_page=100000, zoomin=3, callback=None):
+        self.__images__(
+            filename if not binary else binary,
+            zoomin,
+            from_page,
+            to_page)
+        callback__((min(to_page, self.total_page) - from_page) / self.total_page / 4,
+                   "Page {}~{}: OCR finished".format(from_page, min(to_page, self.total_page)), callback)
+
+        from timeit import default_timer as timer
+        start = timer()
+        self._layouts_paddle(zoomin)
+        callback__((min(to_page, self.total_page) - from_page) / self.total_page / 4,
+                   "Page {}~{}: Layout analysis finished".format(from_page, min(to_page, self.total_page)), callback)
+        print("paddle layouts:", timer() - start)
+        self._table_transformer_job(zoomin)
+        callback__((min(to_page, self.total_page) - from_page) / self.total_page / 4,
+                   "Page {}~{}: Table analysis finished".format(from_page, min(to_page, self.total_page)), callback)
+        self._text_merge()
+        column_width = np.median([b["x1"] - b["x0"] for b in self.boxes])
+        self._concat_downward(concat_between_pages=False)
+        self._filter_forpages()
+        callback__((min(to_page, self.total_page) - from_page) / self.total_page / 4,
+                   "Page {}~{}: Text merging finished".format(from_page, min(to_page, self.total_page)), callback)
+        tbls = self._extract_table_figure(True, zoomin, False)
+
+        # clean mess
+        for b in self.boxes:
+            b["text"] = re.sub(r"([\t ]|\u3000){2,}", " ", b["text"].strip())
+
+        # merge chunks with the same bullets
+        i = 0
+        while i + 1 < len(self.boxes):
+            b = self.boxes[i]
+            b_ = self.boxes[i + 1]
+            if b["text"].strip()[0] != b_["text"].strip()[0] \
+                    or b["page_number"]!=b_["page_number"] \
+                    or b["top"] > b_["bottom"]:
+                i += 1
+                continue
+            b_["text"] = b["text"] + "\n" + b_["text"]
+            b_["x0"] = min(b["x0"], b_["x0"])
+            b_["x1"] = max(b["x1"], b_["x1"])
+            b_["top"] = b["top"]
+            self.boxes.pop(i)
+        # merge title with decent chunk
+        i = 0
+        while i + 1 < len(self.boxes):
+            b = self.boxes[i]
+            if b.get("layoutno","").find("title") < 0:
+                i += 1
+                continue
+            b_ = self.boxes[i + 1]
+            b_["text"] = b["text"] + "\n" + b_["text"]
+            b_["x0"] = min(b["x0"], b_["x0"])
+            b_["x1"] = max(b["x1"], b_["x1"])
+            b_["top"] = b["top"]
+            self.boxes.pop(i)
+
+        for b in self.boxes: print(b["text"], b.get("layoutno"))
+
+        print(tbls)
+        return [b["text"] + self._line_tag(b, zoomin) for b in self.boxes], tbls
+
+
+def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
+    pdf_parser = None
+    paper = {}
+
+    if re.search(r"\.pdf$", filename, re.IGNORECASE):
+        pdf_parser = Pdf()
+        cks, tbls = pdf_parser(filename if not binary else binary,
+                               from_page=from_page, to_page=to_page, callback=callback)
+    doc = {
+        "docnm_kwd": filename
+    }
+    doc["title_tks"] = huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"]))
+    doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
+    # is it English
+    eng = pdf_parser.is_english
+
+    res = []
+    # add tables
+    for img, rows in tbls:
+        bs = 10
+        de = ";" if eng else ";"
+        for i in range(0, len(rows), bs):
+            d = copy.deepcopy(doc)
+            r = de.join(rows[i:i + bs])
+            r = re.sub(r"\t——(来自| in ).*”%s" % de, "", r)
+            tokenize(d, r, eng)
+            d["image"] = img
+            res.append(d)
+
+    i = 0
+    chunk = []
+    tk_cnt = 0
+    def add_chunk():
+        nonlocal chunk, res, doc, pdf_parser, tk_cnt
+        d = copy.deepcopy(doc)
+        ck = "\n".join(chunk)
+        tokenize(d, pdf_parser.remove_tag(ck), pdf_parser.is_english)
+        d["image"] = pdf_parser.crop(ck)
+        res.append(d)
+        chunk = []
+        tk_cnt = 0
+
+    while i < len(cks):
+        if tk_cnt > 128: add_chunk()
+        txt = cks[i]
+        txt_ = pdf_parser.remove_tag(txt)
+        i += 1
+        cnt = num_tokens_from_string(txt_)
+        chunk.append(txt)
+        tk_cnt += cnt
+    if chunk: add_chunk()
+    for i, d in enumerate(res):
+        print(d)
+        # d["image"].save(f"./logs/{i}.jpg")
+    return res
+
+
+if __name__ == "__main__":
+    import sys
+
+    chunk(sys.argv[1])
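Review note: the body chunker above packs OCR'd lines greedily until roughly 128 tokens accumulate, then flushes via `add_chunk()`. The policy in isolation (with a whitespace split standing in for `rag.utils.num_tokens_from_string`, which this diff does not show):

```python
# Standalone sketch of manual.py's greedy token-budget packing.
def num_tokens(s: str) -> int:
    # Stand-in tokenizer; the real code uses rag.utils.num_tokens_from_string.
    return len(s.split())

def pack(lines, budget=128):
    chunks, cur, cnt = [], [], 0
    for ln in lines:
        if cnt > budget:              # flush once the budget is exceeded
            chunks.append("\n".join(cur))
            cur, cnt = [], 0
        cur.append(ln)
        cnt += num_tokens(ln)
    if cur:                           # flush the tail
        chunks.append("\n".join(cur))
    return chunks

print(len(pack(["word " * 100, "word " * 50, "tail line"])))  # 2
```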
rag/app/paper.py
ADDED
@@ -0,0 +1,240 @@
+import copy
+import re
+from collections import Counter
+from rag.app import callback__, bullets_category, BULLET_PATTERN, is_english, tokenize
+from rag.nlp import huqie, stemmer
+from rag.parser.docx_parser import HuDocxParser
+from rag.parser.pdf_parser import HuParser
+from nltk.tokenize import word_tokenize
+import numpy as np
+from rag.utils import num_tokens_from_string
+
+
+class Pdf(HuParser):
+    def __call__(self, filename, binary=None, from_page=0,
+                 to_page=100000, zoomin=3, callback=None):
+        self.__images__(
+            filename if not binary else binary,
+            zoomin,
+            from_page,
+            to_page)
+        callback__((min(to_page, self.total_page) - from_page) / self.total_page / 4,
+                   "Page {}~{}: OCR finished".format(from_page, min(to_page, self.total_page)), callback)
+
+        from timeit import default_timer as timer
+        start = timer()
+        self._layouts_paddle(zoomin)
+        callback__((min(to_page, self.total_page) - from_page) / self.total_page / 4,
+                   "Page {}~{}: Layout analysis finished".format(from_page, min(to_page, self.total_page)), callback)
+        print("paddle layouts:", timer() - start)
+        self._table_transformer_job(zoomin)
+        callback__((min(to_page, self.total_page) - from_page) / self.total_page / 4,
+                   "Page {}~{}: Table analysis finished".format(from_page, min(to_page, self.total_page)), callback)
+        self._text_merge()
+        column_width = np.median([b["x1"] - b["x0"] for b in self.boxes])
+        self._concat_downward(concat_between_pages=False)
+        self._filter_forpages()
+        callback__((min(to_page, self.total_page) - from_page) / self.total_page / 4,
+                   "Page {}~{}: Text merging finished".format(from_page, min(to_page, self.total_page)), callback)
+        tbls = self._extract_table_figure(True, zoomin, False)
+
+        # clean mess
+        if column_width < self.page_images[0].size[0] / zoomin / 2:
+            print("two_column...................", column_width,
+                  self.page_images[0].size[0] / zoomin / 2)
+            self.boxes = self.sort_X_by_page(self.boxes, column_width / 2)
+        for b in self.boxes:
+            b["text"] = re.sub(r"([\t ]|\u3000){2,}", " ", b["text"].strip())
+        freq = Counter([b["text"] for b in self.boxes])
+        garbage = set([k for k, v in freq.items() if v > self.total_page * 0.6])
+        i = 0
+        while i < len(self.boxes):
+            if self.boxes[i]["text"] in garbage \
+                    or (re.match(r"[a-zA-Z0-9]+$", self.boxes[i]["text"]) and not self.boxes[i].get("layoutno")) \
+                    or (i + 1 < len(self.boxes) and self.boxes[i]["text"] == self.boxes[i + 1]["text"]):
+                self.boxes.pop(i)
+            elif i + 1 < len(self.boxes) and self.boxes[i].get("layoutno", '0') == self.boxes[i + 1].get("layoutno",
+                                                                                                         '1'):
+                # merge within same layouts
+                self.boxes[i + 1]["top"] = self.boxes[i]["top"]
+                self.boxes[i + 1]["x0"] = min(self.boxes[i]["x0"], self.boxes[i + 1]["x0"])
+                self.boxes[i + 1]["x1"] = max(self.boxes[i]["x1"], self.boxes[i + 1]["x1"])
+                self.boxes[i + 1]["text"] = self.boxes[i]["text"] + " " + self.boxes[i + 1]["text"]
+                self.boxes.pop(i)
+            else:
+                i += 1
+
+        def _begin(txt):
+            return re.match(
+                "[0-9. 一、i]*(introduction|abstract|摘要|引言|keywords|key words|关键词|background|背景|目录|前言|contents)",
+                txt.lower().strip())
+
+        # get title and authors
+        title = ""
+        authors = []
+        i = 0
+        while i < min(32, len(self.boxes)):
+            b = self.boxes[i]
+            i += 1
+            if b.get("layoutno", "").find("title") >= 0:
+                title = b["text"]
+                if _begin(title):
+                    title = ""
+                    break
+                for j in range(3):
+                    if _begin(self.boxes[i + j]["text"]): break
+                    authors.append(self.boxes[i + j]["text"])
+                    break
+                break
+        # get abstract
+        abstr = ""
+        i = 0
+        while i + 1 < min(32, len(self.boxes)):
+            b = self.boxes[i]
+            i += 1
+            txt = b["text"].lower().strip()
+            if re.match("(abstract|摘要)", txt):
+                if len(txt.split(" ")) > 32 or len(txt) > 64:
+                    abstr = txt + self._line_tag(b, zoomin)
+                    i += 1
+                    break
+                txt = self.boxes[i + 1]["text"].lower().strip()
+                if len(txt.split(" ")) > 32 or len(txt) > 64:
+                    abstr = txt + self._line_tag(self.boxes[i + 1], zoomin)
+                    i += 1
+                    break
+        if not abstr: i = 0
+
+        for b in self.boxes: print(b["text"], b.get("layoutno"))
+        print(tbls)
+
+        return {
+            "title": title if title else filename,
+            "authors": " ".join(authors),
+            "abstract": abstr,
+            "lines": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes[i:] if
+                      re.match(r"(text|title)", b.get("layoutno", "text"))],
+            "tables": tbls
+        }
+
+
+def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
+    pdf_parser = None
+    paper = {}
+
+    if re.search(r"\.pdf$", filename, re.IGNORECASE):
+        pdf_parser = Pdf()
+        paper = pdf_parser(filename if not binary else binary,
+                           from_page=from_page, to_page=to_page, callback=callback)
+    doc = {
+        "docnm_kwd": paper["title"] if paper["title"] else filename,
+        "authors_tks": paper["authors"]
+    }
+    doc["title_tks"] = huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"]))
+    doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
+    doc["authors_sm_tks"] = huqie.qieqie(doc["authors_tks"])
+    # is it English
+    eng = pdf_parser.is_english
+    print("It's English.....", eng)
+
+    res = []
+    # add tables
+    for img, rows in paper["tables"]:
+        bs = 10
+        de = ";" if eng else ";"
+        for i in range(0, len(rows), bs):
+            d = copy.deepcopy(doc)
+            r = de.join(rows[i:i + bs])
+            r = re.sub(r"\t——(来自| in ).*”%s" % de, "", r)
+            tokenize(d, r, eng)
+            d["image"] = img
+            res.append(d)
+
+    if paper["abstract"]:
+        d = copy.deepcopy(doc)
+        txt = pdf_parser.remove_tag(paper["abstract"])
+        d["important_kwd"] = ["abstract", "总结", "概括", "summary", "summarize"]
+        d["important_tks"] = " ".join(d["important_kwd"])
+        d["image"] = pdf_parser.crop(paper["abstract"])
+        tokenize(d, txt, eng)
+        res.append(d)
+
+    readed = [0] * len(paper["lines"])
+    # find colon firstly
+    i = 0
+    while i + 1 < len(paper["lines"]):
+        txt = pdf_parser.remove_tag(paper["lines"][i][0])
+        j = i
+        if txt.strip("\n").strip()[-1] not in "::":
+            i += 1
+            continue
+        i += 1
+        while i < len(paper["lines"]) and not paper["lines"][i][0]:
+            i += 1
+        if i >= len(paper["lines"]): break
+        proj = [paper["lines"][i][0].strip()]
+        i += 1
+        while i < len(paper["lines"]) and paper["lines"][i][0].strip()[0] == proj[-1][0]:
+            proj.append(paper["lines"][i])
+            i += 1
+        for k in range(j, i): readed[k] = True
+        txt = txt[::-1]
+        if eng:
+            r = re.search(r"(.*?) ([\.;?!]|$)", txt)
+            txt = r.group(1)[::-1] if r else txt[::-1]
+        else:
+            r = re.search(r"(.*?) ([。?;!]|$)", txt)
+            txt = r.group(1)[::-1] if r else txt[::-1]
+        for p in proj:
+            d = copy.deepcopy(doc)
+            txt += "\n" + pdf_parser.remove_tag(p)
+            d["image"] = pdf_parser.crop(p)
+            tokenize(d, txt, eng)
+            res.append(d)
+
+    i = 0
+    chunk = []
+    tk_cnt = 0
+    def add_chunk():
+        nonlocal chunk, res, doc, pdf_parser, tk_cnt
+        d = copy.deepcopy(doc)
+        ck = "\n".join(chunk)
+        tokenize(d, pdf_parser.remove_tag(ck), pdf_parser.is_english)
+        d["image"] = pdf_parser.crop(ck)
+        res.append(d)
+        chunk = []
+        tk_cnt = 0
+
+    while i < len(paper["lines"]):
+        if tk_cnt > 128:
+            add_chunk()
+        if readed[i]:
+            i += 1
+            continue
+        readed[i] = True
+        txt, layouts = paper["lines"][i]
+        txt_ = pdf_parser.remove_tag(txt)
+        i += 1
+        cnt = num_tokens_from_string(txt_)
+        if any([
+            layouts.find("title") >= 0 and chunk,
+            cnt + tk_cnt > 128 and tk_cnt > 32,
+        ]):
+            add_chunk()
+            chunk = [txt]
+            tk_cnt = cnt
+        else:
+            chunk.append(txt)
+            tk_cnt += cnt
+
+    if chunk: add_chunk()
+    for i, d in enumerate(res):
+        print(d)
+        # d["image"].save(f"./logs/{i}.jpg")
+    return res
+
+
+if __name__ == "__main__":
+    import sys
+
+    chunk(sys.argv[1])
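Review note: the paper chunker emits records in a fixed order: table rows, then the abstract (tagged with retrieval-boosting keywords), then colon-introduced enumerations, then ~128-token body chunks that restart at each detected section title. A hypothetical driver ("paper.pdf" is a placeholder filename):

```python
# Hypothetical driver; "paper.pdf" is a placeholder.
from rag.app import paper

for d in paper.chunk("paper.pdf", from_page=0, to_page=8)[:3]:
    # Abstract chunks carry d["important_kwd"]; body chunks do not.
    print(d.get("important_kwd"), d["content_with_weight"][:60])
```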
rag/app/presentation.py
CHANGED
@@ -3,7 +3,7 @@ import re
 from io import BytesIO
 from pptx import Presentation
 
-from rag.app import callback__
+from rag.app import callback__, tokenize, is_english
 from rag.nlp import huqie
 from rag.parser.pdf_parser import HuParser
 
@@ -57,7 +57,7 @@ class Ppt(object):
         assert len(imgs) == len(txts), "Slides text and image do not match: {} vs. {}".format(len(imgs), len(txts))
         callback__((min(to_page, self.total_page) - from_page) / self.total_page,
                    "Page {}~{}: Image extraction finished".format(from_page, min(to_page, self.total_page)), callback)
-
+        self.is_english = is_english(txts)
         return [(txts[i], imgs[i]) for i in range(len(txts))]
 
 
@@ -103,19 +103,19 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
     doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
     res = []
     if re.search(r"\.pptx?$", filename, re.IGNORECASE):
-        for txt,img in Ppt()(filename if not binary else binary, from_page, to_page, callback):
+        ppt_parser = Ppt()
+        for txt,img in ppt_parser(filename if not binary else binary, from_page, to_page, callback):
             d = copy.deepcopy(doc)
-            d["content_ltks"] = huqie.qie(txt)
-            d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
             d["image"] = img
+            tokenize(d, txt, ppt_parser.is_english)
             res.append(d)
         return res
     if re.search(r"\.pdf$", filename, re.IGNORECASE):
-        for txt,img in Pdf()(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback):
+        pdf_parser = Pdf()
+        for txt,img in pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback):
             d = copy.deepcopy(doc)
-            d["content_ltks"] = huqie.qie(txt)
-            d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
             d["image"] = img
+            tokenize(d, txt, pdf_parser.is_english)
             res.append(d)
         return res
     callback__(-1, "This kind of presentation document did not support yet!", callback)
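Review note: both branches now defer language detection to the parser instance (`ppt_parser.is_english` / `pdf_parser.is_english`) so tokenization matches the deck's language. A hedged usage sketch ("deck.pptx" is a placeholder; the callback signature is inferred as above):

```python
# Hypothetical driver; "deck.pptx" is a placeholder filename.
from rag.app import presentation

for d in presentation.chunk("deck.pptx", callback=lambda p, m: print(p, m)):
    # One record per slide: a thumbnail image plus the tokenized slide text.
    print(d["image"].size, d["content_ltks"][:50])
```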
rag/nlp/__init__.py
CHANGED
@@ -1,4 +1,7 @@
 from . import search
 from rag.utils import ELASTICSEARCH
 
 retrievaler = search.Dealer(ELASTICSEARCH)
+
+from nltk.stem import PorterStemmer
+stemmer = PorterStemmer()
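Review note: the module-level PorterStemmer is what `tokenize()` in rag/app/__init__.py leans on for the English path. Its behavior, for reference (nltk's stock API; no corpus download is needed for the stemmer itself):

```python
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
print(stemmer.stem("running"), stemmer.stem("parsers"))  # run parser
```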
rag/parser/pdf_parser.py
CHANGED
@@ -1,4 +1,6 @@
 # -*- coding: utf-8 -*-
+import random
+
 import fitz
 import xgboost as xgb
 from io import BytesIO
@@ -14,6 +16,7 @@ from copy import deepcopy
 from rag.cv.table_recognize import TableTransformer
 from rag.cv.ppdetection import PPDet
 from huggingface_hub import hf_hub_download
+
 logging.getLogger("pdfminer").setLevel(logging.WARNING)
 
 
@@ -22,8 +25,8 @@ class HuParser:
         from paddleocr import PaddleOCR
         logging.getLogger("ppocr").setLevel(logging.ERROR)
         self.ocr = PaddleOCR(use_angle_cls=False, lang="ch")
-        self.layouter = PPDet()
-        self.tbl_det =
+        self.layouter = PPDet("/data/newpeak/medical-gpt/res/ppdet")
+        self.tbl_det = PPDet("/data/newpeak/medical-gpt/res/ppdet.tbl")
 
         self.updown_cnt_mdl = xgb.Booster()
         if torch.cuda.is_available():
@@ -55,7 +58,7 @@ class HuParser:
     def _y_dis(
             self, a, b):
         return (
+            b["top"] + b["bottom"] - a["top"] - a["bottom"]) / 2
 
     def _match_proj(self, b):
         proj_patt = [
@@ -78,9 +81,9 @@ class HuParser:
         tks_down = huqie.qie(down["text"][:LEN]).split(" ")
         tks_up = huqie.qie(up["text"][-LEN:]).split(" ")
         tks_all = up["text"][-LEN:].strip() \
+            + (" " if re.match(r"[a-zA-Z0-9]+",
+                               up["text"][-1] + down["text"][0]) else "") \
+            + down["text"][:LEN].strip()
         tks_all = huqie.qie(tks_all).split(" ")
         fea = [
             up.get("R", -1) == down.get("R", -1),
@@ -102,7 +105,7 @@ class HuParser:
             True if re.search(r"[,,][^。.]+$", up["text"]) else False,
             True if re.search(r"[,,][^。.]+$", up["text"]) else False,
             True if re.search(r"[\((][^\))]+$", up["text"])
+            and re.search(r"[\))]", down["text"]) else False,
             self._match_proj(down),
             True if re.match(r"[A-Z]", down["text"]) else False,
             True if re.match(r"[A-Z]", up["text"][-1]) else False,
@@ -141,6 +144,21 @@ class HuParser:
                 arr[j + 1] = deepcopy(tmp)
         return arr
 
+    @staticmethod
+    def sort_X_by_page(arr, threashold):
+        # sort using y1 first and then x1
+        arr = sorted(arr, key=lambda r: (r["page_number"], r["x0"], r["top"]))
+        for i in range(len(arr) - 1):
+            for j in range(i, -1, -1):
+                # restore the order using th
+                if abs(arr[j + 1]["x0"] - arr[j]["x0"]) < threashold \
+                        and arr[j + 1]["top"] < arr[j]["top"]\
+                        and arr[j + 1]["page_number"] == arr[j]["page_number"]:
+                    tmp = arr[j]
+                    arr[j] = arr[j + 1]
+                    arr[j + 1] = tmp
+        return arr
+
     @staticmethod
     def sort_R_firstly(arr, thr=0):
         # sort using y1 first and then x1
@@ -219,7 +237,7 @@ class HuParser:
         assert tp_ <= btm_, "Fuckedup! T:{},B:{},X0:{},X1:{} => {}".format(
             tp, btm, x0, x1, b)
         ov = (btm_ - tp_) * (x1_ - x0_) if x1 - \
+            x0 != 0 and btm - tp != 0 else 0
         if ov > 0 and ratio:
             ov /= (x1 - x0) * (btm - tp)
         return ov
@@ -326,7 +344,7 @@ class HuParser:
         return layouts
 
     def __table_paddle(self, images):
-        tbls = self.tbl_det([img for img in images],
+        tbls = self.tbl_det([np.array(img) for img in images], thr=0.5)
         res = []
         # align left&right for rows, align top&bottom for columns
         for tbl in tbls:
@@ -384,7 +402,7 @@ class HuParser:
                 continue
             for tb in tbls:  # for table
                 left, top, right, bott = tb["x0"] - MARGIN, tb["top"] - MARGIN, \
+                    tb["x1"] + MARGIN, tb["bottom"] + MARGIN
                 left *= ZM
                 top *= ZM
                 right *= ZM
@@ -482,10 +500,13 @@ class HuParser:
                 continue
             ch = c["bottom"] - c["top"]
             bh = bxs[ii]["bottom"] - bxs[ii]["top"]
-            if abs(ch - bh) / max(ch, bh) >= 0.7:
+            if abs(ch - bh) / max(ch, bh) >= 0.7 and c["text"] != ' ':
                 self.lefted_chars.append(c)
                 continue
+            if c["text"] == " " and bxs[ii]["text"]:
+                if re.match(r"[0-9a-zA-Z,.?;:!%%]", bxs[ii]["text"][-1]): bxs[ii]["text"] += " "
+            else:
+                bxs[ii]["text"] += c["text"]
 
         for b in bxs:
             if not b["text"]:
@@ -629,7 +650,7 @@ class HuParser:
             i += 1
         self.boxes = bxs
 
-    def _concat_downward(self):
+    def _concat_downward(self, concat_between_pages=True):
        # count boxes in the same row as a feature
        for i in range(len(self.boxes)):
            mh = self.mean_height[self.boxes[i]["page_number"] - 1]
@@ -665,6 +686,8 @@ class HuParser:
             if not smpg and ydis > mh * 16:
                 break
             down = boxes[i]
+            if not concat_between_pages and down["page_number"] > up["page_number"]:
+                break
 
             if up.get("R", "") != down.get(
                     "R", "") and up["text"][-1] != ",":
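Review note: `sort_X_by_page` is the new ordering primitive behind paper.py's two-column handling: a global (page, x0, top) sort puts the columns side by side, then a bounded bubble pass restores top-down order for boxes whose x0 differ by less than the threshold, i.e. boxes in the same column. A toy check of that idea (box data invented; `threashold` keeps the committed spelling):

```python
# Toy re-creation of the sort_X_by_page ordering idea.
boxes = [
    {"page_number": 1, "x0": 50, "top": 300, "text": "left column, lower"},
    {"page_number": 1, "x0": 52, "top": 100, "text": "left column, upper"},
    {"page_number": 1, "x0": 400, "top": 120, "text": "right column"},
]
arr = sorted(boxes, key=lambda r: (r["page_number"], r["x0"], r["top"]))
threashold = 10  # boxes whose x0 differ by less than this share a column
for i in range(len(arr) - 1):
    for j in range(i, -1, -1):
        if abs(arr[j + 1]["x0"] - arr[j]["x0"]) < threashold \
                and arr[j + 1]["top"] < arr[j]["top"] \
                and arr[j + 1]["page_number"] == arr[j]["page_number"]:
            arr[j], arr[j + 1] = arr[j + 1], arr[j]
print([b["text"] for b in arr])
# ['left column, upper', 'left column, lower', 'right column']
```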
@@ -735,43 +758,29 @@ class HuParser:
 
         self.boxes = self.sort_Y_firstly(boxes, 0)
 
-    def
+    def _filter_forpages(self):
         if not self.boxes:
             return
-            if possible(c):
-                pg_hits[c["page_number"] - 1] += 1
-
-        st, ed = -1, -1
-        for i in range(len(self.boxes)):
-            c = self.boxes[i]
-            if c["page_number"] >= to:
-                break
-            if pg_hits[c["page_number"] - 1] >= 3 and possible(c):
-                if st < 0:
-                    st = i
-                else:
-                    ed = i
-        for _ in range(st, ed + 1):
-            self.boxes.pop(st)
+        i = 0
+        while i < len(self.boxes):
+            if not re.match(r"(contents|目录|目次|table of contents)$", re.sub(r"( | |\u3000)+", "", self.boxes[i]["text"].lower())):
+                i += 1
+                continue
+            eng = re.match(r"[0-9a-zA-Z :'.-]{5,}", self.boxes[i]["text"].strip())
+            self.boxes.pop(i)
+            if i >= len(self.boxes): break
+            prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(self.boxes[i]["text"].strip().split(" ")[:2])
+            while not prefix:
+                self.boxes.pop(i)
+                if i >= len(self.boxes): break
+                prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(self.boxes[i]["text"].strip().split(" ")[:2])
+            self.boxes.pop(i)
+            if i >= len(self.boxes) or not prefix: break
+            for j in range(i, min(i + 128, len(self.boxes))):
+                if not re.match(prefix, self.boxes[j]["text"]):
+                    continue
+                for k in range(i, j): self.boxes.pop(i)
+                break
 
     def _blockType(self, b):
         patt = [
@@ -918,7 +927,7 @@ class HuParser:
             lst_r = rows[-1]
             if lst_r[-1].get("R", "") != b.get("R", "") \
                     or (b["top"] >= btm - 3 and lst_r[-1].get("R", "-1") != b.get("R", "-2")
+                        ):  # new row
                 btm = b["bottom"]
                 b["rn"] += 1
                 rows.append([b])
@@ -968,9 +977,9 @@ class HuParser:
                 j += 1
                 continue
             f = (j > 0 and tbl[ii][j - 1] and tbl[ii]
+                 [j - 1][0].get("text")) or j == 0
             ff = (j + 1 < len(tbl[ii]) and tbl[ii][j + 1] and tbl[ii]
+                 [j + 1][0].get("text")) or j + 1 >= len(tbl[ii])
             if f and ff:
                 j += 1
                 continue
@@ -1031,9 +1040,9 @@ class HuParser:
                 i += 1
                 continue
             f = (i > 0 and tbl[i - 1][jj] and tbl[i - 1]
+                 [jj][0].get("text")) or i == 0
             ff = (i + 1 < len(tbl) and tbl[i + 1][jj] and tbl[i + 1]
+                 [jj][0].get("text")) or i + 1 >= len(tbl)
             if f and ff:
                 i += 1
                 continue
@@ -1153,6 +1162,7 @@ class HuParser:
         headers = {}
         hdrset = set()
         lst_hdr = []
+        de = "的" if not self.is_english else " for "
         for r in sorted(list(hdr_rowno)):
             headers[r] = ["" for _ in range(clmno)]
             for i in range(clmno):
@@ -1184,12 +1194,12 @@ class HuParser:
                 if headers[j][k].find(headers[j - 1][k]) >= 0:
                     continue
                 if len(headers[j][k]) > len(headers[j - 1][k]):
-                    headers[j][k] += (
+                    headers[j][k] += (de if headers[j][k]
                         else "") + headers[j - 1][k]
                 else:
                     headers[j][k] = headers[j - 1][k] \
+                        + (de if headers[j - 1][k] else "") \
+                        + headers[j][k]
 
         logging.debug(
             f">>>>>>>>>>>>>>>>>{cap}:SIZE:{rowno}X{clmno} Header: {hdr_rowno}")
@@ -1241,7 +1251,11 @@ class HuParser:
             row_txt.append("; ".join(rtxt))
 
         if cap:
+            if self.is_english:
+                from_ = " in "
+            else:
+                from_ = "来自"
+            row_txt = [t + f"\t——{from_}“{cap}”" for t in row_txt]
         return row_txt
 
     @staticmethod
@@ -1254,7 +1268,7 @@ class HuParser:
                 return True
         return False
 
-    def
+    def _extract_table_figure(self, need_image, ZM, return_html):
         tables = {}
         figures = {}
         # extract figure and table boxes
@@ -1266,7 +1280,7 @@ class HuParser:
                 i += 1
                 continue
             lout_no = str(self.boxes[i]["page_number"]) + \
+                "-" + str(self.boxes[i]["layoutno"])
             if self.is_caption(self.boxes[i]) or self.boxes[i]["layout_type"] in ["table caption", "title",
                                                                                   "figure caption", "reference"]:
                 nomerge_lout_no.append(lst_lout_no)
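Review note: the header-merging hunk above glues a spanning parent header onto each child cell, joined by `de` ("的" for Chinese tables, " for " in English ones), and the caption hunk appends a "——来自/ in “caption”" provenance tail that manual.py and paper.py later strip by regex. The folding rule in isolation (`fold` is a hypothetical helper name; sample strings invented):

```python
# fold() is a hypothetical name for the header-merge rule in the hunk above.
def fold(child: str, parent: str, is_english: bool) -> str:
    de = " for " if is_english else "的"
    if len(child) > len(parent):
        return child + (de if child else "") + parent
    return parent + (de if parent else "") + child

print(fold("2023", "Revenue", True))    # Revenue for 2023
print(fold("营收", "第三季度", False))    # 第三季度的营收
```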
@@ -1574,8 +1588,14 @@ class HuParser:
             self.page_chars.append([])
 
         logging.info("Images converted.")
+        self.is_english = [re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(random.choices([c["text"] for c in self.page_chars[i]], k=100))) for i in range(len(self.page_chars))]
+        if sum([1 if e else 0 for e in self.is_english]) > len(self.page_images) / 2:
+            self.is_english = True
+        else:
+            self.is_english = False
+
         for i, img in enumerate(self.page_images):
-            chars = self.page_chars[i]
+            chars = self.page_chars[i] if not self.is_english else []
             self.mean_height.append(
                 np.median(sorted([c["height"] for c in chars])) if chars else 0
             )
@@ -1583,6 +1603,14 @@ class HuParser:
                 np.median(sorted([c["width"] for c in chars])) if chars else 8
             )
             self.page_cum_height.append(img.size[1] / zoomin)
+            j = 0
+            while j + 1 < len(chars):
+                if chars[j]["text"] and chars[j + 1]["text"] \
+                        and re.match(r"[0-9a-zA-Z,.:;!%]+", chars[j]["text"] + chars[j + 1]["text"]) \
+                        and chars[j + 1]["x0"] - chars[j]["x1"] >= min(chars[j + 1]["width"],
+                                                                       chars[j]["width"]) / 2:
+                    chars[j]["text"] += " "
+                j += 1
             # if i > 0:
             #     if not chars:
             #         self.page_cum_height.append(img.size[1] / zoomin)
@@ -1591,8 +1619,13 @@ class HuParser:
             #             np.max([c["bottom"] for c in chars]))
             self.__ocr_paddle(i + 1, img, chars, zoomin)
 
+        if not self.is_english and not all([c for c in self.page_chars]) and self.boxes:
+            self.is_english = re.search(r"[\na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join([b["text"] for b in random.choices(self.boxes, k=30)]))
+
+        logging.info("Is it English:", self.is_english)
+
         self.page_cum_height = np.cumsum(self.page_cum_height)
-        assert len(self.page_cum_height) == len(self.page_images)+1
+        assert len(self.page_cum_height) == len(self.page_images) + 1
 
     def __call__(self, fnm, need_image=True, zoomin=3, return_html=False):
         self.__images__(fnm, zoomin)
@@ -1600,8 +1633,8 @@ class HuParser:
         self._table_transformer_job(zoomin)
         self._text_merge()
         self._concat_downward()
-        self.
-        tbls = self.
+        self._filter_forpages()
+        tbls = self._extract_table_figure(need_image, zoomin, return_html)
         return self.__filterout_scraps(deepcopy(self.boxes), zoomin), tbls
 
     def remove_tag(self, txt):
@@ -1622,7 +1655,7 @@ class HuParser:
             self.page_images[pns[0]].crop((left * ZM, top * ZM,
                                            right *
                                            ZM, min(
+                                               bottom, self.page_images[pns[0]].size[1])
                                            ))
             )
             bottom -= self.page_images[pns[0]].size[1]
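Review note: `__images__` now decides document language by sampling: for each page it draws up to 100 pdfminer characters and looks for a run of 30 or more ASCII-ish characters; if more than half the pages vote English, native text extraction is bypassed (`chars = []`) and the OCR output is trusted instead. The vote, reduced to its core (the page contents are invented):

```python
# Core of the page-sampling language vote; the character lists are made up.
import random
import re

pages = [
    list("ThisPageIsEnglishText") * 5,
    list("AnotherEnglishPage!!!") * 5,
    list("这是一页中文内容") * 12,
]
votes = [
    re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}",
              "".join(random.choices(page, k=100)))
    for page in pages
]
print(bool(sum(1 for v in votes if v) > len(pages) / 2))  # True: 2 of 3 pages vote English
```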