KevinHuSh committed
Commit 96a1a44 · Parent: 04aba1b

add paper & manual parser (#46)

rag/app/__init__.py CHANGED
@@ -1,5 +1,9 @@
 import re
 
+from nltk import word_tokenize
+
+from rag.nlp import stemmer, huqie
+
 
 def callback__(progress, msg, func):
     if not func: return
@@ -46,3 +50,21 @@ def bullets_category(sections):
             res = i
             maxium = h
     return res
+
+def is_english(texts):
+    eng = 0
+    for t in texts:
+        if re.match(r"[a-zA-Z]", t.strip()):
+            eng += 1
+    if eng / len(texts) > 0.8:
+        return True
+    return False
+
+def tokenize(d, t, eng):
+    d["content_with_weight"] = t
+    if eng:
+        t = re.sub(r"([a-z])-([a-z])", r"\1\2", t)
+        d["content_ltks"] = " ".join([stemmer.stem(w) for w in word_tokenize(t)])
+    else:
+        d["content_ltks"] = huqie.qie(t)
+    d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
rag/app/laws.py CHANGED
@@ -3,12 +3,13 @@ import re
 from io import BytesIO
 from docx import Document
 import numpy as np
-from rag.app import callback__, bullets_category, BULLET_PATTERN
+from rag.app import callback__, bullets_category, BULLET_PATTERN, is_english, tokenize
 from rag.nlp import huqie
+from rag.parser.docx_parser import HuDocxParser
 from rag.parser.pdf_parser import HuParser
 
 
-class Docx(object):
+class Docx(HuDocxParser):
     def __init__(self):
         pass
 
@@ -42,14 +43,7 @@ class Pdf(HuParser):
         print("paddle layouts:", timer()-start)
         bxs = self.sort_Y_firstly(self.boxes, np.median(self.mean_height) / 3)
         # is it English
-        eng = 0
-        for b in bxs:
-            if re.match(r"[a-zA-Z]", b["text"].strip()):
-                eng += 1
-        if eng / len(bxs) > 0.8:
-            eng = True
-        else:
-            eng = False
+        eng = is_english([b["text"] for b in bxs])
         # Merge vertically
         i = 0
         while i + 1 < len(bxs):
@@ -59,7 +53,7 @@ class Pdf(HuParser):
                 bxs.pop(i)
                 continue
             concatting_feats = [
-                b["text"].strip()[-1] in ",;:'\",、‘“;:",
+                b["text"].strip()[-1] in ",;:'\",、‘“;:-",
                 len(b["text"].strip())>1 and b["text"].strip()[-2] in ",;:'\",‘“、;:",
                 b["text"].strip()[0] in "。;?!?”)),,、:",
             ]
@@ -118,14 +112,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
     sections = [l for l in sections if l]
 
     # is it English
-    eng = 0
-    for sec in sections:
-        if re.match(r"[a-zA-Z]", sec.strip()):
-            eng += 1
-    if eng / len(sections) > 0.8:
-        eng = True
-    else:
-        eng = False
+    eng = is_english(sections)
     # Remove 'Contents' part
     i = 0
     while i < len(sections):
@@ -181,8 +168,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
     if pdf_parser:
         d["image"] = pdf_parser.crop(ck)
         ck = pdf_parser.remove_tag(ck)
-    d["content_ltks"] = huqie.qie(ck)
-    d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
+    tokenize(d, ck, eng)
     res.append(d)
     return res
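The two deleted blocks in laws.py computed exactly the predicate that the shared helper now provides; an equivalence sketch (illustrative only):

```python
import re

def is_english_inline(bxs):
    # what laws.py previously inlined twice
    eng = sum(1 for b in bxs if re.match(r"[a-zA-Z]", b["text"].strip()))
    return eng / len(bxs) > 0.8

# after this commit the same test is a one-liner:
#     eng = is_english([b["text"] for b in bxs])
```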
rag/app/manual.py ADDED
@@ -0,0 +1,140 @@
+import copy
+import re
+from collections import Counter
+from rag.app import callback__, bullets_category, BULLET_PATTERN, is_english, tokenize
+from rag.nlp import huqie, stemmer
+from rag.parser.docx_parser import HuDocxParser
+from rag.parser.pdf_parser import HuParser
+from nltk.tokenize import word_tokenize
+import numpy as np
+from rag.utils import num_tokens_from_string
+
+
+class Pdf(HuParser):
+    def __call__(self, filename, binary=None, from_page=0,
+                 to_page=100000, zoomin=3, callback=None):
+        self.__images__(
+            filename if not binary else binary,
+            zoomin,
+            from_page,
+            to_page)
+        callback__((min(to_page, self.total_page) - from_page) / self.total_page / 4,
+                   "Page {}~{}: OCR finished".format(from_page, min(to_page, self.total_page)), callback)
+
+        from timeit import default_timer as timer
+        start = timer()
+        self._layouts_paddle(zoomin)
+        callback__((min(to_page, self.total_page) - from_page) / self.total_page / 4,
+                   "Page {}~{}: Layout analysis finished".format(from_page, min(to_page, self.total_page)), callback)
+        print("paddle layouts:", timer() - start)
+        self._table_transformer_job(zoomin)
+        callback__((min(to_page, self.total_page) - from_page) / self.total_page / 4,
+                   "Page {}~{}: Table analysis finished".format(from_page, min(to_page, self.total_page)), callback)
+        self._text_merge()
+        column_width = np.median([b["x1"] - b["x0"] for b in self.boxes])
+        self._concat_downward(concat_between_pages=False)
+        self._filter_forpages()
+        callback__((min(to_page, self.total_page) - from_page) / self.total_page / 4,
+                   "Page {}~{}: Text merging finished".format(from_page, min(to_page, self.total_page)), callback)
+        tbls = self._extract_table_figure(True, zoomin, False)
+
+        # clean mess
+        for b in self.boxes:
+            b["text"] = re.sub(r"([\t  ]|\u3000){2,}", " ", b["text"].strip())
+
+        # merge chunks with the same bullets
+        i = 0
+        while i + 1 < len(self.boxes):
+            b = self.boxes[i]
+            b_ = self.boxes[i + 1]
+            if b["text"].strip()[0] != b_["text"].strip()[0] \
+                    or b["page_number"] != b_["page_number"] \
+                    or b["top"] > b_["bottom"]:
+                i += 1
+                continue
+            b_["text"] = b["text"] + "\n" + b_["text"]
+            b_["x0"] = min(b["x0"], b_["x0"])
+            b_["x1"] = max(b["x1"], b_["x1"])
+            b_["top"] = b["top"]
+            self.boxes.pop(i)
+        # merge title with decent chunk
+        i = 0
+        while i + 1 < len(self.boxes):
+            b = self.boxes[i]
+            if b.get("layoutno", "").find("title") < 0:
+                i += 1
+                continue
+            b_ = self.boxes[i + 1]
+            b_["text"] = b["text"] + "\n" + b_["text"]
+            b_["x0"] = min(b["x0"], b_["x0"])
+            b_["x1"] = max(b["x1"], b_["x1"])
+            b_["top"] = b["top"]
+            self.boxes.pop(i)
+
+        for b in self.boxes: print(b["text"], b.get("layoutno"))
+
+        print(tbls)
+        return [b["text"] + self._line_tag(b, zoomin) for b in self.boxes], tbls
+
+
+def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
+    pdf_parser = None
+    paper = {}
+
+    if re.search(r"\.pdf$", filename, re.IGNORECASE):
+        pdf_parser = Pdf()
+        cks, tbls = pdf_parser(filename if not binary else binary,
+                               from_page=from_page, to_page=to_page, callback=callback)
+    doc = {
+        "docnm_kwd": filename
+    }
+    doc["title_tks"] = huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"]))
+    doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
+    # is it English
+    eng = pdf_parser.is_english
+
+    res = []
+    # add tables
+    for img, rows in tbls:
+        bs = 10
+        de = ";" if eng else ";"
+        for i in range(0, len(rows), bs):
+            d = copy.deepcopy(doc)
+            r = de.join(rows[i:i + bs])
+            r = re.sub(r"\t——(来自| in ).*”%s" % de, "", r)
+            tokenize(d, r, eng)
+            d["image"] = img
+            res.append(d)
+
+    i = 0
+    chunk = []
+    tk_cnt = 0
+
+    def add_chunk():
+        nonlocal chunk, res, doc, pdf_parser, tk_cnt
+        d = copy.deepcopy(doc)
+        ck = "\n".join(chunk)
+        tokenize(d, pdf_parser.remove_tag(ck), pdf_parser.is_english)
+        d["image"] = pdf_parser.crop(ck)
+        res.append(d)
+        chunk = []
+        tk_cnt = 0
+
+    while i < len(cks):
+        if tk_cnt > 128: add_chunk()
+        txt = cks[i]
+        txt_ = pdf_parser.remove_tag(txt)
+        i += 1
+        cnt = num_tokens_from_string(txt_)
+        chunk.append(txt)
+        tk_cnt += cnt
+    if chunk: add_chunk()
+    for i, d in enumerate(res):
+        print(d)
+        # d["image"].save(f"./logs/{i}.jpg")
+    return res
+
+
+if __name__ == "__main__":
+    import sys
+
+    chunk(sys.argv[1])
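The tail of manual.py is a greedy token-budget accumulator: lines are buffered and flushed into a chunk once the running count passes 128 tokens. A standalone sketch of the same pattern, with `count_tokens` standing in for `num_tokens_from_string`:

```python
def greedy_chunks(lines, count_tokens, budget=128):
    chunks, buf, cnt = [], [], 0
    for line in lines:
        if cnt > budget:                   # flush before taking the next line,
            chunks.append("\n".join(buf))  # mirroring the order in manual.py
            buf, cnt = [], 0
        buf.append(line)
        cnt += count_tokens(line)
    if buf:                                # final partial chunk
        chunks.append("\n".join(buf))
    return chunks
```

Because the flush happens at the top of the loop, the line that crosses the budget stays in its chunk, so chunks can slightly exceed 128 tokens.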
rag/app/paper.py ADDED
@@ -0,0 +1,240 @@
+import copy
+import re
+from collections import Counter
+from rag.app import callback__, bullets_category, BULLET_PATTERN, is_english, tokenize
+from rag.nlp import huqie, stemmer
+from rag.parser.docx_parser import HuDocxParser
+from rag.parser.pdf_parser import HuParser
+from nltk.tokenize import word_tokenize
+import numpy as np
+from rag.utils import num_tokens_from_string
+
+
+class Pdf(HuParser):
+    def __call__(self, filename, binary=None, from_page=0,
+                 to_page=100000, zoomin=3, callback=None):
+        self.__images__(
+            filename if not binary else binary,
+            zoomin,
+            from_page,
+            to_page)
+        callback__((min(to_page, self.total_page) - from_page) / self.total_page / 4,
+                   "Page {}~{}: OCR finished".format(from_page, min(to_page, self.total_page)), callback)
+
+        from timeit import default_timer as timer
+        start = timer()
+        self._layouts_paddle(zoomin)
+        callback__((min(to_page, self.total_page) - from_page) / self.total_page / 4,
+                   "Page {}~{}: Layout analysis finished".format(from_page, min(to_page, self.total_page)), callback)
+        print("paddle layouts:", timer() - start)
+        self._table_transformer_job(zoomin)
+        callback__((min(to_page, self.total_page) - from_page) / self.total_page / 4,
+                   "Page {}~{}: Table analysis finished".format(from_page, min(to_page, self.total_page)), callback)
+        self._text_merge()
+        column_width = np.median([b["x1"] - b["x0"] for b in self.boxes])
+        self._concat_downward(concat_between_pages=False)
+        self._filter_forpages()
+        callback__((min(to_page, self.total_page) - from_page) / self.total_page / 4,
+                   "Page {}~{}: Text merging finished".format(from_page, min(to_page, self.total_page)), callback)
+        tbls = self._extract_table_figure(True, zoomin, False)
+
+        # clean mess
+        if column_width < self.page_images[0].size[0] / zoomin / 2:
+            print("two_column...................", column_width,
+                  self.page_images[0].size[0] / zoomin / 2)
+            self.boxes = self.sort_X_by_page(self.boxes, column_width / 2)
+        for b in self.boxes:
+            b["text"] = re.sub(r"([\t  ]|\u3000){2,}", " ", b["text"].strip())
+        freq = Counter([b["text"] for b in self.boxes])
+        garbage = set([k for k, v in freq.items() if v > self.total_page * 0.6])
+        i = 0
+        while i < len(self.boxes):
+            if self.boxes[i]["text"] in garbage \
+                    or (re.match(r"[a-zA-Z0-9]+$", self.boxes[i]["text"]) and not self.boxes[i].get("layoutno")) \
+                    or (i + 1 < len(self.boxes) and self.boxes[i]["text"] == self.boxes[i + 1]["text"]):
+                self.boxes.pop(i)
+            elif i + 1 < len(self.boxes) and self.boxes[i].get("layoutno", '0') == self.boxes[i + 1].get("layoutno", '1'):
+                # merge within same layouts
+                self.boxes[i + 1]["top"] = self.boxes[i]["top"]
+                self.boxes[i + 1]["x0"] = min(self.boxes[i]["x0"], self.boxes[i + 1]["x0"])
+                self.boxes[i + 1]["x1"] = max(self.boxes[i]["x1"], self.boxes[i + 1]["x1"])
+                self.boxes[i + 1]["text"] = self.boxes[i]["text"] + " " + self.boxes[i + 1]["text"]
+                self.boxes.pop(i)
+            else:
+                i += 1
+
+        def _begin(txt):
+            return re.match(
+                "[0-9. 一、i]*(introduction|abstract|摘要|引言|keywords|key words|关键词|background|背景|目录|前言|contents)",
+                txt.lower().strip())
+
+        # get title and authors
+        title = ""
+        authors = []
+        i = 0
+        while i < min(32, len(self.boxes)):
+            b = self.boxes[i]
+            i += 1
+            if b.get("layoutno", "").find("title") >= 0:
+                title = b["text"]
+                if _begin(title):
+                    title = ""
+                    break
+                for j in range(3):
+                    if _begin(self.boxes[i + j]["text"]): break
+                    authors.append(self.boxes[i + j]["text"])
+                break
+            break
+        # get abstract
+        abstr = ""
+        i = 0
+        while i + 1 < min(32, len(self.boxes)):
+            b = self.boxes[i]
+            i += 1
+            txt = b["text"].lower().strip()
+            if re.match("(abstract|摘要)", txt):
+                if len(txt.split(" ")) > 32 or len(txt) > 64:
+                    abstr = txt + self._line_tag(b, zoomin)
+                    i += 1
+                    break
+                txt = self.boxes[i + 1]["text"].lower().strip()
+                if len(txt.split(" ")) > 32 or len(txt) > 64:
+                    abstr = txt + self._line_tag(self.boxes[i + 1], zoomin)
+                i += 1
+                break
+        if not abstr: i = 0
+
+        for b in self.boxes: print(b["text"], b.get("layoutno"))
+        print(tbls)
+
+        return {
+            "title": title if title else filename,
+            "authors": " ".join(authors),
+            "abstract": abstr,
+            "lines": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes[i:] if
+                      re.match(r"(text|title)", b.get("layoutno", "text"))],
+            "tables": tbls
+        }
+
+
+def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
+    pdf_parser = None
+    paper = {}
+
+    if re.search(r"\.pdf$", filename, re.IGNORECASE):
+        pdf_parser = Pdf()
+        paper = pdf_parser(filename if not binary else binary,
+                           from_page=from_page, to_page=to_page, callback=callback)
+    doc = {
+        "docnm_kwd": paper["title"] if paper["title"] else filename,
+        "authors_tks": paper["authors"]
+    }
+    doc["title_tks"] = huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"]))
+    doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
+    doc["authors_sm_tks"] = huqie.qieqie(doc["authors_tks"])
+    # is it English
+    eng = pdf_parser.is_english
+    print("It's English.....", eng)
+
+    res = []
+    # add tables
+    for img, rows in paper["tables"]:
+        bs = 10
+        de = ";" if eng else ";"
+        for i in range(0, len(rows), bs):
+            d = copy.deepcopy(doc)
+            r = de.join(rows[i:i + bs])
+            r = re.sub(r"\t——(来自| in ).*”%s" % de, "", r)
+            tokenize(d, r, eng)
+            d["image"] = img
+            res.append(d)
+
+    if paper["abstract"]:
+        d = copy.deepcopy(doc)
+        txt = pdf_parser.remove_tag(paper["abstract"])
+        d["important_kwd"] = ["abstract", "总结", "概括", "summary", "summarize"]
+        d["important_tks"] = " ".join(d["important_kwd"])
+        d["image"] = pdf_parser.crop(paper["abstract"])
+        tokenize(d, txt, eng)
+        res.append(d)
+
+    readed = [0] * len(paper["lines"])
+    # find colon firstly
+    i = 0
+    while i + 1 < len(paper["lines"]):
+        txt = pdf_parser.remove_tag(paper["lines"][i][0])
+        j = i
+        if txt.strip("\n").strip()[-1] not in "::":
+            i += 1
+            continue
+        i += 1
+        while i < len(paper["lines"]) and not paper["lines"][i][0]:
+            i += 1
+        if i >= len(paper["lines"]): break
+        proj = [paper["lines"][i][0].strip()]
+        i += 1
+        while i < len(paper["lines"]) and paper["lines"][i][0].strip()[0] == proj[-1][0]:
+            proj.append(paper["lines"][i][0])
+            i += 1
+        for k in range(j, i): readed[k] = True
+        txt = txt[::-1]
+        if eng:
+            r = re.search(r"(.*?) ([\.;?!]|$)", txt)
+            txt = r.group(1)[::-1] if r else txt[::-1]
+        else:
+            r = re.search(r"(.*?) ([。?;!]|$)", txt)
+            txt = r.group(1)[::-1] if r else txt[::-1]
+        for p in proj:
+            d = copy.deepcopy(doc)
+            txt += "\n" + pdf_parser.remove_tag(p)
+            d["image"] = pdf_parser.crop(p)
+            tokenize(d, txt, eng)
+            res.append(d)
+
+    i = 0
+    chunk = []
+    tk_cnt = 0
+
+    def add_chunk():
+        nonlocal chunk, res, doc, pdf_parser, tk_cnt
+        d = copy.deepcopy(doc)
+        ck = "\n".join(chunk)
+        tokenize(d, pdf_parser.remove_tag(ck), pdf_parser.is_english)
+        d["image"] = pdf_parser.crop(ck)
+        res.append(d)
+        chunk = []
+        tk_cnt = 0
+
+    while i < len(paper["lines"]):
+        if tk_cnt > 128:
+            add_chunk()
+        if readed[i]:
+            i += 1
+            continue
+        readed[i] = True
+        txt, layouts = paper["lines"][i]
+        txt_ = pdf_parser.remove_tag(txt)
+        i += 1
+        cnt = num_tokens_from_string(txt_)
+        if any([
+            layouts.find("title") >= 0 and chunk,
+            cnt + tk_cnt > 128 and tk_cnt > 32,
+        ]):
+            add_chunk()
+            chunk = [txt]
+            tk_cnt = cnt
+        else:
+            chunk.append(txt)
+            tk_cnt += cnt
+
+    if chunk: add_chunk()
+    for i, d in enumerate(res):
+        print(d)
+        # d["image"].save(f"./logs/{i}.jpg")
+    return res
+
+
+if __name__ == "__main__":
+    import sys
+
+    chunk(sys.argv[1])
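paper.py's main loop extends that accumulator with two boundary rules: a section title forces a new chunk, and the token budget only splits once a chunk has some substance. The decision reduces to a small predicate (names invented for illustration):

```python
def should_start_new_chunk(is_title_line, line_tokens, buffered_tokens):
    # a title opens a new chunk whenever something is already buffered;
    # a budget overflow splits only past 32 buffered tokens, so tiny
    # fragments never become chunks of their own
    return (is_title_line and buffered_tokens > 0) or \
           (line_tokens + buffered_tokens > 128 and buffered_tokens > 32)
```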
rag/app/presentation.py CHANGED
@@ -3,7 +3,7 @@ import re
 from io import BytesIO
 from pptx import Presentation
 
-from rag.app import callback__
+from rag.app import callback__, tokenize, is_english
 from rag.nlp import huqie
 from rag.parser.pdf_parser import HuParser
 
@@ -57,7 +57,7 @@ class Ppt(object):
         assert len(imgs) == len(txts), "Slides text and image do not match: {} vs. {}".format(len(imgs), len(txts))
         callback__((min(to_page, self.total_page) - from_page) / self.total_page,
                    "Page {}~{}: Image extraction finished".format(from_page, min(to_page, self.total_page)), callback)
-
+        self.is_english = is_english(txts)
         return [(txts[i], imgs[i]) for i in range(len(txts))]
 
 
@@ -103,19 +103,19 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
     doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
     res = []
     if re.search(r"\.pptx?$", filename, re.IGNORECASE):
-        for txt, img in Ppt()(filename if not binary else binary, from_page, to_page, callback):
+        ppt_parser = Ppt()
+        for txt, img in ppt_parser(filename if not binary else binary, from_page, to_page, callback):
             d = copy.deepcopy(doc)
-            d["content_ltks"] = huqie.qie(txt)
-            d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
             d["image"] = img
+            tokenize(d, txt, ppt_parser.is_english)
             res.append(d)
         return res
     if re.search(r"\.pdf$", filename, re.IGNORECASE):
-        for txt, img in Pdf()(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback):
+        pdf_parser = Pdf()
+        for txt, img in pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback):
             d = copy.deepcopy(doc)
-            d["content_ltks"] = huqie.qie(txt)
-            d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
             d["image"] = img
+            tokenize(d, txt, pdf_parser.is_english)
             res.append(d)
         return res
     callback__(-1, "This kind of presentation document is not supported yet!", callback)
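The reason the inline `Ppt()(...)` and `Pdf()(...)` calls were replaced with named instances: `is_english` is set on the parser object as a side effect of extraction and read back afterwards, so the instance has to outlive the call. Roughly:

```python
ppt_parser = Ppt()                        # keep a reference to the parser
pairs = ppt_parser(filename, from_page, to_page, callback)
eng = ppt_parser.is_english               # set inside __call__, read here
# With the old anonymous Ppt()(...) call the instance was discarded,
# so the freshly computed is_english flag was unreachable.
```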
rag/nlp/__init__.py CHANGED
@@ -1,4 +1,7 @@
 from . import search
 from rag.utils import ELASTICSEARCH
 
-retrievaler = search.Dealer(ELASTICSEARCH)
+retrievaler = search.Dealer(ELASTICSEARCH)
+
+from nltk.stem import PorterStemmer
+stemmer = PorterStemmer()
rag/parser/pdf_parser.py CHANGED
@@ -1,4 +1,6 @@
 # -*- coding: utf-8 -*-
+import random
+
 import fitz
 import xgboost as xgb
 from io import BytesIO
@@ -14,6 +16,7 @@ from copy import deepcopy
 from rag.cv.table_recognize import TableTransformer
 from rag.cv.ppdetection import PPDet
 from huggingface_hub import hf_hub_download
+
 logging.getLogger("pdfminer").setLevel(logging.WARNING)
 
 
@@ -22,8 +25,8 @@ class HuParser:
         from paddleocr import PaddleOCR
         logging.getLogger("ppocr").setLevel(logging.ERROR)
         self.ocr = PaddleOCR(use_angle_cls=False, lang="ch")
-        self.layouter = PPDet()
-        self.tbl_det = TableTransformer()
+        self.layouter = PPDet("/data/newpeak/medical-gpt/res/ppdet")
+        self.tbl_det = PPDet("/data/newpeak/medical-gpt/res/ppdet.tbl")
 
         self.updown_cnt_mdl = xgb.Booster()
         if torch.cuda.is_available():
@@ -55,7 +58,7 @@ class HuParser:
     def _y_dis(
             self, a, b):
         return (
-            b["top"] + b["bottom"] - a["top"] - a["bottom"]) / 2
+                b["top"] + b["bottom"] - a["top"] - a["bottom"]) / 2
 
     def _match_proj(self, b):
         proj_patt = [
@@ -78,9 +81,9 @@ class HuParser:
         tks_down = huqie.qie(down["text"][:LEN]).split(" ")
         tks_up = huqie.qie(up["text"][-LEN:]).split(" ")
         tks_all = up["text"][-LEN:].strip() \
-            + (" " if re.match(r"[a-zA-Z0-9]+",
-                               up["text"][-1] + down["text"][0]) else "") \
-            + down["text"][:LEN].strip()
+                  + (" " if re.match(r"[a-zA-Z0-9]+",
+                                     up["text"][-1] + down["text"][0]) else "") \
+                  + down["text"][:LEN].strip()
         tks_all = huqie.qie(tks_all).split(" ")
         fea = [
             up.get("R", -1) == down.get("R", -1),
@@ -102,7 +105,7 @@ class HuParser:
             True if re.search(r"[,,][^。.]+$", up["text"]) else False,
             True if re.search(r"[,,][^。.]+$", up["text"]) else False,
             True if re.search(r"[\((][^\))]+$", up["text"])
-            and re.search(r"[\))]", down["text"]) else False,
+                    and re.search(r"[\))]", down["text"]) else False,
             self._match_proj(down),
             True if re.match(r"[A-Z]", down["text"]) else False,
             True if re.match(r"[A-Z]", up["text"][-1]) else False,
@@ -141,6 +144,21 @@ class HuParser:
                 arr[j + 1] = deepcopy(tmp)
         return arr
 
+    @staticmethod
+    def sort_X_by_page(arr, threshold):
+        # sort by page number first, then x0, then top
+        arr = sorted(arr, key=lambda r: (r["page_number"], r["x0"], r["top"]))
+        for i in range(len(arr) - 1):
+            for j in range(i, -1, -1):
+                # restore the order within the threshold
+                if abs(arr[j + 1]["x0"] - arr[j]["x0"]) < threshold \
+                        and arr[j + 1]["top"] < arr[j]["top"] \
+                        and arr[j + 1]["page_number"] == arr[j]["page_number"]:
+                    tmp = arr[j]
+                    arr[j] = arr[j + 1]
+                    arr[j + 1] = tmp
+        return arr
+
     @staticmethod
     def sort_R_firstly(arr, thr=0):
         # sort using y1 first and then x1
@@ -219,7 +237,7 @@ class HuParser:
         assert tp_ <= btm_, "Fuckedup! T:{},B:{},X0:{},X1:{} => {}".format(
             tp, btm, x0, x1, b)
         ov = (btm_ - tp_) * (x1_ - x0_) if x1 - \
-            x0 != 0 and btm - tp != 0 else 0
+             x0 != 0 and btm - tp != 0 else 0
         if ov > 0 and ratio:
             ov /= (x1 - x0) * (btm - tp)
         return ov
@@ -326,7 +344,7 @@ class HuParser:
         return layouts
 
     def __table_paddle(self, images):
-        tbls = self.tbl_det([img for img in images], threshold=0.5)
+        tbls = self.tbl_det([np.array(img) for img in images], thr=0.5)
         res = []
         # align left&right for rows, align top&bottom for columns
         for tbl in tbls:
@@ -384,7 +402,7 @@ class HuParser:
                 continue
             for tb in tbls:  # for table
                 left, top, right, bott = tb["x0"] - MARGIN, tb["top"] - MARGIN, \
-                    tb["x1"] + MARGIN, tb["bottom"] + MARGIN
+                                         tb["x1"] + MARGIN, tb["bottom"] + MARGIN
                 left *= ZM
                 top *= ZM
                 right *= ZM
@@ -482,10 +500,13 @@ class HuParser:
                 continue
             ch = c["bottom"] - c["top"]
             bh = bxs[ii]["bottom"] - bxs[ii]["top"]
-            if abs(ch - bh) / max(ch, bh) >= 0.7:
+            if abs(ch - bh) / max(ch, bh) >= 0.7 and c["text"] != ' ':
                 self.lefted_chars.append(c)
                 continue
-            bxs[ii]["text"] += c["text"]
+            if c["text"] == " " and bxs[ii]["text"]:
+                if re.match(r"[0-9a-zA-Z,.?;:!%%]", bxs[ii]["text"][-1]): bxs[ii]["text"] += " "
+            else:
+                bxs[ii]["text"] += c["text"]
 
         for b in bxs:
             if not b["text"]:
@@ -629,7 +650,7 @@ class HuParser:
             i += 1
         self.boxes = bxs
 
-    def _concat_downward(self):
+    def _concat_downward(self, concat_between_pages=True):
         # count boxes in the same row as a feature
         for i in range(len(self.boxes)):
             mh = self.mean_height[self.boxes[i]["page_number"] - 1]
@@ -665,6 +686,8 @@ class HuParser:
                 if not smpg and ydis > mh * 16:
                     break
                 down = boxes[i]
+                if not concat_between_pages and down["page_number"] > up["page_number"]:
+                    break
 
                 if up.get("R", "") != down.get(
                         "R", "") and up["text"][-1] != ",":
@@ -735,43 +758,29 @@ class HuParser:
 
         self.boxes = self.sort_Y_firstly(boxes, 0)
 
-    def __filter_forpages(self):
+    def _filter_forpages(self):
         if not self.boxes:
             return
-        to = min(7, len(self.page_images) // 5)
-        pg_hits = [0 for _ in range(to)]
-
-        def possible(c):
-            if c.get("layout_type", "") == "reference":
-                return True
-            if c["bottom"] - c["top"] >= 2 * \
-                    self.mean_height[c["page_number"] - 1]:
-                return False
-            if c["text"].find("....") >= 0 \
-                    or (c["x1"] - c["x0"] > 250 and re.search(r"[0-9]+$",
-                                                              c["text"].strip())):
-                return True
-            return self.is_caption(c) and re.search(
-                r"[0-9]+$", c["text"].strip())
-
-        for c in self.boxes:
-            if c["page_number"] >= to:
-                break
-            if possible(c):
-                pg_hits[c["page_number"] - 1] += 1
-
-        st, ed = -1, -1
-        for i in range(len(self.boxes)):
-            c = self.boxes[i]
-            if c["page_number"] >= to:
-                break
-            if pg_hits[c["page_number"] - 1] >= 3 and possible(c):
-                if st < 0:
-                    st = i
-                else:
-                    ed = i
-        for _ in range(st, ed + 1):
-            self.boxes.pop(st)
+        i = 0
+        while i < len(self.boxes):
+            if not re.match(r"(contents|目录|目次|table of contents)$", re.sub(r"( | |\u3000)+", "", self.boxes[i]["text"].lower())):
+                i += 1
+                continue
+            eng = re.match(r"[0-9a-zA-Z :'.-]{5,}", self.boxes[i]["text"].strip())
+            self.boxes.pop(i)
+            if i >= len(self.boxes): break
+            prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(self.boxes[i]["text"].strip().split(" ")[:2])
+            while not prefix:
+                self.boxes.pop(i)
+                if i >= len(self.boxes): break
+                prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(self.boxes[i]["text"].strip().split(" ")[:2])
+            self.boxes.pop(i)
+            if i >= len(self.boxes) or not prefix: break
+            for j in range(i, min(i + 128, len(self.boxes))):
+                if not re.match(prefix, self.boxes[j]["text"]):
+                    continue
+                for k in range(i, j): self.boxes.pop(i)
+                break
 
     def _blockType(self, b):
         patt = [
@@ -918,7 +927,7 @@ class HuParser:
             lst_r = rows[-1]
             if lst_r[-1].get("R", "") != b.get("R", "") \
                     or (b["top"] >= btm - 3 and lst_r[-1].get("R", "-1") != b.get("R", "-2")
-                ):  # new row
+                        ):  # new row
                 btm = b["bottom"]
                 b["rn"] += 1
                 rows.append([b])
@@ -968,9 +977,9 @@ class HuParser:
                 j += 1
                 continue
             f = (j > 0 and tbl[ii][j - 1] and tbl[ii]
-                [j - 1][0].get("text")) or j == 0
+                 [j - 1][0].get("text")) or j == 0
             ff = (j + 1 < len(tbl[ii]) and tbl[ii][j + 1] and tbl[ii]
-                [j + 1][0].get("text")) or j + 1 >= len(tbl[ii])
+                  [j + 1][0].get("text")) or j + 1 >= len(tbl[ii])
             if f and ff:
                 j += 1
                 continue
@@ -1031,9 +1040,9 @@ class HuParser:
                 i += 1
                 continue
             f = (i > 0 and tbl[i - 1][jj] and tbl[i - 1]
-                [jj][0].get("text")) or i == 0
+                 [jj][0].get("text")) or i == 0
             ff = (i + 1 < len(tbl) and tbl[i + 1][jj] and tbl[i + 1]
-                [jj][0].get("text")) or i + 1 >= len(tbl)
+                  [jj][0].get("text")) or i + 1 >= len(tbl)
            if f and ff:
                 i += 1
                 continue
@@ -1153,6 +1162,7 @@ class HuParser:
         headers = {}
         hdrset = set()
         lst_hdr = []
+        de = "的" if not self.is_english else " for "
         for r in sorted(list(hdr_rowno)):
             headers[r] = ["" for _ in range(clmno)]
             for i in range(clmno):
@@ -1184,12 +1194,12 @@ class HuParser:
                 if headers[j][k].find(headers[j - 1][k]) >= 0:
                     continue
                 if len(headers[j][k]) > len(headers[j - 1][k]):
-                    headers[j][k] += ("的" if headers[j][k]
+                    headers[j][k] += (de if headers[j][k]
                                       else "") + headers[j - 1][k]
                 else:
                     headers[j][k] = headers[j - 1][k] \
-                        + ("的" if headers[j - 1][k] else "") \
-                        + headers[j][k]
+                        + (de if headers[j - 1][k] else "") \
+                        + headers[j][k]
 
         logging.debug(
             f">>>>>>>>>>>>>>>>>{cap}:SIZE:{rowno}X{clmno} Header: {hdr_rowno}")
@@ -1241,7 +1251,11 @@ class HuParser:
             row_txt.append("; ".join(rtxt))
 
         if cap:
-            row_txt = [t + f"\t——来自“{cap}”" for t in row_txt]
+            if self.is_english:
+                from_ = " in "
+            else:
+                from_ = "来自"
+            row_txt = [t + f"\t——{from_}“{cap}”" for t in row_txt]
         return row_txt
 
     @staticmethod
@@ -1254,7 +1268,7 @@ class HuParser:
                 return True
         return False
 
-    def __extract_table_figure(self, need_image, ZM, return_html):
+    def _extract_table_figure(self, need_image, ZM, return_html):
         tables = {}
         figures = {}
         # extract figure and table boxes
@@ -1266,7 +1280,7 @@ class HuParser:
                 i += 1
                 continue
             lout_no = str(self.boxes[i]["page_number"]) + \
-                "-" + str(self.boxes[i]["layoutno"])
+                      "-" + str(self.boxes[i]["layoutno"])
             if self.is_caption(self.boxes[i]) or self.boxes[i]["layout_type"] in ["table caption", "title",
                                                                                   "figure caption", "reference"]:
                 nomerge_lout_no.append(lst_lout_no)
@@ -1574,8 +1588,14 @@ class HuParser:
             self.page_chars.append([])
 
         logging.info("Images converted.")
+        self.is_english = [re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(random.choices([c["text"] for c in self.page_chars[i]], k=100))) for i in range(len(self.page_chars))]
+        if sum([1 if e else 0 for e in self.is_english]) > len(self.page_images) / 2:
+            self.is_english = True
+        else:
+            self.is_english = False
+
         for i, img in enumerate(self.page_images):
-            chars = self.page_chars[i]
+            chars = self.page_chars[i] if not self.is_english else []
             self.mean_height.append(
                 np.median(sorted([c["height"] for c in chars])) if chars else 0
             )
@@ -1583,6 +1603,14 @@ class HuParser:
                 np.median(sorted([c["width"] for c in chars])) if chars else 8
             )
             self.page_cum_height.append(img.size[1] / zoomin)
+            j = 0
+            while j + 1 < len(chars):
+                if chars[j]["text"] and chars[j + 1]["text"] \
+                        and re.match(r"[0-9a-zA-Z,.:;!%]+", chars[j]["text"] + chars[j + 1]["text"]) \
+                        and chars[j + 1]["x0"] - chars[j]["x1"] >= min(chars[j + 1]["width"],
+                                                                       chars[j]["width"]) / 2:
+                    chars[j]["text"] += " "
+                j += 1
             # if i > 0:
             #     if not chars:
             #         self.page_cum_height.append(img.size[1] / zoomin)
@@ -1591,8 +1619,13 @@ class HuParser:
             #         np.max([c["bottom"] for c in chars]))
             self.__ocr_paddle(i + 1, img, chars, zoomin)
 
+        if not self.is_english and not all([c for c in self.page_chars]) and self.boxes:
+            self.is_english = re.search(r"[\na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join([b["text"] for b in random.choices(self.boxes, k=30)]))
+
+        logging.info("Is it English: %s", self.is_english)
+
         self.page_cum_height = np.cumsum(self.page_cum_height)
-        assert len(self.page_cum_height) == len(self.page_images)+1
+        assert len(self.page_cum_height) == len(self.page_images) + 1
 
     def __call__(self, fnm, need_image=True, zoomin=3, return_html=False):
         self.__images__(fnm, zoomin)
@@ -1600,8 +1633,8 @@ class HuParser:
         self._table_transformer_job(zoomin)
         self._text_merge()
         self._concat_downward()
-        self.__filter_forpages()
-        tbls = self.__extract_table_figure(need_image, zoomin, return_html)
+        self._filter_forpages()
+        tbls = self._extract_table_figure(need_image, zoomin, return_html)
         return self.__filterout_scraps(deepcopy(self.boxes), zoomin), tbls
 
     def remove_tag(self, txt):
@@ -1622,7 +1655,7 @@ class HuParser:
         self.page_images[pns[0]].crop((left * ZM, top * ZM,
                                        right *
                                        ZM, min(
-            bottom, self.page_images[pns[0]].size[1])
+                bottom, self.page_images[pns[0]].size[1])
         ))
         )
         bottom -= self.page_images[pns[0]].size[1]
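A standalone sketch of the page-sampling heuristic that `__images__` now runs (character class simplified; the original also falls back to sampling OCR boxes when pages have no embedded text layer). Unlike the list comprehension in the diff, this version skips empty pages, where `random.choices` would raise:

```python
import random
import re

def pages_look_english(page_chars):
    """page_chars: one list of {"text": ...} char dicts per page."""
    hits = 0
    for chars in page_chars:
        if not chars:
            continue  # random.choices raises on an empty population
        sample = "".join(random.choices([c["text"] for c in chars], k=100))
        # a run of 30+ ASCII-ish characters marks the page as English
        if re.search(r"[a-zA-Z0-9,;:'\"\[\]()!@#$%^&*?<>._-]{30,}", sample):
            hits += 1
    return hits > len(page_chars) / 2
```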