KevinHuSh committed
Commit 96a1a44 · Parent: 04aba1b

add paper & manual parser (#46)

rag/app/__init__.py CHANGED
@@ -1,5 +1,9 @@
 import re
 
+from nltk import word_tokenize
+
+from rag.nlp import stemmer, huqie
+
 
 def callback__(progress, msg, func):
     if not func: return
@@ -46,3 +50,21 @@ def bullets_category(sections):
             res = i
             maxium = h
     return res
+
+def is_english(texts):
+    eng = 0
+    for t in texts:
+        if re.match(r"[a-zA-Z]", t.strip()):
+            eng += 1
+    if eng / len(texts) > 0.8:
+        return True
+    return False
+
+def tokenize(d, t, eng):
+    d["content_with_weight"] = t
+    if eng:
+        t = re.sub(r"([a-z])-([a-z])", r"\1\2", t)
+        d["content_ltks"] = " ".join([stemmer.stem(w) for w in word_tokenize(t)])
+    else:
+        d["content_ltks"] = huqie.qie(t)
+    d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
rag/app/laws.py CHANGED
@@ -3,12 +3,13 @@ import re
 from io import BytesIO
 from docx import Document
 import numpy as np
-from rag.app import callback__, bullets_category, BULLET_PATTERN
+from rag.app import callback__, bullets_category, BULLET_PATTERN, is_english, tokenize
 from rag.nlp import huqie
+from rag.parser.docx_parser import HuDocxParser
 from rag.parser.pdf_parser import HuParser
 
 
-class Docx(object):
+class Docx(HuDocxParser):
     def __init__(self):
         pass
 
@@ -42,14 +43,7 @@ class Pdf(HuParser):
         print("paddle layouts:", timer()-start)
         bxs = self.sort_Y_firstly(self.boxes, np.median(self.mean_height) / 3)
         # is it English
-        eng = 0
-        for b in bxs:
-            if re.match(r"[a-zA-Z]", b["text"].strip()):
-                eng += 1
-        if eng / len(bxs) > 0.8:
-            eng = True
-        else:
-            eng = False
+        eng = is_english([b["text"] for b in bxs])
         # Merge vertically
         i = 0
         while i + 1 < len(bxs):
@@ -59,7 +53,7 @@ class Pdf(HuParser):
                 bxs.pop(i)
                 continue
             concatting_feats = [
-                b["text"].strip()[-1] in ",;:'\",、‘“;:",
+                b["text"].strip()[-1] in ",;:'\",、‘“;:-",
                 len(b["text"].strip())>1 and b["text"].strip()[-2] in ",;:'\",‘“、;:",
                 b["text"].strip()[0] in "。;?!?”)),,、:",
             ]
@@ -118,14 +112,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
     sections = [l for l in sections if l]
 
     # is it English
-    eng = 0
-    for sec in sections:
-        if re.match(r"[a-zA-Z]", sec.strip()):
-            eng += 1
-    if eng / len(sections) > 0.8:
-        eng = True
-    else:
-        eng = False
+    eng = is_english(sections)
     # Remove 'Contents' part
     i = 0
     while i < len(sections):
@@ -181,8 +168,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
     if pdf_parser:
         d["image"] = pdf_parser.crop(ck)
         ck = pdf_parser.remove_tag(ck)
-    d["content_ltks"] = huqie.qie(ck)
-    d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
+    tokenize(d, ck, eng)
     res.append(d)
     return res
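The two deleted blocks in laws.py computed exactly the predicate that the shared helper now provides; an equivalence sketch (illustrative only):

```python
import re

def is_english_inline(bxs):
    # what laws.py previously inlined twice
    eng = sum(1 for b in bxs if re.match(r"[a-zA-Z]", b["text"].strip()))
    return eng / len(bxs) > 0.8

# after this commit the same test is a one-liner:
#     eng = is_english([b["text"] for b in bxs])
```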
rag/app/manual.py ADDED
@@ -0,0 +1,140 @@
+import copy
+import re
+from collections import Counter
+from rag.app import callback__, bullets_category, BULLET_PATTERN, is_english, tokenize
+from rag.nlp import huqie, stemmer
+from rag.parser.docx_parser import HuDocxParser
+from rag.parser.pdf_parser import HuParser
+from nltk.tokenize import word_tokenize
+import numpy as np
+from rag.utils import num_tokens_from_string
+
+
+class Pdf(HuParser):
+    def __call__(self, filename, binary=None, from_page=0,
+                 to_page=100000, zoomin=3, callback=None):
+        self.__images__(
+            filename if not binary else binary,
+            zoomin,
+            from_page,
+            to_page)
+        callback__((min(to_page, self.total_page) - from_page) / self.total_page / 4,
+                   "Page {}~{}: OCR finished".format(from_page, min(to_page, self.total_page)), callback)
+
+        from timeit import default_timer as timer
+        start = timer()
+        self._layouts_paddle(zoomin)
+        callback__((min(to_page, self.total_page) - from_page) / self.total_page / 4,
+                   "Page {}~{}: Layout analysis finished".format(from_page, min(to_page, self.total_page)), callback)
+        print("paddle layouts:", timer() - start)
+        self._table_transformer_job(zoomin)
+        callback__((min(to_page, self.total_page) - from_page) / self.total_page / 4,
+                   "Page {}~{}: Table analysis finished".format(from_page, min(to_page, self.total_page)), callback)
+        self._text_merge()
+        column_width = np.median([b["x1"] - b["x0"] for b in self.boxes])
+        self._concat_downward(concat_between_pages=False)
+        self._filter_forpages()
+        callback__((min(to_page, self.total_page) - from_page) / self.total_page / 4,
+                   "Page {}~{}: Text merging finished".format(from_page, min(to_page, self.total_page)), callback)
+        tbls = self._extract_table_figure(True, zoomin, False)
+
+        # clean mess
+        for b in self.boxes:
+            b["text"] = re.sub(r"([\t  ]|\u3000){2,}", " ", b["text"].strip())
+
+        # merge chunks with the same bullets
+        i = 0
+        while i + 1 < len(self.boxes):
+            b = self.boxes[i]
+            b_ = self.boxes[i + 1]
+            if b["text"].strip()[0] != b_["text"].strip()[0] \
+                    or b["page_number"] != b_["page_number"] \
+                    or b["top"] > b_["bottom"]:
+                i += 1
+                continue
+            b_["text"] = b["text"] + "\n" + b_["text"]
+            b_["x0"] = min(b["x0"], b_["x0"])
+            b_["x1"] = max(b["x1"], b_["x1"])
+            b_["top"] = b["top"]
+            self.boxes.pop(i)
+        # merge title with decent chunk
+        i = 0
+        while i + 1 < len(self.boxes):
+            b = self.boxes[i]
+            if b.get("layoutno", "").find("title") < 0:
+                i += 1
+                continue
+            b_ = self.boxes[i + 1]
+            b_["text"] = b["text"] + "\n" + b_["text"]
+            b_["x0"] = min(b["x0"], b_["x0"])
+            b_["x1"] = max(b["x1"], b_["x1"])
+            b_["top"] = b["top"]
+            self.boxes.pop(i)
+
+        for b in self.boxes: print(b["text"], b.get("layoutno"))
+
+        print(tbls)
+        return [b["text"] + self._line_tag(b, zoomin) for b in self.boxes], tbls
+
+
+def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
+    pdf_parser = None
+    paper = {}
+
+    if re.search(r"\.pdf$", filename, re.IGNORECASE):
+        pdf_parser = Pdf()
+        cks, tbls = pdf_parser(filename if not binary else binary,
+                               from_page=from_page, to_page=to_page, callback=callback)
+    doc = {
+        "docnm_kwd": filename
+    }
+    doc["title_tks"] = huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"]))
+    doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
+    # is it English
+    eng = pdf_parser.is_english
+
+    res = []
+    # add tables
+    for img, rows in tbls:
+        bs = 10
+        de = ";" if eng else ";"
+        for i in range(0, len(rows), bs):
+            d = copy.deepcopy(doc)
+            r = de.join(rows[i:i + bs])
+            r = re.sub(r"\t——(来自| in ).*”%s" % de, "", r)
+            tokenize(d, r, eng)
+            d["image"] = img
+            res.append(d)
+
+    i = 0
+    chunk = []
+    tk_cnt = 0
+
+    def add_chunk():
+        nonlocal chunk, res, doc, pdf_parser, tk_cnt
+        d = copy.deepcopy(doc)
+        ck = "\n".join(chunk)
+        tokenize(d, pdf_parser.remove_tag(ck), pdf_parser.is_english)
+        d["image"] = pdf_parser.crop(ck)
+        res.append(d)
+        chunk = []
+        tk_cnt = 0
+
+    while i < len(cks):
+        if tk_cnt > 128: add_chunk()
+        txt = cks[i]
+        txt_ = pdf_parser.remove_tag(txt)
+        i += 1
+        cnt = num_tokens_from_string(txt_)
+        chunk.append(txt)
+        tk_cnt += cnt
+    if chunk: add_chunk()
+    for i, d in enumerate(res):
+        print(d)
+        # d["image"].save(f"./logs/{i}.jpg")
+    return res
+
+
+if __name__ == "__main__":
+    import sys
+
+    chunk(sys.argv[1])
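The tail of manual.py is a greedy token-budget accumulator: lines are buffered and flushed into a chunk once the running count passes 128 tokens. A standalone sketch of the same pattern, with `count_tokens` standing in for `num_tokens_from_string`:

```python
def greedy_chunks(lines, count_tokens, budget=128):
    chunks, buf, cnt = [], [], 0
    for line in lines:
        if cnt > budget:                   # flush before taking the next line,
            chunks.append("\n".join(buf))  # mirroring the order in manual.py
            buf, cnt = [], 0
        buf.append(line)
        cnt += count_tokens(line)
    if buf:                                # final partial chunk
        chunks.append("\n".join(buf))
    return chunks
```

Because the flush happens at the top of the loop, the line that crosses the budget stays in its chunk, so chunks can slightly exceed 128 tokens.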
rag/app/paper.py ADDED
@@ -0,0 +1,240 @@
+import copy
+import re
+from collections import Counter
+from rag.app import callback__, bullets_category, BULLET_PATTERN, is_english, tokenize
+from rag.nlp import huqie, stemmer
+from rag.parser.docx_parser import HuDocxParser
+from rag.parser.pdf_parser import HuParser
+from nltk.tokenize import word_tokenize
+import numpy as np
+from rag.utils import num_tokens_from_string
+
+
+class Pdf(HuParser):
+    def __call__(self, filename, binary=None, from_page=0,
+                 to_page=100000, zoomin=3, callback=None):
+        self.__images__(
+            filename if not binary else binary,
+            zoomin,
+            from_page,
+            to_page)
+        callback__((min(to_page, self.total_page) - from_page) / self.total_page / 4,
+                   "Page {}~{}: OCR finished".format(from_page, min(to_page, self.total_page)), callback)
+
+        from timeit import default_timer as timer
+        start = timer()
+        self._layouts_paddle(zoomin)
+        callback__((min(to_page, self.total_page) - from_page) / self.total_page / 4,
+                   "Page {}~{}: Layout analysis finished".format(from_page, min(to_page, self.total_page)), callback)
+        print("paddle layouts:", timer() - start)
+        self._table_transformer_job(zoomin)
+        callback__((min(to_page, self.total_page) - from_page) / self.total_page / 4,
+                   "Page {}~{}: Table analysis finished".format(from_page, min(to_page, self.total_page)), callback)
+        self._text_merge()
+        column_width = np.median([b["x1"] - b["x0"] for b in self.boxes])
+        self._concat_downward(concat_between_pages=False)
+        self._filter_forpages()
+        callback__((min(to_page, self.total_page) - from_page) / self.total_page / 4,
+                   "Page {}~{}: Text merging finished".format(from_page, min(to_page, self.total_page)), callback)
+        tbls = self._extract_table_figure(True, zoomin, False)
+
+        # clean mess
+        if column_width < self.page_images[0].size[0] / zoomin / 2:
+            print("two_column...................", column_width,
+                  self.page_images[0].size[0] / zoomin / 2)
+            self.boxes = self.sort_X_by_page(self.boxes, column_width / 2)
+        for b in self.boxes:
+            b["text"] = re.sub(r"([\t  ]|\u3000){2,}", " ", b["text"].strip())
+        freq = Counter([b["text"] for b in self.boxes])
+        garbage = set([k for k, v in freq.items() if v > self.total_page * 0.6])
+        i = 0
+        while i < len(self.boxes):
+            if self.boxes[i]["text"] in garbage \
+                    or (re.match(r"[a-zA-Z0-9]+$", self.boxes[i]["text"]) and not self.boxes[i].get("layoutno")) \
+                    or (i + 1 < len(self.boxes) and self.boxes[i]["text"] == self.boxes[i + 1]["text"]):
+                self.boxes.pop(i)
+            elif i + 1 < len(self.boxes) and self.boxes[i].get("layoutno", '0') == self.boxes[i + 1].get("layoutno", '1'):
+                # merge within same layouts
+                self.boxes[i + 1]["top"] = self.boxes[i]["top"]
+                self.boxes[i + 1]["x0"] = min(self.boxes[i]["x0"], self.boxes[i + 1]["x0"])
+                self.boxes[i + 1]["x1"] = max(self.boxes[i]["x1"], self.boxes[i + 1]["x1"])
+                self.boxes[i + 1]["text"] = self.boxes[i]["text"] + " " + self.boxes[i + 1]["text"]
+                self.boxes.pop(i)
+            else:
+                i += 1
+
+        def _begin(txt):
+            return re.match(
+                "[0-9. 一、i]*(introduction|abstract|摘要|引言|keywords|key words|关键词|background|背景|目录|前言|contents)",
+                txt.lower().strip())
+
+        # get title and authors
+        title = ""
+        authors = []
+        i = 0
+        while i < min(32, len(self.boxes)):
+            b = self.boxes[i]
+            i += 1
+            if b.get("layoutno", "").find("title") >= 0:
+                title = b["text"]
+                if _begin(title):
+                    title = ""
+                    break
+                for j in range(3):
+                    if _begin(self.boxes[i + j]["text"]): break
+                    authors.append(self.boxes[i + j]["text"])
+                break
+            break
+        # get abstract
+        abstr = ""
+        i = 0
+        while i + 1 < min(32, len(self.boxes)):
+            b = self.boxes[i]
+            i += 1
+            txt = b["text"].lower().strip()
+            if re.match("(abstract|摘要)", txt):
+                if len(txt.split(" ")) > 32 or len(txt) > 64:
+                    abstr = txt + self._line_tag(b, zoomin)
+                    i += 1
+                    break
+                txt = self.boxes[i + 1]["text"].lower().strip()
+                if len(txt.split(" ")) > 32 or len(txt) > 64:
+                    abstr = txt + self._line_tag(self.boxes[i + 1], zoomin)
+                i += 1
+                break
+        if not abstr: i = 0
+
+        for b in self.boxes: print(b["text"], b.get("layoutno"))
+        print(tbls)
+
+        return {
+            "title": title if title else filename,
+            "authors": " ".join(authors),
+            "abstract": abstr,
+            "lines": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes[i:] if
+                      re.match(r"(text|title)", b.get("layoutno", "text"))],
+            "tables": tbls
+        }
+
+
+def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
+    pdf_parser = None
+    paper = {}
+
+    if re.search(r"\.pdf$", filename, re.IGNORECASE):
+        pdf_parser = Pdf()
+        paper = pdf_parser(filename if not binary else binary,
+                           from_page=from_page, to_page=to_page, callback=callback)
+    doc = {
+        "docnm_kwd": paper["title"] if paper["title"] else filename,
+        "authors_tks": paper["authors"]
+    }
+    doc["title_tks"] = huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"]))
+    doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
+    doc["authors_sm_tks"] = huqie.qieqie(doc["authors_tks"])
+    # is it English
+    eng = pdf_parser.is_english
+    print("It's English.....", eng)
+
+    res = []
+    # add tables
+    for img, rows in paper["tables"]:
+        bs = 10
+        de = ";" if eng else ";"
+        for i in range(0, len(rows), bs):
+            d = copy.deepcopy(doc)
+            r = de.join(rows[i:i + bs])
+            r = re.sub(r"\t——(来自| in ).*”%s" % de, "", r)
+            tokenize(d, r, eng)
+            d["image"] = img
+            res.append(d)
+
+    if paper["abstract"]:
+        d = copy.deepcopy(doc)
+        txt = pdf_parser.remove_tag(paper["abstract"])
+        d["important_kwd"] = ["abstract", "总结", "概括", "summary", "summarize"]
+        d["important_tks"] = " ".join(d["important_kwd"])
+        d["image"] = pdf_parser.crop(paper["abstract"])
+        tokenize(d, txt, eng)
+        res.append(d)
+
+    readed = [0] * len(paper["lines"])
+    # find colon firstly
+    i = 0
+    while i + 1 < len(paper["lines"]):
+        txt = pdf_parser.remove_tag(paper["lines"][i][0])
+        j = i
+        if txt.strip("\n").strip()[-1] not in "::":
+            i += 1
+            continue
+        i += 1
+        while i < len(paper["lines"]) and not paper["lines"][i][0]:
+            i += 1
+        if i >= len(paper["lines"]): break
+        proj = [paper["lines"][i][0].strip()]
+        i += 1
+        while i < len(paper["lines"]) and paper["lines"][i][0].strip()[0] == proj[-1][0]:
+            proj.append(paper["lines"][i][0])
+            i += 1
+        for k in range(j, i): readed[k] = True
+        txt = txt[::-1]
+        if eng:
+            r = re.search(r"(.*?) ([\.;?!]|$)", txt)
+            txt = r.group(1)[::-1] if r else txt[::-1]
+        else:
+            r = re.search(r"(.*?) ([。?;!]|$)", txt)
+            txt = r.group(1)[::-1] if r else txt[::-1]
+        for p in proj:
+            d = copy.deepcopy(doc)
+            txt += "\n" + pdf_parser.remove_tag(p)
+            d["image"] = pdf_parser.crop(p)
+            tokenize(d, txt, eng)
+            res.append(d)
+
+    i = 0
+    chunk = []
+    tk_cnt = 0
+
+    def add_chunk():
+        nonlocal chunk, res, doc, pdf_parser, tk_cnt
+        d = copy.deepcopy(doc)
+        ck = "\n".join(chunk)
+        tokenize(d, pdf_parser.remove_tag(ck), pdf_parser.is_english)
+        d["image"] = pdf_parser.crop(ck)
+        res.append(d)
+        chunk = []
+        tk_cnt = 0
+
+    while i < len(paper["lines"]):
+        if tk_cnt > 128:
+            add_chunk()
+        if readed[i]:
+            i += 1
+            continue
+        readed[i] = True
+        txt, layouts = paper["lines"][i]
+        txt_ = pdf_parser.remove_tag(txt)
+        i += 1
+        cnt = num_tokens_from_string(txt_)
+        if any([
+            layouts.find("title") >= 0 and chunk,
+            cnt + tk_cnt > 128 and tk_cnt > 32,
+        ]):
+            add_chunk()
+            chunk = [txt]
+            tk_cnt = cnt
+        else:
+            chunk.append(txt)
+            tk_cnt += cnt
+
+    if chunk: add_chunk()
+    for i, d in enumerate(res):
+        print(d)
+        # d["image"].save(f"./logs/{i}.jpg")
+    return res
+
+
+if __name__ == "__main__":
+    import sys
+
+    chunk(sys.argv[1])
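paper.py's main loop extends that accumulator with two boundary rules: a section title forces a new chunk, and the token budget only splits once a chunk has some substance. The decision reduces to a small predicate (names invented for illustration):

```python
def should_start_new_chunk(is_title_line, line_tokens, buffered_tokens):
    # a title opens a new chunk whenever something is already buffered;
    # a budget overflow splits only past 32 buffered tokens, so tiny
    # fragments never become chunks of their own
    return (is_title_line and buffered_tokens > 0) or \
           (line_tokens + buffered_tokens > 128 and buffered_tokens > 32)
```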
rag/app/presentation.py CHANGED
@@ -3,7 +3,7 @@ import re
 from io import BytesIO
 from pptx import Presentation
 
-from rag.app import callback__
+from rag.app import callback__, tokenize, is_english
 from rag.nlp import huqie
 from rag.parser.pdf_parser import HuParser
 
@@ -57,7 +57,7 @@ class Ppt(object):
         assert len(imgs) == len(txts), "Slides text and image do not match: {} vs. {}".format(len(imgs), len(txts))
         callback__((min(to_page, self.total_page) - from_page) / self.total_page,
                    "Page {}~{}: Image extraction finished".format(from_page, min(to_page, self.total_page)), callback)
-
+        self.is_english = is_english(txts)
         return [(txts[i], imgs[i]) for i in range(len(txts))]
 
 
@@ -103,19 +103,19 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
     doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
     res = []
     if re.search(r"\.pptx?$", filename, re.IGNORECASE):
-        for txt, img in Ppt()(filename if not binary else binary, from_page, to_page, callback):
+        ppt_parser = Ppt()
+        for txt, img in ppt_parser(filename if not binary else binary, from_page, to_page, callback):
             d = copy.deepcopy(doc)
-            d["content_ltks"] = huqie.qie(txt)
-            d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
             d["image"] = img
+            tokenize(d, txt, ppt_parser.is_english)
             res.append(d)
         return res
     if re.search(r"\.pdf$", filename, re.IGNORECASE):
-        for txt, img in Pdf()(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback):
+        pdf_parser = Pdf()
+        for txt, img in pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback):
             d = copy.deepcopy(doc)
-            d["content_ltks"] = huqie.qie(txt)
-            d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
             d["image"] = img
+            tokenize(d, txt, pdf_parser.is_english)
             res.append(d)
         return res
     callback__(-1, "This kind of presentation document is not supported yet!", callback)
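The reason the inline `Ppt()(...)` and `Pdf()(...)` calls were replaced with named instances: `is_english` is set on the parser object as a side effect of extraction and read back afterwards, so the instance has to outlive the call. Roughly:

```python
ppt_parser = Ppt()                        # keep a reference to the parser
pairs = ppt_parser(filename, from_page, to_page, callback)
eng = ppt_parser.is_english               # set inside __call__, read here
# With the old anonymous Ppt()(...) call the instance was discarded,
# so the freshly computed is_english flag was unreachable.
```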
rag/nlp/__init__.py CHANGED
@@ -1,4 +1,7 @@
 from . import search
 from rag.utils import ELASTICSEARCH
 
-retrievaler = search.Dealer(ELASTICSEARCH)
+retrievaler = search.Dealer(ELASTICSEARCH)
+
+from nltk.stem import PorterStemmer
+stemmer = PorterStemmer()
rag/parser/pdf_parser.py CHANGED
@@ -1,4 +1,6 @@
 # -*- coding: utf-8 -*-
+import random
+
 import fitz
 import xgboost as xgb
 from io import BytesIO
@@ -14,6 +16,7 @@ from copy import deepcopy
 from rag.cv.table_recognize import TableTransformer
 from rag.cv.ppdetection import PPDet
 from huggingface_hub import hf_hub_download
+
 logging.getLogger("pdfminer").setLevel(logging.WARNING)
 
 
@@ -22,8 +25,8 @@ class HuParser:
         from paddleocr import PaddleOCR
         logging.getLogger("ppocr").setLevel(logging.ERROR)
         self.ocr = PaddleOCR(use_angle_cls=False, lang="ch")
-        self.layouter = PPDet()
-        self.tbl_det = TableTransformer()
+        self.layouter = PPDet("/data/newpeak/medical-gpt/res/ppdet")
+        self.tbl_det = PPDet("/data/newpeak/medical-gpt/res/ppdet.tbl")
 
         self.updown_cnt_mdl = xgb.Booster()
         if torch.cuda.is_available():
@@ -55,7 +58,7 @@ class HuParser:
     def _y_dis(
             self, a, b):
         return (
-            b["top"] + b["bottom"] - a["top"] - a["bottom"]) / 2
+                b["top"] + b["bottom"] - a["top"] - a["bottom"]) / 2
 
     def _match_proj(self, b):
         proj_patt = [
@@ -78,9 +81,9 @@ class HuParser:
         tks_down = huqie.qie(down["text"][:LEN]).split(" ")
         tks_up = huqie.qie(up["text"][-LEN:]).split(" ")
         tks_all = up["text"][-LEN:].strip() \
-            + (" " if re.match(r"[a-zA-Z0-9]+",
-                               up["text"][-1] + down["text"][0]) else "") \
-            + down["text"][:LEN].strip()
+                  + (" " if re.match(r"[a-zA-Z0-9]+",
+                                     up["text"][-1] + down["text"][0]) else "") \
+                  + down["text"][:LEN].strip()
         tks_all = huqie.qie(tks_all).split(" ")
         fea = [
             up.get("R", -1) == down.get("R", -1),
@@ -102,7 +105,7 @@ class HuParser:
             True if re.search(r"[,,][^。.]+$", up["text"]) else False,
             True if re.search(r"[,,][^。.]+$", up["text"]) else False,
             True if re.search(r"[\((][^\))]+$", up["text"])
-            and re.search(r"[\))]", down["text"]) else False,
+                    and re.search(r"[\))]", down["text"]) else False,
             self._match_proj(down),
             True if re.match(r"[A-Z]", down["text"]) else False,
             True if re.match(r"[A-Z]", up["text"][-1]) else False,
@@ -141,6 +144,21 @@ class HuParser:
                 arr[j + 1] = deepcopy(tmp)
         return arr
 
+    @staticmethod
+    def sort_X_by_page(arr, threshold):
+        # sort by page number first, then x0, then top
+        arr = sorted(arr, key=lambda r: (r["page_number"], r["x0"], r["top"]))
+        for i in range(len(arr) - 1):
+            for j in range(i, -1, -1):
+                # restore the order within the threshold
+                if abs(arr[j + 1]["x0"] - arr[j]["x0"]) < threshold \
+                        and arr[j + 1]["top"] < arr[j]["top"] \
+                        and arr[j + 1]["page_number"] == arr[j]["page_number"]:
+                    tmp = arr[j]
+                    arr[j] = arr[j + 1]
+                    arr[j + 1] = tmp
+        return arr
+
     @staticmethod
     def sort_R_firstly(arr, thr=0):
         # sort using y1 first and then x1
@@ -219,7 +237,7 @@ class HuParser:
         assert tp_ <= btm_, "Fuckedup! T:{},B:{},X0:{},X1:{} => {}".format(
             tp, btm, x0, x1, b)
         ov = (btm_ - tp_) * (x1_ - x0_) if x1 - \
-            x0 != 0 and btm - tp != 0 else 0
+             x0 != 0 and btm - tp != 0 else 0
         if ov > 0 and ratio:
             ov /= (x1 - x0) * (btm - tp)
         return ov
@@ -326,7 +344,7 @@ class HuParser:
         return layouts
 
     def __table_paddle(self, images):
-        tbls = self.tbl_det([img for img in images], threshold=0.5)
+        tbls = self.tbl_det([np.array(img) for img in images], thr=0.5)
         res = []
         # align left&right for rows, align top&bottom for columns
         for tbl in tbls:
@@ -384,7 +402,7 @@ class HuParser:
                 continue
             for tb in tbls:  # for table
                 left, top, right, bott = tb["x0"] - MARGIN, tb["top"] - MARGIN, \
-                    tb["x1"] + MARGIN, tb["bottom"] + MARGIN
+                                         tb["x1"] + MARGIN, tb["bottom"] + MARGIN
                 left *= ZM
                 top *= ZM
                 right *= ZM
@@ -482,10 +500,13 @@ class HuParser:
                 continue
             ch = c["bottom"] - c["top"]
             bh = bxs[ii]["bottom"] - bxs[ii]["top"]
-            if abs(ch - bh) / max(ch, bh) >= 0.7:
+            if abs(ch - bh) / max(ch, bh) >= 0.7 and c["text"] != ' ':
                 self.lefted_chars.append(c)
                 continue
-            bxs[ii]["text"] += c["text"]
+            if c["text"] == " " and bxs[ii]["text"]:
+                if re.match(r"[0-9a-zA-Z,.?;:!%%]", bxs[ii]["text"][-1]): bxs[ii]["text"] += " "
+            else:
+                bxs[ii]["text"] += c["text"]
 
         for b in bxs:
             if not b["text"]:
@@ -629,7 +650,7 @@ class HuParser:
             i += 1
         self.boxes = bxs
 
-    def _concat_downward(self):
+    def _concat_downward(self, concat_between_pages=True):
         # count boxes in the same row as a feature
         for i in range(len(self.boxes)):
             mh = self.mean_height[self.boxes[i]["page_number"] - 1]
@@ -665,6 +686,8 @@ class HuParser:
                 if not smpg and ydis > mh * 16:
                     break
                 down = boxes[i]
+                if not concat_between_pages and down["page_number"] > up["page_number"]:
+                    break
 
                 if up.get("R", "") != down.get(
                         "R", "") and up["text"][-1] != ",":
@@ -735,43 +758,29 @@ class HuParser:
 
         self.boxes = self.sort_Y_firstly(boxes, 0)
 
-    def __filter_forpages(self):
+    def _filter_forpages(self):
         if not self.boxes:
             return
-        to = min(7, len(self.page_images) // 5)
-        pg_hits = [0 for _ in range(to)]
-
-        def possible(c):
-            if c.get("layout_type", "") == "reference":
-                return True
-            if c["bottom"] - c["top"] >= 2 * \
-                    self.mean_height[c["page_number"] - 1]:
-                return False
-            if c["text"].find("....") >= 0 \
-                    or (c["x1"] - c["x0"] > 250 and re.search(r"[0-9]+$",
-                                                              c["text"].strip())):
-                return True
-            return self.is_caption(c) and re.search(
-                r"[0-9]+$", c["text"].strip())
-
-        for c in self.boxes:
-            if c["page_number"] >= to:
-                break
-            if possible(c):
-                pg_hits[c["page_number"] - 1] += 1
-
-        st, ed = -1, -1
-        for i in range(len(self.boxes)):
-            c = self.boxes[i]
-            if c["page_number"] >= to:
-                break
-            if pg_hits[c["page_number"] - 1] >= 3 and possible(c):
-                if st < 0:
-                    st = i
-                else:
-                    ed = i
-        for _ in range(st, ed + 1):
-            self.boxes.pop(st)
+        i = 0
+        while i < len(self.boxes):
+            if not re.match(r"(contents|目录|目次|table of contents)$", re.sub(r"( | |\u3000)+", "", self.boxes[i]["text"].lower())):
+                i += 1
+                continue
+            eng = re.match(r"[0-9a-zA-Z :'.-]{5,}", self.boxes[i]["text"].strip())
+            self.boxes.pop(i)
+            if i >= len(self.boxes): break
+            prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(self.boxes[i]["text"].strip().split(" ")[:2])
+            while not prefix:
+                self.boxes.pop(i)
+                if i >= len(self.boxes): break
+                prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(self.boxes[i]["text"].strip().split(" ")[:2])
+            self.boxes.pop(i)
+            if i >= len(self.boxes) or not prefix: break
+            for j in range(i, min(i + 128, len(self.boxes))):
+                if not re.match(prefix, self.boxes[j]["text"]):
+                    continue
+                for k in range(i, j): self.boxes.pop(i)
+                break
 
     def _blockType(self, b):
         patt = [
@@ -918,7 +927,7 @@ class HuParser:
             lst_r = rows[-1]
             if lst_r[-1].get("R", "") != b.get("R", "") \
                     or (b["top"] >= btm - 3 and lst_r[-1].get("R", "-1") != b.get("R", "-2")
-                ):  # new row
+                        ):  # new row
                 btm = b["bottom"]
                 b["rn"] += 1
                 rows.append([b])
@@ -968,9 +977,9 @@ class HuParser:
                 j += 1
                 continue
             f = (j > 0 and tbl[ii][j - 1] and tbl[ii]
-                [j - 1][0].get("text")) or j == 0
+                 [j - 1][0].get("text")) or j == 0
             ff = (j + 1 < len(tbl[ii]) and tbl[ii][j + 1] and tbl[ii]
-                [j + 1][0].get("text")) or j + 1 >= len(tbl[ii])
+                  [j + 1][0].get("text")) or j + 1 >= len(tbl[ii])
             if f and ff:
                 j += 1
                 continue
@@ -1031,9 +1040,9 @@ class HuParser:
                 i += 1
                 continue
             f = (i > 0 and tbl[i - 1][jj] and tbl[i - 1]
-                [jj][0].get("text")) or i == 0
+                 [jj][0].get("text")) or i == 0
             ff = (i + 1 < len(tbl) and tbl[i + 1][jj] and tbl[i + 1]
-                [jj][0].get("text")) or i + 1 >= len(tbl)
+                  [jj][0].get("text")) or i + 1 >= len(tbl)
            if f and ff:
                 i += 1
                 continue
@@ -1153,6 +1162,7 @@ class HuParser:
         headers = {}
         hdrset = set()
         lst_hdr = []
+        de = "的" if not self.is_english else " for "
         for r in sorted(list(hdr_rowno)):
             headers[r] = ["" for _ in range(clmno)]
             for i in range(clmno):
@@ -1184,12 +1194,12 @@ class HuParser:
                 if headers[j][k].find(headers[j - 1][k]) >= 0:
                     continue
                 if len(headers[j][k]) > len(headers[j - 1][k]):
-                    headers[j][k] += ("的" if headers[j][k]
+                    headers[j][k] += (de if headers[j][k]
                                       else "") + headers[j - 1][k]
                 else:
                     headers[j][k] = headers[j - 1][k] \
-                        + ("的" if headers[j - 1][k] else "") \
-                        + headers[j][k]
+                        + (de if headers[j - 1][k] else "") \
+                        + headers[j][k]
 
         logging.debug(
             f">>>>>>>>>>>>>>>>>{cap}:SIZE:{rowno}X{clmno} Header: {hdr_rowno}")
@@ -1241,7 +1251,11 @@ class HuParser:
             row_txt.append("; ".join(rtxt))
 
         if cap:
-            row_txt = [t + f"\t——来自“{cap}”" for t in row_txt]
+            if self.is_english:
+                from_ = " in "
+            else:
+                from_ = "来自"
+            row_txt = [t + f"\t——{from_}“{cap}”" for t in row_txt]
         return row_txt
 
     @staticmethod
@@ -1254,7 +1268,7 @@ class HuParser:
                 return True
         return False
 
-    def __extract_table_figure(self, need_image, ZM, return_html):
+    def _extract_table_figure(self, need_image, ZM, return_html):
         tables = {}
         figures = {}
         # extract figure and table boxes
@@ -1266,7 +1280,7 @@ class HuParser:
                 i += 1
                 continue
             lout_no = str(self.boxes[i]["page_number"]) + \
-                "-" + str(self.boxes[i]["layoutno"])
+                      "-" + str(self.boxes[i]["layoutno"])
             if self.is_caption(self.boxes[i]) or self.boxes[i]["layout_type"] in ["table caption", "title",
                                                                                   "figure caption", "reference"]:
                 nomerge_lout_no.append(lst_lout_no)
@@ -1574,8 +1588,14 @@ class HuParser:
             self.page_chars.append([])
 
         logging.info("Images converted.")
+        self.is_english = [re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(random.choices([c["text"] for c in self.page_chars[i]], k=100))) for i in range(len(self.page_chars))]
+        if sum([1 if e else 0 for e in self.is_english]) > len(self.page_images) / 2:
+            self.is_english = True
+        else:
+            self.is_english = False
+
         for i, img in enumerate(self.page_images):
-            chars = self.page_chars[i]
+            chars = self.page_chars[i] if not self.is_english else []
             self.mean_height.append(
                 np.median(sorted([c["height"] for c in chars])) if chars else 0
             )
@@ -1583,6 +1603,14 @@ class HuParser:
                 np.median(sorted([c["width"] for c in chars])) if chars else 8
             )
             self.page_cum_height.append(img.size[1] / zoomin)
+            j = 0
+            while j + 1 < len(chars):
+                if chars[j]["text"] and chars[j + 1]["text"] \
+                        and re.match(r"[0-9a-zA-Z,.:;!%]+", chars[j]["text"] + chars[j + 1]["text"]) \
+                        and chars[j + 1]["x0"] - chars[j]["x1"] >= min(chars[j + 1]["width"],
+                                                                       chars[j]["width"]) / 2:
+                    chars[j]["text"] += " "
+                j += 1
             # if i > 0:
             #     if not chars:
             #         self.page_cum_height.append(img.size[1] / zoomin)
@@ -1591,8 +1619,13 @@ class HuParser:
             #         np.max([c["bottom"] for c in chars]))
             self.__ocr_paddle(i + 1, img, chars, zoomin)
 
+        if not self.is_english and not all([c for c in self.page_chars]) and self.boxes:
+            self.is_english = re.search(r"[\na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join([b["text"] for b in random.choices(self.boxes, k=30)]))
+
+        logging.info("Is it English: %s", self.is_english)
+
         self.page_cum_height = np.cumsum(self.page_cum_height)
-        assert len(self.page_cum_height) == len(self.page_images)+1
+        assert len(self.page_cum_height) == len(self.page_images) + 1
 
     def __call__(self, fnm, need_image=True, zoomin=3, return_html=False):
         self.__images__(fnm, zoomin)
@@ -1600,8 +1633,8 @@ class HuParser:
         self._table_transformer_job(zoomin)
         self._text_merge()
         self._concat_downward()
-        self.__filter_forpages()
-        tbls = self.__extract_table_figure(need_image, zoomin, return_html)
+        self._filter_forpages()
+        tbls = self._extract_table_figure(need_image, zoomin, return_html)
         return self.__filterout_scraps(deepcopy(self.boxes), zoomin), tbls
 
     def remove_tag(self, txt):
@@ -1622,7 +1655,7 @@ class HuParser:
         self.page_images[pns[0]].crop((left * ZM, top * ZM,
                                        right *
                                        ZM, min(
-            bottom, self.page_images[pns[0]].size[1])
+                bottom, self.page_images[pns[0]].size[1])
         ))
         )
         bottom -= self.page_images[pns[0]].size[1]
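A standalone sketch of the page-sampling heuristic that `__images__` now runs (character class simplified; the original also falls back to sampling OCR boxes when pages have no embedded text layer). Unlike the list comprehension in the diff, this version skips empty pages, where `random.choices` would raise:

```python
import random
import re

def pages_look_english(page_chars):
    """page_chars: one list of {"text": ...} char dicts per page."""
    hits = 0
    for chars in page_chars:
        if not chars:
            continue  # random.choices raises on an empty population
        sample = "".join(random.choices([c["text"] for c in chars], k=100))
        # a run of 30+ ASCII-ish characters marks the page as English
        if re.search(r"[a-zA-Z0-9,;:'\"\[\]()!@#$%^&*?<>._-]{30,}", sample):
            hits += 1
    return hits > len(page_chars) / 2
```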