import logging
from typing import Any, Dict, Optional, Sequence, Tuple, cast

import numpy as np
from pdfminer import settings
from pdfminer.pdfcolor import PREDEFINED_COLORSPACE, PDFColorSpace
from pdfminer.pdfdevice import PDFDevice
from pdfminer.pdfinterp import (
    PDFPageInterpreter,
    PDFResourceManager,
    PDFContentParser,
    PDFInterpreterError,
    Color,
    PDFStackT,
    LITERAL_FORM,
    LITERAL_IMAGE,
)
from pdfminer.pdffont import PDFFont
from pdfminer.pdfpage import PDFPage
from pdfminer.pdftypes import (
    PDFObjRef,
    dict_value,
    list_value,
    resolve1,
    stream_value,
)
from pdfminer.psexceptions import PSEOF
from pdfminer.psparser import (
    PSKeyword,
    keyword_name,
    literal_name,
)
from pdfminer.utils import (
    MATRIX_IDENTITY,
    Matrix,
    Rect,
    mult_matrix,
    apply_matrix_pt,
)

log = logging.getLogger(__name__)


def safe_float(o: Any) -> Optional[float]:
    """Return ``float(o)``, or ``None`` when *o* cannot be converted."""
    try:
        return float(o)
    except (TypeError, ValueError):
        return None


class PDFPageInterpreterEx(PDFPageInterpreter):
    """Processor for the content of a PDF page.

    In addition to driving the device, ``execute`` returns the non-text
    operators it saw as content-stream text, and ``process_page`` / ``do_Do``
    store rewritten streams in ``obj_patch``.

    Reference: PDF Reference, Appendix A, Operator Summary
    """

    # Operators (besides the "T*" family) that are never copied into the
    # returned operator text.  EI is skipped because its operand is an object
    # (only seen in a few documents that draw horizontal rules with inline
    # images); MP/DP/BMC/BDC are marked-content operators.
    _SKIPPED_OPS_WITH_ARGS = frozenset({'"', "'", "EI", "MP", "DP", "BMC", "BDC"})
    _SKIPPED_OPS_NO_ARGS = frozenset({"BI", "ID", "EMC"})

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        device: PDFDevice,
        obj_patch: Dict[Any, str],
    ) -> None:
        """Store the resource manager, device and the shared patch mapping.

        ``obj_patch`` is mutated in place: the values written by this class
        are replacement content-stream strings.
        """
        super().__init__(rsrcmgr, device)
        self.rsrcmgr = rsrcmgr
        self.device = device
        self.obj_patch = obj_patch

    def dup(self) -> "PDFPageInterpreterEx":
        """Return a new interpreter sharing rsrcmgr, device and obj_patch."""
        return self.__class__(self.rsrcmgr, self.device, self.obj_patch)

    def init_resources(self, resources: Dict[object, object]) -> None:
        """Prepare the fonts and XObjects listed in the Resource attribute.

        Override: additionally records the reverse ``font -> font id`` mapping
        in ``self.fontid`` and forces every font's ``descent`` to 0.
        """
        self.resources = resources
        self.fontmap: Dict[object, PDFFont] = {}
        self.fontid: Dict[PDFFont, object] = {}
        self.xobjmap = {}
        self.csmap: Dict[str, PDFColorSpace] = PREDEFINED_COLORSPACE.copy()
        if not resources:
            return

        def get_colorspace(spec: object) -> Optional[PDFColorSpace]:
            if isinstance(spec, list):
                name = literal_name(spec[0])
            else:
                name = literal_name(spec)
            if name == "ICCBased" and isinstance(spec, list) and len(spec) >= 2:
                return PDFColorSpace(name, stream_value(spec[1])["N"])
            elif name == "DeviceN" and isinstance(spec, list) and len(spec) >= 2:
                return PDFColorSpace(name, len(list_value(spec[1])))
            else:
                return PREDEFINED_COLORSPACE.get(name)

        for k, v in dict_value(resources).items():
            if k == "Font":
                for fontid, spec in dict_value(v).items():
                    objid = None
                    if isinstance(spec, PDFObjRef):
                        objid = spec.objid
                    spec = dict_value(spec)
                    font = self.rsrcmgr.get_font(objid, spec)
                    self.fontmap[fontid] = font
                    font.descent = 0  # hack fix descent
                    self.fontid[font] = fontid
            elif k == "ColorSpace":
                for csid, spec in dict_value(v).items():
                    colorspace = get_colorspace(resolve1(spec))
                    if colorspace is not None:
                        self.csmap[csid] = colorspace
            elif k == "ProcSet":
                self.rsrcmgr.get_procset(list_value(v))
            elif k == "XObject":
                for xobjid, xobjstrm in dict_value(v).items():
                    self.xobjmap[xobjid] = xobjstrm

    def do_S(self) -> Optional[str]:
        """Stroke path.

        Override: only a standalone, horizontal, black line (one ``m``
        followed by one ``l``) is forwarded to the device; every other path
        is discarded.

        Returns ``"n"`` when the line was forwarded.  ``execute`` serializes
        that return value in front of the operator name, so the emitted text
        becomes ``n S``.  Returns ``None`` otherwise.
        """

        def is_black(color: Color) -> bool:
            # NOTE(review): colors stored by do_SCN/do_scn come from
            # self.pop(); if that yields a list they take the scalar branch
            # here and never compare equal to 0 -- confirm this is intended.
            if isinstance(color, tuple):
                return sum(color) == 0
            return color == 0

        path = self.curpath
        if (
            len(path) == 2
            and path[0][0] == "m"
            and path[1][0] == "l"
            and apply_matrix_pt(self.ctm, path[0][-2:])[1]
            == apply_matrix_pt(self.ctm, path[1][-2:])[1]
            and is_black(self.graphicstate.scolor)
        ):
            # Standalone straight line, horizontal, black.
            self.device.paint_path(self.graphicstate, True, False, False, path)
            self.curpath = []
            return "n"
        self.curpath = []
        return None

    ############################################################
    # Override: filter out non-formula lines (F/B).  The fill operators below
    # deliberately do not call device.paint_path; they only reset the path.
    def do_f(self) -> None:
        """Fill path using nonzero winding number rule (path is discarded)."""
        self.curpath = []

    def do_F(self) -> None:
        """Fill path using nonzero winding number rule (obsolete).

        Handled exactly like ``f`` so the current path is reset and cannot
        leak into the next path.
        """
        self.do_f()

    def do_f_a(self) -> None:
        """Fill path using even-odd rule (path is discarded)."""
        self.curpath = []

    def do_B(self) -> None:
        """Fill and stroke path using nonzero winding number rule (discarded)."""
        self.curpath = []

    def do_B_a(self) -> None:
        """Fill and stroke path using even-odd rule (path is discarded)."""
        self.curpath = []

    ############################################################
    # Override: return the call arguments (SCN) so that execute() can
    # serialize them; these operators take their operands from the stack.
    def do_SCN(self) -> Sequence[PDFStackT]:
        """Set color for stroking operations and return the popped operands."""
        if self.scs:
            n = self.scs.ncomponents
        else:
            if settings.STRICT:
                raise PDFInterpreterError("No colorspace specified!")
            n = 1
        args = self.pop(n)
        self.graphicstate.scolor = cast(Color, args)
        return args

    def do_scn(self) -> Sequence[PDFStackT]:
        """Set color for nonstroking operations and return the popped operands."""
        if self.ncs:
            n = self.ncs.ncomponents
        else:
            if settings.STRICT:
                raise PDFInterpreterError("No colorspace specified!")
            n = 1
        args = self.pop(n)
        self.graphicstate.ncolor = cast(Color, args)
        return args

    def do_SC(self) -> Sequence[PDFStackT]:
        """Set color for stroking operations."""
        return self.do_SCN()

    def do_sc(self) -> Sequence[PDFStackT]:
        """Set color for nonstroking operations."""
        return self.do_scn()

    def do_Do(self, xobjid_arg: PDFStackT) -> None:
        """Invoke named XObject.

        Override: for form XObjects, the form is rendered by a duplicate
        interpreter and a replacement stream is stored in
        ``obj_patch[<xobject objid>]``.  Patch generation is best-effort: any
        failure is logged at debug level and the form is left unpatched.

        Raises:
            PDFInterpreterError: in strict mode, when the XObject id is
                unknown.
        """
        xobjid = literal_name(xobjid_arg)
        try:
            xobj = stream_value(self.xobjmap[xobjid])
        except KeyError:
            if settings.STRICT:
                raise PDFInterpreterError("Undefined xobject id: %r" % xobjid)
            return
        subtype = xobj.get("Subtype")
        if subtype is LITERAL_FORM and "BBox" in xobj:
            interpreter = self.dup()
            bbox = cast(Rect, list_value(xobj["BBox"]))
            matrix = cast(Matrix, list_value(xobj.get("Matrix", MATRIX_IDENTITY)))
            # According to PDF reference 1.7 section 4.9.1, XObjects in
            # earlier PDFs (prior to v1.2) use the page's Resources entry
            # instead of having their own Resources entry.
            xobjres = xobj.get("Resources")
            if xobjres:
                resources = dict_value(xobjres)
            else:
                resources = self.resources.copy()
            self.device.begin_figure(xobjid, bbox, matrix)
            ctm = mult_matrix(matrix, self.ctm)
            ops_base = interpreter.render_contents(
                resources,
                [xobj],
                ctm=ctm,
            )
            try:
                # Sometimes the form's fonts cannot be loaded and this breaks.
                self.device.fontid = interpreter.fontid
                self.device.fontmap = interpreter.fontmap
                ops_new = self.device.end_figure(xobjid)
                # Invert the affine CTM: linear part first, then translation.
                # Plain ndarrays are used because np.mat no longer exists in
                # NumPy 2.x.
                ctm_inv = np.linalg.inv(np.array(ctm[:4]).reshape(2, 2))
                pos_inv = -(np.array(ctm[4:]) @ ctm_inv)
                a, b, c, d = ctm_inv.reshape(4).tolist()
                e, f = pos_inv.tolist()
                self.obj_patch[self.xobjmap[xobjid].objid] = (
                    f"q {ops_base}Q {a} {b} {c} {d} {e} {f} cm {ops_new}"
                )
            except Exception:
                # Deliberately best-effort: keep going, but leave a trace.
                log.debug(
                    "Could not patch form xobject %r", xobjid, exc_info=True
                )
        elif subtype is LITERAL_IMAGE and "Width" in xobj and "Height" in xobj:
            self.device.begin_figure(xobjid, (0, 0, 1, 1), MATRIX_IDENTITY)
            self.device.render_image(xobjid, xobj)
            self.device.end_figure(xobjid)
        else:
            # unsupported xobject type.
            pass

    def process_page(self, page: PDFPage) -> None:
        """Render *page* and record its replacement streams in ``obj_patch``.

        Override: the page is positioned using its cropbox, the combined
        stream is stored under ``page.page_xref`` and every original content
        object is patched to an empty string.
        """
        (x0, y0, x1, y1) = page.cropbox
        if page.rotate == 90:
            ctm = (0, -1, 1, 0, -y0, x1)
        elif page.rotate == 180:
            ctm = (-1, 0, 0, -1, x1, y1)
        elif page.rotate == 270:
            ctm = (0, 1, -1, 0, y1, -x0)
        else:
            ctm = (1, 0, 0, 1, -x0, -y0)
        self.device.begin_page(page, ctm)
        ops_base = self.render_contents(page.resources, page.contents, ctm=ctm)
        self.device.fontid = self.fontid
        self.device.fontmap = self.fontmap
        ops_new = self.device.end_page(page)
        # Rendering above subtracts the cropbox offset to obtain real
        # coordinates, so the output adds the offset back with `cm`.
        # ops_base may contain images and the text in ops_new has to be drawn
        # on top of them; q/Q resets the position matrix in between.
        self.obj_patch[page.page_xref] = (
            f"q {ops_base}Q 1 0 0 1 {x0} {y0} cm {ops_new}"
        )
        for obj in page.contents:
            self.obj_patch[obj.objid] = ""

    def render_contents(
        self,
        resources: Dict[object, object],
        streams: Sequence[object],
        ctm: Matrix = MATRIX_IDENTITY,
    ) -> str:
        """Render the content streams and return the retained operator text.

        This method may be called recursively.
        """
        self.init_resources(resources)
        self.init_state(ctm)
        return self.execute(list_value(streams))

    @staticmethod
    def _format_operands(operands: Sequence[object]) -> str:
        """Serialize operands as space-separated content-stream text."""
        return " ".join(
            f"{x:f}" if isinstance(x, float) else str(x).replace("'", "")
            for x in operands
        )

    def execute(self, streams: Sequence[object]) -> str:
        """Run the operators in *streams* and return the retained ones as text.

        Override: every executed operator that is not filtered out (the
        ``T*`` text operators and the names in ``_SKIPPED_OPS_*``) is appended
        to the result as ``"<operands> <name> "``.  An empty page yields
        ``""``.

        Raises:
            PDFInterpreterError: in strict mode, on an unknown operator.
        """
        try:
            parser = PDFContentParser(streams)
        except PSEOF:
            # Empty page: return an empty string so that callers which
            # interpolate the result do not emit the text "None".
            return ""
        ops = []
        while True:
            try:
                (_, obj) = parser.nextobject()
            except PSEOF:
                break
            if not isinstance(obj, PSKeyword):
                self.push(obj)
                continue
            name = keyword_name(obj)
            method = "do_%s" % name.replace("*", "_a").replace('"', "_w").replace(
                "'",
                "_q",
            )
            if hasattr(self, method):
                func = getattr(self, method)
                nargs = func.__code__.co_argcount - 1
                is_text_op = name.startswith("T")
                if nargs:
                    args = self.pop(nargs)
                    if len(args) == nargs:
                        func(*args)
                        if not (is_text_op or name in self._SKIPPED_OPS_WITH_ARGS):
                            ops.append(f"{self._format_operands(args)} {name} ")
                else:
                    # Zero-argument handlers may return the operands they
                    # consumed from the stack (or a replacement prefix).
                    targs = func()
                    if targs is None:
                        targs = []
                    if not (is_text_op or name in self._SKIPPED_OPS_NO_ARGS):
                        ops.append(f"{self._format_operands(targs)} {name} ")
            elif settings.STRICT:
                error_msg = "Unknown operator: %r" % name
                raise PDFInterpreterError(error_msg)
        return "".join(ops)