# sciwin_translate / pdf2zh / pdfinterp.py
# leonsimon23's picture
# Upload 9 files
# 8b23ca3 verified
import logging
from typing import Any, Dict, Optional, Sequence, Tuple, cast
import numpy as np
from pdfminer import settings
from pdfminer.pdfcolor import PREDEFINED_COLORSPACE, PDFColorSpace
from pdfminer.pdfdevice import PDFDevice
from pdfminer.pdfinterp import (
PDFPageInterpreter,
PDFResourceManager,
PDFContentParser,
PDFInterpreterError,
Color,
PDFStackT,
LITERAL_FORM,
LITERAL_IMAGE,
)
from pdfminer.pdffont import PDFFont
from pdfminer.pdfpage import PDFPage
from pdfminer.pdftypes import (
PDFObjRef,
dict_value,
list_value,
resolve1,
stream_value,
)
from pdfminer.psexceptions import PSEOF
from pdfminer.psparser import (
PSKeyword,
keyword_name,
literal_name,
)
from pdfminer.utils import (
MATRIX_IDENTITY,
Matrix,
Rect,
mult_matrix,
apply_matrix_pt,
)
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
def safe_float(o: Any) -> Optional[float]:
    """Convert *o* to ``float``, returning ``None`` when that is not possible.

    Args:
        o: Any object; typically a number or a numeric string.

    Returns:
        ``float(o)``, or ``None`` if the conversion raises ``TypeError``
        (unsupported type), ``ValueError`` (unparsable text) or
        ``OverflowError`` (an integer too large to represent as a float).
    """
    try:
        return float(o)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: e.g. float(10**400); a "safe" conversion must not
        # let it escape to the caller.
        return None
class PDFPageInterpreterEx(PDFPageInterpreter):
    """Processor for the content of a PDF page.

    Extends ``PDFPageInterpreter`` so that interpreting a content stream also
    returns a re-serialised copy of the executed operators, and so that
    replacement content for pages and form XObjects is stored in
    ``obj_patch``.

    Reference: PDF Reference, Appendix A, Operator Summary
    """

    def __init__(
        self, rsrcmgr: PDFResourceManager, device: PDFDevice, obj_patch
    ) -> None:
        """Store the collaborators used while interpreting.

        Args:
            rsrcmgr: Resource manager used to load fonts and procsets.
            device: Device that receives the drawing callbacks.
            obj_patch: Mapping written as ``obj_patch[key] = text``; keys are
                the page xref / object ids used below, values are the
                replacement content-stream text.
        """
        self.rsrcmgr = rsrcmgr
        self.device = device
        self.obj_patch = obj_patch

    def dup(self) -> "PDFPageInterpreterEx":
        """Return a new interpreter sharing this one's manager, device and patch map."""
        return self.__class__(self.rsrcmgr, self.device, self.obj_patch)

    def init_resources(self, resources: Dict[object, object]) -> None:
        """Prepare the fonts and XObjects listed in the Resource attribute.

        Override: additionally fills ``self.fontid`` (font object -> resource
        name) and forces ``descent`` of every loaded font to 0.
        """
        self.resources = resources
        self.fontmap: Dict[object, PDFFont] = {}
        self.fontid: Dict[PDFFont, object] = {}
        self.xobjmap = {}
        self.csmap: Dict[str, PDFColorSpace] = PREDEFINED_COLORSPACE.copy()
        if not resources:
            return

        def get_colorspace(spec: object) -> Optional[PDFColorSpace]:
            """Build a colour space from a resolved ColorSpace entry, if known."""
            if isinstance(spec, list):
                name = literal_name(spec[0])
            else:
                name = literal_name(spec)
            if name == "ICCBased" and isinstance(spec, list) and len(spec) >= 2:
                return PDFColorSpace(name, stream_value(spec[1])["N"])
            elif name == "DeviceN" and isinstance(spec, list) and len(spec) >= 2:
                return PDFColorSpace(name, len(list_value(spec[1])))
            else:
                return PREDEFINED_COLORSPACE.get(name)

        for k, v in dict_value(resources).items():
            if k == "Font":
                for fontid, spec in dict_value(v).items():
                    objid = None
                    if isinstance(spec, PDFObjRef):
                        objid = spec.objid
                    spec = dict_value(spec)
                    font = self.rsrcmgr.get_font(objid, spec)
                    font.descent = 0  # HACK: fix descent
                    self.fontmap[fontid] = font
                    self.fontid[font] = fontid
            elif k == "ColorSpace":
                for csid, spec in dict_value(v).items():
                    colorspace = get_colorspace(resolve1(spec))
                    if colorspace is not None:
                        self.csmap[csid] = colorspace
            elif k == "ProcSet":
                self.rsrcmgr.get_procset(list_value(v))
            elif k == "XObject":
                for xobjid, xobjstrm in dict_value(v).items():
                    self.xobjmap[xobjid] = xobjstrm

    def do_S(self) -> Optional[str]:
        """Stroke path.

        Override: filter out non-formula lines. Only a path made of exactly
        one ``m`` + ``l`` segment that is horizontal after applying the CTM
        and whose stroking colour is black is forwarded to the device.

        Returns:
            ``"n"`` when the line was forwarded to the device, ``None``
            otherwise. ``execute`` serialises the returned value in front of
            the operator name, so a forwarded line is recorded as ``n S``.
        """

        def is_black(color: Color) -> bool:
            # NOTE(review): colours set via SC/SCN are stored as whatever
            # ``self.pop`` returns; a non-tuple sequence takes the scalar
            # branch below - confirm that this is intended.
            if isinstance(color, tuple):
                return sum(color) == 0
            return color == 0

        path = self.curpath
        is_formula_line = (
            len(path) == 2
            and path[0][0] == "m"
            and path[1][0] == "l"
            and apply_matrix_pt(self.ctm, path[0][-2:])[1]
            == apply_matrix_pt(self.ctm, path[1][-2:])[1]
            and is_black(self.graphicstate.scolor)
        )  # standalone straight line, horizontal, black
        if is_formula_line:
            self.device.paint_path(self.graphicstate, True, False, False, path)
            self.curpath = []
            return "n"
        self.curpath = []
        return None

    ############################################################
    # Override: filter out non-formula lines (F/B). The fill operators below
    # only discard the current path; the device is deliberately not notified.
    def do_f(self) -> None:
        """Fill path using nonzero winding number rule"""
        self.curpath = []

    def do_F(self) -> None:
        """Fill path using nonzero winding number rule (obsolete)"""
        # Same handling as ``f``: the path must be discarded, otherwise its
        # segments leak into the next path that is constructed.
        self.do_f()

    def do_f_a(self) -> None:
        """Fill path using even-odd rule"""
        self.curpath = []

    def do_B(self) -> None:
        """Fill and stroke path using nonzero winding number rule"""
        self.curpath = []

    def do_B_a(self) -> None:
        """Fill and stroke path using even-odd rule"""
        self.curpath = []

    ############################################################
    # Override: return the call arguments (SCN)
    def do_SCN(self) -> Sequence[object]:
        """Set color for stroking operations.

        Returns:
            The operands popped from the stack, so that ``execute`` can
            write them back into the recorded operator stream.

        Raises:
            PDFInterpreterError: in strict mode, when no stroking colour
                space has been selected.
        """
        if self.scs:
            n = self.scs.ncomponents
        else:
            if settings.STRICT:
                raise PDFInterpreterError("No colorspace specified!")
            n = 1
        args = self.pop(n)
        self.graphicstate.scolor = cast(Color, args)
        return args

    def do_scn(self) -> Sequence[object]:
        """Set color for nonstroking operations.

        Returns:
            The operands popped from the stack, so that ``execute`` can
            write them back into the recorded operator stream.

        Raises:
            PDFInterpreterError: in strict mode, when no nonstroking colour
                space has been selected.
        """
        if self.ncs:
            n = self.ncs.ncomponents
        else:
            if settings.STRICT:
                raise PDFInterpreterError("No colorspace specified!")
            n = 1
        args = self.pop(n)
        self.graphicstate.ncolor = cast(Color, args)
        return args

    def do_SC(self) -> Sequence[object]:
        """Set color for stroking operations"""
        return self.do_SCN()

    def do_sc(self) -> Sequence[object]:
        """Set color for nonstroking operations"""
        return self.do_scn()

    def do_Do(self, xobjid_arg: PDFStackT) -> None:
        """Invoke named XObject.

        Override: for form XObjects, additionally store a patched content
        stream for the XObject in ``obj_patch``.

        Raises:
            PDFInterpreterError: in strict mode, when the XObject name is
                not present in the current resources.
        """
        xobjid = literal_name(xobjid_arg)
        try:
            xobj = stream_value(self.xobjmap[xobjid])
        except KeyError:
            if settings.STRICT:
                raise PDFInterpreterError("Undefined xobject id: %r" % xobjid)
            return
        subtype = xobj.get("Subtype")
        if subtype is LITERAL_FORM and "BBox" in xobj:
            interpreter = self.dup()
            bbox = cast(Rect, list_value(xobj["BBox"]))
            matrix = cast(Matrix, list_value(xobj.get("Matrix", MATRIX_IDENTITY)))
            # According to PDF reference 1.7 section 4.9.1, XObjects in
            # earlier PDFs (prior to v1.2) use the page's Resources entry
            # instead of having their own Resources entry.
            xobjres = xobj.get("Resources")
            if xobjres:
                resources = dict_value(xobjres)
            else:
                resources = self.resources.copy()
            self.device.begin_figure(xobjid, bbox, matrix)
            ctm = mult_matrix(matrix, self.ctm)
            ops_base = interpreter.render_contents(
                resources,
                [xobj],
                ctm=ctm,
            )
            # Best effort: sometimes the form's fonts cannot be loaded and
            # this step breaks; the XObject is then simply left unpatched.
            try:
                self.device.fontid = interpreter.fontid
                self.device.fontmap = interpreter.fontmap
                ops_new = self.device.end_figure(xobjid)
                # Invert the affine CTM (linear part and translation) so the
                # appended operators can be prefixed with the inverse ``cm``.
                # Plain ndarrays are used: ``np.mat`` was removed in NumPy 2.0.
                ctm_inv = np.linalg.inv(np.array(ctm[:4]).reshape(2, 2))
                pos_inv = -np.array(ctm[4:]) @ ctm_inv
                a, b, c, d = ctm_inv.reshape(4).tolist()
                e, f = pos_inv.tolist()
                self.obj_patch[self.xobjmap[xobjid].objid] = (
                    f"q {ops_base}Q {a} {b} {c} {d} {e} {f} cm {ops_new}"
                )
            except Exception as exc:
                log.warning("Skipping patch for form XObject %r: %s", xobjid, exc)
        elif subtype is LITERAL_IMAGE and "Width" in xobj and "Height" in xobj:
            self.device.begin_figure(xobjid, (0, 0, 1, 1), MATRIX_IDENTITY)
            self.device.render_image(xobjid, xobj)
            self.device.end_figure(xobjid)
        else:
            # unsupported xobject type.
            pass

    def process_page(self, page: PDFPage) -> None:
        """Interpret *page* and store its patched content in ``obj_patch``.

        Override: the recorded operator stream plus the operators returned
        by ``device.end_page`` are stored under ``page.page_xref``; every
        original content object of the page is replaced by an empty string.
        """
        (x0, y0, x1, y1) = page.cropbox
        if page.rotate == 90:
            ctm = (0, -1, 1, 0, -y0, x1)
        elif page.rotate == 180:
            ctm = (-1, 0, 0, -1, x1, y1)
        elif page.rotate == 270:
            ctm = (0, 1, -1, 0, y1, -x0)
        else:
            ctm = (1, 0, 0, 1, -x0, -y0)
        self.device.begin_page(page, ctm)
        ops_base = self.render_contents(page.resources, page.contents, ctm=ctm)
        self.device.fontid = self.fontid
        self.device.fontmap = self.fontmap
        ops_new = self.device.end_page(page)
        # Rendering above subtracts the cropbox offset to obtain the real
        # coordinates, so the output must add the page offset back with cm.
        # ops_base may contain images and the text in ops_new has to be drawn
        # on top of them; q/Q resets the transformation matrix.
        self.obj_patch[page.page_xref] = (
            f"q {ops_base}Q 1 0 0 1 {x0} {y0} cm {ops_new}"
        )
        for obj in page.contents:
            self.obj_patch[obj.objid] = ""

    def render_contents(
        self,
        resources: Dict[object, object],
        streams: Sequence[object],
        ctm: Matrix = MATRIX_IDENTITY,
    ) -> str:
        """Render the content streams.

        This method may be called recursively.

        Override: returns the operator stream recorded by ``execute``.
        """
        self.init_resources(resources)
        self.init_state(ctm)
        return self.execute(list_value(streams))

    @staticmethod
    def _format_operands(operands) -> str:
        """Serialise operands as space-separated content-stream tokens.

        Floats are written in fixed-point notation; every other operand is
        written as ``str(x)`` with single quotes removed.
        """
        return " ".join(
            f"{x:f}" if isinstance(x, float) else str(x).replace("'", "")
            for x in operands
        )

    def execute(self, streams: Sequence[object]) -> str:
        """Execute the operators in *streams* and return the recorded stream.

        Override: every executed operator, except the filtered ones listed
        below, is appended to the result as ``"<operands> <name> "``.

        Returns:
            The recorded operator text; an empty string for an empty page,
            so callers can always interpolate the result into a stream.

        Raises:
            PDFInterpreterError: in strict mode, on an unknown operator.
        """
        try:
            parser = PDFContentParser(streams)
        except PSEOF:
            # empty page
            return ""
        # Filter the T* text operators. EI takes an object as its argument so
        # it is filtered as well (it is only used by a few documents to draw
        # horizontal lines). The marked-content operators are filtered too.
        skipped_with_args = ('"', "'", "EI", "MP", "DP", "BMC", "BDC")
        skipped_without_args = ("BI", "ID", "EMC")
        recorded = []
        while True:
            try:
                (_, obj) = parser.nextobject()
            except PSEOF:
                break
            if not isinstance(obj, PSKeyword):
                # Operand: keep it on the stack for the next operator.
                self.push(obj)
                continue
            name = keyword_name(obj)
            method = "do_%s" % name.replace("*", "_a").replace('"', "_w").replace(
                "'",
                "_q",
            )
            if not hasattr(self, method):
                if settings.STRICT:
                    error_msg = "Unknown operator: %r" % name
                    raise PDFInterpreterError(error_msg)
                continue
            func = getattr(self, method)
            # Number of operands the handler expects, excluding ``self``.
            nargs = func.__code__.co_argcount - 1
            if nargs:
                args = self.pop(nargs)
                if len(args) == nargs:
                    func(*args)
                    if not (name[0] == "T" or name in skipped_with_args):
                        recorded.append(f"{self._format_operands(args)} {name} ")
            else:
                # Handlers without operands may return the values to record
                # in front of the operator name.
                targs = func()
                if targs is None:
                    targs = []
                if not (name[0] == "T" or name in skipped_without_args):
                    recorded.append(f"{self._format_operands(targs)} {name} ")
        return "".join(recorded)