Spaces:

leonsimon23
/

sciwin_translate

Running

App Files Files Community

sciwin_translate / pdf2zh /pdfinterp.py

leonsimon23

Upload 9 files

8b23ca3 verified 7 months ago

raw

history blame

13.7 kB

	import logging
	from typing import Any, Dict, Optional, Sequence, Tuple, cast
	import numpy as np

	from pdfminer import settings
	from pdfminer.pdfcolor import PREDEFINED_COLORSPACE, PDFColorSpace
	from pdfminer.pdfdevice import PDFDevice
	from pdfminer.pdfinterp import (
	PDFPageInterpreter,
	PDFResourceManager,
	PDFContentParser,
	PDFInterpreterError,
	Color,
	PDFStackT,
	LITERAL_FORM,
	LITERAL_IMAGE,
	)
	from pdfminer.pdffont import PDFFont
	from pdfminer.pdfpage import PDFPage
	from pdfminer.pdftypes import (
	PDFObjRef,
	dict_value,
	list_value,
	resolve1,
	stream_value,
	)
	from pdfminer.psexceptions import PSEOF
	from pdfminer.psparser import (
	PSKeyword,
	keyword_name,
	literal_name,
	)
	from pdfminer.utils import (
	MATRIX_IDENTITY,
	Matrix,
	Rect,
	mult_matrix,
	apply_matrix_pt,
	)

	log = logging.getLogger(__name__)


	def safe_float(o: Any) -> Optional[float]:
	    """Convert *o* to ``float``, returning ``None`` when that is impossible.

	    Args:
	        o: Any object; typically a number or a numeric string.

	    Returns:
	        ``float(o)`` on success, otherwise ``None``.
	    """
	    try:
	        return float(o)
	    except (TypeError, ValueError, OverflowError):
	        # OverflowError: integers too large for a float (e.g. 10**400) must
	        # not escape from a helper whose contract is "None on failure".
	        return None


	class PDFPageInterpreterEx(PDFPageInterpreter):
	    """Processor for the content of a PDF page.

	    While interpreting a page this subclass also rebuilds a textual operator
	    stream from the operators it executes and stores replacement content
	    strings in ``obj_patch`` (keyed by the page / XObject / content-stream
	    object ids).

	    Reference: PDF Reference, Appendix A, Operator Summary
	    """

	    def __init__(
	        self, rsrcmgr: PDFResourceManager, device: PDFDevice, obj_patch
	    ) -> None:
	        """Store the resource manager, the output device and the patch map.

	        Args:
	            rsrcmgr: Resource manager used to load fonts and procsets.
	            device: Device receiving paint/figure/page callbacks.
	            obj_patch: Mutable mapping filled with ``object id -> new content``.
	        """
	        self.rsrcmgr = rsrcmgr
	        self.device = device
	        self.obj_patch = obj_patch

	    def dup(self) -> "PDFPageInterpreterEx":
	        """Return a fresh interpreter sharing rsrcmgr, device and obj_patch."""
	        return self.__class__(self.rsrcmgr, self.device, self.obj_patch)

	    def init_resources(self, resources: Dict[object, object]) -> None:
	        """Prepare the fonts and XObjects listed in the Resource attribute.

	        Override: additionally records the reverse map ``fontid`` (font object
	        -> resource name) and forces every font's ``descent`` to 0.
	        """
	        self.resources = resources
	        self.fontmap: Dict[object, PDFFont] = {}
	        self.fontid: Dict[PDFFont, object] = {}
	        self.xobjmap = {}
	        self.csmap: Dict[str, PDFColorSpace] = PREDEFINED_COLORSPACE.copy()
	        if not resources:
	            return

	        def get_colorspace(spec: object) -> Optional[PDFColorSpace]:
	            """Build a colour space from its resource spec, or return None."""
	            if isinstance(spec, list):
	                name = literal_name(spec[0])
	            else:
	                name = literal_name(spec)
	            if name == "ICCBased" and isinstance(spec, list) and len(spec) >= 2:
	                return PDFColorSpace(name, stream_value(spec[1])["N"])
	            elif name == "DeviceN" and isinstance(spec, list) and len(spec) >= 2:
	                return PDFColorSpace(name, len(list_value(spec[1])))
	            else:
	                return PREDEFINED_COLORSPACE.get(name)

	        for k, v in dict_value(resources).items():
	            if k == "Font":
	                for fontid, spec in dict_value(v).items():
	                    objid = None
	                    if isinstance(spec, PDFObjRef):
	                        objid = spec.objid
	                    spec = dict_value(spec)
	                    font = self.rsrcmgr.get_font(objid, spec)
	                    font.descent = 0  # HACK: neutralise the font descent
	                    self.fontmap[fontid] = font
	                    self.fontid[font] = fontid
	            elif k == "ColorSpace":
	                for csid, spec in dict_value(v).items():
	                    colorspace = get_colorspace(resolve1(spec))
	                    if colorspace is not None:
	                        self.csmap[csid] = colorspace
	            elif k == "ProcSet":
	                self.rsrcmgr.get_procset(list_value(v))
	            elif k == "XObject":
	                for xobjid, xobjstrm in dict_value(v).items():
	                    self.xobjmap[xobjid] = xobjstrm

	    @staticmethod
	    def _pdf_number(value: Any) -> str:
	        """Render *value* for a content stream without exponent notation.

	        ``str(float)`` switches to forms such as ``1e-05`` for small or large
	        magnitudes, which PDF numeric syntax does not allow; those values are
	        re-rendered in fixed-point form. Everything else is ``str(value)``.
	        """
	        text = str(value)
	        if isinstance(value, float) and ("e" in text or "E" in text):
	            text = f"{value:.12f}".rstrip("0").rstrip(".")
	        return text

	    @staticmethod
	    def _join_operands(operands: Any) -> str:
	        """Serialise operator operands: floats as ``%f``, others via ``str``.

	        Single quotes are stripped from the ``str`` form of non-float operands.
	        """
	        return " ".join(
	            f"{x:f}" if isinstance(x, float) else str(x).replace("'", "")
	            for x in operands
	        )

	    def do_S(self) -> Optional[str]:
	        """Stroke path, forwarding only formula-like lines to the device.

	        Override: a path is painted only when it is a standalone straight
	        segment ("m" followed by "l") that is horizontal after applying the
	        CTM and whose stroke colour is black. Every other path is dropped.

	        Returns:
	            ``"n"`` when the line was painted (execute() writes the returned
	            value in front of the operator name), otherwise ``None``.
	        """

	        def is_black(color: Color) -> bool:
	            # do_SC/do_SCN store whatever self.pop() returned (a list in
	            # pdfminer), so lists must be recognised as well as tuples.
	            if isinstance(color, (tuple, list)):
	                try:
	                    return sum(color) == 0
	                except TypeError:
	                    # Non-numeric components (e.g. a pattern name).
	                    return False
	            return color == 0

	        path = self.curpath
	        is_formula_line = (
	            len(path) == 2
	            and path[0][0] == "m"
	            and path[1][0] == "l"
	            and apply_matrix_pt(self.ctm, path[0][-2:])[1]
	            == apply_matrix_pt(self.ctm, path[1][-2:])[1]
	            and is_black(self.graphicstate.scolor)
	        )  # standalone straight line, horizontal, black
	        if is_formula_line:
	            self.device.paint_path(self.graphicstate, True, False, False, path)
	            self.curpath = []
	            return "n"
	        self.curpath = []
	        return None

	    ############################################################
	    # Override: filter out non-formula shapes (f / F / f* / B / B*).
	    # None of these forwards the path to the device; they only reset it.
	    def do_f(self) -> None:
	        """Fill path using nonzero winding number rule"""
	        self.curpath = []

	    def do_F(self) -> None:
	        """Fill path using nonzero winding number rule (obsolete)"""
	        # "F" is the obsolete spelling of "f": it must end the current path
	        # too, otherwise stale segments leak into the next path.
	        self.do_f()

	    def do_f_a(self) -> None:
	        """Fill path using even-odd rule"""
	        self.curpath = []

	    def do_B(self) -> None:
	        """Fill and stroke path using nonzero winding number rule"""
	        self.curpath = []

	    def do_B_a(self) -> None:
	        """Fill and stroke path using even-odd rule"""
	        self.curpath = []

	    ############################################################
	    # Override: the colour operators return the operands they consumed so
	    # that execute() can write them back into the operator stream.
	    def do_SCN(self) -> list:
	        """Set color for stroking operations.

	        Returns:
	            The operands popped from the argument stack.

	        Raises:
	            PDFInterpreterError: In strict mode when no colour space is set.
	        """
	        if self.scs:
	            n = self.scs.ncomponents
	        else:
	            if settings.STRICT:
	                raise PDFInterpreterError("No colorspace specified!")
	            n = 1
	        args = self.pop(n)
	        self.graphicstate.scolor = cast(Color, args)
	        return args

	    def do_scn(self) -> list:
	        """Set color for nonstroking operations.

	        Returns:
	            The operands popped from the argument stack.

	        Raises:
	            PDFInterpreterError: In strict mode when no colour space is set.
	        """
	        if self.ncs:
	            n = self.ncs.ncomponents
	        else:
	            if settings.STRICT:
	                raise PDFInterpreterError("No colorspace specified!")
	            n = 1
	        args = self.pop(n)
	        self.graphicstate.ncolor = cast(Color, args)
	        return args

	    def do_SC(self) -> list:
	        """Set color for stroking operations"""
	        return self.do_SCN()

	    def do_sc(self) -> list:
	        """Set color for nonstroking operations"""
	        return self.do_scn()

	    def do_Do(self, xobjid_arg: PDFStackT) -> None:
	        """Invoke named XObject.

	        Override: for form XObjects the rebuilt operator stream is stored in
	        ``obj_patch`` under the XObject's object id.

	        Raises:
	            PDFInterpreterError: In strict mode when the XObject is undefined.
	        """
	        xobjid = literal_name(xobjid_arg)
	        try:
	            xobj = stream_value(self.xobjmap[xobjid])
	        except KeyError:
	            if settings.STRICT:
	                raise PDFInterpreterError("Undefined xobject id: %r" % xobjid)
	            return
	        subtype = xobj.get("Subtype")
	        if subtype is LITERAL_FORM and "BBox" in xobj:
	            interpreter = self.dup()
	            bbox = cast(Rect, list_value(xobj["BBox"]))
	            matrix = cast(Matrix, list_value(xobj.get("Matrix", MATRIX_IDENTITY)))
	            # According to PDF reference 1.7 section 4.9.1, XObjects in
	            # earlier PDFs (prior to v1.2) use the page's Resources entry
	            # instead of having their own Resources entry.
	            xobjres = xobj.get("Resources")
	            if xobjres:
	                resources = dict_value(xobjres)
	            else:
	                resources = self.resources.copy()
	            self.device.begin_figure(xobjid, bbox, matrix)
	            ctm = mult_matrix(matrix, self.ctm)
	            ops_base = interpreter.render_contents(
	                resources,
	                [xobj],
	                ctm=ctm,
	            )
	            # Best effort: sometimes the form's fonts cannot be loaded and
	            # this section fails; the form is then left unpatched.
	            try:
	                self.device.fontid = interpreter.fontid
	                self.device.fontmap = interpreter.fontmap
	                ops_new = self.device.end_figure(xobjid)
	                # Invert the CTM so ops_new can be emitted behind a "cm".
	                # Plain ndarrays are used because np.mat no longer exists in
	                # NumPy 2.x.
	                ctm_inv = np.linalg.inv(np.array(ctm[:4]).reshape(2, 2))
	                pos_inv = -(np.array(ctm[4:]) @ ctm_inv)
	                coeffs = ctm_inv.reshape(4).tolist() + pos_inv.tolist()
	                cm_args = " ".join(self._pdf_number(v) for v in coeffs)
	                self.obj_patch[self.xobjmap[xobjid].objid] = (
	                    f"q {ops_base}Q {cm_args} cm {ops_new}"
	                )
	            except Exception:
	                # Keep going, but make the failure visible.
	                log.warning(
	                    "Could not patch form XObject %r", xobjid, exc_info=True
	                )
	        elif subtype is LITERAL_IMAGE and "Width" in xobj and "Height" in xobj:
	            self.device.begin_figure(xobjid, (0, 0, 1, 1), MATRIX_IDENTITY)
	            self.device.render_image(xobjid, xobj)
	            self.device.end_figure(xobjid)
	        else:
	            # unsupported xobject type.
	            pass

	    def process_page(self, page: PDFPage) -> None:
	        """Interpret *page* and record its patched content in ``obj_patch``.

	        Override: the page's rebuilt stream is stored under
	        ``page.page_xref`` and every original content stream is replaced by
	        an empty string.
	        """
	        (x0, y0, x1, y1) = page.cropbox
	        if page.rotate == 90:
	            ctm = (0, -1, 1, 0, -y0, x1)
	        elif page.rotate == 180:
	            ctm = (-1, 0, 0, -1, x1, y1)
	        elif page.rotate == 270:
	            ctm = (0, 1, -1, 0, y1, -x0)
	        else:
	            ctm = (1, 0, 0, 1, -x0, -y0)
	        self.device.begin_page(page, ctm)
	        ops_base = self.render_contents(page.resources, page.contents, ctm=ctm)
	        self.device.fontid = self.fontid
	        self.device.fontmap = self.fontmap
	        ops_new = self.device.end_page(page)
	        # Rendering above subtracts the crop-box offset to obtain real
	        # coordinates, so the offset is added back here with "cm".
	        # ops_base may contain images and the text in ops_new has to be drawn
	        # on top of them; q/Q resets the position matrix in between.
	        offset = f"{self._pdf_number(x0)} {self._pdf_number(y0)}"
	        self.obj_patch[page.page_xref] = (
	            f"q {ops_base}Q 1 0 0 1 {offset} cm {ops_new}"
	        )
	        for obj in page.contents:
	            self.obj_patch[obj.objid] = ""

	    def render_contents(
	        self,
	        resources: Dict[object, object],
	        streams: Sequence[object],
	        ctm: Matrix = MATRIX_IDENTITY,
	    ) -> str:
	        """Render the content streams.

	        This method may be called recursively.

	        Override: returns the operator stream produced by ``execute``.
	        """
	        self.init_resources(resources)
	        self.init_state(ctm)
	        return self.execute(list_value(streams))

	    def execute(self, streams: Sequence[object]) -> str:
	        """Run every operator in *streams* and return the rebuilt stream.

	        Override: each executed operator is written back as
	        ``"<operands> <name> "`` unless it is filtered out.

	        Returns:
	            The concatenated operator text; ``""`` for an empty stream, so
	            callers can interpolate it safely.

	        Raises:
	            PDFInterpreterError: In strict mode for an unknown operator.
	        """
	        ops = []
	        try:
	            parser = PDFContentParser(streams)
	        except PSEOF:
	            # empty page
	            return ""
	        while True:
	            try:
	                (_, obj) = parser.nextobject()
	            except PSEOF:
	                break
	            if not isinstance(obj, PSKeyword):
	                self.push(obj)
	                continue
	            name = keyword_name(obj)
	            method = "do_%s" % name.replace("*", "_a").replace('"', "_w").replace(
	                "'",
	                "_q",
	            )
	            if not hasattr(self, method):
	                if settings.STRICT:
	                    error_msg = "Unknown operator: %r" % name
	                    raise PDFInterpreterError(error_msg)
	                continue
	            func = getattr(self, method)
	            nargs = func.__code__.co_argcount - 1
	            if nargs:
	                args = self.pop(nargs)
	                if len(args) != nargs:
	                    continue
	                func(*args)
	                # Filter the T-series text operators and the marked-content
	                # operators. EI is filtered as well because its argument is
	                # an object (only used to draw horizontal rules in a few
	                # documents).
	                if name[0] == "T" or name in (
	                    '"',
	                    "'",
	                    "EI",
	                    "MP",
	                    "DP",
	                    "BMC",
	                    "BDC",
	                ):
	                    continue
	                operands = args
	            else:
	                targs = func()
	                if name[0] == "T" or name in ("BI", "ID", "EMC"):
	                    continue
	                # Zero-argument handlers may return operands to write back.
	                operands = [] if targs is None else targs
	            ops.append(f"{self._join_operands(operands)} {name} ")
	        return "".join(ops)