Spaces:

sanbo1200
/

PDFTranslate

Running

sanbo

update sth. at 2024-11-26 16:15:47

9b0f4a0 8 months ago

15.4 kB

	"""Adobe character mapping (CMap) support.

	CMaps provide the mapping from character codes to character ids (CIDs),
	and Unicode maps provide the mapping from CIDs to Unicode code points.

	More information is available on:

	https://github.com/adobe-type-tools/cmap-resources

	"""

	import gzip
	import logging
	import os
	import os.path
	import pickle as pickle
	import struct
	import sys
	from typing import (
	Any,
	BinaryIO,
	Dict,
	Iterable,
	Iterator,
	List,
	MutableMapping,
	Optional,
	Set,
	TextIO,
	Tuple,
	Union,
	cast,
	)

	from pdf2zh.encodingdb import name2unicode
	from pdf2zh.pdfexceptions import PDFException, PDFTypeError
	from pdf2zh.psexceptions import PSEOF, PSSyntaxError
	from pdf2zh.psparser import KWD, PSKeyword, PSLiteral, PSStackParser, literal_name
	from pdf2zh.utils import choplist, nunpack

	# Module-level logger; CMapParser._warn_once reports malformed CMap data through it.
	log = logging.getLogger(__name__)


	class CMapError(PDFException):
	    """Base class for CMap-related errors defined in this module."""


	class CMapBase:
	    """Common interface shared by every character map in this module.

	    The keyword arguments given at construction time are kept in ``attrs``
	    (for example ``WMode`` and ``CMapName``).  The ``add_*`` and ``use_cmap``
	    hooks are no-ops here; subclasses override the ones they support.
	    """

	    debug = 0

	    def __init__(self, **kwargs: object) -> None:
	        # Keep a private dict so later set_attr() calls stay local to this map.
	        self.attrs: MutableMapping[str, object] = dict(kwargs)

	    def is_vertical(self) -> bool:
	        """Return True when the ``WMode`` attribute is present and non-zero."""
	        writing_mode = self.attrs.get("WMode", 0)
	        return writing_mode != 0

	    def set_attr(self, k: str, v: object) -> None:
	        """Store attribute *k* with value *v*, replacing any previous value."""
	        self.attrs[k] = v

	    def add_code2cid(self, code: str, cid: int) -> None:
	        """Hook for registering a code -> CID entry; ignored by default."""
	        return None

	    def add_cid2unichr(self, cid: int, code: Union[PSLiteral, bytes, int]) -> None:
	        """Hook for registering a CID -> Unicode entry; ignored by default."""
	        return None

	    def use_cmap(self, cmap: "CMapBase") -> None:
	        """Hook for inheriting entries from another map; ignored by default."""
	        return None

	    def decode(self, code: bytes) -> Iterable[int]:
	        """Translate *code* into CIDs.  Must be provided by subclasses.

	        Raises:
	            NotImplementedError: always, in this base class.
	        """
	        raise NotImplementedError


	class CMap(CMapBase):
	    """Code -> CID map stored as a nested dict keyed by successive byte values.

	    In ``code2cid`` a value is either an ``int`` (the CID reached by the
	    bytes consumed so far) or another dict holding the continuation for
	    longer codes.
	    """

	    def __init__(self, **kwargs: Union[str, int]) -> None:
	        CMapBase.__init__(self, **kwargs)
	        self.code2cid: Dict[int, object] = {}

	    def __repr__(self) -> str:
	        cmap_name = self.attrs.get("CMapName")
	        return "<CMap: %s>" % cmap_name

	    def use_cmap(self, cmap: CMapBase) -> None:
	        """Copy every entry of *cmap* into this map.

	        Nested dicts are rebuilt rather than shared, so the two maps do not
	        alias each other's sub-tables afterwards.  A sub-table copied from
	        *cmap* replaces whatever this map already held under the same key.
	        """
	        assert isinstance(cmap, CMap), str(type(cmap))

	        def clone_into(target: Dict[int, object], source: Dict[int, object]) -> None:
	            for key, value in source.items():
	                if not isinstance(value, dict):
	                    target[key] = value
	                    continue
	                branch: Dict[int, object] = {}
	                target[key] = branch
	                clone_into(branch, value)

	        clone_into(self.code2cid, cmap.code2cid)

	    def decode(self, code: bytes) -> Iterator[int]:
	        """Yield the CID of every code found while walking *code* byte by byte.

	        A byte with no entry at the current level restarts the walk at the
	        top-level table and is otherwise dropped.
	        """
	        node = self.code2cid
	        for byte in code:
	            if byte not in node:
	                node = self.code2cid
	                continue
	            entry = node[byte]
	            if isinstance(entry, int):
	                yield entry
	                node = self.code2cid
	            else:
	                # Intermediate byte of a longer code: descend one level.
	                node = cast(Dict[int, object], entry)

	    def dump(
	        self,
	        out: TextIO = sys.stdout,
	        code2cid: Optional[Dict[int, object]] = None,
	        code: Tuple[int, ...] = (),
	    ) -> None:
	        """Write one ``code ... = cid ...`` line per entry to *out*, sorted by key.

	        *code2cid* and *code* carry the current sub-table and the bytes
	        consumed so far during recursion; callers normally omit both.
	        """
	        if code2cid is None:
	            code2cid = self.code2cid
	            code = ()
	        for key, value in sorted(code2cid.items()):
	            path = code + (key,)
	            if not isinstance(value, int):
	                self.dump(out=out, code2cid=cast(Dict[int, object], value), code=path)
	                continue
	            out.write("code %r = cid %d\n" % (path, value))


	class IdentityCMap(CMapBase):
	    """CMap in which every big-endian 16-bit code is its own CID."""

	    def decode(self, code: bytes) -> Tuple[int, ...]:
	        """Split *code* into unsigned big-endian 16-bit values.

	        Args:
	            code: raw string bytes.

	        Returns:
	            One integer per complete 2-byte pair, or ``()`` when *code*
	            holds fewer than two bytes.
	        """
	        n = len(code) // 2
	        if not n:
	            return ()
	        # struct.unpack requires the buffer length to match the format
	        # exactly, so drop a dangling odd byte instead of raising
	        # struct.error on malformed strings.
	        return struct.unpack(">%dH" % n, code[: n * 2])


	class IdentityCMapByte(IdentityCMap):
	    """Identity map for single-byte codes: every byte is its own CID."""

	    def decode(self, code: bytes) -> Tuple[int, ...]:
	        """Return each byte of *code* as an unsigned integer (``()`` if empty)."""
	        size = len(code)
	        return struct.unpack(">%dB" % size, code) if size else ()


	class UnicodeMap(CMapBase):
	    """CID -> Unicode string map backed by the ``cid2unichr`` dict."""

	    def __init__(self, **kwargs: Union[str, int]) -> None:
	        CMapBase.__init__(self, **kwargs)
	        self.cid2unichr: Dict[int, str] = {}

	    def __repr__(self) -> str:
	        cmap_name = self.attrs.get("CMapName")
	        return "<UnicodeMap: %s>" % cmap_name

	    def get_unichr(self, cid: int) -> str:
	        """Return the string registered for *cid*.

	        Raises:
	            KeyError: if *cid* has no entry.
	        """
	        return self.cid2unichr[cid]

	    def dump(self, out: TextIO = sys.stdout) -> None:
	        """Write one ``cid ... = unicode ...`` line per entry to *out*, sorted by CID."""
	        table = self.cid2unichr
	        for cid in sorted(table):
	            out.write("cid %d = unicode %r\n" % (cid, table[cid]))


	class IdentityUnicodeMap(UnicodeMap):
	    """Unicode map that ignores ``cid2unichr`` and treats the CID as a code point."""

	    def get_unichr(self, cid: int) -> str:
	        """Return the character whose Unicode code point equals *cid*."""
	        character = chr(cid)
	        return character


	class FileCMap(CMap):
	    """CMap whose ``code2cid`` table is filled one entry at a time."""

	    def add_code2cid(self, code: str, cid: int) -> None:
	        """Register *cid* under the code whose successive units are the chars of *code*.

	        Every character but the last selects (creating it when missing) a
	        nested table keyed by ``ord(char)``; the last character keys the
	        CID itself.
	        """
	        assert isinstance(code, str) and isinstance(cid, int), str(
	            (type(code), type(cid)),
	        )
	        node = self.code2cid
	        for char in code[:-1]:
	            key = ord(char)
	            if key not in node:
	                node[key] = {}
	            node = cast(Dict[int, object], node[key])
	        node[ord(code[-1])] = cid


	class FileUnicodeMap(UnicodeMap):
	    """Unicode map whose ``cid2unichr`` table is filled one entry at a time."""

	    def add_cid2unichr(self, cid: int, code: Union[PSLiteral, bytes, int]) -> None:
	        """Register the text for *cid*, derived from *code* according to its type.

	        * ``PSLiteral`` - its name is passed to ``name2unicode`` (glyph name).
	        * ``bytes`` - decoded as UTF-16BE, undecodable input being ignored.
	        * ``int`` - taken as a Unicode code point.

	        Raises:
	            PDFTypeError: if *code* is none of the above.
	        """
	        assert isinstance(cid, int), str(type(cid))
	        if isinstance(code, PSLiteral):
	            # Interpret as an Adobe glyph name.
	            assert isinstance(code.name, str)
	            text = name2unicode(code.name)
	        elif isinstance(code, bytes):
	            # Interpret as UTF-16BE.
	            text = code.decode("UTF-16BE", "ignore")
	        elif isinstance(code, int):
	            text = chr(code)
	        else:
	            raise PDFTypeError(code)

	        # A0 = non-breaking space, some weird fonts can have a collision on a
	        # cid here; an existing plain space is kept in that case.
	        keeps_plain_space = text == "\u00a0" and self.cid2unichr.get(cid) == " "
	        if not keeps_plain_space:
	            self.cid2unichr[cid] = text


	class PyCMap(CMap):
	    """CMap backed by a preloaded data object (see ``CMapDB._load_data``)."""

	    def __init__(self, name: str, module: Any) -> None:
	        """Adopt ``module.CODE2CID`` as the table and mark the map vertical
	        when ``module.IS_VERTICAL`` is truthy.
	        """
	        super().__init__(CMapName=name)
	        # The table is shared with *module*, not copied.
	        self.code2cid = module.CODE2CID
	        vertical = module.IS_VERTICAL
	        if vertical:
	            self.attrs["WMode"] = 1


	class PyUnicodeMap(UnicodeMap):
	    """Unicode map backed by a preloaded data object (see ``CMapDB._load_data``)."""

	    def __init__(self, name: str, module: Any, vertical: bool) -> None:
	        """Adopt the horizontal or vertical table of *module*.

	        With *vertical* true, ``module.CID2UNICHR_V`` is used and ``WMode``
	        is set to 1; otherwise ``module.CID2UNICHR_H`` is used.
	        """
	        super().__init__(CMapName=name)
	        if not vertical:
	            self.cid2unichr = module.CID2UNICHR_H
	            return
	        self.cid2unichr = module.CID2UNICHR_V
	        self.attrs["WMode"] = 1


	class CMapDB:
	    """Loader and class-level cache for predefined CMap data files.

	    Data files are gzip-compressed pickles named ``<name>.pickle.gz``.  They
	    are looked up first in the directory given by the ``CMAP_PATH``
	    environment variable (default ``/usr/share/pdf2zh/``) and then in the
	    ``cmap`` directory next to this module.
	    """

	    # Caches are class attributes, so they are shared by all callers.
	    _cmap_cache: Dict[str, PyCMap] = {}
	    _umap_cache: Dict[str, List[PyUnicodeMap]] = {}

	    class CMapNotFound(CMapError):
	        """Raised when no data file exists for the requested CMap name."""

	    @classmethod
	    def _load_data(cls, name: str) -> Any:
	        """Load the data file for *name* and return it as an ad-hoc class.

	        The unpickled dict becomes the namespace of a new class named after
	        *name*, so its keys are reachable as attributes.

	        Raises:
	            CMapDB.CMapNotFound: if no search directory holds the file, or
	                if *name* would resolve outside every search directory.
	        """
	        name = name.replace("\0", "")
	        filename = "%s.pickle.gz" % name
	        # log.debug("loading: %r", name)
	        cmap_paths = (
	            os.environ.get("CMAP_PATH", "/usr/share/pdf2zh/"),
	            os.path.join(os.path.dirname(__file__), "cmap"),
	        )
	        for directory in cmap_paths:
	            path = os.path.join(directory, filename)
	            # *name* can originate from a PDF ("usecmap"), so it is untrusted:
	            # reject "../" sequences and absolute names that would make the
	            # joined path leave the search directory.
	            root = os.path.join(os.path.abspath(directory), "")
	            if not os.path.abspath(path).startswith(root):
	                continue
	            if os.path.exists(path):
	                # NOTE(review): pickle executes arbitrary code on load; this is
	                # only safe while the CMap directories hold trusted files.
	                with gzip.open(path) as gzfile:
	                    return type(str(name), (), pickle.loads(gzfile.read()))
	        raise CMapDB.CMapNotFound(name)

	    @classmethod
	    def get_cmap(cls, name: str) -> CMapBase:
	        """Return the CMap called *name*.

	        The four identity names yield a fresh identity map on every call;
	        any other name is loaded from disk once and then served from the
	        cache.

	        Raises:
	            CMapDB.CMapNotFound: if the data file cannot be found.
	        """
	        if name == "Identity-H":
	            return IdentityCMap(WMode=0)
	        elif name == "Identity-V":
	            return IdentityCMap(WMode=1)
	        elif name == "OneByteIdentityH":
	            return IdentityCMapByte(WMode=0)
	        elif name == "OneByteIdentityV":
	            return IdentityCMapByte(WMode=1)
	        try:
	            return cls._cmap_cache[name]
	        except KeyError:
	            pass
	        data = cls._load_data(name)
	        cls._cmap_cache[name] = cmap = PyCMap(name, data)
	        return cmap

	    @classmethod
	    def get_unicode_map(cls, name: str, vertical: bool = False) -> UnicodeMap:
	        """Return the CID -> Unicode map for *name*.

	        Both the horizontal and the vertical variant are built on first use
	        (from the ``to-unicode-<name>`` data file) and cached as a two-item
	        list indexed by *vertical*.

	        Raises:
	            CMapDB.CMapNotFound: if the data file cannot be found.
	        """
	        try:
	            return cls._umap_cache[name][vertical]
	        except KeyError:
	            pass
	        data = cls._load_data("to-unicode-%s" % name)
	        cls._umap_cache[name] = [PyUnicodeMap(name, data, v) for v in (False, True)]
	        return cls._umap_cache[name][vertical]


	class CMapParser(PSStackParser[PSKeyword]):
	    """Parser that feeds the operators of a CMap stream into a ``CMapBase``.

	    Operands accumulate on the parser stack; when a keyword arrives,
	    ``do_keyword`` consumes them and calls the matching method of
	    ``self.cmap``.  Malformed range entries are skipped with a warning that
	    is logged once per distinct message.
	    """

	    def __init__(self, cmap: CMapBase, fp: BinaryIO) -> None:
	        PSStackParser.__init__(self, fp)
	        self.cmap = cmap
	        # some ToUnicode maps don't have "begincmap" keyword.
	        self._in_cmap = True
	        # Messages already logged by _warn_once.
	        self._warnings: Set[str] = set()

	    def run(self) -> None:
	        """Parse the stream; reaching the end of input is not an error."""
	        try:
	            self.nextobject()
	        except PSEOF:
	            pass

	    KEYWORD_BEGINCMAP = KWD(b"begincmap")
	    KEYWORD_ENDCMAP = KWD(b"endcmap")
	    KEYWORD_USECMAP = KWD(b"usecmap")
	    KEYWORD_DEF = KWD(b"def")
	    KEYWORD_BEGINCODESPACERANGE = KWD(b"begincodespacerange")
	    KEYWORD_ENDCODESPACERANGE = KWD(b"endcodespacerange")
	    KEYWORD_BEGINCIDRANGE = KWD(b"begincidrange")
	    KEYWORD_ENDCIDRANGE = KWD(b"endcidrange")
	    KEYWORD_BEGINCIDCHAR = KWD(b"begincidchar")
	    KEYWORD_ENDCIDCHAR = KWD(b"endcidchar")
	    KEYWORD_BEGINBFRANGE = KWD(b"beginbfrange")
	    KEYWORD_ENDBFRANGE = KWD(b"endbfrange")
	    KEYWORD_BEGINBFCHAR = KWD(b"beginbfchar")
	    KEYWORD_ENDBFCHAR = KWD(b"endbfchar")
	    KEYWORD_BEGINNOTDEFRANGE = KWD(b"beginnotdefrange")
	    KEYWORD_ENDNOTDEFRANGE = KWD(b"endnotdefrange")

	    # Keywords whose only effect is to discard the operands collected so far.
	    _STACK_RESET_KEYWORDS = (
	        KEYWORD_BEGINCODESPACERANGE,
	        KEYWORD_ENDCODESPACERANGE,
	        KEYWORD_BEGINCIDRANGE,
	        KEYWORD_BEGINCIDCHAR,
	        KEYWORD_BEGINBFRANGE,
	        KEYWORD_BEGINBFCHAR,
	        KEYWORD_BEGINNOTDEFRANGE,
	        KEYWORD_ENDNOTDEFRANGE,
	    )

	    def do_keyword(self, pos: int, token: PSKeyword) -> None:
	        """ToUnicode CMaps

	        See Section 5.9.2 - ToUnicode CMaps of the PDF Reference.

	        Dispatch on *token*: ``begincmap``/``endcmap`` toggle whether
	        keywords are honoured at all, ``def`` and ``usecmap`` configure the
	        target map, the ``end*range``/``end*char`` keywords add mappings,
	        and any unrecognised keyword is pushed back on the stack as an
	        operand.
	        """
	        if token is self.KEYWORD_BEGINCMAP:
	            self._in_cmap = True
	            self.popall()
	            return

	        if token is self.KEYWORD_ENDCMAP:
	            self._in_cmap = False
	            return

	        if not self._in_cmap:
	            return

	        if token is self.KEYWORD_DEF:
	            # Best effort: a malformed definition is skipped silently.
	            try:
	                ((_, k), (_, v)) = self.pop(2)
	                self.cmap.set_attr(literal_name(k), v)
	            except PSSyntaxError:
	                pass
	            return

	        if token is self.KEYWORD_USECMAP:
	            # Best effort: an unknown or malformed base map is skipped silently.
	            try:
	                ((_, cmapname),) = self.pop(1)
	                self.cmap.use_cmap(CMapDB.get_cmap(literal_name(cmapname)))
	            except PSSyntaxError:
	                pass
	            except CMapDB.CMapNotFound:
	                pass
	            return

	        if token is self.KEYWORD_ENDCIDRANGE:
	            self._add_cid_ranges([obj for (__, obj) in self.popall()])
	            return

	        if token is self.KEYWORD_ENDCIDCHAR:
	            objs = [obj for (__, obj) in self.popall()]
	            for cid, code in choplist(2, objs):
	                if isinstance(code, bytes) and isinstance(cid, int):
	                    self.cmap.add_cid2unichr(cid, code)
	            return

	        if token is self.KEYWORD_ENDBFRANGE:
	            self._add_bf_ranges([obj for (__, obj) in self.popall()])
	            return

	        if token is self.KEYWORD_ENDBFCHAR:
	            objs = [obj for (__, obj) in self.popall()]
	            for cid, code in choplist(2, objs):
	                if isinstance(cid, bytes) and isinstance(code, bytes):
	                    self.cmap.add_cid2unichr(nunpack(cid), code)
	            return

	        # Identity comparison, matching the ``is`` tests used above.
	        if any(token is keyword for keyword in self._STACK_RESET_KEYWORDS):
	            self.popall()
	            return

	        self.push((pos, token))

	    def _add_cid_ranges(self, objs: List[Any]) -> None:
	        """Handle the operands of ``endcidrange`` (start, end, cid triples)."""
	        for start_byte, end_byte, cid in choplist(3, objs):
	            if not isinstance(start_byte, bytes):
	                self._warn_once("The start object of begincidrange is not a byte.")
	                continue
	            if not isinstance(end_byte, bytes):
	                self._warn_once("The end object of begincidrange is not a byte.")
	                continue
	            if not isinstance(cid, int):
	                self._warn_once("The cid object of begincidrange is not an integer.")
	                continue
	            if len(start_byte) != len(end_byte):
	                self._warn_once(
	                    "The start and end byte of begincidrange have "
	                    "different lengths.",
	                )
	                continue
	            # Only the last (up to) four bytes vary across the range; anything
	            # before them must be identical at both ends.
	            start_prefix = start_byte[:-4]
	            end_prefix = end_byte[:-4]
	            if start_prefix != end_prefix:
	                self._warn_once(
	                    "The prefix of the start and end byte of "
	                    "begincidrange are not the same.",
	                )
	                continue
	            svar = start_byte[-4:]
	            evar = end_byte[-4:]
	            start = nunpack(svar)
	            end = nunpack(evar)
	            vlen = len(svar)
	            for i in range(end - start + 1):
	                x = start_prefix + struct.pack(">L", start + i)[-vlen:]
	                self.cmap.add_cid2unichr(cid + i, x)

	    def _add_bf_ranges(self, objs: List[Any]) -> None:
	        """Handle the operands of ``endbfrange`` (start, end, destination triples).

	        The destination is either a list with one value per code in the
	        range, or a byte string whose trailing (up to) four bytes are
	        incremented once per code.
	        """
	        for start_byte, end_byte, code in choplist(3, objs):
	            if not isinstance(start_byte, bytes):
	                self._warn_once("The start object is not a byte.")
	                continue
	            if not isinstance(end_byte, bytes):
	                self._warn_once("The end object is not a byte.")
	                continue
	            if len(start_byte) != len(end_byte):
	                self._warn_once("The start and end byte have different lengths.")
	                continue
	            start = nunpack(start_byte)
	            end = nunpack(end_byte)
	            if isinstance(code, list):
	                if len(code) != end - start + 1:
	                    self._warn_once(
	                        "The difference between the start and end "
	                        "offsets does not match the code length.",
	                    )
	                # zip() stops at the shorter side, so a mismatch still maps
	                # whatever pairs exist.
	                for cid, unicode_value in zip(range(start, end + 1), code):
	                    self.cmap.add_cid2unichr(cid, unicode_value)
	            elif isinstance(code, bytes):
	                var = code[-4:]
	                base = nunpack(var)
	                prefix = code[:-4]
	                vlen = len(var)
	                if end >= start and base + (end - start) > 0xFFFFFFFF:
	                    # struct.pack(">L", ...) cannot represent the upper end
	                    # of this range; skip it instead of raising struct.error.
	                    self._warn_once(
	                        "The destination of a bfrange overflows 32 bits.",
	                    )
	                    continue
	                for i in range(end - start + 1):
	                    x = prefix + struct.pack(">L", base + i)[-vlen:]
	                    self.cmap.add_cid2unichr(start + i, x)
	            else:
	                # Previously an assert: malformed PDF data must not abort the
	                # parse (and asserts vanish under -O anyway).
	                self._warn_once(
	                    "The destination of a bfrange is neither a byte "
	                    "string nor an array.",
	                )
	                continue

	    def _warn_once(self, msg: str) -> None:
	        """Warn once for each unique message"""
	        if msg in self._warnings:
	            return
	        self._warnings.add(msg)
	        base_msg = (
	            "Ignoring (part of) ToUnicode map because the PDF data "
	            "does not conform to the format. This could result in "
	            "(cid) values in the output. "
	        )
	        log.warning(base_msg + msg)