"""Adobe character mapping (CMap) support. CMaps provide the mapping between character codes and Unicode code-points to character ids (CIDs). More information is available on: https://github.com/adobe-type-tools/cmap-resources """ import gzip import logging import os import os.path import pickle as pickle import struct import sys from typing import ( Any, BinaryIO, Dict, Iterable, Iterator, List, MutableMapping, Optional, Set, TextIO, Tuple, Union, cast, ) from pdf2zh.encodingdb import name2unicode from pdf2zh.pdfexceptions import PDFException, PDFTypeError from pdf2zh.psexceptions import PSEOF, PSSyntaxError from pdf2zh.psparser import KWD, PSKeyword, PSLiteral, PSStackParser, literal_name from pdf2zh.utils import choplist, nunpack log = logging.getLogger(__name__) class CMapError(PDFException): pass class CMapBase: debug = 0 def __init__(self, **kwargs: object) -> None: self.attrs: MutableMapping[str, object] = kwargs.copy() def is_vertical(self) -> bool: return self.attrs.get("WMode", 0) != 0 def set_attr(self, k: str, v: object) -> None: self.attrs[k] = v def add_code2cid(self, code: str, cid: int) -> None: pass def add_cid2unichr(self, cid: int, code: Union[PSLiteral, bytes, int]) -> None: pass def use_cmap(self, cmap: "CMapBase") -> None: pass def decode(self, code: bytes) -> Iterable[int]: raise NotImplementedError class CMap(CMapBase): def __init__(self, **kwargs: Union[str, int]) -> None: CMapBase.__init__(self, **kwargs) self.code2cid: Dict[int, object] = {} def __repr__(self) -> str: return "" % self.attrs.get("CMapName") def use_cmap(self, cmap: CMapBase) -> None: assert isinstance(cmap, CMap), str(type(cmap)) def copy(dst: Dict[int, object], src: Dict[int, object]) -> None: for k, v in src.items(): if isinstance(v, dict): d: Dict[int, object] = {} dst[k] = d copy(d, v) else: dst[k] = v copy(self.code2cid, cmap.code2cid) def decode(self, code: bytes) -> Iterator[int]: # log.debug("decode: %r, %r", self, code) d = self.code2cid for i in iter(code): if i in d: x = d[i] if isinstance(x, int): yield x d = self.code2cid else: d = cast(Dict[int, object], x) else: d = self.code2cid def dump( self, out: TextIO = sys.stdout, code2cid: Optional[Dict[int, object]] = None, code: Tuple[int, ...] = (), ) -> None: if code2cid is None: code2cid = self.code2cid code = () for k, v in sorted(code2cid.items()): c = code + (k,) if isinstance(v, int): out.write("code %r = cid %d\n" % (c, v)) else: self.dump(out=out, code2cid=cast(Dict[int, object], v), code=c) class IdentityCMap(CMapBase): def decode(self, code: bytes) -> Tuple[int, ...]: n = len(code) // 2 if n: return struct.unpack(">%dH" % n, code) else: return () class IdentityCMapByte(IdentityCMap): def decode(self, code: bytes) -> Tuple[int, ...]: n = len(code) if n: return struct.unpack(">%dB" % n, code) else: return () class UnicodeMap(CMapBase): def __init__(self, **kwargs: Union[str, int]) -> None: CMapBase.__init__(self, **kwargs) self.cid2unichr: Dict[int, str] = {} def __repr__(self) -> str: return "" % self.attrs.get("CMapName") def get_unichr(self, cid: int) -> str: # log.debug("get_unichr: %r, %r", self, cid) return self.cid2unichr[cid] def dump(self, out: TextIO = sys.stdout) -> None: for k, v in sorted(self.cid2unichr.items()): out.write("cid %d = unicode %r\n" % (k, v)) class IdentityUnicodeMap(UnicodeMap): def get_unichr(self, cid: int) -> str: """Interpret character id as unicode codepoint""" # log.debug("get_unichr: %r, %r", self, cid) return chr(cid) class FileCMap(CMap): def add_code2cid(self, code: str, cid: int) -> None: assert isinstance(code, str) and isinstance(cid, int), str( (type(code), type(cid)), ) d = self.code2cid for c in code[:-1]: ci = ord(c) if ci in d: d = cast(Dict[int, object], d[ci]) else: t: Dict[int, object] = {} d[ci] = t d = t ci = ord(code[-1]) d[ci] = cid class FileUnicodeMap(UnicodeMap): def add_cid2unichr(self, cid: int, code: Union[PSLiteral, bytes, int]) -> None: assert isinstance(cid, int), str(type(cid)) if isinstance(code, PSLiteral): # Interpret as an Adobe glyph name. assert isinstance(code.name, str) unichr = name2unicode(code.name) elif isinstance(code, bytes): # Interpret as UTF-16BE. unichr = code.decode("UTF-16BE", "ignore") elif isinstance(code, int): unichr = chr(code) else: raise PDFTypeError(code) # A0 = non-breaking space, some weird fonts can have a collision on a cid here. if unichr == "\u00a0" and self.cid2unichr.get(cid) == " ": return self.cid2unichr[cid] = unichr class PyCMap(CMap): def __init__(self, name: str, module: Any) -> None: super().__init__(CMapName=name) self.code2cid = module.CODE2CID if module.IS_VERTICAL: self.attrs["WMode"] = 1 class PyUnicodeMap(UnicodeMap): def __init__(self, name: str, module: Any, vertical: bool) -> None: super().__init__(CMapName=name) if vertical: self.cid2unichr = module.CID2UNICHR_V self.attrs["WMode"] = 1 else: self.cid2unichr = module.CID2UNICHR_H class CMapDB: _cmap_cache: Dict[str, PyCMap] = {} _umap_cache: Dict[str, List[PyUnicodeMap]] = {} class CMapNotFound(CMapError): pass @classmethod def _load_data(cls, name: str) -> Any: name = name.replace("\0", "") filename = "%s.pickle.gz" % name # log.debug("loading: %r", name) cmap_paths = ( os.environ.get("CMAP_PATH", "/usr/share/pdf2zh/"), os.path.join(os.path.dirname(__file__), "cmap"), ) for directory in cmap_paths: path = os.path.join(directory, filename) if os.path.exists(path): gzfile = gzip.open(path) try: return type(str(name), (), pickle.loads(gzfile.read())) finally: gzfile.close() raise CMapDB.CMapNotFound(name) @classmethod def get_cmap(cls, name: str) -> CMapBase: if name == "Identity-H": return IdentityCMap(WMode=0) elif name == "Identity-V": return IdentityCMap(WMode=1) elif name == "OneByteIdentityH": return IdentityCMapByte(WMode=0) elif name == "OneByteIdentityV": return IdentityCMapByte(WMode=1) try: return cls._cmap_cache[name] except KeyError: pass data = cls._load_data(name) cls._cmap_cache[name] = cmap = PyCMap(name, data) return cmap @classmethod def get_unicode_map(cls, name: str, vertical: bool = False) -> UnicodeMap: try: return cls._umap_cache[name][vertical] except KeyError: pass data = cls._load_data("to-unicode-%s" % name) cls._umap_cache[name] = [PyUnicodeMap(name, data, v) for v in (False, True)] return cls._umap_cache[name][vertical] class CMapParser(PSStackParser[PSKeyword]): def __init__(self, cmap: CMapBase, fp: BinaryIO) -> None: PSStackParser.__init__(self, fp) self.cmap = cmap # some ToUnicode maps don't have "begincmap" keyword. self._in_cmap = True self._warnings: Set[str] = set() def run(self) -> None: try: self.nextobject() except PSEOF: pass KEYWORD_BEGINCMAP = KWD(b"begincmap") KEYWORD_ENDCMAP = KWD(b"endcmap") KEYWORD_USECMAP = KWD(b"usecmap") KEYWORD_DEF = KWD(b"def") KEYWORD_BEGINCODESPACERANGE = KWD(b"begincodespacerange") KEYWORD_ENDCODESPACERANGE = KWD(b"endcodespacerange") KEYWORD_BEGINCIDRANGE = KWD(b"begincidrange") KEYWORD_ENDCIDRANGE = KWD(b"endcidrange") KEYWORD_BEGINCIDCHAR = KWD(b"begincidchar") KEYWORD_ENDCIDCHAR = KWD(b"endcidchar") KEYWORD_BEGINBFRANGE = KWD(b"beginbfrange") KEYWORD_ENDBFRANGE = KWD(b"endbfrange") KEYWORD_BEGINBFCHAR = KWD(b"beginbfchar") KEYWORD_ENDBFCHAR = KWD(b"endbfchar") KEYWORD_BEGINNOTDEFRANGE = KWD(b"beginnotdefrange") KEYWORD_ENDNOTDEFRANGE = KWD(b"endnotdefrange") def do_keyword(self, pos: int, token: PSKeyword) -> None: """ToUnicode CMaps See Section 5.9.2 - ToUnicode CMaps of the PDF Reference. """ if token is self.KEYWORD_BEGINCMAP: self._in_cmap = True self.popall() return elif token is self.KEYWORD_ENDCMAP: self._in_cmap = False return if not self._in_cmap: return if token is self.KEYWORD_DEF: try: ((_, k), (_, v)) = self.pop(2) self.cmap.set_attr(literal_name(k), v) except PSSyntaxError: pass return if token is self.KEYWORD_USECMAP: try: ((_, cmapname),) = self.pop(1) self.cmap.use_cmap(CMapDB.get_cmap(literal_name(cmapname))) except PSSyntaxError: pass except CMapDB.CMapNotFound: pass return if token is self.KEYWORD_BEGINCODESPACERANGE: self.popall() return if token is self.KEYWORD_ENDCODESPACERANGE: self.popall() return if token is self.KEYWORD_BEGINCIDRANGE: self.popall() return if token is self.KEYWORD_ENDCIDRANGE: objs = [obj for (__, obj) in self.popall()] for start_byte, end_byte, cid in choplist(3, objs): if not isinstance(start_byte, bytes): self._warn_once("The start object of begincidrange is not a byte.") continue if not isinstance(end_byte, bytes): self._warn_once("The end object of begincidrange is not a byte.") continue if not isinstance(cid, int): self._warn_once("The cid object of begincidrange is not a byte.") continue if len(start_byte) != len(end_byte): self._warn_once( "The start and end byte of begincidrange have " "different lengths.", ) continue start_prefix = start_byte[:-4] end_prefix = end_byte[:-4] if start_prefix != end_prefix: self._warn_once( "The prefix of the start and end byte of " "begincidrange are not the same.", ) continue svar = start_byte[-4:] evar = end_byte[-4:] start = nunpack(svar) end = nunpack(evar) vlen = len(svar) for i in range(end - start + 1): x = start_prefix + struct.pack(">L", start + i)[-vlen:] self.cmap.add_cid2unichr(cid + i, x) return if token is self.KEYWORD_BEGINCIDCHAR: self.popall() return if token is self.KEYWORD_ENDCIDCHAR: objs = [obj for (__, obj) in self.popall()] for cid, code in choplist(2, objs): if isinstance(code, bytes) and isinstance(cid, int): self.cmap.add_cid2unichr(cid, code) return if token is self.KEYWORD_BEGINBFRANGE: self.popall() return if token is self.KEYWORD_ENDBFRANGE: objs = [obj for (__, obj) in self.popall()] for start_byte, end_byte, code in choplist(3, objs): if not isinstance(start_byte, bytes): self._warn_once("The start object is not a byte.") continue if not isinstance(end_byte, bytes): self._warn_once("The end object is not a byte.") continue if len(start_byte) != len(end_byte): self._warn_once("The start and end byte have different lengths.") continue start = nunpack(start_byte) end = nunpack(end_byte) if isinstance(code, list): if len(code) != end - start + 1: self._warn_once( "The difference between the start and end " "offsets does not match the code length.", ) for cid, unicode_value in zip(range(start, end + 1), code): self.cmap.add_cid2unichr(cid, unicode_value) else: assert isinstance(code, bytes) var = code[-4:] base = nunpack(var) prefix = code[:-4] vlen = len(var) for i in range(end - start + 1): x = prefix + struct.pack(">L", base + i)[-vlen:] self.cmap.add_cid2unichr(start + i, x) return if token is self.KEYWORD_BEGINBFCHAR: self.popall() return if token is self.KEYWORD_ENDBFCHAR: objs = [obj for (__, obj) in self.popall()] for cid, code in choplist(2, objs): if isinstance(cid, bytes) and isinstance(code, bytes): self.cmap.add_cid2unichr(nunpack(cid), code) return if token is self.KEYWORD_BEGINNOTDEFRANGE: self.popall() return if token is self.KEYWORD_ENDNOTDEFRANGE: self.popall() return self.push((pos, token)) def _warn_once(self, msg: str) -> None: """Warn once for each unique message""" if msg not in self._warnings: self._warnings.add(msg) base_msg = ( "Ignoring (part of) ToUnicode map because the PDF data " "does not conform to the format. This could result in " "(cid) values in the output. " ) log.warning(base_msg + msg)