Spaces:
Running
Running
"""Adobe character mapping (CMap) support. | |
CMaps map character codes to character ids (CIDs); the companion Unicode
maps translate those CIDs to Unicode code points.
More information is available on: | |
https://github.com/adobe-type-tools/cmap-resources | |
""" | |
import gzip | |
import logging | |
import os | |
import os.path | |
import pickle as pickle | |
import struct | |
import sys | |
from typing import ( | |
Any, | |
BinaryIO, | |
Dict, | |
Iterable, | |
Iterator, | |
List, | |
MutableMapping, | |
Optional, | |
Set, | |
TextIO, | |
Tuple, | |
Union, | |
cast, | |
) | |
from pdf2zh.encodingdb import name2unicode | |
from pdf2zh.pdfexceptions import PDFException, PDFTypeError | |
from pdf2zh.psexceptions import PSEOF, PSSyntaxError | |
from pdf2zh.psparser import KWD, PSKeyword, PSLiteral, PSStackParser, literal_name | |
from pdf2zh.utils import choplist, nunpack | |
# Module-level logger named after this module; CMapParser._warn_once emits
# its format warnings through it.
log = logging.getLogger(__name__)
class CMapError(PDFException):
    """Base exception for CMap failures (CMapDB.CMapNotFound derives from it)."""
class CMapBase:
    """Common base of all character maps.

    Holds a mutable attribute mapping (``attrs``) and declares the hooks a
    CMap parser calls. Every hook is a no-op here except ``decode``, which
    subclasses must implement.
    """

    debug = 0

    def __init__(self, **kwargs: object) -> None:
        # Work on a private copy so later set_attr() calls never touch the
        # caller's keyword dictionary.
        self.attrs: MutableMapping[str, object] = dict(kwargs)

    def is_vertical(self) -> bool:
        """Return True when the ``WMode`` attribute is set and not equal to 0."""
        wmode = self.attrs.get("WMode", 0)
        return wmode != 0

    def set_attr(self, k: str, v: object) -> None:
        """Store attribute ``k`` with value ``v``, replacing any previous value."""
        self.attrs[k] = v

    def add_code2cid(self, code: str, cid: int) -> None:
        """Hook for registering a code -> CID entry; ignored by default."""

    def add_cid2unichr(self, cid: int, code: Union[PSLiteral, bytes, int]) -> None:
        """Hook for registering a CID -> Unicode entry; ignored by default."""

    def use_cmap(self, cmap: "CMapBase") -> None:
        """Hook for inheriting entries from another map; ignored by default."""

    def decode(self, code: bytes) -> Iterable[int]:
        """Translate ``code`` into CIDs; must be provided by subclasses."""
        raise NotImplementedError
class CMap(CMapBase):
    """Code -> CID map stored as a trie of nested dictionaries.

    ``code2cid`` is keyed by byte value; a value is either an ``int`` (the
    CID for the code ending at that byte) or another dictionary holding the
    continuation of longer codes.
    """

    def __init__(self, **kwargs: Union[str, int]) -> None:
        CMapBase.__init__(self, **kwargs)
        self.code2cid: Dict[int, object] = {}

    def __repr__(self) -> str:
        cmap_name = self.attrs.get("CMapName")
        return "<CMap: %s>" % cmap_name

    def use_cmap(self, cmap: CMapBase) -> None:
        """Deep-copy every entry of ``cmap.code2cid`` into this map.

        Nested dictionaries are replaced by fresh dictionaries, so the two
        tries share no mutable state afterwards.
        """
        assert isinstance(cmap, CMap), str(type(cmap))

        def merge(target: Dict[int, object], source: Dict[int, object]) -> None:
            for key, value in source.items():
                if not isinstance(value, dict):
                    target[key] = value
                    continue
                # Create the sub-trie first, then fill it recursively.
                branch: Dict[int, object] = {}
                target[key] = branch
                merge(branch, value)

        merge(self.code2cid, cmap.code2cid)

    def decode(self, code: bytes) -> Iterator[int]:
        """Yield the CID of every code found while walking ``code`` bytewise.

        A byte with no entry at the current trie node is dropped and the
        walk restarts from the root.
        """
        node = self.code2cid
        for byte in code:
            if byte not in node:
                # Unknown continuation: abandon the partial code.
                node = self.code2cid
                continue
            entry = node[byte]
            if isinstance(entry, int):
                yield entry
                node = self.code2cid
            else:
                node = cast(Dict[int, object], entry)

    def dump(
        self,
        out: TextIO = sys.stdout,
        code2cid: Optional[Dict[int, object]] = None,
        code: Tuple[int, ...] = (),
    ) -> None:
        """Write one ``code (...) = cid N`` line per entry, in key order.

        ``code2cid`` and ``code`` carry the recursion state (current sub-trie
        and the byte prefix leading to it); callers normally omit both.
        """
        if code2cid is None:
            code2cid, code = self.code2cid, ()
        for key in sorted(code2cid):
            value = code2cid[key]
            path = code + (key,)
            if isinstance(value, int):
                out.write("code %r = cid %d\n" % (path, value))
                continue
            self.dump(out=out, code2cid=cast(Dict[int, object], value), code=path)
class IdentityCMap(CMapBase):
    """CMap whose CIDs are the big-endian 16-bit values of the input itself."""

    def decode(self, code: bytes) -> Tuple[int, ...]:
        """Split ``code`` into unsigned big-endian 16-bit integers.

        Returns an empty tuple when ``code`` holds fewer than two bytes. A
        trailing odd byte is ignored: ``struct.unpack`` requires the buffer
        length to match the format exactly, so passing an odd-length buffer
        unmodified would raise ``struct.error``.
        """
        n = len(code) // 2
        if not n:
            return ()
        # Only hand the first 2*n bytes to struct so odd-length input decodes
        # its complete pairs instead of raising.
        return struct.unpack(">%dH" % n, code[: 2 * n])
class IdentityCMapByte(IdentityCMap):
    """Identity map over single bytes: each input byte is its own CID."""

    def decode(self, code: bytes) -> Tuple[int, ...]:
        """Return every byte of ``code`` as an unsigned integer, in order.

        Empty input yields an empty tuple.
        """
        size = len(code)
        return struct.unpack(">%dB" % size, code) if size else ()
class UnicodeMap(CMapBase):
    """CID -> Unicode string map backed by the ``cid2unichr`` mapping."""

    def __init__(self, **kwargs: Union[str, int]) -> None:
        CMapBase.__init__(self, **kwargs)
        self.cid2unichr: Dict[int, str] = {}

    def __repr__(self) -> str:
        cmap_name = self.attrs.get("CMapName")
        return "<UnicodeMap: %s>" % cmap_name

    def get_unichr(self, cid: int) -> str:
        """Return the string stored for ``cid``.

        The lookup is a plain subscript, so a missing ``cid`` propagates the
        mapping's own error (``KeyError`` for a dict).
        """
        return self.cid2unichr[cid]

    def dump(self, out: TextIO = sys.stdout) -> None:
        """Write one ``cid N = unicode '...'`` line per entry, ordered by CID."""
        for cid in sorted(self.cid2unichr):
            out.write("cid %d = unicode %r\n" % (cid, self.cid2unichr[cid]))
class IdentityUnicodeMap(UnicodeMap):
    """Unicode map that treats every CID as a Unicode code point."""

    def get_unichr(self, cid: int) -> str:
        """Return the one-character string whose code point equals ``cid``."""
        character = chr(cid)
        return character
class FileCMap(CMap):
    """CMap populated incrementally through ``add_code2cid``."""

    def add_code2cid(self, code: str, cid: int) -> None:
        """Insert ``code`` -> ``cid`` into the ``code2cid`` trie.

        Each character of ``code`` contributes its ordinal as one trie key;
        intermediate nodes are created on demand and the final character's
        slot receives ``cid``.
        """
        assert isinstance(code, str) and isinstance(cid, int), str(
            (type(code), type(cid)),
        )
        node = self.code2cid
        # Walk (and extend where needed) the trie along all but the last char.
        for key in map(ord, code[:-1]):
            if key not in node:
                node[key] = {}
            node = cast(Dict[int, object], node[key])
        node[ord(code[-1])] = cid
class FileUnicodeMap(UnicodeMap):
    """Unicode map populated incrementally through ``add_cid2unichr``."""

    def add_cid2unichr(self, cid: int, code: Union[PSLiteral, bytes, int]) -> None:
        """Record the Unicode text for ``cid``.

        ``code`` is converted according to its type:

        * ``PSLiteral`` - its name is resolved with ``name2unicode``
          (an Adobe glyph name);
        * ``bytes`` - decoded as UTF-16BE, ignoring undecodable data;
        * ``int`` - taken as a code point and passed to ``chr``.

        Any other type raises ``PDFTypeError``.
        """
        assert isinstance(cid, int), str(type(cid))

        if isinstance(code, PSLiteral):
            assert isinstance(code.name, str)
            text = name2unicode(code.name)
        elif isinstance(code, bytes):
            text = code.decode("UTF-16BE", "ignore")
        elif isinstance(code, int):
            text = chr(code)
        else:
            raise PDFTypeError(code)

        # A0 = non-breaking space; some odd fonts map it onto a cid that
        # already holds a regular space. Keep the regular space in that case.
        nbsp_over_space = text == "\u00a0" and self.cid2unichr.get(cid) == " "
        if not nbsp_over_space:
            self.cid2unichr[cid] = text
class PyCMap(CMap):
    """CMap whose trie comes from a preloaded data object.

    ``module`` must expose ``CODE2CID`` (used directly as ``code2cid``) and
    ``IS_VERTICAL`` (when truthy, ``WMode`` is set to 1).
    """

    def __init__(self, name: str, module: Any) -> None:
        super().__init__(CMapName=name)
        # The table is referenced, not copied.
        self.code2cid = module.CODE2CID
        vertical = module.IS_VERTICAL
        if vertical:
            self.attrs["WMode"] = 1
class PyUnicodeMap(UnicodeMap):
    """Unicode map whose table comes from a preloaded data object.

    ``module`` must expose ``CID2UNICHR_H`` and ``CID2UNICHR_V``; the
    ``vertical`` flag selects which one becomes ``cid2unichr``.
    """

    def __init__(self, name: str, module: Any, vertical: bool) -> None:
        super().__init__(CMapName=name)
        if not vertical:
            self.cid2unichr = module.CID2UNICHR_H
            return
        # Vertical variant: use the vertical table and record the mode.
        self.cid2unichr = module.CID2UNICHR_V
        self.attrs["WMode"] = 1
class CMapDB:
    """Loader and process-wide cache for predefined CMaps and Unicode maps.

    All entry points are class methods; the class is never instantiated by
    the code in this module (``CMapDB.get_cmap(...)`` is called directly on
    the class).
    """

    # Loaded CMaps keyed by name.
    _cmap_cache: Dict[str, PyCMap] = {}
    # Loaded Unicode maps keyed by name; index 0 is horizontal, 1 vertical.
    _umap_cache: Dict[str, List[PyUnicodeMap]] = {}

    class CMapNotFound(CMapError):
        """Raised when no data file exists for the requested name."""

    @classmethod
    def _load_data(cls, name: str) -> Any:
        """Load ``<name>.pickle.gz`` and return its contents as a class object.

        The file is searched for in the directory given by the ``CMAP_PATH``
        environment variable (default ``/usr/share/pdf2zh/``) and then in the
        ``cmap`` directory next to this module. The unpickled dictionary
        becomes the namespace of a new type named ``name``.

        Raises:
            CMapDB.CMapNotFound: if neither directory contains the file.
        """
        name = name.replace("\0", "")
        filename = "%s.pickle.gz" % name
        cmap_paths = (
            os.environ.get("CMAP_PATH", "/usr/share/pdf2zh/"),
            os.path.join(os.path.dirname(__file__), "cmap"),
        )
        for directory in cmap_paths:
            path = os.path.join(directory, filename)
            if os.path.exists(path):
                # NOTE(review): ``name`` can originate from a PDF (see the
                # usecmap handling in CMapParser) and is joined into the path
                # unsanitised, then the file is unpickled. pickle must only
                # ever see trusted files - confirm callers restrict ``name``.
                with gzip.open(path) as gzfile:
                    return type(str(name), (), pickle.loads(gzfile.read()))
        raise CMapDB.CMapNotFound(name)

    @classmethod
    def get_cmap(cls, name: str) -> CMapBase:
        """Return the CMap called ``name``.

        The four identity names are answered with fresh identity maps; every
        other name is loaded once via ``_load_data`` and cached.

        Raises:
            CMapDB.CMapNotFound: propagated from ``_load_data``.
        """
        if name == "Identity-H":
            return IdentityCMap(WMode=0)
        if name == "Identity-V":
            return IdentityCMap(WMode=1)
        if name == "OneByteIdentityH":
            return IdentityCMapByte(WMode=0)
        if name == "OneByteIdentityV":
            return IdentityCMapByte(WMode=1)
        try:
            return cls._cmap_cache[name]
        except KeyError:
            pass
        data = cls._load_data(name)
        cls._cmap_cache[name] = cmap = PyCMap(name, data)
        return cmap

    @classmethod
    def get_unicode_map(cls, name: str, vertical: bool = False) -> UnicodeMap:
        """Return the Unicode map for ``name`` in the requested writing mode.

        On a cache miss the data file ``to-unicode-<name>`` is loaded and both
        the horizontal and the vertical map are built and cached together.

        Raises:
            CMapDB.CMapNotFound: propagated from ``_load_data``.
        """
        try:
            return cls._umap_cache[name][vertical]
        except KeyError:
            pass
        data = cls._load_data("to-unicode-%s" % name)
        # List order matters: it is indexed by the ``vertical`` boolean.
        cls._umap_cache[name] = [PyUnicodeMap(name, data, v) for v in (False, True)]
        return cls._umap_cache[name][vertical]
class CMapParser(PSStackParser[PSKeyword]):
    """Stack parser that turns CMap operators into calls on a ``CMapBase``.

    Operands accumulate on the parser stack; when an ``end*`` operator is
    seen the collected operands are validated and forwarded to the target
    map. Malformed entries are skipped with a one-time warning instead of
    aborting the parse.
    """

    def __init__(self, cmap: CMapBase, fp: BinaryIO) -> None:
        PSStackParser.__init__(self, fp)
        self.cmap = cmap
        # Some ToUnicode maps don't have a "begincmap" keyword, so behave as
        # if we were already inside a cmap section from the start.
        self._in_cmap = True
        # Messages that _warn_once has already logged.
        self._warnings: Set[str] = set()

    def run(self) -> None:
        """Call ``nextobject()``, treating ``PSEOF`` as normal termination."""
        try:
            self.nextobject()
        except PSEOF:
            pass

    KEYWORD_BEGINCMAP = KWD(b"begincmap")
    KEYWORD_ENDCMAP = KWD(b"endcmap")
    KEYWORD_USECMAP = KWD(b"usecmap")
    KEYWORD_DEF = KWD(b"def")
    KEYWORD_BEGINCODESPACERANGE = KWD(b"begincodespacerange")
    KEYWORD_ENDCODESPACERANGE = KWD(b"endcodespacerange")
    KEYWORD_BEGINCIDRANGE = KWD(b"begincidrange")
    KEYWORD_ENDCIDRANGE = KWD(b"endcidrange")
    KEYWORD_BEGINCIDCHAR = KWD(b"begincidchar")
    KEYWORD_ENDCIDCHAR = KWD(b"endcidchar")
    KEYWORD_BEGINBFRANGE = KWD(b"beginbfrange")
    KEYWORD_ENDBFRANGE = KWD(b"endbfrange")
    KEYWORD_BEGINBFCHAR = KWD(b"beginbfchar")
    KEYWORD_ENDBFCHAR = KWD(b"endbfchar")
    KEYWORD_BEGINNOTDEFRANGE = KWD(b"beginnotdefrange")
    KEYWORD_ENDNOTDEFRANGE = KWD(b"endnotdefrange")

    def do_keyword(self, pos: int, token: PSKeyword) -> None:
        """Dispatch one keyword token.

        ToUnicode CMaps: see Section 5.9.2 - ToUnicode CMaps of the PDF
        Reference.

        ``begincmap``/``endcmap`` toggle whether subsequent keywords are
        honoured. Inside a cmap, ``def`` and ``usecmap`` update the target
        map, the ``end*range``/``end*char`` operators consume the operands
        collected since the matching ``begin*`` operator, and any keyword not
        handled here is pushed onto the stack as ``(pos, token)``.
        """
        if token is self.KEYWORD_BEGINCMAP:
            self._in_cmap = True
            self.popall()
            return
        if token is self.KEYWORD_ENDCMAP:
            self._in_cmap = False
            return
        if not self._in_cmap:
            return

        if token is self.KEYWORD_DEF:
            try:
                ((_, k), (_, v)) = self.pop(2)
                self.cmap.set_attr(literal_name(k), v)
            except PSSyntaxError:
                # Deliberate best effort: ignore definitions whose key is not
                # a literal name.
                pass
            return

        if token is self.KEYWORD_USECMAP:
            try:
                ((_, cmapname),) = self.pop(1)
                self.cmap.use_cmap(CMapDB.get_cmap(literal_name(cmapname)))
            except (PSSyntaxError, CMapDB.CMapNotFound):
                # Deliberate best effort: an unusable or unknown base cmap is
                # skipped.
                pass
            return

        # Operators whose only effect is to discard whatever is on the stack:
        # every begin* marker, plus the codespace/notdef ranges, which are not
        # interpreted.
        if (
            token is self.KEYWORD_BEGINCODESPACERANGE
            or token is self.KEYWORD_ENDCODESPACERANGE
            or token is self.KEYWORD_BEGINCIDRANGE
            or token is self.KEYWORD_BEGINCIDCHAR
            or token is self.KEYWORD_BEGINBFRANGE
            or token is self.KEYWORD_BEGINBFCHAR
            or token is self.KEYWORD_BEGINNOTDEFRANGE
            or token is self.KEYWORD_ENDNOTDEFRANGE
        ):
            self.popall()
            return

        if token is self.KEYWORD_ENDCIDRANGE:
            self._add_cid_ranges([obj for (__, obj) in self.popall()])
            return
        if token is self.KEYWORD_ENDCIDCHAR:
            self._add_cid_chars([obj for (__, obj) in self.popall()])
            return
        if token is self.KEYWORD_ENDBFRANGE:
            self._add_bf_ranges([obj for (__, obj) in self.popall()])
            return
        if token is self.KEYWORD_ENDBFCHAR:
            self._add_bf_chars([obj for (__, obj) in self.popall()])
            return

        self.push((pos, token))

    def _add_cid_ranges(self, objs: List[Any]) -> None:
        """Handle ``endcidrange``: operands are (start, end, cid) triples."""
        for start_byte, end_byte, cid in choplist(3, objs):
            if not isinstance(start_byte, bytes):
                self._warn_once("The start object of begincidrange is not a byte.")
                continue
            if not isinstance(end_byte, bytes):
                self._warn_once("The end object of begincidrange is not a byte.")
                continue
            if not isinstance(cid, int):
                self._warn_once("The cid object of begincidrange is not an integer.")
                continue
            if len(start_byte) != len(end_byte):
                self._warn_once(
                    "The start and end byte of begincidrange have "
                    "different lengths.",
                )
                continue
            start_prefix = start_byte[:-4]
            end_prefix = end_byte[:-4]
            if start_prefix != end_prefix:
                self._warn_once(
                    "The prefix of the start and end byte of "
                    "begincidrange are not the same.",
                )
                continue
            # Only the last (up to) four bytes vary across the range.
            svar = start_byte[-4:]
            evar = end_byte[-4:]
            start = nunpack(svar)
            end = nunpack(evar)
            vlen = len(svar)
            for i in range(end - start + 1):
                # Re-encode the counter and keep as many low-order bytes as
                # the original variable part had.
                x = start_prefix + struct.pack(">L", start + i)[-vlen:]
                self.cmap.add_cid2unichr(cid + i, x)

    def _add_cid_chars(self, objs: List[Any]) -> None:
        """Handle ``endcidchar``: operands are pairs; ill-typed pairs are skipped."""
        for cid, code in choplist(2, objs):
            if isinstance(code, bytes) and isinstance(cid, int):
                self.cmap.add_cid2unichr(cid, code)

    def _add_bf_ranges(self, objs: List[Any]) -> None:
        """Handle ``endbfrange``: operands are (start, end, destination) triples.

        The destination is either a list with one value per code in the
        range, or a byte string whose last (up to) four bytes are incremented
        across the range.
        """
        for start_byte, end_byte, code in choplist(3, objs):
            if not isinstance(start_byte, bytes):
                self._warn_once("The start object is not a byte.")
                continue
            if not isinstance(end_byte, bytes):
                self._warn_once("The end object is not a byte.")
                continue
            if len(start_byte) != len(end_byte):
                self._warn_once("The start and end byte have different lengths.")
                continue
            start = nunpack(start_byte)
            end = nunpack(end_byte)
            if isinstance(code, list):
                if len(code) != end - start + 1:
                    self._warn_once(
                        "The difference between the start and end "
                        "offsets does not match the code length.",
                    )
                # zip() stops at the shorter side, so a mismatched list still
                # contributes the entries it does have.
                for cid, unicode_value in zip(range(start, end + 1), code):
                    self.cmap.add_cid2unichr(cid, unicode_value)
            elif isinstance(code, bytes):
                var = code[-4:]
                base = nunpack(var)
                prefix = code[:-4]
                vlen = len(var)
                for i in range(end - start + 1):
                    x = prefix + struct.pack(">L", base + i)[-vlen:]
                    self.cmap.add_cid2unichr(start + i, x)
            else:
                # Malformed destination: skip the entry like every other
                # format error here instead of aborting via an assertion.
                self._warn_once("The code object is neither a list nor a byte.")

    def _add_bf_chars(self, objs: List[Any]) -> None:
        """Handle ``endbfchar``: operands are (source, destination) byte pairs."""
        for cid, code in choplist(2, objs):
            if isinstance(cid, bytes) and isinstance(code, bytes):
                self.cmap.add_cid2unichr(nunpack(cid), code)

    def _warn_once(self, msg: str) -> None:
        """Warn once for each unique message"""
        if msg in self._warnings:
            return
        self._warnings.add(msg)
        base_msg = (
            "Ignoring (part of) ToUnicode map because the PDF data "
            "does not conform to the format. This could result in "
            "(cid) values in the output. "
        )
        log.warning(base_msg + msg)