# PDFTranslate / pdf2zh / cmapdb.py
# sanbo
# update sth. at 2024-11-26 16:15:47
# 9b0f4a0
"""Adobe character mapping (CMap) support.
CMaps provide the mapping between character codes and Unicode
code-points to character ids (CIDs).
More information is available on:
https://github.com/adobe-type-tools/cmap-resources
"""
import gzip
import logging
import os
import os.path
import pickle as pickle
import struct
import sys
from typing import (
Any,
BinaryIO,
Dict,
Iterable,
Iterator,
List,
MutableMapping,
Optional,
Set,
TextIO,
Tuple,
Union,
cast,
)
from pdf2zh.encodingdb import name2unicode
from pdf2zh.pdfexceptions import PDFException, PDFTypeError
from pdf2zh.psexceptions import PSEOF, PSSyntaxError
from pdf2zh.psparser import KWD, PSKeyword, PSLiteral, PSStackParser, literal_name
from pdf2zh.utils import choplist, nunpack
log = logging.getLogger(__name__)
class CMapError(PDFException):
    """Base class for the CMap-related exceptions defined in this module."""
class CMapBase:
    """Common interface shared by every CMap / Unicode-map flavour.

    The mutating hooks (``add_code2cid``, ``add_cid2unichr``, ``use_cmap``)
    are no-ops here and ``decode`` is abstract; subclasses override whichever
    ones they support.
    """

    debug = 0

    def __init__(self, **kwargs: object) -> None:
        """Store the keyword arguments as the initial attribute mapping."""
        # Shallow copy so later set_attr() calls work on our own mapping.
        self.attrs: MutableMapping[str, object] = dict(kwargs)

    def is_vertical(self) -> bool:
        """Return True when the ``WMode`` attribute is set and not 0."""
        wmode = self.attrs.get("WMode", 0)
        return wmode != 0

    def set_attr(self, k: str, v: object) -> None:
        """Record attribute *k* with value *v*."""
        self.attrs[k] = v

    def add_code2cid(self, code: str, cid: int) -> None:
        """Hook for registering a code -> cid entry; does nothing here."""

    def add_cid2unichr(self, cid: int, code: Union[PSLiteral, bytes, int]) -> None:
        """Hook for registering a cid -> unicode entry; does nothing here."""

    def use_cmap(self, cmap: "CMapBase") -> None:
        """Hook for importing another CMap's entries; does nothing here."""

    def decode(self, code: bytes) -> Iterable[int]:
        """Translate *code* into cids; must be provided by subclasses."""
        raise NotImplementedError
class CMap(CMapBase):
    """CMap backed by a nested-dict trie keyed on successive code bytes.

    ``code2cid`` maps a byte value either to an ``int`` cid (leaf) or to
    another dict holding the continuation of a multi-byte code.
    """

    def __init__(self, **kwargs: Union[str, int]) -> None:
        CMapBase.__init__(self, **kwargs)
        self.code2cid: Dict[int, object] = {}

    def __repr__(self) -> str:
        return "<CMap: %s>" % self.attrs.get("CMapName")

    def use_cmap(self, cmap: CMapBase) -> None:
        """Copy every entry of *cmap*'s trie into this one.

        Nested dicts are duplicated rather than shared, so later edits to
        either map do not leak into the other.
        """
        assert isinstance(cmap, CMap), str(type(cmap))
        # Explicit work list instead of recursion: (destination, source) pairs.
        pending = [(self.code2cid, cmap.code2cid)]
        while pending:
            target, source = pending.pop()
            for key, value in source.items():
                if isinstance(value, dict):
                    fresh: Dict[int, object] = {}
                    target[key] = fresh
                    pending.append((fresh, value))
                else:
                    target[key] = value

    def decode(self, code: bytes) -> Iterator[int]:
        """Yield the cid for each complete code found in *code*.

        Bytes that do not continue a known code reset the walk to the
        root of the trie and produce nothing.
        """
        node = self.code2cid
        for byte in code:
            if byte not in node:
                # Unknown byte: abandon the partial code.
                node = self.code2cid
                continue
            found = node[byte]
            if isinstance(found, int):
                yield found
                node = self.code2cid
            else:
                node = cast(Dict[int, object], found)

    def dump(
        self,
        out: TextIO = sys.stdout,
        code2cid: Optional[Dict[int, object]] = None,
        code: Tuple[int, ...] = (),
    ) -> None:
        """Write one ``code ... = cid ...`` line per leaf, in key order.

        Called without *code2cid* it starts from the root of this map's
        trie; the extra parameters carry state for the recursive descent.
        """
        if code2cid is None:
            code2cid, code = self.code2cid, ()
        for key, value in sorted(code2cid.items()):
            path = code + (key,)
            if not isinstance(value, int):
                self.dump(out=out, code2cid=cast(Dict[int, object], value), code=path)
                continue
            out.write("code %r = cid %d\n" % (path, value))
class IdentityCMap(CMapBase):
    """CMap whose cids are the big-endian 16-bit values of the input itself."""

    def decode(self, code: bytes) -> Tuple[int, ...]:
        """Split *code* into unsigned big-endian 16-bit integers.

        Returns an empty tuple when *code* holds fewer than two bytes.
        A trailing odd byte is ignored: ``struct.unpack`` requires the
        buffer length to match the format exactly, so passing the whole
        odd-length buffer would raise ``struct.error``.
        """
        n = len(code) // 2
        if not n:
            return ()
        # Only the first 2*n bytes form complete 16-bit codes.
        return struct.unpack(">%dH" % n, code[: 2 * n])
class IdentityCMapByte(IdentityCMap):
    """Identity CMap variant that treats every single byte as one cid."""

    def decode(self, code: bytes) -> Tuple[int, ...]:
        """Return the unsigned value of each byte in *code* as a tuple."""
        count = len(code)
        if count == 0:
            return ()
        layout = ">%dB" % count
        return struct.unpack(layout, code)
class UnicodeMap(CMapBase):
    """Mapping from cid to the unicode string it represents."""

    def __init__(self, **kwargs: Union[str, int]) -> None:
        CMapBase.__init__(self, **kwargs)
        self.cid2unichr: Dict[int, str] = {}

    def __repr__(self) -> str:
        return "<UnicodeMap: %s>" % self.attrs.get("CMapName")

    def get_unichr(self, cid: int) -> str:
        """Return the string stored for *cid*; raises KeyError if absent."""
        table = self.cid2unichr
        return table[cid]

    def dump(self, out: TextIO = sys.stdout) -> None:
        """Write one ``cid ... = unicode ...`` line per entry, sorted by cid."""
        emit = out.write
        for cid, text in sorted(self.cid2unichr.items()):
            emit("cid %d = unicode %r\n" % (cid, text))
class IdentityUnicodeMap(UnicodeMap):
    """Unicode map in which a cid already is the unicode code point."""

    def get_unichr(self, cid: int) -> str:
        """Interpret character id as unicode codepoint"""
        codepoint = cid
        return chr(codepoint)
class FileCMap(CMap):
    """CMap whose trie is filled in entry by entry via add_code2cid()."""

    def add_code2cid(self, code: str, cid: int) -> None:
        """Insert *cid* at the trie position spelled out by *code*.

        Each character of *code* contributes one trie level keyed by its
        ordinal; the last character's slot receives *cid*.
        """
        assert isinstance(code, str) and isinstance(cid, int), str(
            (type(code), type(cid)),
        )
        node = self.code2cid
        # Walk (creating as needed) one level per leading character.
        for ch in code[:-1]:
            key = ord(ch)
            if key not in node:
                node[key] = {}
            node = cast(Dict[int, object], node[key])
        leaf_key = ord(code[-1])
        node[leaf_key] = cid
class FileUnicodeMap(UnicodeMap):
    """Unicode map that is filled in entry by entry via add_cid2unichr()."""

    @staticmethod
    def _as_text(code: Union[PSLiteral, bytes, int]) -> str:
        """Convert a mapping target into the unicode string it denotes."""
        if isinstance(code, PSLiteral):
            # Interpret as an Adobe glyph name.
            assert isinstance(code.name, str)
            return name2unicode(code.name)
        if isinstance(code, bytes):
            # Interpret as UTF-16BE.
            return code.decode("UTF-16BE", "ignore")
        if isinstance(code, int):
            return chr(code)
        raise PDFTypeError(code)

    def add_cid2unichr(self, cid: int, code: Union[PSLiteral, bytes, int]) -> None:
        """Register the unicode string for *cid*.

        *code* may be a glyph-name literal, UTF-16BE bytes, or an integer
        code point; any other type raises PDFTypeError.
        """
        assert isinstance(cid, int), str(type(cid))
        text = self._as_text(code)
        # A0 = non-breaking space, some weird fonts can have a collision on a
        # cid here; keep an already-registered plain space in that case.
        nbsp_over_space = text == "\u00a0" and self.cid2unichr.get(cid) == " "
        if not nbsp_over_space:
            self.cid2unichr[cid] = text
class PyCMap(CMap):
    """CMap populated from a pre-built data object.

    *module* must expose ``CODE2CID`` (the code trie, used as-is without
    copying) and ``IS_VERTICAL`` (truthy for vertical writing mode).
    """

    def __init__(self, name: str, module: Any) -> None:
        super().__init__(CMapName=name)
        trie = module.CODE2CID
        self.code2cid = trie
        vertical = module.IS_VERTICAL
        if vertical:
            self.attrs["WMode"] = 1
class PyUnicodeMap(UnicodeMap):
    """Unicode map populated from a pre-built data object.

    *module* must expose ``CID2UNICHR_H`` and ``CID2UNICHR_V``; *vertical*
    selects which table is used and whether ``WMode`` is set to 1.
    """

    def __init__(self, name: str, module: Any, vertical: bool) -> None:
        super().__init__(CMapName=name)
        if not vertical:
            self.cid2unichr = module.CID2UNICHR_H
            return
        self.cid2unichr = module.CID2UNICHR_V
        self.attrs["WMode"] = 1
class CMapDB:
    """Loader and process-wide cache for the predefined CMap data files."""

    # Both caches are class attributes and therefore shared by all callers.
    _cmap_cache: Dict[str, PyCMap] = {}
    _umap_cache: Dict[str, List[PyUnicodeMap]] = {}

    class CMapNotFound(CMapError):
        """Raised when no data file exists for the requested CMap name."""

    @classmethod
    def _load_data(cls, name: str) -> Any:
        """Load ``<name>.pickle.gz`` from the first search directory that has it.

        The search directories are ``$CMAP_PATH`` (default
        ``/usr/share/pdf2zh/``) and the ``cmap`` directory next to this
        module. The unpickled mapping becomes the attribute namespace of a
        freshly created class, which is returned.

        Raises:
            CMapDB.CMapNotFound: if no matching file exists inside any
                search directory.
        """
        name = name.replace("\0", "")
        filename = "%s.pickle.gz" % name
        cmap_paths = (
            os.environ.get("CMAP_PATH", "/usr/share/pdf2zh/"),
            os.path.join(os.path.dirname(__file__), "cmap"),
        )
        for directory in cmap_paths:
            # SECURITY: *name* may originate from a PDF file and pickle.loads
            # executes arbitrary code.  An absolute name or one containing
            # ".." would make os.path.join() escape the search directory, so
            # only accept candidates that still live inside it.
            base = os.path.join(os.path.abspath(directory), "")
            path = os.path.abspath(os.path.join(directory, filename))
            if not path.startswith(base):
                continue
            if os.path.exists(path):
                with gzip.open(path) as gzfile:
                    # NOTE: the data files themselves must be trusted; see
                    # the warning in the pickle documentation.
                    return type(str(name), (), pickle.loads(gzfile.read()))
        raise CMapDB.CMapNotFound(name)

    @classmethod
    def get_cmap(cls, name: str) -> CMapBase:
        """Return the CMap called *name*.

        The four identity names produce a new identity CMap on every call;
        any other name is loaded from disk once and then served from the
        cache.

        Raises:
            CMapDB.CMapNotFound: if *name* has no data file.
        """
        if name == "Identity-H":
            return IdentityCMap(WMode=0)
        elif name == "Identity-V":
            return IdentityCMap(WMode=1)
        elif name == "OneByteIdentityH":
            return IdentityCMapByte(WMode=0)
        elif name == "OneByteIdentityV":
            return IdentityCMapByte(WMode=1)
        try:
            return cls._cmap_cache[name]
        except KeyError:
            pass
        data = cls._load_data(name)
        cls._cmap_cache[name] = cmap = PyCMap(name, data)
        return cmap

    @classmethod
    def get_unicode_map(cls, name: str, vertical: bool = False) -> UnicodeMap:
        """Return the unicode map ``to-unicode-<name>`` for the given direction.

        Both the horizontal and the vertical map are built on first use
        and cached as a two-element list indexed by *vertical*.

        Raises:
            CMapDB.CMapNotFound: if the data file does not exist.
        """
        try:
            return cls._umap_cache[name][vertical]
        except KeyError:
            pass
        data = cls._load_data("to-unicode-%s" % name)
        cls._umap_cache[name] = [PyUnicodeMap(name, data, v) for v in (False, True)]
        return cls._umap_cache[name][vertical]
class CMapParser(PSStackParser[PSKeyword]):
    """Stack parser that turns CMap operators into calls on a CMapBase.

    Operands are collected on the parser stack; when a terminating keyword
    (``def``, ``usecmap``, ``end...``) arrives, the operands are popped and
    forwarded to the target ``cmap``. Malformed entries are skipped with a
    warning that is logged once per distinct message.
    """

    def __init__(self, cmap: CMapBase, fp: BinaryIO) -> None:
        """Prepare to parse *fp*, recording results on *cmap*."""
        PSStackParser.__init__(self, fp)
        self.cmap = cmap
        # some ToUnicode maps don't have "begincmap" keyword.
        self._in_cmap = True
        # Messages already emitted by _warn_once().
        self._warnings: Set[str] = set()

    def run(self) -> None:
        """Drive the parser; reaching end of input is not an error."""
        try:
            self.nextobject()
        except PSEOF:
            pass

    KEYWORD_BEGINCMAP = KWD(b"begincmap")
    KEYWORD_ENDCMAP = KWD(b"endcmap")
    KEYWORD_USECMAP = KWD(b"usecmap")
    KEYWORD_DEF = KWD(b"def")
    KEYWORD_BEGINCODESPACERANGE = KWD(b"begincodespacerange")
    KEYWORD_ENDCODESPACERANGE = KWD(b"endcodespacerange")
    KEYWORD_BEGINCIDRANGE = KWD(b"begincidrange")
    KEYWORD_ENDCIDRANGE = KWD(b"endcidrange")
    KEYWORD_BEGINCIDCHAR = KWD(b"begincidchar")
    KEYWORD_ENDCIDCHAR = KWD(b"endcidchar")
    KEYWORD_BEGINBFRANGE = KWD(b"beginbfrange")
    KEYWORD_ENDBFRANGE = KWD(b"endbfrange")
    KEYWORD_BEGINBFCHAR = KWD(b"beginbfchar")
    KEYWORD_ENDBFCHAR = KWD(b"endbfchar")
    KEYWORD_BEGINNOTDEFRANGE = KWD(b"beginnotdefrange")
    KEYWORD_ENDNOTDEFRANGE = KWD(b"endnotdefrange")

    # Keywords whose only effect is to clear the operand stack.
    _DISCARD_KEYWORDS = (
        KEYWORD_BEGINCODESPACERANGE,
        KEYWORD_ENDCODESPACERANGE,
        KEYWORD_BEGINCIDRANGE,
        KEYWORD_BEGINCIDCHAR,
        KEYWORD_BEGINBFRANGE,
        KEYWORD_BEGINBFCHAR,
        KEYWORD_BEGINNOTDEFRANGE,
        KEYWORD_ENDNOTDEFRANGE,
    )

    def do_keyword(self, pos: int, token: PSKeyword) -> None:
        """ToUnicode CMaps

        See Section 5.9.2 - ToUnicode CMaps of the PDF Reference.

        Keywords seen after ``endcmap`` (and before the next ``begincmap``)
        are ignored. Keywords this parser does not recognise are pushed
        onto the stack as ordinary operands.
        """
        if token is self.KEYWORD_BEGINCMAP:
            self._in_cmap = True
            self.popall()
            return
        if token is self.KEYWORD_ENDCMAP:
            self._in_cmap = False
            return
        if not self._in_cmap:
            return

        if token is self.KEYWORD_DEF:
            self._do_def()
        elif token is self.KEYWORD_USECMAP:
            self._do_usecmap()
        elif token is self.KEYWORD_ENDCIDRANGE:
            self._do_endcidrange()
        elif token is self.KEYWORD_ENDCIDCHAR:
            self._do_endcidchar()
        elif token is self.KEYWORD_ENDBFRANGE:
            self._do_endbfrange()
        elif token is self.KEYWORD_ENDBFCHAR:
            self._do_endbfchar()
        elif any(token is kw for kw in self._DISCARD_KEYWORDS):
            self.popall()
        else:
            self.push((pos, token))

    def _do_def(self) -> None:
        """Handle ``def``: store the top two operands as a cmap attribute."""
        try:
            ((_, k), (_, v)) = self.pop(2)
            self.cmap.set_attr(literal_name(k), v)
        except PSSyntaxError:
            pass

    def _do_usecmap(self) -> None:
        """Handle ``usecmap``: import the named CMap; unknown names are ignored."""
        try:
            ((_, cmapname),) = self.pop(1)
            self.cmap.use_cmap(CMapDB.get_cmap(literal_name(cmapname)))
        except PSSyntaxError:
            pass
        except CMapDB.CMapNotFound:
            pass

    def _do_endcidrange(self) -> None:
        """Handle ``endcidrange``: process (start, end, cid) operand triples."""
        objs = [obj for (__, obj) in self.popall()]
        for start_byte, end_byte, cid in choplist(3, objs):
            if not isinstance(start_byte, bytes):
                self._warn_once("The start object of begincidrange is not a byte.")
                continue
            if not isinstance(end_byte, bytes):
                self._warn_once("The end object of begincidrange is not a byte.")
                continue
            if not isinstance(cid, int):
                # The cid operand is expected to be an integer, not bytes.
                self._warn_once(
                    "The cid object of begincidrange is not an integer.",
                )
                continue
            if len(start_byte) != len(end_byte):
                self._warn_once(
                    "The start and end byte of begincidrange have "
                    "different lengths.",
                )
                continue
            start_prefix = start_byte[:-4]
            end_prefix = end_byte[:-4]
            if start_prefix != end_prefix:
                self._warn_once(
                    "The prefix of the start and end byte of "
                    "begincidrange are not the same.",
                )
                continue
            # Only the last (up to) four bytes vary across the range.
            svar = start_byte[-4:]
            evar = end_byte[-4:]
            start = nunpack(svar)
            end = nunpack(evar)
            vlen = len(svar)
            for i in range(end - start + 1):
                x = start_prefix + struct.pack(">L", start + i)[-vlen:]
                self.cmap.add_cid2unichr(cid + i, x)

    def _do_endcidchar(self) -> None:
        """Handle ``endcidchar``: process (cid, code) operand pairs."""
        objs = [obj for (__, obj) in self.popall()]
        for cid, code in choplist(2, objs):
            if isinstance(code, bytes) and isinstance(cid, int):
                self.cmap.add_cid2unichr(cid, code)

    def _do_endbfrange(self) -> None:
        """Handle ``endbfrange``: process (start, end, destination) triples.

        The destination is either a list with one value per code in the
        range, or a byte string that is incremented for each code.
        """
        objs = [obj for (__, obj) in self.popall()]
        for start_byte, end_byte, code in choplist(3, objs):
            if not isinstance(start_byte, bytes):
                self._warn_once("The start object is not a byte.")
                continue
            if not isinstance(end_byte, bytes):
                self._warn_once("The end object is not a byte.")
                continue
            if len(start_byte) != len(end_byte):
                self._warn_once("The start and end byte have different lengths.")
                continue
            start = nunpack(start_byte)
            end = nunpack(end_byte)
            if isinstance(code, list):
                if len(code) != end - start + 1:
                    self._warn_once(
                        "The difference between the start and end "
                        "offsets does not match the code length.",
                    )
                # zip() stops at the shorter side, so a mismatched list
                # still maps as many codes as possible.
                for cid, unicode_value in zip(range(start, end + 1), code):
                    self.cmap.add_cid2unichr(cid, unicode_value)
            else:
                if not isinstance(code, bytes):
                    # Malformed input must not abort parsing (and an assert
                    # would vanish under -O); skip the entry like the other
                    # validation failures above.
                    self._warn_once(
                        "The destination object of beginbfrange is neither "
                        "a byte string nor a list.",
                    )
                    continue
                var = code[-4:]
                base = nunpack(var)
                prefix = code[:-4]
                vlen = len(var)
                for i in range(end - start + 1):
                    x = prefix + struct.pack(">L", base + i)[-vlen:]
                    self.cmap.add_cid2unichr(start + i, x)

    def _do_endbfchar(self) -> None:
        """Handle ``endbfchar``: process (source code, destination) pairs."""
        objs = [obj for (__, obj) in self.popall()]
        for cid, code in choplist(2, objs):
            if isinstance(cid, bytes) and isinstance(code, bytes):
                self.cmap.add_cid2unichr(nunpack(cid), code)

    def _warn_once(self, msg: str) -> None:
        """Warn once for each unique message"""
        if msg not in self._warnings:
            self._warnings.add(msg)
            base_msg = (
                "Ignoring (part of) ToUnicode map because the PDF data "
                "does not conform to the format. This could result in "
                "(cid) values in the output. "
            )
            log.warning(base_msg + msg)