import logging
import re
from typing import Dict, Iterable, Optional, cast

from pdf2zh.glyphlist import glyphname2unicode
from pdf2zh.latin_enc import ENCODING
from pdf2zh.pdfexceptions import PDFKeyError
from pdf2zh.psparser import PSLiteral

HEXADECIMAL = re.compile(r"[0-9a-fA-F]+")

log = logging.getLogger(__name__)


def name2unicode(name: str) -> str:
    """Convert an Adobe glyph name to the Unicode string it denotes.

    In contrast to the specification, this raises a key error instead of
    returning an empty string when the name is unknown. This way the caller
    must explicitly define what to do when there is no match.

    Resolution order, applied after dropping everything from the first "."
    onwards:

    1. Names containing "_" are split and each component is converted
       recursively; the results are concatenated.
    2. Names present in ``glyphname2unicode`` are looked up directly.
    3. ``uniXXXX[XXXX...]``: one or more groups of exactly four hex digits.
    4. ``uXXXX`` .. ``uXXXXXX``: a single code point of four to six hex digits.

    Reference:
    https://github.com/adobe-type-tools/agl-specification#2-the-mapping

    :param name: the glyph name to convert.
    :returns: the unicode character(s) the name resolves to.
    :raises PDFKeyError: if ``name`` is not a ``str``, does not match any of
        the forms above, or denotes a surrogate code point.
    """
    if not isinstance(name, str):
        raise PDFKeyError(
            'Could not convert unicode name "%s" to character because '
            "it should be of type str but is of type %s" % (name, type(name)),
        )

    # Anything after the first period is a suffix that does not affect mapping.
    name = name.split(".")[0]
    components = name.split("_")

    if len(components) > 1:
        return "".join(map(name2unicode, components))

    elif name in glyphname2unicode:
        return glyphname2unicode[name]

    elif name.startswith("uni"):
        # Remove the prefix only. str.strip("uni") would strip any of the
        # characters 'u', 'n', 'i' from *both* ends of the name.
        name_without_uni = name[len("uni") :]

        # fullmatch: every remaining character must be a hex digit, otherwise
        # int() below would fail with ValueError on trailing garbage.
        if (
            HEXADECIMAL.fullmatch(name_without_uni)
            and len(name_without_uni) % 4 == 0
        ):
            unicode_digits = [
                int(name_without_uni[i : i + 4], base=16)
                for i in range(0, len(name_without_uni), 4)
            ]
            for digit in unicode_digits:
                raise_key_error_for_invalid_unicode(digit)
            characters = map(chr, unicode_digits)
            return "".join(characters)

    elif name.startswith("u"):
        # Same prefix-only removal as for the "uni" form above.
        name_without_u = name[len("u") :]

        if HEXADECIMAL.fullmatch(name_without_u) and 4 <= len(name_without_u) <= 6:
            unicode_digit = int(name_without_u, base=16)
            # Six hex digits can exceed the largest code point chr() accepts
            # (0x10FFFF); such names fall through to the key error below
            # rather than leaking a ValueError from chr().
            if unicode_digit <= 0x10FFFF:
                raise_key_error_for_invalid_unicode(unicode_digit)
                return chr(unicode_digit)

    raise PDFKeyError(
        'Could not convert unicode name "%s" to character because '
        "it does not match specification" % name,
    )


def raise_key_error_for_invalid_unicode(unicode_digit: int) -> None:
    """Reject code points that fall in the UTF-16 surrogate range.

    Unicode values should not be in the range D800 through DFFF because that
    range is used for surrogate pairs in UTF-16.

    :param unicode_digit: the code point to validate.
    :raises PDFKeyError: if the code point lies in D800 through DFFF.
    """
    # 55295 == 0xD7FF and 57344 == 0xE000, so this is 0xD800..0xDFFF inclusive.
    if 55295 < unicode_digit < 57344:
        raise PDFKeyError(
            "Unicode digit %d is invalid because "
            "it is in the range D800 through DFFF" % unicode_digit,
        )


class EncodingDB:
    """Lookup tables from character codes to unicode strings.

    The four per-encoding tables are filled once, when the class body is
    executed, from the rows of ``ENCODING``.
    """

    std2unicode: Dict[int, str] = {}
    mac2unicode: Dict[int, str] = {}
    win2unicode: Dict[int, str] = {}
    pdf2unicode: Dict[int, str] = {}

    # Each row holds a glyph name and its code in each of the four encodings;
    # a falsy code means the glyph gets no entry in that encoding's table.
    for name, std, mac, win, pdf in ENCODING:
        c = name2unicode(name)
        if std:
            std2unicode[std] = c
        if mac:
            mac2unicode[mac] = c
        if win:
            win2unicode[win] = c
        if pdf:
            pdf2unicode[pdf] = c

    # Encoding name -> table, as consulted by get_encoding().
    encodings: Dict[str, Dict[int, str]] = {
        "StandardEncoding": std2unicode,
        "MacRomanEncoding": mac2unicode,
        "WinAnsiEncoding": win2unicode,
        "PDFDocEncoding": pdf2unicode,
    }

    @classmethod
    def get_encoding(
        cls,
        name: str,
        diff: Optional[Iterable[object]] = None,
    ) -> Dict[int, str]:
        """Return the code-to-unicode table for the encoding called ``name``.

        :param name: key into ``encodings``; unknown names fall back to the
            ``std2unicode`` table.
        :param diff: optional sequence of overrides. An ``int`` item sets the
            current code; each ``PSLiteral`` item names the glyph for the
            current code, after which the code is advanced by one. Items of
            any other type are ignored.
        :returns: the shared class-level table when ``diff`` is empty or
            ``None`` (callers must not mutate it), otherwise a modified copy.
        """
        cid2unicode = cls.encodings.get(name, cls.std2unicode)
        if diff:
            # Copy first so the overrides never touch the shared tables.
            cid2unicode = cid2unicode.copy()
            cid = 0
            for x in diff:
                if isinstance(x, int):
                    cid = x
                elif isinstance(x, PSLiteral):
                    try:
                        cid2unicode[cid] = name2unicode(cast(str, x.name))
                    except (KeyError, ValueError):
                        # Best effort: a glyph name that cannot be converted
                        # keeps the base encoding's entry for this code.
                        pass
                    # The code advances even when the conversion failed.
                    cid += 1
        return cid2unicode