PDFTranslate / pdf2zh /encodingdb.py
sanbo
update sth. at 2024-11-26 16:15:47
9b0f4a0
import logging
import re
from typing import Dict, Iterable, Optional, cast
from pdf2zh.glyphlist import glyphname2unicode
from pdf2zh.latin_enc import ENCODING
from pdf2zh.pdfexceptions import PDFKeyError
from pdf2zh.psparser import PSLiteral
# Matches a run of one or more hexadecimal digits (upper or lower case).
# The pattern carries no anchors of its own, so how much of a string has to
# be hexadecimal depends on which match method the caller uses.
HEXADECIMAL = re.compile(r"[0-9a-fA-F]+")
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
def name2unicode(name: str) -> str:
    """Convert an Adobe glyph name to the Unicode string it denotes.

    In contrast to the specification, this raises a key error instead of
    returning an empty string when the name is unknown. This way the caller
    must explicitly define what to do when there is no match.

    Resolution order:

    1. Everything from the first ``"."`` onwards is dropped.
    2. A name containing ``"_"`` is split and every component is converted
       on its own; the results are concatenated.
    3. A name present in ``glyphname2unicode`` maps to its table entry.
    4. ``uni`` followed by one or more groups of four hex digits maps to
       one character per group.
    5. ``u`` followed by four to six hex digits maps to a single character.

    Reference:
    https://github.com/adobe-type-tools/agl-specification#2-the-mapping

    :param name: glyph name, e.g. ``"A"``, ``"uni0041"`` or ``"u1F600"``
    :returns: the character(s) the glyph name stands for
    :raises PDFKeyError: if ``name`` is not a ``str``, does not match any of
        the forms above, or denotes a surrogate / out-of-range code point
    """
    if not isinstance(name, str):
        raise PDFKeyError(
            'Could not convert unicode name "%s" to character because '
            "it should be of type str but is of type %s" % (name, type(name)),
        )

    # Only the part before the first full stop takes part in the mapping.
    name = name.split(".")[0]

    components = name.split("_")
    if len(components) > 1:
        return "".join(map(name2unicode, components))

    if name in glyphname2unicode:
        return glyphname2unicode[name]

    if name.startswith("uni"):
        # Slice the prefix off. str.strip("uni") removes any run of the
        # characters u/n/i from BOTH ends, which accepted malformed names
        # such as "unii0041".
        hex_digits = name[len("uni") :]
        # fullmatch: the whole remainder has to be hexadecimal, otherwise
        # int() below would raise ValueError on trailing junk.
        if HEXADECIMAL.fullmatch(hex_digits) and len(hex_digits) % 4 == 0:
            code_points = [
                int(hex_digits[i : i + 4], base=16)
                for i in range(0, len(hex_digits), 4)
            ]
            for code_point in code_points:
                raise_key_error_for_invalid_unicode(code_point)
            return "".join(map(chr, code_points))
    elif name.startswith("u"):
        hex_digits = name[len("u") :]
        if HEXADECIMAL.fullmatch(hex_digits) and 4 <= len(hex_digits) <= 6:
            code_point = int(hex_digits, base=16)
            raise_key_error_for_invalid_unicode(code_point)
            # Six hex digits can exceed the largest code point (10FFFF);
            # chr() would raise ValueError there, so fall through to the
            # documented key error instead.
            if code_point <= 0x10FFFF:
                return chr(code_point)

    raise PDFKeyError(
        'Could not convert unicode name "%s" to character because '
        "it does not match specification" % name,
    )
def raise_key_error_for_invalid_unicode(unicode_digit: int) -> None:
    """Reject values that fall inside the UTF-16 surrogate range.

    Values D800 through DFFF are used for surrogate pairs in UTF-16 and
    therefore must not be used as Unicode values here.

    :param unicode_digit: numeric value to check
    :raises PDFKeyError: if the value lies in D800 through DFFF
    """
    # Exclusive bounds: D7FF is the last value below the range, E000 the
    # first value above it.
    in_surrogate_range = 0xD7FF < unicode_digit < 0xE000
    if not in_surrogate_range:
        return None

    message = (
        "Unicode digit %d is invalid because "
        "it is in the range D800 through DFFF" % unicode_digit
    )
    raise PDFKeyError(message)
class EncodingDB:
    """Character-code-to-Unicode tables for the predefined encodings.

    The four tables are filled once, while the class body executes, from the
    rows of ``ENCODING``. Each row supplies a glyph name and one character
    code per encoding (standard, Mac, Windows, PDFDoc, in that order).
    """

    # One table per encoding: character code -> Unicode string.
    std2unicode: Dict[int, str] = {}
    mac2unicode: Dict[int, str] = {}
    win2unicode: Dict[int, str] = {}
    pdf2unicode: Dict[int, str] = {}

    for name, std, mac, win, pdf in ENCODING:
        c = name2unicode(name)
        # A falsy code means the glyph gets no entry in that encoding
        # (presumably None in ENCODING -- verify against latin_enc).
        if std:
            std2unicode[std] = c
        if mac:
            mac2unicode[mac] = c
        if win:
            win2unicode[win] = c
        if pdf:
            pdf2unicode[pdf] = c

    # Encoding name -> table, as looked up by get_encoding().
    encodings = {
        "StandardEncoding": std2unicode,
        "MacRomanEncoding": mac2unicode,
        "WinAnsiEncoding": win2unicode,
        "PDFDocEncoding": pdf2unicode,
    }

    @classmethod
    def get_encoding(
        cls,
        name: str,
        diff: Optional[Iterable[object]] = None,
    ) -> Dict[int, str]:
        """Return the code-to-Unicode table for an encoding.

        :param name: encoding name; a name that is not a key of
            ``encodings`` falls back to the standard table
        :param diff: optional sequence of overrides. An ``int`` item sets
            the current character code; each ``PSLiteral`` item names the
            glyph for the current code, after which the code advances by
            one. Items of any other type are ignored.
        :returns: the table for ``name``. When ``diff`` is empty or None
            this is the shared class-level dict itself, not a copy, so
            callers must not modify it. Otherwise it is a copy with the
            overrides applied.
        """
        cid2unicode = cls.encodings.get(name, cls.std2unicode)
        if diff:
            # Copy first so the overrides never leak into the shared table.
            cid2unicode = cid2unicode.copy()
            cid = 0
            for x in diff:
                if isinstance(x, int):
                    cid = x
                elif isinstance(x, PSLiteral):
                    try:
                        cid2unicode[cid] = name2unicode(cast(str, x.name))
                    except (KeyError, ValueError) as e:
                        # Best effort: an unconvertible glyph name keeps
                        # whatever the base table has for this code. Log at
                        # debug level so the skip is visible when needed.
                        log.debug(
                            "Ignoring glyph name %r for code %d: %s",
                            x.name,
                            cid,
                            e,
                        )
                    # The code advances even when the name was skipped.
                    cid += 1
        return cid2unicode