Spaces:
Running
Running
import logging | |
import struct | |
from io import BytesIO | |
from typing import ( | |
TYPE_CHECKING, | |
Any, | |
BinaryIO, | |
Dict, | |
Iterable, | |
Iterator, | |
List, | |
Mapping, | |
Optional, | |
Tuple, | |
Union, | |
cast, | |
) | |
from pdf2zh import settings | |
from pdf2zh.cmapdb import ( | |
CMap, | |
CMapBase, | |
CMapDB, | |
CMapParser, | |
FileUnicodeMap, | |
IdentityUnicodeMap, | |
UnicodeMap, | |
) | |
from pdf2zh.encodingdb import EncodingDB, name2unicode | |
from pdf2zh.fontmetrics import FONT_METRICS | |
from pdf2zh.pdfexceptions import PDFException, PDFKeyError, PDFValueError | |
from pdf2zh.pdftypes import ( | |
PDFStream, | |
dict_value, | |
int_value, | |
list_value, | |
num_value, | |
resolve1, | |
resolve_all, | |
stream_value, | |
) | |
from pdf2zh.psexceptions import PSEOF | |
from pdf2zh.psparser import ( | |
KWD, | |
LIT, | |
PSKeyword, | |
PSLiteral, | |
PSStackParser, | |
literal_name, | |
) | |
from pdf2zh.utils import Matrix, Point, Rect, apply_matrix_norm, choplist, nunpack | |
if TYPE_CHECKING: | |
from pdf2zh.pdfinterp import PDFResourceManager | |
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
def get_widths(seq: Iterable[object]) -> Dict[int, float]:
    """Build a mapping of character widths for horizontal writing.

    Numbers from ``seq`` are collected until one of two shapes completes:

    * ``c [w0 w1 ...]`` -- a number followed by a list assigns the list
      entries to consecutive codes starting at ``c``;
    * ``c_first c_last w`` -- three numbers assign ``w`` to every code in the
      inclusive range.

    Items that are neither a list nor an int/float are ignored, and a list
    that is not preceded by a pending number is skipped.

    :param seq: flat sequence of numbers and lists.
    :returns: mapping of character code to width.
    """
    widths: Dict[int, float] = {}
    pending: List[float] = []
    for item in seq:
        if isinstance(item, list):
            if pending:
                # Only the most recent number is used as the start code.
                start = pending[-1]
                widths.update(
                    (cast(int, start) + offset, width)
                    for offset, width in enumerate(item)
                )
                pending = []
        elif isinstance(item, (int, float)):  # == utils.isnumber(item)
            pending.append(item)
            if len(pending) == 3:
                first, last, width = pending
                codes = range(cast(int, first), cast(int, last) + 1)
                widths.update(dict.fromkeys(codes, width))
                pending = []
    return widths
def get_widths2(seq: Iterable[object]) -> Dict[int, Tuple[float, Point]]:
    """Build a mapping of character widths for vertical writing.

    Numbers from ``seq`` are collected until one of two shapes completes:

    * ``c [w vx vy  w vx vy ...]`` -- a number followed by a list; the list
      is chopped into triples that are assigned to consecutive codes
      starting at ``c``;
    * ``c_first c_last w vx vy`` -- five numbers assign the same triple to
      every code in the inclusive range.

    Items that are neither a list nor an int/float are ignored, and a list
    that is not preceded by a pending number is skipped.

    :param seq: flat sequence of numbers and lists.
    :returns: mapping of character code to ``(w, (vx, vy))``.
    """
    widths: Dict[int, Tuple[float, Point]] = {}
    pending: List[float] = []
    for item in seq:
        if isinstance(item, list):
            if pending:
                # Only the most recent number is used as the start code.
                start = pending[-1]
                for offset, (w, vx, vy) in enumerate(choplist(3, item)):
                    widths[cast(int, start) + offset] = (w, (vx, vy))
                pending = []
        elif isinstance(item, (int, float)):  # == utils.isnumber(item)
            pending.append(item)
            if len(pending) == 5:
                first, last, w, vx, vy = pending
                for code in range(cast(int, first), cast(int, last) + 1):
                    widths[code] = (w, (vx, vy))
                pending = []
    return widths
class FontMetricsDB:
    """Lookup facade over the module-level ``FONT_METRICS`` table."""

    @classmethod
    def get_metrics(cls, fontname: str) -> Tuple[Dict[str, object], Dict[str, int]]:
        """Return the ``(descriptor, widths)`` pair stored for ``fontname``.

        This must be a classmethod: it is invoked on the class itself
        (``FontMetricsDB.get_metrics(name)``), and without the decorator that
        call fails with ``TypeError`` instead of the ``KeyError`` that callers
        rely on to detect an unknown font.

        :param fontname: key into ``FONT_METRICS``.
        :raises KeyError: if ``fontname`` has no entry.
        """
        return FONT_METRICS[fontname]
# int here means that we're not extending PSStackParser with additional types. | |
class Type1FontHeaderParser(PSStackParser[int]):
    """Extract the code -> glyph-name encoding from Type 1 font header data.

    Only the ``put`` keyword is acted on (see ``do_keyword``); each
    ``<int> <literal> put`` pair becomes one result object that
    ``get_encoding`` turns into a unicode mapping entry.
    """

    KEYWORD_BEGIN = KWD(b"begin")
    KEYWORD_END = KWD(b"end")
    KEYWORD_DEF = KWD(b"def")
    KEYWORD_PUT = KWD(b"put")
    KEYWORD_DICT = KWD(b"dict")
    KEYWORD_ARRAY = KWD(b"array")
    KEYWORD_READONLY = KWD(b"readonly")
    KEYWORD_FOR = KWD(b"for")

    def __init__(self, data: BinaryIO) -> None:
        super().__init__(data)
        self._cid2unicode: Dict[int, str] = {}

    def get_encoding(self) -> Dict[int, str]:
        """Parse the font encoding.

        The Type1 font encoding maps character codes to character names.
        These names are either standard Adobe glyph names or names tied to
        custom CharStrings of this font. Names for which ``name2unicode``
        raises ``KeyError`` are left out of the returned mapping.

        Reference: Adobe Systems Incorporated, Adobe Type 1 Font Format

        :returns: mapping of character identifiers (cid's) to unicode
            characters
        """
        mapping = self._cid2unicode
        while True:
            try:
                _pos, (cid, name) = self.nextobject()
            except PSEOF:
                return mapping
            try:
                unichr = name2unicode(cast(str, name))
            except KeyError:
                # Glyph name has no known unicode value: skip this code.
                continue
            mapping[cid] = unichr

    def do_keyword(self, pos: int, token: PSKeyword) -> None:
        """Record ``<int> <literal> put`` pairs; ignore every other keyword."""
        if token is not self.KEYWORD_PUT:
            return
        ((_, key), (_, value)) = self.pop(2)
        if isinstance(key, int) and isinstance(value, PSLiteral):
            self.add_results((key, literal_name(value)))
# Text contributed by each 4-bit nibble of a packed real-number operand
# (consumed by getdict). Index 13 has no text (None); nibble 15 terminates the
# number and is handled by the caller, so it needs no entry here.
NIBBLES = (*"0123456789", ".", "e", "e-", None, "-")

# Mapping of cmap names. Original cmap name is kept if not in the mapping.
# (missing reference for why DLIdent is mapped to Identity)
IDENTITY_ENCODER = {
    "DLIdent-H": "Identity-H",
    "DLIdent-V": "Identity-V",
}
def getdict(data: bytes) -> Dict[int, List[Union[float, int]]]:
    """Decode DICT data into a mapping of operator -> operand list.

    Bytes are read one at a time. A byte ``<= 21`` is an operator: the
    operands collected so far are stored under it and a fresh operand list is
    started. Any other byte starts an operand:

    * ``30``: packed real number, two nibbles per byte, ended by nibble 15;
    * ``32..246``: the integer ``b0 - 139``;
    * ``247..250`` / ``251..254``: two-byte positive / negative integer;
    * ``28``: signed integer built from the next two bytes;
    * anything else: signed integer built from the next four bytes.

    Operands that are not followed by an operator are discarded.

    :param data: raw DICT bytes.
    :returns: mapping of operator byte to its operands, in reading order.
    """
    entries: Dict[int, List[Union[float, int]]] = {}
    stream = BytesIO(data)
    operands: List[Union[float, int]] = []
    while True:
        lead = stream.read(1)
        if not lead:
            break
        b0 = ord(lead)
        if b0 <= 21:
            entries[b0] = operands
            operands = []
            continue
        operand: Union[float, int]
        if b0 == 30:
            pieces: List[str] = []
            terminated = False
            while not terminated:
                packed = ord(stream.read(1))
                # Both nibbles of the byte are always examined, even when the
                # high nibble is the terminator.
                for nib in (packed >> 4, packed & 15):
                    if nib == 15:
                        terminated = True
                        continue
                    text = NIBBLES[nib]
                    assert text is not None
                    pieces.append(text)
            operand = float("".join(pieces))
        elif 32 <= b0 <= 246:
            operand = b0 - 139
        else:
            b1 = ord(stream.read(1))
            if 247 <= b0 <= 250:
                operand = ((b0 - 247) << 8) + b1 + 108
            elif 251 <= b0 <= 254:
                operand = -((b0 - 251) << 8) - b1 - 108
            else:
                b2 = ord(stream.read(1))
                if b1 >= 128:
                    # Reinterpret the leading byte as signed.
                    b1 -= 256
                if b0 == 28:
                    operand = b1 << 8 | b2
                else:
                    (low,) = struct.unpack(">H", stream.read(2))
                    operand = b1 << 24 | b2 << 16 | low
        operands.append(operand)
    return entries
class CFFFont:
    """Reader for a CFF font program held in a seekable binary stream.

    The constructor parses the header, the four leading INDEX structures, the
    first Top DICT, the CharStrings INDEX, and then the encoding and charset
    tables, filling ``code2gid``/``gid2code`` and ``name2gid``/``gid2name``.
    """

    # Predefined strings addressed by string id (SID) 0..390. Larger SIDs are
    # looked up in the font's own String INDEX (see ``getstr``).
    STANDARD_STRINGS = (
        ".notdef", "space", "exclam", "quotedbl", "numbersign",
        "dollar", "percent", "ampersand", "quoteright", "parenleft",
        "parenright", "asterisk", "plus", "comma", "hyphen",
        "period", "slash", "zero", "one", "two",
        "three", "four", "five", "six", "seven",
        "eight", "nine", "colon", "semicolon", "less",
        "equal", "greater", "question", "at", "A",
        "B", "C", "D", "E", "F",
        "G", "H", "I", "J", "K",
        "L", "M", "N", "O", "P",
        "Q", "R", "S", "T", "U",
        "V", "W", "X", "Y", "Z",
        "bracketleft", "backslash", "bracketright", "asciicircum", "underscore",
        "quoteleft", "a", "b", "c", "d",
        "e", "f", "g", "h", "i",
        "j", "k", "l", "m", "n",
        "o", "p", "q", "r", "s",
        "t", "u", "v", "w", "x",
        "y", "z", "braceleft", "bar", "braceright",
        "asciitilde", "exclamdown", "cent", "sterling", "fraction",
        "yen", "florin", "section", "currency", "quotesingle",
        "quotedblleft", "guillemotleft", "guilsinglleft", "guilsinglright", "fi",
        "fl", "endash", "dagger", "daggerdbl", "periodcentered",
        "paragraph", "bullet", "quotesinglbase", "quotedblbase", "quotedblright",
        "guillemotright", "ellipsis", "perthousand", "questiondown", "grave",
        "acute", "circumflex", "tilde", "macron", "breve",
        "dotaccent", "dieresis", "ring", "cedilla", "hungarumlaut",
        "ogonek", "caron", "emdash", "AE", "ordfeminine",
        "Lslash", "Oslash", "OE", "ordmasculine", "ae",
        "dotlessi", "lslash", "oslash", "oe", "germandbls",
        "onesuperior", "logicalnot", "mu", "trademark", "Eth",
        "onehalf", "plusminus", "Thorn", "onequarter", "divide",
        "brokenbar", "degree", "thorn", "threequarters", "twosuperior",
        "registered", "minus", "eth", "multiply", "threesuperior",
        "copyright", "Aacute", "Acircumflex", "Adieresis", "Agrave",
        "Aring", "Atilde", "Ccedilla", "Eacute", "Ecircumflex",
        "Edieresis", "Egrave", "Iacute", "Icircumflex", "Idieresis",
        "Igrave", "Ntilde", "Oacute", "Ocircumflex", "Odieresis",
        "Ograve", "Otilde", "Scaron", "Uacute", "Ucircumflex",
        "Udieresis", "Ugrave", "Yacute", "Ydieresis", "Zcaron",
        "aacute", "acircumflex", "adieresis", "agrave", "aring",
        "atilde", "ccedilla", "eacute", "ecircumflex", "edieresis",
        "egrave", "iacute", "icircumflex", "idieresis", "igrave",
        "ntilde", "oacute", "ocircumflex", "odieresis", "ograve",
        "otilde", "scaron", "uacute", "ucircumflex", "udieresis",
        "ugrave", "yacute", "ydieresis", "zcaron", "exclamsmall",
        "Hungarumlautsmall", "dollaroldstyle", "dollarsuperior", "ampersandsmall", "Acutesmall",
        "parenleftsuperior", "parenrightsuperior", "twodotenleader", "onedotenleader", "zerooldstyle",
        "oneoldstyle", "twooldstyle", "threeoldstyle", "fouroldstyle", "fiveoldstyle",
        "sixoldstyle", "sevenoldstyle", "eightoldstyle", "nineoldstyle", "commasuperior",
        "threequartersemdash", "periodsuperior", "questionsmall", "asuperior", "bsuperior",
        "centsuperior", "dsuperior", "esuperior", "isuperior", "lsuperior",
        "msuperior", "nsuperior", "osuperior", "rsuperior", "ssuperior",
        "tsuperior", "ff", "ffi", "ffl", "parenleftinferior",
        "parenrightinferior", "Circumflexsmall", "hyphensuperior", "Gravesmall", "Asmall",
        "Bsmall", "Csmall", "Dsmall", "Esmall", "Fsmall",
        "Gsmall", "Hsmall", "Ismall", "Jsmall", "Ksmall",
        "Lsmall", "Msmall", "Nsmall", "Osmall", "Psmall",
        "Qsmall", "Rsmall", "Ssmall", "Tsmall", "Usmall",
        "Vsmall", "Wsmall", "Xsmall", "Ysmall", "Zsmall",
        "colonmonetary", "onefitted", "rupiah", "Tildesmall", "exclamdownsmall",
        "centoldstyle", "Lslashsmall", "Scaronsmall", "Zcaronsmall", "Dieresissmall",
        "Brevesmall", "Caronsmall", "Dotaccentsmall", "Macronsmall", "figuredash",
        "hypheninferior", "Ogoneksmall", "Ringsmall", "Cedillasmall", "questiondownsmall",
        "oneeighth", "threeeighths", "fiveeighths", "seveneighths", "onethird",
        "twothirds", "zerosuperior", "foursuperior", "fivesuperior", "sixsuperior",
        "sevensuperior", "eightsuperior", "ninesuperior", "zeroinferior", "oneinferior",
        "twoinferior", "threeinferior", "fourinferior", "fiveinferior", "sixinferior",
        "seveninferior", "eightinferior", "nineinferior", "centinferior", "dollarinferior",
        "periodinferior", "commainferior", "Agravesmall", "Aacutesmall", "Acircumflexsmall",
        "Atildesmall", "Adieresissmall", "Aringsmall", "AEsmall", "Ccedillasmall",
        "Egravesmall", "Eacutesmall", "Ecircumflexsmall", "Edieresissmall", "Igravesmall",
        "Iacutesmall", "Icircumflexsmall", "Idieresissmall", "Ethsmall", "Ntildesmall",
        "Ogravesmall", "Oacutesmall", "Ocircumflexsmall", "Otildesmall", "Odieresissmall",
        "OEsmall", "Oslashsmall", "Ugravesmall", "Uacutesmall", "Ucircumflexsmall",
        "Udieresissmall", "Yacutesmall", "Thornsmall", "Ydieresissmall", "001.000",
        "001.001", "001.002", "001.003", "Black", "Bold",
        "Book", "Light", "Medium", "Regular", "Roman",
        "Semibold",
    )

    class INDEX:
        """Random-access view over one INDEX structure of the font stream.

        Construction reads the count, the offset size and ``count + 1``
        offsets from the current position of ``fp``, then leaves ``fp``
        positioned just past the INDEX data. Items are read lazily from ``fp``
        on each ``__getitem__`` call, which moves the stream position.
        """

        def __init__(self, fp: BinaryIO) -> None:
            self.fp = fp
            self.offsets: List[int] = []
            (count,) = struct.unpack(">H", self.fp.read(2))
            if count == 0:
                # An empty INDEX is the 2-byte count field alone: there is no
                # offset-size byte and no offset array. Reading them anyway
                # would consume bytes that belong to whatever follows.
                # A single sentinel offset keeps len() == 0 and iteration
                # empty; the stream is already positioned past the INDEX.
                self.offsets.append(1)
                self.base = self.fp.tell() - 1
                return
            (offsize,) = struct.unpack("B", self.fp.read(1))
            for _ in range(count + 1):
                self.offsets.append(int.from_bytes(self.fp.read(offsize), "big"))
            # Stored offsets are 1-based relative to the byte preceding the
            # data area, hence the "- 1".
            self.base = self.fp.tell() - 1
            self.fp.seek(self.base + self.offsets[-1])

        def __repr__(self) -> str:
            return "<INDEX: size=%d>" % len(self)

        def __len__(self) -> int:
            return len(self.offsets) - 1

        def __getitem__(self, i: int) -> bytes:
            """Return the raw bytes of item ``i`` (seeks and reads ``fp``)."""
            self.fp.seek(self.base + self.offsets[i])
            return self.fp.read(self.offsets[i + 1] - self.offsets[i])

        def __iter__(self) -> Iterator[bytes]:
            return iter(self[i] for i in range(len(self)))

    def __init__(self, name: str, fp: BinaryIO) -> None:
        """Parse the font program in ``fp``.

        :param name: font name, stored as ``self.name``.
        :param fp: seekable binary stream positioned at the font header.
        :raises PDFValueError: on an unknown encoding or charset format byte.
        :raises AssertionError: on charset format 2, which is not handled.
        """
        self.name = name
        self.fp = fp
        # Header
        (_major, _minor, hdrsize, offsize) = struct.unpack("BBBB", self.fp.read(4))
        # Skip any header bytes beyond the four just read.
        self.fp.read(hdrsize - 4)
        # Name INDEX
        self.name_index = self.INDEX(self.fp)
        # Top DICT INDEX
        self.dict_index = self.INDEX(self.fp)
        # String INDEX
        self.string_index = self.INDEX(self.fp)
        # Global Subr INDEX
        self.subr_index = self.INDEX(self.fp)
        # Top DICT DATA
        self.top_dict = getdict(self.dict_index[0])
        # NOTE(review): a missing operator falls back to offset 0, which is
        # then parsed as if a table started at the beginning of the stream --
        # confirm this is the intended handling of the default values.
        (charset_pos,) = self.top_dict.get(15, [0])
        (encoding_pos,) = self.top_dict.get(16, [0])
        (charstring_pos,) = self.top_dict.get(17, [0])
        # CharStrings
        self.fp.seek(cast(int, charstring_pos))
        self.charstring = self.INDEX(self.fp)
        self.nglyphs = len(self.charstring)
        # Encodings
        self.code2gid: Dict[int, int] = {}
        self.gid2code: Dict[int, int] = {}
        self.fp.seek(cast(int, encoding_pos))
        fmt = self.fp.read(1)
        if fmt == b"\x00":
            # Format 0
            (n,) = struct.unpack("B", self.fp.read(1))
            for code, gid in enumerate(struct.unpack("B" * n, self.fp.read(n))):
                self.code2gid[code] = gid
                self.gid2code[gid] = code
        elif fmt == b"\x01":
            # Format 1
            (n,) = struct.unpack("B", self.fp.read(1))
            code = 0
            for _ in range(n):
                (first, nleft) = struct.unpack("BB", self.fp.read(2))
                for gid in range(first, first + nleft + 1):
                    self.code2gid[code] = gid
                    self.gid2code[gid] = code
                    code += 1
        else:
            raise PDFValueError("unsupported encoding format: %r" % fmt)
        # Charsets
        self.name2gid: Dict[Union[str, bytes], int] = {}
        self.gid2name: Dict[int, Union[str, bytes]] = {}
        self.fp.seek(cast(int, charset_pos))
        fmt = self.fp.read(1)
        if fmt == b"\x00":
            # Format 0: one SID per glyph, starting at glyph 1.
            n = self.nglyphs - 1
            sids = cast(
                Tuple[int, ...],
                struct.unpack(">" + "H" * n, self.fp.read(2 * n)),
            )
            for gid, sid in enumerate(sids, start=1):
                sidname = self.getstr(sid)
                self.name2gid[sidname] = gid
                self.gid2name[gid] = sidname
        elif fmt == b"\x01":
            # Format 1
            (n,) = struct.unpack("B", self.fp.read(1))
            sid = 0
            for _ in range(n):
                (first, nleft) = struct.unpack("BB", self.fp.read(2))
                for gid in range(first, first + nleft + 1):
                    sidname = self.getstr(sid)
                    self.name2gid[sidname] = gid
                    self.gid2name[gid] = sidname
                    sid += 1
        elif fmt == b"\x02":
            # Format 2 is not handled. Raise explicitly: a bare ``assert``
            # would be stripped under ``python -O`` and silently fall through.
            raise AssertionError(str(("Unhandled", fmt)))
        else:
            raise PDFValueError("unsupported charset format: %r" % fmt)

    def getstr(self, sid: int) -> Union[str, bytes]:
        """Return the string for ``sid``.

        This returns ``str`` for one of the STANDARD_STRINGS but ``bytes``
        (straight from the String INDEX) otherwise, which is a source of type
        complexity for callers.
        """
        nstandard = len(self.STANDARD_STRINGS)
        if sid < nstandard:
            return self.STANDARD_STRINGS[sid]
        return self.string_index[sid - nstandard]
class TrueTypeFont:
    """Reader for the table directory and ``cmap`` table of a TrueType font."""

    class CMapNotFound(PDFException):
        """Raised when no usable character map can be built from the font."""

    def __init__(self, name: str, fp: BinaryIO) -> None:
        """Read the table directory from ``fp``.

        :param name: font name, stored as ``self.name``.
        :param fp: seekable binary stream positioned at the start of the font.
        """
        self.name = name
        self.fp = fp
        # Table tag -> (offset, length) as listed in the directory.
        self.tables: Dict[bytes, Tuple[int, int]] = {}
        self.fonttype = fp.read(4)
        try:
            (ntables, _1, _2, _3) = cast(
                Tuple[int, int, int, int],
                struct.unpack(">HHHH", fp.read(8)),
            )
            for _ in range(ntables):
                (name_bytes, _tsum, offset, length) = cast(
                    Tuple[bytes, int, int, int],
                    struct.unpack(">4sLLL", fp.read(16)),
                )
                self.tables[name_bytes] = (offset, length)
        except struct.error:
            # Do not fail if there are not enough bytes to read. Even for
            # corrupted PDFs we would like to get as much information as
            # possible, so continue.
            pass

    def create_unicode_map(self) -> FileUnicodeMap:
        """Build a glyph-id -> unicode map from the font's ``cmap`` table.

        Only subtables whose platform/encoding ids pass the Unicode filter
        below are read, and only subtable formats 0, 2 and 4 are decoded.

        :returns: a ``FileUnicodeMap`` with one entry per decoded character.
        :raises TrueTypeFont.CMapNotFound: if there is no ``cmap`` table or no
            character could be decoded from it.
        :raises AssertionError: if a selected subtable uses any other format.
        """
        if b"cmap" not in self.tables:
            raise TrueTypeFont.CMapNotFound
        (base_offset, _length) = self.tables[b"cmap"]
        fp = self.fp
        fp.seek(base_offset)
        (_version, nsubtables) = cast(
            Tuple[int, int],
            struct.unpack(">HH", fp.read(4)),
        )
        subtables: List[Tuple[int, int, int]] = [
            cast(Tuple[int, int, int], struct.unpack(">HHL", fp.read(8)))
            for _ in range(nsubtables)
        ]
        char2gid: Dict[int, int] = {}
        # Only supports subtable type 0, 2 and 4.
        for platform_id, encoding_id, st_offset in subtables:
            # Skip non-Unicode cmaps.
            # https://docs.microsoft.com/en-us/typography/opentype/spec/cmap
            if not (platform_id == 0 or (platform_id == 3 and encoding_id in [1, 10])):
                continue
            fp.seek(base_offset + st_offset)
            (fmttype, _fmtlen, _fmtlang) = cast(
                Tuple[int, int, int],
                struct.unpack(">HHH", fp.read(6)),
            )
            if fmttype == 0:
                # 256 single-byte glyph ids, indexed by character code.
                char2gid.update(
                    enumerate(
                        cast(Tuple[int, ...], struct.unpack(">256B", fp.read(256))),
                    ),
                )
            elif fmttype == 2:
                subheaderkeys = cast(
                    Tuple[int, ...],
                    struct.unpack(">256H", fp.read(512)),
                )
                # Keys are stored as (subheader index * 8); remember which
                # first byte selects each subheader.
                firstbytes = [0] * 8192
                for i, k in enumerate(subheaderkeys):
                    firstbytes[k // 8] = i
                nhdrs = max(subheaderkeys) // 8 + 1
                hdrs: List[Tuple[int, int, int, int, int]] = []
                for i in range(nhdrs):
                    (firstcode, entcount, delta, offset) = cast(
                        Tuple[int, int, int, int],
                        struct.unpack(">HHhH", fp.read(8)),
                    )
                    # ``offset`` is relative to its own field, which is the
                    # last two bytes just read.
                    hdrs.append((i, firstcode, entcount, delta, fp.tell() - 2 + offset))
                for i, firstcode, entcount, delta, pos in hdrs:
                    if not entcount:
                        continue
                    first = firstcode + (firstbytes[i] << 8)
                    fp.seek(pos)
                    for c in range(entcount):
                        gid = cast(Tuple[int], struct.unpack(">H", fp.read(2)))[0]
                        if gid:
                            gid += delta
                        char2gid[first + c] = gid
            elif fmttype == 4:
                (segcount, _1, _2, _3) = cast(
                    Tuple[int, int, int, int],
                    struct.unpack(">HHHH", fp.read(8)),
                )
                # The stored value is twice the segment count.
                segcount //= 2
                ecs = cast(
                    Tuple[int, ...],
                    struct.unpack(">%dH" % segcount, fp.read(2 * segcount)),
                )
                # Two bytes between the end-code and start-code arrays are
                # skipped.
                fp.read(2)
                scs = cast(
                    Tuple[int, ...],
                    struct.unpack(">%dH" % segcount, fp.read(2 * segcount)),
                )
                idds = cast(
                    Tuple[int, ...],
                    struct.unpack(">%dh" % segcount, fp.read(2 * segcount)),
                )
                pos = fp.tell()
                idrs = cast(
                    Tuple[int, ...],
                    struct.unpack(">%dH" % segcount, fp.read(2 * segcount)),
                )
                for ec, sc, idd, idr in zip(ecs, scs, idds, idrs):
                    if idr:
                        # Glyph ids are read from the stream at pos + idr.
                        fp.seek(pos + idr)
                        for c in range(sc, ec + 1):
                            b = cast(Tuple[int], struct.unpack(">H", fp.read(2)))[0]
                            char2gid[c] = (b + idd) & 0xFFFF
                    else:
                        # Glyph id is the character code shifted by the delta.
                        for c in range(sc, ec + 1):
                            char2gid[c] = (c + idd) & 0xFFFF
            else:
                # Raise explicitly: a bare ``assert`` would be stripped under
                # ``python -O`` and the subtable would be silently ignored.
                raise AssertionError(str(("Unhandled", fmttype)))
        if not char2gid:
            raise TrueTypeFont.CMapNotFound
        # create unicode map
        unicode_map = FileUnicodeMap()
        for char, gid in char2gid.items():
            unicode_map.add_cid2unichr(gid, char)
        return unicode_map
class PDFFontError(PDFException):
    """Base class for the font-related errors raised in this module."""
class PDFUnicodeNotDefined(PDFFontError):
    """Raised by ``to_unichr`` when a character id has no unicode mapping."""
# PostScript name literals built with LIT. LITERAL_STANDARD_ENCODING is the
# fallback used by PDFSimpleFont when a font spec has no "Encoding" entry.
LITERAL_STANDARD_ENCODING = LIT("StandardEncoding")
LITERAL_TYPE1C = LIT("Type1C")
# Font widths are maintained in a dict type that maps from *either* unicode
# chars or integer character IDs.
FontWidthDict = Union[Dict[int, float], Dict[str, float]]
class PDFFont:
    """Base class holding the metrics shared by every font type.

    Subclasses must implement ``to_unichr``; the other methods provide
    single-byte, horizontal-writing defaults.
    """

    def __init__(
        self,
        descriptor: Mapping[str, Any],
        widths: FontWidthDict,
        default_width: Optional[float] = None,
    ) -> None:
        """Initialise metrics from a font descriptor.

        :param descriptor: font descriptor mapping; missing entries fall back
            to the defaults given below.
        :param widths: per-character widths, keyed by character id or by
            unicode string.
        :param default_width: width for characters absent from ``widths``;
            when ``None`` the descriptor's ``MissingWidth`` (default 0) is
            used.
        """
        self.descriptor = descriptor
        self.widths: FontWidthDict = resolve_all(widths)
        fontname = resolve1(descriptor.get("FontName", "unknown"))
        if isinstance(fontname, PSLiteral):
            fontname = literal_name(fontname)
        self.fontname = fontname
        self.flags = int_value(descriptor.get("Flags", 0))
        self.ascent = num_value(descriptor.get("Ascent", 0))
        self.descent = num_value(descriptor.get("Descent", 0))
        self.italic_angle = num_value(descriptor.get("ItalicAngle", 0))
        if default_width is None:
            default_width = num_value(descriptor.get("MissingWidth", 0))
        self.default_width = resolve1(default_width)
        self.leading = num_value(descriptor.get("Leading", 0))
        raw_bbox = descriptor.get("FontBBox", (0, 0, 0, 0))
        self.bbox = cast(Rect, list_value(resolve_all(raw_bbox)))
        # Scale factors applied to the stored metrics by the getters below.
        self.hscale = 0.001
        self.vscale = 0.001
        # PDF RM 9.8.1 specifies /Descent should always be a negative number.
        # PScript5.dll seems to produce Descent with a positive number, but
        # text analysis will be wrong if this is taken as correct. So force
        # descent to negative.
        if self.descent > 0:
            self.descent = -self.descent

    def __repr__(self) -> str:
        return "<PDFFont>"

    def is_vertical(self) -> bool:
        """Base fonts use horizontal writing."""
        return False

    def is_multibyte(self) -> bool:
        """Base fonts use one byte per character."""
        return False

    def decode(self, bytes: bytes) -> Iterable[int]:
        """Split a byte string into character ids, one per byte."""
        return bytearray(bytes)  # map(ord, bytes)

    def get_ascent(self) -> float:
        """Ascent above the baseline, in text space units"""
        return self.ascent * self.vscale

    def get_descent(self) -> float:
        """Descent below the baseline, in text space units; always negative"""
        return self.descent * self.vscale

    def get_width(self) -> float:
        """Scaled bounding-box width; a zero-width box yields the negated
        default width instead."""
        width = self.bbox[2] - self.bbox[0]
        if width == 0:
            width = -self.default_width
        return width * self.hscale

    def get_height(self) -> float:
        """Scaled bounding-box height; a zero-height box yields
        ``ascent - descent`` instead."""
        height = self.bbox[3] - self.bbox[1]
        if height == 0:
            height = self.ascent - self.descent
        return height * self.vscale

    def char_width(self, cid: int) -> float:
        """Return the scaled width of character ``cid``."""
        # Because character widths may be mapping either IDs or strings,
        # we try to lookup the character ID first, then its str equivalent.
        try:
            width = cast(Dict[int, float], self.widths)[cid]
        except KeyError:
            by_unichr = cast(Dict[str, float], self.widths)
            try:
                width = by_unichr[self.to_unichr(cid)]
            except (KeyError, PDFUnicodeNotDefined):
                width = self.default_width
        return width * self.hscale

    def char_disp(self, cid: int) -> Union[float, Tuple[Optional[float], float]]:
        """Returns an integer for horizontal fonts, a tuple for vertical fonts."""
        return 0

    def string_width(self, s: bytes) -> float:
        """Sum of ``char_width`` over every character decoded from ``s``."""
        return sum(map(self.char_width, self.decode(s)))

    def to_unichr(self, cid: int) -> str:
        """Map a character id to unicode text; provided by subclasses."""
        raise NotImplementedError
class PDFSimpleFont(PDFFont):
    """Single-byte font whose characters map to unicode through an encoding
    table and, when the spec provides one, a ``ToUnicode`` map."""

    def __init__(
        self,
        descriptor: Mapping[str, Any],
        widths: FontWidthDict,
        spec: Mapping[str, Any],
    ) -> None:
        """Set up the encoding tables, then the base font metrics.

        :param descriptor: font descriptor passed through to ``PDFFont``.
        :param widths: character widths passed through to ``PDFFont``.
        :param spec: font dictionary; ``Encoding`` and ``ToUnicode`` are read.
        """
        # Font encoding is specified either by a name of
        # built-in encoding or a dictionary that describes
        # the differences.
        encoding = (
            resolve1(spec["Encoding"])
            if "Encoding" in spec
            else LITERAL_STANDARD_ENCODING
        )
        if isinstance(encoding, dict):
            base_name = literal_name(
                encoding.get("BaseEncoding", LITERAL_STANDARD_ENCODING),
            )
            differences = list_value(encoding.get("Differences", []))
            self.cid2unicode = EncodingDB.get_encoding(base_name, differences)
        else:
            self.cid2unicode = EncodingDB.get_encoding(literal_name(encoding))
        self.unicode_map: Optional[UnicodeMap] = None
        if "ToUnicode" in spec:
            tounicode = stream_value(spec["ToUnicode"])
            self.unicode_map = FileUnicodeMap()
            CMapParser(self.unicode_map, BytesIO(tounicode.get_data())).run()
        PDFFont.__init__(self, descriptor, widths)

    def to_unichr(self, cid: int) -> str:
        """Return the unicode text for ``cid``.

        The ``ToUnicode`` map, if any, is consulted first; the encoding table
        is the fallback.

        :raises PDFUnicodeNotDefined: if neither source knows ``cid``.
        """
        umap = self.unicode_map
        if umap:
            try:
                return umap.get_unichr(cid)
            except KeyError:
                # Not in the ToUnicode map: fall back to the encoding table.
                pass
        try:
            return self.cid2unicode[cid]
        except KeyError:
            raise PDFUnicodeNotDefined(None, cid)
class PDFType1Font(PDFSimpleFont):
    """Type 1 font; metrics come from the built-in table when the base font
    is known there, otherwise from the font dictionary."""

    def __init__(self, rsrcmgr: "PDFResourceManager", spec: Mapping[str, Any]) -> None:
        """Build the font from its dictionary.

        :param rsrcmgr: resource manager (not used by this constructor).
        :param spec: font dictionary.
        :raises PDFFontError: if ``BaseFont`` is missing and
            ``settings.STRICT`` is set.
        """
        try:
            self.basefont = literal_name(spec["BaseFont"])
        except KeyError:
            if settings.STRICT:
                raise PDFFontError("BaseFont is missing")
            self.basefont = "unknown"

        widths: FontWidthDict
        try:
            (descriptor, int_widths) = FontMetricsDB.get_metrics(self.basefont)
            widths = cast(Dict[str, float], int_widths)  # implicit int->float
        except KeyError:
            # No built-in metrics for this font: read them from the spec.
            descriptor = dict_value(spec.get("FontDescriptor", {}))
            firstchar = int_value(spec.get("FirstChar", 0))
            # lastchar = int_value(spec.get('LastChar', 255))
            width_list = list_value(spec.get("Widths", [0] * 256))
            widths = {
                code: resolve1(width)
                for code, width in enumerate(width_list, firstchar)
            }
        PDFSimpleFont.__init__(self, descriptor, widths, spec)

        if "Encoding" in spec or "FontFile" not in descriptor:
            return
        # try to recover the missing encoding info from the font file.
        self.fontfile = stream_value(descriptor.get("FontFile"))
        length1 = int_value(self.fontfile["Length1"])
        header = self.fontfile.get_data()[:length1]
        self.cid2unicode = Type1FontHeaderParser(BytesIO(header)).get_encoding()

    def __repr__(self) -> str:
        return "<PDFType1Font: basefont=%r>" % (self.basefont,)
class PDFTrueTypeFont(PDFType1Font):
    """TrueType simple font; constructed exactly like ``PDFType1Font`` and
    differing only in its ``repr``."""

    def __repr__(self) -> str:
        return "<PDFTrueTypeFont: basefont=%r>" % (self.basefont,)
class PDFType3Font(PDFSimpleFont):
    """Type 3 font; scale factors are derived from the font matrix and the
    ascent/descent from the bounding box."""

    def __init__(self, rsrcmgr: "PDFResourceManager", spec: Mapping[str, Any]) -> None:
        """Build the font from its dictionary.

        :param rsrcmgr: resource manager (not used by this constructor).
        :param spec: font dictionary; ``FontBBox`` is required when there is
            no ``FontDescriptor``.
        """
        firstchar = int_value(spec.get("FirstChar", 0))
        # lastchar = int_value(spec.get('LastChar', 0))
        width_list = list_value(spec.get("Widths", [0] * 256))
        widths = dict(enumerate(width_list, firstchar))
        descriptor = (
            dict_value(spec["FontDescriptor"])
            if "FontDescriptor" in spec
            else {"Ascent": 0, "Descent": 0, "FontBBox": spec["FontBBox"]}
        )
        PDFSimpleFont.__init__(self, descriptor, widths, spec)
        self.matrix = cast(Matrix, tuple(list_value(spec.get("FontMatrix"))))
        # Override the descriptor-based values set by the base constructor.
        (_, self.descent, _, self.ascent) = self.bbox
        (self.hscale, self.vscale) = apply_matrix_norm(self.matrix, (1, 1))

    def __repr__(self) -> str:
        return "<PDFType3Font>"
class PDFCIDFont(PDFFont):
    """Multi-byte font whose byte strings are decoded through a CMap."""

    # Displacement returned by ``char_disp`` for ids absent from ``disps``:
    # 0 for horizontal writing, ``(None, vy)`` for vertical writing.
    default_disp: Union[float, Tuple[Optional[float], float]]

    def __init__(
        self,
        rsrcmgr: "PDFResourceManager",
        spec: Mapping[str, Any],
        strict: bool = settings.STRICT,
    ) -> None:
        """Build the font from its dictionary.

        :param rsrcmgr: resource manager (not used by this constructor).
        :param spec: font dictionary.
        :param strict: when true, a missing ``BaseFont``, ``FontDescriptor``
            or encoding, or an unknown CMap, raises ``PDFFontError`` instead
            of falling back to a default.
        :raises PDFFontError: see ``strict``.
        """
        try:
            self.basefont = literal_name(spec["BaseFont"])
        except KeyError:
            if strict:
                raise PDFFontError("BaseFont is missing")
            self.basefont = "unknown"
        self.cidsysteminfo = dict_value(spec.get("CIDSystemInfo", {}))
        cid_registry = resolve1(self.cidsysteminfo.get("Registry", b"unknown")).decode(
            "latin1",
        )
        cid_ordering = resolve1(self.cidsysteminfo.get("Ordering", b"unknown")).decode(
            "latin1",
        )
        self.cidcoding = f"{cid_registry.strip()}-{cid_ordering.strip()}"
        self.cmap: CMapBase = self.get_cmap_from_spec(spec, strict)

        try:
            descriptor = dict_value(spec["FontDescriptor"])
        except KeyError:
            if strict:
                raise PDFFontError("FontDescriptor is missing")
            descriptor = {}
        ttf = None
        if "FontFile2" in descriptor:
            self.fontfile = stream_value(descriptor.get("FontFile2"))
            ttf = TrueTypeFont(self.basefont, BytesIO(self.fontfile.get_data()))
        # Pick the unicode map: an explicit ToUnicode entry wins, then the
        # embedded TrueType cmap for identity codings, then the CMap database.
        self.unicode_map: Optional[UnicodeMap] = None
        if "ToUnicode" in spec:
            if isinstance(spec["ToUnicode"], PDFStream):
                strm = stream_value(spec["ToUnicode"])
                self.unicode_map = FileUnicodeMap()
                CMapParser(self.unicode_map, BytesIO(strm.get_data())).run()
            else:
                cmap_name = literal_name(spec["ToUnicode"])
                encoding = literal_name(spec["Encoding"])
                if (
                    "Identity" in cid_ordering
                    or "Identity" in cmap_name
                    or "Identity" in encoding
                ):
                    self.unicode_map = IdentityUnicodeMap()
        elif self.cidcoding in ("Adobe-Identity", "Adobe-UCS"):
            if ttf:
                try:
                    self.unicode_map = ttf.create_unicode_map()
                except TrueTypeFont.CMapNotFound:
                    # Leave unicode_map as None; to_unichr will then raise
                    # PDFUnicodeNotDefined for every id.
                    pass
        else:
            try:
                self.unicode_map = CMapDB.get_unicode_map(
                    self.cidcoding,
                    self.cmap.is_vertical(),
                )
            except CMapDB.CMapNotFound:
                pass

        self.vertical = self.cmap.is_vertical()
        if self.vertical:
            # writing mode: vertical
            widths2 = get_widths2(list_value(spec.get("W2", [])))
            self.disps = {cid: (vx, vy) for (cid, (_, (vx, vy))) in widths2.items()}
            (vy, w) = resolve1(spec.get("DW2", [880, -1000]))
            self.default_disp = (None, vy)
            widths = {cid: w for (cid, (w, _)) in widths2.items()}
            # ``w`` here is still the DW2 value: the comprehension above has
            # its own scope.
            default_width = w
        else:
            # writing mode: horizontal
            self.disps = {}
            self.default_disp = 0
            widths = get_widths(list_value(spec.get("W", [])))
            default_width = spec.get("DW", 1000)
        PDFFont.__init__(self, descriptor, widths, default_width=default_width)

    def get_cmap_from_spec(self, spec: Mapping[str, Any], strict: bool) -> CMapBase:
        """Get cmap from font specification

        For certain PDFs, Encoding Type isn't mentioned as an attribute of
        Encoding but as an attribute of CMapName, where CMapName is an
        attribute of spec['Encoding'].
        The horizontal/vertical modes are mentioned with different name
        such as 'DLIdent-H/V','OneByteIdentityH/V','Identity-H/V'.

        :returns: the named CMap, or an empty ``CMap`` when it is unknown and
            ``strict`` is false.
        :raises PDFFontError: if the CMap is unknown and ``strict`` is true.
        """
        cmap_name = self._get_cmap_name(spec, strict)
        try:
            return CMapDB.get_cmap(cmap_name)
        except CMapDB.CMapNotFound as e:
            if strict:
                raise PDFFontError(e) from e
            return CMap()

    @staticmethod
    def _get_cmap_name(spec: Mapping[str, Any], strict: bool) -> str:
        """Get cmap name from font specification

        Declared as a staticmethod because it takes no ``self`` yet is called
        as ``self._get_cmap_name(spec, strict)``.

        :returns: the cmap name, translated through ``IDENTITY_ENCODER``;
            ``"unknown"`` when no encoding is given and ``strict`` is false.
        :raises PDFFontError: if ``strict`` is true and the encoding or its
            ``CMapName`` is missing.
        """
        cmap_name = "unknown"  # default value

        try:
            spec_encoding = spec["Encoding"]
            if hasattr(spec_encoding, "name"):
                cmap_name = literal_name(spec["Encoding"])
            else:
                cmap_name = literal_name(spec_encoding["CMapName"])
        except KeyError:
            if strict:
                raise PDFFontError("Encoding is unspecified")

        if type(cmap_name) is PDFStream:  # type: ignore[comparison-overlap]
            cmap_name_stream: PDFStream = cast(PDFStream, cmap_name)
            if "CMapName" in cmap_name_stream:
                cmap_name = cmap_name_stream.get("CMapName").name
            elif strict:
                raise PDFFontError("CMapName unspecified for encoding")

        return IDENTITY_ENCODER.get(cmap_name, cmap_name)

    def __repr__(self) -> str:
        return f"<PDFCIDFont: basefont={self.basefont!r}, cidcoding={self.cidcoding!r}>"

    def is_vertical(self) -> bool:
        """Whether the font's CMap reported vertical writing mode."""
        return self.vertical

    def is_multibyte(self) -> bool:
        """CID fonts are always treated as multi-byte."""
        return True

    def decode(self, bytes: bytes) -> Iterable[int]:
        """Split a byte string into character ids using the font's CMap."""
        return self.cmap.decode(bytes)

    def char_disp(self, cid: int) -> Union[float, Tuple[Optional[float], float]]:
        """Returns an integer for horizontal fonts, a tuple for vertical fonts."""
        return self.disps.get(cid, self.default_disp)

    def to_unichr(self, cid: int) -> str:
        """Return the unicode text for ``cid``.

        :raises PDFUnicodeNotDefined: if the font has no unicode map or the
            map has no entry for ``cid``.
        """
        try:
            if not self.unicode_map:
                raise PDFKeyError(cid)
            return self.unicode_map.get_unichr(cid)
        except KeyError:
            raise PDFUnicodeNotDefined(self.cidcoding, cid)