PDFTranslate / pdf2zh /pdfpage.py
sanbo
update sth. at 2024-11-26 16:15:47
9b0f4a0
import itertools
import logging
from typing import Any, BinaryIO, Container, Dict, Iterator, List, Optional, Set, Tuple
from pdf2zh import settings
from pdf2zh.pdfdocument import (
PDFDocument,
PDFNoPageLabels,
PDFTextExtractionNotAllowed,
)
from pdf2zh.pdfexceptions import PDFObjectNotFound, PDFValueError
from pdf2zh.pdfparser import PDFParser
from pdf2zh.pdftypes import dict_value, int_value, list_value, resolve1
from pdf2zh.psparser import LIT
from pdf2zh.utils import parse_rect
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
# Predefined PDF name literals. The page-tree traversal below compares a
# node's "Type" entry against these by identity to tell a single page
# (LITERAL_PAGE) apart from an intermediate page-tree node (LITERAL_PAGES).
LITERAL_PAGE = LIT("Page")
LITERAL_PAGES = LIT("Pages")
class PDFPage:
    """An object that holds the information about a page.

    A PDFPage object is merely a convenience class that has a set
    of keys and values, which describe the properties of a page
    and point to its contents.

    Attributes
    ----------
    doc: a PDFDocument object.
    pageid: any Python object that can uniquely identify the page.
    pageno: index of the page; 0 until get_pages() assigns the real index.
    attrs: a dictionary of page attributes.
    contents: a list of PDFStream objects that represents the page content.
    lastmod: the last modified time of the page.
    resources: a dictionary of resources used by the page.
    mediabox: the physical size of the page.
    cropbox: the crop rectangle of the page.
    rotate: the page rotation (in degree).
    annots: the page annotations.
    beads: a chain that represents natural reading order.
    label: the page's label (typically, the logical page number).
    """

    def __init__(
        self,
        doc: PDFDocument,
        pageid: object,
        attrs: object,
        label: Optional[str],
    ) -> None:
        """Initialize a page object.

        doc: a PDFDocument object.
        pageid: any Python object that can uniquely identify the page.
        attrs: a dictionary of page attributes.
        label: page label string.

        A missing "MediaBox" entry is not defaulted: the lookup on the
        attribute dictionary fails and the error propagates to the caller.
        """
        self.doc = doc
        self.pageid = pageid
        self.pageno = 0
        self.attrs = dict_value(attrs)
        self.label = label
        self.lastmod = resolve1(self.attrs.get("LastModified"))
        self.resources: Dict[object, object] = resolve1(
            self.attrs.get("Resources", dict()),
        )

        # Resolve the MediaBox entry itself before iterating over it, so an
        # array stored behind an indirect reference works just like an inline
        # array whose individual elements are references.
        mediabox_params: List[Any] = [
            resolve1(mediabox_param)
            for mediabox_param in resolve1(self.attrs["MediaBox"])
        ]
        self.mediabox = parse_rect(mediabox_params)

        # The crop box falls back to the media box when it is absent or
        # cannot be parsed as a rectangle.
        self.cropbox = self.mediabox
        if "CropBox" in self.attrs:
            try:
                self.cropbox = parse_rect(resolve1(self.attrs["CropBox"]))
            except PDFValueError:
                pass

        # Fold the rotation into the range [0, 360).
        self.rotate = (int_value(self.attrs.get("Rotate", 0)) + 360) % 360
        self.annots = self.attrs.get("Annots")
        self.beads = self.attrs.get("B")

        # "Contents" may be a single object or a list; always store a list.
        if "Contents" in self.attrs:
            contents = resolve1(self.attrs["Contents"])
        else:
            contents = []
        if not isinstance(contents, list):
            contents = [contents]
        self.contents: List[object] = contents

    def __repr__(self) -> str:
        return f"<PDFPage: Resources={self.resources!r}, MediaBox={self.mediabox!r}>"

    # Attributes that a node takes over from its parent in the page tree
    # when it does not define them itself.
    INHERITABLE_ATTRS = {"Resources", "MediaBox", "CropBox", "Rotate"}

    @classmethod
    def create_pages(cls, document: PDFDocument) -> Iterator["PDFPage"]:
        """Yield a PDFPage for every page found in *document*.

        The page tree rooted at the catalog's "Pages" entry is walked depth
        first. If that yields no page at all, every object listed in the
        document's xrefs is scanned for dictionaries whose "Type" is Page.

        Each page receives the next label from the document's page labels,
        or None when the document has no page labels or they run out.
        """

        def depth_first_search(
            obj: Any,
            parent: Dict[str, Any],
            visited: Optional[Set[Any]] = None,
        ) -> Iterator[Tuple[int, Dict[Any, Any]]]:
            # Yields (object id, attribute dict) for each page below obj.
            if isinstance(obj, int):
                object_id = obj
                object_properties = dict_value(document.getobj(object_id)).copy()
            else:
                # This looks broken. obj.objid means obj could be either
                # PDFObjRef or PDFStream, but neither is valid for dict_value.
                object_id = obj.objid  # type: ignore[attr-defined]
                object_properties = dict_value(obj).copy()

            # Avoid recursion errors by keeping track of visited nodes
            if visited is None:
                visited = set()
            if object_id in visited:
                return
            visited.add(object_id)

            # Copy inheritable attributes the node does not define itself.
            for k, v in parent.items():
                if k in cls.INHERITABLE_ATTRS and k not in object_properties:
                    object_properties[k] = v

            object_type = object_properties.get("Type")
            if object_type is None and not settings.STRICT:  # See #64
                object_type = object_properties.get("type")

            if object_type is LITERAL_PAGES and "Kids" in object_properties:
                for child in list_value(object_properties["Kids"]):
                    yield from depth_first_search(child, object_properties, visited)
            elif object_type is LITERAL_PAGE:
                yield (object_id, object_properties)

        try:
            page_labels: Iterator[Optional[str]] = document.get_page_labels()
        except PDFNoPageLabels:
            page_labels = itertools.repeat(None)

        found_pages = False
        if "Pages" in document.catalog:
            objects = depth_first_search(document.catalog["Pages"], document.catalog)
            for objid, tree in objects:
                # next() gets a default: a bare StopIteration raised inside
                # a generator body is turned into RuntimeError (PEP 479).
                yield cls(document, objid, tree, next(page_labels, None))
                found_pages = True

        if not found_pages:
            # fallback when /Pages is missing.
            for xref in document.xrefs:
                for objid in xref.get_objids():
                    try:
                        obj = document.getobj(objid)
                        if not (
                            isinstance(obj, dict) and obj.get("Type") is LITERAL_PAGE
                        ):
                            continue
                        page = cls(document, objid, obj, next(page_labels, None))
                    except PDFObjectNotFound:
                        # Skip objects that cannot be loaded.
                        continue
                    yield page

    @classmethod
    def get_pages(
        cls,
        fp: BinaryIO,
        pagenos: Optional[Container[int]] = None,
        maxpages: int = 0,
        password: str = "",
        caching: bool = True,
        check_extractable: bool = False,
    ) -> Iterator["PDFPage"]:
        """Parse the PDF in *fp* and yield its pages.

        fp: binary file object holding the PDF.
        pagenos: page indices to yield; a falsy value selects every page.
        maxpages: stop once a yielded page's index + 1 reaches this value;
            0 means no limit. The limit is only checked after a page has
            been yielded, not after a page skipped because of *pagenos*.
        password: password passed to PDFDocument.
        caching: caching flag passed to PDFDocument.
        check_extractable: when true, raise PDFTextExtractionNotAllowed if
            the document is not marked extractable; otherwise only log a
            warning and proceed.

        Each yielded page has its ``pageno`` set to its index in the
        sequence produced by create_pages().
        """
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        doc = PDFDocument(parser, password=password, caching=caching)
        # Check if the document allows text extraction.
        # If not, warn the user and proceed.
        if not doc.is_extractable:
            if check_extractable:
                error_msg = "Text extraction is not allowed: %r" % fp
                raise PDFTextExtractionNotAllowed(error_msg)
            log.warning(
                "The PDF %r contains a metadata field "
                "indicating that it should not allow "
                "text extraction. Ignoring this field "
                "and proceeding. Use the check_extractable "
                "if you want to raise an error in this case",
                fp,
            )
        # Process each page contained in the document.
        for pageno, page in enumerate(cls.create_pages(doc)):
            page.pageno = pageno
            if pagenos and (pageno not in pagenos):
                continue
            yield page
            if maxpages and maxpages <= pageno + 1:
                break