"""Utilities common to reading and writing PDF files."""
|
|
from typing import Tuple
|
|
|
|
from . import generic, misc
|
|
from .metadata.model import DocumentMetadata
|
|
from .misc import PdfError
|
|
|
|
# Names exported by a star-import of this module.
__all__ = ['PdfHandler']
|
|
|
|
|
|
class PdfHandler:
    """Abstract class providing a general interface for querying objects
    in PDF readers and writers alike."""

    def get_object(
        self, ref: generic.Reference, as_metadata_stream: bool = False
    ):
        """
        Retrieve the object associated with the provided reference from
        this PDF handler.

        :param ref:
            An instance of :class:`.generic.Reference`.
        :param as_metadata_stream:
            Whether to dereference the object as an XMP metadata stream.
        :return:
            A PDF object.
        """
        raise NotImplementedError

    @property
    def trailer_view(self) -> generic.DictionaryObject:
        """
        Returns a view of the document trailer of the document represented
        by this :class:`.PdfHandler` instance.

        The view is effectively read-only, in the sense that any writes
        will not be reflected in the actual trailer (if the handler supports
        writing, that is).

        :return:
            A :class:`.generic.DictionaryObject` representing the current state
            of the document trailer.
        """
        raise NotImplementedError

    @property
    def document_meta_view(self) -> DocumentMetadata:
        """
        :return:
            The metadata of the document represented by this handler, as a
            :class:`.DocumentMetadata` instance. Not implemented in this
            base class.
        """
        raise NotImplementedError

    @property
    def root_ref(self) -> generic.Reference:
        """
        :return: A reference to the document catalog of this PDF handler.
        """
        raise NotImplementedError

    @property
    def root(self) -> generic.DictionaryObject:
        """
        :return: The document catalog of this PDF handler.
        :raise misc.PdfReadError:
            If the object behind :attr:`root_ref` is not a dictionary.
        """
        root = self.root_ref.get_object()
        # Raise explicitly instead of asserting: asserts vanish under -O,
        # and the catalog comes from a file we do not control.
        if not isinstance(root, generic.DictionaryObject):
            raise misc.PdfReadError("Document catalog must be a dictionary")
        return root

    @property
    def document_id(self) -> Tuple[bytes, bytes]:
        """
        :return:
            A pair of byte strings identifying the document. Not implemented
            in this base class.

        .. note::
            Presumably these are the two parts of the trailer's ``/ID``
            entry; confirm against the concrete subclasses.
        """
        raise NotImplementedError

    # TODO write tests specifically for this helper function
    def _walk_page_tree(self, page_ix, retrieve_parent):
        """
        Locate a page in the page tree by index, walking down from the
        ``/Pages`` entry of the document catalog and using the ``/Count``
        of intermediate nodes to skip branches that cannot contain it.

        :param page_ix:
            The (zero-indexed) page number. Negative values are offset by
            the ``/Count`` of the page tree root.
        :param retrieve_parent:
            If ``True``, return information about the ``/Pages`` node
            holding the page instead of the page itself.
        :return:
            If ``retrieve_parent`` is ``True``, a triple consisting of the
            reference to the containing ``/Pages`` node, the index of the
            page in that node's ``/Kids`` and the resource dictionary
            inherited at that level.
            Otherwise, a pair consisting of the reference to the page and
            its resource entry (the page's own raw ``/Resources`` value if
            it has one, else the inherited one).
        :raise PdfError:
            If the index is out of range, or the page could not be found
            even though the index was in range.
        :raise misc.PdfReadError:
            If the page tree is malformed (wrong object types, or a
            circular reference).
        """
        # the spec says that this will always be an indirect reference
        page_tree_root_ref = self.root.raw_get('/Pages')
        # Explicit checks rather than asserts: the structure comes from
        # the PDF file itself and asserts are stripped under -O.
        if not isinstance(page_tree_root_ref, generic.IndirectObject):
            raise misc.PdfReadError(
                "Page tree root must be an indirect object"
            )
        page_tree_root = page_tree_root_ref.get_object()
        if not isinstance(page_tree_root, generic.DictionaryObject):
            raise misc.PdfReadError("Page tree root must be a dictionary")
        try:
            root_resources = page_tree_root['/Resources']
        except KeyError:
            root_resources = generic.DictionaryObject()

        page_count = page_tree_root['/Count']
        if page_ix < 0:
            # negative indices count from the back of the document
            page_ix = page_count + page_ix
        if not (0 <= page_ix < page_count):
            raise PdfError('Page index out of range')

        def _recurse(first_page_ix, pages_obj_ref, last_rsrc_dict, refs_seen):
            # first_page_ix: index of the first page below this node
            # last_rsrc_dict: closest /Resources entry seen on the way down
            # refs_seen: references of nodes already descended into
            pages_obj = pages_obj_ref.get_object()
            kids = pages_obj['/Kids']
            try:
                # a node's own /Resources takes precedence over inherited
                last_rsrc_dict = pages_obj.raw_get('/Resources')
            except KeyError:
                pass

            cur_page_ix = first_page_ix
            for kid_index, kid_ref in enumerate(kids):
                if not isinstance(kid_ref, generic.IndirectObject):
                    raise misc.PdfReadError(
                        "Page tree node children must be indirect objects"
                    )
                if kid_ref.reference in refs_seen:
                    raise misc.PdfReadError("Circular reference in page tree")

                kid = kid_ref.get_object()

                node_type = kid['/Type']
                if node_type == '/Pages':
                    # recurse into this branch if the page we need
                    # is part of it
                    desc_count = kid['/Count']
                    if cur_page_ix <= page_ix < cur_page_ix + desc_count:
                        return _recurse(
                            cur_page_ix,
                            kid_ref,
                            last_rsrc_dict,
                            refs_seen | {kid_ref.reference},
                        )
                    # otherwise skip the entire subtree
                    cur_page_ix += desc_count
                elif node_type == '/Page':
                    if cur_page_ix == page_ix:
                        if retrieve_parent:
                            return (pages_obj_ref, kid_index, last_rsrc_dict)
                        else:
                            try:
                                last_rsrc_dict = kid.raw_get('/Resources')
                            except KeyError:
                                pass
                            return kid_ref, last_rsrc_dict
                    else:
                        cur_page_ix += 1
            # This means the PDF is not standards-compliant
            raise PdfError('Page not found')

        return _recurse(0, page_tree_root_ref, root_resources, set())

    def find_page_container(self, page_ix):
        """
        Retrieve the node in the page tree containing the
        page with index ``page_ix``, along with the necessary objects
        to modify it in an incremental update scenario.

        :param page_ix:
            The (zero-indexed) number of the page for which we want to
            retrieve the parent.
            A negative number counts pages from the back of the document,
            with index ``-1`` referring to the last page.
        :return:
            A triple with the ``/Pages`` object (or a reference to it),
            the index of the target page in said ``/Pages`` object, and a
            (possibly inherited) resource dictionary.
        """
        return self._walk_page_tree(page_ix, retrieve_parent=True)

    def find_page_for_modification(self, page_ix):
        """
        Retrieve the page with index ``page_ix`` from the page tree, along with
        the necessary objects to modify it in an incremental update scenario.

        :param page_ix:
            The (zero-indexed) number of the page to retrieve.
            A negative number counts pages from the back of the document,
            with index ``-1`` referring to the last page.
        :return:
            A tuple with a reference to the page object and a
            (possibly inherited) resource dictionary.
        """
        return self._walk_page_tree(page_ix, retrieve_parent=False)
|