ProyectoServicioArmamento/venv/Lib/site-packages/pyhanko/pdf_utils/rw_common.py

"""Utilities common to reading and writing PDF files."""
from typing import Tuple

from . import generic, misc
from .metadata.model import DocumentMetadata
from .misc import PdfError

# Public API of this module: the only name exported by a star-import.
__all__ = ['PdfHandler']


class PdfHandler:
    """Abstract class providing a general interface for querying objects
    in PDF readers and writers alike.

    :meth:`get_object` and the :attr:`trailer_view`,
    :attr:`document_meta_view`, :attr:`root_ref` and :attr:`document_id`
    properties raise :class:`NotImplementedError` in this base class and
    have to be supplied by subclasses. The page tree helpers are implemented
    here in terms of :attr:`root`.
    """

    def get_object(
        self, ref: generic.Reference, as_metadata_stream: bool = False
    ):
        """
        Retrieve the object associated with the provided reference from
        this PDF handler.

        :param ref:
            An instance of :class:`.generic.Reference`.
        :param as_metadata_stream:
            Whether to dereference the object as an XMP metadata stream.
        :return:
            A PDF object.
        """
        raise NotImplementedError

    @property
    def trailer_view(self) -> generic.DictionaryObject:
        """
        Returns a view of the document trailer of the document represented
        by this :class:`.PdfHandler` instance.

        The view is effectively read-only, in the sense that any writes
        will not be reflected in the actual trailer (if the handler supports
        writing, that is).

        :return:
            A :class:`.generic.DictionaryObject` representing the current state
            of the document trailer.
        """
        raise NotImplementedError

    @property
    def document_meta_view(self) -> DocumentMetadata:
        """
        :return:
            The metadata of the document represented by this PDF handler,
            as a :class:`.DocumentMetadata` instance.
        """
        raise NotImplementedError

    @property
    def root_ref(self) -> generic.Reference:
        """
        :return: A reference to the document catalog of this PDF handler.
        """
        raise NotImplementedError

    @property
    def root(self) -> generic.DictionaryObject:
        """
        :return: The document catalog of this PDF handler.
        """
        root = self.root_ref.get_object()
        assert isinstance(root, generic.DictionaryObject)
        return root

    @property
    def document_id(self) -> Tuple[bytes, bytes]:
        """
        :return:
            The document identifier of this PDF handler, as a pair of
            byte strings.
        """
        raise NotImplementedError

    # TODO write tests specifically for this helper function
    def _walk_page_tree(self, page_ix, retrieve_parent):
        """
        Locate a page by descending the page tree from the ``/Pages`` entry
        of the document catalog.

        :param page_ix:
            The (zero-indexed) number of the page to look up.
            A negative number counts pages from the back of the document,
            with index ``-1`` referring to the last page.
        :param retrieve_parent:
            If ``True``, return a triple consisting of the reference to the
            ``/Pages`` node that holds the page, the index of the page in
            that node's ``/Kids`` array, and the resource dictionary in
            effect at that node.
            If ``False``, return a pair consisting of the reference to the
            page itself and the resource dictionary in effect for the page
            (the page's own ``/Resources`` entry if it has one).
        :raises PdfError:
            If ``page_ix`` is out of range, or if the page tree does not
            contain the requested page.
        :raises misc.PdfReadError:
            If the page tree root or one of the visited children is not an
            indirect object, if the page tree root is not a dictionary, or
            if the page tree contains a circular reference.
        """
        # the spec says that this will always be an indirect reference
        page_tree_root_ref = self.root.raw_get('/Pages')
        # Explicit checks rather than asserts: this validates data read from
        # the file and must not be stripped when running with -O.
        if not isinstance(page_tree_root_ref, generic.IndirectObject):
            raise misc.PdfReadError(
                "Page tree root must be an indirect object"
            )
        page_tree_root = page_tree_root_ref.get_object()
        if not isinstance(page_tree_root, generic.DictionaryObject):
            raise misc.PdfReadError("Page tree root must be a dictionary")
        try:
            root_resources = page_tree_root['/Resources']
        except KeyError:
            root_resources = generic.DictionaryObject()

        page_count = page_tree_root['/Count']
        if page_ix < 0:
            page_ix = page_count + page_ix
        if not (0 <= page_ix < page_count):
            raise PdfError('Page index out of range')

        def _recurse(first_page_ix, pages_obj_ref, last_rsrc_dict, refs_seen):
            # first_page_ix: index of the first page below this node
            # last_rsrc_dict: closest /Resources entry seen on the way down
            # refs_seen: references of this node and all of its ancestors
            pages_obj = pages_obj_ref.get_object()
            kids = pages_obj['/Kids']
            try:
                last_rsrc_dict = pages_obj.raw_get('/Resources')
            except KeyError:
                pass

            cur_page_ix = first_page_ix
            for kid_index, kid_ref in enumerate(kids):
                if not isinstance(kid_ref, generic.IndirectObject):
                    raise misc.PdfReadError(
                        "Page tree node children must be indirect objects"
                    )
                if kid_ref.reference in refs_seen:
                    raise misc.PdfReadError("Circular reference in page tree")

                kid = kid_ref.get_object()

                node_type = kid['/Type']
                if node_type == '/Pages':
                    # recurse into this branch if the page we need
                    # is part of it
                    desc_count = kid['/Count']
                    if cur_page_ix <= page_ix < cur_page_ix + desc_count:
                        return _recurse(
                            cur_page_ix,
                            kid_ref,
                            last_rsrc_dict,
                            refs_seen | {kid_ref.reference},
                        )
                    cur_page_ix += desc_count
                elif node_type == '/Page':
                    if cur_page_ix == page_ix:
                        if retrieve_parent:
                            return (pages_obj_ref, kid_index, last_rsrc_dict)
                        else:
                            try:
                                last_rsrc_dict = kid.raw_get('/Resources')
                            except KeyError:
                                pass
                            return kid_ref, last_rsrc_dict
                    else:
                        cur_page_ix += 1
            # This means the PDF is not standards-compliant
            raise PdfError('Page not found')

        # The root itself counts as visited, so that a child pointing back
        # at the root is reported as a cycle instead of being walked again.
        return _recurse(
            0,
            page_tree_root_ref,
            root_resources,
            {page_tree_root_ref.reference},
        )

    def find_page_container(self, page_ix):
        """
        Retrieve the node in the page tree containing the
        page with index ``page_ix``, along with the necessary objects
        to modify it in an incremental update scenario.

        :param page_ix:
            The (zero-indexed) number of the page for which we want to
            retrieve the parent.
            A negative number counts pages from the back of the document,
            with index ``-1`` referring to the last page.
        :return:
            A triple with the ``/Pages`` object (or a reference to it),
            the index of the target page in said ``/Pages`` object, and a
            (possibly inherited) resource dictionary.
        """
        return self._walk_page_tree(page_ix, retrieve_parent=True)

    def find_page_for_modification(self, page_ix):
        """
        Retrieve the page with index ``page_ix`` from the page tree, along with
        the necessary objects to modify it in an incremental update scenario.

        :param page_ix:
            The (zero-indexed) number of the page to retrieve.
            A negative number counts pages from the back of the document,
            with index ``-1`` referring to the last page.
        :return:
            A tuple with a reference to the page object and a
            (possibly inherited) resource dictionary.
        """
        return self._walk_page_tree(page_ix, retrieve_parent=False)