ProyectoServicioArmamento/venv/Lib/site-packages/pyhanko/pdf_utils/writer.py

"""
Utilities for writing PDF files.
Contains code from the PyPDF2 project; see :ref:`here <pypdf2-license>`
for the original license.
"""

import os
import typing
from typing import Dict, Iterable, List, Optional, Set, Tuple, Union, cast

from asn1crypto import x509

from pyhanko.pdf_utils import generic
from pyhanko.pdf_utils.crypt import (
    PubKeySecurityHandler,
    SecurityHandler,
    StandardSecurityHandler,
)
from pyhanko.pdf_utils.extensions import (
    DeveloperExtension,
    DevExtensionMultivalued,
)
from pyhanko.pdf_utils.generic import pdf_name
from pyhanko.pdf_utils.metadata.info import (
    update_info_dict,
    view_from_info_dict,
)
from pyhanko.pdf_utils.metadata.model import DocumentMetadata
from pyhanko.pdf_utils.misc import (
    IndirectObjectExpected,
    PdfError,
    PdfReadError,
    PdfWriteError,
    instance_test,
)
from pyhanko.pdf_utils.rw_common import PdfHandler
from pyhanko.pdf_utils.xref import (
    OBJSTREAM_FORBIDDEN,
    ObjectStream,
    PositionDict,
    XRefStream,
    write_xref_table,
)

if typing.TYPE_CHECKING:
    from .font.api import FontSubsetCollection

__all__ = [
    'BasePdfFileWriter',
    'PageObject',
    'PdfFileWriter',
    'init_xobject_dictionary',
    'copy_into_new_writer',
]


# TODO move this to content.py?
def init_xobject_dictionary(
    command_stream: bytes,
    box_width,
    box_height,
    resources: Optional[generic.DictionaryObject] = None,
) -> generic.StreamObject:
    """
    Helper function to initialise form XObject dictionaries.

    .. note::
        For utilities to handle image XObjects, see :mod:`.images`.

    :param command_stream:
        The XObject's raw appearance stream.
    :param box_width:
        The width of the XObject's bounding box.
    :param box_height:
        The height of the XObject's bounding box.
    :param resources:
        A resource dictionary to include with the form object.
    :return:
        A :class:`~.generic.StreamObject` representation of the form XObject.
    """
    resources = resources or generic.DictionaryObject()
    return generic.StreamObject(
        {
            pdf_name('/BBox'): generic.ArrayObject(
                list(
                    map(generic.FloatObject, (0.0, box_height, box_width, 0.0))
                )
            ),
            pdf_name('/Resources'): resources,
            pdf_name('/Type'): pdf_name('/XObject'),
            pdf_name('/Subtype'): pdf_name('/Form'),
        },
        stream_data=command_stream,
    )


class BasePdfFileWriter(PdfHandler):
    """Base class for PDF writers."""

    output_version = (1, 7)
    """Output version to be declared in the output file."""

    stream_xrefs: bool
    """
    Boolean controlling whether or not the output file will contain
    its cross-references in stream format, or as a classical XRef table.

    The default for new files is ``True``. For incremental updates,
    the writer adapts to the system used in the previous iteration of the
    document (as mandated by the standard).
    """

    def __init__(
        self,
        root: Union[generic.IndirectObject, generic.DictionaryObject],
        info: Union[generic.IndirectObject, generic.DictionaryObject, None],
        document_id: generic.ArrayObject,
        obj_id_start: int = 0,
        stream_xrefs: bool = True,
    ):
        self.objects: Dict[Tuple[int, int], generic.PdfObject] = {}
        self.object_streams: List[ObjectStream] = list()
        self.objs_in_streams: Dict[int, generic.PdfObject] = {}
        self._lastobj_id = obj_id_start
        self._resolves_objs_from: Iterable[PdfHandler] = (self,)
        self._allocated_placeholders: Set[int] = set()

        if isinstance(root, generic.IndirectObject):
            self._root = root
        else:
            self._root = self.add_object(root)

        self.security_handler: Optional[SecurityHandler] = None
        self._encrypt: Optional[generic.IndirectObject] = None
        self._encrypt_key: Optional[bytes] = None
        self._document_id = document_id
        self.stream_xrefs = stream_xrefs
        info_ref = None
        if info is not None:
            if not isinstance(info, generic.IndirectObject):
                info_ref = self.add_object(info)
            else:
                info_ref = generic.IndirectObject(
                    info.idnum, info.generation, self
                )
        self._info = info_ref
        self._meta: DocumentMetadata = DocumentMetadata()

        self._font_resources: Dict[str, 'FontSubsetCollection'] = {}

    def get_subset_collection(self, base_postscript_name: str):
        from .font.api import FontSubsetCollection

        try:
            fsc = self._font_resources[base_postscript_name]
        except KeyError:
            fsc = FontSubsetCollection(base_postscript_name)
            self._font_resources[base_postscript_name] = fsc

        return fsc

    @property
    def document_meta(self) -> DocumentMetadata:
        return self._meta

    @property
    def document_meta_view(self) -> DocumentMetadata:
        # we need the view_over (not just a copy) because
        # e.g. copy_into_new_writer will populate the info dict with
        # base values that can then be "cleanly" overridden in the
        # high-level API without destroying any unsupported entries
        # in the original info dict
        if self._info:
            base = view_from_info_dict(self._info.get_object())
        else:
            base = DocumentMetadata()
        return self._meta.view_over(base)

    def ensure_output_version(self, version):
        if self.output_version < version:
            self.output_version = version

    def set_info(
        self,
        info: Union[generic.IndirectObject, generic.DictionaryObject, None],
    ) -> Optional[generic.IndirectObject]:
        """
        Set the ``/Info`` entry of the document trailer.

        :param info:
            The new ``/Info`` dictionary, as an indirect reference.
        """
        new_info: Optional[generic.IndirectObject]
        if info is not None and not isinstance(info, generic.IndirectObject):
            self._info = new_info = self.add_object(info)
        else:
            self._info = new_info = info
        return new_info

    def set_custom_trailer_entry(
        self, key: generic.NameObject, value: generic.PdfObject
    ):
        """
        Set a custom, unmanaged entry in the document trailer or cross-reference
        stream dictionary.

        .. warning::
            Calling this method to set an entry that is managed by pyHanko
            internally (info dictionary, document catalog, etc.) has undefined
            results.

        :param key:
            Dictionary key to use in the trailer.
        :param value:
            Value to set
        """
        raise NotImplementedError

    @property
    def document_id(self) -> Tuple[bytes, bytes]:
        id_arr = self._document_id
        return id_arr[0].original_bytes, id_arr[1].original_bytes

    def mark_update(
        self, obj_ref: Union[generic.Reference, generic.IndirectObject]
    ):
        """
        Mark an object reference to be updated.
        This is only relevant for incremental updates, but is included
        as a no-op by default for interoperability reasons.

        :param obj_ref:
            An indirect object instance or a reference.
        """
        pass

    def update_container(self, obj: generic.PdfObject):
        """
        Mark the container of an object (as indicated by the
        :attr:`~.generic.PdfObject.container_ref` attribute on
        :class:`~.generic.PdfObject`) for an update.

        As with :meth:`mark_update`, this only applies to incremental updates,
        but defaults to a no-op.

        :param obj:
            The object whose top-level container needs to be rewritten.
        """
        pass

    @property
    def root_ref(self) -> generic.Reference:
        """
        :return:
            A reference to the document catalog.
        """
        return self._root.reference

    def update_root(self):
        """
        Signal that the document catalog should be written to the output.
        Equivalent to calling :meth:`mark_update` with :attr:`root_ref`.
        """
        pass

    def register_extension(self, ext: DeveloperExtension):
        try:
            extensions = self.root['/Extensions']
        except KeyError:
            self.root['/Extensions'] = extensions = generic.DictionaryObject()

        # check if the extension is already registered,
        try:
            cur_ext_value = extensions.raw_get(
                ext.prefix_name, decrypt=generic.EncryptedObjAccess.RAW
            )
        except KeyError:
            cur_ext_value = None
        extension_dicts: Iterable[generic.PdfObject] = ()
        old_ext_multivalued = False
        if isinstance(cur_ext_value, generic.DictionaryObject):
            extension_dicts = (cur_ext_value,)
        elif isinstance(cur_ext_value, generic.ArrayObject):
            extension_dicts = tuple(cur_ext_value)
            old_ext_multivalued = True
        elif cur_ext_value is not None:
            cls_name = type(cur_ext_value).__name__
            raise PdfReadError(
                f"PDF extension value is of type {cls_name}, "
                f"expected (direct) PDF dictionary or array."
            )

        # investigate the relationship between the new extension's extension
        # level and that of any existing extension dictionaries with the same
        # prefix
        for ix, ext_dict in enumerate(extension_dicts):
            if not isinstance(ext_dict, generic.DictionaryObject):
                cls_name = type(ext_dict).__name__
                raise PdfReadError(
                    f"PDF extension array entry is of type "
                    f"{cls_name}, expected (direct) PDF dictionary"
                )
            try:
                lvl = int(ext_dict.raw_get('/ExtensionLevel'))
            except (TypeError, ValueError, KeyError):
                raise PdfReadError("Could not read developer extension level")
            # TODO is this still appropriate with the new ExtensionRevision
            #  values?
            if lvl == ext.extension_level:
                return  # nothing to do
            elif ext.compare_by_level:
                old_ext_applies = lvl >= ext.extension_level
                replace_old = not old_ext_applies
            else:
                # this covers the case where there's no obvious comparison
                old_ext_applies = lvl in ext.subsumed_by
                replace_old = lvl in ext.subsumes
            if old_ext_applies:
                return  # nothing to do, old extension doesn't need replacing
            elif replace_old:
                # New extension takes priority,
                # replace the old extension
                if old_ext_multivalued:
                    # it was an array, so replace the value at the proper
                    # index
                    cur_ext_value[ix] = ext.as_pdf_object()
                elif ext.multivalued == DevExtensionMultivalued.ALWAYS:
                    # extension was previously not multivalued, but we now
                    # make it so
                    extensions[ext.prefix_name] = generic.ArrayObject(
                        [ext.as_pdf_object()]
                    )
                else:
                    extensions[ext.prefix_name] = ext.as_pdf_object()
                self.update_container(extensions)
                return  # we're done here
            elif ext.multivalued == DevExtensionMultivalued.NEVER:
                # there is a matching extension in the file, but we can't
                # replace it -> error
                raise PdfWriteError(
                    f"Could not register extension with prefix "
                    f"{ext.prefix_name} and level {ext.extension_level}; "
                    f"file contains extension with same prefix and "
                    f"extension level {lvl}. If this extension level is "
                    f"safe to override, mark it as subsumed."
                )
        # if we got here, we're going to add the new extension without
        # replacing any existing ones
        ext_dict = ext.as_pdf_object()
        if cur_ext_value is None:
            # nothing there yet, just write the extension
            if ext.multivalued == DevExtensionMultivalued.ALWAYS:
                extensions[ext.prefix_name] = generic.ArrayObject([ext_dict])
            else:
                extensions[ext.prefix_name] = ext_dict
        else:
            if old_ext_multivalued:
                # there was already an array, so just tack on our new value
                cur_ext_value.append(ext_dict)
            else:
                # turn the extension dict into an array
                extensions[ext.prefix_name] = generic.ArrayObject(
                    [cur_ext_value, ext_dict]
                )
        self.update_container(extensions)

    def get_object(self, ido, as_metadata_stream: bool = False):
        if ido.pdf not in self._resolves_objs_from:
            raise PdfError(
                f'Reference {ido} has no relation to this PDF writer.'
            )
        idnum = ido.idnum
        generation = ido.generation
        try:
            return self.objects[(generation, idnum)]
        except KeyError:
            if generation == 0:
                if idnum in self._allocated_placeholders:
                    return generic.NullObject()
                try:
                    return self.objs_in_streams[idnum]
                except KeyError:
                    pass
            raise KeyError(ido)

    def allocate_placeholder(self) -> generic.IndirectObject:
        """
        Allocate an object reference to populate later.
        Calls to :meth:`get_object` for this reference will
        return :class:`~.generic.NullObject` until it is populated using
        :meth:`add_object`.

        This method is only relevant in certain advanced contexts where
        an object ID needs to be known before the object it refers
        to can be built; chances are you'll never need it.

        :return:
            A :class:`~.generic.IndirectObject` instance referring to
            the object just allocated.
        """

        idnum = self._lastobj_id + 1
        self._allocated_placeholders.add(idnum)
        self._lastobj_id += 1
        return generic.IndirectObject(idnum, 0, self)

    def add_object(
        self, obj, obj_stream: Optional[ObjectStream] = None, idnum=None
    ) -> generic.IndirectObject:
        """
        Add a new object to this writer.

        :param obj:
            The object to add.
        :param obj_stream:
            An object stream to add the object to.
        :param idnum:
            Manually specify the object ID of the object to be added.
            This is only allowed for object IDs that have previously been
            allocated using :meth:`allocate_placeholder`.
        :return:
            A :class:`~.generic.IndirectObject` instance referring to
            the object just added.
        """

        if idnum is not None:
            if idnum not in self._allocated_placeholders:
                raise PdfWriteError(
                    "Manually specifying idnum is only allowed for "
                    "references previously allocated using "
                    "allocate_placeholder()."
                )
            preallocated = True
        else:
            preallocated = False
            idnum = self._lastobj_id + 1

        if obj_stream is None:
            self.objects[(0, idnum)] = obj
        elif obj_stream in self.object_streams:
            obj_stream.add_object(idnum, obj)
            self.objs_in_streams[idnum] = obj
        else:
            raise PdfWriteError(
                f'Stream {repr(obj_stream)} is unknown to this PDF writer.'
            )

        if preallocated:
            self._allocated_placeholders.remove(idnum)
        else:
            self._lastobj_id += 1
        return generic.IndirectObject(idnum, 0, self)

    def prepare_object_stream(self, compress=True):
        """Prepare and return a new :class:`.ObjectStream` object.

        :param compress:
            Indicates whether the resulting object stream should be compressed.
        :return:
            An :class:`.ObjectStream` object.
        """
        if not self.stream_xrefs:
            raise PdfWriteError(
                'Object streams require Xref streams to be enabled.'
            )
        stream = ObjectStream(compress=compress)
        self.object_streams.append(stream)
        return stream

    def _write_header(self, stream):
        raise NotImplementedError

    def _assign_security_handler(self, sh: SecurityHandler):
        self.security_handler = sh
        self._encrypt = self.add_object(sh.as_pdf_object())
        min_pdf_version = sh.get_min_pdf_version()
        if min_pdf_version is not None:
            self.ensure_output_version(min_pdf_version)

    def _flush_obj_stream(self, obj_stm: ObjectStream):
        """
        Internal method to flush an object stream as part of the file
        writing process.
        """
        stream_ref = obj_stm.ref
        if obj_stm and not stream_ref:
            # first, register the object stream object
            #  (will get written later)
            obj_stm.ref = stream_ref = self.add_object(obj_stm.as_pdf_object())
        # loop over all objects in the stream, and prepare
        # the data to put in the XRef table
        for ix, (idnum, obj) in enumerate(obj_stm):
            yield idnum, (stream_ref.idnum, ix)

    def _write_objects(self, stream, object_position_dict: PositionDict):
        # deal with objects in object streams first
        for obj_stream in self.object_streams:
            for idnum, pos_record in self._flush_obj_stream(obj_stream):
                object_position_dict[(0, idnum)] = pos_record

        for ix in sorted(self.objects.keys()):
            generation, idnum = ix
            obj = self.objects[ix]
            object_position_dict[ix] = stream.tell()
            stream.write(('%d %d obj\n' % (idnum, generation)).encode('ascii'))
            handler: Optional[SecurityHandler] = None
            if self.security_handler is not None:
                assert self._encrypt is not None
                if idnum != self._encrypt.idnum:
                    handler = self.security_handler
            container_ref = generic.Reference(idnum, generation, self)
            obj.write_to_stream(stream, handler, container_ref)
            stream.write(b'\nendobj\n')

    def _prep_dom_for_writing(self):
        # ensure that all font resources are flushed
        for fsc in self._font_resources.values():
            for engine in fsc.subsets.values():
                engine.prepare_write()

        self._update_meta()

    def _update_meta(self):
        xmp_xml = None
        try:
            # delayed import since the namespace registration operation
            # is global (thank you ElementTree...)
            # also, we want the XML dep(s) to be optional
            # noinspection PyUnresolvedReferences
            from pyhanko.pdf_utils.metadata import xmp_xml

            need_xmp = (
                self._meta.xmp_unmanaged
                or bool(self._meta.xmp_extra)
                or self.output_version >= (2, 0)
                or '/Metadata' in self.root
            )
        except ImportError:  # pragma: nocover
            need_xmp = False

        self._meta.last_modified = 'now'
        if self._info is not None:
            mod = update_info_dict(
                self._meta,
                self._info.get_object(),
                # if we write XMP, we only update existing entries
                only_update_existing=need_xmp,
            )
            if mod:
                self.mark_update(self._info)
        elif not need_xmp:
            info_dict = generic.DictionaryObject()
            update_info_dict(self._meta, info_dict)
            self._info = self.add_object(info_dict)
            # if there's no info dict, and we're going to write XMP anyhow,
            # don't bother creating one

        if need_xmp:
            meta_stm = None
            if '/Metadata' in self.root:
                meta_obj = self.root['/Metadata']
                if isinstance(meta_obj, xmp_xml.MetadataStream):
                    meta_stm = meta_obj
                    meta_stm.update_xmp_with_meta(self._meta)
                    self.update_container(meta_stm)
            if meta_stm is None:
                meta_stm = xmp_xml.MetadataStream.from_xmp(
                    xmp_xml.update_xmp_with_meta(self._meta)
                )
                sh = self.security_handler
                meta_stm._handler = sh
                if (
                    sh is not None
                    and not self.security_handler.encrypt_metadata
                ):
                    # note: this will add the /Identity crypt filter, hence
                    # metadata encryption will be omitted
                    meta_stm.add_crypt_filter()
                self.root['/Metadata'] = self.add_object(meta_stm)
                self.update_root()
            self.update_root()

    def _populate_trailer(self, trailer):
        # prepare trailer dictionary entries
        trailer[pdf_name('/Root')] = self._root
        if self._info is not None:
            trailer[pdf_name('/Info')] = self._info
        if self._encrypt is not None:
            trailer[pdf_name('/Encrypt')] = self._encrypt
        # before doing anything else, we attempt to load the crypto-relevant
        # data, so that we can bail early if something's not right
        trailer[pdf_name('/ID')] = self._document_id

    @property
    def trailer_view(self) -> generic.DictionaryObject:
        trailer = generic.DictionaryObject()
        self._populate_trailer(trailer)
        return trailer

    def write(self, stream):
        """
        Write the contents of this PDF writer to a stream.

        :param stream:
            A writable output stream.
        """
        self._prep_dom_for_writing()
        self._write(stream)

    def _write(self, stream, skip_header: bool = False):
        object_positions: PositionDict = {}

        trailer: generic.DictionaryObject
        if self.stream_xrefs:
            xmp_trailer = XRefStream(object_positions)
            xmp_trailer.compress()
            trailer = xmp_trailer
        else:
            trailer = generic.DictionaryObject()

        if not skip_header:
            self._write_header(stream)
        self._populate_trailer(trailer)
        self._write_objects(stream, object_positions)

        if self.stream_xrefs:
            xref_location = stream.tell()
            xrefs_id = self._lastobj_id + 1
            # add position of XRef stream to the XRef stream
            object_positions[(0, xrefs_id)] = xref_location
            trailer[pdf_name('/Size')] = generic.NumberObject(xrefs_id + 1)
            # write XRef stream
            stream.write(('%d %d obj\n' % (xrefs_id, 0)).encode('ascii'))
            trailer.write_to_stream(stream, None)
            stream.write(b'\nendobj\n')
        else:
            # classical xref table
            xref_location = write_xref_table(
                stream, cast(Dict[Tuple[int, int], int], object_positions)
            )
            trailer[pdf_name('/Size')] = generic.NumberObject(
                self._lastobj_id + 1
            )
            # write trailer
            stream.write(b'trailer\n')
            trailer.write_to_stream(stream, None)

        # write xref table pointer and EOF
        xref_pointer_string = '\nstartxref\n%s\n' % xref_location
        stream.write(xref_pointer_string.encode('ascii') + b'%%EOF\n')

    def register_annotation(self, page_ref, annot_ref):
        """
        Register an annotation to be added to a page.
        This convenience function takes care of calling :meth:`mark_update`
        where necessary.

        :param page_ref:
            Reference to the page object involved.
        :param annot_ref:
            Reference to the annotation object to be added.
        """
        page_obj = page_ref.get_object()
        try:
            annot_arr_ref = page_obj.raw_get('/Annots')
            if isinstance(annot_arr_ref, generic.IndirectObject):
                annots = annot_arr_ref.get_object()
                self.mark_update(annot_arr_ref)
            else:
                # we need to update the entire page object if the annots array
                # is a direct object
                annots = annot_arr_ref
                self.mark_update(page_ref)
        except KeyError:
            annots = generic.ArrayObject()
            self.mark_update(page_ref)
            page_obj[pdf_name('/Annots')] = annots

        annots.append(annot_ref)

    def insert_page(self, new_page, after=None):
        """
        Insert a page object into the tree.

        :param new_page:
            Page object to insert.
        :param after:
            Page number (zero-indexed) after which to insert the page.
        :return:
            A reference to the newly inserted page.
        """
        if new_page['/Type'] != pdf_name('/Page'):
            raise PdfWriteError('Not a page object')
        if '/Parent' in new_page:
            raise PdfWriteError('/Parent must not be set.')

        page_tree_root_ref = self.root.raw_get('/Pages')
        if after is None:
            page_count = page_tree_root_ref.get_object()['/Count']
            after = page_count - 1

        if after == -1:
            # there are no pages yet, this will be the first
            pages_obj_ref = page_tree_root_ref
            kid_ix = -1
        else:
            pages_obj_ref, kid_ix, _ = self.find_page_container(after)

        pages_obj = pages_obj_ref.get_object()
        try:
            kids = pages_obj['/Kids']
        except KeyError:  # pragma: nocover
            raise PdfError('/Pages must have /Kids')

        # increase page count for all parents
        parent = pages_obj
        while parent is not None:
            # can't use += 1 because of the way PyPDF2's generic types work
            count = parent['/Count']
            parent[pdf_name('/Count')] = generic.NumberObject(count + 1)
            parent = parent.get('/Parent')
        new_page_ref = self.add_object(new_page)
        kids.insert(kid_ix + 1, new_page_ref)
        new_page[pdf_name('/Parent')] = pages_obj_ref
        self.update_container(pages_obj)
        self.update_container(kids)

        return new_page_ref

    def import_object(
        self, obj: generic.PdfObject, obj_stream: Optional[ObjectStream] = None
    ) -> generic.PdfObject:
        """
        Deep-copy an object into this writer, dealing with resolving indirect
        references in the process.

        .. danger::
            The table mapping indirect references in the input to indirect
            references in the writer is not preserved between calls.
            Concretely, this means that invoking :meth:`import_object` twice
            on the same input reader may cause object duplication.

        :param obj:
            The object to import.
        :param obj_stream:
            The object stream to import objects into.

            .. note::
                Stream objects and bare references will not be put into
                the object stream; the standard forbids this.
        :return:
            The object as associated with this writer.
            If the input object was an indirect reference, a dictionary
            (incl. streams) or an array, the returned value will always be
            a new instance.
        """

        return self._import_object(obj, {}, obj_stream)

    def _import_object(
        self, obj: generic.PdfObject, reference_map: dict, obj_stream
    ) -> generic.PdfObject:
        # TODO check the spec for guidance on fonts. Do font identifiers have
        #  to be globally unique?

        # TODO deal with container_ref

        if isinstance(obj, generic.DecryptedObjectProxy):
            obj = obj.decrypted
        if isinstance(obj, generic.IndirectObject):
            try:
                return reference_map[obj.reference]
            except KeyError:
                refd = obj.get_object()
                # Add a placeholder to reserve the reference value.
                # This ensures correct behaviour in recursive calls
                # with self-references.
                new_ido = self.allocate_placeholder()
                reference_map[obj.reference] = new_ido
                imported = self._import_object(refd, reference_map, obj_stream)

                # if the imported object is a bare reference and/or a stream
                # object, we can't put it into an object stream.
                if isinstance(imported, OBJSTREAM_FORBIDDEN):
                    obj_stream = None

                # fill in the placeholder
                self.add_object(
                    imported, obj_stream=obj_stream, idnum=new_ido.idnum
                )
                return new_ido
        elif isinstance(obj, generic.DictionaryObject):
            raw_dict = {
                k: self._import_object(v, reference_map, obj_stream)
                for k, v in obj.items()
                if k != '/Metadata'
            }
            try:
                # make sure to import metadata streams as such
                meta_ref = obj.get_value_as_reference('/Metadata')
                # ensure a MetadataStream object ends up in the cache
                meta_ref.get_pdf_handler().get_object(
                    meta_ref, as_metadata_stream=True
                )
                # ...then import the reference
                raw_dict['/Metadata'] = self._import_object(
                    generic.IndirectObject(
                        meta_ref.idnum, meta_ref.generation, meta_ref.pdf
                    ),
                    reference_map,
                    obj_stream,
                )
            except (KeyError, IndirectObjectExpected):
                pass

            if isinstance(obj, generic.StreamObject):
                stm_cls = generic.StreamObject
                # again, make sure to import metadata streams as such
                try:
                    # noinspection PyUnresolvedReferences
                    from pyhanko.pdf_utils.metadata import xmp_xml

                    if isinstance(obj, xmp_xml.MetadataStream):
                        stm_cls = xmp_xml.MetadataStream
                except ImportError:  # pragma: nocover
                    pass
                # In the vast majority of use cases, I'd expect the content
                # to be available in encoded form by default.
                # By initialising the stream object in this way, we avoid
                # a potentially costly decoding operation.
                return stm_cls(raw_dict, encoded_data=obj.encoded_data)
            else:
                return generic.DictionaryObject(raw_dict)
        elif isinstance(obj, generic.ArrayObject):
            return generic.ArrayObject(
                self._import_object(v, reference_map, obj_stream) for v in obj
            )
        else:
            return obj

    def import_page_as_xobject(
        self, other: PdfHandler, page_ix=0, inherit_filters=True
    ):
        """
        Import a page content stream from some other
        :class:`~.rw_common.PdfHandler` into the current one as a form XObject.

        :param other:
            A :class:`~.rw_common.PdfHandler`
        :param page_ix:
            Index of the page to copy (default: 0)
        :param inherit_filters:
            Inherit the content stream's filters, if present.
        :return:
            An :class:`~.generic.IndirectObject` referring to the page object
            as added to the current reader.
        """
        page_ref, resources = other.find_page_for_modification(page_ix)
        page_obj = page_ref.get_object()

        # find the page's /MediaBox by going up the tree until we encounter it
        pagetree_obj = page_obj
        while True:
            try:
                mb = pagetree_obj['/MediaBox']
                break
            except KeyError:
                try:
                    pagetree_obj = pagetree_obj['/Parent']
                except KeyError:  # pragma: nocover
                    raise PdfReadError(
                        f'Page {page_ix} does not have a /MediaBox'
                    )

        stream_dict = {
            pdf_name('/BBox'): mb,
            pdf_name('/Resources'): self.import_object(resources),
            pdf_name('/Type'): pdf_name('/XObject'),
            pdf_name('/Subtype'): pdf_name('/Form'),
        }
        command_stream = page_obj['/Contents']
        # if the page /Contents is an array, retrieve the content stream
        # with the appropriate index
        if isinstance(command_stream, generic.ArrayObject):
            if len(command_stream) == 1 and inherit_filters:
                command_stream = command_stream[0].get_object()
            else:
                # There are multiple streams, so inheriting filters is not
                # a well-defined operations.
                # Decode and concatenate, then put in a flate filter, and return
                result = generic.StreamObject(
                    stream_dict,
                    stream_data=b''.join(
                        partial_stream.get_object().data
                        for partial_stream in command_stream
                    ),
                )
                result.compress()
                return self.add_object(result)
        assert isinstance(command_stream, generic.StreamObject)
        filters = None
        if inherit_filters:
            try:
                # try to inherit filters from the original command stream
                filters = command_stream['/Filter']
            except KeyError:
                pass

        if filters is not None:
            stream_dict[pdf_name('/Filter')] = self.import_object(filters)
            result = generic.StreamObject(
                stream_dict, encoded_data=command_stream.encoded_data
            )
        else:
            result = generic.StreamObject(
                stream_dict, stream_data=command_stream.data
            )

        return self.add_object(result)

    # TODO these can be simplified considerably using the new update_container
    def add_stream_to_page(
        self, page_ix, stream_ref, resources=None, prepend=False
    ):
        """Append an indirect stream object to a page in a PDF as a content
        stream.

        :param page_ix:
            Index of the page to modify.
            The first page has index `0`.
        :param stream_ref:
            :class:`~.generic.IndirectObject` reference to the stream
            object to add.
        :param resources:
            Resource dictionary containing resources to add to the page's
            existing resource dictionary.
        :param prepend:
            Prepend the content stream to the list of content streams, as
            opposed to appending it to the end.
            This has the effect of causing the stream to be rendered
            underneath the already existing content on the page.
        :return:
            An :class:`~.generic.IndirectObject` reference to the page object
            that was modified.
        """

        page_obj_ref, res_ref = self.find_page_for_modification(page_ix)

        page_obj = page_obj_ref.get_object()

        contents_ref = page_obj.raw_get('/Contents')

        if isinstance(contents_ref, generic.IndirectObject):
            contents = contents_ref.get_object()
            if isinstance(contents, generic.ArrayObject):
                # This is the easy case. It suffices to mark
                # this array for an update, and append our stream to it.
                self.mark_update(contents_ref)
                if prepend:
                    contents.insert(0, stream_ref)
                else:
                    contents.append(stream_ref)
            elif isinstance(contents, generic.StreamObject):
                # replace the old stream with an array containing
                # a reference to the original stream, and our own stream.
                new = (
                    [stream_ref, contents_ref]
                    if prepend
                    else [contents_ref, stream_ref]
                )
                contents = generic.ArrayObject(new)
                page_obj[pdf_name('/Contents')] = self.add_object(contents)
                # mark the page to be updated as well
                self.mark_update(page_obj_ref)
            else:
                raise PdfError('Unexpected type for page /Contents')
        elif isinstance(contents_ref, generic.ArrayObject):
            # make /Contents an indirect array, and append our stream
            contents = contents_ref
            if prepend:
                contents.insert(0, stream_ref)
            else:
                contents.append(stream_ref)
            page_obj[pdf_name('/Contents')] = self.add_object(contents)
            self.mark_update(page_obj_ref)
        else:
            raise PdfError('Unexpected type for page /Contents')

        if resources is None:
            return

        if isinstance(res_ref, generic.IndirectObject):
            # we can get away with only updating this reference
            orig_resource_dict = res_ref.get_object()
            assert isinstance(orig_resource_dict, generic.DictionaryObject)
            if self.merge_resources(orig_resource_dict, resources):
                self.mark_update(res_ref)
        else:
            # don't bother trying to update the resource object, just
            # clone it and add it to the current page object.
            orig_resource_dict = generic.DictionaryObject(res_ref)
            page_obj[pdf_name('/Resources')] = self.add_object(
                orig_resource_dict
            )
            self.merge_resources(orig_resource_dict, resources)

        return page_obj_ref

    # TODO this doesn't really belong here
    def merge_resources(
        self,
        orig_dict: generic.DictionaryObject,
        new_dict: generic.DictionaryObject,
    ) -> bool:
        """
        Update an existing resource dictionary object with data from another
        one. Returns ``True`` if the original dict object was modified directly.

        The caller is responsible for avoiding name conflicts with existing
        resources.
        """

        update_needed = False
        for key, value in new_dict.items():
            try:
                orig_value_ref = orig_dict.raw_get(key)
            except KeyError:
                update_needed = True
                orig_dict[key] = value
                continue

            if isinstance(orig_value_ref, generic.IndirectObject):
                orig_value = orig_value_ref.get_object()
                self.mark_update(orig_value_ref)
            else:
                orig_value = orig_value_ref
                update_needed = True

            if (
                isinstance(orig_value, generic.ArrayObject)
                and key == '/ProcSet'
            ):
                orig_value.extend(
                    x
                    for x in value.get_object()
                    if isinstance(x, generic.NameObject) and x not in orig_value
                )
            elif isinstance(orig_value, generic.DictionaryObject):
                for key_, value_ in value.items():
                    if key_ in orig_value:
                        raise PdfError(
                            'Naming conflict in resource of type %s: '
                            'key %s occurs in both.' % (key, key_)
                        )
                    orig_value[key_] = value_

        return update_needed


class PageObject(generic.DictionaryObject):
    """Subclass of :class:`~.generic.DictionaryObject` that handles some of the
    initialisation boilerplate for page objects."""

    # TODO be more clever with inheritable required attributes,
    #  and enforce the requirements on insertion instead
    # (setting /MediaBox at the page tree root seems to make sense, for example)
    def __init__(self, contents, media_box, resources=None):
        resources = resources or generic.DictionaryObject()

        if isinstance(contents, list):
            if not all(map(instance_test(generic.IndirectObject), contents)):
                raise PdfWriteError(
                    'Contents array must consist of indirect references'
                )
            if not isinstance(contents, generic.ArrayObject):
                contents = generic.ArrayObject(contents)
        elif not isinstance(contents, generic.IndirectObject):
            raise PdfWriteError(
                'Contents must be either an indirect reference or an array'
            )

        if len(media_box) != 4:
            raise ValueError('Media box must consist of 4 coordinates.')
        super().__init__(
            {
                pdf_name('/Type'): pdf_name('/Page'),
                pdf_name('/MediaBox'): generic.ArrayObject(
                    map(generic.FloatObject, media_box)
                ),
                pdf_name('/Resources'): resources,
                pdf_name('/Contents'): contents,
            }
        )


class PdfFileWriter(BasePdfFileWriter):
    """Class to write new PDF files."""

    def __init__(self, stream_xrefs=True, init_page_tree=True, info=None):
        # root object
        root = generic.DictionaryObject(
            {
                pdf_name("/Type"): pdf_name("/Catalog"),
            }
        )

        id1 = generic.ByteStringObject(os.urandom(16))
        id2 = generic.ByteStringObject(os.urandom(16))
        id_obj = generic.ArrayObject([id1, id2])

        self._custom_trailer_entries = {}
        super().__init__(root, info, id_obj, stream_xrefs=stream_xrefs)

        if init_page_tree:
            pages = generic.DictionaryObject(
                {
                    pdf_name("/Type"): pdf_name("/Pages"),
                    pdf_name("/Count"): generic.NumberObject(0),
                    pdf_name("/Kids"): generic.ArrayObject(),
                }
            )

            root[pdf_name('/Pages')] = self.add_object(pages)

    def _write_header(self, stream):
        major, minor = self.output_version
        stream.write(f'%PDF-{major}.{minor}\n'.encode('ascii'))
        # write some binary characters to make sure the file is flagged
        # as binary (see § 7.5.2 in ISO 32000-1)
        stream.write(b'%\xc2\xa5\xc2\xb1\xc3\xab\n')

    def encrypt(self, owner_pass, user_pass=None, **kwargs):
        """
        Mark this document to be encrypted with PDF 2.0 encryption (AES-256).

        .. caution::
            While pyHanko supports legacy PDF encryption as well, the API
            to create new documents using outdated encryption is left
            largely undocumented on purpose to discourage its use.

            This caveat does *not* apply to incremental updates added to
            existing documents.

        .. danger::
            The PDF 2.0 standard mandates AES-256 in CBC mode, and also includes
            12 bytes of known plaintext by design. This implies that a
            sufficiently knowledgeable attacker can inject arbitrary content
            into your encrypted files without knowledge of the password.

            Adding a digital signature to the encrypted document is **not**
            a foolproof way to deal with this either, since most viewers will
            still allow the document to be opened before signatures are
            validated, and therefore end users are still exposed to potentially
            malicious content.

            Until the standard supports authenticated encryption schemes, you
            should **never** rely on its encryption provisions if tampering
            is a concern.


        :param owner_pass:
            The desired owner password.
        :param user_pass:
            The desired user password (defaults to the owner password
            if not specified)
        :param kwargs:
            Other keyword arguments to be passed to
            :meth:`.StandardSecurityHandler.build_from_pw`.
        """
        sh = StandardSecurityHandler.build_from_pw(
            owner_pass, user_pass, **kwargs
        )
        self._assign_security_handler(sh)

    def encrypt_pubkey(self, recipients: List[x509.Certificate], **kwargs):
        """
        Mark this document to be encrypted with PDF 2.0 public key encryption.
        The certificates passed in should be RSA certificates.

        PyHanko defaults to AES-256 to encrypt the actual file contents.
        The seed used to derive the file encryption key is also encrypted
        using AES-256 and bundled in a CMS EnvelopedData object.
        The envelope key is then encrypted separately for each recipient, using
        their respective public keys.

        .. caution::
            The caveats for :meth:`encrypt` also apply here.

        :param recipients:
            Certificates of the recipients that should be able to decrypt
            the document.
        :param kwargs:
            Other keyword arguments to be passed to
            :meth:`.PubKeySecurityHandler.build_from_certs`.
        """
        self.output_version = (2, 0)
        sh = PubKeySecurityHandler.build_from_certs(recipients, **kwargs)
        self._assign_security_handler(sh)

    def set_custom_trailer_entry(
        self, key: generic.NameObject, value: generic.PdfObject
    ):
        """
        Set a custom, unmanaged entry in the document trailer or cross-reference
        stream dictionary.

        .. warning::
            Calling this method to set an entry that is managed by pyHanko
            internally (info dictionary, document catalog, etc.) has undefined
            results.

        :param key:
            Dictionary key to use in the trailer.
        :param value:
            Value to set
        """
        self._custom_trailer_entries[key] = value

    def _populate_trailer(self, trailer):
        # allow the base implementation to override internally managed entries
        trailer.update(self._custom_trailer_entries)
        super()._populate_trailer(trailer)


def copy_into_new_writer(
    input_handler: PdfHandler, writer_kwargs: Optional[dict] = None
) -> PdfFileWriter:
    """
    Copy all objects in a given PDF handler into a new :class:`.PdfFileWriter`.
    This operation will attempt to preserve the document catalog
    of the original ``input_handler``.

    Very roughly, calling this function and then immediately invoking
    :meth:`~.BasePdfFileWriter.write` on the resulting writer should result
    in an equivalent document as far as presentation is concerned.
    As a general rule, behaviour that is controlled from outside the document
    catalog (e.g. encryption) or that requires byte-for-byte equivalence with
    the original (e.g. digital signatures) will not survive this translation.


    :param input_handler:
        :class:`.PdfHandler` to source objects from.
    :param writer_kwargs:
        Keyword arguments to pass to the writer.
    :return:
        New :class:`.PdfFileWriter` containing all objects from the input
        handler.
    """

    writer_kwargs = writer_kwargs or {}
    writer_kwargs.setdefault("stream_xrefs", False)

    # TODO try to be more clever with object streams
    w = PdfFileWriter(init_page_tree=False, **writer_kwargs)
    input_root_ref = input_handler.root_ref
    output_root_ref = w.root_ref
    # call _import_object in such a way that we translate the input handler's
    # root to the new writer's root.
    # From a technical PoV this doesn't matter, but it makes the output file
    # somewhat "cleaner" (i.e. it doesn't leave an orphaned document catalog
    # cluttering up the file)
    new_root_dict = w._import_object(
        input_handler.root,
        reference_map={input_root_ref: output_root_ref},
        obj_stream=None,
    )
    # override the old root ref
    ix = (output_root_ref.generation, output_root_ref.idnum)
    w.objects[ix] = new_root_dict

    if "info" not in writer_kwargs:
        try:
            # migrate the info dict. We do this low-level to avoid issues
            # with the producer string handling, and to keep a nice separation
            # between user-supplied metadata values and values that were present
            # in the original doc.
            info_dict = input_handler.trailer_view['/Info']
        except KeyError:
            info_dict = None
        if info_dict is not None:
            imported_info = w._import_object(
                info_dict, reference_map={}, obj_stream=None
            )
            w._info = w.add_object(imported_info)

    return w