2217 lines
67 KiB
Python
2217 lines
67 KiB
Python
"""
|
|
Implementation of PDF object types and other generic functionality.
|
|
The internals were imported from PyPDF2, with modifications.
|
|
|
|
See :ref:`here <pypdf2-license>` for the original license
|
|
of the PyPDF2 project.
|
|
"""
|
|
import binascii
|
|
import codecs
|
|
import decimal
|
|
import enum
|
|
import logging
|
|
import os
|
|
import re
|
|
import typing
|
|
from dataclasses import dataclass, field
|
|
from datetime import datetime, timedelta, timezone
|
|
from io import BytesIO
|
|
from typing import Any, Callable, Iterator, Optional, Tuple, Union
|
|
|
|
from .misc import (
|
|
IndirectObjectExpected,
|
|
PdfError,
|
|
PdfReadError,
|
|
PdfStreamError,
|
|
PdfStrictReadError,
|
|
PdfWriteError,
|
|
is_regular_character,
|
|
read_non_whitespace,
|
|
read_until_delimiter,
|
|
read_until_regex,
|
|
skip_over_whitespace,
|
|
)
|
|
|
|
if typing.TYPE_CHECKING:
|
|
from .crypt.api import SecurityHandler
|
|
|
|
__all__ = [
|
|
'Dereferenceable',
|
|
'Reference',
|
|
'TrailerReference',
|
|
'PdfObject',
|
|
'IndirectObject',
|
|
'NullObject',
|
|
'BooleanObject',
|
|
'FloatObject',
|
|
'NumberObject',
|
|
'ByteStringObject',
|
|
'TextStringObject',
|
|
'NameObject',
|
|
'ArrayObject',
|
|
'DictionaryObject',
|
|
'StreamObject',
|
|
'read_object',
|
|
'pdf_name',
|
|
'pdf_string',
|
|
'pdf_date',
|
|
'TextStringEncoding',
|
|
'EncryptedObjAccess',
|
|
'DecryptedObjectProxy',
|
|
]
|
|
|
|
# First bytes that identify an object type. read_object() dispatches on the
# *position* of the byte within this string, so the order is significant.
OBJECT_PREFIXES = b'/<[tf(n%'
# Bytes that may open a signed numeric literal.
NUMBER_SIGNS = b'+-'
# "<digits> <digits> R" followed by a single byte that is not an ASCII letter.
INDIRECT_PATTERN = re.compile(rb"(\d+)\s+(\d+)\s+R[^a-zA-Z]")

logger = logging.getLogger(__name__)
|
|
|
|
|
|
class EncryptedObjAccess(enum.Enum):
    """
    Policy that determines what happens when an encrypted object is
    encountered while retrieving an object from a container.
    """

    PROXY = 0
    """
    Hand back the proxy object unchanged; any further encryption/decryption
    handling is left to the caller.
    """

    TRANSPARENT = 1
    """
    Decrypt the proxy's content transparently. Sub-containers are wrapped in
    :class:`.DecryptedObjectProxy` in the same manner, so this applies
    recursively.

    .. note::
        This is the default in most situations, since it's the least likely
        to get in the way of any APIs that are not explicitly aware of
        content encryption concerns.
    """

    RAW = 2
    """
    Hand back the underlying raw object as written, without attempting or
    deferring decryption.
    """
|
|
|
|
|
|
def _deproxy_decrypt(obj, eoa: EncryptedObjAccess):
    """
    Apply an encrypted-object access policy to a value taken from a container.

    :param obj:
        The value as stored in the container.
    :param eoa:
        The policy to apply; see :class:`.EncryptedObjAccess`.
    :return:
        ``obj`` itself if it is not a :class:`.DecryptedObjectProxy`, or if
        the policy is neither ``TRANSPARENT`` nor ``RAW``. Otherwise, the
        proxy's ``decrypted`` or ``raw_object`` attribute, respectively.
    """
    # anything that isn't a proxy passes through untouched
    if not isinstance(obj, DecryptedObjectProxy):
        return obj
    if eoa == EncryptedObjAccess.TRANSPARENT:
        return obj.decrypted
    if eoa == EncryptedObjAccess.RAW:
        return obj.raw_object
    # remaining policies leave the proxy in place
    return obj
|
|
|
|
|
|
class Dereferenceable:
    """
    Opaque reference to a PDF object that is tied to a PDF handler
    (see :class:`PdfHandler <.rw_common.PdfHandler>`).

    Concretely, this is either a reference to an object that has an object ID
    (see :class:`.Reference`), or a reference to the trailer of a PDF document
    (see :class:`.TrailerReference`).

    Both methods are abstract here and raise :class:`NotImplementedError`.
    """

    def get_object(self) -> 'PdfObject':
        """Look up the PDF object that this dereferenceable stands for.

        :return: A :class:`.PdfObject`.
        """
        raise NotImplementedError

    def get_pdf_handler(self):
        """Look up the PDF handler that this dereferenceable belongs to.

        :return: a :class:`~.rw_common.PdfHandler`.
        """
        raise NotImplementedError
|
|
|
|
|
|
class TrailerReference(Dereferenceable):
    """
    Reference to the trailer of a PDF document.

    .. warning::
        This is deliberately not a subclass of :class:`.Reference`: in files
        with "classical" cross-reference tables (as opposed to
        cross-reference streams), the trailer does not have a well-defined
        object ID.

    :param reader:
        a :class:`~pyhanko.pdf_utils.reader.PdfFileReader`
    """

    def __init__(self, reader):
        # the reader doubles as the PDF handler and as the trailer's owner
        self.reader = reader

    def get_object(self) -> 'PdfObject':
        """Return the ``trailer`` attribute of the reader."""
        return self.reader.trailer

    def get_pdf_handler(self):
        """Return the reader passed in at construction time."""
        return self.reader
|
|
|
|
|
|
@dataclass(frozen=True)
class Reference(Dereferenceable):
    """
    Reference to an object identified by an object ID and a generation
    number, with a PDF handler attached to it.

    .. warning::
        Contrary to what one might expect, the generation number does *not*
        indicate the document revision in which the object was modified. In
        fact, nonzero generation numbers are exceedingly rare these days; in
        most real-world PDF files, objects are simply overridden without ever
        increasing the generation number.

        Except in very specific circumstances, dereferencing a
        :class:`.Reference` will return the most recent version of the object
        with the stated object ID and generation number.
    """

    idnum: int
    """
    The object's ID.
    """

    generation: int = 0
    """
    The object's generation number (usually `0`)
    """

    pdf: object = field(default=None, repr=False, hash=False, compare=False)
    """
    The PDF handler associated with this reference, an instance of
    :class:`~.rw_common.PdfHandler`.

    .. warning::
        This field is ignored when hashing or comparing :class:`.Reference`
        objects, so it is the API user's responsibility to not mix up
        references originating from unrelated PDF handlers.
    """

    def get_object(self) -> 'PdfObject':
        """
        Resolve this reference through the attached PDF handler.

        :return:
            A :class:`.NullObject` if no handler is attached; otherwise the
            result of calling ``get_object()`` on whatever the handler
            returns for this reference.
        """
        handler = self.pdf
        if handler is None:
            return NullObject()
        # imported here rather than at module level
        from pyhanko.pdf_utils.rw_common import PdfHandler

        assert isinstance(handler, PdfHandler)
        looked_up = handler.get_object(self)
        return looked_up.get_object()

    def get_pdf_handler(self):
        """Return the handler stored in :attr:`pdf` (possibly ``None``)."""
        return self.pdf
|
|
|
|
|
|
def read_object(
    stream, container_ref: 'Dereferenceable', as_metadata_stream: bool = False
) -> 'PdfObject':
    """
    Read a PDF object from an input stream.

    The object type is determined by peeking at the first byte (and, where
    that is ambiguous, a few more bytes) before delegating to the matching
    reader. A leading ``%`` comment is skipped.

    .. note::
        The `container_ref` parameter tells the API which reference to register
        when the returned object is modified in an incremental update.
        See also :ref:`here <container-ref-example>` for further
        information.

    :param stream:
        A seekable binary input stream.
    :param container_ref:
        A reference to an object containing this one.

        *Note:* It is perfectly possible (and common) for `container_ref` to
        resolve to the return value of this function.
    :param as_metadata_stream:
        Whether to dereference the object as an XMP metadata stream.
        This flag is only forwarded to the dictionary reader.
    :return:
        A :class:`.PdfObject`, with its ``container_ref`` attribute set to
        `container_ref`.
    :raise PdfStreamError:
        If the stream ends in the middle of a comment.
    """

    tok = stream.read(1)
    stream.seek(-1, os.SEEK_CUR)  # reset to start
    # the index into OBJECT_PREFIXES identifies the object type (-1: no match)
    idx = OBJECT_PREFIXES.find(tok)
    if idx == 0:
        # name object
        result = NameObject.read_from_stream(stream)
    elif idx == 1:
        # hexadecimal string OR dictionary
        peek = stream.read(2)
        stream.seek(-2, os.SEEK_CUR)  # reset to start
        if peek == b'<<':
            result = DictionaryObject.read_from_stream(
                stream, container_ref, as_metadata_stream=as_metadata_stream
            )
        else:
            result = read_hex_string_from_stream(stream)
    elif idx == 2:
        # array object
        result = ArrayObject.read_from_stream(stream, container_ref)
    elif idx in (3, 4):
        # boolean object
        result = BooleanObject.read_from_stream(stream)
    elif idx == 5:
        # string object
        result = read_string_from_stream(stream)
    elif idx == 6:
        # null object
        result = NullObject.read_from_stream(stream)
    elif idx == 7:
        # comment: discard everything up to the end of the line
        while tok not in (b'\r', b'\n'):
            tok = stream.read(1)
            if not tok:
                # read() returns b'' at EOF, which would otherwise keep
                # this loop spinning forever
                raise PdfStreamError("Stream has ended unexpectedly")
        read_non_whitespace(stream)
        stream.seek(-1, os.SEEK_CUR)
        # keep the caller's metadata preference for the object that follows
        result = read_object(
            stream, container_ref, as_metadata_stream=as_metadata_stream
        )
    else:
        # number object OR indirect reference
        if tok in NUMBER_SIGNS:
            # a sign can only start a number
            result = NumberObject.read_from_stream(stream)
        else:
            peek = stream.read(20)
            # fewer than 20 bytes may have been available
            stream.seek(-len(peek), os.SEEK_CUR)  # reset to start
            if INDIRECT_PATTERN.match(peek) is not None:
                result = IndirectObject.read_from_stream(stream, container_ref)
            else:
                result = NumberObject.read_from_stream(stream)

    result.container_ref = container_ref
    return result
|
|
|
|
|
|
class PdfObject:
    """Common base class of all PDF object types."""

    container_ref: Optional[Dereferenceable] = None
    """
    For objects read from a file, `container_ref` points to the unique
    addressable object containing this object.

    .. _container-ref-example:

    .. note::
        Consider the following object definition in a PDF file:

        .. code-block:: text

            4 0 obj
            << /Foo (Bar) >>

        This declares a dictionary with ID `4`, but the values ``/Foo`` and
        ``(Bar)`` are also PDF objects (a name and a string, respectively).
        All of these will have `container_ref` given by a :class:`.Reference`
        with object ID `4` and generation number `0`.

    If an object is part of the trailer of a PDF file, `container_ref` will be
    a :class:`.TrailerReference`.
    For newly created objects (i.e. those not read from a file), `container_ref`
    is always ``None``.
    """

    # TODO simplify a number of modification routines using this new API
    def get_container_ref(self) -> Dereferenceable:
        """
        Return a reference to the closest parent object containing this object.

        :return:
            The value of :attr:`container_ref`.
        :raise PdfReadError:
            If :attr:`container_ref` is ``None``.
        """
        if self.container_ref is not None:
            return self.container_ref
        raise PdfReadError(  # pragma: nocover
            "No container reference available. This object probably "
            "wasn't read from a file."
        )

    def get_object(self):
        """Resolves indirect references.

        :return: `self`, unless an instance of :class:`.IndirectObject`.
        """
        return self

    def write_to_stream(
        self,
        stream,
        handler: Optional['SecurityHandler'] = None,
        container_ref: Optional[Reference] = None,
    ):
        """
        Abstract method to render this object to an output stream.

        :param stream:
            An output stream.
        :param handler:
            Security handler, if any.
        :param container_ref:
            Reference to the object containing this one. The string classes
            in this module use its object ID and generation number to derive
            the object-specific encryption key.
        """
        raise NotImplementedError
|
|
|
|
|
|
class NullObject(PdfObject):
    """
    PDF `null` object.

    Every instance compares equal to every other instance, hashes like
    ``None``, and is falsy.
    """

    def write_to_stream(
        self,
        stream,
        handler: Optional['SecurityHandler'] = None,
        container_ref=None,
    ):
        """Write the ``null`` keyword; the other arguments are not used."""
        stream.write(b"null")

    @staticmethod
    def read_from_stream(stream):
        """
        Consume the 4-byte ``null`` keyword from the stream.

        :raise PdfReadError:
            If the next four bytes are anything else.
        """
        if stream.read(4) == b"null":
            return NullObject()
        raise PdfReadError("Could not read Null object")

    def __eq__(self, other):
        return isinstance(other, NullObject) or other is self

    def __hash__(self):
        # consistent with __eq__: all null objects share one hash value
        return hash(None)

    def __bool__(self):
        return False
|
|
|
|
|
|
class BooleanObject(PdfObject):
    """PDF boolean value, wrapping a Python value in :attr:`value`."""

    def __init__(self, value):
        self.value = value

    def write_to_stream(
        self,
        stream,
        handler: Optional['SecurityHandler'] = None,
        container_ref=None,
    ):
        """Write ``true`` or ``false`` depending on the wrapped value."""
        stream.write(b"true" if self.value else b"false")

    @staticmethod
    def read_from_stream(stream):
        """
        Consume a ``true`` or ``false`` keyword from the stream.

        :raise PdfReadError:
            If neither keyword is found.
        """
        head = stream.read(4)
        if head == b"true":
            return BooleanObject(True)
        # "false" is one byte longer than the initial read
        if head == b"fals" and stream.read(1) == b"e":
            return BooleanObject(False)
        raise PdfReadError('Could not read Boolean object')

    def __bool__(self):
        return bool(self.value)

    def __eq__(self, other):
        # comparable with other boolean objects and with plain Python bools
        if not isinstance(other, (BooleanObject, bool)):
            return False
        return bool(self) == bool(other)

    def __str__(self):
        return str(bool(self))

    def __repr__(self):
        return str(self)
|
|
|
|
|
|
class ArrayObject(list, PdfObject):
    """
    PDF array object. This class extends from Python's list class,
    and supports its interface.

    .. warning::
        Contrary to the case of dictionary objects, PyPDF2 does not
        transparently dereference array entries when accessed using
        :meth:`__getitem__`.
        For usability & consistency reasons, I decided to depart from that
        and dereference automatically.
        This makes the behaviour of :class:`.ArrayObject` consistent with
        :class:`.DictionaryObject`.

        That said, some vestiges of the old PyPDF2 behaviour may linger in
        the codebase. I'll fix those as I get to them.
    """

    def __getitem__(self, index):
        # apply the default decryption policy, then resolve indirect references
        entry = self.raw_get(index)
        return entry.get_object()

    def raw_get(
        self,
        index,
        decrypt: EncryptedObjAccess = EncryptedObjAccess.TRANSPARENT,
    ):
        """
        .. versionchanged:: 0.14.0

            ``decrypt`` parameter is no longer boolean

        Get a value from an array without dereferencing.
        In other words, if the value at the given index is of type
        :class:`.IndirectObject`, the indirect reference will not be resolved.

        :param index:
            Index to look up in the array.
        :param decrypt:
            What to do when retrieving encrypted objects; see
            :class:`.EncryptedObjAccess`. The default is
            :attr:`.EncryptedObjAccess.TRANSPARENT`.
        :return:
            A :class:`.PdfObject`.
        """

        return _deproxy_decrypt(list.__getitem__(self, index), decrypt)

    def write_to_stream(
        self,
        stream,
        handler: Optional['SecurityHandler'] = None,
        container_ref=None,
    ):
        """
        Write the array as ``[ item item ... ]``, forwarding `handler` and
        `container_ref` to each entry's own ``write_to_stream``.
        """
        stream.write(b"[")
        # plain list iteration: entries are written as stored
        for entry in self:
            stream.write(b" ")
            entry.write_to_stream(
                stream, handler=handler, container_ref=container_ref
            )
        stream.write(b" ]")

    @staticmethod
    def read_from_stream(stream, container_ref):
        """
        Read an array from the stream, which must be positioned on ``[``.

        :param stream:
            A seekable binary input stream.
        :param container_ref:
            Container reference passed on to :func:`read_object` for every
            entry.
        :raise PdfReadError:
            If the first byte is not ``[``.
        """
        if stream.read(1) != b"[":
            raise PdfReadError("Could not read array")
        items = ArrayObject()
        while True:
            # skip leading whitespace & check for array ending
            upcoming = read_non_whitespace(stream)
            if upcoming == b"]":
                return items
            # not the end: put the byte back and parse a full object
            stream.seek(-1, os.SEEK_CUR)
            items.append(read_object(stream, container_ref))
|
|
|
|
|
|
class IndirectObject(PdfObject, Dereferenceable):
    """
    Thin wrapper around a :class:`.Reference`, implementing both the
    :class:`.Dereferenceable` and :class:`.PdfObject` interfaces.

    .. warning::
        For many purposes, this class is functionally interchangeable with
        :class:`.Reference`, with one important exception:
        :class:`.IndirectObject` instances pointing to the same reference
        but occurring at different locations in the file may have distinct
        `container_ref` values.
    """

    def __init__(self, idnum, generation, pdf):
        self.reference = Reference(idnum=idnum, generation=generation, pdf=pdf)

    def get_object(self):
        """
        :return: The PDF object this reference points to.
        """
        target = self.reference.get_object()
        # there are few legitimate use cases for indirect references
        # pointing to indirect references, but the standard doesn't forbid
        # them, so we have to support them.
        # TODO protect against reference loops?
        if isinstance(target, IndirectObject):
            return target.get_object()
        return target

    def get_pdf_handler(self):
        """Return the PDF handler of the wrapped reference."""
        return self.reference.get_pdf_handler()

    @property
    def idnum(self) -> int:
        """
        :return: the object ID of this reference.
        """
        return self.reference.idnum

    @property
    def generation(self):
        """
        :return: the generation number of this reference.
        """
        return self.reference.generation

    def __repr__(self):
        return "IndirectObject(%r, %r)" % (self.idnum, self.generation)

    # TODO I'm starting to think that making indirect objects hashable
    #  is a bad idea. Think about that for a bit, I might just be getting
    #  overly pedantic.
    def __hash__(self):
        return hash((self.idnum, self.generation))

    def __eq__(self, other):
        # only other indirect objects can be equal; the comparison itself
        # is delegated to the wrapped references
        if not isinstance(other, IndirectObject):
            return False
        return self.reference == other.reference

    def __ne__(self, other):
        return not self.__eq__(other)

    def write_to_stream(
        self,
        stream,
        handler: Optional['SecurityHandler'] = None,
        container_ref=None,
    ):
        """Write the reference as ``<idnum> <generation> R``."""
        stream.write(b"%d %d R" % (self.idnum, self.generation))

    @staticmethod
    def read_from_stream(stream, container_ref: 'Dereferenceable'):
        """
        Read an indirect reference of the form ``<idnum> <generation> R``.

        :param stream:
            A binary input stream.
        :param container_ref:
            Dereferenceable whose PDF handler is attached to the result.
        :raise PdfStreamError:
            If the stream ends while reading either number.
        :raise PdfReadError:
            If the ``R`` keyword is missing, if either number fails to parse,
            or if the numbers are out of range (the object ID must be
            positive and the generation non-negative).
        """

        def next_token() -> bytes:
            # skip leading whitespace, then collect bytes up to (and
            # consuming) the next whitespace byte
            collected = b""
            while True:
                byte = stream.read(1)
                if not byte:
                    # stream has truncated prematurely
                    raise PdfStreamError("Stream has ended unexpectedly")
                if not byte.isspace():
                    collected += byte
                elif collected:
                    return collected

        idnum_str = next_token()
        generation_str = next_token()
        keyword = read_non_whitespace(stream)
        if keyword != b"R":
            pos = hex(stream.tell())
            raise PdfReadError(
                "Error reading indirect object reference at byte %s" % pos
            )
        try:
            idnum, generation = int(idnum_str), int(generation_str)
            if idnum <= 0 or generation < 0:
                raise ValueError
        except ValueError:
            pos = hex(stream.tell())
            raise PdfReadError(
                f"Parse error on indirect object reference around {pos}"
            )
        return IndirectObject(
            idnum, generation, container_ref.get_pdf_handler()
        )
|
|
|
|
|
|
class FloatObject(decimal.Decimal, PdfObject):
    """
    PDF Float object.

    Internally, these are treated as decimals (and therefore actually
    fixed-point objects, to be precise).
    """

    def __new__(cls, value="0"):
        # going through str() means Python floats are converted via their
        # repr-style text form rather than their exact binary expansion
        return decimal.Decimal.__new__(cls, str(value))

    def __repr__(self):
        """
        Render the value in plain positional notation.

        This representation is also what :meth:`write_to_stream` emits, so
        it never uses exponent notation.
        """
        if self == self.to_integral():
            # whole number: drop the fractional part altogether
            return str(self.quantize(decimal.Decimal(1)))
        else:
            # str() switches to scientific notation for small magnitudes
            # (e.g. '1E-7'), so format explicitly as fixed-point instead
            return format(self, 'f')

    def as_numeric(self):
        """
        :return: a Python ``float`` value for this object.
        """
        return float(self)

    def write_to_stream(
        self,
        stream,
        handler: Optional['SecurityHandler'] = None,
        container_ref=None,
    ):
        """Write the ASCII form of ``repr(self)``; other arguments are unused."""
        stream.write(repr(self).encode('ascii'))
|
|
|
|
|
|
class NumberObject(int, PdfObject):
    """
    PDF number object. This is the PDF type for integer values.
    """

    # Matches the first byte that cannot be part of a numeric literal.
    # The hyphen is escaped on purpose: an unescaped '+-.' inside a character
    # class is a *range* that also covers ',' (0x2C).
    NumberPattern = re.compile(rb'[^+\-.0-9]')
    # Presence of this byte makes read_from_stream() return a FloatObject.
    ByteDot = b"."

    # noinspection PyArgumentList
    def __new__(cls, value):
        val = int(value)
        return int.__new__(cls, val)

    def as_numeric(self):
        """
        :return: a Python ``int`` value for this object.
        """
        return int(self)

    def write_to_stream(
        self,
        stream,
        handler: Optional['SecurityHandler'] = None,
        container_ref=None,
    ):
        """Write the ASCII form of ``repr(self)``; other arguments are unused."""
        stream.write(repr(self).encode('ascii'))

    @staticmethod
    def read_from_stream(stream):
        """
        Read a numeric literal from the stream.

        :param stream:
            A binary input stream.
        :return:
            A :class:`.FloatObject` if the literal contains a decimal point,
            and a :class:`.NumberObject` otherwise.
        """
        num = read_until_regex(
            stream,
            regex=NumberObject.NumberPattern,
            # for consistency with other read_object() output
            ignore_eof=True,
        )
        if num.find(NumberObject.ByteDot) != -1:
            return FloatObject(num.decode('ascii'))
        else:
            return NumberObject(num.decode('ascii'))
|
|
|
|
|
|
# TODO: not sure I like this behaviour of PyPDF2. Review.
|
|
|
|
|
|
def pdf_string(
    string: Union[str, bytes, bytearray]
) -> Union['ByteStringObject', 'TextStringObject']:
    """
    Wrap a string in a :class:`.TextStringObject` if possible,
    or in a :class:`.ByteStringObject` otherwise.

    Python strings are wrapped directly. Binary input is decoded using the
    encoding suggested by its byte order mark; if decoding fails, the bytes
    are kept as a byte string.

    :param string:
        A Python string, or a ``bytes``/``bytearray`` value.
    :raise TypeError:
        If the argument is of any other type.
    """
    if isinstance(string, str):
        return TextStringObject(string)
    if not isinstance(string, (bytes, bytearray)):
        raise TypeError("pdf_string should have str or bytes arg")

    encoding = _guess_enc_by_bom(string)
    try:
        decoded = encoding.decode(string)
    except UnicodeDecodeError:
        # not text in the guessed encoding: keep the raw bytes
        return ByteStringObject(string)
    text = TextStringObject(decoded)
    # remember how the text was decoded, so the original bytes can be rebuilt
    text.autodetected_encoding = encoding
    return text
|
|
|
|
|
|
HEX_DIGITS = b'0123456789abcdefABCDEF'


def read_hex_string_from_stream(
    stream,
) -> Union['ByteStringObject', 'TextStringObject']:
    """
    Read a hex string from a stream into a PDF string object.

    The first byte of the stream is discarded without being checked.
    Whitespace between digits is skipped, and an odd number of digits is
    padded with a trailing ``0``.

    :param stream:
        An input stream.
    :raise PdfStreamError:
        If a byte other than a hex digit or ``>`` is encountered.
    """
    # discard the opening delimiter
    stream.read(1)

    digits = []
    while True:
        tok = read_non_whitespace(stream)
        if tok == b">":
            break
        if tok not in HEX_DIGITS:
            raise PdfStreamError(
                "Unexpected token in hex string: " + repr(tok)
            )
        digits.append(tok)

    # a missing final digit is taken to be zero
    if len(digits) % 2:
        digits.append(b'0')
    return pdf_string(binascii.unhexlify(b''.join(digits)))
|
|
|
|
|
|
def _read_string_literal_bytes(stream) -> bytes:
|
|
stream.read(1)
|
|
parens = 1
|
|
txt = BytesIO()
|
|
while True:
|
|
tok = stream.read(1)
|
|
if not tok:
|
|
# stream has truncated prematurely
|
|
raise PdfStreamError("Stream has ended unexpectedly")
|
|
if tok == b"(":
|
|
parens += 1
|
|
elif tok == b")":
|
|
parens -= 1
|
|
if parens == 0:
|
|
break
|
|
elif tok == b"\\":
|
|
tok = stream.read(1)
|
|
if tok in b"() /%<>[]#_&$\\":
|
|
pass # simply use the second byte we read
|
|
elif tok == b"n":
|
|
tok = b"\n"
|
|
elif tok == b"r":
|
|
tok = b"\r"
|
|
elif tok == b"t":
|
|
tok = b"\t"
|
|
elif tok == b"b":
|
|
tok = b"\b"
|
|
elif tok == b"f":
|
|
tok = b"\f"
|
|
elif tok.isdigit():
|
|
# "The number ddd may consist of one, two, or three
|
|
# octal digits; high-order overflow shall be ignored.
|
|
# Three octal digits shall be used, with leading zeros
|
|
# as needed, if the next character of the string is also
|
|
# a digit." (PDF reference 7.3.4.2, p 16)
|
|
for i in range(2):
|
|
ntok = stream.read(1)
|
|
if ntok.isdigit():
|
|
tok += ntok
|
|
else:
|
|
# premature end, seek back
|
|
stream.seek(-1, os.SEEK_CUR)
|
|
break
|
|
octal = int(tok, base=8)
|
|
# interpret as byte
|
|
tok = bytes((octal,))
|
|
elif tok in b"\n\r":
|
|
# This case is hit when a backslash followed by a line
|
|
# break occurs. If it's a multi-char EOL, consume the
|
|
# second character:
|
|
tok = stream.read(1)
|
|
if tok not in b"\n\r":
|
|
stream.seek(-1, os.SEEK_CUR)
|
|
# Then don't add anything to the actual string, since this
|
|
# line break was escaped:
|
|
tok = b''
|
|
else:
|
|
raise PdfReadError("Unexpected escaped string: " + repr(tok))
|
|
txt.write(tok)
|
|
return txt.getvalue()
|
|
|
|
|
|
def read_string_from_stream(
    stream,
) -> Union['ByteStringObject', 'TextStringObject']:
    """
    Read a PDF string literal from a stream.

    The raw content is handed to :func:`pdf_string`, which attempts to decode
    it into a text string by autodetecting the encoding and, failing that,
    returns it as a byte string instead.

    :param stream:
        An input stream.
    """

    raw_content = _read_string_literal_bytes(stream)
    return pdf_string(raw_content)
|
|
|
|
|
|
class ByteStringObject(bytes, PdfObject):
    """PDF bytestring class."""

    original_bytes = property(lambda self: self)
    """
    For compatibility with :attr:`.TextStringObject.original_bytes`
    """

    def write_to_stream(
        self,
        stream,
        handler: Optional['SecurityHandler'] = None,
        container_ref=None,
    ):
        """
        Write this byte string as a hexadecimal string ``<...>``.

        :param stream:
            An output stream.
        :param handler:
            Security handler. If both this and `container_ref` are given,
            the content is encrypted with the handler's string filter first.
        :param container_ref:
            Reference whose object ID and generation number are used to
            derive the encryption key.
        """
        payload: bytes = self
        if not (handler is None or container_ref is None):
            string_filter = handler.get_string_filter()
            object_key = string_filter.derive_object_key(
                container_ref.idnum, container_ref.generation
            )
            payload = string_filter.encrypt(object_key, payload)
        stream.write(b"<")
        stream.write(binascii.hexlify(payload))
        stream.write(b">")
|
|
|
|
|
|
class TextStringEncoding(enum.Enum):
    """
    Encodings for PDF text strings.

    Except for :attr:`PDF_DOC`, the value of each member is a pair consisting
    of the byte order mark and the name of the Python codec.
    """

    PDF_DOC = None
    """
    PDFDocEncoding (one-byte character codes; PDF-specific).
    """

    UTF16BE = (codecs.BOM_UTF16_BE, 'utf-16be')
    """
    UTF-16BE encoding.
    """

    UTF8 = (codecs.BOM_UTF8, 'utf-8')
    """
    UTF-8 encoding (PDF 2.0)
    """

    UTF16LE = (codecs.BOM_UTF16_LE, 'utf-16le')
    """
    UTF-16LE encoding.

    .. note::
        This is strictly speaking invalid in PDF 2.0, but some authoring tools
        output such strings anyway (presumably due to the fact that it's the
        default wide character encoding on Windows).
    """

    def encode(self, string: str) -> bytes:
        """
        Encode a string with BOM.

        :param string:
            The string to encode.
        :return:
            The encoded string. For the Unicode encodings, the byte order
            mark is prepended.
        """
        if self is TextStringEncoding.PDF_DOC:
            return encode_pdfdocencoding(string)
        bom, codec_name = self.value
        return bom + string.encode(codec_name)

    def decode(self, string: Union[bytes, bytearray]) -> str:
        """
        Decode a string with BOM.

        :param string:
            The byte string to decode.
        :return:
            The decoded text.
        :raise UnicodeDecodeError:
            Raised if decoding fails.
        """
        if self is TextStringEncoding.PDF_DOC:
            return decode_pdfdocencoding(string)
        if self is TextStringEncoding.UTF8:
            # the '-sig' codec drops the leading BOM
            return string.decode('utf-8-sig')
        # both UTF-16 variants: the generic codec uses the BOM to pick the
        # byte order
        return string.decode('utf-16')
|
|
|
|
|
|
def _guess_enc_by_bom(encoded: Union[bytes, bytearray]) -> TextStringEncoding:
    """
    Guess the encoding of a PDF string from its byte order mark.

    :param encoded:
        The raw bytes of the string.
    :return:
        The encoding whose BOM the data starts with, or
        :attr:`.TextStringEncoding.PDF_DOC` if no known BOM is present.
    """
    # checked in this order; the first matching BOM wins
    candidates = (
        (codecs.BOM_UTF16_BE, TextStringEncoding.UTF16BE),
        (codecs.BOM_UTF16_LE, TextStringEncoding.UTF16LE),
        (codecs.BOM_UTF8, TextStringEncoding.UTF8),
    )
    for bom, encoding in candidates:
        if encoded.startswith(bom):
            return encoding
    # This is probably a big performance hit here, but we need to
    # convert string objects into the text/unicode-aware version if
    # possible... and the only way to check if that's possible is
    # to try. Some strings are strings, some are just byte arrays.
    return TextStringEncoding.PDF_DOC
|
|
|
|
|
|
class TextStringObject(str, PdfObject):
    """
    PDF text string object.
    """

    autodetected_encoding: Optional[TextStringEncoding] = None
    """
    Autodetected encoding when parsing the file.
    """

    force_output_encoding: Optional[TextStringEncoding] = None
    """
    Output encoding to use when serialising the string.
    The default is to try PDFDocEncoding first, and fall back to UTF-16BE.
    """

    @property
    def original_bytes(self):
        """
        Retrieve the original bytes of the string as specified in the
        source file.

        This may be necessary if this string was misidentified as a text string.

        :raise PdfError:
            If no autodetected encoding was recorded for this string.
        """

        # We're a text string object, but the library is trying to get our raw
        # bytes. This can happen if we auto-detected this string as text, but
        # we were wrong. It's pretty common. Re-encode with the encoding that
        # was detected, to get back the bytes this object was created from.
        detected = self.autodetected_encoding
        if not detected:
            raise PdfError("No information about original bytes")
        return detected.encode(self)

    def write_to_stream(
        self,
        stream,
        handler: Optional['SecurityHandler'] = None,
        container_ref=None,
    ):
        """
        Serialise this text string to an output stream.

        Without encryption, the output is a literal string ``(...)`` in which
        every byte other than a space or an alphanumeric character is written
        as a three-digit octal escape. With encryption, the encrypted bytes
        are written as a hexadecimal string.

        :param stream:
            An output stream.
        :param handler:
            Security handler. Encryption is applied only if both this and
            `container_ref` are given, and the handler's string filter is
            not the identity filter.
        :param container_ref:
            Reference whose object ID and generation number are used to
            derive the encryption key.
        """
        encoded: bytes
        forced = self.force_output_encoding
        if forced is None:
            # Try to write the string out as a PDFDocEncoding encoded string.
            # It's nicer to look at in the PDF file. Sadly, we take a
            # performance hit here for trying...
            try:
                encoded = encode_pdfdocencoding(self)
            except UnicodeEncodeError:
                # fall back to UTF-16BE by default, since it's the only
                # valid pre-2.0 Unicode encoding.
                encoded = codecs.BOM_UTF16_BE + self.encode("utf-16be")
        else:
            encoded = forced.encode(self)

        string_filter = None
        if handler is not None and container_ref is not None:
            filter_name = handler.crypt_filter_config.string_filter_name
            # apply default processing if the filter is the identity filter
            if filter_name != '/Identity':
                string_filter = handler.get_string_filter()

        if string_filter is None:
            stream.write(b"(")
            for byte_value in encoded:
                single = bytes([byte_value])
                if byte_value == 0x20 or single.isalnum():
                    stream.write(single)
                else:
                    stream.write(b"\\%03o" % byte_value)
            stream.write(b")")
            return

        object_key = string_filter.derive_object_key(
            container_ref.idnum, container_ref.generation
        )
        encoded = string_filter.encrypt(object_key, encoded)
        # already encrypted, so write without handler/container_ref
        ByteStringObject(encoded).write_to_stream(stream)
|
|
|
|
|
|
def _as_hex_digit(ascii_char):
|
|
if 0x30 <= ascii_char <= 0x39:
|
|
return ascii_char - 0x30
|
|
elif 0x41 <= ascii_char <= 0x46:
|
|
return ascii_char - 0x37
|
|
elif 0x61 <= ascii_char <= 0x66:
|
|
return ascii_char - 0x57
|
|
else:
|
|
raise PdfReadError(
|
|
"Numeric escape in PDF name must use hexadecimal digits"
|
|
)
|
|
|
|
|
|
def _decode_name(name_bytes: bytes) -> 'NameObject':
    """
    Decode the bytes that make up a name object (minus the initial /), expanding
    all escapes along the way.

    :param name_bytes:
        The raw bytes of the name, without the leading slash.
    :return:
        A :class:`.NameObject` that includes the leading slash.
    :raise PdfReadError:
        If an escape is cut short or uses non-hexadecimal digits, or if a
        byte that requires escaping appears unescaped.
    """
    unescaped = BytesIO()
    unescaped.write(b'/')
    remaining = iter(name_bytes)
    for cur_byte in remaining:
        if cur_byte == 0x23:  # '#' is the 2-digit escape prefix
            # escape sequence: grab next two bytes
            try:
                high = next(remaining)
                low = next(remaining)
            except StopIteration:
                raise PdfReadError(
                    f"Unterminated escape in PDF name /{repr(name_bytes)}"
                )
            cur_byte = _as_hex_digit(high) * 16 + _as_hex_digit(low)
        elif (
            cur_byte < 0x21
            or cur_byte > 0x7E
            or not is_regular_character(cur_byte)
        ):
            raise PdfReadError(
                f"Byte (0x{cur_byte:02x}) must be escaped in a PDF name"
            )
        unescaped.write(bytes((cur_byte,)))

    raw_name = unescaped.getvalue()
    # NOTE: we assume UTF-8, but the PDF spec actually doesn't prescribe
    # a character encoding for names, they're just byte sequences.
    # This doesn't matter in 99.99% of cases (since names are not supposed
    # to contain renderable text, and are typically 7-bit ASCII anyhow),
    # but it's not 100% correct. I don't see a way to fix this without causing
    # massive non-obvious API breakage (since NameObject inherits from 'str' as
    # in PyPDF2), i.e. the correctness benefit is vastly outweighed by the
    # risks (for now)
    try:
        name_str = raw_name.decode('utf8')
    except ValueError:
        # latin1 should never trigger decoding errors, since Python's
        # implementation maps even unassigned values to corresponding
        # unicode codepoints
        name_str = raw_name.decode('latin1')
    return NameObject(name_str)
|
|
|
|
|
|
class NameObject(str, PdfObject):
    """
    PDF name object. These are valid Python strings, but names and strings
    are treated differently in the PDF specification, so proper care is
    required.
    """

    def write_to_stream(
        self,
        stream,
        handler: Optional['SecurityHandler'] = None,
        container_ref=None,
    ):
        """
        Serialise this name to an output stream.

        The name is encoded as UTF-8. Every byte after the leading slash that
        is ``#``, lies outside the range ``0x21-0x7E``, or is not a regular
        character is written as a ``#xx`` escape with exactly two
        hexadecimal digits.

        :param stream:
            An output stream.
        :param handler:
            Not used.
        :param container_ref:
            Not used.
        :raise PdfWriteError:
            If the name is empty or does not start with ``/``.
        """
        byte_iter = iter(self.encode('utf8'))
        # the default covers the empty name, which would otherwise surface
        # as a bare StopIteration
        if next(byte_iter, None) != 0x2F:
            raise PdfWriteError(
                f"Could not serialise name object {repr(self)}, "
                f"must start with /"
            )
        stream.write(b'/')
        for cur_byte in byte_iter:
            if (
                cur_byte == 0x23
                or not (0x21 <= cur_byte <= 0x7E)
                or not is_regular_character(cur_byte)
            ):
                # the escape must always be two digits wide; bytes below
                # 0x10 need zero padding to stay readable
                stream.write('#{:02X}'.format(cur_byte).encode('ascii'))
            else:
                # no convenient syntax for writing a single byte...
                as_bytes = bytes((cur_byte,))
                stream.write(as_bytes)

    @staticmethod
    def read_from_stream(stream):
        """
        Read a name object from the stream, which must be positioned on the
        leading slash.

        :raise PdfReadError:
            If the first byte is not ``/``.
        """
        name_start = stream.read(1)
        if name_start != b'/':
            raise PdfReadError("Name object should start with /")
        name_bytes = read_until_delimiter(stream)
        return _decode_name(name_bytes)
|
|
|
|
|
|
def _normalise_key(key):
    """
    Coerce a dictionary key into a :class:`.NameObject`.

    :param key:
        A :class:`.NameObject` (returned as-is) or any other Python string
        (wrapped in a :class:`.NameObject`).
    :raise ValueError:
        If the key is not a string.
    """
    if isinstance(key, NameObject):
        return key
    if isinstance(key, str):
        return NameObject(key)
    raise ValueError("key must be a name object")
|
|
|
|
|
|
class DictionaryObject(dict, PdfObject):
    """
    A PDF dictionary object.

    Keys in a PDF dictionary are PDF names, and values are PDF objects.

    When accessing a key using the standard :meth:`__getitem__` syntax,
    :class:`.IndirectObject` references will be resolved.
    """

    def __init__(self, dict_data=None):
        """
        :param dict_data:
            Optional mapping with the initial entries. Keys are normalised
            to :class:`.NameObject`; values are stored as-is.
        """
        if dict_data is not None:
            super().__init__(
                {_normalise_key(k): v for k, v in dict_data.items()}
            )
        else:
            super().__init__()

    def raw_get(
        self,
        key: Union[NameObject, str],
        decrypt: EncryptedObjAccess = EncryptedObjAccess.TRANSPARENT,
    ):
        """
        .. versionchanged:: 0.14.0

            ``decrypt`` parameter is no longer boolean

        Get a value from a dictionary without dereferencing.
        In other words, if the value corresponding to the given key is of type
        :class:`.IndirectObject`, the indirect reference will not be resolved.

        :param key:
            Key to look up in the dictionary.
        :param decrypt:
            What to do when retrieving encrypted objects; see
            :class:`.EncryptedObjAccess`. The default is
            :attr:`.EncryptedObjAccess.TRANSPARENT`.
        :return:
            A :class:`.PdfObject`.
        :raises KeyError:
            If the key is not present.
        """
        val = dict.__getitem__(self, key)
        return _deproxy_decrypt(val, decrypt)

    def __setitem__(self, key, value):
        """
        Store ``value`` under ``key``.

        The key is normalised to a :class:`.NameObject`. If this dictionary
        has a container reference, it is propagated to the value.

        :raises ValueError:
            If the key cannot be normalised, or the value is not a
            :class:`.PdfObject`.
        """
        key = _normalise_key(key)
        if not isinstance(value, PdfObject):
            raise ValueError("value must be PdfObject")
        if self.container_ref is not None:
            value.container_ref = self.container_ref
        return dict.__setitem__(self, key, value)

    def setdefault(self, key, value=None):
        """
        Like :meth:`dict.setdefault`, with the same key normalisation and
        value validation as :meth:`__setitem__`.

        Note that the value is validated even if the key is already present,
        so the default ``value=None`` always results in an error.

        :raises ValueError:
            If the key cannot be normalised, or the value is not a
            :class:`.PdfObject`.
        """
        key = _normalise_key(key)
        if not isinstance(value, PdfObject):
            raise ValueError("value must be PdfObject")
        if self.container_ref is not None:
            value.container_ref = self.container_ref
        return dict.setdefault(self, key, value)

    def __getitem__(self, key):
        """
        Look up ``key`` and dereference the result.

        An indirect ``/Metadata`` entry is fetched through the PDF handler
        of the reference with ``as_metadata_stream=True``; all other values
        are resolved with ``get_object()``.

        :raises KeyError:
            If the key is absent, or if the value resolves to a
            :class:`.NullObject`.
        """
        raw_obj = dict.__getitem__(self, key)
        if key == '/Metadata' and isinstance(raw_obj, IndirectObject):
            from pyhanko.pdf_utils.rw_common import PdfHandler

            handler = raw_obj.get_pdf_handler()
            assert isinstance(handler, PdfHandler)
            return handler.get_object(
                raw_obj.reference, as_metadata_stream=True
            )
        else:
            deref_obj = raw_obj.get_object()
            if isinstance(deref_obj, NullObject):
                raise KeyError(key)
            else:
                return deref_obj

    def get_and_apply(
        self,
        key,
        function: Callable[[PdfObject], Any],
        *,
        raw=False,
        default=None,
    ):
        """
        Fetch the value for ``key`` and pass it through ``function``.

        :param key:
            Key to look up.
        :param function:
            Callable applied to the value that was found.
        :param raw:
            If ``True``, use :meth:`raw_get` instead of :meth:`__getitem__`
            to retrieve the value.
        :param default:
            Value returned (without applying ``function``) when the lookup
            raises :class:`KeyError`.
        :return:
            ``function(value)``, or ``default`` if the key was not found.
        """
        try:
            value = self.raw_get(key) if raw else self[key]
        except KeyError:
            return default
        return function(value)

    def get_value_as_reference(self, key, optional=False) -> Reference:
        """
        Retrieve the reference that the (raw) value for ``key`` points to.

        :param key:
            Key to look up.
        :param optional:
            If ``True``, return ``None`` when the key is absent instead of
            raising.
        :return:
            The ``reference`` attribute of the indirect object stored under
            ``key``.
        :raises KeyError:
            If the key is absent and ``optional`` is ``False``.
        :raises IndirectObjectExpected:
            If the value is not an :class:`.IndirectObject`.
        """

        def as_ref(obj):
            if isinstance(obj, IndirectObject):
                return obj.reference
            raise IndirectObjectExpected

        value = self.get_and_apply(key, as_ref, raw=True)
        if value is None and not optional:
            # include the offending key, as a regular dict lookup would
            raise KeyError(key)
        return value

    def write_to_stream(
        self,
        stream,
        handler: Optional['SecurityHandler'] = None,
        container_ref=None,
    ):
        """
        Serialise the dictionary as ``<<``, one ``key value`` pair per line,
        and a closing ``>>``.

        ``handler`` and ``container_ref`` are passed on unchanged to the
        ``write_to_stream`` methods of all keys and values.
        """
        stream.write(b"<<\n")
        # iterate over a snapshot of the entries
        for key, value in list(self.items()):
            key.write_to_stream(stream, handler, container_ref)
            stream.write(b" ")
            value.write_to_stream(stream, handler, container_ref)
            stream.write(b"\n")
        stream.write(b">>")

    @staticmethod
    def read_from_stream(
        stream,
        container_ref: 'Dereferenceable',
        as_metadata_stream: bool = False,
    ):
        """
        Parse a dictionary, or a stream object if the dictionary is followed
        by the ``stream`` keyword.

        :param stream:
            Seekable binary stream positioned at the opening ``<<``.
        :param container_ref:
            Reference to the containing object; it is passed on to
            ``read_object`` and supplies the PDF handler.
        :param as_metadata_stream:
            If ``True`` and stream data is present, try to instantiate the
            result as a ``MetadataStream`` (falling back to
            :class:`.StreamObject` if that class cannot be imported).
        :return:
            A :class:`.StreamObject` (or subclass) if stream data follows the
            dictionary, otherwise a :class:`.DictionaryObject`.
        :raises PdfReadError:
            If the input doesn't start with ``<<``, if a stream dictionary
            has no ``/Length`` entry, or if the ``endstream`` marker cannot
            be located.
        :raises PdfStrictReadError:
            If a key is defined more than once and the handler is strict.
        """
        tmp = stream.read(2)
        if tmp != b"<<":
            raise PdfReadError(
                "Dictionary read error at byte %s: "
                "stream must begin with '<<'" % hex(stream.tell())
            )
        data = {}
        handler = container_ref.get_pdf_handler()
        while True:
            tok = read_non_whitespace(stream)
            if tok == b">":
                # consume the second '>' of the closing delimiter
                stream.read(1)
                break
            # un-read the first byte of the key
            stream.seek(-1, os.SEEK_CUR)
            key = read_object(stream, container_ref)
            read_non_whitespace(stream)
            stream.seek(-1, os.SEEK_CUR)
            value = read_object(stream, container_ref)
            if key not in data:
                data[key] = value
            else:
                # the first definition wins; later ones are only reported
                err = (
                    "Multiple definitions in dictionary at byte "
                    "%s for key %s" % (hex(stream.tell()), key)
                )
                if handler.strict:
                    raise PdfStrictReadError(err)
                else:
                    logger.warning(err)

        pos = stream.tell()
        s = read_non_whitespace(stream, allow_eof=True)
        stream_data = None
        if s == b's' and stream.read(5) == b'tream':
            # odd PDF file output has spaces after 'stream' keyword
            # but before EOL. Original PyPDF2 patch provided by Danial Sandler,
            # modified by Matthias Valvekens
            skip_over_whitespace(stream, stop_after_eol=True)
            # this is a stream object, not a dictionary
            try:
                length = data[pdf_name("/Length")]
            except KeyError:
                # report malformed input as a read error rather than leaking
                # a bare KeyError from the internal bookkeeping dict
                raise PdfReadError(
                    "Stream object at byte %s has no /Length entry."
                    % hex(stream.tell())
                )
            if isinstance(length, IndirectObject):
                # resolving the reference may move the stream position,
                # so save it and restore it afterwards
                t = stream.tell()
                length = handler.get_object(length)
                stream.seek(t)
            stream_data = stream.read(length)
            e = read_non_whitespace(stream)
            ndstream = stream.read(8)
            if (e + ndstream) != b"endstream":
                # (sigh) - the odd PDF file has a length that is too long, so
                # we need to read backwards to find the "endstream" ending.
                # ReportLab (unknown version) generates files with this bug,
                # and Python users into PDF files tend to be our audience.
                # we need to do this to correct the streamdata and chop off
                # an extra character.
                orig_endstream_pos = stream.tell()
                stream.seek(-10, os.SEEK_CUR)
                end = stream.read(9)
                if end == b"endstream":
                    # we found it by looking back one character further.
                    stream_data = stream_data[:-1]
                else:
                    raise PdfReadError(
                        "Unable to find 'endstream' marker after "
                        "stream at byte %s." % hex(orig_endstream_pos)
                    )
        else:
            # not a stream: rewind to just after the dictionary
            stream.seek(pos)
        if stream_data is not None:
            # pass in everything as encoded data, the StreamObject class
            # will take care of decoding as necessary
            stm_cls = StreamObject
            if as_metadata_stream:
                try:
                    # noinspection PyUnresolvedReferences
                    from pyhanko.pdf_utils.metadata.xmp_xml import (
                        MetadataStream,
                    )

                    stm_cls = MetadataStream
                except ImportError:  # pragma: nocover
                    pass
            return stm_cls(data, encoded_data=stream_data)
        else:
            return DictionaryObject(data)
|
|
|
|
|
|
class StreamObject(DictionaryObject):
    """
    PDF stream object.

    Essentially, a PDF stream is a dictionary object with a binary blob of
    data attached. This data can be encoded by various filters (not all of which
    are currently supported, see :mod:`.filters`).

    A stream object can be initialised with encoded or decoded data.
    The former is used by :class:`.reader.PdfFileReader` to provide on-demand
    decoding, with :class:`.writer.BasePdfFileWriter` and its subclasses working
    the other way around.

    .. note::
        The :class:`.StreamObject` class manages some of its dictionary
        keys by itself. This is partly the case for the various ``/Filter``
        and ``/DecodeParms`` entries, but also for the ``/Length`` entry.
        The latter will be overwritten as necessary.

    :param dict_data:
        The dictionary data for this stream object.
    :param stream_data:
        The (unencoded) stream data.
    :param encoded_data:
        The encoded stream data.

        .. warning::
            Ordinarily, a stream can be initialised either from decoded or
            from encoded data.

            If both `stream_data` and `encoded_data` are provided, the caller
            is responsible for making sure that both are compatible given the
            currently relevant filter configuration.
    :param handler:
        A reference to the currently active
        :class:`.pyhanko.pdf_utils.crypt.SecurityHandler`.
        This is only necessary if the stream requires crypt filters.
    """

    def __init__(
        self,
        dict_data: Optional[dict] = None,
        stream_data: Optional[bytes] = None,
        encoded_data: Optional[bytes] = None,
        handler: Optional['SecurityHandler'] = None,
    ):
        super().__init__(dict_data)
        self._data = stream_data
        self._encoded_data = encoded_data
        self._handler = handler

    def _implicit_decrypt_stream_content(
        self, handler, ref: Reference, decrypted_entries: dict
    ):
        """
        Internal method to handle decrypting streams that are encrypted
        with the document's default encryption handler for streams and/or
        embedded files (i.e. not with any custom crypt filters).

        This routine is called deep in the object fetching stack, and you should
        never invoke it yourself. It's defined as a method in
        :class:`.StreamObject` because it needs to be able to preserve the
        type (subclass) of the stream object on which it is called, in order
        to properly feed into the logic surrounding metadata streams.

        :param handler:
            The security handler to decrypt with.
        :param ref:
            Reference of the object containing this stream; its object and
            generation numbers are used to derive the decryption key.
        :param decrypted_entries:
            Dictionary entries to use for the resulting stream object.
        :return:
            A new instance of the same class as this stream.
        """

        if handler is not None:
            self._handler = handler
        # can't deal with crypt filters here
        if self._has_crypt_filter:
            # in this case, dealing with encryption is delegated
            # to the stream decoding process, so just pretend the data
            # is decrypted.
            # We pass a reference to the security handler below,
            # which is sufficient to take care of /Crypt filters
            # in the stream.
            decrypted_data = self.encoded_data
        else:
            if self.is_embedded_file_stream:
                cf = handler.get_embedded_file_filter()
            else:
                cf = handler.get_stream_filter()
            local_key = cf.derive_object_key(ref.idnum, ref.generation)
            decrypted_data = cf.decrypt(local_key, self.encoded_data)

        return self.__class__(
            decrypted_entries, encoded_data=decrypted_data, handler=handler
        )

    @property
    def _has_crypt_filter(self) -> bool:
        """Whether ``/Crypt`` occurs among the filters of this stream."""
        return '/Crypt' in (name for name, _ in self._filters())

    def add_crypt_filter(
        self,
        name=NameObject('/Identity'),
        params=None,
        handler: Optional['SecurityHandler'] = None,
    ):
        """
        Apply a ``/Crypt`` filter to this stream.

        :param name:
            Name of the crypt filter; it must occur in the crypt filter
            configuration of the security handler.
        :param params:
            Optional dictionary of decode parameters. The ``/Type`` and
            ``/Name`` entries are set by this method (on the object passed
            in, if there is one).
        :param handler:
            Security handler to associate with this stream. If ``None``,
            the handler that is already attached to the stream is used.
        :raises PdfStreamError:
            If no security handler is available, or if the handler does not
            know the crypt filter ``name``.
        """
        if handler is not None:
            self._handler = handler

        if self._handler is None:
            raise PdfStreamError("There is no security handler around")

        if name not in self._handler.crypt_filter_config:
            raise PdfStreamError(
                f"The crypt filter {name} is not known to the security handler."
            )
        params = params or DictionaryObject()
        params['/Type'] = pdf_name('/CryptFilterDecodeParms')
        params['/Name'] = name
        self.apply_filter(
            pdf_name('/Crypt'), params=params, allow_duplicates=True
        )

    def _filters(self) -> Iterator[Tuple[str, Optional[dict]]]:
        """
        Iterate over the filters of this stream in decoding order.

        Yields ``(filter_name, decode_params)`` pairs; missing decode
        parameters are reported as :class:`.NullObject`. Nothing is yielded
        if the stream has no ``/Filter`` entry.

        :raises PdfStreamError:
            If ``/Filter`` is neither a name nor an array.
        """
        try:
            filter_arr = self[pdf_name('/Filter')]
        except KeyError:
            return

        if isinstance(filter_arr, NameObject):
            # we have a single filter instance
            filter_arr = (filter_arr,)
        elif not isinstance(filter_arr, ArrayObject):
            raise PdfStreamError(
                '/Filter should be a name object or an array of names.'
            )

        try:
            decode_params = self[pdf_name('/DecodeParms')]
        except KeyError:
            decode_params = [NullObject()] * len(filter_arr)
        else:
            if isinstance(decode_params, DictionaryObject):
                # one instance
                decode_params = [decode_params]
            if isinstance(decode_params, list):
                lendiff = len(filter_arr) - len(decode_params)
                # this should be zero, but let's be lenient
                if lendiff > 0:
                    # Pad a copy: an in-place "+=" would extend the array
                    # stored in the stream dictionary as a side effect of
                    # merely listing the filters.
                    decode_params = (
                        list(decode_params) + [NullObject()] * lendiff
                    )

        # make sure to deal with resolving decrypted object proxies by
        # calling get_object()
        yield from zip(
            filter_arr, (param_set.get_object() for param_set in decode_params)
        )

    def _stream_decoders(self):
        """
        Iterate over ``(decoder, params)`` pairs for the filters of this
        stream, in decoding order.

        ``/Crypt`` filters that refer to the identity filter are skipped;
        other ``/Crypt`` filters are handled by a ``CryptFilterDecoder``
        bound to the security handler of this stream.

        :raises PdfStreamError:
            If a non-identity ``/Crypt`` filter is present but the stream
            has no security handler.
        :raises NotImplementedError:
            If the decoder lookup for a filter fails with a ``KeyError``.
        """
        from . import filters

        for filter_type, params in self._filters():
            try:
                if params is None or isinstance(params, NullObject):
                    params = {}
                if filter_type == '/Crypt':
                    # crypt filters get special treatment
                    # if we're dealing with the identity filter, just move on
                    if params.get('/Name', '/Identity') == '/Identity':
                        continue
                    # if it's another one, we need a reference to the security
                    # handler
                    sh = self._handler
                    if sh is None:
                        raise PdfStreamError(
                            "PDF streams require a security handler to use "
                            "explicit /Crypt filters."
                        )
                    decoder = filters.CryptFilterDecoder(sh)
                else:
                    decoder = filters.get_generic_decoder(filter_type)
                yield decoder, params
            except KeyError:
                raise NotImplementedError(
                    "Filters of type %s are not supported." % filter_type
                )

    def strip_filters(self):
        """
        Ensure the stream is decoded, and remove any filters.
        """

        self._data = self._encoded_data = self.data
        self.pop(pdf_name('/Filter'), None)
        self.pop(pdf_name('/DecodeParms'), None)

    @property
    def data(self) -> bytes:
        """
        Return the decoded stream data as bytes.
        If the stream hasn't been decoded yet, it will be decoded on-the-fly.

        :raises .misc.PdfStreamError:
            If the stream could not be decoded.
        """
        if self._data is None:
            data = self._encoded_data
            if data is None:
                raise PdfStreamError("No data available.")
            for filter_cls, decode_params in self._stream_decoders():
                data = filter_cls.decode(data, decode_params)
            if isinstance(data, memoryview):
                data = data.tobytes()
            self._data = data
        assert self._data is not None
        return self._data

    @property
    def encoded_data(self) -> bytes:
        """
        Return the encoded stream data as bytes.
        If the stream hasn't been encoded yet, it will be encoded on-the-fly.

        :raises .misc.PdfStreamError:
            If the stream could not be encoded.
        """
        if self._encoded_data is None:
            data = self._data
            if data is None:
                raise PdfStreamError("No data available.")
            decoders = tuple(self._stream_decoders())
            # encoding happens in the reverse of the decoding order
            for filter_cls, decode_params in reversed(decoders):
                data = filter_cls.encode(data, decode_params)
            self._encoded_data = data
        assert self._encoded_data is not None
        return self._encoded_data

    def apply_filter(
        self, filter_name, params=None, allow_duplicates: Optional[bool] = True
    ):
        """
        Apply a new filter to this stream. This filter will be prepended
        to any existing filters.
        This means that it is placed *last* in the encoding order, but *first*
        in the decoding order.

        *Note:* Calling this method on an encoded stream will first cause the
        stream to be decoded using the filters already present.
        The cached value for the encoded stream data will be cleared.

        :param filter_name:
            Name of the filter
            (see :const:`~pyhanko.pdf_utils.filters.DECODERS`)
        :param params:
            Parameters to the filter (will be written to ``/DecodeParms`` if
            not ``None``)
        :param allow_duplicates:
            If ``None``, silently ignore duplicate filters.
            If ``False``, raise :class:`.PdfWriteError` when attempting to
            add a duplicate filter.
            If ``True`` (default), duplicate filters are allowed.
        :raises PdfWriteError:
            If the filter is already present and ``allow_duplicates`` is
            ``False``.
        """
        # If the stream already contains (encoded) data, we have to reencode it
        # later on, which requires a decoding operation.
        data = self._data
        if data is None and self._encoded_data is not None:
            data = self.data

        # ... and list all current filters with their parameters.
        cur_filters = list(self._filters())
        # normalise the input parameters
        if not isinstance(filter_name, NameObject):
            filter_name = pdf_name(filter_name)
        if params is not None and not isinstance(params, DictionaryObject):
            params = DictionaryObject(params)
        if not cur_filters:
            # only one filter, so don't write arrays
            self[pdf_name('/Filter')] = filter_name
            if params:
                self[pdf_name('/DecodeParms')] = params
        else:
            # FIXME deal with shortened names for standard filters
            # split cur_filters back into two pieces
            filter_names, param_sets = zip(*cur_filters)
            if not allow_duplicates and filter_name in filter_names:
                if allow_duplicates is False:
                    raise PdfWriteError(
                        f'Filter {filter_name} has already been applied to '
                        f'this stream.'
                    )
                else:
                    # Silently ignore
                    return

            # prepend the new filter (order is important!)
            self[pdf_name('/Filter')] = ArrayObject(
                (filter_name,) + filter_names
            )

            if params or any(param_sets):

                def _params():
                    yield params or NullObject()
                    for param_set in param_sets:
                        yield param_set or NullObject()

                self[pdf_name('/DecodeParms')] = ArrayObject(_params())
        # drop the stale encoded form; it is recomputed on demand
        self._encoded_data = None
        self._data = data

    def compress(self):
        """
        Convenience method to add a ``/FlateDecode`` filter with default
        settings, if one is not already present.

        *Note:* compression is not actually applied until the stream is written.
        """
        self.apply_filter(pdf_name('/FlateDecode'), allow_duplicates=None)

    @property
    def is_embedded_file_stream(self):
        """
        Whether the raw ``/Type`` entry of this stream equals
        ``/EmbeddedFile``; ``False`` if there is no ``/Type`` entry.
        """
        try:
            return self.raw_get('/Type') == '/EmbeddedFile'
        except KeyError:
            return False

    def write_to_stream(
        self,
        stream,
        handler: Optional['SecurityHandler'] = None,
        container_ref=None,
    ):
        """
        Serialise the stream dictionary followed by the encoded stream data.

        If both ``handler`` and ``container_ref`` are supplied and the stream
        has no ``/Crypt`` filter, the encoded data is encrypted with the
        stream filter of the handler, using a key derived from
        ``container_ref``.

        The ``/Length`` entry is set for the duration of the dictionary
        serialisation only, and removed again afterwards.
        """
        data = self.encoded_data
        if (
            handler is not None
            and container_ref is not None
            and not self._has_crypt_filter
        ):
            cf = handler.get_stream_filter()
            local_key = cf.derive_object_key(
                container_ref.idnum, container_ref.generation
            )
            data = cf.encrypt(local_key, data)
        self[NameObject("/Length")] = NumberObject(len(data))
        try:
            # write the dictionary
            super().write_to_stream(stream, handler, container_ref)
        finally:
            # never leave the temporary /Length entry behind, not even
            # when serialising the dictionary fails
            del self["/Length"]
        stream.write(b"\nstream\n")
        stream.write(data)
        stream.write(b"\nendstream")
|
|
|
|
|
|
def encode_pdfdocencoding(unicode_string):
    """
    Encode a string using the PDFDocEncoding translation table.

    :param unicode_string:
        The string to encode.
    :return:
        A ``bytes`` object with one byte per input character.
    :raises UnicodeEncodeError:
        If the string contains a character that has no code in the
        translation table (this includes ``'\\u0000'``).
    """

    def _build():
        for c in unicode_string:
            try:
                yield _pdfDocEncoding_rev[c]
            except KeyError:
                raise UnicodeEncodeError(
                    "pdfdocencoding",
                    c,
                    -1,
                    -1,
                    "does not exist in translation table",
                )

    return bytes(_build())


def decode_pdfdocencoding(byte_array):
    """
    Decode a byte sequence using the PDFDocEncoding translation table.

    :param byte_array:
        Iterable of byte values (e.g. a ``bytes`` object).
    :return:
        The decoded string, with one character per input byte.
    :raises UnicodeDecodeError:
        If one of the bytes is marked as undefined in the translation table.
    """

    def _build():
        for b in byte_array:
            c = _pdfDocEncoding[b]
            # '\u0000' is the marker for "no character assigned"
            if c == '\u0000':
                raise UnicodeDecodeError(
                    "pdfdocencoding",
                    bytes((b,)),
                    -1,
                    -1,
                    "does not exist in translation table",
                )
            yield c

    return ''.join(_build())


# Translation table indexed by byte value; '\u0000' marks bytes to which
# the table assigns no character.
_pdfDocEncoding = (
    # 0x00 - 0x17: undefined
    ('\u0000',) * 24
    # 0x18 - 0x1F
    + tuple('\u02d8\u02c7\u02c6\u02d9\u02dd\u02db\u02da\u02dc')
    # 0x20 - 0x7E: same code points as ASCII
    + tuple(map(chr, range(0x20, 0x7F)))
    # 0x7F: undefined
    + ('\u0000',)
    # 0x80 - 0x9E
    + tuple(
        '\u2022\u2020\u2021\u2026\u2014\u2013\u0192\u2044'
        '\u2039\u203a\u2212\u2030\u201e\u201c\u201d\u2018'
        '\u2019\u201a\u2122\ufb01\ufb02\u0141\u0152\u0160'
        '\u0178\u017d\u0131\u0142\u0153\u0161\u017e'
    )
    # 0x9F: undefined, 0xA0: euro sign
    + ('\u0000', '\u20ac')
    # 0xA1 - 0xAC: same code points as Latin-1
    + tuple(map(chr, range(0xA1, 0xAD)))
    # 0xAD: undefined
    + ('\u0000',)
    # 0xAE - 0xFF: same code points as Latin-1
    + tuple(map(chr, range(0xAE, 0x100)))
)

assert len(_pdfDocEncoding) == 256

# Reverse lookup (character -> byte value). The "undefined" marker must not
# be part of it: otherwise a NUL character would silently be encoded as one
# of the undefined byte values, which cannot be decoded again.
_pdfDocEncoding_rev = {
    char: ix for ix, char in enumerate(_pdfDocEncoding) if char != '\u0000'
}
|
|
|
|
# Alias for the NameObject class.
pdf_name = NameObject
# Object types that proxy_encrypted_obj() wraps in a DecryptedObjectProxy;
# objects of any other type are returned by it unchanged.
PROXYABLE = (TextStringObject, ByteStringObject, DictionaryObject, ArrayObject)
|
|
|
|
|
|
def proxy_encrypted_obj(encrypted_obj, handler):
    """
    Wrap an object in a :class:`DecryptedObjectProxy` if its type calls
    for it.

    :param encrypted_obj:
        The object to wrap.
    :param handler:
        The security handler to hand to the proxy.
    :return:
        A :class:`DecryptedObjectProxy` if ``encrypted_obj`` is an instance
        of one of the ``PROXYABLE`` types, otherwise ``encrypted_obj``
        itself.
    """
    if not isinstance(encrypted_obj, PROXYABLE):
        return encrypted_obj
    return DecryptedObjectProxy(encrypted_obj, handler)
|
|
|
|
|
|
class DecryptedObjectProxy(PdfObject):
    """
    Internal proxy class that wraps an encrypted object and decrypts it
    transparently the first time the decrypted value is requested.

    .. warning::
        Most public-facing APIs won't leave you to deal with these *directly*
        (that's half the reason this class exists in the first place), and
        the API of this class is considered internal.

        However, for reasons related to the historical PyPDF2 codebase from
        which pyHanko's object handling code ultimately derives, there are
        some Python builtins that might cause these wrapper objects to
        inadvertently "leak". Please `tell us about such cases
        <https://github.com/MatthiasValvekens/pyHanko/discussions>`_ so we can
        make those types of access more convenient and robust.

    .. danger::
        The ``__eq__`` implementation on this class is not safe for general use,
        due to the fact that certain structures in PDF are exempt from
        encryption. Only compare proxy objects with ``==`` in areas of the
        document where these exemptions don't apply.

    :param raw_object:
        A raw object, typically as-parsed from a PDF file.
    :param handler:
        The security handler governing this object.
    """

    raw_object: PdfObject
    """
    The underlying raw object, in its encrypted state.
    """

    def __init__(self, raw_object: PdfObject, handler):
        self.raw_object = raw_object
        self.handler = handler
        # populated on first access of the ``decrypted`` property
        self._decrypted: Optional[PdfObject] = None

    @property
    def decrypted(self) -> PdfObject:
        """
        The decrypted PDF object exposed as a property.

        Strings are decrypted with the string filter of the security handler.
        The entries of dictionaries and arrays are not decrypted right away:
        they are wrapped in :class:`.DecryptedObjectProxy` in turn (where
        applicable), so their decryption is deferred until they are
        requested. The result is cached.

        :raises ValueError:
            If the container reference of the raw object is not a
            :class:`Reference`.
        :raises TypeError:
            If the raw object is not of a proxyable type.
        """

        cached = self._decrypted
        if cached is not None:
            return cached

        from .crypt import SecurityHandler

        result: PdfObject

        raw = self.raw_object
        sec_handler: SecurityHandler = self.handler
        ref = raw.container_ref
        if not isinstance(ref, Reference):
            raise ValueError(
                "Proxyable objects must have a container ref pointing to a "
                f"numbered object, not '{ref}'."
            )  # pragma: nocover

        if isinstance(raw, (ByteStringObject, TextStringObject)):
            string_filter = sec_handler.get_string_filter()
            object_key = string_filter.derive_object_key(
                ref.idnum, ref.generation
            )
            plaintext = string_filter.decrypt(object_key, raw.original_bytes)
            result = pdf_string(plaintext)
        elif isinstance(raw, DictionaryObject):
            proxied_entries = {
                entry_key: proxy_encrypted_obj(entry_value, sec_handler)
                for entry_key, entry_value in raw.items()
            }
            if isinstance(raw, StreamObject):
                result = raw._implicit_decrypt_stream_content(
                    sec_handler, ref, proxied_entries
                )
            else:
                result = DictionaryObject(proxied_entries)
        elif isinstance(raw, ArrayObject):
            result = ArrayObject(
                proxy_encrypted_obj(item, sec_handler) for item in raw
            )
        else:  # pragma: nocover
            raise TypeError(f'Object of type {type(raw)} is not proxyable.')

        result.container_ref = raw.container_ref
        self._decrypted = result
        return result

    def write_to_stream(
        self,
        stream,
        handler: Optional['SecurityHandler'] = None,
        container_ref=None,
    ):
        """
        Serialise the *decrypted* object, forwarding all arguments.
        """
        # The key under which this object was originally encrypted may no
        # longer apply (e.g. because the object ended up inside another
        # object without proper dereferencing), so the decrypted value is
        # written out from scratch rather than reusing the raw bytes.
        self.decrypted.write_to_stream(stream, handler, container_ref)

    def get_object(self):
        """Resolve to the object behind the decrypted value."""
        return self.decrypted.get_object()

    @property
    def container_ref(self):
        """The container reference of the underlying raw object."""
        return self.raw_object.container_ref

    def __eq__(self, other):
        # NOTE: this breaks down if the object contains descendants that
        # cannot be decrypted! The diff_analysis module knows about this
        # limitation, but this method is not suited for comparing arbitrary
        # objects in a PDF file.
        if not isinstance(other, DecryptedObjectProxy):
            return False
        return other.decrypted == self.decrypted
|
|
|
|
|
|
ASN_DT_FORMAT = "D:%Y%m%d%H%M%S"


def pdf_date(dt: datetime) -> TextStringObject:
    """
    Format a datetime object as a PDF date string.

    Naive datetime objects produce a string without UTC offset information.
    For timezone-aware ones, a zero offset is written as ``Z``, and any other
    offset as a sign followed by ``HH'mm'``.

    :param dt:
        The datetime object to convert.
    :return:
        A :class:`TextStringObject` representing the datetime passed in.
    """

    timestamp = dt.strftime(ASN_DT_FORMAT)
    offset = dt.utcoffset()
    if offset is None:
        suffix = ''
    else:
        offset_seconds = offset.total_seconds()
        if not offset_seconds:
            suffix = 'Z'
        else:
            sign = '-' if offset_seconds < 0 else '+'
            hours, remainder = divmod(abs(offset_seconds), 3600)
            minutes = remainder // 60
            # XXX the trailing apostrophe after the minutes is NOT what the
            # spec prescribes, but Adobe Reader DC won't validate signatures
            # with a date string that lacks it. No idea why.
            suffix = sign + ("%02d'%02d'" % (hours, minutes))

    return TextStringObject(timestamp + suffix)
|
|
|
|
|
|
# The year field is the only mandatory one
MIN_DATE_REGEX = re.compile(r'^D:(\d{4})')
MIN_DATE_REGEX_LENIENT = re.compile(r'^(?:D:)?(\d{4})')
TWO_DIGIT_START = re.compile(r'^(\d\d)')
UTC_OFFSET = re.compile(r"(\d\d)(?:'(\d\d))?'?")


def parse_pdf_date(date_str: str, strict: bool = True) -> datetime:
    """
    Parse a PDF date string into a datetime object.

    The string consists of a four-digit year, followed by up to five
    two-digit groups (month, day, hour, minute, second) and an optional
    timezone specification: either ``Z``, or a sign followed by ``HH``,
    optionally followed by ``'mm`` and/or a trailing apostrophe.
    Month and day default to 1 when absent; hour, minute and second default
    to 0.

    :param date_str:
        The date string to parse.
    :param strict:
        If ``True`` (the default), the ``D:`` prefix is mandatory.
        Otherwise, it may be omitted.
    :return:
        A datetime object, which is timezone-aware if and only if the string
        contains a timezone specification.
    :raises PdfReadError:
        If the string is not a well-formed date string, or if any of its
        components (including the UTC offset) is out of range.
    """
    m = (MIN_DATE_REGEX if strict else MIN_DATE_REGEX_LENIENT).match(date_str)
    if not m:
        raise PdfReadError(f"{date_str} does not appear to be a date string.")
    year = int(m.group(1))

    # now, there are a number of 2-digit groups (anywhere from 0 to 5)
    date_remaining = date_str[m.end(0) :]
    lower_order = [1, 1, 0, 0, 0]

    for ix in range(5):
        m = TWO_DIGIT_START.match(date_remaining)
        if not m:
            break
        lower_order[ix] = int(m.group(1))
        date_remaining = date_remaining[2:]

    month, day, hour, minute, second = lower_order

    # finally, parse the timezone
    tz_offset: Optional[timedelta] = None
    if date_remaining:
        sgn = date_remaining[0]
        if sgn == 'Z' and len(date_remaining) == 1:
            tz_offset = timedelta(0)
        elif sgn in ('+', '-'):
            tz_spec = date_remaining[1:]
            tz_match = UTC_OFFSET.fullmatch(tz_spec)
            if not tz_match:
                raise PdfReadError(
                    f"Improper timezone specification in {date_str}: {tz_spec}"
                )
            tz_hours = int(tz_match.group(1))
            tz_minutes = int(tz_match.group(2) or 0)
            tz_offset = timedelta(hours=tz_hours, minutes=tz_minutes)
            if sgn == '-':
                tz_offset = -tz_offset
        else:
            raise PdfReadError(f"Improper trailing characters in {date_str}.")

    try:
        # Range checks are delegated to the constructors below. Note that
        # timezone() rejects offsets of 24 hours or more with a ValueError
        # as well, so it has to be called inside this block for such input
        # to be reported as a read error.
        tz_info = None if tz_offset is None else timezone(tz_offset)
        return datetime(
            year=year,
            month=month,
            day=day,
            hour=hour,
            minute=minute,
            second=second,
            microsecond=0,
            tzinfo=tz_info,
        )
    except ValueError as e:
        raise PdfReadError("Improper date value", e) from e
|