526 lines
17 KiB
Python
526 lines
17 KiB
Python
"""
|
|
Utility classes for handling embedded files in PDFs.
|
|
|
|
.. versionadded:: 0.7.0
|
|
"""
|
|
|
|
import hashlib
|
|
from dataclasses import dataclass
|
|
from datetime import datetime
|
|
from typing import List, Optional
|
|
|
|
from asn1crypto import x509
|
|
|
|
from . import crypt, generic, misc, writer
|
|
from .crypt.pubkey import RecipientEncryptionPolicy
|
|
from .font.basic import get_courier
|
|
from .generic import pdf_name, pdf_string
|
|
|
|
# Names exported by ``from <this module> import *``.
__all__ = [
    "embed_file",
    "EmbeddedFileObject",
    "EmbeddedFileParams",
    "FileSpec",
    "RelatedFileSpec",
    "wrap_encrypted_payload",
]
|
|
|
|
|
|
@dataclass(frozen=True)
class EmbeddedFileParams:
    """
    Optional metadata to record in the ``/Params`` dictionary of an
    embedded file stream.
    """

    embed_size: bool = True
    """
    Whether the size of the embedded file should be recorded.

    .. note::
        The size is measured on the file content as it is before PDF
        filters are applied. If the stream contents were supplied in
        pre-encoded form, obtaining that content may carry a performance
        cost.
    """

    embed_checksum: bool = True
    """
    Whether an MD5 checksum of the file contents should be recorded.

    .. note::
        The checksum is taken over the file content as it is before PDF
        filters are applied. If the stream contents were supplied in
        pre-encoded form, obtaining that content may carry a performance
        cost.
    """

    creation_date: Optional[datetime] = None
    """
    Creation date to record for the embedded file, if any.
    """

    modification_date: Optional[datetime] = None
    """
    Modification date to record for the embedded file, if any.
    """
|
|
|
|
|
|
class EmbeddedFileObject(generic.StreamObject):
    """
    Stream object holding the contents of an embedded file.

    On construction, the stream is added to the supplied PDF writer; the
    reference returned by the writer is kept in ``ef_stream_ref``.
    The optional :class:`EmbeddedFileParams` are only turned into a
    ``/Params`` dictionary when the stream is serialised.
    """

    @classmethod
    def from_file_data(
        cls,
        pdf_writer: writer.BasePdfFileWriter,
        data: bytes,
        compress=True,
        params: Optional[EmbeddedFileParams] = None,
        mime_type: Optional[str] = None,
    ) -> 'EmbeddedFileObject':
        """
        Construct an embedded file object from file data.

        This is a very thin wrapper around the constructor, with a slightly
        less intimidating API.

        .. note::
            This method will not register the embedded file into the
            document's embedded file namespace, see :func:`.embed_file`.

        :param pdf_writer:
            PDF writer to use.
        :param data:
            File contents, as a :class:`bytes` object.
        :param compress:
            Whether to compress the embedded file's contents.
        :param params:
            Optional embedded file parameters.
        :param mime_type:
            Optional MIME type string.
        :return:
            An embedded file object.
        """

        # Go through ``cls`` rather than naming the class explicitly, so
        # that calling this constructor on a subclass yields an instance
        # of that subclass.
        result = cls(
            pdf_writer=pdf_writer,
            stream_data=data,
            params=params,
            mime_type=mime_type,
        )
        if compress:
            result.compress()

        return result

    def __init__(
        self,
        pdf_writer: writer.BasePdfFileWriter,
        dict_data=None,
        stream_data=None,
        encoded_data=None,
        params: Optional[EmbeddedFileParams] = None,
        mime_type: Optional[str] = None,
    ):
        """
        Initialise the stream and add it to the PDF writer.

        :param pdf_writer:
            PDF writer to add the stream to. Its security handler is passed
            on to the underlying stream object.
        :param dict_data:
            Passed through to the parent stream constructor.
        :param stream_data:
            Passed through to the parent stream constructor.
        :param encoded_data:
            Passed through to the parent stream constructor.
        :param params:
            Optional embedded file parameters, applied at serialisation
            time.
        :param mime_type:
            Optional MIME type string; if given, it is recorded
            (prefixed with ``/``) as the stream's ``/Subtype``.
        """
        super().__init__(
            dict_data=dict_data,
            stream_data=stream_data,
            encoded_data=encoded_data,
            handler=pdf_writer.security_handler,
        )
        self['/Type'] = generic.pdf_name('/EmbeddedFile')
        if mime_type is not None:
            self['/Subtype'] = generic.pdf_name('/' + mime_type)
        self.ef_stream_ref = pdf_writer.add_object(self)
        self.params = params

    def write_to_stream(self, stream, handler=None, container_ref=None):
        """
        Serialise the embedded file stream.

        Before delegating to the parent implementation, this method

        * adds the handler's embedded file crypt filter if a handler is
          given, the stream has no crypt filter yet, and the embedded file
          filter differs from the regular stream filter;
        * (re)builds the ``/Params`` dictionary from ``params``, if set.

        :param stream:
            Output stream, passed on to the parent implementation.
        :param handler:
            Security handler, passed on to the parent implementation.
        :param container_ref:
            Container reference, passed on to the parent implementation.
        """
        # deal with "encrypt embedded files only" mode
        # (we do this here to make sure the user doesn't add any other crypt
        # filters after this one)
        if handler is not None and not self._has_crypt_filter:
            cfc = handler.crypt_filter_config
            ef_filter_name = cfc.embedded_file_filter_name
            stream_filter_name = cfc.stream_filter_name
            if (
                ef_filter_name is not None
                and ef_filter_name != stream_filter_name
            ):
                self.add_crypt_filter(ef_filter_name)

        # apply the parameters before serialisation
        params = self.params
        if params is not None:
            self['/Params'] = param_dict = generic.DictionaryObject()
            if params.embed_size:
                param_dict['/Size'] = generic.NumberObject(len(self.data))
            if params.embed_checksum:
                checksum = hashlib.md5(self.data).digest()
                param_dict['/CheckSum'] = generic.ByteStringObject(checksum)
            if params.creation_date is not None:
                param_dict['/CreationDate'] = generic.pdf_date(
                    params.creation_date
                )
            if params.modification_date is not None:
                param_dict['/ModDate'] = generic.pdf_date(
                    params.modification_date
                )

        super().write_to_stream(
            stream, handler=handler, container_ref=container_ref
        )
|
|
|
|
|
|
@dataclass(frozen=True)
class RelatedFileSpec:
    """
    Dataclass describing one entry of a PDF RelatedFile construct.
    """

    name: str
    """
    The related file's name.

    .. note::
        How this value must be encoded depends on whether the related
        file is listed under the ``/F`` key or under the ``/UF`` key.
    """

    embedded_data: EmbeddedFileObject
    """
    The embedded file stream object that carries the related file's data
    within the PDF file.
    """

    @classmethod
    def fmt_related_files(cls, lst: List['RelatedFileSpec']):
        """
        Turn a list of related file specs into a flat PDF array.

        :param lst:
            The related file specs to format.
        :return:
            An array that alternates between a file name (as a PDF string)
            and the reference to the corresponding embedded file stream.
        """
        flattened = []
        for related in lst:
            flattened.append(generic.pdf_string(related.name))
            flattened.append(related.embedded_data.ef_stream_ref)

        return generic.ArrayObject(flattened)
|
|
|
|
|
|
@dataclass(frozen=True)
class FileSpec:
    """
    Dataclass describing an embedded file in a PDF (a file specification).
    """

    # TODO collection item dictionaries

    # TODO thumbnail support

    # TODO enforce PDFDocEncoding for file_spec_string etc.

    file_spec_string: str
    """
    File specification string: either path-like, or a URL.

    .. note::
        To remain backwards compatible, this value should be representable
        in PDFDocEncoding. Names that need general Unicode support belong
        in :attr:`file_name`.
    """

    file_name: Optional[str] = None
    """
    Path-like file name, as a Unicode string.
    """

    embedded_data: Optional[EmbeddedFileObject] = None
    """
    The embedded file stream object that carries the file's data within
    the PDF file.
    """

    description: Optional[str] = None
    """
    Free-text description of the file.
    """

    af_relationship: Optional[generic.NameObject] = None
    """
    Specifier for the associated file relationship.
    """

    f_related_files: Optional[List[RelatedFileSpec]] = None
    """
    Related files whose names are PDFDocEncoded.
    """

    uf_related_files: Optional[List[RelatedFileSpec]] = None
    """
    Related files whose names are Unicode-encoded.
    """

    def as_pdf_object(self) -> generic.DictionaryObject:
        """
        Build the PDF dictionary that represents this file spec.

        ``/Type`` and ``/F`` are always present; ``/UF``, ``/EF``, ``/Desc``,
        ``/AFRelationship`` and ``/RF`` are added only when the
        corresponding attributes are set. Unicode related files are only
        written when :attr:`file_name` is set as well.

        :return:
            The file spec dictionary.
        """
        unicode_name = self.file_name
        has_unicode_name = unicode_name is not None

        spec_dict = generic.DictionaryObject(
            {
                pdf_name('/Type'): pdf_name('/Filespec'),
                pdf_name('/F'): pdf_string(self.file_spec_string),
            }
        )
        if has_unicode_name:
            spec_dict['/UF'] = pdf_string(unicode_name)

        ef_stream = self.embedded_data
        if ef_stream is not None:
            ef_dict = generic.DictionaryObject(
                {
                    pdf_name('/F'): ef_stream.ef_stream_ref,
                }
            )
            spec_dict['/EF'] = ef_dict
            if has_unicode_name:
                # the Unicode entry points at the very same stream
                ef_dict['/UF'] = ef_stream.ef_stream_ref

        if self.description is not None:
            spec_dict['/Desc'] = generic.TextStringObject(self.description)

        if self.af_relationship is not None:
            spec_dict['/AFRelationship'] = self.af_relationship

        related_f = self.f_related_files
        related_uf = self.uf_related_files
        if related_f or related_uf:
            rf_dict = generic.DictionaryObject()
            spec_dict['/RF'] = rf_dict
            if related_f:
                rf_dict['/F'] = RelatedFileSpec.fmt_related_files(related_f)
            if related_uf and has_unicode_name:
                rf_dict['/UF'] = RelatedFileSpec.fmt_related_files(related_uf)

        return spec_dict
|
|
|
|
|
|
def embed_file(pdf_writer: writer.BasePdfFileWriter, spec: FileSpec):
    """
    Embed a file in the document-wide embedded file registry of a PDF writer.

    The file spec is added to the ``/EmbeddedFiles`` name tree under the
    catalog's ``/Names`` dictionary, creating the intermediate objects where
    they do not exist yet. If the spec has an associated file relationship,
    it is also appended to the catalog's ``/AF`` array and the output
    version is raised to at least PDF 2.0; otherwise at least PDF 1.7 is
    requested.

    :param pdf_writer:
        PDF writer to house the embedded file.
    :param spec:
        File spec describing the embedded file.
    :return:
        Nothing; the writer is modified in place.
    :raises misc.PdfWriteError:
        If the file spec has no embedded file stream.
    :raises NotImplementedError:
        If the existing ``/EmbeddedFiles`` name tree has a ``/Kids`` entry
        (i.e. is not flat).
    """

    ef_stream = spec.embedded_data

    if ef_stream is None:
        raise misc.PdfWriteError(
            "File spec does not have an embedded file stream"
        )

    spec_obj = spec.as_pdf_object()

    root = pdf_writer.root
    try:
        names_dict = root['/Names']
    except KeyError:
        names_dict = generic.DictionaryObject()
        root['/Names'] = pdf_writer.add_object(names_dict)
        pdf_writer.update_root()

    try:
        ef_name_tree = names_dict['/EmbeddedFiles']
    except KeyError:
        ef_name_tree = generic.DictionaryObject()
        names_dict['/EmbeddedFiles'] = pdf_writer.add_object(ef_name_tree)
        pdf_writer.update_container(names_dict)

    # TODO support updating general name trees!
    #  (should probably be refactored into an utility method somewhere)
    if '/Kids' in ef_name_tree:
        raise NotImplementedError(
            "Only flat name trees are supported right now"
        )

    try:
        ef_name_arr = ef_name_tree['/Names']
    except KeyError:
        ef_name_arr = generic.ArrayObject()
        ef_name_tree['/Names'] = pdf_writer.add_object(ef_name_arr)
        pdf_writer.update_container(ef_name_tree)

    # NOTE(review): the new (name, filespec) pair is appended at the end of
    #  the array; no attempt is made here to keep the keys sorted.
    ef_name_arr.append(generic.pdf_string(spec.file_spec_string))
    spec_obj_ref = pdf_writer.add_object(spec_obj)
    ef_name_arr.append(spec_obj_ref)
    pdf_writer.update_container(ef_name_arr)

    if spec.af_relationship is not None:
        pdf_writer.ensure_output_version(version=(2, 0))
        # add the filespec to the /AF entry in the document catalog
        # TODO allow associations with objects other than the catalog?
        try:
            root_af_arr = root['/AF']
        except KeyError:
            root_af_arr = generic.ArrayObject()
            root['/AF'] = pdf_writer.add_object(root_af_arr)
            pdf_writer.update_root()
        root_af_arr.append(spec_obj_ref)
        # Flag the array's container as modified, mirroring what is done
        # for the embedded files name array above. Without this, an /AF
        # array that already existed would be changed in memory without
        # the writer being told about it.
        pdf_writer.update_container(root_af_arr)
    else:
        pdf_writer.ensure_output_version(version=(1, 7))
|
|
|
|
|
|
def wrap_encrypted_payload(
    plaintext_payload: bytes,
    *,
    password: Optional[str] = None,
    certs: Optional[List[x509.Certificate]] = None,
    security_handler: Optional[crypt.SecurityHandler] = None,
    file_spec_string: str = 'attachment.pdf',
    params: Optional[EmbeddedFileParams] = None,
    file_name: Optional[str] = None,
    description='Wrapped document',
    include_explanation_page=True,
) -> writer.PdfFileWriter:
    """
    Include a PDF document as an encrypted attachment in a wrapper document.

    This function sets certain flags in the wrapper document's collection
    dictionary to instruct compliant PDF viewers to display the attachment
    instead of the wrapping document. Viewers that do not fully support
    PDF collections will display a landing page instead, explaining
    how to open the attachment manually.

    Using this method mitigates some weaknesses in the PDF standard's
    encryption provisions, and makes it harder to manipulate the encrypted
    attachment without knowing the encryption key.

    .. danger::
        Until PDF supports authenticated encryption mechanisms, this is
        a mitigation strategy, not a foolproof defence mechanism.

    .. warning::
        While users of viewers that do not support PDF collections can still
        open the attached file manually, the viewer still has to support
        PDF files where only the attachments are encrypted.

    .. note::
        This is not quite the same as the "unencrypted wrapper document"
        pattern discussed in the PDF 2.0 specification. The latter is
        intended to support nonstandard security handlers. This function
        uses a standard security handler on the wrapping document to encrypt
        the attachment as a binary blob.
        Moreover, the functionality in this function is available in PDF 1.7
        viewers as well.

    :param plaintext_payload:
        The plaintext payload (a binary representation of a PDF document).
    :param security_handler:
        The security handler to use on the wrapper document.
        If ``None``, a security handler will be constructed based on the
        ``password`` or ``certs`` parameter.
    :param password:
        Password to encrypt the attachment with.
        Will be ignored if ``security_handler`` is provided.
    :param certs:
        Encrypt the file using PDF public-key encryption, targeting the
        keys in the provided certificates.
        Will be ignored if ``security_handler`` is provided.
    :param file_spec_string:
        PDFDocEncoded file spec string for the attachment.
    :param params:
        Embedded file parameters to use. If not provided, a default
        :class:`EmbeddedFileParams` instance is used.
    :param file_name:
        Unicode file name for the attachment.
    :param description:
        Description for the attachment
    :param include_explanation_page:
        If ``False``, do not generate an explanation page in the wrapper
        document. This setting could be useful if you want to customise the
        wrapper document's behaviour yourself.
    :return:
        A :class:`~writer.PdfFileWriter` representing the wrapper document.
    :raises ValueError:
        If ``security_handler`` is not provided and it is not the case that
        exactly one of ``password`` and ``certs`` is.
    """
    w = writer.PdfFileWriter()

    if security_handler is None:
        if (password is None) == (certs is None):
            raise ValueError(
                "If 'security_handler' is not provided, "
                "exactly one of 'password' or 'certs' must be."
            )
        if password is None:
            # guaranteed by the check above; kept for the type checker
            assert certs is not None
            # set up pubkey security handler
            pubkey_cf = crypt.PubKeyAESCryptFilter(
                keylen=32, acts_as_default=False, encrypt_metadata=False
            )
            pubkey_cf.set_embedded_only()
            security_handler = crypt.PubKeySecurityHandler(
                version=crypt.SecurityHandlerVersion.AES256,
                pubkey_handler_subfilter=crypt.PubKeyAdbeSubFilter.S5,
                legacy_keylen=None,
                encrypt_metadata=False,
                crypt_filter_config=crypt.CryptFilterConfiguration(
                    {crypt.DEF_EMBEDDED_FILE: pubkey_cf},
                    default_file_filter=crypt.DEF_EMBEDDED_FILE,
                ),
            )
            pubkey_cf.add_recipients(certs, policy=RecipientEncryptionPolicy())
        else:
            # set up standard security handler
            std_cf = crypt.StandardAESCryptFilter(keylen=32)
            std_cf.set_embedded_only()
            security_handler = crypt.StandardSecurityHandler.build_from_pw(
                password,
                crypt_filter_config=crypt.CryptFilterConfiguration(
                    {crypt.STD_CF: std_cf}, default_file_filter=crypt.STD_CF
                ),
                encrypt_metadata=False,
            )
    w._assign_security_handler(security_handler)

    # The collection dictionary's /D entry names the attachment by the
    # same file spec string under which it is embedded below.
    w.root['/Collection'] = collection_dict = generic.DictionaryObject()
    collection_dict['/Type'] = pdf_name('/Collection')
    collection_dict['/D'] = generic.TextStringObject(file_spec_string)
    collection_dict['/View'] = pdf_name('/H')  # hide "Collection" view

    ef_obj = EmbeddedFileObject.from_file_data(
        w,
        data=plaintext_payload,
        mime_type='application/pdf',
        params=params or EmbeddedFileParams(),
    )

    spec = FileSpec(
        file_spec_string=file_spec_string,
        file_name=file_name,
        embedded_data=ef_obj,
        description=description,
    )
    embed_file(w, spec)

    if include_explanation_page:
        resources = generic.DictionaryObject(
            {
                pdf_name('/Font'): generic.DictionaryObject(
                    {pdf_name('/F1'): get_courier(w)}
                )
            }
        )

        # TODO make it easy to customise this
        #  (i.e. don't require the user to put together a page object by
        #  themselves)
        stream_content = '''
        BT
            /F1 10 Tf 10 830 Td 12 TL
            (This document is a wrapper for an encrypted attachment.) '
            (Your viewer should prompt for a password and ) '
            (open the attached file automatically.) Tj
            (If not, navigate to the attached document manually.) ' T*
            (In addition, your viewer must support encryption ) '
            (scoped to embedded files.) Tj
        ET
        '''

        stream = generic.StreamObject(
            stream_data=stream_content.encode('latin1')
        )
        explanation_page = writer.PageObject(
            contents=w.add_object(stream),
            # A4 media box
            media_box=(0, 0, 595.28, 841.89),
            resources=resources,
        )

        w.insert_page(explanation_page)

    return w
|