156 lines
4.1 KiB
Python
156 lines
4.1 KiB
Python
import logging
|
|
from datetime import datetime
|
|
from typing import Any, Dict, Optional, Union
|
|
|
|
import tzlocal
|
|
|
|
from pyhanko.pdf_utils import generic, misc
|
|
|
|
from . import model
|
|
|
|
__all__ = ['update_info_dict', 'view_from_info_dict']
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def _write_meta_string(
|
|
dictionary: generic.DictionaryObject,
|
|
key: str,
|
|
meta_str: model.MetaString,
|
|
existing_only: bool,
|
|
) -> bool:
|
|
if isinstance(meta_str, misc.StringWithLanguage):
|
|
# don't bother with embedding language codes in strings,
|
|
# too few people implement escapes anyway, so meh
|
|
string = meta_str.value
|
|
elif isinstance(meta_str, str):
|
|
string = meta_str
|
|
else:
|
|
return False
|
|
|
|
pdf_str = generic.TextStringObject(string)
|
|
try:
|
|
old_value = dictionary[key]
|
|
mod = old_value != pdf_str
|
|
except KeyError:
|
|
mod = not existing_only
|
|
if mod:
|
|
dictionary[key] = pdf_str
|
|
return mod
|
|
|
|
|
|
def _write_meta_date(
|
|
dictionary: generic.DictionaryObject,
|
|
key: str,
|
|
meta_date: Union[datetime, str, None],
|
|
existing_only: bool,
|
|
) -> bool:
|
|
if isinstance(meta_date, datetime):
|
|
value = meta_date
|
|
elif meta_date == 'now':
|
|
value = datetime.now(tz=tzlocal.get_localzone())
|
|
else:
|
|
return False
|
|
|
|
if not existing_only or key in dictionary:
|
|
dictionary[key] = generic.pdf_date(value)
|
|
return True
|
|
else:
|
|
return False
|
|
|
|
|
|
def update_info_dict(
|
|
meta: model.DocumentMetadata,
|
|
info: generic.DictionaryObject,
|
|
only_update_existing: bool = False,
|
|
) -> bool:
|
|
mod = _write_meta_date(
|
|
info, "/ModDate", meta.last_modified, existing_only=only_update_existing
|
|
)
|
|
producer = model.VENDOR
|
|
try:
|
|
producer_string = info['/Producer']
|
|
if producer not in producer_string:
|
|
producer_string = generic.TextStringObject(
|
|
f"{producer_string}; {producer}"
|
|
)
|
|
mod = True
|
|
except (KeyError, TypeError):
|
|
producer_string = generic.TextStringObject(producer)
|
|
mod = True
|
|
# always override this
|
|
info['/Producer'] = producer_string
|
|
|
|
if meta.xmp_unmanaged:
|
|
return mod
|
|
|
|
mod |= _write_meta_string(
|
|
info, "/Title", meta.title, existing_only=only_update_existing
|
|
)
|
|
mod |= _write_meta_string(
|
|
info, "/Author", meta.author, existing_only=only_update_existing
|
|
)
|
|
mod |= _write_meta_string(
|
|
info, "/Subject", meta.subject, existing_only=only_update_existing
|
|
)
|
|
mod |= _write_meta_string(
|
|
info, "/Creator", meta.creator, existing_only=only_update_existing
|
|
)
|
|
mod |= _write_meta_date(
|
|
info, "/CreationDate", meta.created, existing_only=only_update_existing
|
|
)
|
|
|
|
if meta.keywords:
|
|
info['/Keywords'] = generic.TextStringObject(','.join(meta.keywords))
|
|
mod = True
|
|
|
|
return mod
|
|
|
|
|
|
def _read_date_from_dict(
|
|
info_dict: generic.DictionaryObject, key: str, strict: bool
|
|
) -> Optional[datetime]:
|
|
try:
|
|
date_str = info_dict[key]
|
|
except KeyError:
|
|
return None
|
|
|
|
try:
|
|
if isinstance(date_str, generic.TextStringObject):
|
|
return generic.parse_pdf_date(date_str, strict=strict)
|
|
except misc.PdfReadError:
|
|
pass
|
|
|
|
logger.warning(
|
|
"Key %s in info dict has value %s, which is not a valid date string",
|
|
key,
|
|
repr(date_str),
|
|
)
|
|
return None
|
|
|
|
|
|
def view_from_info_dict(
|
|
info_dict: generic.DictionaryObject, strict: bool = True
|
|
) -> model.DocumentMetadata:
|
|
kwargs: Dict[str, Any] = {}
|
|
for s_entry in ('title', 'author', 'subject', 'creator'):
|
|
try:
|
|
kwargs[s_entry] = str(info_dict[f"/{s_entry.title()}"])
|
|
except KeyError:
|
|
pass
|
|
|
|
creation_date = _read_date_from_dict(
|
|
info_dict, '/CreationDate', strict=strict
|
|
)
|
|
if creation_date is not None:
|
|
kwargs['created'] = creation_date
|
|
|
|
mod_date = _read_date_from_dict(info_dict, '/ModDate', strict=strict)
|
|
if mod_date is not None:
|
|
kwargs['last_modified'] = mod_date
|
|
|
|
if '/Keywords' in info_dict:
|
|
kwargs['keywords'] = str(info_dict['/Keywords']).split(',')
|
|
|
|
return model.DocumentMetadata(**kwargs)
|