1393 lines
49 KiB
Python
1393 lines
49 KiB
Python
"""
|
|
Internal utilities to handle the processing of cross-reference data and
|
|
document trailer data.
|
|
|
|
This entire module is considered internal API.
|
|
"""
|
|
|
|
|
|
import enum
|
|
import logging
|
|
import os
|
|
import struct
|
|
from dataclasses import dataclass
|
|
from io import BytesIO
|
|
from itertools import chain
|
|
from typing import Dict, Iterator, List, Optional, Set, Tuple, Union
|
|
|
|
from pyhanko.pdf_utils import generic, misc
|
|
from pyhanko.pdf_utils.generic import EncryptedObjAccess, pdf_name
|
|
from pyhanko.pdf_utils.misc import PdfReadError, PdfStrictReadError, peek
|
|
from pyhanko.pdf_utils.rw_common import PdfHandler
|
|
|
|
__all__ = [
|
|
'XRefCache',
|
|
'XRefBuilder',
|
|
'XRefType',
|
|
'XRefEntry',
|
|
'ObjStreamRef',
|
|
'ObjectHeaderReadError',
|
|
'XRefSection',
|
|
'XRefSectionData',
|
|
'XRefSectionType',
|
|
'XRefSectionMetaInfo',
|
|
'TrailerDictionary',
|
|
'read_object_header',
|
|
'parse_xref_stream',
|
|
'parse_xref_table',
|
|
'write_xref_table',
|
|
'ObjectStream',
|
|
'XRefStream',
|
|
'OBJSTREAM_FORBIDDEN',
|
|
'PositionDict',
|
|
]
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@enum.unique
|
|
class XRefType(enum.Enum):
|
|
"""
|
|
Different types of cross-reference entries.
|
|
"""
|
|
|
|
FREE = enum.auto()
|
|
"""
|
|
A freeing instruction.
|
|
"""
|
|
|
|
STANDARD = enum.auto()
|
|
"""
|
|
A regular top-level object.
|
|
"""
|
|
|
|
IN_OBJ_STREAM = enum.auto()
|
|
"""
|
|
An object that is part of an object stream.
|
|
"""
|
|
|
|
|
|
@dataclass(frozen=True)
|
|
class ObjStreamRef:
|
|
"""
|
|
Identifies an object that's part of an object stream.
|
|
"""
|
|
|
|
obj_stream_id: int
|
|
"""
|
|
The ID number of the object stream (its generation number is presumed zero).
|
|
"""
|
|
|
|
ix_in_stream: int
|
|
"""
|
|
The index of the object in the stream.
|
|
"""
|
|
|
|
|
|
@dataclass(frozen=True)
|
|
class XRefEntry:
|
|
"""
|
|
Value type representing a single cross-reference entry.
|
|
"""
|
|
|
|
xref_type: XRefType
|
|
"""
|
|
The type of cross-reference entry.
|
|
"""
|
|
|
|
location: Optional[Union[int, ObjStreamRef]]
|
|
"""
|
|
Location the cross-reference points to.
|
|
"""
|
|
|
|
idnum: int
|
|
"""
|
|
The ID of the object being referenced.
|
|
"""
|
|
|
|
generation: int = 0
|
|
"""
|
|
The generation number of the object being referenced.
|
|
"""
|
|
|
|
|
|
def parse_xref_table(stream) -> Iterator[XRefEntry]:
|
|
"""
|
|
Parse a single cross-reference table and yield its entries one by one.
|
|
|
|
This is internal API.
|
|
|
|
:param stream:
|
|
A file-like object pointed to the start of the cross-reference table.
|
|
:return:
|
|
A generator object yielding :class:`.XRefEntry` objects.
|
|
"""
|
|
|
|
misc.read_non_whitespace(stream)
|
|
stream.seek(-1, os.SEEK_CUR)
|
|
while True:
|
|
num = generic.NumberObject.read_from_stream(stream)
|
|
misc.read_non_whitespace(stream)
|
|
stream.seek(-1, os.SEEK_CUR)
|
|
size = generic.NumberObject.read_from_stream(stream)
|
|
misc.read_non_whitespace(stream)
|
|
stream.seek(-1, os.SEEK_CUR)
|
|
for cnt in range(0, size):
|
|
line = stream.read(20)
|
|
|
|
# It's very clear in section 3.4.3 of the PDF spec
|
|
# that all cross-reference table lines are a fixed
|
|
# 20 bytes (as of PDF 1.7). However, some files have
|
|
# 21-byte entries (or more) due to the use of \r\n
|
|
# (CRLF) EOL's. Detect that case, and adjust the line
|
|
# until it does not begin with a \r (CR) or \n (LF).
|
|
while line[0] in b"\x0D\x0A":
|
|
stream.seek(-20 + 1, os.SEEK_CUR)
|
|
line = stream.read(20)
|
|
|
|
# On the other hand, some malformed PDF files
|
|
# use a single character EOL without a preceding
|
|
# space. Detect that case, and seek the stream
|
|
# back one character. (0-9 means we've bled into
|
|
# the next xref entry, t means we've bled into the
|
|
# text "trailer"):
|
|
if line[-1] in b"0123456789t":
|
|
stream.seek(-1, os.SEEK_CUR)
|
|
|
|
offset, generation, marker = line[:18].split(b" ")
|
|
if marker == b'n':
|
|
yield XRefEntry(
|
|
xref_type=XRefType.STANDARD,
|
|
location=int(offset),
|
|
idnum=num,
|
|
generation=int(generation),
|
|
)
|
|
elif marker == b'f':
|
|
yield XRefEntry(
|
|
xref_type=XRefType.FREE,
|
|
location=None,
|
|
idnum=num,
|
|
generation=int(generation),
|
|
)
|
|
num += 1
|
|
misc.read_non_whitespace(stream)
|
|
stream.seek(-1, os.SEEK_CUR)
|
|
trailertag = stream.read(7)
|
|
if trailertag == b"trailer":
|
|
# we're done, finish by reading to the first non-whitespace char
|
|
misc.read_non_whitespace(stream)
|
|
stream.seek(-1, os.SEEK_CUR)
|
|
return
|
|
else:
|
|
# more xrefs!
|
|
stream.seek(-7, os.SEEK_CUR)
|
|
|
|
|
|
def parse_xref_stream(
|
|
xref_stream: generic.StreamObject, strict: bool = True
|
|
) -> Iterator[XRefEntry]:
|
|
"""
|
|
Parse a single cross-reference stream and yield its entries one by one.
|
|
|
|
This is internal API.
|
|
|
|
:param xref_stream:
|
|
A :class:`~generic.StreamObject`.
|
|
:param strict:
|
|
Boolean indicating whether we're running in strict mode.
|
|
:return:
|
|
A generator object yielding :class:`.XRefEntry` objects.
|
|
"""
|
|
|
|
stream_data = BytesIO(xref_stream.data)
|
|
# Index pairs specify the subsections in the dictionary. If
|
|
# none create one subsection that spans everything.
|
|
idx_pairs = xref_stream.get("/Index", [0, xref_stream.get("/Size")])
|
|
entry_sizes = xref_stream.get("/W")
|
|
|
|
def get_entry(ix):
|
|
# Reads the correct number of bytes for each entry. See the
|
|
# discussion of the W parameter in ISO 32000-1 table 17.
|
|
entry_width = entry_sizes[ix]
|
|
if entry_width > 0:
|
|
d = stream_data.read(entry_width)
|
|
if len(d) >= entry_width:
|
|
return convert_to_int(d, entry_width)
|
|
elif strict:
|
|
raise misc.PdfStrictReadError(
|
|
"XRef stream ended prematurely; incomplete entry: "
|
|
f"expected to read {entry_width} bytes, but only got "
|
|
f"{len(d)}."
|
|
)
|
|
|
|
# ISO 32000-1 Table 17: A value of zero for an element in the
|
|
# W array indicates...the default value shall be used
|
|
if ix == 0:
|
|
return 1 # First value defaults to 1
|
|
else:
|
|
return 0
|
|
|
|
# Iterate through each subsection
|
|
last_end = 0
|
|
for start, size in misc.pair_iter(idx_pairs):
|
|
# The subsections must increase
|
|
assert start >= last_end
|
|
last_end = start + size
|
|
for num in range(start, start + size):
|
|
# The first entry is the type
|
|
xref_type = get_entry(0)
|
|
# The rest of the elements depend on the xref_type
|
|
if xref_type == 1:
|
|
# objects that are in use but are not compressed
|
|
location = get_entry(1)
|
|
generation = get_entry(2)
|
|
yield XRefEntry(
|
|
xref_type=XRefType.STANDARD,
|
|
idnum=num,
|
|
location=location,
|
|
generation=generation,
|
|
)
|
|
elif xref_type == 2:
|
|
# compressed objects
|
|
objstr_num = get_entry(1)
|
|
objstr_idx = get_entry(2)
|
|
location = ObjStreamRef(objstr_num, objstr_idx)
|
|
yield XRefEntry(
|
|
xref_type=XRefType.IN_OBJ_STREAM,
|
|
idnum=num,
|
|
location=location,
|
|
)
|
|
elif xref_type == 0:
|
|
# freed object
|
|
# we ignore the linked list aspect anyway, so discard first
|
|
get_entry(1)
|
|
next_generation = get_entry(2)
|
|
yield XRefEntry(
|
|
xref_type=XRefType.FREE,
|
|
idnum=num,
|
|
generation=next_generation,
|
|
location=None,
|
|
)
|
|
else:
|
|
# unknown type (=> ignore).
|
|
get_entry(1)
|
|
get_entry(2)
|
|
|
|
if len(stream_data.read(1)) > 0 and strict:
|
|
raise misc.PdfStrictReadError("Trailing data in cross-reference stream")
|
|
|
|
|
|
@enum.unique
|
|
class XRefSectionType(enum.Enum):
|
|
STANDARD = enum.auto()
|
|
STREAM = enum.auto()
|
|
HYBRID_MAIN = enum.auto()
|
|
HYBRID_STREAM = enum.auto()
|
|
|
|
|
|
@dataclass(frozen=True)
|
|
class XRefSectionMetaInfo:
|
|
xref_section_type: XRefSectionType
|
|
"""
|
|
The type of cross-reference section.
|
|
"""
|
|
|
|
size: int
|
|
"""
|
|
The highest object ID in scope for this xref section.
|
|
"""
|
|
|
|
declared_startxref: int
|
|
"""
|
|
Location pointed to by the startxref pointer in that revision.
|
|
"""
|
|
|
|
start_location: int
|
|
"""
|
|
Actual start location of the xref data. This should be equal
|
|
to `declared_startxref`, but in broken files that may not be the case.
|
|
"""
|
|
|
|
end_location: int
|
|
"""
|
|
Location where the xref data ended.
|
|
"""
|
|
|
|
stream_ref: Optional[generic.Reference]
|
|
"""
|
|
Reference to the relevant xref stream, if applicable.
|
|
"""
|
|
|
|
|
|
class XRefSectionData:
|
|
"""
|
|
Internal class for bookkeeping on a single cross-reference section,
|
|
independently of the others.
|
|
"""
|
|
|
|
def __init__(self: 'XRefSectionData'):
|
|
# TODO Food for thought: is there an efficient IntMap implementation out
|
|
# there that can beat generic python dicts in real-world scenarios?
|
|
self.freed: Dict[int, int] = {}
|
|
self.standard_xrefs: Dict[int, Tuple[int, int]] = {}
|
|
self.xrefs_in_objstm: Dict[int, ObjStreamRef] = {}
|
|
self.explicit_refs_in_revision: Set[Tuple[int, int]] = set()
|
|
self.obj_streams_used: Set[int] = set()
|
|
self.hybrid: Optional[XRefSection] = None
|
|
|
|
def try_resolve(
|
|
self, ref: Union[generic.Reference, generic.IndirectObject]
|
|
) -> Optional[Union[int, ObjStreamRef]]:
|
|
# The lookups are ordered more or less in the order we expect
|
|
# them to be queried most frequently in a given file.
|
|
|
|
# In files that use object streams in a context where it provides
|
|
# significant savings, we also expect _most_ objects (esp. lots of
|
|
# small ones) to be stored in object streams (case in point: tagged
|
|
# documents). As such, it makes sense to look there first.
|
|
# Note: generation must be zero
|
|
if ref.generation == 0:
|
|
try:
|
|
return self.xrefs_in_objstm[ref.idnum]
|
|
except KeyError:
|
|
pass
|
|
|
|
std_ref = self.standard_xrefs.get(ref.idnum, None)
|
|
if std_ref is not None:
|
|
# check if the generations match
|
|
if std_ref[0] == ref.generation:
|
|
return std_ref[1]
|
|
else:
|
|
raise KeyError(ref)
|
|
|
|
freed_next_generation = self.freed.get(ref.idnum, None)
|
|
# The generation in a free entry is the _next_ gen number at which
|
|
# the reference may be used, hence the -1
|
|
# Otherwise, the freed entry is irrelevant, so we fall through.
|
|
if (
|
|
freed_next_generation is not None
|
|
and ref.generation == freed_next_generation - 1
|
|
):
|
|
# note: 7.5.8.4 is confusingly worded, but we do _not_ need to
|
|
# fall back to the hybrid ref here, because the main section
|
|
# still takes precedence!
|
|
return None
|
|
|
|
# ISO 32000-2:2020, 7.5.8.4 says that we have to look in the hybrid
|
|
# stream section before moving on now.
|
|
if self.hybrid is not None:
|
|
return self.hybrid.xref_data.try_resolve(ref)
|
|
else:
|
|
raise KeyError(ref)
|
|
|
|
def process_entries(self, entries: Iterator[XRefEntry], strict: bool):
|
|
highest_id = 0
|
|
for xref_entry in entries:
|
|
idnum = xref_entry.idnum
|
|
generation = xref_entry.generation
|
|
highest_id = max(idnum, highest_id)
|
|
if generation > 0xFFFF:
|
|
if strict:
|
|
raise PdfStrictReadError(
|
|
f"Illegal generation {generation} for "
|
|
f"object ID {idnum}."
|
|
)
|
|
continue
|
|
if xref_entry.idnum == 0:
|
|
continue # don't bother
|
|
if xref_entry.xref_type == XRefType.STANDARD:
|
|
offset = xref_entry.location
|
|
assert isinstance(offset, int)
|
|
self.standard_xrefs[idnum] = (generation, offset)
|
|
self.explicit_refs_in_revision.add((idnum, generation))
|
|
elif xref_entry.xref_type == XRefType.IN_OBJ_STREAM:
|
|
assert generation == 0
|
|
loc = xref_entry.location
|
|
assert isinstance(loc, ObjStreamRef)
|
|
self.xrefs_in_objstm[idnum] = loc
|
|
self.obj_streams_used.add(loc.obj_stream_id)
|
|
self.explicit_refs_in_revision.add((idnum, 0))
|
|
elif xref_entry.xref_type == XRefType.FREE:
|
|
self.freed[idnum] = generation
|
|
# subtract one, since we're removing one generation before
|
|
# this one.
|
|
self.explicit_refs_in_revision.add((idnum, generation - 1))
|
|
return highest_id
|
|
|
|
def process_hybrid_entries(
|
|
self,
|
|
entries: Iterator[XRefEntry],
|
|
xref_meta_info: XRefSectionMetaInfo,
|
|
strict: bool,
|
|
):
|
|
hybrid = XRefSectionData()
|
|
hybrid.process_entries(entries, strict=strict)
|
|
self.hybrid = XRefSection(xref_meta_info, hybrid)
|
|
|
|
def higher_generation_refs(self):
|
|
for idnum, (generation, _) in self.standard_xrefs.items():
|
|
if generation > 0:
|
|
yield idnum, generation
|
|
|
|
|
|
@dataclass(frozen=True)
|
|
class XRefSection:
|
|
"""
|
|
Describes a cross-reference section and describes how it is serialised into
|
|
the PDF file.
|
|
"""
|
|
|
|
meta_info: XRefSectionMetaInfo
|
|
"""
|
|
Metadata about the cross-reference section.
|
|
"""
|
|
|
|
xref_data: XRefSectionData
|
|
"""
|
|
A description of the actual object pointer definitions.
|
|
"""
|
|
|
|
|
|
def _check_freed_refs(
|
|
ix: int, section: XRefSection, all_sections: List[XRefSection]
|
|
):
|
|
# Prevent xref stream objects from being overwritten.
|
|
# (stricter than the spec, but it makes our lives easier at the cost
|
|
# of rejecting some theoretically valid files)
|
|
# The reason is because of the potential for caching issues with
|
|
# encrypted files.
|
|
xstream_ref = section.meta_info.stream_ref
|
|
if xstream_ref:
|
|
as_tuple = (xstream_ref.idnum, xstream_ref.generation)
|
|
for section_ in all_sections[ix + 1 :]:
|
|
data = section_.xref_data
|
|
if as_tuple in data.explicit_refs_in_revision:
|
|
raise misc.PdfStrictReadError(
|
|
"XRef stream objects must not be clobbered in strict "
|
|
"mode."
|
|
)
|
|
|
|
# For all free refs, check that _if_ redefined, they're
|
|
# redefined with a proper generation number
|
|
for idnum, expected_next_generation in section.xref_data.freed.items():
|
|
# When rewriting files & removing dead objects, Acrobat will
|
|
# enter the deleted reference into the Xref table/stream with
|
|
# a 'next generation' ID of 0. It doesn't contradict the spec
|
|
# directly, but I assumed that this was the way to indicate that
|
|
# generation 0xffff had just been freed. Apparently not, because
|
|
# I've seen Acrobat put that same freed reference in later revisions
|
|
# with a next_gen number of 1. Bizarre.
|
|
#
|
|
# Anyhow, given the ubiquity of Adobe (Acrobat|Reader), it's
|
|
# probably prudent to special-case this one.
|
|
# In doing so, we're probably not dealing correctly with the case
|
|
# where the 0xffff'th generation of an object is freed, but I'm
|
|
# happy to assume that that will never happen in a legitimate file.
|
|
|
|
# We will raise an error on any reuse of such "dead" objects
|
|
# in anything else than a 'free'.
|
|
#
|
|
# To be explicit (while still accepting legitimate files)
|
|
# we'll throw an error if this happens in a non-initial revision,
|
|
# until someone complains.
|
|
if expected_next_generation == 0 and ix > 0:
|
|
raise misc.PdfStrictReadError(
|
|
"In strict mode, a free xref with next generation 0 is only"
|
|
"permitted in an initial revision due to unclear semantics."
|
|
)
|
|
|
|
# We have to exempt free refs in sections preceding a HYBRID_MAIN
|
|
# section from conflicting with later overrides in HYBRID_STREAM
|
|
# sections. But also this provision is interpreted more strictly in
|
|
# pyHanko:
|
|
# we only allow the hybrid stream associated with the next section
|
|
# to be used to lift the restriction.
|
|
# This is sufficient to enable the subset of uses of hybrid references
|
|
# that I've seen in the wild.
|
|
|
|
if ix < len(all_sections) - 1:
|
|
next_section = all_sections[ix + 1]
|
|
hybrid: Optional[XRefSectionData] = (
|
|
next_section.xref_data.hybrid.xref_data
|
|
if next_section.xref_data.hybrid is not None
|
|
else None
|
|
)
|
|
if hybrid is not None:
|
|
if (
|
|
idnum in hybrid.standard_xrefs
|
|
or idnum in hybrid.xrefs_in_objstm
|
|
):
|
|
# exemption!
|
|
continue
|
|
|
|
improper_generation = None
|
|
for succ in all_sections[ix + 1 :]:
|
|
data = succ.xref_data
|
|
# don't enforce
|
|
if idnum in data.xrefs_in_objstm:
|
|
improper_generation = 0
|
|
else:
|
|
try:
|
|
next_generation, _ = data.standard_xrefs[idnum]
|
|
# We compare using < instead of forcing equality.
|
|
# Jumps in the generation number will be detected
|
|
# by the higher_gen check further down.
|
|
# For the == 0 check, see comment about dead objects
|
|
# further up.
|
|
if (
|
|
expected_next_generation == 0
|
|
or next_generation < expected_next_generation
|
|
):
|
|
improper_generation = next_generation
|
|
except KeyError:
|
|
pass
|
|
|
|
if improper_generation is not None:
|
|
if expected_next_generation == 0:
|
|
raise misc.PdfStrictReadError(
|
|
f"Object with id {idnum} was listed as dead, "
|
|
f"but is reused later, with generation "
|
|
f"number {improper_generation}."
|
|
)
|
|
else:
|
|
raise misc.PdfStrictReadError(
|
|
f"Object with id {idnum} and generation "
|
|
f"{improper_generation} was found after "
|
|
f"{expected_next_generation - 1} was freed."
|
|
)
|
|
|
|
|
|
def _with_hybrids(sections: List[XRefSection]):
|
|
for section in sections:
|
|
if section.xref_data.hybrid is not None:
|
|
yield section.xref_data.hybrid
|
|
yield section
|
|
|
|
|
|
def _check_xref_consistency(all_sections: List[XRefSection]):
|
|
# expand out the hybrid sections as separate sections for the purposes
|
|
# of the xref consistency check (try_resolve takes hybrids into account,
|
|
# but
|
|
|
|
# put the sections in chronological order for code readability reasons
|
|
for ix, section in enumerate(all_sections):
|
|
# check freed refs
|
|
_check_freed_refs(ix, section, all_sections)
|
|
|
|
expanded_sections = list(_with_hybrids(all_sections))
|
|
prev_size = 0
|
|
for ix, section in enumerate(expanded_sections):
|
|
# collect all higher-generation refs
|
|
higher_gen = set(section.xref_data.higher_generation_refs())
|
|
sz = section.meta_info.size
|
|
xref_type = section.meta_info.xref_section_type
|
|
if sz < prev_size and xref_type != XRefSectionType.HYBRID_STREAM:
|
|
# We allow stream sections in hybrid files to declare a smaller
|
|
# xref size. I've seen this happen in cases where the xref stream
|
|
# in a hybrid file doesn't contain itself.
|
|
# Since this is common (*cough*MS Word*cough*) and not explicitly
|
|
# prohibited by the standard, we let that slide.
|
|
raise misc.PdfStrictReadError(
|
|
f"XRef section sizes must be nondecreasing; found XRef section "
|
|
f"of size {sz} after section of size {prev_size}."
|
|
)
|
|
prev_size = max(sz, prev_size)
|
|
|
|
# Verify that all such higher-generation refs are
|
|
# preceded by an appropriate free instruction of the previous generation
|
|
# HOWEVER: the generation match is disabled when matching against
|
|
# freeings in the previous revision, to allow room for the way
|
|
# the free placeholders are commonly declared in hybrid reference
|
|
# files (i.e. as xxxxxxxxx 65535 f)
|
|
is_hybrid_stream = (
|
|
section.meta_info.xref_section_type == XRefSectionType.HYBRID_STREAM
|
|
)
|
|
for idnum, generation in higher_gen:
|
|
preceding = reversed(expanded_sections[:ix])
|
|
|
|
# this is the hybrid stream special case (see above)
|
|
if is_hybrid_stream:
|
|
prec = next(preceding)
|
|
# don't apply generation check, just verify whether a free
|
|
# was present.
|
|
if idnum in prec.xref_data.freed:
|
|
break
|
|
|
|
for prec in preceding:
|
|
try:
|
|
next_generation = prec.xref_data.freed[idnum]
|
|
if next_generation == generation:
|
|
break # we've found the appropriate 'free'
|
|
except KeyError:
|
|
continue
|
|
else:
|
|
raise misc.PdfStrictReadError(
|
|
f"Object with id {idnum} has an orphaned "
|
|
f"generation: generation {generation} was "
|
|
f"not preceded by a free instruction for "
|
|
f"generation {generation - 1}."
|
|
)
|
|
|
|
|
|
class XRefBuilder:
|
|
err_limit = 10
|
|
|
|
def __init__(
|
|
self, handler: PdfHandler, stream, strict: bool, last_startxref: int
|
|
):
|
|
self.handler = handler
|
|
self.stream = stream
|
|
self.strict = strict
|
|
self.last_startxref = last_startxref
|
|
self.sections: List[XRefSection] = []
|
|
|
|
self.trailer = TrailerDictionary()
|
|
self.trailer.container_ref = generic.TrailerReference(handler)
|
|
self.has_xref_stream = False
|
|
|
|
def _read_xref_stream_object(self):
|
|
stream = self.stream
|
|
idnum, generation = read_object_header(stream, strict=self.strict)
|
|
xrefstream_ref = generic.Reference(idnum, generation, pdf=self.handler)
|
|
xrefstream = generic.StreamObject.read_from_stream(
|
|
stream, xrefstream_ref
|
|
)
|
|
xrefstream.container_ref = xrefstream_ref
|
|
assert xrefstream.raw_get("/Type") == "/XRef"
|
|
return xrefstream_ref, xrefstream
|
|
|
|
def _read_xref_stream(self, declared_startxref: int):
|
|
stream = self.stream
|
|
start_location = stream.tell()
|
|
xrefstream_ref, xrefstream = self._read_xref_stream_object()
|
|
|
|
xref_section_data = XRefSectionData()
|
|
xref_section_data.process_entries(
|
|
parse_xref_stream(xrefstream, strict=self.strict),
|
|
strict=self.strict,
|
|
)
|
|
xref_meta_info = XRefSectionMetaInfo(
|
|
xref_section_type=XRefSectionType.STREAM,
|
|
# in a stream, the number of entries is enforced, so we don't need
|
|
# to check it explicitly
|
|
size=int(xrefstream.raw_get('/Size')),
|
|
declared_startxref=declared_startxref,
|
|
start_location=start_location,
|
|
end_location=stream.tell(),
|
|
stream_ref=xrefstream_ref,
|
|
)
|
|
self.sections.append(XRefSection(xref_meta_info, xref_section_data))
|
|
|
|
self.trailer.add_trailer_revision(xrefstream)
|
|
return xrefstream.get('/Prev')
|
|
|
|
def _read_xref_table(self, declared_startxref: int):
|
|
stream = self.stream
|
|
xref_start = stream.tell()
|
|
xref_section_data = XRefSectionData()
|
|
highest = xref_section_data.process_entries(
|
|
parse_xref_table(stream), strict=self.strict
|
|
)
|
|
xref_end = stream.tell()
|
|
|
|
new_trailer = generic.DictionaryObject.read_from_stream(
|
|
stream, generic.TrailerReference(self.handler)
|
|
)
|
|
assert isinstance(new_trailer, generic.DictionaryObject)
|
|
declared_size = int(new_trailer.raw_get('/Size'))
|
|
|
|
if self.strict and highest >= declared_size:
|
|
raise misc.PdfStrictReadError(
|
|
f"Xref table size mismatch: table allocated object with id "
|
|
f"{highest}, but according to the trailer {declared_size - 1} "
|
|
f"is the maximal allowed object id."
|
|
)
|
|
|
|
try:
|
|
hybrid_xref_stm_loc = int(new_trailer.raw_get('/XRefStm'))
|
|
except (KeyError, ValueError):
|
|
hybrid_xref_stm_loc = None
|
|
|
|
if hybrid_xref_stm_loc is not None:
|
|
stream_pos = stream.tell()
|
|
|
|
# TODO: employ similar off-by-n tolerances for hybrid xref sections
|
|
# as for regular ones? Probably YAGNI, but putting it out there
|
|
# for future review.
|
|
stream.seek(hybrid_xref_stm_loc)
|
|
stream_ref, xrefstream = self._read_xref_stream_object()
|
|
|
|
# we'll treat those as regular XRef streams in terms of metadata
|
|
hybrid_stream_meta = XRefSectionMetaInfo(
|
|
xref_section_type=XRefSectionType.HYBRID_STREAM,
|
|
size=int(xrefstream.raw_get('/Size')),
|
|
declared_startxref=hybrid_xref_stm_loc,
|
|
start_location=hybrid_xref_stm_loc,
|
|
end_location=stream.tell(),
|
|
stream_ref=stream_ref,
|
|
)
|
|
xref_section_data.process_hybrid_entries(
|
|
parse_xref_stream(xrefstream),
|
|
hybrid_stream_meta,
|
|
strict=self.strict,
|
|
)
|
|
stream.seek(stream_pos)
|
|
xref_type = XRefSectionType.HYBRID_MAIN
|
|
else:
|
|
stream_ref = None
|
|
xref_type = XRefSectionType.STANDARD
|
|
|
|
xref_meta_info = XRefSectionMetaInfo(
|
|
xref_section_type=xref_type,
|
|
size=declared_size,
|
|
declared_startxref=declared_startxref,
|
|
start_location=xref_start,
|
|
end_location=xref_end,
|
|
stream_ref=stream_ref,
|
|
)
|
|
self.sections.append(XRefSection(xref_meta_info, xref_section_data))
|
|
|
|
self.trailer.add_trailer_revision(new_trailer)
|
|
return new_trailer.get('/Prev')
|
|
|
|
def read_xrefs(self):
|
|
# read all cross reference tables and their trailers
|
|
stream = self.stream
|
|
declared_startxref = startxref = self.last_startxref
|
|
# Tracks the number of times we retried to read a particular xref
|
|
# section
|
|
err_count = 0
|
|
while startxref is not None:
|
|
if (self.strict and err_count) or err_count > self.err_limit:
|
|
raise PdfStrictReadError("Failed to locate xref section")
|
|
if not err_count:
|
|
declared_startxref = startxref
|
|
stream.seek(startxref)
|
|
if misc.skip_over_whitespace(stream):
|
|
# This is common in linearised files, so we're not marking
|
|
# this as an error, even in strict mode.
|
|
logger.debug(
|
|
"Encountered unexpected whitespace when looking "
|
|
"for xref stream"
|
|
)
|
|
startxref = stream.tell()
|
|
x = stream.read(1)
|
|
if x == b"x":
|
|
# standard cross-reference table
|
|
ref = stream.read(4)
|
|
if ref[:3] != b"ref":
|
|
raise PdfReadError("xref table read error")
|
|
startxref = self._read_xref_table(declared_startxref)
|
|
elif x.isdigit():
|
|
# PDF 1.5+ Cross-Reference Stream
|
|
stream.seek(-1, os.SEEK_CUR)
|
|
try:
|
|
startxref = self._read_xref_stream(declared_startxref)
|
|
except ObjectHeaderReadError as e:
|
|
logger.debug(
|
|
"Failed to read xref stream header, attempting to "
|
|
"correct...",
|
|
exc_info=e,
|
|
)
|
|
# can be caused by an OBO error (pointing too far)
|
|
startxref = _attempt_startxref_correction(stream, startxref)
|
|
err_count += 1
|
|
continue
|
|
self.has_xref_stream = True
|
|
else:
|
|
startxref = _attempt_startxref_correction(stream, startxref)
|
|
err_count += 1
|
|
continue
|
|
err_count = 0
|
|
|
|
# put the sections in chronological order from this point onwards
|
|
# (better readability)
|
|
chrono_sections = list(reversed(self.sections))
|
|
if self.strict:
|
|
_check_xref_consistency(chrono_sections)
|
|
return chrono_sections
|
|
|
|
|
|
class ObjectHeaderReadError(misc.PdfReadError):
|
|
pass
|
|
|
|
|
|
def _read_object_header(stream, strict):
|
|
# Should never be necessary to read out whitespace, since the
|
|
# cross-reference table should put us in the right spot to read the
|
|
# object header. In reality... some files have stupid cross reference
|
|
# tables that are off by whitespace bytes.
|
|
extra = False
|
|
misc.skip_over_comment(stream)
|
|
extra |= misc.skip_over_whitespace(stream)
|
|
idnum_bytes = misc.read_until_whitespace(stream)
|
|
idnum = int(idnum_bytes)
|
|
extra |= misc.skip_over_whitespace(stream)
|
|
generation_bytes = misc.read_until_whitespace(stream)
|
|
generation = int(generation_bytes)
|
|
stream.read(3)
|
|
misc.read_non_whitespace(stream, seek_back=True)
|
|
|
|
if extra and strict:
|
|
logger.warning(
|
|
f"Superfluous whitespace found in object header "
|
|
f"{idnum} {generation}"
|
|
)
|
|
return idnum, generation
|
|
|
|
|
|
def read_object_header(stream, strict):
|
|
pos = stream.tell()
|
|
try:
|
|
return _read_object_header(stream, strict)
|
|
except ValueError as e:
|
|
raise ObjectHeaderReadError(
|
|
f"Failed to read object header at {pos}"
|
|
) from e
|
|
|
|
|
|
class TrailerDictionary(generic.PdfObject):
|
|
"""
|
|
The standard mandates that each trailer shall contain
|
|
at least all keys used in the preceding trailer, even if unmodified.
|
|
Of course, we cannot trust documents to actually follow this rule, so
|
|
this class implements fallbacks.
|
|
"""
|
|
|
|
# These keys shouldn't really be considered part of the trailer dictionary,
|
|
# and in particular are not subject to inheritance rules.
|
|
non_trailer_keys = {
|
|
'/Length',
|
|
'/Filter',
|
|
'/DecodeParms',
|
|
'/W',
|
|
'/Type',
|
|
'/Index',
|
|
'/XRefStm',
|
|
}
|
|
|
|
def __init__(self: 'TrailerDictionary'):
|
|
# trailer revisions, numbered backwards (i.e. in processing order)
|
|
# The element at index 0 is the most recent one.
|
|
self._trailer_revisions: List[generic.DictionaryObject] = []
|
|
self._new_changes = generic.DictionaryObject()
|
|
|
|
def add_trailer_revision(self, trailer_dict: generic.DictionaryObject):
|
|
self._trailer_revisions.append(trailer_dict)
|
|
|
|
def __getitem__(self, item):
|
|
# the decrypt parameter doesn't matter, get_object() decrypts
|
|
# as necessary.
|
|
return self.raw_get(item).get_object()
|
|
|
|
def raw_get(
|
|
self,
|
|
key,
|
|
decrypt: EncryptedObjAccess = EncryptedObjAccess.TRANSPARENT,
|
|
revision=None,
|
|
):
|
|
revisions = self._trailer_revisions
|
|
if revision is None:
|
|
try:
|
|
return self._new_changes.raw_get(key, decrypt)
|
|
except KeyError:
|
|
pass
|
|
else:
|
|
# xref sections are numbered backwards
|
|
section = len(revisions) - 1 - revision
|
|
revisions = revisions[section:]
|
|
|
|
if key in self.non_trailer_keys:
|
|
# These are not subject to trailer inheritance;
|
|
# only look in the most recent revision
|
|
return revisions[0].raw_get(key, decrypt)
|
|
|
|
for revision in revisions:
|
|
try:
|
|
return revision.raw_get(key, decrypt)
|
|
except KeyError:
|
|
continue
|
|
raise KeyError(key)
|
|
|
|
def __setitem__(self, item, value):
|
|
self._new_changes[item] = value
|
|
|
|
def __delitem__(self, item):
|
|
try:
|
|
# deleting stuff from _new_changes should be OK.
|
|
del self._new_changes[item]
|
|
except KeyError:
|
|
raise misc.PdfError(
|
|
"Cannot remove existing entries from trailer dictionary, only "
|
|
"update them."
|
|
)
|
|
|
|
def flatten(self, revision=None) -> generic.DictionaryObject:
|
|
relevant_revisions = self._trailer_revisions
|
|
if revision is not None:
|
|
relevant_revisions = relevant_revisions[-revision - 1 :]
|
|
trailer = generic.DictionaryObject(
|
|
{
|
|
k: v
|
|
for revision in reversed(relevant_revisions)
|
|
for k, v in revision.items()
|
|
}
|
|
)
|
|
if revision is None:
|
|
trailer.update(self._new_changes)
|
|
|
|
# ensure that the trailer isn't polluted using stream
|
|
# compression / XRef parameters
|
|
for key in self.non_trailer_keys:
|
|
trailer.pop(key, None)
|
|
return trailer
|
|
|
|
def __contains__(self, item):
|
|
try:
|
|
self.raw_get(item, decrypt=EncryptedObjAccess.PROXY)
|
|
return True
|
|
except KeyError:
|
|
return False
|
|
|
|
def keys(self):
|
|
return frozenset(chain(self._new_changes, *self._trailer_revisions))
|
|
|
|
def __iter__(self):
|
|
return iter(self.keys())
|
|
|
|
def items(self):
|
|
return self.flatten().items()
|
|
|
|
def write_to_stream(self, stream, handler=None, container_ref=None):
|
|
raise NotImplementedError(
|
|
"TrailerDictionary object cannot be written directly"
|
|
)
|
|
|
|
|
|
def _attempt_startxref_correction(stream, startxref):
|
|
# couldn't process data at location pointed to by startxref.
|
|
# Let's see if we can find the xref table nearby, as we've observed this
|
|
# error with an off-by-one before.
|
|
stream.seek(-11, os.SEEK_CUR)
|
|
tmp = stream.read(20)
|
|
xref_loc = tmp.find(b"xref")
|
|
if xref_loc != -1:
|
|
return startxref - (10 - xref_loc)
|
|
# No explicit xref table, try finding a cross-reference stream, with a
|
|
# negative offset to account for the startxref being wrong in either
|
|
# direction
|
|
xrefstm_readback = 5
|
|
stream.seek(startxref - xrefstm_readback)
|
|
tmp = stream.read(2 * xrefstm_readback)
|
|
newline_loc = tmp.rfind(b"\n")
|
|
if newline_loc != -1:
|
|
stream.seek(startxref - xrefstm_readback + newline_loc - 1)
|
|
misc.skip_over_whitespace(stream)
|
|
line_start = stream.tell()
|
|
for look in range(5):
|
|
if stream.read(1).isdigit():
|
|
# This is not a standard PDF, consider adding a warning
|
|
return line_start + look
|
|
|
|
# no xref table found at specified location
|
|
raise PdfReadError("Could not find xref table at specified location")
|
|
|
|
|
|
def convert_to_int(d, size):
|
|
if size <= 8:
|
|
padding = bytes(8 - size)
|
|
return struct.unpack(">q", padding + d)[0]
|
|
else:
|
|
return sum(digit * 256 ** (size - ix - 1) for ix, digit in enumerate(d))
|
|
|
|
|
|
class XRefCache:
|
|
"""
|
|
Internal class to parse & store information from the xref section(s) of a
|
|
PDF document.
|
|
|
|
Stores both the most recent status of all xrefs in addition to their
|
|
historical values.
|
|
|
|
All members of this class are considered internal API and are subject
|
|
to change without notice.
|
|
"""
|
|
|
|
def __init__(self, reader, xref_sections: List[XRefSection]):
|
|
self.reader = reader
|
|
self._xref_sections = xref_sections
|
|
|
|
# Our consistency checker forbids these from being clobbered in strict
|
|
# mode even though the spec allows it. It's too much of a pain
|
|
# to deal with the impact on encryption semantics correctly, and
|
|
# for the time being, I'm willing to deal with the possibility of
|
|
# having to reject a few potentially legitimate files because of this.
|
|
self.xref_stream_refs = {
|
|
section.meta_info.stream_ref
|
|
for section in xref_sections
|
|
if section.meta_info.stream_ref is not None
|
|
}
|
|
|
|
@property
|
|
def total_revisions(self):
|
|
return len(self._xref_sections)
|
|
|
|
def get_last_change(self, ref: generic.Reference):
|
|
for ix, section in enumerate(reversed(self._xref_sections)):
|
|
try:
|
|
section.xref_data.try_resolve(ref)
|
|
return len(self._xref_sections) - 1 - ix
|
|
except KeyError:
|
|
# nothing in this section
|
|
pass
|
|
raise KeyError(ref)
|
|
|
|
def object_streams_used_in(self, revision):
|
|
section = self._xref_sections[revision]
|
|
return {
|
|
generic.Reference(objstm_id, pdf=self.reader)
|
|
for objstm_id in section.xref_data.obj_streams_used
|
|
}
|
|
|
|
def get_introducing_revision(self, ref: generic.Reference):
|
|
for ix, section in enumerate(self._xref_sections):
|
|
try:
|
|
result = section.xref_data.try_resolve(ref)
|
|
if result is not None:
|
|
return ix
|
|
except KeyError:
|
|
# nothing in this section
|
|
pass
|
|
raise KeyError(ref)
|
|
|
|
def get_xref_container_info(self, revision) -> XRefSectionMetaInfo:
|
|
section = self._xref_sections[revision]
|
|
return section.meta_info
|
|
|
|
def get_xref_data(self, revision) -> XRefSectionData:
|
|
section = self._xref_sections[revision]
|
|
return section.xref_data
|
|
|
|
def explicit_refs_in_revision(self, revision) -> Set[generic.Reference]:
|
|
"""
|
|
Look up the object refs for all objects explicitly added or overwritten
|
|
in a given revision.
|
|
|
|
:param revision:
|
|
A revision number. The oldest revision is zero.
|
|
:return:
|
|
A set of Reference objects.
|
|
"""
|
|
section = self._xref_sections[revision]
|
|
result = {
|
|
generic.Reference(idnum, generation, pdf=self.reader)
|
|
for idnum, generation in section.xref_data.explicit_refs_in_revision
|
|
}
|
|
hybrid = section.xref_data.hybrid
|
|
if hybrid is not None:
|
|
# make sure we also account for refs in hybrid sections
|
|
result |= {
|
|
generic.Reference(idnum, generation, pdf=self.reader)
|
|
for idnum, generation in hybrid.xref_data.explicit_refs_in_revision
|
|
}
|
|
return result
|
|
|
|
def refs_freed_in_revision(self, revision) -> Set[generic.Reference]:
|
|
"""
|
|
Look up the object refs for all objects explicitly freed
|
|
in a given revision.
|
|
|
|
:param revision:
|
|
A revision number. The oldest revision is zero.
|
|
:return:
|
|
A set of Reference objects.
|
|
"""
|
|
section = self._xref_sections[revision]
|
|
return {
|
|
generic.Reference(idnum, gen - 1, pdf=self.reader)
|
|
for idnum, gen in section.xref_data.freed.items()
|
|
if gen > 0 # don't acknowledge "dead" objects as freeings
|
|
}
|
|
|
|
def get_startxref_for_revision(self, revision) -> int:
|
|
"""
|
|
Look up the location of the XRef table/stream associated with a specific
|
|
revision, as indicated by startxref or /Prev.
|
|
|
|
:param revision:
|
|
A revision number. The oldest revision is zero.
|
|
:return:
|
|
An integer pointer
|
|
"""
|
|
section = self._xref_sections[revision]
|
|
return section.meta_info.declared_startxref
|
|
|
|
def get_historical_ref(
|
|
self, ref, revision
|
|
) -> Optional[Union[int, ObjStreamRef]]:
|
|
"""
|
|
Look up the location of the historical value of an object.
|
|
|
|
.. note::
|
|
This method is not suitable for determining whether or not
|
|
a particular object ID is available in a given revision, since
|
|
it treats unused objects and freed objects the same way.
|
|
|
|
:param ref:
|
|
An object reference.
|
|
:param revision:
|
|
A revision number. The oldest revision is zero.
|
|
:return:
|
|
An integer offset, an object stream reference, or ``None`` if
|
|
the reference does not resolve in the specified revision.
|
|
"""
|
|
for section in reversed(self._xref_sections[: revision + 1]):
|
|
try:
|
|
result = section.xref_data.try_resolve(ref)
|
|
return result
|
|
except KeyError:
|
|
continue
|
|
|
|
return None
|
|
|
|
def __getitem__(self, ref):
|
|
# No need to make this more efficient, since the reader caches
|
|
# objects for us.
|
|
return self.get_historical_ref(ref, len(self._xref_sections) - 1)
|
|
|
|
@property
|
|
def hybrid_xrefs_present(self) -> bool:
|
|
"""
|
|
Determine if a file uses hybrid references anywhere.
|
|
|
|
:return:
|
|
``True`` if hybrid references were detected, ``False`` otherwise.
|
|
"""
|
|
return any(
|
|
section.meta_info.xref_section_type == XRefSectionType.HYBRID_MAIN
|
|
for section in self._xref_sections
|
|
)
|
|
|
|
|
|
PositionDict = Dict[Tuple[int, int], Union[int, Tuple[int, int]]]
|
|
# Note: position_dict is (generation, objId) -> pos
|
|
# for historical reasons (inherited from refactored PyPDF2 logic).
|
|
# Position can be an int (absolute file offset) or a
|
|
# tuple (object stream reference)
|
|
|
|
|
|
OBJSTREAM_FORBIDDEN = (generic.IndirectObject, generic.StreamObject)
|
|
|
|
|
|
class ObjectStream:
|
|
"""
|
|
Utility class to collect objects into a PDF object stream.
|
|
|
|
Object streams are mainly useful for space efficiency reasons.
|
|
They allow related objects to be grouped & compressed together in a
|
|
more flexible manner.
|
|
|
|
|
|
.. warning::
|
|
Object streams can only be used in files with a cross-reference
|
|
stream, as opposed to a classical XRef table.
|
|
In particular, this means that incremental updates to files with a
|
|
legacy XRef table cannot contain object streams either.
|
|
See § 7.5.7 in ISO 32000-1 for further details.
|
|
|
|
.. danger::
|
|
Use :meth:`.BasePdfFileWriter.prepare_object_stream` to create instances
|
|
of object streams. The `__init__` function is internal API.
|
|
|
|
"""
|
|
|
|
def __init__(self, compress=True):
|
|
self._obj_refs = {}
|
|
self.compress = compress
|
|
self.ref = None
|
|
|
|
def __bool__(self):
|
|
return bool(self._obj_refs)
|
|
|
|
def __iter__(self):
|
|
return iter(self._obj_refs.items())
|
|
|
|
def add_object(self, idnum: int, obj: generic.PdfObject):
|
|
"""
|
|
Add an object to an object stream.
|
|
Note that objects in object streams always have their generation number
|
|
set to `0` by definition.
|
|
|
|
:param idnum:
|
|
The object's ID number.
|
|
:param obj:
|
|
The object to embed into the object stream.
|
|
:raise TypeError:
|
|
Raised if ``obj`` is an instance of :class:`~.generic.StreamObject`
|
|
or :class:`~.generic.IndirectObject`.
|
|
"""
|
|
|
|
if isinstance(obj, OBJSTREAM_FORBIDDEN):
|
|
raise TypeError(
|
|
'Stream objects and bare references cannot be embedded into '
|
|
'object streams.'
|
|
)
|
|
self._obj_refs[idnum] = obj
|
|
|
|
def as_pdf_object(self) -> generic.StreamObject:
|
|
"""
|
|
Render the object stream to a PDF stream object
|
|
|
|
:return: An instance of :class:`~.generic.StreamObject`.
|
|
"""
|
|
stream_header = BytesIO()
|
|
main_body = BytesIO()
|
|
for idnum, obj in self._obj_refs.items():
|
|
offset = main_body.tell()
|
|
obj.write_to_stream(main_body, None)
|
|
stream_header.write(b'%d %d ' % (idnum, offset))
|
|
|
|
first_obj_offset = stream_header.tell()
|
|
stream_header.seek(0)
|
|
sh_bytes = stream_header.read(first_obj_offset)
|
|
stream_data = sh_bytes + main_body.getvalue()
|
|
stream_object = generic.StreamObject(
|
|
{
|
|
pdf_name('/Type'): pdf_name('/ObjStm'),
|
|
pdf_name('/N'): generic.NumberObject(len(self._obj_refs)),
|
|
pdf_name('/First'): generic.NumberObject(first_obj_offset),
|
|
},
|
|
stream_data=stream_data,
|
|
)
|
|
if self.compress:
|
|
stream_object.compress()
|
|
return stream_object
|
|
|
|
|
|
def _contiguous_xref_chunks(position_dict):
|
|
"""
|
|
Helper method to divide the XRef table (or stream) into contiguous chunks.
|
|
"""
|
|
previous_idnum = None
|
|
current_chunk = []
|
|
|
|
if not position_dict.keys():
|
|
# return immediately, there are no objects
|
|
return
|
|
|
|
# iterate over keys in object ID order
|
|
key_iter = sorted(position_dict.keys(), key=lambda t: t[1])
|
|
(_, first_idnum), key_iter = peek(key_iter)
|
|
for ix in key_iter:
|
|
generation, idnum = ix
|
|
|
|
# the idnum jumped, so yield the current chunk
|
|
# and start a new one
|
|
if current_chunk and idnum != previous_idnum + 1:
|
|
yield first_idnum, current_chunk
|
|
current_chunk = []
|
|
first_idnum = idnum
|
|
|
|
# append the object reference to the current chunk
|
|
# (xref table requires position and generation entries)
|
|
current_chunk.append((position_dict[ix], generation))
|
|
previous_idnum = idnum
|
|
|
|
# there is always at least one chunk, so this is fine
|
|
yield first_idnum, current_chunk
|
|
|
|
|
|
def write_xref_table(stream, position_dict: Dict[Tuple[int, int], int]):
|
|
xref_location = stream.tell()
|
|
stream.write(b'xref\n')
|
|
# Insert xref table subsections in contiguous chunks.
|
|
# This is necessarily more complicated than the implementation
|
|
# in PyPDF2 (see ISO 32000 § 7.5.4, esp. on updates), since
|
|
# we need to handle incremental updates correctly.
|
|
subsections = _contiguous_xref_chunks(position_dict)
|
|
|
|
def write_header(idnum, length):
|
|
header = '%d %d\n' % (idnum, length)
|
|
stream.write(header.encode('ascii'))
|
|
|
|
def write_subsection(chunk):
|
|
for position, generation in chunk:
|
|
entry = "%010d %05d n \n" % (position, generation)
|
|
stream.write(entry.encode('ascii'))
|
|
|
|
try:
|
|
first_idnum, subsection = next(subsections)
|
|
except StopIteration:
|
|
# no updates, just write '0 0' and be done with it
|
|
stream.write(b'0 0\n')
|
|
return xref_location
|
|
# TODO support deleting objects
|
|
# case distinction: in contrast with the above we have to ensure that
|
|
# everything is written in one chunk when *not* doing incremental updates.
|
|
# In particular, this applies to the null object
|
|
null_obj_ref = b'0000000000 65535 f \n'
|
|
if first_idnum == 1:
|
|
# integrate the null object into the first subsection
|
|
write_header(0, len(subsection) + 1)
|
|
stream.write(null_obj_ref)
|
|
write_subsection(subsection)
|
|
else:
|
|
# insert origin of linked list of freed objects, and then the first
|
|
# subsection, as usual
|
|
stream.write(b'0 1\n')
|
|
stream.write(null_obj_ref)
|
|
write_header(first_idnum, len(subsection))
|
|
write_subsection(subsection)
|
|
for first_idnum, subsection in subsections:
|
|
# subsection header: list first object ID + length of subsection
|
|
write_header(first_idnum, len(subsection))
|
|
write_subsection(subsection)
|
|
|
|
return xref_location
|
|
|
|
|
|
class XRefStream(generic.StreamObject):
|
|
def __init__(self, position_dict: PositionDict):
|
|
super().__init__()
|
|
self.position_dict = position_dict
|
|
|
|
# type indicator is one byte wide
|
|
# we use longs to indicate positions of objects (>Q)
|
|
# two more bytes for the generation number of an uncompressed object
|
|
widths = map(generic.NumberObject, (1, 8, 2))
|
|
self.update(
|
|
{
|
|
pdf_name('/W'): generic.ArrayObject(widths),
|
|
pdf_name('/Type'): pdf_name('/XRef'),
|
|
}
|
|
)
|
|
|
|
def write_to_stream(self, stream, handler=None, container_ref=None):
|
|
# the caller is responsible for making sure that the stream
|
|
# is registered in the position dictionary
|
|
|
|
index = [0, 1]
|
|
subsections = _contiguous_xref_chunks(self.position_dict)
|
|
stream_content = BytesIO()
|
|
# write null object
|
|
stream_content.write(b'\x00' * 9 + b'\xff\xff')
|
|
for first_idnum, subsection in subsections:
|
|
index += [first_idnum, len(subsection)]
|
|
for position, generation in subsection:
|
|
if isinstance(position, tuple):
|
|
# reference to object in object stream
|
|
assert generation == 0
|
|
obj_stream_num, ix = position
|
|
stream_content.write(b'\x02')
|
|
stream_content.write(struct.pack('>Q', obj_stream_num))
|
|
stream_content.write(struct.pack('>H', ix))
|
|
else:
|
|
stream_content.write(b'\x01')
|
|
stream_content.write(struct.pack('>Q', position))
|
|
stream_content.write(struct.pack('>H', generation))
|
|
index_entry = generic.ArrayObject(map(generic.NumberObject, index))
|
|
|
|
self[pdf_name('/Index')] = index_entry
|
|
self._data = stream_content.getbuffer()
|
|
super().write_to_stream(stream, None)
|