257 lines
7.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Page labels are shown by PDF viewers as "the page number".
A page has a numeric index, starting with 0. Additionally to that, the page
has a label. In the most simple case:
label = index + 1
However, the title page and the table of contents might have roman numerals as
page label. This makes things more complicated.
Example 1
---------
>>> reader.trailer["/Root"]["/PageLabels"]["/Nums"]
[0, IndirectObject(18, 0, 139929798197504),
8, IndirectObject(19, 0, 139929798197504)]
>>> reader.get_object(reader.trailer["/Root"]["/PageLabels"]["/Nums"][1])
{'/S': '/r'}
>>> reader.get_object(reader.trailer["/Root"]["/PageLabels"]["/Nums"][3])
{'/S': '/D'}
Example 2
---------
The following example shows a document with pages labeled
i, ii, iii, iv, 1, 2, 3, A-8, A-9, ...
1 0 obj
<< /Type /Catalog
/PageLabels << /Nums [
0 << /S /r >>
4 << /S /D >>
7 << /S /D
/P ( A- )
/St 8
>>
% A number tree containing
% three page label dictionaries
]
>>
...
>>
endobj
PDF Specification 1.7
=====================
Table 159 Entries in a page label dictionary
----------------------------------------------
The S-key:
D Decimal arabic numerals
R Uppercase roman numerals
r Lowercase roman numerals
A Uppercase letters (A to Z for the first 26 pages,
AA to ZZ for the next 26, and so on)
a Lowercase letters (a to z for the first 26 pages,
aa to zz for the next 26, and so on)
"""
from typing import Iterator, Optional, Tuple
from ._protocols import PdfReaderProtocol
from ._utils import logger_warning
from .generic import ArrayObject, DictionaryObject, NumberObject
def number2uppercase_roman_numeral(num: int) -> str:
roman = [
(1000, "M"),
(900, "CM"),
(500, "D"),
(400, "CD"),
(100, "C"),
(90, "XC"),
(50, "L"),
(40, "XL"),
(10, "X"),
(9, "IX"),
(5, "V"),
(4, "IV"),
(1, "I"),
]
def roman_num(num: int) -> Iterator[str]:
for decimal, roman_repr in roman:
x, _ = divmod(num, decimal)
yield roman_repr * x
num -= decimal * x
if num <= 0:
break
return "".join(list(roman_num(num)))
def number2lowercase_roman_numeral(number: int) -> str:
return number2uppercase_roman_numeral(number).lower()
def number2uppercase_letter(number: int) -> str:
if number <= 0:
raise ValueError("Expecting a positive number")
alphabet = [chr(i) for i in range(ord("A"), ord("Z") + 1)]
rep = ""
while number > 0:
remainder = number % 26
if remainder == 0:
remainder = 26
rep = alphabet[remainder - 1] + rep
# update
number -= remainder
number = number // 26
return rep
def number2lowercase_letter(number: int) -> str:
return number2uppercase_letter(number).lower()
def index2label(reader: PdfReaderProtocol, index: int) -> str:
"""
See 7.9.7 "Number Trees".
Args:
reader: The PdfReader
index: The index of the page
Returns:
The label of the page, e.g. "iv" or "4".
"""
root = reader.trailer["/Root"]
if "/PageLabels" not in root:
return str(index + 1) # Fallback
number_tree = root["/PageLabels"]
if "/Nums" in number_tree:
# [Nums] shall be an array of the form
# [ key 1 value 1 key 2 value 2 ... key n value n ]
# where each key_i is an integer and the corresponding
# value_i shall be the object associated with that key.
# The keys shall be sorted in numerical order,
# analogously to the arrangement of keys in a name tree
# as described in 7.9.6, "Name Trees."
nums = number_tree["/Nums"]
i = 0
value = None
start_index = 0
while i < len(nums):
start_index = nums[i]
value = nums[i + 1].get_object()
if i + 2 == len(nums):
break
if nums[i + 2] > index:
break
i += 2
m = {
None: lambda n: "",
"/D": lambda n: str(n),
"/R": number2uppercase_roman_numeral,
"/r": number2lowercase_roman_numeral,
"/A": number2uppercase_letter,
"/a": number2lowercase_letter,
}
# if /Nums array is not following the specification or if /Nums is empty
if not isinstance(value, dict):
return str(index + 1) # Fallback
start = value.get("/St", 1)
prefix = value.get("/P", "")
return prefix + m[value.get("/S")](index - start_index + start)
if "/Kids" in number_tree or "/Limits" in number_tree:
logger_warning(
(
"/Kids or /Limits found in PageLabels. "
"Please share this PDF with pypdf: "
"https://github.com/py-pdf/pypdf/pull/1519"
),
__name__,
)
# TODO: Implement /Kids and /Limits for number tree
return str(index + 1) # Fallback if /Nums is not in the number_tree
def nums_insert(
key: NumberObject,
value: DictionaryObject,
nums: ArrayObject,
) -> None:
"""
Insert a key, value pair in a Nums array.
See 7.9.7 "Number Trees".
Args:
key: number key of the entry
value: value of the entry
nums: Nums array to modify
"""
if len(nums) % 2 != 0:
raise ValueError("a nums like array must have an even number of elements")
i = len(nums)
while i != 0 and key <= nums[i - 2]:
i = i - 2
if i < len(nums) and key == nums[i]:
nums[i + 1] = value
else:
nums.insert(i, key)
nums.insert(i + 1, value)
def nums_clear_range(
key: NumberObject,
page_index_to: int,
nums: ArrayObject,
) -> None:
"""
Remove all entries in a number tree in a range after an entry.
See 7.9.7 "Number Trees".
Args:
key: number key of the entry before the range
page_index_to: The page index of the upper limit of the range
nums: Nums array to modify
"""
if len(nums) % 2 != 0:
raise ValueError("a nums like array must have an even number of elements")
if page_index_to < key:
raise ValueError("page_index_to must be greater or equal than key")
i = nums.index(key) + 2
while i < len(nums) and nums[i] <= page_index_to:
nums.pop(i)
nums.pop(i)
def nums_next(
key: NumberObject,
nums: ArrayObject,
) -> Tuple[Optional[NumberObject], Optional[DictionaryObject]]:
"""
Return the (key, value) pair of the entry after the given one.
See 7.9.7 "Number Trees".
Args:
key: number key of the entry
nums: Nums array
"""
if len(nums) % 2 != 0:
raise ValueError("a nums like array must have an even number of elements")
i = nums.index(key) + 2
if i < len(nums):
return (nums[i], nums[i + 1])
else:
return (None, None)