257 lines
7.0 KiB
Python
257 lines
7.0 KiB
Python
"""
|
||
Page labels are shown by PDF viewers as "the page number".
|
||
|
||
A page has a numeric index, starting with 0. Additionally to that, the page
|
||
has a label. In the most simple case:
|
||
label = index + 1
|
||
|
||
However, the title page and the table of contents might have roman numerals as
|
||
page label. This makes things more complicated.
|
||
|
||
Example 1
|
||
---------
|
||
|
||
>>> reader.trailer["/Root"]["/PageLabels"]["/Nums"]
|
||
[0, IndirectObject(18, 0, 139929798197504),
|
||
8, IndirectObject(19, 0, 139929798197504)]
|
||
>>> reader.get_object(reader.trailer["/Root"]["/PageLabels"]["/Nums"][1])
|
||
{'/S': '/r'}
|
||
>>> reader.get_object(reader.trailer["/Root"]["/PageLabels"]["/Nums"][3])
|
||
{'/S': '/D'}
|
||
|
||
Example 2
|
||
---------
|
||
The following example shows a document with pages labeled
|
||
i, ii, iii, iv, 1, 2, 3, A-8, A-9, ...
|
||
|
||
1 0 obj
|
||
<< /Type /Catalog
|
||
/PageLabels << /Nums [
|
||
0 << /S /r >>
|
||
4 << /S /D >>
|
||
7 << /S /D
|
||
/P ( A- )
|
||
/St 8
|
||
>>
|
||
% A number tree containing
|
||
% three page label dictionaries
|
||
]
|
||
>>
|
||
...
|
||
>>
|
||
endobj
|
||
|
||
|
||
PDF Specification 1.7
|
||
=====================
|
||
|
||
Table 159 – Entries in a page label dictionary
|
||
----------------------------------------------
|
||
The S-key:
|
||
D Decimal arabic numerals
|
||
R Uppercase roman numerals
|
||
r Lowercase roman numerals
|
||
A Uppercase letters (A to Z for the first 26 pages,
|
||
AA to ZZ for the next 26, and so on)
|
||
a Lowercase letters (a to z for the first 26 pages,
|
||
aa to zz for the next 26, and so on)
|
||
"""
|
||
|
||
from typing import Iterator, Optional, Tuple
|
||
|
||
from ._protocols import PdfReaderProtocol
|
||
from ._utils import logger_warning
|
||
from .generic import ArrayObject, DictionaryObject, NumberObject
|
||
|
||
|
||
def number2uppercase_roman_numeral(num: int) -> str:
|
||
roman = [
|
||
(1000, "M"),
|
||
(900, "CM"),
|
||
(500, "D"),
|
||
(400, "CD"),
|
||
(100, "C"),
|
||
(90, "XC"),
|
||
(50, "L"),
|
||
(40, "XL"),
|
||
(10, "X"),
|
||
(9, "IX"),
|
||
(5, "V"),
|
||
(4, "IV"),
|
||
(1, "I"),
|
||
]
|
||
|
||
def roman_num(num: int) -> Iterator[str]:
|
||
for decimal, roman_repr in roman:
|
||
x, _ = divmod(num, decimal)
|
||
yield roman_repr * x
|
||
num -= decimal * x
|
||
if num <= 0:
|
||
break
|
||
|
||
return "".join(list(roman_num(num)))
|
||
|
||
|
||
def number2lowercase_roman_numeral(number: int) -> str:
|
||
return number2uppercase_roman_numeral(number).lower()
|
||
|
||
|
||
def number2uppercase_letter(number: int) -> str:
|
||
if number <= 0:
|
||
raise ValueError("Expecting a positive number")
|
||
alphabet = [chr(i) for i in range(ord("A"), ord("Z") + 1)]
|
||
rep = ""
|
||
while number > 0:
|
||
remainder = number % 26
|
||
if remainder == 0:
|
||
remainder = 26
|
||
rep = alphabet[remainder - 1] + rep
|
||
# update
|
||
number -= remainder
|
||
number = number // 26
|
||
return rep
|
||
|
||
|
||
def number2lowercase_letter(number: int) -> str:
|
||
return number2uppercase_letter(number).lower()
|
||
|
||
|
||
def index2label(reader: PdfReaderProtocol, index: int) -> str:
|
||
"""
|
||
See 7.9.7 "Number Trees".
|
||
|
||
Args:
|
||
reader: The PdfReader
|
||
index: The index of the page
|
||
|
||
Returns:
|
||
The label of the page, e.g. "iv" or "4".
|
||
"""
|
||
root = reader.trailer["/Root"]
|
||
if "/PageLabels" not in root:
|
||
return str(index + 1) # Fallback
|
||
number_tree = root["/PageLabels"]
|
||
if "/Nums" in number_tree:
|
||
# [Nums] shall be an array of the form
|
||
# [ key 1 value 1 key 2 value 2 ... key n value n ]
|
||
# where each key_i is an integer and the corresponding
|
||
# value_i shall be the object associated with that key.
|
||
# The keys shall be sorted in numerical order,
|
||
# analogously to the arrangement of keys in a name tree
|
||
# as described in 7.9.6, "Name Trees."
|
||
nums = number_tree["/Nums"]
|
||
i = 0
|
||
value = None
|
||
start_index = 0
|
||
while i < len(nums):
|
||
start_index = nums[i]
|
||
value = nums[i + 1].get_object()
|
||
if i + 2 == len(nums):
|
||
break
|
||
if nums[i + 2] > index:
|
||
break
|
||
i += 2
|
||
m = {
|
||
None: lambda n: "",
|
||
"/D": lambda n: str(n),
|
||
"/R": number2uppercase_roman_numeral,
|
||
"/r": number2lowercase_roman_numeral,
|
||
"/A": number2uppercase_letter,
|
||
"/a": number2lowercase_letter,
|
||
}
|
||
# if /Nums array is not following the specification or if /Nums is empty
|
||
if not isinstance(value, dict):
|
||
return str(index + 1) # Fallback
|
||
start = value.get("/St", 1)
|
||
prefix = value.get("/P", "")
|
||
return prefix + m[value.get("/S")](index - start_index + start)
|
||
if "/Kids" in number_tree or "/Limits" in number_tree:
|
||
logger_warning(
|
||
(
|
||
"/Kids or /Limits found in PageLabels. "
|
||
"Please share this PDF with pypdf: "
|
||
"https://github.com/py-pdf/pypdf/pull/1519"
|
||
),
|
||
__name__,
|
||
)
|
||
# TODO: Implement /Kids and /Limits for number tree
|
||
return str(index + 1) # Fallback if /Nums is not in the number_tree
|
||
|
||
|
||
def nums_insert(
|
||
key: NumberObject,
|
||
value: DictionaryObject,
|
||
nums: ArrayObject,
|
||
) -> None:
|
||
"""
|
||
Insert a key, value pair in a Nums array.
|
||
|
||
See 7.9.7 "Number Trees".
|
||
|
||
Args:
|
||
key: number key of the entry
|
||
value: value of the entry
|
||
nums: Nums array to modify
|
||
"""
|
||
if len(nums) % 2 != 0:
|
||
raise ValueError("a nums like array must have an even number of elements")
|
||
|
||
i = len(nums)
|
||
while i != 0 and key <= nums[i - 2]:
|
||
i = i - 2
|
||
|
||
if i < len(nums) and key == nums[i]:
|
||
nums[i + 1] = value
|
||
else:
|
||
nums.insert(i, key)
|
||
nums.insert(i + 1, value)
|
||
|
||
|
||
def nums_clear_range(
|
||
key: NumberObject,
|
||
page_index_to: int,
|
||
nums: ArrayObject,
|
||
) -> None:
|
||
"""
|
||
Remove all entries in a number tree in a range after an entry.
|
||
|
||
See 7.9.7 "Number Trees".
|
||
|
||
Args:
|
||
key: number key of the entry before the range
|
||
page_index_to: The page index of the upper limit of the range
|
||
nums: Nums array to modify
|
||
"""
|
||
if len(nums) % 2 != 0:
|
||
raise ValueError("a nums like array must have an even number of elements")
|
||
if page_index_to < key:
|
||
raise ValueError("page_index_to must be greater or equal than key")
|
||
|
||
i = nums.index(key) + 2
|
||
while i < len(nums) and nums[i] <= page_index_to:
|
||
nums.pop(i)
|
||
nums.pop(i)
|
||
|
||
|
||
def nums_next(
|
||
key: NumberObject,
|
||
nums: ArrayObject,
|
||
) -> Tuple[Optional[NumberObject], Optional[DictionaryObject]]:
|
||
"""
|
||
Return the (key, value) pair of the entry after the given one.
|
||
|
||
See 7.9.7 "Number Trees".
|
||
|
||
Args:
|
||
key: number key of the entry
|
||
nums: Nums array
|
||
"""
|
||
if len(nums) % 2 != 0:
|
||
raise ValueError("a nums like array must have an even number of elements")
|
||
|
||
i = nums.index(key) + 2
|
||
if i < len(nums):
|
||
return (nums[i], nums[i + 1])
|
||
else:
|
||
return (None, None)
|