ProyectoServicioArmamento/venv/Lib/site-packages/xhtml2pdf/parser.py

# Copyright 2010 Dirk Holtwick, holtwick.it
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations

import copy
import logging
import re
import xml.dom.minidom
from xml.dom import Node

import html5lib
from html5lib import treebuilders
from reportlab.platypus.doctemplate import FrameBreak, NextPageTemplate
from reportlab.platypus.flowables import KeepInFrame, PageBreak

from xhtml2pdf.default import (
    BOOL,
    BOX,
    COLOR,
    FILE,
    FONT,
    INT,
    MUST,
    POS,
    SIZE,
    STRING,
    TAGS,
)
from xhtml2pdf.files import pisaTempFile

# TODO: Why do we need to import these Tags here? They aren't uses in this file or any other file,
#  but if we don't import them, the tests fail. Very strange (fbernhart)
from xhtml2pdf.tables import (  # noqa: F401
    TableData,
    pisaTagTABLE,
    pisaTagTD,
    pisaTagTH,
    pisaTagTR,
)
from xhtml2pdf.tags import (  # noqa: F401
    pisaTag,
    pisaTagA,
    pisaTagBODY,
    pisaTagBR,
    pisaTagCANVAS,
    pisaTagDIV,
    pisaTagFONT,
    pisaTagH1,
    pisaTagH2,
    pisaTagH3,
    pisaTagH4,
    pisaTagH5,
    pisaTagH6,
    pisaTagHR,
    pisaTagIMG,
    pisaTagINPUT,
    pisaTagLI,
    pisaTagMETA,
    pisaTagOL,
    pisaTagP,
    pisaTagPDFBARCODE,
    pisaTagPDFFONT,
    pisaTagPDFFRAME,
    pisaTagPDFLANGUAGE,
    pisaTagPDFNEXTFRAME,
    pisaTagPDFNEXTPAGE,
    pisaTagPDFNEXTTEMPLATE,
    pisaTagPDFPAGECOUNT,
    pisaTagPDFPAGENUMBER,
    pisaTagPDFSPACER,
    pisaTagPDFTEMPLATE,
    pisaTagPDFTOC,
    pisaTagSTYLE,
    pisaTagSUB,
    pisaTagSUP,
    pisaTagTEXTAREA,
    pisaTagTITLE,
    pisaTagUL,
)
from xhtml2pdf.util import (
    getAlign,
    getBool,
    getBox,
    getColor,
    getPos,
    getSize,
    toList,
    transform_attrs,
)
from xhtml2pdf.w3c import cssDOMElementInterface
from xhtml2pdf.xhtml2pdf_reportlab import PmlLeftPageBreak, PmlRightPageBreak

log = logging.getLogger(__name__)

CSSAttrCache: dict[str, dict] = {}

rxhttpstrip = re.compile("https?://[^/]+(.*)", re.M | re.I)


class AttrContainer(dict):
    def __getattr__(self, name):
        try:
            return dict.__getattr__(self, name)
        except Exception:
            return self[name]


def pisaGetAttributes(c, tag, attributes):
    attrs = {}
    if attributes:
        for k, v in attributes.items():
            try:
                # XXX no Unicode! Reportlab fails with template names
                attrs[str(k)] = str(v)
            except Exception as e:  # noqa: PERF203
                log.debug(
                    "%s during string conversion for %s=%s", e, k, v, exc_info=True
                )
                attrs[k] = v

    nattrs = {}
    if tag in TAGS:
        block, adef = TAGS[tag]
        adef["id"] = STRING

        for k, v in adef.items():
            nattrs[k] = None
            # print k, v
            # defaults, wenn vorhanden
            if isinstance(v, tuple):
                if v[1] == MUST and k not in attrs:
                    log.warning(c.warning("Attribute '%s' must be set!", k))
                    nattrs[k] = None
                    continue
                nv = attrs.get(k, v[1])
                dfl = v[1]
                v = v[0]
            else:
                nv = attrs.get(k, None)
                dfl = None

            if nv is not None:
                if isinstance(v, list):
                    nv = nv.strip().lower()
                    if nv not in v:
                        # ~ raise PML_EXCEPTION, "attribute '%s' of wrong value, allowed is one of: %s" % (k, repr(v))
                        log.warning(
                            c.warning(
                                "Attribute '%s' of wrong value, allowed is one of: %s",
                                k,
                                repr(v),
                            )
                        )
                        nv = dfl

                elif v == BOOL:
                    nv = nv.strip().lower()
                    nv = nv in ("1", "y", "yes", "true", str(k))

                elif v == SIZE:
                    try:
                        nv = getSize(nv)
                    except Exception:
                        log.warning(c.warning("Attribute '%s' expects a size value", k))

                elif v == BOX:
                    nv = getBox(nv, c.pageSize)

                elif v == POS:
                    nv = getPos(nv, c.pageSize)

                elif v == INT:
                    nv = int(nv)

                elif v == COLOR:
                    nv = getColor(nv)

                elif v == FILE:
                    nv = c.getFile(nv)

                elif v == FONT:
                    nv = c.getFontName(nv)

                nattrs[k] = nv

    return AttrContainer(nattrs)


attrNames = """
    color
    font-family
    font-size
    font-weight
    font-style
    text-decoration
    line-height
    letter-spacing
    background-color
    display
    margin-left
    margin-right
    margin-top
    margin-bottom
    padding-left
    padding-right
    padding-top
    padding-bottom
    border-top-color
    border-top-style
    border-top-width
    border-bottom-color
    border-bottom-style
    border-bottom-width
    border-left-color
    border-left-style
    border-left-width
    border-right-color
    border-right-style
    border-right-width
    text-align
    vertical-align
    width
    height
    zoom
    page-break-after
    page-break-before
    list-style-type
    list-style-image
    white-space
    text-indent
    -pdf-page-break
    -pdf-frame-break
    -pdf-next-page
    -pdf-keep-with-next
    -pdf-outline
    -pdf-outline-level
    -pdf-outline-open
    -pdf-line-spacing
    -pdf-keep-in-frame-mode
    -pdf-word-wrap
    """.strip().split()


def getCSSAttr(self, cssCascade, attrName, default=NotImplemented):
    if attrName in self.cssAttrs:
        return self.cssAttrs[attrName]

    try:
        result = cssCascade.findStyleFor(self.cssElement, attrName, default)
    except LookupError:
        result = None

    # XXX Workaround for inline styles
    try:
        style = self.cssStyle
    except Exception:
        style = self.cssStyle = cssCascade.parser.parseInline(
            self.cssElement.getStyleAttr() or ""
        )[0]
    if attrName in style:
        result = style[attrName]

    if result == "inherit":
        if hasattr(self.parentNode, "getCSSAttr"):
            result = self.parentNode.getCSSAttr(cssCascade, attrName, default)
        elif default is not NotImplemented:
            return default
        msg = f"Could not find inherited CSS attribute value for '{attrName}'"
        raise LookupError(msg)

    if result is not None:
        self.cssAttrs[attrName] = result
    return result


# TODO: Monkeypatching standard lib should go away.
xml.dom.minidom.Element.getCSSAttr = getCSSAttr  # type: ignore[attr-defined]

# Create an aliasing system.  Many sources use non-standard tags, because browsers allow
# them to.  This allows us to map a nonstandard name to the standard one.
nonStandardAttrNames = {"bgcolor": "background-color"}


def mapNonStandardAttrs(c, _node, attrList):
    for attr in nonStandardAttrNames:
        if attr in attrList and nonStandardAttrNames[attr] not in c:
            c[nonStandardAttrNames[attr]] = attrList[attr]
    return c


def getCSSAttrCacheKey(node):
    _cl = _id = _st = ""
    for k, v in node.attributes.items():
        if k == "class":
            _cl = v
        elif k == "id":
            _id = v
        elif k == "style":
            _st = v
    return f"{id(node.parentNode)}#{node.tagName.lower()}#{_cl}#{_id}#{_st}"


def CSSCollect(node, c):
    # node.cssAttrs = {}
    # return node.cssAttrs

    if c.css:
        key = getCSSAttrCacheKey(node)

        if (
            hasattr(node.parentNode, "tagName")
            and node.parentNode.tagName.lower() != "html"
        ):
            CachedCSSAttr = CSSAttrCache.get(key, None)
            if CachedCSSAttr is not None:
                node.cssAttrs = CachedCSSAttr
                return CachedCSSAttr

        node.cssElement = cssDOMElementInterface.CSSDOMElementInterface(node)
        node.cssAttrs = {}
        # node.cssElement.onCSSParserVisit(c.cssCascade.parser)
        cssAttrMap = {}
        for cssAttrName in attrNames:
            try:
                cssAttrMap[cssAttrName] = node.getCSSAttr(c.cssCascade, cssAttrName)
            # except LookupError:
            #    pass
            except Exception as e:  # noqa: PERF203
                log.debug("%r during CSS attr '%s'", e, cssAttrName, exc_info=True)

        CSSAttrCache[key] = node.cssAttrs
    return node.cssAttrs


def lower(sequence):
    if isinstance(sequence, str):
        return sequence.lower()
    return sequence[0].lower()


def CSS2Frag(c, kw, isBlock):
    # COLORS
    if "color" in c.cssAttr:
        c.frag.textColor = getColor(c.cssAttr["color"], "#000000")
    if "background-color" in c.cssAttr:
        c.frag.backColor = getColor(c.cssAttr["background-color"], "#ffffff")
        # FONT SIZE, STYLE, WEIGHT
    if "font-family" in c.cssAttr:
        c.frag.fontName = c.getFontName(c.cssAttr["font-family"])
    if "font-size" in c.cssAttr:
        # XXX inherit
        c.frag.fontSize = max(
            getSize("".join(c.cssAttr["font-size"]), c.frag.fontSize, c.baseFontSize),
            1.0,
        )
    if "line-height" in c.cssAttr:
        leading = "".join(c.cssAttr["line-height"])
        c.frag.leading = getSize(leading, c.frag.fontSize)
        c.frag.leadingSource = leading
    else:
        c.frag.leading = getSize(c.frag.leadingSource, c.frag.fontSize)
    if "letter-spacing" in c.cssAttr:
        c.frag.letterSpacing = c.cssAttr["letter-spacing"]
    if "-pdf-line-spacing" in c.cssAttr:
        c.frag.leadingSpace = getSize("".join(c.cssAttr["-pdf-line-spacing"]))
        # print "line-spacing", c.cssAttr["-pdf-line-spacing"], c.frag.leading
    if "font-weight" in c.cssAttr:
        value = lower(c.cssAttr["font-weight"])
        if value in ("bold", "bolder", "500", "600", "700", "800", "900"):
            c.frag.bold = 1
        else:
            c.frag.bold = 0
    for value in toList(c.cssAttr.get("text-decoration", "")):
        if "underline" in value:
            c.frag.underline = 1
        if "line-through" in value:
            c.frag.strike = 1
        if "none" in value:
            c.frag.underline = 0
            c.frag.strike = 0
    if "font-style" in c.cssAttr:
        value = lower(c.cssAttr["font-style"])
        if value in ("italic", "oblique"):
            c.frag.italic = 1
        else:
            c.frag.italic = 0
    if "white-space" in c.cssAttr:
        # normal | pre | nowrap
        c.frag.whiteSpace = str(c.cssAttr["white-space"]).lower()
        # ALIGN & VALIGN
    if "text-align" in c.cssAttr:
        c.frag.alignment = getAlign(c.cssAttr["text-align"])
    if "vertical-align" in c.cssAttr:
        c.frag.vAlign = c.cssAttr["vertical-align"]
        # HEIGHT & WIDTH
    if "height" in c.cssAttr:
        try:
            # XXX Relative is not correct!
            c.frag.height = "".join(toList(c.cssAttr["height"]))
        except TypeError:
            # sequence item 0: expected string, tuple found
            c.frag.height = "".join(toList(c.cssAttr["height"][0]))
        if c.frag.height in ("auto",):
            c.frag.height = None
    if "width" in c.cssAttr:
        try:
            # XXX Relative is not correct!
            c.frag.width = "".join(toList(c.cssAttr["width"]))
        except TypeError:
            c.frag.width = "".join(toList(c.cssAttr["width"][0]))
        if c.frag.width in ("auto",):
            c.frag.width = None
        # ZOOM
    if "zoom" in c.cssAttr:
        # XXX Relative is not correct!
        zoom = "".join(toList(c.cssAttr["zoom"]))
        if zoom.endswith("%"):
            zoom = float(zoom[:-1]) / 100.0
        c.frag.zoom = float(zoom)
        # MARGINS & LIST INDENT, STYLE
    if isBlock:
        transform_attrs(
            c.frag,
            (
                ("spaceBefore", "margin-top"),
                ("spaceAfter", "margin-bottom"),
                ("firstLineIndent", "text-indent"),
            ),
            c.cssAttr,
            getSize,
            extras=c.frag.fontSize,
        )

        if "margin-left" in c.cssAttr:
            c.frag.bulletIndent = kw["margin-left"]  # For lists
            kw["margin-left"] += getSize(c.cssAttr["margin-left"], c.frag.fontSize)
            c.frag.leftIndent = kw["margin-left"]
        if "margin-right" in c.cssAttr:
            kw["margin-right"] += getSize(c.cssAttr["margin-right"], c.frag.fontSize)
            c.frag.rightIndent = kw["margin-right"]

        if "list-style-type" in c.cssAttr:
            c.frag.listStyleType = str(c.cssAttr["list-style-type"]).lower()
        if "list-style-image" in c.cssAttr:
            c.frag.listStyleImage = c.getFile(c.cssAttr["list-style-image"])
        # PADDINGS
    if isBlock:
        transform_attrs(
            c.frag,
            (
                ("paddingTop", "padding-top"),
                ("paddingBottom", "padding-bottom"),
                ("paddingLeft", "padding-left"),
                ("paddingRight", "padding-right"),
            ),
            c.cssAttr,
            getSize,
            extras=c.frag.fontSize,
        )

        # BORDERS
    if isBlock:
        transform_attrs(
            c.frag,
            (
                ("borderTopWidth", "border-top-width"),
                ("borderBottomWidth", "border-bottom-width"),
                ("borderLeftWidth", "border-left-width"),
                ("borderRightWidth", "border-right-width"),
            ),
            c.cssAttr,
            getSize,
            extras=c.frag.fontSize,
        )
        transform_attrs(
            c.frag,
            (
                ("borderTopStyle", "border-top-style"),
                ("borderBottomStyle", "border-bottom-style"),
                ("borderLeftStyle", "border-left-style"),
                ("borderRightStyle", "border-right-style"),
            ),
            c.cssAttr,
            lambda x: x,
        )

        transform_attrs(
            c.frag,
            (
                ("borderTopColor", "border-top-color"),
                ("borderBottomColor", "border-bottom-color"),
                ("borderLeftColor", "border-left-color"),
                ("borderRightColor", "border-right-color"),
            ),
            c.cssAttr,
            getColor,
        )


def pisaPreLoop(node, context, *, collect=False):
    """Collect all CSS definitions."""
    data = ""
    if node.nodeType == Node.TEXT_NODE and collect:
        data = node.data

    elif node.nodeType == Node.ELEMENT_NODE:
        name = node.tagName.lower()

        if name in ("style", "link"):
            attr = pisaGetAttributes(context, name, node.attributes)
            media = [x.strip() for x in attr.media.lower().split(",") if x.strip()]

            if attr.get("type", "").lower() in ("", "text/css") and (
                not media or "all" in media or "print" in media or "pdf" in media
            ):
                if name == "style":
                    for node in node.childNodes:
                        data += pisaPreLoop(node, context, collect=True)
                    context.addCSS(data)
                    return ""

                if name == "link" and attr.href and attr.rel.lower() == "stylesheet":
                    # print "CSS LINK", attr
                    context.addCSS(
                        '\n@import "{}" {};'.format(attr.href, ",".join(media))
                    )

    for node in node.childNodes:
        result = pisaPreLoop(node, context, collect=collect)
        if collect:
            data += result

    return data


def pisaLoop(node, context, path=None, **kw):
    if path is None:
        path = []

    # Initialize KW
    if not kw:
        kw = {"margin-top": 0, "margin-bottom": 0, "margin-left": 0, "margin-right": 0}
    else:
        kw = copy.copy(kw)

    # indent = len(path) * "  " # only used for debug print statements

    # TEXT
    if node.nodeType == Node.TEXT_NODE:
        # print indent, "#", repr(node.data) #, context.frag
        context.addFrag(node.data)
        # context.text.append(node.value)

    # ELEMENT
    elif node.nodeType == Node.ELEMENT_NODE:
        node.tagName = node.tagName.replace(":", "").lower()

        if node.tagName in ("style", "script"):
            return

        path = [*copy.copy(path), node.tagName]

        # Prepare attributes
        attr = pisaGetAttributes(context, node.tagName, node.attributes)
        # log.debug(indent + "<%s %s>" % (node.tagName, attr) +
        # repr(node.attributes.items())) #, path

        # Calculate styles
        context.cssAttr = CSSCollect(node, context)
        context.cssAttr = mapNonStandardAttrs(context.cssAttr, node, attr)
        context.node = node

        # Block?
        PAGE_BREAK = 1
        PAGE_BREAK_RIGHT = 2
        PAGE_BREAK_LEFT = 3

        pageBreakAfter = False
        frameBreakAfter = False
        display = lower(context.cssAttr.get("display", "inline"))
        # print indent, node.tagName, display,
        # context.cssAttr.get("background-color", None), attr
        isBlock = display == "block"

        if isBlock:
            context.addPara()

            # Page break by CSS
            if "-pdf-next-page" in context.cssAttr:
                context.addStory(
                    NextPageTemplate(str(context.cssAttr["-pdf-next-page"]))
                )
            if (
                "-pdf-page-break" in context.cssAttr
                and str(context.cssAttr["-pdf-page-break"]).lower() == "before"
            ):
                context.addStory(PageBreak())
            if "-pdf-frame-break" in context.cssAttr:
                if str(context.cssAttr["-pdf-frame-break"]).lower() == "before":
                    context.addStory(FrameBreak())
                if str(context.cssAttr["-pdf-frame-break"]).lower() == "after":
                    frameBreakAfter = True
            if "page-break-before" in context.cssAttr:
                if str(context.cssAttr["page-break-before"]).lower() == "always":
                    context.addStory(PageBreak())
                if str(context.cssAttr["page-break-before"]).lower() == "right":
                    context.addStory(PageBreak())
                    context.addStory(PmlRightPageBreak())
                if str(context.cssAttr["page-break-before"]).lower() == "left":
                    context.addStory(PageBreak())
                    context.addStory(PmlLeftPageBreak())
            if "page-break-after" in context.cssAttr:
                if str(context.cssAttr["page-break-after"]).lower() == "always":
                    pageBreakAfter = PAGE_BREAK
                if str(context.cssAttr["page-break-after"]).lower() == "right":
                    pageBreakAfter = PAGE_BREAK_RIGHT
                if str(context.cssAttr["page-break-after"]).lower() == "left":
                    pageBreakAfter = PAGE_BREAK_LEFT

        if display == "none":
            # print "none!"
            return

        # Translate CSS to frags

        # Save previous frag styles
        context.pushFrag()

        # Map styles to Reportlab fragment properties
        CSS2Frag(context, kw, isBlock=isBlock)

        # EXTRAS
        transform_attrs(
            context.frag,
            (
                ("keepWithNext", "-pdf-keep-with-next"),
                ("outline", "-pdf-outline"),
                # ("borderLeftColor", "-pdf-outline-open"),
            ),
            context.cssAttr,
            getBool,
        )

        if "-pdf-outline-level" in context.cssAttr:
            context.frag.outlineLevel = int(context.cssAttr["-pdf-outline-level"])

        if "-pdf-word-wrap" in context.cssAttr:
            context.frag.wordWrap = context.cssAttr["-pdf-word-wrap"]

        # handle keep-in-frame
        keepInFrameMode = None
        keepInFrameMaxWidth = 0
        keepInFrameMaxHeight = 0
        if "-pdf-keep-in-frame-mode" in context.cssAttr:
            value = str(context.cssAttr["-pdf-keep-in-frame-mode"]).strip().lower()
            if value in ("shrink", "error", "overflow", "truncate"):
                keepInFrameMode = value
            else:
                keepInFrameMode = "shrink"
            # Added because we need a default value.

        if "-pdf-keep-in-frame-max-width" in context.cssAttr:
            keepInFrameMaxWidth = getSize(
                "".join(context.cssAttr["-pdf-keep-in-frame-max-width"])
            )
        if "-pdf-keep-in-frame-max-height" in context.cssAttr:
            keepInFrameMaxHeight = getSize(
                "".join(context.cssAttr["-pdf-keep-in-frame-max-height"])
            )

        # ignore nested keep-in-frames, tables have their own KIF handling
        keepInFrame = keepInFrameMode is not None and context.keepInFrameIndex is None
        if keepInFrame:
            # keep track of current story index, so we can wrap everythink
            # added after this point in a KeepInFrame
            context.keepInFrameIndex = len(context.story)

        # BEGIN tag
        klass = globals().get("pisaTag%s" % node.tagName.replace(":", "").upper(), None)
        obj = None

        # Static block
        elementId = attr.get("id", None)
        staticFrame = context.frameStatic.get(elementId, None)
        if staticFrame:
            context.frag.insideStaticFrame += 1
            oldStory = context.swapStory()

        # Tag specific operations
        if klass is not None:
            obj = klass(node, attr)
            obj.start(context)

        # Visit child nodes
        context.fragBlock = fragBlock = copy.copy(context.frag)
        for nnode in node.childNodes:
            pisaLoop(nnode, context, path, **kw)
        context.fragBlock = fragBlock

        # END tag
        if obj:
            obj.end(context)

        # Block?
        if isBlock:
            context.addPara()

            # XXX Buggy!

            # Page break by CSS
            if pageBreakAfter:
                context.addStory(PageBreak())
                if pageBreakAfter == PAGE_BREAK_RIGHT:
                    context.addStory(PmlRightPageBreak())
                if pageBreakAfter == PAGE_BREAK_LEFT:
                    context.addStory(PmlLeftPageBreak())
            if frameBreakAfter:
                context.addStory(FrameBreak())

        if keepInFrame:
            # get all content added after start of -pdf-keep-in-frame and wrap
            # it in a KeepInFrame
            substory = context.story[context.keepInFrameIndex :]
            context.story = context.story[: context.keepInFrameIndex]
            context.story.append(
                KeepInFrame(
                    content=substory,
                    maxWidth=keepInFrameMaxWidth,
                    maxHeight=keepInFrameMaxHeight,
                    mode=keepInFrameMode,
                )
            )
            # mode wasn't being used; it is necessary for tables or images at
            # end of page.
            context.keepInFrameIndex = None

        # Static block, END
        if staticFrame:
            context.addPara()
            for frame in staticFrame:
                frame.pisaStaticStory = context.story
            context.swapStory(oldStory)
            context.frag.insideStaticFrame -= 1

        # context.debug(1, indent, "</%s>" % (node.tagName))

        # Reset frag style
        context.pullFrag()

    # Unknown or not handled
    else:
        # context.debug(1, indent, "???", node, node.nodeType, repr(node))
        # Loop over children
        for node in node.childNodes:
            pisaLoop(node, context, path, **kw)


def pisaParser(
    src,
    context,
    default_css="",
    xhtml=False,  # noqa: FBT002
    encoding="utf8",
    xml_output=None,
):
    """
    - Parse HTML and get miniDOM
    - Extract CSS informations, add default CSS, parse CSS
    - Handle the document DOM itself and build reportlab story
    - Return Context object.
    """
    global CSSAttrCache  # noqa: PLW0603
    CSSAttrCache = {}

    if xhtml:
        log.warning("xhtml parameter will be removed on next release 0.2.8")
        # TODO: XHTMLParser doesn't seem to exist...
        parser = html5lib.XHTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    else:
        parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    parser_kwargs = {}
    if isinstance(src, str):
        # If an encoding was provided, do not change it.
        if not encoding:
            encoding = "utf-8"
        src = src.encode(encoding)
        src = pisaTempFile(src, capacity=context.capacity)
        # To pass the encoding used to convert the text_type src to binary_type
        # on to html5lib's parser to ensure proper decoding
        parser_kwargs["transport_encoding"] = encoding

    # # Test for the restrictions of html5lib
    # if encoding:
    #     # Workaround for html5lib<0.11.1
    #     if hasattr(inputstream, "isValidEncoding"):
    #         if encoding.strip().lower() == "utf8":
    #             encoding = "utf-8"
    #         if not inputstream.isValidEncoding(encoding):
    #             log.error("%r is not a valid encoding e.g. 'utf8' is not valid but 'utf-8' is!", encoding)
    #     else:
    #         if inputstream.codecName(encoding) is None:
    #             log.error("%r is not a valid encoding", encoding)
    document = parser.parse(src, **parser_kwargs)  # encoding=encoding)

    if xml_output:
        xml_output.write(document.toprettyxml(encoding=encoding))

    if default_css:
        context.addDefaultCSS(default_css)

    pisaPreLoop(document, context)
    context.parseCSS()
    pisaLoop(document, context)
    return context


# Shortcuts

HTML2PDF = pisaParser


def XHTML2PDF(*a, **kw):
    kw["xhtml"] = True
    return HTML2PDF(*a, **kw)


XML2PDF = XHTML2PDF