# Copyright 2010 Dirk Holtwick, holtwick.it
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations

import copy
import logging
import re
import xml.dom.minidom
from xml.dom import Node

import html5lib
from html5lib import treebuilders
from reportlab.platypus.doctemplate import FrameBreak, NextPageTemplate
from reportlab.platypus.flowables import KeepInFrame, PageBreak

from xhtml2pdf.default import (
    BOOL,
    BOX,
    COLOR,
    FILE,
    FONT,
    INT,
    MUST,
    POS,
    SIZE,
    STRING,
    TAGS,
)
from xhtml2pdf.files import pisaTempFile

# TODO: Why do we need to import these Tags here? They aren't used in this file or any other file,
# but if we don't import them, the tests fail. Very strange (fbernhart)
# NOTE: pisaLoop() looks tag handlers up by name in this module's globals()
# ("pisaTag" + upper-cased tag name), so the classes must be importable from
# this namespace even though nothing references them directly.
from xhtml2pdf.tables import (  # noqa: F401
    TableData,
    pisaTagTABLE,
    pisaTagTD,
    pisaTagTH,
    pisaTagTR,
)
from xhtml2pdf.tags import (  # noqa: F401
    pisaTag,
    pisaTagA,
    pisaTagBODY,
    pisaTagBR,
    pisaTagCANVAS,
    pisaTagDIV,
    pisaTagFONT,
    pisaTagH1,
    pisaTagH2,
    pisaTagH3,
    pisaTagH4,
    pisaTagH5,
    pisaTagH6,
    pisaTagHR,
    pisaTagIMG,
    pisaTagINPUT,
    pisaTagLI,
    pisaTagMETA,
    pisaTagOL,
    pisaTagP,
    pisaTagPDFBARCODE,
    pisaTagPDFFONT,
    pisaTagPDFFRAME,
    pisaTagPDFLANGUAGE,
    pisaTagPDFNEXTFRAME,
    pisaTagPDFNEXTPAGE,
    pisaTagPDFNEXTTEMPLATE,
    pisaTagPDFPAGECOUNT,
    pisaTagPDFPAGENUMBER,
    pisaTagPDFSPACER,
    pisaTagPDFTEMPLATE,
    pisaTagPDFTOC,
    pisaTagSTYLE,
    pisaTagSUB,
    pisaTagSUP,
    pisaTagTEXTAREA,
    pisaTagTITLE,
    pisaTagUL,
)
from xhtml2pdf.util import (
    getAlign,
    getBool,
    getBox,
    getColor,
    getPos,
    getSize,
    toList,
    transform_attrs,
)
from xhtml2pdf.w3c import cssDOMElementInterface
from xhtml2pdf.xhtml2pdf_reportlab import PmlLeftPageBreak, PmlRightPageBreak

log = logging.getLogger(__name__)

# Per-parse cache of collected CSS attribute dicts, keyed by the string built
# in getCSSAttrCacheKey(); pisaParser() resets it at the start of every run.
CSSAttrCache: dict[str, dict] = {}

rxhttpstrip = re.compile("https?://[^/]+(.*)", re.M | re.I)


class _MissingAttrError(AttributeError, KeyError):
    """Raised by AttrContainer when an attribute-style lookup finds no key.

    It is an AttributeError so that ``hasattr``, ``getattr(obj, name, default)``
    and ``copy.deepcopy`` behave normally, and also a KeyError so that callers
    which already catch KeyError around ``attr.name`` keep working.
    """


class AttrContainer(dict):
    """A dict whose keys can also be read as attributes (``a.href``)."""

    def __getattr__(self, name):
        # __getattr__ only runs after normal attribute lookup has failed, so
        # the only thing left to try is the mapping itself.
        try:
            return self[name]
        except KeyError:
            raise _MissingAttrError(name) from None


def _convertAttrValue(c, k, kind, nv, dfl):
    """Convert the raw attribute value ``nv`` according to its declared ``kind``.

    ``kind`` is either a list of allowed keywords or one of the type markers
    imported from xhtml2pdf.default. ``dfl`` is the declared default, used when
    a keyword attribute carries a value that is not in the allowed list.
    Kinds not handled below (e.g. STRING) are returned unchanged.
    """
    if isinstance(kind, list):
        nv = nv.strip().lower()
        if nv not in kind:
            log.warning(
                c.warning(
                    "Attribute '%s' of wrong value, allowed is one of: %s",
                    k,
                    repr(kind),
                )
            )
            nv = dfl
    elif kind == BOOL:
        nv = nv.strip().lower()
        # The attribute's own name counts as true, e.g. checked="checked".
        nv = nv in ("1", "y", "yes", "true", str(k))
    elif kind == SIZE:
        try:
            nv = getSize(nv)
        except Exception:
            # Best effort: warn and keep the unconverted value.
            log.warning(c.warning("Attribute '%s' expects a size value", k))
    elif kind == BOX:
        nv = getBox(nv, c.pageSize)
    elif kind == POS:
        nv = getPos(nv, c.pageSize)
    elif kind == INT:
        nv = int(nv)
    elif kind == COLOR:
        nv = getColor(nv)
    elif kind == FILE:
        nv = c.getFile(nv)
    elif kind == FONT:
        nv = c.getFontName(nv)
    return nv


def pisaGetAttributes(c, tag, attributes):
    """Build the typed attribute container for one element.

    :param c: context object; this function uses ``c.warning``, ``c.pageSize``,
        ``c.getFile`` and ``c.getFontName``.
    :param tag: lower-case tag name, looked up in ``TAGS``.
    :param attributes: the DOM attribute map of the element (may be empty/None).
    :return: an AttrContainer with one entry per attribute declared for ``tag``
        in ``TAGS`` (plus ``id``); undeclared tags give an empty container.
        Attributes that are absent and have no default are present with the
        value ``None``.
    """
    attrs = {}
    if attributes:
        for k, v in attributes.items():
            try:
                # XXX no Unicode! Reportlab fails with template names
                attrs[str(k)] = str(v)
            except Exception as e:  # noqa: PERF203
                log.debug(
                    "%s during string conversion for %s=%s", e, k, v, exc_info=True
                )
                attrs[k] = v

    nattrs = {}
    if tag in TAGS:
        _block, adef = TAGS[tag]
        # Every known tag accepts an id. NOTE: this writes into the shared
        # TAGS definition; kept as-is because the assignment is idempotent.
        adef["id"] = STRING

        for k, v in adef.items():
            nattrs[k] = None
            # Definitions are either a bare kind or a (kind, default) tuple.
            if isinstance(v, tuple):
                kind, dfl = v[0], v[1]
                if dfl == MUST and k not in attrs:
                    log.warning(c.warning("Attribute '%s' must be set!", k))
                    continue
                nv = attrs.get(k, dfl)
            else:
                kind, dfl = v, None
                nv = attrs.get(k, None)

            if nv is not None:
                nattrs[k] = _convertAttrValue(c, k, kind, nv, dfl)

    return AttrContainer(nattrs)


# CSS properties that CSSCollect() resolves for every element.
attrNames = """
    color
    font-family
    font-size
    font-weight
    font-style
    text-decoration
    line-height
    letter-spacing
    background-color
    display
    margin-left
    margin-right
    margin-top
    margin-bottom
    padding-left
    padding-right
    padding-top
    padding-bottom
    border-top-color
    border-top-style
    border-top-width
    border-bottom-color
    border-bottom-style
    border-bottom-width
    border-left-color
    border-left-style
    border-left-width
    border-right-color
    border-right-style
    border-right-width
    text-align
    vertical-align
    width
    height
    zoom
    page-break-after
    page-break-before
    list-style-type
    list-style-image
    white-space
    text-indent
    -pdf-page-break
    -pdf-frame-break
    -pdf-next-page
    -pdf-keep-with-next
    -pdf-outline
    -pdf-outline-level
    -pdf-outline-open
    -pdf-line-spacing
    -pdf-keep-in-frame-mode
    -pdf-word-wrap
    """.strip().split()


def getCSSAttr(self, cssCascade, attrName, default=NotImplemented):
    """Resolve the CSS property ``attrName`` for this DOM element.

    Installed on ``xml.dom.minidom.Element`` right below, so ``self`` is an
    element that already carries ``cssAttrs`` (a per-element result cache) and
    ``cssElement`` (both are set by CSSCollect()).

    Lookup order: the element's cache, then the cascade, then the element's
    inline ``style`` attribute (which overrides the cascade result). A value
    of ``"inherit"`` is resolved through the parent element.

    :return: the resolved value, or ``None`` when the cascade lookup failed
        and no inline value exists. Non-``None`` results are cached.
    :raises LookupError: for ``"inherit"`` when the parent cannot resolve CSS
        attributes and no ``default`` was supplied.
    """
    if attrName in self.cssAttrs:
        return self.cssAttrs[attrName]

    try:
        result = cssCascade.findStyleFor(self.cssElement, attrName, default)
    except LookupError:
        result = None

    # XXX Workaround for inline styles
    try:
        style = self.cssStyle
    except Exception:
        # First access on this element: parse the inline style once and keep it.
        style = self.cssStyle = cssCascade.parser.parseInline(
            self.cssElement.getStyleAttr() or ""
        )[0]

    if attrName in style:
        result = style[attrName]

    if result == "inherit":
        if hasattr(self.parentNode, "getCSSAttr"):
            result = self.parentNode.getCSSAttr(cssCascade, attrName, default)
        elif default is not NotImplemented:
            return default
        else:
            # Only fail when there is neither a parent to ask nor a default;
            # previously this raise also discarded a successful parent lookup.
            msg = f"Could not find inherited CSS attribute value for '{attrName}'"
            raise LookupError(msg)

    if result is not None:
        self.cssAttrs[attrName] = result
    return result


# TODO: Monkeypatching standard lib should go away.
xml.dom.minidom.Element.getCSSAttr = getCSSAttr  # type: ignore[attr-defined]

# Create an aliasing system. Many sources use non-standard tags, because browsers allow
# them to. This allows us to map a nonstandard name to the standard one.
nonStandardAttrNames = {"bgcolor": "background-color"}


def mapNonStandardAttrs(c, _node, attrList):
    """Copy aliased HTML attributes into the CSS attribute dict ``c``.

    For every alias in ``nonStandardAttrNames`` that is present in
    ``attrList``, the value is stored in ``c`` under the standard CSS name,
    unless ``c`` already has that name. ``c`` is modified in place and
    returned; ``_node`` is unused.
    """
    for alias, standard in nonStandardAttrNames.items():
        if alias in attrList and standard not in c:
            c[standard] = attrList[alias]
    return c


def getCSSAttrCacheKey(node):
    """Return the CSSAttrCache key for ``node``.

    The key combines the identity of the parent node, the lower-cased tag
    name and the element's ``class``, ``id`` and ``style`` attribute values
    (empty string when absent).
    """
    found = {"class": "", "id": "", "style": ""}
    for k, v in node.attributes.items():
        if k in found:
            found[k] = v
    return (
        f"{id(node.parentNode)}#{node.tagName.lower()}"
        f"#{found['class']}#{found['id']}#{found['style']}"
    )


def CSSCollect(node, c):
    """Resolve all properties listed in ``attrNames`` for ``node``.

    The result dict is stored on ``node.cssAttrs`` and returned. Results are
    shared through ``CSSAttrCache`` between elements that produce the same
    cache key, except for elements whose parent is ``<html>`` or has no
    ``tagName``.

    When ``c.css`` is falsy nothing is resolved and the node's existing
    ``cssAttrs`` (or a fresh empty dict) is returned.
    """
    if not c.css:
        # Without a stylesheet nothing initialises node.cssAttrs; make sure
        # the attribute exists instead of failing with AttributeError.
        if not hasattr(node, "cssAttrs"):
            node.cssAttrs = {}
        return node.cssAttrs

    key = getCSSAttrCacheKey(node)

    if (
        hasattr(node.parentNode, "tagName")
        and node.parentNode.tagName.lower() != "html"
    ):
        cached = CSSAttrCache.get(key, None)
        if cached is not None:
            node.cssAttrs = cached
            return cached

    node.cssElement = cssDOMElementInterface.CSSDOMElementInterface(node)
    node.cssAttrs = {}
    for cssAttrName in attrNames:
        try:
            # getCSSAttr() stores every non-None result in node.cssAttrs.
            node.getCSSAttr(c.cssCascade, cssAttrName)
        except Exception as e:  # noqa: PERF203
            # Best effort: a property that cannot be resolved is simply left out.
            log.debug("%r during CSS attr '%s'", e, cssAttrName, exc_info=True)

    CSSAttrCache[key] = node.cssAttrs
    return node.cssAttrs


def lower(sequence):
    """Lower-case ``sequence`` if it is a str, else its first element."""
    if isinstance(sequence, str):
        return sequence.lower()
    return sequence[0].lower()


def _cssKeyword(cssAttr, name):
    """Return ``str(cssAttr[name]).lower()``, or None when ``name`` is unset."""
    if name not in cssAttr:
        return None
    return str(cssAttr[name]).lower()


def _joinCssValue(value):
    """Join a CSS value (string or sequence of strings) into one string."""
    try:
        return "".join(toList(value))
    except TypeError:
        # sequence item 0: expected string, tuple found
        return "".join(toList(value[0]))


def CSS2Frag(c, kw, isBlock):
    """Map the collected CSS in ``c.cssAttr`` onto the fragment ``c.frag``.

    :param c: context providing ``cssAttr``, ``frag``, ``baseFontSize``,
        ``getFontName`` and ``getFile``.
    :param kw: running margin totals; ``margin-left`` / ``margin-right`` are
        increased in place for block elements.
    :param isBlock: when true, margins, list style, paddings and borders are
        applied in addition to the text properties.
    """
    css = c.cssAttr
    frag = c.frag

    # COLORS
    if "color" in css:
        frag.textColor = getColor(css["color"], "#000000")
    if "background-color" in css:
        frag.backColor = getColor(css["background-color"], "#ffffff")

    # FONT SIZE, STYLE, WEIGHT
    if "font-family" in css:
        frag.fontName = c.getFontName(css["font-family"])
    if "font-size" in css:
        # XXX inherit
        frag.fontSize = max(
            getSize("".join(css["font-size"]), frag.fontSize, c.baseFontSize),
            1.0,
        )
    if "line-height" in css:
        leading = "".join(css["line-height"])
        frag.leading = getSize(leading, frag.fontSize)
        frag.leadingSource = leading
    else:
        # Re-evaluate the remembered line-height against the current font size.
        frag.leading = getSize(frag.leadingSource, frag.fontSize)
    if "letter-spacing" in css:
        frag.letterSpacing = css["letter-spacing"]
    if "-pdf-line-spacing" in css:
        frag.leadingSpace = getSize("".join(css["-pdf-line-spacing"]))
    if "font-weight" in css:
        weight = lower(css["font-weight"])
        bold_values = ("bold", "bolder", "500", "600", "700", "800", "900")
        frag.bold = 1 if weight in bold_values else 0
    for value in toList(css.get("text-decoration", "")):
        if "underline" in value:
            frag.underline = 1
        if "line-through" in value:
            frag.strike = 1
        if "none" in value:
            frag.underline = 0
            frag.strike = 0
    if "font-style" in css:
        frag.italic = 1 if lower(css["font-style"]) in ("italic", "oblique") else 0
    if "white-space" in css:
        # normal | pre | nowrap
        frag.whiteSpace = str(css["white-space"]).lower()

    # ALIGN & VALIGN
    if "text-align" in css:
        frag.alignment = getAlign(css["text-align"])
    if "vertical-align" in css:
        frag.vAlign = css["vertical-align"]

    # HEIGHT & WIDTH
    if "height" in css:
        # XXX Relative is not correct!
        frag.height = _joinCssValue(css["height"])
        if frag.height in ("auto",):
            frag.height = None
    if "width" in css:
        # XXX Relative is not correct!
        frag.width = _joinCssValue(css["width"])
        if frag.width in ("auto",):
            frag.width = None

    # ZOOM
    if "zoom" in css:
        # XXX Relative is not correct!
        zoom = "".join(toList(css["zoom"]))
        if zoom.endswith("%"):
            zoom = float(zoom[:-1]) / 100.0
        frag.zoom = float(zoom)

    if not isBlock:
        return

    # MARGINS & LIST INDENT, STYLE
    transform_attrs(
        frag,
        (
            ("spaceBefore", "margin-top"),
            ("spaceAfter", "margin-bottom"),
            ("firstLineIndent", "text-indent"),
        ),
        css,
        getSize,
        extras=frag.fontSize,
    )
    if "margin-left" in css:
        frag.bulletIndent = kw["margin-left"]  # For lists
        kw["margin-left"] += getSize(css["margin-left"], frag.fontSize)
        frag.leftIndent = kw["margin-left"]
    if "margin-right" in css:
        kw["margin-right"] += getSize(css["margin-right"], frag.fontSize)
        frag.rightIndent = kw["margin-right"]
    if "list-style-type" in css:
        frag.listStyleType = str(css["list-style-type"]).lower()
    if "list-style-image" in css:
        frag.listStyleImage = c.getFile(css["list-style-image"])

    # PADDINGS
    transform_attrs(
        frag,
        (
            ("paddingTop", "padding-top"),
            ("paddingBottom", "padding-bottom"),
            ("paddingLeft", "padding-left"),
            ("paddingRight", "padding-right"),
        ),
        css,
        getSize,
        extras=frag.fontSize,
    )

    # BORDERS
    transform_attrs(
        frag,
        (
            ("borderTopWidth", "border-top-width"),
            ("borderBottomWidth", "border-bottom-width"),
            ("borderLeftWidth", "border-left-width"),
            ("borderRightWidth", "border-right-width"),
        ),
        css,
        getSize,
        extras=frag.fontSize,
    )
    transform_attrs(
        frag,
        (
            ("borderTopStyle", "border-top-style"),
            ("borderBottomStyle", "border-bottom-style"),
            ("borderLeftStyle", "border-left-style"),
            ("borderRightStyle", "border-right-style"),
        ),
        css,
        lambda x: x,
    )
    transform_attrs(
        frag,
        (
            ("borderTopColor", "border-top-color"),
            ("borderBottomColor", "border-bottom-color"),
            ("borderLeftColor", "border-left-color"),
            ("borderRightColor", "border-right-color"),
        ),
        css,
        getColor,
    )


def pisaPreLoop(node, context, *, collect=False):
    """Collect all CSS definitions.

    Walks the DOM below ``node``. The text of ``<style>`` elements and an
    ``@import`` rule for each ``<link rel="stylesheet">`` are passed to
    ``context.addCSS`` when their type is empty or ``text/css`` and their
    media list is empty or names ``all``, ``print`` or ``pdf``.

    :param collect: when true, text found below ``node`` is concatenated and
        returned (used for the children of ``<style>``).
    :return: the collected text, or ``""``.
    """
    data = ""
    if node.nodeType == Node.TEXT_NODE and collect:
        data = node.data

    elif node.nodeType == Node.ELEMENT_NODE:
        name = node.tagName.lower()

        if name in ("style", "link"):
            attr = pisaGetAttributes(context, name, node.attributes)
            # Attribute values may be None when the tag omits them.
            media = [
                x.strip() for x in (attr.media or "").lower().split(",") if x.strip()
            ]
            cssType = (attr.get("type") or "").lower()
            mediaMatches = (
                not media or "all" in media or "print" in media or "pdf" in media
            )

            if cssType in ("", "text/css") and mediaMatches:
                if name == "style":
                    for child in node.childNodes:
                        data += pisaPreLoop(child, context, collect=True)
                    context.addCSS(data)
                    return ""

                if (
                    name == "link"
                    and attr.href
                    and (attr.rel or "").lower() == "stylesheet"
                ):
                    context.addCSS(
                        '\n@import "{}" {};'.format(attr.href, ",".join(media))
                    )

    for child in node.childNodes:
        result = pisaPreLoop(child, context, collect=collect)
        if collect:
            data += result

    return data


def pisaLoop(node, context, path=None, **kw):
    """Recursively turn the DOM below ``node`` into story content on ``context``.

    Text nodes are added as fragments. Element nodes get their attributes and
    CSS resolved, optional page/frame breaks emitted, their ``pisaTag<NAME>``
    handler (if one exists in this module's globals) started and ended around
    the children, and optional KeepInFrame / static-frame wrapping applied.
    Any other node type is only traversed.

    :param path: list of tag names from the root down to the parent.
    :param kw: running margins (``margin-top``/``-bottom``/``-left``/``-right``);
        copied per call so that siblings do not see each other's changes.
    """
    if path is None:
        path = []

    # Initialize KW
    if not kw:
        kw = {"margin-top": 0, "margin-bottom": 0, "margin-left": 0, "margin-right": 0}
    else:
        kw = copy.copy(kw)

    # TEXT
    if node.nodeType == Node.TEXT_NODE:
        context.addFrag(node.data)

    # ELEMENT
    elif node.nodeType == Node.ELEMENT_NODE:
        node.tagName = node.tagName.replace(":", "").lower()

        if node.tagName in ("style", "script"):
            return

        path = [*copy.copy(path), node.tagName]

        # Prepare attributes
        attr = pisaGetAttributes(context, node.tagName, node.attributes)

        # Calculate styles
        context.cssAttr = CSSCollect(node, context)
        context.cssAttr = mapNonStandardAttrs(context.cssAttr, node, attr)
        context.node = node

        # Block?
        PAGE_BREAK = 1
        PAGE_BREAK_RIGHT = 2
        PAGE_BREAK_LEFT = 3

        pageBreakAfter = False
        frameBreakAfter = False
        display = lower(context.cssAttr.get("display", "inline"))
        isBlock = display == "block"

        if isBlock:
            context.addPara()

            # Page break by CSS
            if "-pdf-next-page" in context.cssAttr:
                context.addStory(
                    NextPageTemplate(str(context.cssAttr["-pdf-next-page"]))
                )
            if _cssKeyword(context.cssAttr, "-pdf-page-break") == "before":
                context.addStory(PageBreak())

            frameBreak = _cssKeyword(context.cssAttr, "-pdf-frame-break")
            if frameBreak == "before":
                context.addStory(FrameBreak())
            if frameBreak == "after":
                frameBreakAfter = True

            breakBefore = _cssKeyword(context.cssAttr, "page-break-before")
            if breakBefore in ("always", "right", "left"):
                context.addStory(PageBreak())
                if breakBefore == "right":
                    context.addStory(PmlRightPageBreak())
                elif breakBefore == "left":
                    context.addStory(PmlLeftPageBreak())

            breakAfter = _cssKeyword(context.cssAttr, "page-break-after")
            if breakAfter == "always":
                pageBreakAfter = PAGE_BREAK
            elif breakAfter == "right":
                pageBreakAfter = PAGE_BREAK_RIGHT
            elif breakAfter == "left":
                pageBreakAfter = PAGE_BREAK_LEFT

        if display == "none":
            return

        # Translate CSS to frags

        # Save previous frag styles
        context.pushFrag()

        # Map styles to Reportlab fragment properties
        CSS2Frag(context, kw, isBlock=isBlock)

        # EXTRAS
        transform_attrs(
            context.frag,
            (
                ("keepWithNext", "-pdf-keep-with-next"),
                ("outline", "-pdf-outline"),
                # ("borderLeftColor", "-pdf-outline-open"),
            ),
            context.cssAttr,
            getBool,
        )

        if "-pdf-outline-level" in context.cssAttr:
            context.frag.outlineLevel = int(context.cssAttr["-pdf-outline-level"])

        if "-pdf-word-wrap" in context.cssAttr:
            context.frag.wordWrap = context.cssAttr["-pdf-word-wrap"]

        # handle keep-in-frame
        keepInFrameMode = None
        keepInFrameMaxWidth = 0
        keepInFrameMaxHeight = 0
        if "-pdf-keep-in-frame-mode" in context.cssAttr:
            value = str(context.cssAttr["-pdf-keep-in-frame-mode"]).strip().lower()
            if value in ("shrink", "error", "overflow", "truncate"):
                keepInFrameMode = value
            else:
                # Added because we need a default value.
                keepInFrameMode = "shrink"

        # NOTE(review): neither max-* property appears in the attrNames list
        # in this file, so unless cssAttr receives them some other way these
        # two branches never fire - confirm whether that is intended.
        if "-pdf-keep-in-frame-max-width" in context.cssAttr:
            keepInFrameMaxWidth = getSize(
                "".join(context.cssAttr["-pdf-keep-in-frame-max-width"])
            )
        if "-pdf-keep-in-frame-max-height" in context.cssAttr:
            keepInFrameMaxHeight = getSize(
                "".join(context.cssAttr["-pdf-keep-in-frame-max-height"])
            )

        # ignore nested keep-in-frames, tables have their own KIF handling
        keepInFrame = keepInFrameMode is not None and context.keepInFrameIndex is None
        if keepInFrame:
            # keep track of current story index, so we can wrap everything
            # added after this point in a KeepInFrame
            context.keepInFrameIndex = len(context.story)

        # BEGIN tag
        # node.tagName was normalised above, so it contains no ":" any more.
        klass = globals().get(f"pisaTag{node.tagName.upper()}", None)
        obj = None

        # Static block
        elementId = attr.get("id", None)
        staticFrame = context.frameStatic.get(elementId, None)
        if staticFrame:
            context.frag.insideStaticFrame += 1
            oldStory = context.swapStory()

        # Tag specific operations
        if klass is not None:
            obj = klass(node, attr)
            obj.start(context)

        # Visit child nodes
        context.fragBlock = fragBlock = copy.copy(context.frag)
        for child in node.childNodes:
            pisaLoop(child, context, path, **kw)
        # Children overwrite context.fragBlock; restore this element's copy.
        context.fragBlock = fragBlock

        # END tag
        if obj:
            obj.end(context)

        # Block?
        if isBlock:
            context.addPara()

            # XXX Buggy!

            # Page break by CSS
            if pageBreakAfter:
                context.addStory(PageBreak())
                if pageBreakAfter == PAGE_BREAK_RIGHT:
                    context.addStory(PmlRightPageBreak())
                if pageBreakAfter == PAGE_BREAK_LEFT:
                    context.addStory(PmlLeftPageBreak())
            if frameBreakAfter:
                context.addStory(FrameBreak())

        if keepInFrame:
            # get all content added after start of -pdf-keep-in-frame and wrap
            # it in a KeepInFrame
            substory = context.story[context.keepInFrameIndex :]
            context.story = context.story[: context.keepInFrameIndex]
            context.story.append(
                KeepInFrame(
                    content=substory,
                    maxWidth=keepInFrameMaxWidth,
                    maxHeight=keepInFrameMaxHeight,
                    mode=keepInFrameMode,
                )
            )
            # mode wasn't being used; it is necessary for tables or images at
            # end of page.
            context.keepInFrameIndex = None

        # Static block, END
        if staticFrame:
            context.addPara()
            for frame in staticFrame:
                frame.pisaStaticStory = context.story
            context.swapStory(oldStory)
            context.frag.insideStaticFrame -= 1

        # Reset frag style
        context.pullFrag()

    # Unknown or not handled
    else:
        # Loop over children
        for child in node.childNodes:
            pisaLoop(child, context, path, **kw)


def pisaParser(
    src,
    context,
    default_css="",
    xhtml=False,  # noqa: FBT002
    encoding="utf8",
    xml_output=None,
):
    """
    - Parse HTML and get miniDOM
    - Extract CSS informations, add default CSS, parse CSS
    - Handle the document DOM itself and build reportlab story
    - Return Context object.

    :param src: the document; a ``str`` is encoded with ``encoding`` (UTF-8
        when ``encoding`` is empty) and wrapped in a ``pisaTempFile``, anything
        else is handed to the html5lib parser unchanged.
    :param context: context object that receives the CSS and the story.
    :param default_css: optional default stylesheet text.
    :param xhtml: deprecated; logs a warning. Uses ``html5lib.XHTMLParser``
        when the installed html5lib provides it, else the regular HTML parser.
    :param encoding: encoding used for ``str`` sources and for ``xml_output``.
    :param xml_output: optional writable object that receives the pretty
        printed DOM.
    """
    global CSSAttrCache  # noqa: PLW0603
    CSSAttrCache = {}

    parserClass = html5lib.HTMLParser
    if xhtml:
        log.warning("xhtml parameter will be removed on next release 0.2.8")
        # html5lib may not provide XHTMLParser; fall back to the HTML parser
        # instead of failing with AttributeError.
        parserClass = getattr(html5lib, "XHTMLParser", html5lib.HTMLParser)
    parser = parserClass(tree=treebuilders.getTreeBuilder("dom"))

    parser_kwargs = {}
    if isinstance(src, str):
        # If an encoding was provided, do not change it.
        if not encoding:
            encoding = "utf-8"
        src = src.encode(encoding)
        src = pisaTempFile(src, capacity=context.capacity)
        # To pass the encoding used to convert the text_type src to binary_type
        # on to html5lib's parser to ensure proper decoding
        parser_kwargs["transport_encoding"] = encoding

    document = parser.parse(src, **parser_kwargs)

    if xml_output:
        xml_output.write(document.toprettyxml(encoding=encoding))

    if default_css:
        context.addDefaultCSS(default_css)

    pisaPreLoop(document, context)
    context.parseCSS()
    pisaLoop(document, context)
    return context


# Shortcuts

HTML2PDF = pisaParser


def XHTML2PDF(*a, **kw):
    """Call HTML2PDF with ``xhtml=True`` forced."""
    kw["xhtml"] = True
    return HTML2PDF(*a, **kw)


XML2PDF = XHTML2PDF