from __future__ import absolute_import from copy import copy from itertools import chain import os import re import sys import shlex from tempfile import NamedTemporaryFile from django.utils.encoding import smart_str try: from urllib.request import pathname2url from urllib.parse import urljoin except ImportError: # Python2 from urllib import pathname2url from urlparse import urljoin import django from django.conf import settings from django.template import loader from django.template.context import Context, RequestContext import six from .subprocess import check_output NO_ARGUMENT_OPTIONS = ['--collate', '--no-collate', '-H', '--extended-help', '-g', '--grayscale', '-h', '--help', '--htmldoc', '--license', '-l', '--lowquality', '--manpage', '--no-pdf-compression', '-q', '--quiet', '--read-args-from-stdin', '--readme', '--use-xserver', '-V', '--version', '--dump-default-toc-xsl', '--outline', '--no-outline', '--background', '--no-background', '--custom-header-propagation', '--no-custom-header-propagation', '--debug-javascript', '--no-debug-javascript', '--default-header', '--disable-external-links', '--enable-external-links', '--disable-forms', '--enable-forms', '--images', '--no-images', '--disable-internal-links', '--enable-internal-links', '-n', '--disable-javascript', '--enable-javascript', '--keep-relative-links', '--disable-local-file-access', '--enable-local-file-access', '--exclude-from-outline', '--include-in-outline', '--disable-plugins', '--enable-plugins', '--print-media-type', '--no-print-media-type', '--resolve-relative-links', '--disable-smart-shrinking', '--enable-smart-shrinking', '--stop-slow-scripts', '--no-stop-slow-scripts', '--disable-toc-back-links', '--enable-toc-back-links', '--footer-line', '--no-footer-line', '--header-line', '--no-header-line', '--disable-dotted-lines', '--disable-toc-links', '--verbose'] def _options_to_args(**options): """ Converts ``options`` into a list of command-line arguments. Skip arguments where no value is provided For flag-type (No argument) variables, pass only the name and only then if the value is True """ flags = [] for name in sorted(options): value = options[name] formatted_flag = '--%s' % name if len(name) > 1 else '-%s' % name formatted_flag = formatted_flag.replace('_', '-') accepts_no_arguments = formatted_flag in NO_ARGUMENT_OPTIONS if value is None or (value is False and accepts_no_arguments): continue flags.append(formatted_flag) if accepts_no_arguments: continue flags.append(six.text_type(value)) return flags def wkhtmltopdf(pages, output=None, **kwargs): """ Converts html to PDF using http://wkhtmltopdf.org/. pages: List of file paths or URLs of the html to be converted. output: Optional output file path. If None, the output is returned. **kwargs: Passed to wkhtmltopdf via _extra_args() (See https://github.com/antialize/wkhtmltopdf/blob/master/README_WKHTMLTOPDF for acceptable args.) Kwargs is passed through as arguments. e.g.: {'footer_html': 'http://example.com/foot.html'} becomes '--footer-html http://example.com/foot.html' Where there is no value passed, use True. e.g.: {'disable_javascript': True} becomes: '--disable-javascript' To disable a default option, use None. e.g: {'quiet': None'} becomes: '' example usage: wkhtmltopdf(pages=['/tmp/example.html'], dpi=300, orientation='Landscape', disable_javascript=True) """ if isinstance(pages, six.string_types): # Support a single page. pages = [pages] if output is None: # Standard output. output = '-' has_cover = kwargs.pop('has_cover', False) # Default options: options = getattr(settings, 'WKHTMLTOPDF_CMD_OPTIONS', None) if options is None: options = {'quiet': True} else: options = copy(options) options.update(kwargs) # Force --encoding utf8 unless the user has explicitly overridden this. options.setdefault('encoding', 'utf8') env = getattr(settings, 'WKHTMLTOPDF_ENV', None) if env is not None: env = dict(os.environ, **env) cmd = 'WKHTMLTOPDF_CMD' cmd = getattr(settings, cmd, os.environ.get(cmd, 'wkhtmltopdf')) # Adding 'cover' option to add cover_file to the pdf to generate. if has_cover: pages.insert(0, 'cover') ck_args = list(chain(shlex.split(cmd), _options_to_args(**options), list(pages), [output])) ck_kwargs = {'env': env} # Handling of fileno() attr. based on https://github.com/GrahamDumpleton/mod_wsgi/issues/85 try: i = sys.stderr.fileno() ck_kwargs['stderr'] = sys.stderr except (AttributeError, IOError): # can't call fileno() on mod_wsgi stderr object pass return check_output(ck_args, **ck_kwargs) def convert_to_pdf(filename, header_filename=None, footer_filename=None, cmd_options=None, cover_filename=None): # Clobber header_html and footer_html only if filenames are # provided. These keys may be in self.cmd_options as hardcoded # static files. # The argument `filename` may be a string or a list. However, wkhtmltopdf # will coerce it into a list if a string is passed. cmd_options = cmd_options if cmd_options else {} if cover_filename: pages = [cover_filename, filename] cmd_options['has_cover'] = True else: pages = [filename] if header_filename is not None: cmd_options['header_html'] = header_filename if footer_filename is not None: cmd_options['footer_html'] = footer_filename return wkhtmltopdf(pages=pages, **cmd_options) class RenderedFile(object): """ Create a temporary file resource of the rendered template with context. The filename will be used for later conversion to PDF. """ temporary_file = None filename = '' def __init__(self, template, context, request=None): debug = getattr(settings, 'WKHTMLTOPDF_DEBUG', settings.DEBUG) self.temporary_file = render_to_temporary_file( template=template, context=context, request=request, prefix='wkhtmltopdf', suffix='.html', delete=(not debug) ) self.filename = self.temporary_file.name def __del__(self): # Always close the temporary_file on object destruction. if self.temporary_file is not None: self.temporary_file.close() def render_pdf_from_template(input_template, header_template, footer_template, context, request=None, cmd_options=None, cover_template=None): # For basic usage. Performs all the actions necessary to create a single # page PDF from a single template and context. cmd_options = cmd_options if cmd_options else {} header_filename = footer_filename = None # Main content. input_file = RenderedFile( template=input_template, context=context, request=request ) # Optional. For header template argument. if header_template: header_file = RenderedFile( template=header_template, context=context, request=request ) header_filename = header_file.filename # Optional. For footer template argument. if footer_template: footer_file = RenderedFile( template=footer_template, context=context, request=request ) footer_filename = footer_file.filename cover = None if cover_template: cover = RenderedFile( template=cover_template, context=context, request=request ) return convert_to_pdf(filename=input_file.filename, header_filename=header_filename, footer_filename=footer_filename, cmd_options=cmd_options, cover_filename=cover.filename if cover else None) def content_disposition_filename(filename): """ Sanitize a file name to be used in the Content-Disposition HTTP header. Even if the standard is quite permissive in terms of characters, there are a lot of edge cases that are not supported by different browsers. See http://greenbytes.de/tech/tc2231/#attmultinstances for more details. """ filename = filename.replace(';', '').replace('"', '') return http_quote(filename) def http_quote(string): """ Given a unicode string, will do its dandiest to give you back a valid ascii charset string you can use in, say, http headers and the like. """ if isinstance(string, six.text_type): try: import unidecode except ImportError: pass else: string = unidecode.unidecode(string) string = string.encode('ascii', 'replace') # Wrap in double-quotes for ; , and the like string = string.replace(b'\\', b'\\\\').replace(b'"', b'\\"') return '"{0!s}"'.format(string.decode()) def pathname2fileurl(pathname): """Returns a file:// URL for pathname. Handles OS-specific conversions.""" return urljoin('file:', pathname2url(pathname)) def make_absolute_paths(content): """Convert all MEDIA files into a file://URL paths in order to correctly get it displayed in PDFs.""" overrides = [ { 'root': settings.MEDIA_ROOT, 'url': settings.MEDIA_URL, }, { 'root': settings.STATIC_ROOT, 'url': settings.STATIC_URL, } ] has_scheme = re.compile(r'^[^:/]+://') for x in overrides: if not x['url'] or has_scheme.match(x['url']): continue root = str(x['root']) if not root.endswith('/'): root += '/' occur_pattern = '''(["|']{0}.*?["|'])''' occurences = re.findall(occur_pattern.format(x['url']), content) occurences = list(set(occurences)) # Remove dups for occur in occurences: content = content.replace(occur, '"%s"' % ( pathname2fileurl(root) + occur[1 + len(x['url']): -1])) return content def render_to_temporary_file(template, context, request=None, mode='w+b', bufsize=-1, suffix='.html', prefix='tmp', dir=None, delete=True): try: render = template.render except AttributeError: content = loader.render_to_string(template, context) else: if django.VERSION < (1, 8): # If using a version of Django prior to 1.8, ensure ``context`` is an # instance of ``Context`` if not isinstance(context, Context): if request: context = RequestContext(request, context) else: context = Context(context) # Handle error when ``request`` is None content = render(context) else: content = render(context, request) content = smart_str(content) content = make_absolute_paths(content) try: # Python3 has 'buffering' arg instead of 'bufsize' tempfile = NamedTemporaryFile(mode=mode, buffering=bufsize, suffix=suffix, prefix=prefix, dir=dir, delete=delete) except TypeError: tempfile = NamedTemporaryFile(mode=mode, bufsize=bufsize, suffix=suffix, prefix=prefix, dir=dir, delete=delete) try: tempfile.write(content.encode('utf-8')) tempfile.flush() return tempfile except: # Clean-up tempfile if an Exception is raised. tempfile.close() raise