352 lines
12 KiB
Python

from __future__ import absolute_import
from copy import copy
from itertools import chain
import os
import re
import sys
import shlex
from tempfile import NamedTemporaryFile
from django.utils.encoding import smart_str
try:
from urllib.request import pathname2url
from urllib.parse import urljoin
except ImportError: # Python2
from urllib import pathname2url
from urlparse import urljoin
import django
from django.conf import settings
from django.template import loader
from django.template.context import Context, RequestContext
import six
from .subprocess import check_output
NO_ARGUMENT_OPTIONS = ['--collate', '--no-collate', '-H', '--extended-help', '-g',
'--grayscale', '-h', '--help', '--htmldoc', '--license', '-l',
'--lowquality', '--manpage', '--no-pdf-compression', '-q',
'--quiet', '--read-args-from-stdin', '--readme',
'--use-xserver', '-V', '--version', '--dump-default-toc-xsl',
'--outline', '--no-outline', '--background', '--no-background',
'--custom-header-propagation', '--no-custom-header-propagation',
'--debug-javascript', '--no-debug-javascript', '--default-header',
'--disable-external-links', '--enable-external-links',
'--disable-forms', '--enable-forms', '--images', '--no-images',
'--disable-internal-links', '--enable-internal-links', '-n',
'--disable-javascript', '--enable-javascript', '--keep-relative-links',
'--disable-local-file-access', '--enable-local-file-access',
'--exclude-from-outline', '--include-in-outline', '--disable-plugins',
'--enable-plugins', '--print-media-type', '--no-print-media-type',
'--resolve-relative-links', '--disable-smart-shrinking',
'--enable-smart-shrinking', '--stop-slow-scripts',
'--no-stop-slow-scripts', '--disable-toc-back-links',
'--enable-toc-back-links', '--footer-line', '--no-footer-line',
'--header-line', '--no-header-line', '--disable-dotted-lines',
'--disable-toc-links', '--verbose']
def _options_to_args(**options):
"""
Converts ``options`` into a list of command-line arguments.
Skip arguments where no value is provided
For flag-type (No argument) variables, pass only the name and only then if the value is True
"""
flags = []
for name in sorted(options):
value = options[name]
formatted_flag = '--%s' % name if len(name) > 1 else '-%s' % name
formatted_flag = formatted_flag.replace('_', '-')
accepts_no_arguments = formatted_flag in NO_ARGUMENT_OPTIONS
if value is None or (value is False and accepts_no_arguments):
continue
flags.append(formatted_flag)
if accepts_no_arguments:
continue
flags.append(six.text_type(value))
return flags
def wkhtmltopdf(pages, output=None, **kwargs):
"""
Converts html to PDF using http://wkhtmltopdf.org/.
pages: List of file paths or URLs of the html to be converted.
output: Optional output file path. If None, the output is returned.
**kwargs: Passed to wkhtmltopdf via _extra_args() (See
https://github.com/antialize/wkhtmltopdf/blob/master/README_WKHTMLTOPDF
for acceptable args.)
Kwargs is passed through as arguments. e.g.:
{'footer_html': 'http://example.com/foot.html'}
becomes
'--footer-html http://example.com/foot.html'
Where there is no value passed, use True. e.g.:
{'disable_javascript': True}
becomes:
'--disable-javascript'
To disable a default option, use None. e.g:
{'quiet': None'}
becomes:
''
example usage:
wkhtmltopdf(pages=['/tmp/example.html'],
dpi=300,
orientation='Landscape',
disable_javascript=True)
"""
if isinstance(pages, six.string_types):
# Support a single page.
pages = [pages]
if output is None:
# Standard output.
output = '-'
has_cover = kwargs.pop('has_cover', False)
# Default options:
options = getattr(settings, 'WKHTMLTOPDF_CMD_OPTIONS', None)
if options is None:
options = {'quiet': True}
else:
options = copy(options)
options.update(kwargs)
# Force --encoding utf8 unless the user has explicitly overridden this.
options.setdefault('encoding', 'utf8')
env = getattr(settings, 'WKHTMLTOPDF_ENV', None)
if env is not None:
env = dict(os.environ, **env)
cmd = 'WKHTMLTOPDF_CMD'
cmd = getattr(settings, cmd, os.environ.get(cmd, 'wkhtmltopdf'))
# Adding 'cover' option to add cover_file to the pdf to generate.
if has_cover:
pages.insert(0, 'cover')
ck_args = list(chain(shlex.split(cmd),
_options_to_args(**options),
list(pages),
[output]))
ck_kwargs = {'env': env}
# Handling of fileno() attr. based on https://github.com/GrahamDumpleton/mod_wsgi/issues/85
try:
i = sys.stderr.fileno()
ck_kwargs['stderr'] = sys.stderr
except (AttributeError, IOError):
# can't call fileno() on mod_wsgi stderr object
pass
return check_output(ck_args, **ck_kwargs)
def convert_to_pdf(filename, header_filename=None, footer_filename=None, cmd_options=None, cover_filename=None):
# Clobber header_html and footer_html only if filenames are
# provided. These keys may be in self.cmd_options as hardcoded
# static files.
# The argument `filename` may be a string or a list. However, wkhtmltopdf
# will coerce it into a list if a string is passed.
cmd_options = cmd_options if cmd_options else {}
if cover_filename:
pages = [cover_filename, filename]
cmd_options['has_cover'] = True
else:
pages = [filename]
if header_filename is not None:
cmd_options['header_html'] = header_filename
if footer_filename is not None:
cmd_options['footer_html'] = footer_filename
return wkhtmltopdf(pages=pages, **cmd_options)
class RenderedFile(object):
"""
Create a temporary file resource of the rendered template with context.
The filename will be used for later conversion to PDF.
"""
temporary_file = None
filename = ''
def __init__(self, template, context, request=None):
debug = getattr(settings, 'WKHTMLTOPDF_DEBUG', settings.DEBUG)
self.temporary_file = render_to_temporary_file(
template=template,
context=context,
request=request,
prefix='wkhtmltopdf', suffix='.html',
delete=(not debug)
)
self.filename = self.temporary_file.name
def __del__(self):
# Always close the temporary_file on object destruction.
if self.temporary_file is not None:
self.temporary_file.close()
def render_pdf_from_template(input_template, header_template, footer_template, context, request=None, cmd_options=None,
cover_template=None):
# For basic usage. Performs all the actions necessary to create a single
# page PDF from a single template and context.
cmd_options = cmd_options if cmd_options else {}
header_filename = footer_filename = None
# Main content.
input_file = RenderedFile(
template=input_template,
context=context,
request=request
)
# Optional. For header template argument.
if header_template:
header_file = RenderedFile(
template=header_template,
context=context,
request=request
)
header_filename = header_file.filename
# Optional. For footer template argument.
if footer_template:
footer_file = RenderedFile(
template=footer_template,
context=context,
request=request
)
footer_filename = footer_file.filename
cover = None
if cover_template:
cover = RenderedFile(
template=cover_template,
context=context,
request=request
)
return convert_to_pdf(filename=input_file.filename,
header_filename=header_filename,
footer_filename=footer_filename,
cmd_options=cmd_options,
cover_filename=cover.filename if cover else None)
def content_disposition_filename(filename):
"""
Sanitize a file name to be used in the Content-Disposition HTTP
header.
Even if the standard is quite permissive in terms of
characters, there are a lot of edge cases that are not supported by
different browsers.
See http://greenbytes.de/tech/tc2231/#attmultinstances for more details.
"""
filename = filename.replace(';', '').replace('"', '')
return http_quote(filename)
def http_quote(string):
"""
Given a unicode string, will do its dandiest to give you back a
valid ascii charset string you can use in, say, http headers and the
like.
"""
if isinstance(string, six.text_type):
try:
import unidecode
except ImportError:
pass
else:
string = unidecode.unidecode(string)
string = string.encode('ascii', 'replace')
# Wrap in double-quotes for ; , and the like
string = string.replace(b'\\', b'\\\\').replace(b'"', b'\\"')
return '"{0!s}"'.format(string.decode())
def pathname2fileurl(pathname):
"""Returns a file:// URL for pathname. Handles OS-specific conversions."""
return urljoin('file:', pathname2url(pathname))
def make_absolute_paths(content):
"""Convert all MEDIA files into a file://URL paths in order to
correctly get it displayed in PDFs."""
overrides = [
{
'root': settings.MEDIA_ROOT,
'url': settings.MEDIA_URL,
},
{
'root': settings.STATIC_ROOT,
'url': settings.STATIC_URL,
}
]
has_scheme = re.compile(r'^[^:/]+://')
for x in overrides:
if not x['url'] or has_scheme.match(x['url']):
continue
root = str(x['root'])
if not root.endswith('/'):
root += '/'
occur_pattern = '''(["|']{0}.*?["|'])'''
occurences = re.findall(occur_pattern.format(x['url']), content)
occurences = list(set(occurences)) # Remove dups
for occur in occurences:
content = content.replace(occur, '"%s"' % (
pathname2fileurl(root) +
occur[1 + len(x['url']): -1]))
return content
def render_to_temporary_file(template, context, request=None, mode='w+b',
bufsize=-1, suffix='.html', prefix='tmp',
dir=None, delete=True):
try:
render = template.render
except AttributeError:
content = loader.render_to_string(template, context)
else:
if django.VERSION < (1, 8):
# If using a version of Django prior to 1.8, ensure ``context`` is an
# instance of ``Context``
if not isinstance(context, Context):
if request:
context = RequestContext(request, context)
else:
context = Context(context)
# Handle error when ``request`` is None
content = render(context)
else:
content = render(context, request)
content = smart_str(content)
content = make_absolute_paths(content)
try:
# Python3 has 'buffering' arg instead of 'bufsize'
tempfile = NamedTemporaryFile(mode=mode, buffering=bufsize,
suffix=suffix, prefix=prefix,
dir=dir, delete=delete)
except TypeError:
tempfile = NamedTemporaryFile(mode=mode, bufsize=bufsize,
suffix=suffix, prefix=prefix,
dir=dir, delete=delete)
try:
tempfile.write(content.encode('utf-8'))
tempfile.flush()
return tempfile
except:
# Clean-up tempfile if an Exception is raised.
tempfile.close()
raise