-
Notifications
You must be signed in to change notification settings - Fork 330
/
Copy pathpdfkit.py
309 lines (251 loc) · 10.8 KB
/
pdfkit.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
# -*- coding: utf-8 -*-
import re
import subprocess
import sys
from collections import OrderedDict
from .source import Source
from .configuration import Configuration
import io
import codecs
try:
    # Python 2.x and 3.x support for checking string types:
    # both names exist on Python 2; on Python 3 the lookup raises NameError
    # and they are aliased to ``str`` below.
    basestring
    unicode
except NameError:
    basestring = str
    unicode = str
class PDFKit(object):
    """
    Main class that does all generation routine.

    :param url_or_file: str - either a URL, a path to a file or a string containing HTML
                       to convert
    :param type_: str - either 'url', 'file' or 'string'
    :param options: dict (optional) with wkhtmltopdf options, with or w/o '--'
    :param toc: dict (optional) - toc-specific wkhtmltopdf options, with or w/o '--'
    :param cover: str (optional) - url/filename with a cover html page
    :param css: str or list (optional) - path(s) of CSS file(s) to inline into
                the source (single file or string sources only)
    :param configuration: (optional) instance of pdfkit.configuration.Configuration()
    :param cover_first: bool (optional) - emit the cover before the toc instead of after it
    :param verbose: bool (optional) - when False, '--quiet' is added to the options
    :param raise_exceptions: bool (optional) - when False, the wkhtmltopdf exit
                             status is only checked if nothing was written to stdout
    """

    class ImproperSourceError(Exception):
        """Wrong source type for stylesheets"""

        def __init__(self, msg):
            # Forward the message to Exception so that ``args`` is populated.
            Exception.__init__(self, msg)
            self.msg = msg

        def __str__(self):
            return self.msg

    def __init__(self, url_or_file, type_, options=None, toc=None, cover=None, css=None, configuration=None,
                 cover_first=False, verbose=False, raise_exceptions=True):
        self.source = Source(url_or_file, type_)
        self.configuration = (Configuration() if configuration is None
                              else configuration)

        # The configured executable path may be a bytes object; a value
        # without ``.decode`` is used as-is.
        try:
            self.wkhtmltopdf = self.configuration.wkhtmltopdf.decode('utf-8')
        except AttributeError:
            self.wkhtmltopdf = self.configuration.wkhtmltopdf

        # Options found in the HTML meta tags come first, so that explicitly
        # passed options override them.
        self.options = OrderedDict()
        if self.source.isString():
            self.options.update(self._find_options_in_meta(url_or_file))

        self.environ = self.configuration.environ

        if options is not None:
            self.options.update(options)

        self.toc = {} if toc is None else toc
        self.cover = cover
        self.cover_first = cover_first
        self.verbose = verbose
        self.css = css
        self.stylesheets = []
        self.raise_exceptions = raise_exceptions

    def _genargs(self, opts):
        """
        Generator of args parts based on options specification.

        Note: Empty parts will be filtered out at _command generator

        :param opts: dict {option name: value}
        :raises AssertionError: if a list/tuple option value is not a pair of
                                two non-empty items
        """
        for optkey, optval in self._normalize_options(opts):
            yield optkey

            if isinstance(optval, (list, tuple)):
                # Raised explicitly (instead of using ``assert``) so that the
                # validation also runs under ``python -O``.
                if not (len(optval) == 2 and optval[0] and optval[1]):
                    raise AssertionError('Option value can only be either a string or a (tuple, list) of 2 items')
                yield optval[0]
                yield optval[1]
            else:
                yield optval

    def _command(self, path=None):
        """
        Generator of all command parts

        Side effects on every call: the CSS (if any) is prepended to the
        source, and '--quiet' is added to ``self.options`` unless verbose.

        :param path: output file path; when falsy the PDF goes to stdout ('-')
        """
        if self.css:
            self._prepend_css(self.css)

        yield self.wkhtmltopdf

        if not self.verbose:
            self.options.update({'--quiet': ''})

        for argpart in self._genargs(self.options):
            if argpart:
                yield argpart

        if self.cover and self.cover_first:
            yield 'cover'
            yield self.cover

        if self.toc:
            yield 'toc'
            for argpart in self._genargs(self.toc):
                if argpart:
                    yield argpart

        if self.cover and not self.cover_first:
            yield 'cover'
            yield self.cover

        # If the source is a string then we will pipe it into wkhtmltopdf
        # If the source is file-like then we will read from it and pipe it in
        if self.source.isString() or self.source.isFileObj():
            yield '-'
        else:
            if isinstance(self.source.source, basestring):
                yield self.source.to_s()
            else:
                for s in self.source.source:
                    yield s

        # If output_path evaluates to False append '-' to end of args
        # and wkhtmltopdf will pass generated PDF to stdout
        if path:
            yield path
        else:
            yield '-'

    def command(self, path=None):
        """Return the full wkhtmltopdf command line as a list of arguments."""
        return list(self._command(path))

    @staticmethod
    def handle_error(exit_code, stderr):
        """
        Raise IOError describing a failed wkhtmltopdf run.

        :param exit_code: process return code; 0 means success and returns
        :param stderr: str - decoded output of the process
        :raises IOError: for a non-zero exit code, unless the second last
                         output line is 'Done'
        """
        if exit_code == 0:
            return

        stderr_lines = stderr.splitlines()

        # Sometimes wkhtmltopdf will exit with non-zero
        # even if it finishes generation.
        # If will display 'Done' in the second last line
        if len(stderr_lines) > 1 and stderr_lines[-2].strip() == 'Done':
            return

        if 'cannot connect to X server' in stderr:
            raise IOError('%s\n'
                          'You will need to run wkhtmltopdf within a "virtual" X server.\n'
                          'Go to the link below for more information\n'
                          'https://github.com/JazzCore/python-pdfkit/wiki/Using-wkhtmltopdf-without-X-server' % stderr)

        if 'Error' in stderr:
            raise IOError('wkhtmltopdf reported an error:\n' + stderr)

        error_msg = stderr or 'Unknown Error'
        raise IOError("wkhtmltopdf exited with non-zero code {0}. error:\n{1}".format(exit_code, error_msg))

    def to_pdf(self, path=None):
        """
        Run wkhtmltopdf and produce the PDF.

        :param path: output file path; when falsy the PDF is captured from stdout
        :returns: the PDF bytes read from stdout when no path is given,
                  otherwise True once the output file has been checked
        :raises IOError: if wkhtmltopdf fails, or the output file cannot be
                         read or is empty
        """
        args = self.command(path)

        popen_kwargs = {
            'stdin': subprocess.PIPE,
            'stdout': subprocess.PIPE,
            'stderr': subprocess.PIPE,
            'env': self.environ,
        }
        if sys.platform == 'win32':
            # hide cmd window
            startupinfo = subprocess.STARTUPINFO()
            startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
            startupinfo.wShowWindow = subprocess.SW_HIDE
            popen_kwargs['startupinfo'] = startupinfo

        result = subprocess.Popen(args, **popen_kwargs)

        # If the source is a string then we will pipe it into wkhtmltopdf.
        # If we want to add custom CSS to file then we read input file to
        # string and prepend css to it and then pass it to stdin.
        # This is a workaround for a bug in wkhtmltopdf (look closely in README)
        if self.source.isString() or (self.source.isFile() and self.css):
            stdin_data = self.source.to_s().encode('utf-8')
        elif self.source.isFileObj():
            stdin_data = self.source.source.read()
            # A file object opened in binary mode already yields bytes;
            # only text needs to be encoded before piping.
            if not isinstance(stdin_data, bytes):
                stdin_data = stdin_data.encode('utf-8')
        else:
            stdin_data = None

        stdout, stderr = result.communicate(input=stdin_data)
        stderr = stderr or stdout or b""
        stderr = stderr.decode('utf-8', errors='replace')
        exit_code = result.returncode

        # In some cases we don't want to handle errors if we want clean wkhtmltopdf output,
        # but if we don't have stdout, we have to do it anyway
        if not stdout or self.raise_exceptions:
            self.handle_error(exit_code, stderr)

        # Since wkhtmltopdf sends its output to stderr we will capture it
        # and properly send to stdout
        if '--quiet' not in args:
            sys.stdout.write(stderr)

        if not path:
            return stdout

        try:
            # Read in binary mode: the file is a PDF, not UTF-8 text.
            with open(path, 'rb') as f:
                # read 4 bytes to get PDF signature '%PDF'
                signature = f.read(4)
        except (IOError, OSError) as e:
            raise IOError('Command failed: %s\n'
                          'Check whhtmltopdf output without \'quiet\' option\n'
                          '%s ' % (' '.join(args), e))

        # Checked outside the ``try`` so this error is not caught and
        # re-wrapped by the handler above.
        if not signature:
            raise IOError('Command failed: %s\n'
                          'Check whhtmltopdf output without \'quiet\' '
                          'option' % ' '.join(args))
        return True

    def _normalize_options(self, options):
        """ Generator of 2-tuples (option-key, option-value).
        When options spec is a list, generate a 2-tuples per list item.

        :param options: dict {option name: value}

        returns:
          iterator (option-key, option-value)
          - option names lower cased and prepended with
          '--' if necessary. Non-empty values cast to str;
          booleans become '' (True) or stay False, and numeric
          zero is kept as '0' rather than being dropped.
        """
        for key, value in list(options.items()):
            normalized_key = self._normalize_arg(key)
            # Only a leading '--' means the key is already prefixed.
            if not normalized_key.startswith('--'):
                normalized_key = '--%s' % normalized_key

            if isinstance(value, (list, tuple)):
                for optval in value:
                    yield (normalized_key, optval)
                continue

            is_bool = isinstance(value, bool)
            # 0 is a legitimate option value (e.g. a margin); it must be
            # stringified, otherwise _command filters it out as falsy.
            is_zero = (not is_bool
                       and isinstance(value, (int, float))
                       and value == 0)
            if value or is_zero:
                yield (normalized_key, unicode('' if is_bool else value))
            else:
                yield (normalized_key, value)

    def _normalize_arg(self, arg):
        """Return the option name lower cased."""
        return arg.lower()

    def _style_tag_for(self, stylesheet):
        """Wrap the stylesheet text in a <style> tag."""
        return "<style>%s</style>" % stylesheet

    def _inject_css(self, html, css_data):
        """Return html with the CSS inlined before '</head>', or prepended when there is no '</head>'."""
        style_tag = self._style_tag_for(css_data)
        if '</head>' in html:
            return html.replace('</head>', style_tag + '</head>')
        return style_tag + html

    def _prepend_css(self, path):
        """
        Inline the content of CSS file(s) into the source.

        A file source is read and replaced by a string source holding the
        modified HTML; a string source is modified in place.

        :param path: str or list - path(s) of the CSS file(s), read as UTF-8
        :raises ImproperSourceError: if the source is a URL or a list
        """
        if self.source.isUrl() or isinstance(self.source.source, list):
            raise self.ImproperSourceError('CSS files can be added only to a single '
                                           'file or string')

        if not isinstance(path, list):
            path = [path]

        css_data = []
        for p in path:
            with codecs.open(p, encoding="UTF-8") as f:
                css_data.append(f.read())
        css_data = "\n".join(css_data)

        if self.source.isFile():
            with codecs.open(self.source.to_s(), encoding="UTF-8") as f:
                inp = f.read()
            self.source = Source(self._inject_css(inp, css_data), 'string')
        elif self.source.isString():
            self.source.source = self._inject_css(self.source.to_s(), css_data)

    def _find_options_in_meta(self, content):
        """Reads 'content' and extracts options encoded in HTML meta tags

        :param content: str or file-like object - contains HTML to parse

        returns:
          dict: {config option: value}
          Meta tags without a 'content' attribute are skipped.
        """
        if (isinstance(content, io.IOBase)
                or content.__class__.__name__ == 'StreamReaderWriter'):
            content = content.read()

        # Escape the prefix so it is matched literally inside the pattern.
        prefix = re.escape(self.configuration.meta_tag_prefix)
        name_re = re.compile('name=["\']%s([^"\']*)' % prefix)
        content_re = re.compile('content=["\']([^"\']*)')

        found = {}
        for tag in re.findall('<meta [^>]*>', content):
            name_match = name_re.search(tag)
            if name_match is None:
                continue
            content_match = content_re.search(tag)
            if content_match is None:
                continue
            found[name_match.group(1)] = content_match.group(1)

        return found