Don't use WebKit to render HTML for EPUB covers

This commit is contained in:
Kovid Goyal 2019-06-26 15:02:12 +05:30
parent 27798beaf6
commit 6f86896da8
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
5 changed files with 88 additions and 126 deletions

View File

@ -8,8 +8,8 @@ Code for the conversion of ebook formats and the reading of metadata
from various formats. from various formats.
''' '''
import traceback, os, re, numbers import os, re, numbers, sys
from calibre import CurrentDir, prints from calibre import prints
from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.chardet import xml_to_unicode
from polyglot.builtins import unicode_type from polyglot.builtins import unicode_type
@ -41,40 +41,6 @@ BOOK_EXTENSIONS = ['lrf', 'rar', 'zip', 'rtf', 'lit', 'txt', 'txtz', 'text', 'ht
'textile', 'markdown', 'ibook', 'ibooks', 'iba', 'azw3', 'ps', 'kepub', 'kfx'] 'textile', 'markdown', 'ibook', 'ibooks', 'iba', 'azw3', 'ps', 'kepub', 'kfx']
class HTMLRenderer(object):
    '''
    Callback object for a page's ``loadFinished`` signal that paints the
    loaded page into a JPEG.

    After the callback has run, exactly one of the following holds:

    * ``data`` contains the JPEG bytes and ``exception`` is None, or
    * ``exception`` holds the error that occurred and both ``tb`` and
      ``traceback`` hold the formatted traceback text.

    In either case ``loop.exit(0)`` has been called.
    '''

    def __init__(self, page, loop):
        '''
        :param page: the page object whose main frame will be rendered
        :param loop: the event loop to exit once rendering has finished
        '''
        self.page, self.loop = page, loop
        self.data = ''
        # ``tb`` is the name initialised historically while the error path
        # stored into ``traceback``; keep both names and keep them in sync.
        self.exception = self.tb = self.traceback = None

    def __call__(self, ok):
        '''
        Render the page if ``ok`` is true, recording any failure on the
        instance instead of raising it. Always exits the event loop.
        '''
        try:
            # Imported inside the try block so that a failing import is
            # recorded like any other error and the finally clause still
            # exits the event loop.
            from PyQt5.Qt import QImage, QPainter, QByteArray, QBuffer
            if not ok:
                raise RuntimeError('Rendering of HTML failed.')
            de = self.page.mainFrame().documentElement()
            pe = de.findFirst('parsererror')
            if not pe.isNull():
                raise ParserError(pe.toPlainText())
            image = QImage(self.page.viewportSize(), QImage.Format_ARGB32)
            # 96 dots per inch expressed in dots per meter
            image.setDotsPerMeterX(96*(100/2.54))
            image.setDotsPerMeterY(96*(100/2.54))
            painter = QPainter(image)
            self.page.mainFrame().render(painter)
            painter.end()
            ba = QByteArray()
            buf = QBuffer(ba)
            buf.open(QBuffer.WriteOnly)
            image.save(buf, 'JPEG')
            self.data = ba.data()
        except Exception as e:
            self.exception = e
            self.tb = self.traceback = traceback.format_exc()
        finally:
            self.loop.exit(0)
def return_raster_image(path): def return_raster_image(path):
from calibre.utils.imghdr import what from calibre.utils.imghdr import what
if os.access(path, os.R_OK): if os.access(path, os.R_OK):
@ -145,63 +111,33 @@ def render_html_svg_workaround(path_to_html, log, width=590, height=750):
pass pass
if data is None: if data is None:
from calibre.gui2 import is_ok_to_use_qt
if is_ok_to_use_qt():
data = render_html_data(path_to_html, width, height) data = render_html_data(path_to_html, width, height)
else:
from calibre.utils.ipc.simple_worker import fork_job, WorkerError
try:
result = fork_job('calibre.ebooks',
'render_html_data',
(path_to_html, width, height),
no_output=True)
data = result['result']
except WorkerError as err:
prints(err.orig_tb)
except:
traceback.print_exc()
return data return data
def render_html_data(path_to_html, width, height): def render_html_data(path_to_html, width, height):
renderer = render_html(path_to_html, width, height) from calibre.ptempfile import TemporaryDirectory
return getattr(renderer, 'data', None) from calibre.utils.ipc.simple_worker import fork_job, WorkerError
def report_error(text=''):
prints('Failed to render', path_to_html, 'with errors:', file=sys.stderr)
if text:
prints(text, file=sys.stderr)
if result['stdout_stderr']:
with open(result['stdout_stderr'], 'rb') as f:
prints(f.read(), file=sys.stderr)
def render_html(path_to_html, width=590, height=750, as_xhtml=True): with TemporaryDirectory('-render-html') as tdir:
from PyQt5.QtWebKitWidgets import QWebPage try:
from PyQt5.Qt import QEventLoop, QPalette, Qt, QUrl, QSize result = fork_job('calibre.ebooks.render_html', 'main', args=(path_to_html, tdir, 'jpeg'))
from calibre.gui2 import is_ok_to_use_qt, secure_web_page except WorkerError as e:
if not is_ok_to_use_qt(): report_error(e.orig_tb)
return None
path_to_html = os.path.abspath(path_to_html)
with CurrentDir(os.path.dirname(path_to_html)):
page = QWebPage()
settings = page.settings()
secure_web_page(settings)
pal = page.palette()
pal.setBrush(QPalette.Background, Qt.white)
page.setPalette(pal)
page.setViewportSize(QSize(width, height))
page.mainFrame().setScrollBarPolicy(Qt.Vertical, Qt.ScrollBarAlwaysOff)
page.mainFrame().setScrollBarPolicy(Qt.Horizontal, Qt.ScrollBarAlwaysOff)
loop = QEventLoop()
renderer = HTMLRenderer(page, loop)
page.loadFinished.connect(renderer, type=Qt.QueuedConnection)
if as_xhtml:
page.mainFrame().setContent(open(path_to_html, 'rb').read(),
'application/xhtml+xml', QUrl.fromLocalFile(path_to_html))
else: else:
page.mainFrame().load(QUrl.fromLocalFile(path_to_html)) if result['result']:
loop.exec_() with open(os.path.join(tdir, 'rendered.jpeg'), 'rb') as f:
renderer.loop = renderer.page = None return f.read()
page.loadFinished.disconnect() else:
del page report_error()
del loop
if isinstance(renderer.exception, ParserError) and as_xhtml:
return render_html(path_to_html, width=width, height=height,
as_xhtml=False)
return renderer
def check_ebook_format(stream, current_guess): def check_ebook_format(stream, current_guess):

View File

@ -10,13 +10,11 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
import io import io
import os import os
import posixpath import posixpath
import re
from contextlib import closing from contextlib import closing
from lxml import etree from lxml import etree
from calibre import CurrentDir, walk from calibre import CurrentDir
from calibre.constants import isosx
from calibre.ebooks.metadata.opf import ( from calibre.ebooks.metadata.opf import (
get_metadata as get_metadata_from_opf, set_metadata as set_metadata_opf get_metadata as get_metadata_from_opf, set_metadata as set_metadata_opf
) )
@ -198,39 +196,6 @@ def render_cover(cpage, zf, reader=None):
cpage = os.path.join(tdir, cpage) cpage = os.path.join(tdir, cpage)
if not os.path.exists(cpage): if not os.path.exists(cpage):
return return
if isosx:
# On OS X trying to render a HTML cover which uses embedded
# fonts more than once in the same process causes a crash in Qt
# so be safe and remove the fonts as well as any @font-face
# rules
for f in walk('.'):
if os.path.splitext(f)[1].lower() in ('.ttf', '.otf'):
os.remove(f)
ffpat = re.compile(br'@font-face.*?{.*?}',
re.DOTALL|re.IGNORECASE)
with lopen(cpage, 'r+b') as f:
raw = f.read()
f.truncate(0)
f.seek(0)
raw = ffpat.sub(b'', raw)
f.write(raw)
from calibre.ebooks.chardet import xml_to_unicode
raw = xml_to_unicode(raw,
strip_encoding_pats=True, resolve_entities=True)[0]
from lxml import html
for link in html.fromstring(raw).xpath('//link'):
href = link.get('href', '')
if href:
path = os.path.join(os.path.dirname(cpage), href)
if os.path.exists(path):
with lopen(path, 'r+b') as f:
raw = f.read()
f.truncate(0)
f.seek(0)
raw = ffpat.sub(b'', raw)
f.write(raw)
return render_html_svg_workaround(cpage, default_log) return render_html_svg_workaround(cpage, default_log)

View File

@ -82,7 +82,7 @@ def read_info(outputdir, get_cover):
return ans return ans
def page_images(pdfpath, outputdir, first=1, last=1): def page_images(pdfpath, outputdir='.', first=1, last=1, image_format='jpeg', prefix='page-images'):
pdftoppm = get_tools()[1] pdftoppm = get_tools()[1]
outputdir = os.path.abspath(outputdir) outputdir = os.path.abspath(outputdir)
args = {} args = {}
@ -90,9 +90,10 @@ def page_images(pdfpath, outputdir, first=1, last=1):
import win32process as w import win32process as w
args['creationflags'] = w.HIGH_PRIORITY_CLASS | w.CREATE_NO_WINDOW args['creationflags'] = w.HIGH_PRIORITY_CLASS | w.CREATE_NO_WINDOW
try: try:
subprocess.check_call([pdftoppm, '-cropbox', '-jpeg', '-f', unicode_type(first), subprocess.check_call([
'-l', unicode_type(last), pdfpath, pdftoppm, '-cropbox', '-' + image_format, '-f', unicode_type(first),
os.path.join(outputdir, 'page-images')], **args) '-l', unicode_type(last), pdfpath, os.path.join(outputdir, prefix)
], **args)
except subprocess.CalledProcessError as e: except subprocess.CalledProcessError as e:
raise ValueError('Failed to render PDF, pdftoppm errorcode: %s'%e.returncode) raise ValueError('Failed to render PDF, pdftoppm errorcode: %s'%e.returncode)

View File

@ -0,0 +1,58 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
# License: GPL v3 Copyright: 2019, Kovid Goyal <kovid at kovidgoyal.net>
from __future__ import absolute_import, division, print_function, unicode_literals
import os
import sys
from PyQt5.Qt import QApplication, QMarginsF, QPageLayout, QPageSize, Qt, QUrl
from PyQt5.QtWebEngineWidgets import QWebEnginePage
from calibre.ebooks.metadata.pdf import page_images
from calibre.gui2 import must_use_qt
from calibre.gui2.webengine import secure_webengine
class Render(QWebEnginePage):
    '''
    A web engine page that prints itself to ``rendered.pdf`` once loading
    has finished, and then quits the application.

    The application exit code reports the outcome: 0 when the PDF was
    written, 1 when the page failed to load and 2 when printing failed.
    '''

    def __init__(self):
        QWebEnginePage.__init__(self)
        secure_webengine(self)
        # Handle the load result on a later event loop iteration rather than
        # directly from within the signal emission.
        self.loadFinished.connect(self.load_finished, type=Qt.QueuedConnection)
        self.pdfPrintingFinished.connect(self.print_finished)

    def load_finished(self, ok):
        '''Start printing after a successful load, otherwise exit with code 1.'''
        if not ok:
            QApplication.instance().exit(1)
            return
        self.start_print()

    def start_print(self):
        '''Print the page to ``rendered.pdf`` as portrait A4 with zero margins.'''
        layout = QPageLayout(
            QPageSize(QPageSize.A4), QPageLayout.Portrait, QMarginsF(0, 0, 0, 0))
        # Relative file name: the output location depends on the current
        # working directory at the time of printing.
        self.printToPdf('rendered.pdf', layout)

    def print_finished(self, path, ok):
        '''Exit the application with code 0 if printing succeeded, else 2.'''
        if ok:
            exit_code = 0
        else:
            exit_code = 2
        QApplication.instance().exit(exit_code)
def main(path_to_html, tdir, image_format='jpeg'):
    '''
    Render the HTML file at ``path_to_html`` to an image inside ``tdir``.

    The page is first printed to ``rendered.pdf`` and the first page of that
    PDF is then converted to ``rendered.<image_format>``. Note that this
    changes the current working directory of the process to ``tdir``.

    :param path_to_html: path to the HTML file, made absolute before the
        working directory is changed
    :param tdir: directory in which the PDF and the image are created
    :param image_format: ``'jpeg'`` or ``'png'``
    :return: True if the page was loaded and printed successfully
    :raises ValueError: if ``image_format`` is not supported
    '''
    import glob
    if image_format not in ('jpeg', 'png'):
        raise ValueError('Image format must be either jpeg or png')
    must_use_qt()
    path_to_html = os.path.abspath(path_to_html)
    os.chdir(tdir)
    renderer = Render()
    renderer.load(QUrl.fromLocalFile(path_to_html))
    ret = QApplication.instance().exec_()
    if ret == 0:
        page_images('rendered.pdf', image_format=image_format)
        ext = {'jpeg': 'jpg'}.get(image_format, image_format)
        # pdftoppm zero-pads the page number to the width of the document's
        # page count, so the first page of a long render is named
        # page-images-01.<ext> (or wider) rather than page-images-1.<ext>.
        # Only one page is requested, so take the first match.
        candidates = sorted(glob.glob('page-images-*.' + ext))
        # Fall back to the canonical name so that a missing image still
        # raises from os.rename() exactly as before.
        first_page = candidates[0] if candidates else 'page-images-1.' + ext
        os.rename(first_page, 'rendered.' + image_format)
    return ret == 0


if __name__ == '__main__':
    main(sys.argv[-1], '.')

View File

@ -23,6 +23,8 @@ def secure_webengine(view_or_page_or_settings, for_viewer=False):
if not for_viewer: if not for_viewer:
a(s.JavascriptEnabled, False) a(s.JavascriptEnabled, False)
s.setUnknownUrlSchemePolicy(s.DisallowUnknownUrlSchemes) s.setUnknownUrlSchemePolicy(s.DisallowUnknownUrlSchemes)
if hasattr(view_or_page_or_settings, 'setAudioMuted'):
view_or_page_or_settings.setAudioMuted(True)
a(s.JavascriptCanOpenWindows, False) a(s.JavascriptCanOpenWindows, False)
a(s.JavascriptCanAccessClipboard, False) a(s.JavascriptCanAccessClipboard, False)
# ensure javascript cannot read from local files # ensure javascript cannot read from local files