Conversion pipeline: Don't choke on HTML/CSS files that fail to parse correctly. Instead remove them from the manifest. Preprocessing code migrated from epub layer to OEBBook.

This commit is contained in:
Kovid Goyal 2009-03-29 21:09:04 -07:00
parent b98ada75f7
commit 44799e05ef
8 changed files with 242 additions and 107 deletions

View File

@ -1,5 +1,5 @@
" Project wide builtins " Project wide builtins
let g:pyflakes_builtins += ["dynamic_property"] let g:pyflakes_builtins += ["dynamic_property", '__']
python << EOFPY python << EOFPY
import os import os

View File

@ -4,8 +4,6 @@ Defines the plugin system for conversions.
''' '''
import re, os, shutil import re, os, shutil
from lxml import html
from calibre import CurrentDir from calibre import CurrentDir
from calibre.customize import Plugin from calibre.customize import Plugin
@ -121,7 +119,7 @@ class InputFormatPlugin(Plugin):
#: (option_name, recommended_value, recommendation_level) #: (option_name, recommended_value, recommendation_level)
recommendations = set([]) recommendations = set([])
def convert(self, stream, options, file_ext, parse_cache, log, accelerators): def convert(self, stream, options, file_ext, log, accelerators):
''' '''
This method must be implemented in sub-classes. It must return This method must be implemented in sub-classes. It must return
the path to the created OPF file. All output should be contained in the path to the created OPF file. All output should be contained in
@ -144,17 +142,6 @@ class InputFormatPlugin(Plugin):
is guaranteed to be one of the `file_types` supported is guaranteed to be one of the `file_types` supported
by this plugin. by this plugin.
:param parse_cache: A dictionary that maps absolute file paths to
parsed representations of their contents. For
HTML the representation is an lxml element of
the root of the tree. For CSS it is a cssutils
stylesheet. If this plugin parses any of the
output files, it should add them to the cache
so that later stages of the conversion wont
have to re-parse them. If a parsed representation
is in the cache, there is no need to actually
write the file to disk.
:param log: A :class:`calibre.utils.logging.Log` object. All output :param log: A :class:`calibre.utils.logging.Log` object. All output
should use this object. should use this object.
@ -165,7 +152,7 @@ class InputFormatPlugin(Plugin):
''' '''
raise NotImplementedError raise NotImplementedError
def __call__(self, stream, options, file_ext, parse_cache, log, def __call__(self, stream, options, file_ext, log,
accelerators, output_dir): accelerators, output_dir):
log('InputFormatPlugin: %s running'%self.name, end=' ') log('InputFormatPlugin: %s running'%self.name, end=' ')
if hasattr(stream, 'name'): if hasattr(stream, 'name'):
@ -176,33 +163,15 @@ class InputFormatPlugin(Plugin):
shutil.rmtree(x) if os.path.isdir(x) else os.remove(x) shutil.rmtree(x) if os.path.isdir(x) else os.remove(x)
ret = self.convert(stream, options, file_ext, parse_cache, ret = self.convert(stream, options, file_ext,
log, accelerators) log, accelerators)
for key in list(parse_cache.keys()):
if os.path.abspath(key) != key:
log.warn(('InputFormatPlugin: %s returned a '
'relative path: %s')%(self.name, key)
)
parse_cache[os.path.abspath(key)] = parse_cache.pop(key)
if options.debug_input is not None: if options.debug_input is not None:
options.debug_input = os.path.abspath(options.debug_input) options.debug_input = os.path.abspath(options.debug_input)
if not os.path.exists(options.debug_input): if not os.path.exists(options.debug_input):
os.makedirs(options.debug_input) os.makedirs(options.debug_input)
shutil.rmtree(options.debug_input) shutil.rmtree(options.debug_input)
for f, obj in parse_cache.items():
if hasattr(obj, 'cssText'):
raw = obj.cssText
else:
raw = html.tostring(obj, encoding='utf-8', method='xml',
include_meta_content_type=True, pretty_print=True)
if isinstance(raw, unicode):
raw = raw.encode('utf-8')
open(f, 'wb').write(raw)
shutil.copytree('.', options.debug_input) shutil.copytree('.', options.debug_input)
return ret return ret

View File

@ -39,7 +39,7 @@ class OutputProfile(Plugin):
epub_flow_size = sys.maxint epub_flow_size = sys.maxint
screen_size = None screen_size = None
remove_special_chars = False remove_special_chars = None
remove_object_tags = False remove_object_tags = False
class SonyReader(OutputProfile): class SonyReader(OutputProfile):

View File

@ -8,6 +8,7 @@ import os
from calibre.customize.conversion import OptionRecommendation from calibre.customize.conversion import OptionRecommendation
from calibre.customize.ui import input_profiles, output_profiles, \ from calibre.customize.ui import input_profiles, output_profiles, \
plugin_for_input_format, plugin_for_output_format plugin_for_input_format, plugin_for_output_format
from calibre.ebooks.conversion.preprocess import HTMLPreProcessor
class OptionValues(object): class OptionValues(object):
pass pass
@ -258,16 +259,17 @@ OptionRecommendation(name='language',
# heavy lifting. # heavy lifting.
from calibre.ebooks.oeb.reader import OEBReader from calibre.ebooks.oeb.reader import OEBReader
from calibre.ebooks.oeb.base import OEBBook from calibre.ebooks.oeb.base import OEBBook
parse_cache, accelerators = {}, {} accelerators = {}
opfpath = self.input_plugin(open(self.input, 'rb'), self.opts, opfpath = self.input_plugin(open(self.input, 'rb'), self.opts,
self.input_fmt, parse_cache, self.log, self.input_fmt, self.log,
accelerators) accelerators)
html_preprocessor = HTMLPreProcessor()
self.reader = OEBReader() self.reader = OEBReader()
self.oeb = OEBBook(self.log, parse_cache=parse_cache) self.oeb = OEBBook(self.log, html_preprocessor=html_preprocessor)
# Read OEB Book into OEBBook # Read OEB Book into OEBBook
self.reader(self.oeb, opfpath) self.reader(self.oeb, opfpath)

View File

@ -0,0 +1,123 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import re, functools
from calibre import entity_to_unicode
# Matches an XML declaration (<?xml ... ?>) at the very start of a document,
# optionally preceded by whitespace. HTMLPreProcessor.__call__ strips it.
XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
# Namespace URIs that HTMLPreProcessor.__call__ declares on the <html> tag
# when the markup uses the svg:/xlink: prefixes without declaring them.
SVG_NS = 'http://www.w3.org/2000/svg'
XLINK_NS = 'http://www.w3.org/1999/xlink'
# entity_to_unicode pre-bound with an `exceptions` list naming the five XML
# predefined entities; used as the replacement callback of the entity rule in
# HTMLPreProcessor.PREPROCESS.
# NOTE(review): presumably the listed entities are left unconverted so the
# markup stays well-formed -- confirm against calibre.entity_to_unicode.
convert_entities = functools.partial(entity_to_unicode, exceptions=['quot', 'apos', 'lt', 'gt', 'amp'])
# Matches a complete <span>...</span> element, including ones that span lines.
_span_pat = re.compile('<span.*?</span>', re.DOTALL|re.IGNORECASE)


def sanitize_head(match):
    '''
    Regex substitution callback that rebuilds a ``<head>`` element.

    :param match: A match object whose first group holds the contents of
                  the ``<head>`` element.
    :return: A plain ``<head>`` element wrapping those contents with every
             ``<span>`` element removed.
    '''
    contents = _span_pat.sub('', match.group(1))
    return ''.join(('<head>\n', contents, '\n</head>'))
class CSSPreProcessor(object):
    '''
    Callable that cleans up raw CSS text before it is handed to the CSS
    parser.
    '''

    # Matches a whole @page rule: the at-keyword, anything up to the opening
    # brace and everything up to the first closing brace.
    PAGE_PAT = re.compile(r'@page[^{]*?{[^}]*?}')

    def __call__(self, data):
        '''
        Return `data` with every ``@page`` rule removed.

        :param data: The CSS source text.
        '''
        return re.sub(self.PAGE_PAT, '', data)
class HTMLPreProcessor(object):
    '''
    Regex based clean-up of raw HTML markup, run before the markup is parsed.

    Instances are callable: ``preprocessor(html)`` returns the cleaned
    markup. The rules in :attr:`PREPROCESS` are always applied; the rules in
    :attr:`BOOK_DESIGNER` or :attr:`PDFTOHTML` are appended when the markup
    is recognised as coming from the corresponding generator.

    Every rule is a ``(compiled pattern, replacement)`` pair that is applied
    with ``pattern.sub(replacement, html)``, in list order.
    '''

    PREPROCESS = [
        # Some idiotic HTML generators (Frontpage I'm looking at you)
        # Put all sorts of crap into <head>. This messes up lxml
        (re.compile(r'<head[^>]*>(.*?)</head>', re.IGNORECASE|re.DOTALL),
         sanitize_head),
        # Convert all entities, since lxml doesn't handle them well
        (re.compile(r'&(\S+?);'), convert_entities),
        # Remove the <![if/endif tags inserted by everybody's darling, MS Word
        (re.compile(r'</{0,1}!\[(end){0,1}if\]{0,1}>', re.IGNORECASE),
         lambda match: ''),
    ]

    # Fix pdftohtml markup
    PDFTOHTML = [
        # Replace <hr> tags with line breaks
        (re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<br />'),
        # Remove page numbers
        (re.compile(r'\d+<br>', re.IGNORECASE), lambda match: ''),
        # Replace <br><br> with <p>
        (re.compile(r'<br.*?>\s*<br.*?>', re.IGNORECASE), lambda match: '<p>'),
        # Drop the <br> that ends a line, unless the text before it starts
        # with a tag or is shorter than 40 characters, in which case the
        # line break is kept as is.
        (re.compile(r'(.*)<br.*?>', re.IGNORECASE),
         lambda match: match.group() if \
                   re.match('<', match.group(1).lstrip()) or \
                   len(match.group(1)) < 40 else match.group(1)),
        # Remove hyphenation
        (re.compile(r'-\n\r?'), lambda match: ''),
        # Remove gray background
        (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),
        # Remove non breaking spaces. Written as u'...' rather than ur'...':
        # both denote the single character U+00A0 on Python 2, but the ur
        # prefix is a syntax error on Python 3.
        (re.compile(u'\u00a0'), lambda match : ' '),
    ]

    # Fix Book Designer markup
    BOOK_DESIGNER = [
        # HR
        (re.compile('<hr>', re.IGNORECASE),
         lambda match : '<span style="page-break-after:always"> </span>'),
        # Create header tags. Group 1 is the (optional) unquoted align value
        # that follows the id attribute, group 2 is the heading text. The
        # heading text must be captured explicitly: the replacement needs it
        # and referencing a group the pattern does not define raises
        # IndexError.
        (re.compile(r'<h2[^><]*?id=BookTitle(?:[^><]*?align=(\w+))?[^><]*?>([^><]*?)</h2>', re.IGNORECASE),
         lambda match : '<h1 id="BookTitle" align="%s">%s</h1>'%(match.group(1) or 'center', match.group(2))),
        (re.compile(r'<h2[^><]*?id=BookAuthor(?:[^><]*?align=(\w+))?[^><]*?>([^><]*?)</h2>', re.IGNORECASE),
         lambda match : '<h2 id="BookAuthor" align="%s">%s</h2>'%(match.group(1) or 'center', match.group(2))),
        (re.compile('<span[^><]*?id=title[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
         lambda match : '<h2 class="title">%s</h2>'%(match.group(1),)),
        (re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
         lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
    ]

    # Publisher <meta> tag whose content mentions Baen. Compiled once here
    # instead of on every is_baen() call.
    _BAEN_PAT = re.compile(r'<meta\s+name="Publisher"\s+content=".*?Baen.*?"',
                           re.IGNORECASE)

    def is_baen(self, src):
        '''Return True if `src` has a Publisher <meta> tag mentioning Baen.'''
        return self._BAEN_PAT.search(src) is not None

    def is_book_designer(self, raw):
        '''Return True if `raw` contains an <H2> tag with id=BookTitle.'''
        return re.search('<H2[^><]*id=BookTitle', raw) is not None

    def is_pdftohtml(self, src):
        '''
        Return True if the pdftohtml marker comment occurs within the first
        1000 characters of `src`.
        '''
        return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]

    def __call__(self, html, remove_special_chars=None):
        '''
        Return a cleaned up version of `html`.

        :param html: The markup to clean, as a string.
        :param remove_special_chars: Optional compiled pattern; every match
                                     of it is deleted from `html` before any
                                     other processing. Ignored when None.
        '''
        if remove_special_chars is not None:
            html = remove_special_chars.sub('', html)

        # The Baen check comes first, so markup from Baen never gets the
        # generator specific rules, whatever else it looks like.
        if self.is_baen(html):
            rules = []
        elif self.is_book_designer(html):
            rules = self.BOOK_DESIGNER
        elif self.is_pdftohtml(html):
            rules = self.PDFTOHTML
        else:
            rules = []

        for pattern, replacement in self.PREPROCESS + rules:
            html = pattern.sub(replacement, html)

        # Handle broken XHTML w/ SVG (ugh): declare the svg/xlink namespaces
        # on the first <html tag when the prefixes are used but undeclared.
        if 'svg:' in html and SVG_NS not in html:
            html = html.replace(
                '<html', '<html xmlns:svg="%s"' % SVG_NS, 1)
        if 'xlink:' in html and XLINK_NS not in html:
            html = html.replace(
                '<html', '<html xmlns:xlink="%s"' % XLINK_NS, 1)

        # Strip a leading XML declaration
        html = XMLDECL_RE.sub('', html)

        return html

View File

@ -12,19 +12,22 @@ class MOBIInput(InputFormatPlugin):
description = 'Convert MOBI files (.mobi, .prc, .azw) to HTML' description = 'Convert MOBI files (.mobi, .prc, .azw) to HTML'
file_types = set(['mobi', 'prc', 'azw']) file_types = set(['mobi', 'prc', 'azw'])
def convert(self, stream, options, file_ext, parse_cache, log, def convert(self, stream, options, file_ext, log,
accelerators): accelerators):
from calibre.ebooks.mobi.reader import MobiReader from calibre.ebooks.mobi.reader import MobiReader
from lxml import html
mr = MobiReader(stream, log, options.input_encoding, mr = MobiReader(stream, log, options.input_encoding,
options.debug_input) options.debug_input)
parse_cache = {}
mr.extract_content('.', parse_cache) mr.extract_content('.', parse_cache)
raw = parse_cache.get('calibre_raw_mobi_markup', False) raw = parse_cache.pop('calibre_raw_mobi_markup', False)
if raw: if raw:
if isinstance(raw, unicode): if isinstance(raw, unicode):
raw = raw.encode('utf-8') raw = raw.encode('utf-8')
open('debug-raw.html', 'wb').write(raw) open('debug-raw.html', 'wb').write(raw)
for f, root in parse_cache.items(): for f, root in parse_cache.items():
if '.' in f: with open(f, 'wb') as q:
accelerators[f] = {'pagebreaks':root.xpath( q.write(html.tostring(root, encoding='utf-8', method='xml',
'//div[@class="mbp_pagebreak"]')} include_meta_content_type=False))
accelerators['pagebreaks'] = {f: '//div[@class="mbp_pagebreak"]'}
return mr.created_opf_path return mr.created_opf_path

View File

@ -20,6 +20,8 @@ from cssutils import CSSParser
from calibre.translations.dynamic import translate from calibre.translations.dynamic import translate
from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.chardet import xml_to_unicode
from calibre.ebooks.oeb.entitydefs import ENTITYDEFS from calibre.ebooks.oeb.entitydefs import ENTITYDEFS
from calibre.ebooks.conversion.preprocess import HTMLPreProcessor, \
CSSPreProcessor
XML_NS = 'http://www.w3.org/XML/1998/namespace' XML_NS = 'http://www.w3.org/XML/1998/namespace'
XHTML_NS = 'http://www.w3.org/1999/xhtml' XHTML_NS = 'http://www.w3.org/1999/xhtml'
@ -207,6 +209,10 @@ class OEBError(Exception):
"""Generic OEB-processing error.""" """Generic OEB-processing error."""
pass pass
class NotHTML(OEBError):
'''Raised when a file that should be HTML (as per manifest) is not'''
pass
class NullContainer(object): class NullContainer(object):
"""An empty container. """An empty container.
@ -575,14 +581,7 @@ class Manifest(object):
def _parse_xhtml(self, data): def _parse_xhtml(self, data):
# Convert to Unicode and normalize line endings # Convert to Unicode and normalize line endings
data = self.oeb.decode(data) data = self.oeb.decode(data)
data = XMLDECL_RE.sub('', data) data = self.oeb.html_preprocessor(data)
# Handle broken XHTML w/ SVG (ugh)
if 'svg:' in data and SVG_NS not in data:
data = data.replace(
'<html', '<html xmlns:svg="%s"' % SVG_NS, 1)
if 'xlink:' in data and XLINK_NS not in data:
data = data.replace(
'<html', '<html xmlns:xlink="%s"' % XLINK_NS, 1)
# Try with more & more drastic measures to parse # Try with more & more drastic measures to parse
try: try:
data = etree.fromstring(data) data = etree.fromstring(data)
@ -606,7 +605,7 @@ class Manifest(object):
data = etree.fromstring(data, parser=RECOVER_PARSER) data = etree.fromstring(data, parser=RECOVER_PARSER)
# Force into the XHTML namespace # Force into the XHTML namespace
if barename(data.tag) != 'html': if barename(data.tag) != 'html':
raise OEBError( raise NotHTML(
'File %r does not appear to be (X)HTML' % self.href) 'File %r does not appear to be (X)HTML' % self.href)
elif not namespace(data.tag): elif not namespace(data.tag):
data.attrib['xmlns'] = XHTML_NS data.attrib['xmlns'] = XHTML_NS
@ -659,6 +658,7 @@ class Manifest(object):
def _parse_css(self, data): def _parse_css(self, data):
data = self.oeb.decode(data) data = self.oeb.decode(data)
data = self.CSSPreProcessor(data)
data = XHTML_CSS_NAMESPACE + data data = XHTML_CSS_NAMESPACE + data
parser = CSSParser(log=self.oeb.logger, loglevel=logging.WARNING, parser = CSSParser(log=self.oeb.logger, loglevel=logging.WARNING,
fetcher=self._fetch_css) fetcher=self._fetch_css)
@ -793,7 +793,7 @@ class Manifest(object):
MIME type which is not one of the OPS core media types. Either the MIME type which is not one of the OPS core media types. Either the
item's data itself may be provided with :param:`data`, or a loader item's data itself may be provided with :param:`data`, or a loader
function for the data may be provided with :param:`loader`, or the function for the data may be provided with :param:`loader`, or the
item's data may latter be set manually via the :attr:`data` attribute. item's data may later be set manually via the :attr:`data` attribute.
""" """
item = self.Item( item = self.Item(
self.oeb, id, href, media_type, fallback, loader, data) self.oeb, id, href, media_type, fallback, loader, data)
@ -840,6 +840,9 @@ class Manifest(object):
for item in self.items: for item in self.items:
yield item yield item
def __len__(self):
return len(self.items)
def values(self): def values(self):
return list(self.items) return list(self.items)
@ -1255,17 +1258,22 @@ class OEBBook(object):
COVER_SVG_XP = XPath('h:body//svg:svg[position() = 1]') COVER_SVG_XP = XPath('h:body//svg:svg[position() = 1]')
COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]') COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]')
def __init__(self, logger, parse_cache={}, encoding='utf-8', def __init__(self, logger,
pretty_print=False): html_preprocessor=HTMLPreProcessor(),
"""Create empty book. Optional arguments: css_preprocessor=CSSPreProcessor(),
encoding='utf-8', pretty_print=False):
"""Create empty book. Arguments:
:param parse_cache: A cache of parsed XHTML/CSS. Keys are absolute
paths to the cached files and values are lxml root objects and
cssutils stylesheets.
:param:`encoding`: Default encoding for textual content read :param:`encoding`: Default encoding for textual content read
from an external container. from an external container.
:param:`pretty_print`: Whether or not the canonical string form :param:`pretty_print`: Whether or not the canonical string form
of XML markup is pretty-printed. of XML markup is pretty-printed.
:param html_preprocessor: A callable that takes a unicode object
and returns a unicode object. Will be called on all html files
before they are parsed.
:param css_preprocessor: A callable that takes a unicode object
and returns a unicode object. Will be called on all CSS files
before they are parsed.
:param:`logger`: A Log object to use for logging all messages :param:`logger`: A Log object to use for logging all messages
related to the processing of this book. It is accessible related to the processing of this book. It is accessible
via the instance data members :attr:`logger,log`. via the instance data members :attr:`logger,log`.
@ -1286,6 +1294,8 @@ class OEBBook(object):
""" """
self.encoding = encoding self.encoding = encoding
self.html_preprocessor = html_preprocessor
self.css_preprocessor = css_preprocessor
self.pretty_print = pretty_print self.pretty_print = pretty_print
self.logger = self.log = logger self.logger = self.log = logger
self.version = '2.0' self.version = '2.0'

View File

@ -161,10 +161,30 @@ class OEBReader(object):
self.logger.warn('Title not specified') self.logger.warn('Title not specified')
metadata.add('title', self.oeb.translate(__('Unknown'))) metadata.add('title', self.oeb.translate(__('Unknown')))
def _manifest_add_missing(self): def _manifest_prune_invalid(self):
'''
Remove items from manifest that contain invalid data. This prevents
catastrophic conversion failure, when a few files contain corrupted
data.
'''
bad = []
check = OEB_DOCS+OEB_STYLES
for item in list(self.oeb.manifest.values()):
if item.media_type in check:
try:
item.data
except:
self.logger.exception('Failed to parse content in %s'%
item.href)
bad.append(item)
self.oeb.manifest.remove(item)
return bad
def _manifest_add_missing(self, invalid):
manifest = self.oeb.manifest manifest = self.oeb.manifest
known = set(manifest.hrefs) known = set(manifest.hrefs)
unchecked = set(manifest.values()) unchecked = set(manifest.values())
bad = []
while unchecked: while unchecked:
new = set() new = set()
for item in unchecked: for item in unchecked:
@ -190,6 +210,13 @@ class OEBReader(object):
unchecked.clear() unchecked.clear()
for href in new: for href in new:
known.add(href) known.add(href)
is_invalid = False
for item in invalid:
if href == item.abshref(urlnormalize(href)):
is_invalid = True
break
if is_invalid:
continue
if not self.oeb.container.exists(href): if not self.oeb.container.exists(href):
self.logger.warn('Referenced file %r not found' % href) self.logger.warn('Referenced file %r not found' % href)
continue continue
@ -222,7 +249,8 @@ class OEBReader(object):
self.logger.warn(u'Duplicate manifest id %r' % id) self.logger.warn(u'Duplicate manifest id %r' % id)
id, href = manifest.generate(id, href) id, href = manifest.generate(id, href)
manifest.add(id, href, media_type, fallback) manifest.add(id, href, media_type, fallback)
self._manifest_add_missing() invalid = self._manifest_prune_invalid()
self._manifest_add_missing(invalid)
def _spine_add_extra(self): def _spine_add_extra(self):
manifest = self.oeb.manifest manifest = self.oeb.manifest