Conversion pipeline: Don't choke on HTML/CSS files that fail to parse correctly. Instead, remove them from the manifest. Preprocessing code migrated from the epub layer to OEBBook.

This commit is contained in:
Kovid Goyal 2009-03-29 21:09:04 -07:00
parent b98ada75f7
commit 44799e05ef
8 changed files with 242 additions and 107 deletions

View File

@ -1,5 +1,5 @@
" Project wide builtins
let g:pyflakes_builtins += ["dynamic_property"]
let g:pyflakes_builtins += ["dynamic_property", '__']
python << EOFPY
import os

View File

@ -4,8 +4,6 @@ Defines the plugin system for conversions.
'''
import re, os, shutil
from lxml import html
from calibre import CurrentDir
from calibre.customize import Plugin
@ -121,7 +119,7 @@ class InputFormatPlugin(Plugin):
#: (option_name, recommended_value, recommendation_level)
recommendations = set([])
def convert(self, stream, options, file_ext, parse_cache, log, accelerators):
def convert(self, stream, options, file_ext, log, accelerators):
'''
This method must be implemented in sub-classes. It must return
the path to the created OPF file. All output should be contained in
@ -144,17 +142,6 @@ class InputFormatPlugin(Plugin):
is guaranteed to be one of the `file_types` supported
by this plugin.
:param parse_cache: A dictionary that maps absolute file paths to
parsed representations of their contents. For
HTML the representation is an lxml element of
the root of the tree. For CSS it is a cssutils
stylesheet. If this plugin parses any of the
output files, it should add them to the cache
so that later stages of the conversion wont
have to re-parse them. If a parsed representation
is in the cache, there is no need to actually
write the file to disk.
:param log: A :class:`calibre.utils.logging.Log` object. All output
should use this object.
@ -165,7 +152,7 @@ class InputFormatPlugin(Plugin):
'''
raise NotImplementedError
def __call__(self, stream, options, file_ext, parse_cache, log,
def __call__(self, stream, options, file_ext, log,
accelerators, output_dir):
log('InputFormatPlugin: %s running'%self.name, end=' ')
if hasattr(stream, 'name'):
@ -176,33 +163,15 @@ class InputFormatPlugin(Plugin):
shutil.rmtree(x) if os.path.isdir(x) else os.remove(x)
ret = self.convert(stream, options, file_ext, parse_cache,
ret = self.convert(stream, options, file_ext,
log, accelerators)
for key in list(parse_cache.keys()):
if os.path.abspath(key) != key:
log.warn(('InputFormatPlugin: %s returned a '
'relative path: %s')%(self.name, key)
)
parse_cache[os.path.abspath(key)] = parse_cache.pop(key)
if options.debug_input is not None:
options.debug_input = os.path.abspath(options.debug_input)
if not os.path.exists(options.debug_input):
os.makedirs(options.debug_input)
shutil.rmtree(options.debug_input)
for f, obj in parse_cache.items():
if hasattr(obj, 'cssText'):
raw = obj.cssText
else:
raw = html.tostring(obj, encoding='utf-8', method='xml',
include_meta_content_type=True, pretty_print=True)
if isinstance(raw, unicode):
raw = raw.encode('utf-8')
open(f, 'wb').write(raw)
shutil.copytree('.', options.debug_input)
return ret

View File

@ -39,7 +39,7 @@ class OutputProfile(Plugin):
epub_flow_size = sys.maxint
screen_size = None
remove_special_chars = False
remove_special_chars = None
remove_object_tags = False
class SonyReader(OutputProfile):

View File

@ -8,6 +8,7 @@ import os
from calibre.customize.conversion import OptionRecommendation
from calibre.customize.ui import input_profiles, output_profiles, \
plugin_for_input_format, plugin_for_output_format
from calibre.ebooks.conversion.preprocess import HTMLPreProcessor
class OptionValues(object):
pass
@ -258,16 +259,17 @@ OptionRecommendation(name='language',
# heavy lifting.
from calibre.ebooks.oeb.reader import OEBReader
from calibre.ebooks.oeb.base import OEBBook
parse_cache, accelerators = {}, {}
accelerators = {}
opfpath = self.input_plugin(open(self.input, 'rb'), self.opts,
self.input_fmt, parse_cache, self.log,
self.input_fmt, self.log,
accelerators)
html_preprocessor = HTMLPreProcessor()
self.reader = OEBReader()
self.oeb = OEBBook(self.log, parse_cache=parse_cache)
self.oeb = OEBBook(self.log, html_preprocessor=html_preprocessor)
# Read OEB Book into OEBBook
self.reader(self.oeb, opfpath)

View File

@ -0,0 +1,123 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import re, functools
from calibre import entity_to_unicode
XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
SVG_NS = 'http://www.w3.org/2000/svg'
XLINK_NS = 'http://www.w3.org/1999/xlink'
convert_entities = functools.partial(entity_to_unicode, exceptions=['quot', 'apos', 'lt', 'gt', 'amp'])
# Matches a complete <span>...</span> run, including ones spanning lines
_span_pat = re.compile('<span.*?</span>', re.DOTALL|re.IGNORECASE)


def sanitize_head(match):
    '''
    Substitution callback that rebuilds a ``<head>`` element.

    :param match: A regular expression match object whose first group holds
                  the markup found between the opening and closing head tags.
    :return: A new ``<head>`` element containing that markup with every
             ``<span>...</span>`` run removed.
    '''
    contents = _span_pat.sub('', match.group(1))
    return '\n'.join(('<head>', contents, '</head>'))
class CSSPreProcessor(object):
    '''
    Callable that cleans up CSS text before it is parsed.
    '''

    # Matches a whole @page rule, from the keyword up to its closing brace
    PAGE_PAT = re.compile(r'@page[^{]*?{[^}]*?}')

    def __call__(self, data):
        '''
        Return `data` with every @page rule stripped out.

        :param data: The CSS text to clean.
        '''
        return self.PAGE_PAT.sub('', data)
class HTMLPreProcessor(object):
    '''
    Callable that cleans up raw HTML markup before it is parsed.

    The rules in :attr:`PREPROCESS` are always applied. Depending on which
    tool appears to have generated the markup, the rules in
    :attr:`BOOK_DESIGNER` or :attr:`PDFTOHTML` are applied as well. Every
    rule is a ``(compiled pattern, replacement)`` pair that is passed to the
    pattern's ``sub`` method.
    '''

    PREPROCESS = [
        # Some HTML generators (Frontpage in particular) put all sorts of
        # junk into <head>. This messes up lxml
        (re.compile(r'<head[^>]*>(.*?)</head>', re.IGNORECASE|re.DOTALL),
         sanitize_head),
        # Convert all entities, since lxml doesn't handle them well
        (re.compile(r'&(\S+?);'), convert_entities),
        # Remove the <![if/endif tags inserted by MS Word
        (re.compile(r'</{0,1}!\[(end){0,1}if\]{0,1}>', re.IGNORECASE),
         lambda match: ''),
    ]

    # Fix pdftohtml markup
    PDFTOHTML = [
        # Replace <hr> tags with line breaks
        (re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<br />'),
        # Remove page numbers
        (re.compile(r'\d+<br>', re.IGNORECASE), lambda match: ''),
        # Replace <br><br> with <p>
        (re.compile(r'<br.*?>\s*<br.*?>', re.IGNORECASE),
         lambda match: '<p>'),
        # Drop the <br> that ends a line, unless the line starts with a tag
        # or the text before the <br> is shorter than 40 characters
        (re.compile(r'(.*)<br.*?>', re.IGNORECASE),
         lambda match: match.group() if
             re.match('<', match.group(1).lstrip()) or
             len(match.group(1)) < 40 else match.group(1)),
        # Remove hyphenation
        (re.compile(r'-\n\r?'), lambda match: ''),
        # Remove gray background
        (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),
        # Replace non breaking spaces with ordinary spaces. A plain unicode
        # literal is used because the ur'' prefix is not valid in Python 3
        # and denotes the same character in Python 2.
        (re.compile(u'\u00a0'), lambda match : ' '),
    ]

    # Fix Book Designer markup
    BOOK_DESIGNER = [
        # HR
        (re.compile('<hr>', re.IGNORECASE),
         lambda match : '<span style="page-break-after:always"> </span>'),
        # Create header tags. Group 2 is the alignment (when captured) and
        # group 3 is the text of the heading; the replacement needs both,
        # so the heading text must be captured by the pattern.
        (re.compile(r'<h2[^><]*?id=BookTitle[^><]*?(align=)*(?(1)(\w+))*[^><]*?>([^><]*?)</h2>', re.IGNORECASE),
         lambda match : '<h1 id="BookTitle" align="%s">%s</h1>'%(
             match.group(2) if match.group(2) else 'center',
             match.group(3))),
        (re.compile(r'<h2[^><]*?id=BookAuthor[^><]*?(align=)*(?(1)(\w+))*[^><]*?>([^><]*?)</h2>', re.IGNORECASE),
         lambda match : '<h2 id="BookAuthor" align="%s">%s</h2>'%(
             match.group(2) if match.group(2) else 'center',
             match.group(3))),
        (re.compile('<span[^><]*?id=title[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
         lambda match : '<h2 class="title">%s</h2>'%(match.group(1),)),
        (re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
         lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
    ]

    def is_baen(self, src):
        '''
        Return True if `src` has a Publisher <meta> tag whose content
        mentions Baen.
        '''
        return re.compile(r'<meta\s+name="Publisher"\s+content=".*?Baen.*?"',
                          re.IGNORECASE).search(src) is not None

    def is_book_designer(self, raw):
        '''
        Return True if `raw` contains an <H2> tag with id=BookTitle. The
        check is case sensitive.
        '''
        return re.search('<H2[^><]*id=BookTitle', raw) is not None

    def is_pdftohtml(self, src):
        '''
        Return True if the first 1000 characters of `src` contain the
        marker comment left by calibre's pdftohtml.
        '''
        return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]

    def __call__(self, html, remove_special_chars=None):
        '''
        Return a cleaned up copy of `html`.

        :param html: The markup to clean.
        :param remove_special_chars: Either None or an object with a
            ``sub`` method (such as a compiled regular expression). When
            given, everything it matches is deleted before any other
            processing takes place.
        '''
        if remove_special_chars is not None:
            html = remove_special_chars.sub('', html)
        if self.is_baen(html):
            # Baen markup only gets the generic rules
            rules = []
        elif self.is_book_designer(html):
            rules = self.BOOK_DESIGNER
        elif self.is_pdftohtml(html):
            rules = self.PDFTOHTML
        else:
            rules = []
        for pattern, replacement in self.PREPROCESS + rules:
            html = pattern.sub(replacement, html)

        # Handle broken XHTML w/ SVG: declare the svg/xlink namespace
        # prefixes on the root element when they are used but not declared
        if 'svg:' in html and SVG_NS not in html:
            html = html.replace(
                '<html', '<html xmlns:svg="%s"' % SVG_NS, 1)
        if 'xlink:' in html and XLINK_NS not in html:
            html = html.replace(
                '<html', '<html xmlns:xlink="%s"' % XLINK_NS, 1)

        # Strip a leading XML declaration
        html = XMLDECL_RE.sub('', html)

        return html

View File

@ -12,19 +12,22 @@ class MOBIInput(InputFormatPlugin):
description = 'Convert MOBI files (.mobi, .prc, .azw) to HTML'
file_types = set(['mobi', 'prc', 'azw'])
def convert(self, stream, options, file_ext, parse_cache, log,
def convert(self, stream, options, file_ext, log,
accelerators):
from calibre.ebooks.mobi.reader import MobiReader
from lxml import html
mr = MobiReader(stream, log, options.input_encoding,
options.debug_input)
parse_cache = {}
mr.extract_content('.', parse_cache)
raw = parse_cache.get('calibre_raw_mobi_markup', False)
raw = parse_cache.pop('calibre_raw_mobi_markup', False)
if raw:
if isinstance(raw, unicode):
raw = raw.encode('utf-8')
open('debug-raw.html', 'wb').write(raw)
for f, root in parse_cache.items():
if '.' in f:
accelerators[f] = {'pagebreaks':root.xpath(
'//div[@class="mbp_pagebreak"]')}
with open(f, 'wb') as q:
q.write(html.tostring(root, encoding='utf-8', method='xml',
include_meta_content_type=False))
accelerators['pagebreaks'] = {f: '//div[@class="mbp_pagebreak"]'}
return mr.created_opf_path

View File

@ -20,6 +20,8 @@ from cssutils import CSSParser
from calibre.translations.dynamic import translate
from calibre.ebooks.chardet import xml_to_unicode
from calibre.ebooks.oeb.entitydefs import ENTITYDEFS
from calibre.ebooks.conversion.preprocess import HTMLPreProcessor, \
CSSPreProcessor
XML_NS = 'http://www.w3.org/XML/1998/namespace'
XHTML_NS = 'http://www.w3.org/1999/xhtml'
@ -207,6 +209,10 @@ class OEBError(Exception):
"""Generic OEB-processing error."""
pass
class NotHTML(OEBError):
    '''
    Signals that a file the manifest declares to be HTML does not actually
    contain HTML.
    '''
class NullContainer(object):
"""An empty container.
@ -575,14 +581,7 @@ class Manifest(object):
def _parse_xhtml(self, data):
# Convert to Unicode and normalize line endings
data = self.oeb.decode(data)
data = XMLDECL_RE.sub('', data)
# Handle broken XHTML w/ SVG (ugh)
if 'svg:' in data and SVG_NS not in data:
data = data.replace(
'<html', '<html xmlns:svg="%s"' % SVG_NS, 1)
if 'xlink:' in data and XLINK_NS not in data:
data = data.replace(
'<html', '<html xmlns:xlink="%s"' % XLINK_NS, 1)
data = self.oeb.html_preprocessor(data)
# Try with more & more drastic measures to parse
try:
data = etree.fromstring(data)
@ -606,7 +605,7 @@ class Manifest(object):
data = etree.fromstring(data, parser=RECOVER_PARSER)
# Force into the XHTML namespace
if barename(data.tag) != 'html':
raise OEBError(
raise NotHTML(
'File %r does not appear to be (X)HTML' % self.href)
elif not namespace(data.tag):
data.attrib['xmlns'] = XHTML_NS
@ -659,6 +658,7 @@ class Manifest(object):
def _parse_css(self, data):
data = self.oeb.decode(data)
data = self.CSSPreProcessor(data)
data = XHTML_CSS_NAMESPACE + data
parser = CSSParser(log=self.oeb.logger, loglevel=logging.WARNING,
fetcher=self._fetch_css)
@ -793,7 +793,7 @@ class Manifest(object):
MIME type which is not one of the OPS core media types. Either the
item's data itself may be provided with :param:`data`, or a loader
function for the data may be provided with :param:`loader`, or the
item's data may latter be set manually via the :attr:`data` attribute.
item's data may later be set manually via the :attr:`data` attribute.
"""
item = self.Item(
self.oeb, id, href, media_type, fallback, loader, data)
@ -840,6 +840,9 @@ class Manifest(object):
for item in self.items:
yield item
def __len__(self):
return len(self.items)
def values(self):
return list(self.items)
@ -1255,17 +1258,22 @@ class OEBBook(object):
COVER_SVG_XP = XPath('h:body//svg:svg[position() = 1]')
COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]')
def __init__(self, logger, parse_cache={}, encoding='utf-8',
pretty_print=False):
"""Create empty book. Optional arguments:
def __init__(self, logger,
html_preprocessor=HTMLPreProcessor(),
css_preprocessor=CSSPreProcessor(),
encoding='utf-8', pretty_print=False):
"""Create empty book. Arguments:
:param parse_cache: A cache of parsed XHTML/CSS. Keys are absolute
paths to the cached files and values are lxml root objects and
cssutils stylesheets.
:param:`encoding`: Default encoding for textual content read
from an external container.
:param:`pretty_print`: Whether or not the canonical string form
of XML markup is pretty-printed.
:param html_preprocessor: A callable that takes a unicode object
and returns a unicode object. Will be called on all html files
before they are parsed.
:param css_preprocessor: A callable that takes a unicode object
and returns a unicode object. Will be called on all CSS files
before they are parsed.
:param:`logger`: A Log object to use for logging all messages
related to the processing of this book. It is accessible
via the instance data members :attr:`logger,log`.
@ -1286,6 +1294,8 @@ class OEBBook(object):
"""
self.encoding = encoding
self.html_preprocessor = html_preprocessor
self.css_preprocessor = css_preprocessor
self.pretty_print = pretty_print
self.logger = self.log = logger
self.version = '2.0'

View File

@ -161,10 +161,30 @@ class OEBReader(object):
self.logger.warn('Title not specified')
metadata.add('title', self.oeb.translate(__('Unknown')))
def _manifest_add_missing(self):
def _manifest_prune_invalid(self):
    '''
    Remove items from manifest that contain invalid data. This prevents
    catastrophic conversion failure, when a few files contain corrupted
    data.

    Only items whose media type is in OEB_DOCS or OEB_STYLES are checked.

    :return: The list of items that were removed from the manifest.
    '''
    bad = []
    check = OEB_DOCS+OEB_STYLES
    # Iterate over a copy, since items are removed from the manifest
    # while looping
    for item in list(self.oeb.manifest.values()):
        if item.media_type not in check:
            continue
        try:
            # Accessing the attribute is the check itself: any error it
            # raises marks the item as bad
            item.data
        except Exception:
            # Deliberately broad, as any failure means the item is unusable.
            # Not a bare except, so that KeyboardInterrupt and SystemExit
            # still propagate instead of being logged as a bad file.
            self.logger.exception('Failed to parse content in %s'%
                    item.href)
            bad.append(item)
            self.oeb.manifest.remove(item)
    return bad
def _manifest_add_missing(self, invalid):
manifest = self.oeb.manifest
known = set(manifest.hrefs)
unchecked = set(manifest.values())
bad = []
while unchecked:
new = set()
for item in unchecked:
@ -190,6 +210,13 @@ class OEBReader(object):
unchecked.clear()
for href in new:
known.add(href)
is_invalid = False
for item in invalid:
if href == item.abshref(urlnormalize(href)):
is_invalid = True
break
if is_invalid:
continue
if not self.oeb.container.exists(href):
self.logger.warn('Referenced file %r not found' % href)
continue
@ -222,7 +249,8 @@ class OEBReader(object):
self.logger.warn(u'Duplicate manifest id %r' % id)
id, href = manifest.generate(id, href)
manifest.add(id, href, media_type, fallback)
self._manifest_add_missing()
invalid = self._manifest_prune_invalid()
self._manifest_add_missing(invalid)
def _spine_add_extra(self):
manifest = self.oeb.manifest