Conversion pipeline: Don't choke on HTML/CSS files that fail to parse correctly. Instead, remove them from the manifest. Preprocessing code migrated from epub layer to OEBBook.

2025-08-11 09:13:57 -04:00 · 2009-03-29 21:09:04 -07:00 · 2009-03-29 21:09:04 -07:00 · 44799e05ef
commit 44799e05ef
parent b98ada75f7
8 changed files with 242 additions and 107 deletions
--- a/session.vim
+++ b/session.vim
@ -1,5 +1,5 @@
 " Project wide builtins
-let g:pyflakes_builtins += ["dynamic_property"]
+let g:pyflakes_builtins += ["dynamic_property", '__']
 python << EOFPY
 import os
--- a/src/calibre/customize/conversion.py
+++ b/src/calibre/customize/conversion.py
@ -4,8 +4,6 @@ Defines the plugin system for conversions.
 '''
 import re, os, shutil
 from lxml import html
 from calibre import CurrentDir
 from calibre.customize import Plugin
@ -121,7 +119,7 @@ class InputFormatPlugin(Plugin):
    #: (option_name, recommended_value, recommendation_level)
    recommendations = set([])
-    def convert(self, stream, options, file_ext, parse_cache, log, accelerators):
+    def convert(self, stream, options, file_ext, log, accelerators):
        '''
        This method must be implemented in sub-classes. It must return
        the path to the created OPF file. All output should be contained in
@ -144,17 +142,6 @@ class InputFormatPlugin(Plugin):
                         is guaranteed to be one of the `file_types` supported
                         by this plugin.
        :param parse_cache:    A dictionary that maps absolute file paths to
                               parsed representations of their contents. For
                               HTML the representation is an lxml element of
                               the root of the tree. For CSS it is a cssutils
                               stylesheet. If this plugin parses any of the
                               output files, it should add them to the cache
                               so that later stages of the conversion wont
                               have to re-parse them. If a parsed representation
                               is in the cache, there is no need to actually
                               write the file to disk.
        :param log: A :class:`calibre.utils.logging.Log` object. All output
                    should use this object.
@ -165,7 +152,7 @@ class InputFormatPlugin(Plugin):
        '''
        raise NotImplementedError
-    def __call__(self, stream, options, file_ext, parse_cache, log,
+    def __call__(self, stream, options, file_ext, log,
                 accelerators, output_dir):
        log('InputFormatPlugin: %s running'%self.name, end=' ')
        if hasattr(stream, 'name'):
@ -176,33 +163,15 @@ class InputFormatPlugin(Plugin):
                shutil.rmtree(x) if os.path.isdir(x) else os.remove(x)
-            ret = self.convert(stream, options, file_ext, parse_cache,
+            ret = self.convert(stream, options, file_ext,
                               log, accelerators)
            for key in list(parse_cache.keys()):
                if os.path.abspath(key) != key:
                    log.warn(('InputFormatPlugin: %s returned a '
                             'relative path: %s')%(self.name, key)
                             )
                    parse_cache[os.path.abspath(key)] = parse_cache.pop(key)
        if options.debug_input is not None:
            options.debug_input = os.path.abspath(options.debug_input)
            if not os.path.exists(options.debug_input):
                os.makedirs(options.debug_input)
            shutil.rmtree(options.debug_input)
            for f, obj in parse_cache.items():
                if hasattr(obj, 'cssText'):
                    raw = obj.cssText
                else:
                    raw = html.tostring(obj, encoding='utf-8', method='xml',
                         include_meta_content_type=True, pretty_print=True)
                if isinstance(raw, unicode):
                    raw = raw.encode('utf-8')
                open(f, 'wb').write(raw)
            shutil.copytree('.', options.debug_input)
        return ret
--- a/src/calibre/customize/profiles.py
+++ b/src/calibre/customize/profiles.py
@ -39,7 +39,7 @@ class OutputProfile(Plugin):
    epub_flow_size            = sys.maxint
    screen_size               = None
-    remove_special_chars      = False
+    remove_special_chars      = None
    remove_object_tags        = False
 class SonyReader(OutputProfile):
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@ -8,6 +8,7 @@ import os
 from calibre.customize.conversion import OptionRecommendation
 from calibre.customize.ui import input_profiles, output_profiles, \
        plugin_for_input_format, plugin_for_output_format
 from calibre.ebooks.conversion.preprocess import HTMLPreProcessor
 class OptionValues(object):
    pass
@ -258,16 +259,17 @@ OptionRecommendation(name='language',
        # heavy lifting.
        from calibre.ebooks.oeb.reader import OEBReader
        from calibre.ebooks.oeb.base import OEBBook
-        parse_cache, accelerators = {}, {}
+        accelerators = {}
        opfpath = self.input_plugin(open(self.input, 'rb'), self.opts,
-                                    self.input_fmt, parse_cache, self.log,
+                                    self.input_fmt, self.log,
                                    accelerators)
-
+        html_preprocessor = HTMLPreProcessor()
        self.reader = OEBReader()
-        self.oeb = OEBBook(self.log, parse_cache=parse_cache)
+        self.oeb = OEBBook(self.log, html_preprocessor=html_preprocessor)
        # Read OEB Book into OEBBook
        self.reader(self.oeb, opfpath)
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@ -0,0 +1,123 @@
 #!/usr/bin/env python
 # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
 from __future__ import with_statement
 __license__   = 'GPL v3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 import re, functools
 from calibre import entity_to_unicode
 XMLDECL_RE    = re.compile(r'^\s*<[?]xml.*?[?]>')
 SVG_NS       = 'http://www.w3.org/2000/svg'
 XLINK_NS     = 'http://www.w3.org/1999/xlink'
 convert_entities = functools.partial(entity_to_unicode, exceptions=['quot', 'apos', 'lt', 'gt', 'amp'])
 # Matches a complete <span>...</span> run, across lines and in any case
 _span_pat = re.compile('<span.*?</span>', re.IGNORECASE | re.DOTALL)
 def sanitize_head(match):
    '''
    Rebuild a ``<head>`` element from a regular expression match, dropping
    every ``<span>...</span>`` run found inside it.
    :param match: A match object whose first group holds the markup found
                  between the opening and closing head tags.
    :return: ``<head>``, a newline, the cleaned content, a newline and
             ``</head>``. The returned opening tag carries no attributes.
    '''
    head_content = _span_pat.sub('', match.group(1))
    return ''.join(('<head>\n', head_content, '\n</head>'))
 class CSSPreProcessor(object):
    '''
    Callable that cleans up raw CSS text before it is parsed. The only
    transformation performed is the removal of ``@page`` rules.
    '''
    # An @page rule: everything from "@page" up to the first closing brace
    PAGE_PAT   = re.compile(r'@page[^{]*?{[^}]*?}')
    def __call__(self, data):
        '''
        :param data: The stylesheet text.
        :return: `data` with every match of :attr:`PAGE_PAT` deleted.
        '''
        return self.PAGE_PAT.sub('', data)
 class HTMLPreProcessor(object):
    '''
    Callable that repairs raw HTML markup with regular expressions before
    it is parsed.
    Each rule table below is a list of ``(compiled_pattern, replacement)``
    pairs which :meth:`__call__` applies in order via ``pattern.sub``.
    :attr:`PREPROCESS` is applied to every document. :attr:`BOOK_DESIGNER`
    or :attr:`PDFTOHTML` is applied after it when the markup is detected as
    having been produced by the corresponding tool.
    '''
    PREPROCESS = [
                  # Some HTML generators (e.g. Frontpage) put all sorts of
                  # junk into <head>. This messes up lxml
                  (re.compile(r'<head[^>]*>(.*?)</head>', re.IGNORECASE|re.DOTALL),
                   sanitize_head),
                  # Convert all entities, since lxml doesn't handle them well
                  (re.compile(r'&(\S+?);'), convert_entities),
                  # Remove the <![if/endif tags inserted by MS Word
                  (re.compile(r'</{0,1}!\[(end){0,1}if\]{0,1}>', re.IGNORECASE),
                   lambda match: ''),
                  ]
    # Fix pdftohtml markup
    PDFTOHTML  = [
                  # Replace <hr> tags with line breaks
                  (re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<br />'),
                  # Remove page numbers
                  (re.compile(r'\d+<br>', re.IGNORECASE), lambda match: ''),
                  # Replace <br><br> with <p>
                  (re.compile(r'<br.*?>\s*<br.*?>', re.IGNORECASE), lambda match: '<p>'),
                  # Drop the <br> that ends a line, unless the text before it
                  # starts with a tag or is shorter than 40 characters
                  (re.compile(r'(.*)<br.*?>', re.IGNORECASE),
                   lambda match: match.group() if \
                           re.match('<', match.group(1).lstrip()) or \
                           len(match.group(1)) < 40  else match.group(1)),
                  # Remove hyphenation
                  (re.compile(r'-\n\r?'), lambda match: ''),
                  # Remove gray background
                  (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),
                  # Replace non breaking spaces with ordinary spaces. A plain
                  # unicode literal is used as it denotes the same character
                  # as the raw form (ur'...'), which only Python 2 accepts.
                  (re.compile(u'\u00a0'), lambda match : ' '),
                  ]
    # Fix Book Designer markup
    BOOK_DESIGNER = [
                     # HR
                     (re.compile('<hr>', re.IGNORECASE),
                      lambda match : '<span style="page-break-after:always"> </span>'),
                     # Create header tags. Group 2 is the value of the align
                     # attribute (if any) and group 3 is the heading text; the
                     # text has to be captured because the replacement uses it.
                     (re.compile(r'<h2[^><]*?id=BookTitle[^><]*?(align=)*(?(1)(\w+))*[^><]*?>([^><]*?)</h2>', re.IGNORECASE),
                      lambda match : '<h1 id="BookTitle" align="%s">%s</h1>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
                     (re.compile(r'<h2[^><]*?id=BookAuthor[^><]*?(align=)*(?(1)(\w+))*[^><]*?>([^><]*?)</h2>', re.IGNORECASE),
                      lambda match : '<h2 id="BookAuthor" align="%s">%s</h2>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
                     (re.compile('<span[^><]*?id=title[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
                      lambda match : '<h2 class="title">%s</h2>'%(match.group(1),)),
                     (re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
                      lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
                     ]
    def is_baen(self, src):
        '''
        Return True if `src` contains a ``Publisher`` meta tag whose content
        mentions Baen (matched case insensitively).
        '''
        return re.search(r'<meta\s+name="Publisher"\s+content=".*?Baen.*?"',
                         src, re.IGNORECASE) is not None
    def is_book_designer(self, raw):
        '''
        Return True if `raw` contains an ``<H2>`` tag with ``id=BookTitle``.
        Unlike the rules in :attr:`BOOK_DESIGNER`, this test is case
        sensitive.
        '''
        return re.search('<H2[^><]*id=BookTitle', raw) is not None
    def is_pdftohtml(self, src):
        '''
        Return True if the marker comment written by calibre's pdftohtml
        occurs within the first 1000 characters of `src`.
        '''
        return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
    def __call__(self, html, remove_special_chars=None):
        '''
        Apply the preprocessing rules to `html` and return the result.
        :param html: The markup to clean up.
        :param remove_special_chars: If not None, an object with a
            ``sub(repl, string)`` method (such as a compiled regular
            expression). Everything it matches is deleted before any other
            processing is done.
        :return: The processed markup, with a leading XML declaration
            removed and the ``svg``/``xlink`` namespaces declared on the
            ``<html`` tag when their prefixes are used but not declared.
        '''
        if remove_special_chars is not None:
            html = remove_special_chars.sub('', html)
        # The Baen test comes first, so markup detected as Baen never gets
        # the tool specific rules, even if it would also match them.
        if self.is_baen(html):
            rules = []
        elif self.is_book_designer(html):
            rules = self.BOOK_DESIGNER
        elif self.is_pdftohtml(html):
            rules = self.PDFTOHTML
        else:
            rules = []
        for pattern, replacement in self.PREPROCESS + rules:
            html = pattern.sub(replacement, html)
        # Handle broken XHTML w/ SVG: declare the namespace of any prefix
        # that is used without being declared
        if 'svg:' in html and SVG_NS not in html:
            html = html.replace(
                '<html', '<html xmlns:svg="%s"' % SVG_NS, 1)
        if 'xlink:' in html and XLINK_NS not in html:
            html = html.replace(
                '<html', '<html xmlns:xlink="%s"' % XLINK_NS, 1)
        html = XMLDECL_RE.sub('', html)
        return html
--- a/src/calibre/ebooks/mobi/input.py
+++ b/src/calibre/ebooks/mobi/input.py
@ -12,19 +12,22 @@ class MOBIInput(InputFormatPlugin):
    description = 'Convert MOBI files (.mobi, .prc, .azw) to HTML'
    file_types  = set(['mobi', 'prc', 'azw'])
-    def convert(self, stream, options, file_ext, parse_cache, log, 
+    def convert(self, stream, options, file_ext, log,
                accelerators):
        from calibre.ebooks.mobi.reader import MobiReader
        from lxml import html
        mr = MobiReader(stream, log, options.input_encoding,
                        options.debug_input)
        parse_cache = {}
        mr.extract_content('.', parse_cache)
-        raw = parse_cache.get('calibre_raw_mobi_markup', False)
+        raw = parse_cache.pop('calibre_raw_mobi_markup', False)
        if raw:
            if isinstance(raw, unicode):
                raw = raw.encode('utf-8')
            open('debug-raw.html', 'wb').write(raw)
        for f, root in parse_cache.items():
-            if '.' in f:
+            with open(f, 'wb') as q:
-                accelerators[f] = {'pagebreaks':root.xpath(
+                q.write(html.tostring(root, encoding='utf-8', method='xml',
-                                            '//div[@class="mbp_pagebreak"]')}
+                    include_meta_content_type=False))
            accelerators['pagebreaks'] = {f: '//div[@class="mbp_pagebreak"]'}
        return mr.created_opf_path
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@ -20,6 +20,8 @@ from cssutils import CSSParser
 from calibre.translations.dynamic import translate
 from calibre.ebooks.chardet import xml_to_unicode
 from calibre.ebooks.oeb.entitydefs import ENTITYDEFS
 from calibre.ebooks.conversion.preprocess import HTMLPreProcessor, \
        CSSPreProcessor
 XML_NS       = 'http://www.w3.org/XML/1998/namespace'
 XHTML_NS     = 'http://www.w3.org/1999/xhtml'
@ -207,6 +209,10 @@ class OEBError(Exception):
    """Generic OEB-processing error."""
    pass
 class NotHTML(OEBError):
    '''Raised when a file that should be HTML (as per manifest) is not'''
    pass
 class NullContainer(object):
    """An empty container.
@ -575,14 +581,7 @@ class Manifest(object):
        def _parse_xhtml(self, data):
            # Convert to Unicode and normalize line endings
            data = self.oeb.decode(data)
-            data = XMLDECL_RE.sub('', data)
+            data = self.oeb.html_preprocessor(data)
            # Handle broken XHTML w/ SVG (ugh)
            if 'svg:' in data and SVG_NS not in data:
                data = data.replace(
                    '<html', '<html xmlns:svg="%s"' % SVG_NS, 1)
            if 'xlink:' in data and XLINK_NS not in data:
                data = data.replace(
                    '<html', '<html xmlns:xlink="%s"' % XLINK_NS, 1)
            # Try with more & more drastic measures to parse
            try:
                data = etree.fromstring(data)
@ -606,7 +605,7 @@ class Manifest(object):
                        data = etree.fromstring(data, parser=RECOVER_PARSER)
            # Force into the XHTML namespace
            if barename(data.tag) != 'html':
-                raise OEBError(
+                raise NotHTML(
                    'File %r does not appear to be (X)HTML' % self.href)
            elif not namespace(data.tag):
                data.attrib['xmlns'] = XHTML_NS
@ -659,6 +658,7 @@ class Manifest(object):
        def _parse_css(self, data):
            data = self.oeb.decode(data)
            data = self.CSSPreProcessor(data)
            data = XHTML_CSS_NAMESPACE + data
            parser = CSSParser(log=self.oeb.logger, loglevel=logging.WARNING,
                               fetcher=self._fetch_css)
@ -793,7 +793,7 @@ class Manifest(object):
        MIME type which is not one of the OPS core media types.  Either the
        item's data itself may be provided with :param:`data`, or a loader
        function for the data may be provided with :param:`loader`, or the
-        item's data may latter be set manually via the :attr:`data` attribute.
+        item's data may later be set manually via the :attr:`data` attribute.
        """
        item = self.Item(
            self.oeb, id, href, media_type, fallback, loader, data)
@ -840,6 +840,9 @@ class Manifest(object):
        for item in self.items:
            yield item
    def __len__(self):
        return len(self.items)
    def values(self):
        return list(self.items)
@ -1255,17 +1258,22 @@ class OEBBook(object):
    COVER_SVG_XP    = XPath('h:body//svg:svg[position() = 1]')
    COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]')
-    def __init__(self, logger, parse_cache={}, encoding='utf-8',
+    def __init__(self, logger,
-                 pretty_print=False):
+            html_preprocessor=HTMLPreProcessor(),
-        """Create empty book.  Optional arguments:
+            css_preprocessor=CSSPreProcessor(),
            encoding='utf-8', pretty_print=False):
        """Create empty book.  Arguments:
        :param parse_cache: A cache of parsed XHTML/CSS. Keys are absolute
            paths to the cached files and values are lxml root objects and
            cssutils stylesheets.
        :param:`encoding`: Default encoding for textual content read
            from an external container.
        :param:`pretty_print`: Whether or not the canonical string form
            of XML markup is pretty-printed.
        :param html_preprocessor: A callable that takes a unicode object
            and returns a unicode object. Will be called on all html files
            before they are parsed.
        :param css_preprocessor: A callable that takes a unicode object
            and returns a unicode object. Will be called on all CSS files
            before they are parsed.
        :param:`logger`: A Log object to use for logging all messages
            related to the processing of this book.  It is accessible
            via the instance data members :attr:`logger,log`.
@ -1286,6 +1294,8 @@ class OEBBook(object):
        """
        self.encoding = encoding
        self.html_preprocessor = html_preprocessor
        self.css_preprocessor = css_preprocessor
        self.pretty_print = pretty_print
        self.logger = self.log = logger
        self.version = '2.0'
--- a/src/calibre/ebooks/oeb/reader.py
+++ b/src/calibre/ebooks/oeb/reader.py
@ -161,10 +161,30 @@ class OEBReader(object):
            self.logger.warn('Title not specified')
            metadata.add('title', self.oeb.translate(__('Unknown')))
-    def _manifest_add_missing(self):
+    def _manifest_prune_invalid(self):
        '''
        Remove items from manifest that contain invalid data. This prevents
        catastrophic conversion failure, when a few files contain corrupted
        data.
        '''
        bad = []
        check = OEB_DOCS+OEB_STYLES
        for item in list(self.oeb.manifest.values()):
            if item.media_type in check:
                try:
                    item.data
                except:
                    self.logger.exception('Failed to parse content in %s'%
                            item.href)
                    bad.append(item)
                    self.oeb.manifest.remove(item)
        return bad
    def _manifest_add_missing(self, invalid):
        manifest = self.oeb.manifest
        known = set(manifest.hrefs)
        unchecked = set(manifest.values())
        bad = []
        while unchecked:
            new = set()
            for item in unchecked:
@ -190,6 +210,13 @@ class OEBReader(object):
            unchecked.clear()
            for href in new:
                known.add(href)
                is_invalid = False
                for item in invalid:
                    if href == item.abshref(urlnormalize(href)):
                        is_invalid = True
                        break
                if is_invalid:
                    continue
                if not self.oeb.container.exists(href):
                    self.logger.warn('Referenced file %r not found' % href)
                    continue
@ -222,7 +249,8 @@ class OEBReader(object):
                self.logger.warn(u'Duplicate manifest id %r' % id)
                id, href = manifest.generate(id, href)
            manifest.add(id, href, media_type, fallback)
-        self._manifest_add_missing()
+        invalid = self._manifest_prune_invalid()
        self._manifest_add_missing(invalid)
    def _spine_add_extra(self):
        manifest = self.oeb.manifest