From 44799e05efc6a4696f98a8fcf4f7350876427bb2 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 29 Mar 2009 21:09:04 -0700 Subject: [PATCH] Conversion pipeline: Don't choke on HTML/CSS files that fail to parse correctly. Instead remove them from the manifest. Preprocessing code migrated from epub layer to OEBBook. --- session.vim | 2 +- src/calibre/customize/conversion.py | 37 +----- src/calibre/customize/profiles.py | 28 ++--- src/calibre/ebooks/conversion/plumber.py | 10 +- src/calibre/ebooks/conversion/preprocess.py | 123 ++++++++++++++++++++ src/calibre/ebooks/mobi/input.py | 21 ++-- src/calibre/ebooks/oeb/base.py | 42 ++++--- src/calibre/ebooks/oeb/reader.py | 86 +++++++++----- 8 files changed, 242 insertions(+), 107 deletions(-) create mode 100644 src/calibre/ebooks/conversion/preprocess.py diff --git a/session.vim b/session.vim index 9d326c5822..454b468ae0 100644 --- a/session.vim +++ b/session.vim @@ -1,5 +1,5 @@ " Project wide builtins -let g:pyflakes_builtins += ["dynamic_property"] +let g:pyflakes_builtins += ["dynamic_property", '__'] python << EOFPY import os diff --git a/src/calibre/customize/conversion.py b/src/calibre/customize/conversion.py index 5cf497d904..3ebabc4d52 100644 --- a/src/calibre/customize/conversion.py +++ b/src/calibre/customize/conversion.py @@ -4,8 +4,6 @@ Defines the plugin system for conversions. ''' import re, os, shutil -from lxml import html - from calibre import CurrentDir from calibre.customize import Plugin @@ -121,7 +119,7 @@ class InputFormatPlugin(Plugin): #: (option_name, recommended_value, recommendation_level) recommendations = set([]) - def convert(self, stream, options, file_ext, parse_cache, log, accelerators): + def convert(self, stream, options, file_ext, log, accelerators): ''' This method must be implemented in sub-classes. It must return the path to the created OPF file. 
All output should be contained in @@ -144,17 +142,6 @@ class InputFormatPlugin(Plugin): is guaranteed to be one of the `file_types` supported by this plugin. - :param parse_cache: A dictionary that maps absolute file paths to - parsed representations of their contents. For - HTML the representation is an lxml element of - the root of the tree. For CSS it is a cssutils - stylesheet. If this plugin parses any of the - output files, it should add them to the cache - so that later stages of the conversion wont - have to re-parse them. If a parsed representation - is in the cache, there is no need to actually - write the file to disk. - :param log: A :class:`calibre.utils.logging.Log` object. All output should use this object. @@ -165,7 +152,7 @@ class InputFormatPlugin(Plugin): ''' raise NotImplementedError - def __call__(self, stream, options, file_ext, parse_cache, log, + def __call__(self, stream, options, file_ext, log, accelerators, output_dir): log('InputFormatPlugin: %s running'%self.name, end=' ') if hasattr(stream, 'name'): @@ -176,33 +163,15 @@ class InputFormatPlugin(Plugin): shutil.rmtree(x) if os.path.isdir(x) else os.remove(x) - ret = self.convert(stream, options, file_ext, parse_cache, + ret = self.convert(stream, options, file_ext, log, accelerators) - for key in list(parse_cache.keys()): - if os.path.abspath(key) != key: - log.warn(('InputFormatPlugin: %s returned a ' - 'relative path: %s')%(self.name, key) - ) - parse_cache[os.path.abspath(key)] = parse_cache.pop(key) - if options.debug_input is not None: options.debug_input = os.path.abspath(options.debug_input) if not os.path.exists(options.debug_input): os.makedirs(options.debug_input) shutil.rmtree(options.debug_input) - for f, obj in parse_cache.items(): - if hasattr(obj, 'cssText'): - raw = obj.cssText - else: - raw = html.tostring(obj, encoding='utf-8', method='xml', - include_meta_content_type=True, pretty_print=True) - if isinstance(raw, unicode): - raw = raw.encode('utf-8') - open(f, 
'wb').write(raw) shutil.copytree('.', options.debug_input) - - return ret diff --git a/src/calibre/customize/profiles.py b/src/calibre/customize/profiles.py index a3a7e22298..bd11a89bed 100644 --- a/src/calibre/customize/profiles.py +++ b/src/calibre/customize/profiles.py @@ -7,7 +7,7 @@ import sys, re from calibre.customize import Plugin class InputProfile(Plugin): - + author = 'Kovid Goyal' supported_platforms = set(['windows', 'osx', 'linux']) can_be_disabled = False @@ -20,40 +20,40 @@ class InputProfile(Plugin): short_name = 'default' # Used in the CLI so dont use spaces etc. in it description = _('This profile tries to provide sane defaults and is useful ' 'if you know nothing about the input document.') - + input_profiles = [InputProfile] - + class OutputProfile(Plugin): - + author = 'Kovid Goyal' supported_platforms = set(['windows', 'osx', 'linux']) can_be_disabled = False type = _('Output profile') - + name = 'Default Output Profile' short_name = 'default' # Used in the CLI so dont use spaces etc. in it description = _('This profile tries to provide sane defaults and is useful ' 'if you want to produce a document intended to be read at a ' 'computer or on a range of devices.') - + epub_flow_size = sys.maxint screen_size = None - remove_special_chars = False + remove_special_chars = None remove_object_tags = False - + class SonyReader(OutputProfile): - + name = 'Sony Reader' short_name = 'sony' description = _('This profile is intended for the SONY PRS line. 
' 'The 500/505/700 etc.') - + epub_flow_size = 270000 screen_size = (590, 765) remove_special_chars = re.compile(u'[\u200b\u00ad]') remove_object_tags = True - - - -output_profiles = [OutputProfile, SonyReader] \ No newline at end of file + + + +output_profiles = [OutputProfile, SonyReader] diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index 44e2fda0c3..0e2f98fde4 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -8,6 +8,7 @@ import os from calibre.customize.conversion import OptionRecommendation from calibre.customize.ui import input_profiles, output_profiles, \ plugin_for_input_format, plugin_for_output_format +from calibre.ebooks.conversion.preprocess import HTMLPreProcessor class OptionValues(object): pass @@ -258,16 +259,17 @@ OptionRecommendation(name='language', # heavy lifting. from calibre.ebooks.oeb.reader import OEBReader from calibre.ebooks.oeb.base import OEBBook - parse_cache, accelerators = {}, {} + accelerators = {} opfpath = self.input_plugin(open(self.input, 'rb'), self.opts, - self.input_fmt, parse_cache, self.log, + self.input_fmt, self.log, accelerators) - + html_preprocessor = HTMLPreProcessor() self.reader = OEBReader() - self.oeb = OEBBook(self.log, parse_cache=parse_cache) + self.oeb = OEBBook(self.log, html_preprocessor=html_preprocessor) # Read OEB Book into OEBBook self.reader(self.oeb, opfpath) + diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py new file mode 100644 index 0000000000..f544a331d8 --- /dev/null +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -0,0 +1,123 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2009, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import re, functools + +from calibre import entity_to_unicode + +XMLDECL_RE = 
re.compile(r'^\s*<[?]xml.*?[?]>') +SVG_NS = 'http://www.w3.org/2000/svg' +XLINK_NS = 'http://www.w3.org/1999/xlink' + +convert_entities = functools.partial(entity_to_unicode, exceptions=['quot', 'apos', 'lt', 'gt', 'amp']) +_span_pat = re.compile('', re.DOTALL|re.IGNORECASE) + + +def sanitize_head(match): + x = match.group(1) + x = _span_pat.sub('', x) + return '\n'+x+'\n' + + +class CSSPreProcessor(object): + + PAGE_PAT = re.compile(r'@page[^{]*?{[^}]*?}') + + def __call__(self, data): + data = self.PAGE_PAT.sub('', data) + return data + +class HTMLPreProcessor(object): + + PREPROCESS = [ + # Some idiotic HTML generators (Frontpage I'm looking at you) + # Put all sorts of crap into . This messes up lxml + (re.compile(r']*>(.*?)', re.IGNORECASE|re.DOTALL), + sanitize_head), + # Convert all entities, since lxml doesn't handle them well + (re.compile(r'&(\S+?);'), convert_entities), + # Remove the ', re.IGNORECASE), + lambda match: ''), + ] + + # Fix pdftohtml markup + PDFTOHTML = [ + # Remove
tags + (re.compile(r'', re.IGNORECASE), lambda match: '
'), + # Remove page numbers + (re.compile(r'\d+
', re.IGNORECASE), lambda match: ''), + # Remove
and replace

with

+ (re.compile(r'\s*', re.IGNORECASE), lambda match: '

'), + (re.compile(r'(.*)', re.IGNORECASE), + lambda match: match.group() if \ + re.match('<', match.group(1).lstrip()) or \ + len(match.group(1)) < 40 else match.group(1)), + # Remove hyphenation + (re.compile(r'-\n\r?'), lambda match: ''), + + # Remove gray background + (re.compile(r']+>'), lambda match : ''), + + # Remove non breaking spaces + (re.compile(ur'\u00a0'), lambda match : ' '), + + ] + + # Fix Book Designer markup + BOOK_DESIGNER = [ + # HR + (re.compile('


', re.IGNORECASE), + lambda match : ' '), + # Create header tags + (re.compile('<]*?id=BookTitle[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?', re.IGNORECASE), + lambda match : '

%s

'%(match.group(2) if match.group(2) else 'center', match.group(3))), + (re.compile('<]*?id=BookAuthor[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?', re.IGNORECASE), + lambda match : '

%s

'%(match.group(2) if match.group(2) else 'center', match.group(3))), + (re.compile('<]*?id=title[^><]*?>(.*?)', re.IGNORECASE|re.DOTALL), + lambda match : '

%s

'%(match.group(1),)), + (re.compile('<]*?id=subtitle[^><]*?>(.*?)', re.IGNORECASE|re.DOTALL), + lambda match : '

%s

'%(match.group(1),)), + ] + + def is_baen(self, src): + return re.compile(r'<]*id=BookTitle', raw) is not None + + def is_pdftohtml(self, src): + return '' in src[:1000] + + def __call__(self, html, remove_special_chars=None): + if remove_special_chars is not None: + html = remove_special_chars.sub('', html) + if self.is_baen(html): + rules = [] + elif self.is_book_designer(html): + rules = self.BOOK_DESIGNER + elif self.is_pdftohtml(html): + rules = self.PDFTOHTML + else: + rules = [] + for rule in self.PREPROCESS + rules: + html = rule[0].sub(rule[1], html) + + # Handle broken XHTML w/ SVG (ugh) + if 'svg:' in html and SVG_NS not in html: + html = html.replace( + '