From dc0e0f26a1a285bcaf3c1dd7b622bc54e0017a58 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 2 May 2009 12:40:29 -0700 Subject: [PATCH] Add a --preprocess-html option --- src/calibre/customize/conversion.py | 12 ++++++++++++ src/calibre/ebooks/conversion/cli.py | 1 + src/calibre/ebooks/conversion/plumber.py | 16 +++++++++++++--- src/calibre/ebooks/conversion/preprocess.py | 18 ++++++++++++------ src/calibre/ebooks/html/input.py | 2 +- src/calibre/ebooks/lit/input.py | 2 +- src/calibre/ebooks/oeb/base.py | 2 +- 7 files changed, 41 insertions(+), 12 deletions(-) diff --git a/src/calibre/customize/conversion.py b/src/calibre/customize/conversion.py index 7920b823de..3a89a9b156 100644 --- a/src/calibre/customize/conversion.py +++ b/src/calibre/customize/conversion.py @@ -149,6 +149,18 @@ class InputFormatPlugin(Plugin): ''' raise NotImplementedError() + def preprocess_html(self, html): + ''' + This method is called by the conversion pipeline on all HTML before it + is parsed. It is meant to be used to do any required preprocessing on + the HTML, like removing hard line breaks, etc. + + :param html: A unicode string + :return: A unicode string + ''' + return html + + def convert(self, stream, options, file_ext, log, accelerators): ''' This method must be implemented in sub-classes. 
It must return diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py index 53b1a2065d..3274b912ea 100644 --- a/src/calibre/ebooks/conversion/cli.py +++ b/src/calibre/ebooks/conversion/cli.py @@ -129,6 +129,7 @@ def add_pipeline_options(parser, plumber): 'dont_split_on_page_breaks', 'chapter', 'chapter_mark', 'prefer_metadata_cover', 'remove_first_image', 'insert_metadata', 'page_breaks_before', + 'preprocess_html', ] ), diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index d1630a25f2..ed0fd4584e 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -312,6 +312,14 @@ OptionRecommendation(name='insert_metadata', ) ), +OptionRecommendation(name='preprocess_html', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Attempt to detect and correct hard line breaks and other ' + 'problems in the source file. This may make things worse, so use ' + 'with care.' + ) + ), + OptionRecommendation(name='read_metadata_from_opf', recommended_value=None, level=OptionRecommendation.LOW, @@ -580,7 +588,8 @@ OptionRecommendation(name='list_recipes', self.log('Debug input called, aborting the rest of the pipeline.') return if not hasattr(self.oeb, 'manifest'): - self.oeb = create_oebbook(self.log, self.oeb, self.opts) + self.oeb = create_oebbook(self.log, self.oeb, self.opts, + self.input_plugin) pr = CompositeProgressReporter(0.34, 0.67, self.ui_reporter) pr(0., _('Running transforms on ebook...')) @@ -652,12 +661,13 @@ OptionRecommendation(name='list_recipes', self.opts, self.log) self.ui_reporter(1.) -def create_oebbook(log, path_or_stream, opts, reader=None): +def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None): ''' Create an OEBBook. 
''' from calibre.ebooks.oeb.base import OEBBook - html_preprocessor = HTMLPreProcessor() + html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html, + opts.preprocess_html) oeb = OEBBook(log, html_preprocessor=html_preprocessor, pretty_print=opts.pretty_print) # Read OEB Book into OEBBook diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 9bfe6d4255..76fc36708e 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -26,16 +26,16 @@ def sanitize_head(match): def chap_head(match): chap = match.group('chap') title = match.group('title') - if not title: + if not title: return '

<h1>'+chap+'</h1><br/>
\n' - else: + else: return '

<h1>'+chap+'<br/>\n'+title+'</h1><br/>
\n' def wrap_lines(match): ital = match.group('ital') - if not ital: + if not ital: return ' ' - else: + else: return ital+' ' def line_length(raw, percent): @@ -106,7 +106,7 @@ class HTMLPreProcessor(object): (re.compile(u'¨\s*()*\s*I', re.UNICODE), lambda match: u'Ï'), (re.compile(u'¨\s*()*\s*a', re.UNICODE), lambda match: u'ä'), (re.compile(u'¨\s*()*\s*A', re.UNICODE), lambda match: u'Ä'), - + # Remove page links (re.compile(r'', re.IGNORECASE), lambda match: ''), # Remove
tags @@ -151,6 +151,9 @@ class HTMLPreProcessor(object): (re.compile('<]*?id=subtitle[^><]*?>(.*?)', re.IGNORECASE|re.DOTALL), lambda match : '

<h3 class="subtitle">%s</h3>
'%(match.group(1),)), ] + def __init__(self, input_plugin_preprocess, plugin_preprocess): + self.input_plugin_preprocess = input_plugin_preprocess + self.plugin_preprocess = plugin_preprocess def is_baen(self, src): return re.compile(r')?\s*()\s*(?=(<(i|b|u)>)?[\w\d])' % line_length(html, .4), re.UNICODE), wrap_lines), ] - + rules = self.PDFTOHTML + line_length_rules else: rules = [] @@ -192,5 +195,8 @@ class HTMLPreProcessor(object): html = XMLDECL_RE.sub('', html) + if self.plugin_preprocess: + html = self.input_plugin_preprocess(html) + return html diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py index 252032a23d..255d975b1e 100644 --- a/src/calibre/ebooks/html/input.py +++ b/src/calibre/ebooks/html/input.py @@ -288,7 +288,7 @@ class HTMLInput(InputFormatPlugin): return opfpath from calibre.ebooks.conversion.plumber import create_oebbook - oeb = create_oebbook(log, opfpath, opts) + oeb = create_oebbook(log, opfpath, opts, self) from calibre.ebooks.oeb.transforms.package import Package Package(os.getcwdu())(oeb, opts) diff --git a/src/calibre/ebooks/lit/input.py b/src/calibre/ebooks/lit/input.py index 2d726f7eeb..409482da29 100644 --- a/src/calibre/ebooks/lit/input.py +++ b/src/calibre/ebooks/lit/input.py @@ -19,6 +19,6 @@ class LITInput(InputFormatPlugin): accelerators): from calibre.ebooks.lit.reader import LitReader from calibre.ebooks.conversion.plumber import create_oebbook - return create_oebbook(log, stream, options, reader=LitReader) + return create_oebbook(log, stream, options, self, reader=LitReader) diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index faf2d02dc4..728e1711a0 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -1506,7 +1506,7 @@ class OEBBook(object): COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]') def __init__(self, logger, - html_preprocessor=HTMLPreProcessor(), + html_preprocessor, css_preprocessor=CSSPreProcessor(), 
encoding='utf-8', pretty_print=False): """Create empty book. Arguments: