# Patch summary: add an opt-in "preprocess_html" conversion option and a
# matching per-input-plugin hook.
#
# * customize/conversion.py: adds InputFormatPlugin.preprocess_html(html).
#   The default implementation returns its argument unchanged; its docstring
#   describes it as being called on all HTML before parsing.
# * ebooks/conversion/cli.py: adds 'preprocess_html' to a list of option
#   names inside add_pipeline_options().
# * ebooks/conversion/plumber.py: registers
#   OptionRecommendation(name='preprocess_html', recommended_value=False,
#   level=OptionRecommendation.LOW); passes self.input_plugin to
#   create_oebbook(); create_oebbook() now builds
#   HTMLPreProcessor(input_plugin.preprocess_html, opts.preprocess_html).
# * ebooks/conversion/preprocess.py: the visible part of the chap_head()
#   hunk replaces the `if not title:` line with the same text, which appears
#   to be a whitespace-only change. The rest of the hunk is not visible here.
#
# NOTE(review): create_oebbook() gains a required positional `input_plugin`
#   parameter placed before `reader=None`. A caller that passes only three
#   arguments, or passes `reader` positionally as the fourth argument, would
#   break or silently bind the wrong object -- confirm every call site was
#   updated, or consider a keyword parameter with a default instead.
# NOTE(review): HTMLPreProcessor is now constructed with two arguments; its
#   constructor is not visible in this part of the patch -- confirm it was
#   updated to accept them.
# NOTE(review): the conversion.py hunk adds two blank lines after
#   `return html`, while one blank line precedes the new method -- cosmetic,
#   confirm against the file's method spacing.
diff --git a/src/calibre/customize/conversion.py b/src/calibre/customize/conversion.py index 7920b823de..3a89a9b156 100644 --- a/src/calibre/customize/conversion.py +++ b/src/calibre/customize/conversion.py @@ -149,6 +149,18 @@ class InputFormatPlugin(Plugin): ''' raise NotImplementedError() + def preprocess_html(self, html): + ''' + This method is called by the conversion pipeline on all HTML before it + is parsed. It is meant to be used to do any required preprocessing on + the HTML, like removing hard line breaks, etc. + + :param html: A unicode string + :return: A unicode string + ''' + return html + + def convert(self, stream, options, file_ext, log, accelerators): ''' This method must be implemented in sub-classes. It must return diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py index 53b1a2065d..3274b912ea 100644 --- a/src/calibre/ebooks/conversion/cli.py +++ b/src/calibre/ebooks/conversion/cli.py @@ -129,6 +129,7 @@ def add_pipeline_options(parser, plumber): 'dont_split_on_page_breaks', 'chapter', 'chapter_mark', 'prefer_metadata_cover', 'remove_first_image', 'insert_metadata', 'page_breaks_before', + 'preprocess_html', ] ), diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index d1630a25f2..ed0fd4584e 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -312,6 +312,14 @@ OptionRecommendation(name='insert_metadata', ) ), +OptionRecommendation(name='preprocess_html', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Attempt to detect and correct hard line breaks and other ' + 'problems in the source file. This may make things worse, so use ' + 'with care.' 
+ ) + ), + OptionRecommendation(name='read_metadata_from_opf', recommended_value=None, level=OptionRecommendation.LOW, @@ -580,7 +588,8 @@ OptionRecommendation(name='list_recipes', self.log('Debug input called, aborting the rest of the pipeline.') return if not hasattr(self.oeb, 'manifest'): - self.oeb = create_oebbook(self.log, self.oeb, self.opts) + self.oeb = create_oebbook(self.log, self.oeb, self.opts, + self.input_plugin) pr = CompositeProgressReporter(0.34, 0.67, self.ui_reporter) pr(0., _('Running transforms on ebook...')) @@ -652,12 +661,13 @@ OptionRecommendation(name='list_recipes', self.opts, self.log) self.ui_reporter(1.) -def create_oebbook(log, path_or_stream, opts, reader=None): +def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None): ''' Create an OEBBook. ''' from calibre.ebooks.oeb.base import OEBBook - html_preprocessor = HTMLPreProcessor() + html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html, + opts.preprocess_html) oeb = OEBBook(log, html_preprocessor=html_preprocessor, pretty_print=opts.pretty_print) # Read OEB Book into OEBBook diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 9bfe6d4255..76fc36708e 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -26,16 +26,16 @@ def sanitize_head(match): def chap_head(match): chap = match.group('chap') title = match.group('title') - if not title: + if not title: return '