From dc0e0f26a1a285bcaf3c1dd7b622bc54e0017a58 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sat, 2 May 2009 12:40:29 -0700
Subject: [PATCH] Add a --preprocess-html option

---
 src/calibre/customize/conversion.py         | 12 ++++++++++++
 src/calibre/ebooks/conversion/cli.py        |  1 +
 src/calibre/ebooks/conversion/plumber.py    | 16 +++++++++++++---
 src/calibre/ebooks/conversion/preprocess.py | 18 ++++++++++++------
 src/calibre/ebooks/html/input.py            |  2 +-
 src/calibre/ebooks/lit/input.py             |  2 +-
 src/calibre/ebooks/oeb/base.py              |  2 +-
 7 files changed, 41 insertions(+), 12 deletions(-)

diff --git a/src/calibre/customize/conversion.py b/src/calibre/customize/conversion.py
index 7920b823de..3a89a9b156 100644
--- a/src/calibre/customize/conversion.py
+++ b/src/calibre/customize/conversion.py
@@ -149,6 +149,18 @@ class InputFormatPlugin(Plugin):
         '''
         raise NotImplementedError()
 
+    def preprocess_html(self, html):
+        '''
+        This method is called by the conversion pipeline on all HTML before it
+        is parsed, but only when the preprocess_html option is enabled. Use it
+        for any required preprocessing, such as removing hard line breaks.
+
+        :param html: A unicode string
+        :return: A unicode string
+        '''
+        return html
+
+
     def convert(self, stream, options, file_ext, log, accelerators):
         '''
         This method must be implemented in sub-classes. It must return
diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py
index 53b1a2065d..3274b912ea 100644
--- a/src/calibre/ebooks/conversion/cli.py
+++ b/src/calibre/ebooks/conversion/cli.py
@@ -129,6 +129,7 @@ def add_pipeline_options(parser, plumber):
                       'dont_split_on_page_breaks', 'chapter', 'chapter_mark',
                       'prefer_metadata_cover', 'remove_first_image',
                       'insert_metadata', 'page_breaks_before',
+                      'preprocess_html',
                   ]
                   ),
 
diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py
index d1630a25f2..ed0fd4584e 100644
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@@ -312,6 +312,14 @@ OptionRecommendation(name='insert_metadata',
             )
         ),
 
+OptionRecommendation(name='preprocess_html',
+        recommended_value=False, level=OptionRecommendation.LOW,
+        help=_('Attempt to detect and correct hard line breaks and other '
+            'problems in the source file. This may make things worse, so use '
+            'with care.'
+            )
+        ),
+
 
 OptionRecommendation(name='read_metadata_from_opf',
             recommended_value=None, level=OptionRecommendation.LOW,
@@ -580,7 +588,8 @@ OptionRecommendation(name='list_recipes',
             self.log('Debug input called, aborting the rest of the pipeline.')
             return
         if not hasattr(self.oeb, 'manifest'):
-            self.oeb = create_oebbook(self.log, self.oeb, self.opts)
+            self.oeb = create_oebbook(self.log, self.oeb, self.opts,
+                    self.input_plugin)
         pr = CompositeProgressReporter(0.34, 0.67, self.ui_reporter)
         pr(0., _('Running transforms on ebook...'))
 
@@ -652,12 +661,13 @@ OptionRecommendation(name='list_recipes',
                 self.opts, self.log)
         self.ui_reporter(1.)
 
-def create_oebbook(log, path_or_stream, opts, reader=None):
+def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None):
     '''
     Create an OEBBook.
     '''
     from calibre.ebooks.oeb.base import OEBBook
-    html_preprocessor = HTMLPreProcessor()
+    html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html,
+            opts.preprocess_html)
     oeb = OEBBook(log, html_preprocessor=html_preprocessor,
             pretty_print=opts.pretty_print)
     # Read OEB Book into OEBBook
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 9bfe6d4255..76fc36708e 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -26,16 +26,16 @@ def sanitize_head(match):
 def chap_head(match):
     chap = match.group('chap')
     title = match.group('title')
-    if not title: 
+    if not title:
                return '<h1>'+chap+'</h1><br/>\n'
-    else: 
+    else:
                return '<h1>'+chap+'<br/>\n'+title+'</h1><br/>\n'
 
 def wrap_lines(match):
     ital = match.group('ital')
-    if not ital: 
+    if not ital:
                return ' '
-    else: 
+    else:
                return ital+' '
 
 def line_length(raw, percent):
@@ -106,7 +106,7 @@ class HTMLPreProcessor(object):
                   (re.compile(u'¨\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Ï'),
                   (re.compile(u'¨\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'ä'),
                   (re.compile(u'¨\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Ä'),
-                  
+
                   # Remove page links
                   (re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''),
                   # Remove <hr> tags
@@ -151,6 +151,9 @@ class HTMLPreProcessor(object):
                      (re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
                       lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
                      ]
+    def __init__(self, input_plugin_preprocess, plugin_preprocess):
+        self.input_plugin_preprocess = input_plugin_preprocess
+        self.plugin_preprocess = plugin_preprocess
 
     def is_baen(self, src):
         return re.compile(r'<meta\s+name="Publisher"\s+content=".*?Baen.*?"',
@@ -175,7 +178,7 @@ class HTMLPreProcessor(object):
                 # Un wrap using punctuation
                 (re.compile(r'(?<=.{%i}[a-z,;:-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?[\w\d])' % line_length(html, .4), re.UNICODE), wrap_lines),
             ]
-            
+
             rules = self.PDFTOHTML + line_length_rules
         else:
             rules = []
@@ -192,5 +195,8 @@ class HTMLPreProcessor(object):
 
         html = XMLDECL_RE.sub('', html)
 
+        if self.plugin_preprocess:
+            html = self.input_plugin_preprocess(html)
+
         return html
 
diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py
index 252032a23d..255d975b1e 100644
--- a/src/calibre/ebooks/html/input.py
+++ b/src/calibre/ebooks/html/input.py
@@ -288,7 +288,7 @@ class HTMLInput(InputFormatPlugin):
             return opfpath
 
         from calibre.ebooks.conversion.plumber import create_oebbook
-        oeb = create_oebbook(log, opfpath, opts)
+        oeb = create_oebbook(log, opfpath, opts, self)
 
         from calibre.ebooks.oeb.transforms.package import Package
         Package(os.getcwdu())(oeb, opts)
diff --git a/src/calibre/ebooks/lit/input.py b/src/calibre/ebooks/lit/input.py
index 2d726f7eeb..409482da29 100644
--- a/src/calibre/ebooks/lit/input.py
+++ b/src/calibre/ebooks/lit/input.py
@@ -19,6 +19,6 @@ class LITInput(InputFormatPlugin):
                 accelerators):
         from calibre.ebooks.lit.reader import LitReader
         from calibre.ebooks.conversion.plumber import create_oebbook
-        return create_oebbook(log, stream, options, reader=LitReader)
+        return create_oebbook(log, stream, options, self, reader=LitReader)
 
 
diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py
index faf2d02dc4..728e1711a0 100644
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@@ -1506,7 +1506,7 @@ class OEBBook(object):
     COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]')
 
     def __init__(self, logger,
-            html_preprocessor=HTMLPreProcessor(),
+            html_preprocessor,
             css_preprocessor=CSSPreProcessor(),
             encoding='utf-8', pretty_print=False):
         """Create empty book.  Arguments: