From 64796696ae0bec276c798bcc12e8b6d10a878788 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sat, 15 Jan 2011 12:35:02 -0500
Subject: [PATCH] Enable heuristic processing over the entire conversion
 pipeline when option is enabled.

---
 src/calibre/customize/conversion.py         | 12 ------------
 src/calibre/ebooks/conversion/plumber.py    |  6 ++----
 src/calibre/ebooks/conversion/preprocess.py | 12 ++++++------
 src/calibre/ebooks/conversion/utils.py      |  4 ++--
 src/calibre/ebooks/html/input.py            |  7 -------
 src/calibre/ebooks/lit/input.py             |  9 +--------
 src/calibre/ebooks/lrf/input.py             |  9 ---------
 src/calibre/ebooks/pdb/input.py             |  1 -
 src/calibre/ebooks/rtf/input.py             |  1 -
 9 files changed, 11 insertions(+), 50 deletions(-)
diff --git a/src/calibre/customize/conversion.py b/src/calibre/customize/conversion.py
index a9e573ffa0..b77ac81587 100644
--- a/src/calibre/customize/conversion.py
+++ b/src/calibre/customize/conversion.py
@@ -160,18 +160,6 @@ class InputFormatPlugin(Plugin):
         '''
         raise NotImplementedError()
 
-    def heuristics(self, opts, html):
-        '''
-        This method is called by the conversion pipeline on all HTML before it
-        is parsed. It is meant to be used to do any required preprocessing on
-        the HTML, like removing hard line breaks, etc.
-
-        :param html: A unicode string
-        :return: A unicode string
-        '''
-        return html
-
-
     def convert(self, stream, options, file_ext, log, accelerators):
         '''
         This method must be implemented in sub-classes. It must return
diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py
index b8c45dfa14..249f848661 100644
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@@ -899,7 +899,6 @@ OptionRecommendation(name='sr3_replace',
                 self.opts_to_mi(self.user_metadata)
             if not hasattr(self.oeb, 'manifest'):
                 self.oeb = create_oebbook(self.log, self.oeb, self.opts,
-                        self.input_plugin,
                         encoding=self.input_plugin.output_encoding)
             self.input_plugin.postprocess_book(self.oeb, self.opts, self.log)
             self.opts.is_image_collection = self.input_plugin.is_image_collection
@@ -1009,14 +1008,13 @@ OptionRecommendation(name='sr3_replace',
         self.log(self.output_fmt.upper(), 'output written to', self.output)
         self.flush()
 
-def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None,
+def create_oebbook(log, path_or_stream, opts, reader=None,
         encoding='utf-8', populate=True):
     '''
     Create an OEBBook.
     '''
     from calibre.ebooks.oeb.base import OEBBook
-    html_preprocessor = HTMLPreProcessor(input_plugin.heuristics,
-            opts.enable_heuristics, opts)
+    html_preprocessor = HTMLPreProcessor(log, opts)
     if not encoding:
         encoding = None
     oeb = OEBBook(log, html_preprocessor,
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 35a311d58f..abaff77f33 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -397,10 +397,8 @@ class HTMLPreProcessor(object):
                      (re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
                       lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
                      ]
-    def __init__(self, input_plugin_preprocess, plugin_preprocess,
-            extra_opts=None):
-        self.input_plugin_preprocess = input_plugin_preprocess
-        self.plugin_preprocess = plugin_preprocess
+    def __init__(self, log=None, extra_opts=None):
+        self.log = log
         self.extra_opts = extra_opts
 
     def is_baen(self, src):
@@ -542,8 +540,10 @@ class HTMLPreProcessor(object):
             unidecoder = Unidecoder()
             html = unidecoder.decode(html)
 
-        if self.plugin_preprocess:
-            html = self.input_plugin_preprocess(self.extra_opts, html)
+        if getattr(self.extra_opts, 'enable_heuristics', False):
+            from calibre.ebooks.conversion.utils import HeuristicProcessor
+            preprocessor = HeuristicProcessor(self.extra_opts, self.log)
+            html = preprocessor(html)
 
         if getattr(self.extra_opts, 'smarten_punctuation', False):
             html = self.smarten_punctuation(html)
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 305346d496..48806e78e7 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -11,7 +11,7 @@ from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
 from calibre.utils.logging import default_log
 from calibre.utils.wordcount import get_wordcount_obj
 
-class PreProcessor(object):
+class HeuristicProcessor(object):
 
     def __init__(self, extra_opts=None, log=None):
         self.log = default_log if log is None else log
@@ -366,7 +366,7 @@ class PreProcessor(object):
 
 
     def __call__(self, html):
-        self.log("*********  Preprocessing HTML  *********")
+        self.log("*********  Heuristic processing HTML  *********")
 
         # Count the words in the document to estimate how many chapters to look for and whether
         # other types of processing are attempted
diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py
index 479f852c77..ed0bf7b3ef 100644
--- a/src/calibre/ebooks/html/input.py
+++ b/src/calibre/ebooks/html/input.py
@@ -24,7 +24,6 @@ from calibre.constants import islinux, isfreebsd, iswindows
 from calibre import unicode_path
 from calibre.utils.localization import get_lang
 from calibre.utils.filenames import ascii_filename
-from calibre.ebooks.conversion.utils import PreProcessor
 
 class Link(object):
     '''
@@ -485,9 +484,3 @@ class HTMLInput(InputFormatPlugin):
             self.log.exception('Failed to read CSS file: %r'%link)
             return (None, None)
         return (None, raw)
-
-    def heuristics(self, options, html):
-        self.options = options
-        preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
-        return preprocessor(html)
-
diff --git a/src/calibre/ebooks/lit/input.py b/src/calibre/ebooks/lit/input.py
index d0ecf008b7..7b822b68a6 100644
--- a/src/calibre/ebooks/lit/input.py
+++ b/src/calibre/ebooks/lit/input.py
@@ -7,7 +7,7 @@ __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
 from calibre.customize.conversion import InputFormatPlugin
-from calibre.ebooks.conversion.utils import PreProcessor
+from calibre.ebooks.conversion.utils import HeuristicProcessor
 
 
 class LITInput(InputFormatPlugin):
@@ -51,10 +51,3 @@ class LITInput(InputFormatPlugin):
                     for elem in body:
                         ne = copy.deepcopy(elem)
                         pre.append(ne)
-
-
-    def heuristics(self, options, html):
-        self.options = options
-        preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
-        return preprocessor(html)
-
diff --git a/src/calibre/ebooks/lrf/input.py b/src/calibre/ebooks/lrf/input.py
index 05c8731da5..70f3c3a15a 100644
--- a/src/calibre/ebooks/lrf/input.py
+++ b/src/calibre/ebooks/lrf/input.py
@@ -12,7 +12,6 @@ from copy import deepcopy
 from lxml import etree
 
 from calibre.customize.conversion import InputFormatPlugin
-from calibre.ebooks.conversion.utils import PreProcessor
 from calibre import guess_type
 
 class Canvas(etree.XSLTExtension):
@@ -419,11 +418,3 @@ class LRFInput(InputFormatPlugin):
             f.write(result)
         styles.write()
         return os.path.abspath('content.opf')
-
-    def heuristics(self, options, html):
-        self.options = options
-        preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
-        return preprocessor(html)
-
-
-
diff --git a/src/calibre/ebooks/pdb/input.py b/src/calibre/ebooks/pdb/input.py
index de210e0a6d..cd861216af 100644
--- a/src/calibre/ebooks/pdb/input.py
+++ b/src/calibre/ebooks/pdb/input.py
@@ -9,7 +9,6 @@ import os
 from calibre.customize.conversion import InputFormatPlugin
 from calibre.ebooks.pdb.header import PdbHeaderReader
 from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader
-from calibre.ebooks.conversion.utils import PreProcessor
 
 class PDBInput(InputFormatPlugin):
 
diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py
index 2f931d1d04..d3849bc5f5 100644
--- a/src/calibre/ebooks/rtf/input.py
+++ b/src/calibre/ebooks/rtf/input.py
@@ -7,7 +7,6 @@ import os, glob, re, textwrap
 from lxml import etree
 
 from calibre.customize.conversion import InputFormatPlugin
-from calibre.ebooks.conversion.utils import PreProcessor
 
 border_style_map = {
         'single' : 'solid',