diff --git a/src/calibre/customize/conversion.py b/src/calibre/customize/conversion.py
index a9e573ffa0..b77ac81587 100644
--- a/src/calibre/customize/conversion.py
+++ b/src/calibre/customize/conversion.py
@@ -160,18 +160,6 @@ class InputFormatPlugin(Plugin):
'''
raise NotImplementedError()
- def heuristics(self, opts, html):
- '''
- This method is called by the conversion pipeline on all HTML before it
- is parsed. It is meant to be used to do any required preprocessing on
- the HTML, like removing hard line breaks, etc.
-
- :param html: A unicode string
- :return: A unicode string
- '''
- return html
-
-
def convert(self, stream, options, file_ext, log, accelerators):
'''
This method must be implemented in sub-classes. It must return
diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py
index b8c45dfa14..249f848661 100644
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@@ -899,7 +899,6 @@ OptionRecommendation(name='sr3_replace',
self.opts_to_mi(self.user_metadata)
if not hasattr(self.oeb, 'manifest'):
self.oeb = create_oebbook(self.log, self.oeb, self.opts,
- self.input_plugin,
encoding=self.input_plugin.output_encoding)
self.input_plugin.postprocess_book(self.oeb, self.opts, self.log)
self.opts.is_image_collection = self.input_plugin.is_image_collection
@@ -1009,14 +1008,13 @@ OptionRecommendation(name='sr3_replace',
self.log(self.output_fmt.upper(), 'output written to', self.output)
self.flush()
-def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None,
+def create_oebbook(log, path_or_stream, opts, reader=None,
encoding='utf-8', populate=True):
'''
Create an OEBBook.
'''
from calibre.ebooks.oeb.base import OEBBook
- html_preprocessor = HTMLPreProcessor(input_plugin.heuristics,
- opts.enable_heuristics, opts)
+ html_preprocessor = HTMLPreProcessor(log, opts)
if not encoding:
encoding = None
oeb = OEBBook(log, html_preprocessor,
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 35a311d58f..abaff77f33 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -397,10 +397,8 @@ class HTMLPreProcessor(object):
(re.compile('<]*?id=subtitle[^><]*?>(.*?)', re.IGNORECASE|re.DOTALL),
lambda match : '
%s
'%(match.group(1),)),
]
- def __init__(self, input_plugin_preprocess, plugin_preprocess,
- extra_opts=None):
- self.input_plugin_preprocess = input_plugin_preprocess
- self.plugin_preprocess = plugin_preprocess
+ def __init__(self, log=None, extra_opts=None):
+ self.log = log
self.extra_opts = extra_opts
def is_baen(self, src):
@@ -542,8 +540,10 @@ class HTMLPreProcessor(object):
unidecoder = Unidecoder()
html = unidecoder.decode(html)
- if self.plugin_preprocess:
- html = self.input_plugin_preprocess(self.extra_opts, html)
+ if getattr(self.extra_opts, 'enable_heuristics', False):
+ from calibre.ebooks.conversion.utils import HeuristicProcessor
+ preprocessor = HeuristicProcessor(self.extra_opts, self.log)
+ html = preprocessor(html)
if getattr(self.extra_opts, 'smarten_punctuation', False):
html = self.smarten_punctuation(html)
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 305346d496..48806e78e7 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -11,7 +11,7 @@ from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
from calibre.utils.logging import default_log
from calibre.utils.wordcount import get_wordcount_obj
-class PreProcessor(object):
+class HeuristicProcessor(object):
def __init__(self, extra_opts=None, log=None):
self.log = default_log if log is None else log
@@ -366,7 +366,7 @@ class PreProcessor(object):
def __call__(self, html):
- self.log("********* Preprocessing HTML *********")
+ self.log("********* Heuristic processing HTML *********")
# Count the words in the document to estimate how many chapters to look for and whether
# other types of processing are attempted
diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py
index 479f852c77..ed0bf7b3ef 100644
--- a/src/calibre/ebooks/html/input.py
+++ b/src/calibre/ebooks/html/input.py
@@ -24,7 +24,6 @@ from calibre.constants import islinux, isfreebsd, iswindows
from calibre import unicode_path
from calibre.utils.localization import get_lang
from calibre.utils.filenames import ascii_filename
-from calibre.ebooks.conversion.utils import PreProcessor
class Link(object):
'''
@@ -485,9 +484,3 @@ class HTMLInput(InputFormatPlugin):
self.log.exception('Failed to read CSS file: %r'%link)
return (None, None)
return (None, raw)
-
- def heuristics(self, options, html):
- self.options = options
- preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
- return preprocessor(html)
-
diff --git a/src/calibre/ebooks/lit/input.py b/src/calibre/ebooks/lit/input.py
index d0ecf008b7..7b822b68a6 100644
--- a/src/calibre/ebooks/lit/input.py
+++ b/src/calibre/ebooks/lit/input.py
@@ -7,7 +7,7 @@ __copyright__ = '2009, Kovid Goyal '
__docformat__ = 'restructuredtext en'
from calibre.customize.conversion import InputFormatPlugin
-from calibre.ebooks.conversion.utils import PreProcessor
+from calibre.ebooks.conversion.utils import HeuristicProcessor
class LITInput(InputFormatPlugin):
@@ -51,10 +51,3 @@ class LITInput(InputFormatPlugin):
for elem in body:
ne = copy.deepcopy(elem)
pre.append(ne)
-
-
- def heuristics(self, options, html):
- self.options = options
- preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
- return preprocessor(html)
-
diff --git a/src/calibre/ebooks/lrf/input.py b/src/calibre/ebooks/lrf/input.py
index 05c8731da5..70f3c3a15a 100644
--- a/src/calibre/ebooks/lrf/input.py
+++ b/src/calibre/ebooks/lrf/input.py
@@ -12,7 +12,6 @@ from copy import deepcopy
from lxml import etree
from calibre.customize.conversion import InputFormatPlugin
-from calibre.ebooks.conversion.utils import PreProcessor
from calibre import guess_type
class Canvas(etree.XSLTExtension):
@@ -419,11 +418,3 @@ class LRFInput(InputFormatPlugin):
f.write(result)
styles.write()
return os.path.abspath('content.opf')
-
- def heuristics(self, options, html):
- self.options = options
- preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
- return preprocessor(html)
-
-
-
diff --git a/src/calibre/ebooks/pdb/input.py b/src/calibre/ebooks/pdb/input.py
index de210e0a6d..cd861216af 100644
--- a/src/calibre/ebooks/pdb/input.py
+++ b/src/calibre/ebooks/pdb/input.py
@@ -9,7 +9,6 @@ import os
from calibre.customize.conversion import InputFormatPlugin
from calibre.ebooks.pdb.header import PdbHeaderReader
from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader
-from calibre.ebooks.conversion.utils import PreProcessor
class PDBInput(InputFormatPlugin):
diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py
index 2f931d1d04..d3849bc5f5 100644
--- a/src/calibre/ebooks/rtf/input.py
+++ b/src/calibre/ebooks/rtf/input.py
@@ -7,7 +7,6 @@ import os, glob, re, textwrap
from lxml import etree
from calibre.customize.conversion import InputFormatPlugin
-from calibre.ebooks.conversion.utils import PreProcessor
border_style_map = {
'single' : 'solid',