diff --git a/src/calibre/customize/conversion.py b/src/calibre/customize/conversion.py
index a9e573ffa0..b77ac81587 100644
--- a/src/calibre/customize/conversion.py
+++ b/src/calibre/customize/conversion.py
@@ -160,18 +160,6 @@ class InputFormatPlugin(Plugin):
'''
raise NotImplementedError()
- def heuristics(self, opts, html):
- '''
- This method is called by the conversion pipeline on all HTML before it
- is parsed. It is meant to be used to do any required preprocessing on
- the HTML, like removing hard line breaks, etc.
-
- :param html: A unicode string
- :return: A unicode string
- '''
- return html
-
-
def convert(self, stream, options, file_ext, log, accelerators):
'''
This method must be implemented in sub-classes. It must return
diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py
index b8c45dfa14..249f848661 100644
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@@ -899,7 +899,6 @@ OptionRecommendation(name='sr3_replace',
self.opts_to_mi(self.user_metadata)
if not hasattr(self.oeb, 'manifest'):
self.oeb = create_oebbook(self.log, self.oeb, self.opts,
- self.input_plugin,
encoding=self.input_plugin.output_encoding)
self.input_plugin.postprocess_book(self.oeb, self.opts, self.log)
self.opts.is_image_collection = self.input_plugin.is_image_collection
@@ -1009,14 +1008,13 @@ OptionRecommendation(name='sr3_replace',
self.log(self.output_fmt.upper(), 'output written to', self.output)
self.flush()
-def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None,
+def create_oebbook(log, path_or_stream, opts, reader=None,
encoding='utf-8', populate=True):
'''
Create an OEBBook.
'''
from calibre.ebooks.oeb.base import OEBBook
- html_preprocessor = HTMLPreProcessor(input_plugin.heuristics,
- opts.enable_heuristics, opts)
+ html_preprocessor = HTMLPreProcessor(log, opts)
if not encoding:
encoding = None
oeb = OEBBook(log, html_preprocessor,
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 35a311d58f..abaff77f33 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -397,10 +397,8 @@ class HTMLPreProcessor(object):
(re.compile('<]*?id=subtitle[^><]*?>(.*?)', re.IGNORECASE|re.DOTALL),
lambda match : '
%s
'%(match.group(1),)),
]
- def __init__(self, input_plugin_preprocess, plugin_preprocess,
- extra_opts=None):
- self.input_plugin_preprocess = input_plugin_preprocess
- self.plugin_preprocess = plugin_preprocess
+ def __init__(self, log=None, extra_opts=None):
+ self.log = log
self.extra_opts = extra_opts
def is_baen(self, src):
@@ -542,8 +540,10 @@ class HTMLPreProcessor(object):
unidecoder = Unidecoder()
html = unidecoder.decode(html)
- if self.plugin_preprocess:
- html = self.input_plugin_preprocess(self.extra_opts, html)
+ if getattr(self.extra_opts, 'enable_heuristics', False):
+ from calibre.ebooks.conversion.utils import HeuristicProcessor
+ preprocessor = HeuristicProcessor(self.extra_opts, self.log)
+ html = preprocessor(html)
if getattr(self.extra_opts, 'smarten_punctuation', False):
html = self.smarten_punctuation(html)
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 9825585cbf..96a9a4783d 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -11,7 +11,7 @@ from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
from calibre.utils.logging import default_log
from calibre.utils.wordcount import get_wordcount_obj
-class PreProcessor(object):
+class HeuristicProcessor(object):
def __init__(self, extra_opts=None, log=None):
self.log = default_log if log is None else log
@@ -367,7 +367,7 @@ class PreProcessor(object):
def __call__(self, html):
- self.log("********* Preprocessing HTML *********")
+ self.log("********* Heuristic processing HTML *********")
# Count the words in the document to estimate how many chapters to look for and whether
# other types of processing are attempted
diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py
index 479f852c77..ed0bf7b3ef 100644
--- a/src/calibre/ebooks/html/input.py
+++ b/src/calibre/ebooks/html/input.py
@@ -24,7 +24,6 @@ from calibre.constants import islinux, isfreebsd, iswindows
from calibre import unicode_path
from calibre.utils.localization import get_lang
from calibre.utils.filenames import ascii_filename
-from calibre.ebooks.conversion.utils import PreProcessor
class Link(object):
'''
@@ -485,9 +484,3 @@ class HTMLInput(InputFormatPlugin):
self.log.exception('Failed to read CSS file: %r'%link)
return (None, None)
return (None, raw)
-
- def heuristics(self, options, html):
- self.options = options
- preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
- return preprocessor(html)
-
diff --git a/src/calibre/ebooks/lit/input.py b/src/calibre/ebooks/lit/input.py
index d0ecf008b7..7b822b68a6 100644
--- a/src/calibre/ebooks/lit/input.py
+++ b/src/calibre/ebooks/lit/input.py
@@ -7,7 +7,7 @@ __copyright__ = '2009, Kovid Goyal '
__docformat__ = 'restructuredtext en'
from calibre.customize.conversion import InputFormatPlugin
-from calibre.ebooks.conversion.utils import PreProcessor
+from calibre.ebooks.conversion.utils import HeuristicProcessor
class LITInput(InputFormatPlugin):
@@ -51,10 +51,3 @@ class LITInput(InputFormatPlugin):
for elem in body:
ne = copy.deepcopy(elem)
pre.append(ne)
-
-
- def heuristics(self, options, html):
- self.options = options
- preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
- return preprocessor(html)
-
diff --git a/src/calibre/ebooks/lrf/input.py b/src/calibre/ebooks/lrf/input.py
index 05c8731da5..70f3c3a15a 100644
--- a/src/calibre/ebooks/lrf/input.py
+++ b/src/calibre/ebooks/lrf/input.py
@@ -12,7 +12,6 @@ from copy import deepcopy
from lxml import etree
from calibre.customize.conversion import InputFormatPlugin
-from calibre.ebooks.conversion.utils import PreProcessor
from calibre import guess_type
class Canvas(etree.XSLTExtension):
@@ -419,11 +418,3 @@ class LRFInput(InputFormatPlugin):
f.write(result)
styles.write()
return os.path.abspath('content.opf')
-
- def heuristics(self, options, html):
- self.options = options
- preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
- return preprocessor(html)
-
-
-
diff --git a/src/calibre/ebooks/pdb/input.py b/src/calibre/ebooks/pdb/input.py
index de210e0a6d..cd861216af 100644
--- a/src/calibre/ebooks/pdb/input.py
+++ b/src/calibre/ebooks/pdb/input.py
@@ -9,7 +9,6 @@ import os
from calibre.customize.conversion import InputFormatPlugin
from calibre.ebooks.pdb.header import PdbHeaderReader
from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader
-from calibre.ebooks.conversion.utils import PreProcessor
class PDBInput(InputFormatPlugin):
diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py
index 2f931d1d04..d3849bc5f5 100644
--- a/src/calibre/ebooks/rtf/input.py
+++ b/src/calibre/ebooks/rtf/input.py
@@ -7,7 +7,6 @@ import os, glob, re, textwrap
from lxml import etree
from calibre.customize.conversion import InputFormatPlugin
-from calibre.ebooks.conversion.utils import PreProcessor
border_style_map = {
'single' : 'solid',
diff --git a/src/calibre/gui2/convert/heuristics.py b/src/calibre/gui2/convert/heuristics.py
index 2b9df50457..904804f32e 100644
--- a/src/calibre/gui2/convert/heuristics.py
+++ b/src/calibre/gui2/convert/heuristics.py
@@ -21,7 +21,7 @@ class HeuristicsWidget(Widget, Ui_Form):
'italicize_common_cases', 'fix_indents',
'html_unwrap_factor', 'unwrap_lines',
'delete_blank_paragraphs', 'format_scene_breaks',
- 'dehyphenate']
+ 'dehyphenate', 'renumber_headings']
)
self.db, self.book_id = db, book_id
self.initialize_options(get_option, get_help, db, book_id)
@@ -53,6 +53,7 @@ class HeuristicsWidget(Widget, Ui_Form):
self.opt_delete_blank_paragraphs.setEnabled(state)
self.opt_format_scene_breaks.setEnabled(state)
self.opt_dehyphenate.setEnabled(state)
+ self.opt_renumber_headings.setEnabled(state)
self.opt_unwrap_lines.setEnabled(state)
if state and self.opt_unwrap_lines.checkState() == Qt.Checked:
diff --git a/src/calibre/gui2/convert/heuristics.ui b/src/calibre/gui2/convert/heuristics.ui
index e64e79e1df..c5f3c2cb3e 100644
--- a/src/calibre/gui2/convert/heuristics.ui
+++ b/src/calibre/gui2/convert/heuristics.ui
@@ -6,7 +6,7 @@
0
0
- 657
+ 811
479
@@ -80,42 +80,42 @@
- -
+
-
Delete blank lines between paragraphs
- -
+
-
Ensure scene breaks are consistently formatted
- -
+
-
Remove unnecessary hyphens
- -
+
-
Italicize common words and patterns
- -
+
-
Replace entity indents with CSS indents
- -
+
-
Qt::Vertical
@@ -141,6 +141,13 @@
+ -
+
+
+ Renumber sequences of <h1> or <h2> tags to prevent splitting
+
+
+