Enable heuristic processing over the entire conversion pipeline when the option is enabled.

2025-07-09 03:04:10 -04:00 · 2011-01-15 12:35:02 -05:00 · 2011-01-15 12:35:02 -05:00 · 64796696ae
commit 64796696ae
parent d6256ef452
9 changed files with 11 additions and 50 deletions
--- a/src/calibre/customize/conversion.py
+++ b/src/calibre/customize/conversion.py
@ -160,18 +160,6 @@ class InputFormatPlugin(Plugin):
        '''
        raise NotImplementedError()
-    def heuristics(self, opts, html):
-        '''
-        This method is called by the conversion pipeline on all HTML before it
-        is parsed. It is meant to be used to do any required preprocessing on
-        the HTML, like removing hard line breaks, etc.
-        :param html: A unicode string
-        :return: A unicode string
-        '''
-        return html
    def convert(self, stream, options, file_ext, log, accelerators):
        '''
        This method must be implemented in sub-classes. It must return
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@ -899,7 +899,6 @@ OptionRecommendation(name='sr3_replace',
                self.opts_to_mi(self.user_metadata)
            if not hasattr(self.oeb, 'manifest'):
                self.oeb = create_oebbook(self.log, self.oeb, self.opts,
-                        self.input_plugin,
                        encoding=self.input_plugin.output_encoding)
            self.input_plugin.postprocess_book(self.oeb, self.opts, self.log)
            self.opts.is_image_collection = self.input_plugin.is_image_collection
@ -1009,14 +1008,13 @@ OptionRecommendation(name='sr3_replace',
        self.log(self.output_fmt.upper(), 'output written to', self.output)
        self.flush()
-def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None,
+def create_oebbook(log, path_or_stream, opts, reader=None,
        encoding='utf-8', populate=True):
    '''
    Create an OEBBook.
    '''
    from calibre.ebooks.oeb.base import OEBBook
-    html_preprocessor = HTMLPreProcessor(input_plugin.heuristics,
+    html_preprocessor = HTMLPreProcessor(log, opts)
-            opts.enable_heuristics, opts)
    if not encoding:
        encoding = None
    oeb = OEBBook(log, html_preprocessor,
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@ -397,10 +397,8 @@ class HTMLPreProcessor(object):
                     (re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
                      lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
                     ]
-    def __init__(self, input_plugin_preprocess, plugin_preprocess,
+    def __init__(self, log=None, extra_opts=None):
-            extra_opts=None):
+        self.log = log
-        self.input_plugin_preprocess = input_plugin_preprocess
-        self.plugin_preprocess = plugin_preprocess
        self.extra_opts = extra_opts
    def is_baen(self, src):
@ -542,8 +540,10 @@ class HTMLPreProcessor(object):
            unidecoder = Unidecoder()
            html = unidecoder.decode(html)
-        if self.plugin_preprocess:
+        if getattr(self.extra_opts, 'enable_heuristics', False):
-            html = self.input_plugin_preprocess(self.extra_opts, html)
+            from calibre.ebooks.conversion.utils import HeuristicProcessor
+            preprocessor = HeuristicProcessor(self.extra_opts, self.log)
+            html = preprocessor(html)
        if getattr(self.extra_opts, 'smarten_punctuation', False):
            html = self.smarten_punctuation(html)
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@ -11,7 +11,7 @@ from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
 from calibre.utils.logging import default_log
 from calibre.utils.wordcount import get_wordcount_obj
-class PreProcessor(object):
+class HeuristicProcessor(object):
    def __init__(self, extra_opts=None, log=None):
        self.log = default_log if log is None else log
@ -366,7 +366,7 @@ class PreProcessor(object):
    def __call__(self, html):
-        self.log("*********  Preprocessing HTML  *********")
+        self.log("*********  Heuristic processing HTML  *********")
        # Count the words in the document to estimate how many chapters to look for and whether
        # other types of processing are attempted
--- a/src/calibre/ebooks/html/input.py
+++ b/src/calibre/ebooks/html/input.py
@ -24,7 +24,6 @@ from calibre.constants import islinux, isfreebsd, iswindows
 from calibre import unicode_path
 from calibre.utils.localization import get_lang
 from calibre.utils.filenames import ascii_filename
-from calibre.ebooks.conversion.utils import PreProcessor
 class Link(object):
    '''
@ -485,9 +484,3 @@ class HTMLInput(InputFormatPlugin):
            self.log.exception('Failed to read CSS file: %r'%link)
            return (None, None)
        return (None, raw)
-    def heuristics(self, options, html):
-        self.options = options
-        preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
-        return preprocessor(html)
--- a/src/calibre/ebooks/lit/input.py
+++ b/src/calibre/ebooks/lit/input.py
@ -7,7 +7,7 @@ __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 from calibre.customize.conversion import InputFormatPlugin
-from calibre.ebooks.conversion.utils import PreProcessor
+from calibre.ebooks.conversion.utils import HeuristicProcessor
 class LITInput(InputFormatPlugin):
@ -51,10 +51,3 @@ class LITInput(InputFormatPlugin):
                    for elem in body:
                        ne = copy.deepcopy(elem)
                        pre.append(ne)
-    def heuristics(self, options, html):
-        self.options = options
-        preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
-        return preprocessor(html)
--- a/src/calibre/ebooks/lrf/input.py
+++ b/src/calibre/ebooks/lrf/input.py
@ -12,7 +12,6 @@ from copy import deepcopy
 from lxml import etree
 from calibre.customize.conversion import InputFormatPlugin
-from calibre.ebooks.conversion.utils import PreProcessor
 from calibre import guess_type
 class Canvas(etree.XSLTExtension):
@ -419,11 +418,3 @@ class LRFInput(InputFormatPlugin):
            f.write(result)
        styles.write()
        return os.path.abspath('content.opf')
-    def heuristics(self, options, html):
-        self.options = options
-        preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
-        return preprocessor(html)
--- a/src/calibre/ebooks/pdb/input.py
+++ b/src/calibre/ebooks/pdb/input.py
@ -9,7 +9,6 @@ import os
 from calibre.customize.conversion import InputFormatPlugin
 from calibre.ebooks.pdb.header import PdbHeaderReader
 from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader
-from calibre.ebooks.conversion.utils import PreProcessor
 class PDBInput(InputFormatPlugin):
--- a/src/calibre/ebooks/rtf/input.py
+++ b/src/calibre/ebooks/rtf/input.py
@ -7,7 +7,6 @@ import os, glob, re, textwrap
 from lxml import etree
 from calibre.customize.conversion import InputFormatPlugin
-from calibre.ebooks.conversion.utils import PreProcessor
 border_style_map = {
        'single' : 'solid',