From 64796696ae0bec276c798bcc12e8b6d10a878788 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 15 Jan 2011 12:35:02 -0500 Subject: [PATCH] Enable heuristic processing over the entire conversion pipeline when the option is enabled. --- src/calibre/customize/conversion.py | 12 ------------ src/calibre/ebooks/conversion/plumber.py | 6 ++---- src/calibre/ebooks/conversion/preprocess.py | 12 ++++++------ src/calibre/ebooks/conversion/utils.py | 4 ++-- src/calibre/ebooks/html/input.py | 7 ------- src/calibre/ebooks/lit/input.py | 9 +-------- src/calibre/ebooks/lrf/input.py | 9 --------- src/calibre/ebooks/pdb/input.py | 1 - src/calibre/ebooks/rtf/input.py | 1 - 9 files changed, 11 insertions(+), 50 deletions(-) diff --git a/src/calibre/customize/conversion.py b/src/calibre/customize/conversion.py index a9e573ffa0..b77ac81587 100644 --- a/src/calibre/customize/conversion.py +++ b/src/calibre/customize/conversion.py @@ -160,18 +160,6 @@ class InputFormatPlugin(Plugin): ''' raise NotImplementedError() - def heuristics(self, opts, html): - ''' - This method is called by the conversion pipeline on all HTML before it - is parsed. It is meant to be used to do any required preprocessing on - the HTML, like removing hard line breaks, etc. - - :param html: A unicode string - :return: A unicode string - ''' - return html - - def convert(self, stream, options, file_ext, log, accelerators): ''' This method must be implemented in sub-classes. 
It must return diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index b8c45dfa14..249f848661 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -899,7 +899,6 @@ OptionRecommendation(name='sr3_replace', self.opts_to_mi(self.user_metadata) if not hasattr(self.oeb, 'manifest'): self.oeb = create_oebbook(self.log, self.oeb, self.opts, - self.input_plugin, encoding=self.input_plugin.output_encoding) self.input_plugin.postprocess_book(self.oeb, self.opts, self.log) self.opts.is_image_collection = self.input_plugin.is_image_collection @@ -1009,14 +1008,13 @@ OptionRecommendation(name='sr3_replace', self.log(self.output_fmt.upper(), 'output written to', self.output) self.flush() -def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None, +def create_oebbook(log, path_or_stream, opts, reader=None, encoding='utf-8', populate=True): ''' Create an OEBBook. ''' from calibre.ebooks.oeb.base import OEBBook - html_preprocessor = HTMLPreProcessor(input_plugin.heuristics, - opts.enable_heuristics, opts) + html_preprocessor = HTMLPreProcessor(log, opts) if not encoding: encoding = None oeb = OEBBook(log, html_preprocessor, diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 35a311d58f..abaff77f33 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -397,10 +397,8 @@ class HTMLPreProcessor(object): (re.compile('<]*?id=subtitle[^><]*?>(.*?)', re.IGNORECASE|re.DOTALL), lambda match : '

%s

'%(match.group(1),)), ] - def __init__(self, input_plugin_preprocess, plugin_preprocess, - extra_opts=None): - self.input_plugin_preprocess = input_plugin_preprocess - self.plugin_preprocess = plugin_preprocess + def __init__(self, log=None, extra_opts=None): + self.log = log self.extra_opts = extra_opts def is_baen(self, src): @@ -542,8 +540,10 @@ class HTMLPreProcessor(object): unidecoder = Unidecoder() html = unidecoder.decode(html) - if self.plugin_preprocess: - html = self.input_plugin_preprocess(self.extra_opts, html) + if getattr(self.extra_opts, 'enable_heuristics', False): + from calibre.ebooks.conversion.utils import HeuristicProcessor + preprocessor = HeuristicProcessor(self.extra_opts, self.log) + html = preprocessor(html) if getattr(self.extra_opts, 'smarten_punctuation', False): html = self.smarten_punctuation(html) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 305346d496..48806e78e7 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -11,7 +11,7 @@ from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator from calibre.utils.logging import default_log from calibre.utils.wordcount import get_wordcount_obj -class PreProcessor(object): +class HeuristicProcessor(object): def __init__(self, extra_opts=None, log=None): self.log = default_log if log is None else log @@ -366,7 +366,7 @@ class PreProcessor(object): def __call__(self, html): - self.log("********* Preprocessing HTML *********") + self.log("********* Heuristic processing HTML *********") # Count the words in the document to estimate how many chapters to look for and whether # other types of processing are attempted diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py index 479f852c77..ed0bf7b3ef 100644 --- a/src/calibre/ebooks/html/input.py +++ b/src/calibre/ebooks/html/input.py @@ -24,7 +24,6 @@ from calibre.constants import islinux, isfreebsd, 
iswindows from calibre import unicode_path from calibre.utils.localization import get_lang from calibre.utils.filenames import ascii_filename -from calibre.ebooks.conversion.utils import PreProcessor class Link(object): ''' @@ -485,9 +484,3 @@ class HTMLInput(InputFormatPlugin): self.log.exception('Failed to read CSS file: %r'%link) return (None, None) return (None, raw) - - def heuristics(self, options, html): - self.options = options - preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None)) - return preprocessor(html) - diff --git a/src/calibre/ebooks/lit/input.py b/src/calibre/ebooks/lit/input.py index d0ecf008b7..7b822b68a6 100644 --- a/src/calibre/ebooks/lit/input.py +++ b/src/calibre/ebooks/lit/input.py @@ -7,7 +7,7 @@ __copyright__ = '2009, Kovid Goyal ' __docformat__ = 'restructuredtext en' from calibre.customize.conversion import InputFormatPlugin -from calibre.ebooks.conversion.utils import PreProcessor +from calibre.ebooks.conversion.utils import HeuristicProcessor class LITInput(InputFormatPlugin): @@ -51,10 +51,3 @@ class LITInput(InputFormatPlugin): for elem in body: ne = copy.deepcopy(elem) pre.append(ne) - - - def heuristics(self, options, html): - self.options = options - preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None)) - return preprocessor(html) - diff --git a/src/calibre/ebooks/lrf/input.py b/src/calibre/ebooks/lrf/input.py index 05c8731da5..70f3c3a15a 100644 --- a/src/calibre/ebooks/lrf/input.py +++ b/src/calibre/ebooks/lrf/input.py @@ -12,7 +12,6 @@ from copy import deepcopy from lxml import etree from calibre.customize.conversion import InputFormatPlugin -from calibre.ebooks.conversion.utils import PreProcessor from calibre import guess_type class Canvas(etree.XSLTExtension): @@ -419,11 +418,3 @@ class LRFInput(InputFormatPlugin): f.write(result) styles.write() return os.path.abspath('content.opf') - - def heuristics(self, options, html): - self.options = options - preprocessor = 
PreProcessor(self.options, log=getattr(self, 'log', None)) - return preprocessor(html) - - - diff --git a/src/calibre/ebooks/pdb/input.py b/src/calibre/ebooks/pdb/input.py index de210e0a6d..cd861216af 100644 --- a/src/calibre/ebooks/pdb/input.py +++ b/src/calibre/ebooks/pdb/input.py @@ -9,7 +9,6 @@ import os from calibre.customize.conversion import InputFormatPlugin from calibre.ebooks.pdb.header import PdbHeaderReader from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader -from calibre.ebooks.conversion.utils import PreProcessor class PDBInput(InputFormatPlugin): diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py index 2f931d1d04..d3849bc5f5 100644 --- a/src/calibre/ebooks/rtf/input.py +++ b/src/calibre/ebooks/rtf/input.py @@ -7,7 +7,6 @@ import os, glob, re, textwrap from lxml import etree from calibre.customize.conversion import InputFormatPlugin -from calibre.ebooks.conversion.utils import PreProcessor border_style_map = { 'single' : 'solid',