mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Enable heuristic processing over the entire conversion pipeline when the option is enabled.
This commit is contained in:
parent
d6256ef452
commit
64796696ae
@ -160,18 +160,6 @@ class InputFormatPlugin(Plugin):
|
||||
'''
|
||||
raise NotImplementedError()
|
||||
|
||||
def heuristics(self, opts, html):
|
||||
'''
|
||||
This method is called by the conversion pipeline on all HTML before it
|
||||
is parsed. It is meant to be used to do any required preprocessing on
|
||||
the HTML, like removing hard line breaks, etc.
|
||||
|
||||
:param html: A unicode string
|
||||
:return: A unicode string
|
||||
'''
|
||||
return html
|
||||
|
||||
|
||||
def convert(self, stream, options, file_ext, log, accelerators):
|
||||
'''
|
||||
This method must be implemented in sub-classes. It must return
|
||||
|
@ -899,7 +899,6 @@ OptionRecommendation(name='sr3_replace',
|
||||
self.opts_to_mi(self.user_metadata)
|
||||
if not hasattr(self.oeb, 'manifest'):
|
||||
self.oeb = create_oebbook(self.log, self.oeb, self.opts,
|
||||
self.input_plugin,
|
||||
encoding=self.input_plugin.output_encoding)
|
||||
self.input_plugin.postprocess_book(self.oeb, self.opts, self.log)
|
||||
self.opts.is_image_collection = self.input_plugin.is_image_collection
|
||||
@ -1009,14 +1008,13 @@ OptionRecommendation(name='sr3_replace',
|
||||
self.log(self.output_fmt.upper(), 'output written to', self.output)
|
||||
self.flush()
|
||||
|
||||
def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None,
|
||||
def create_oebbook(log, path_or_stream, opts, reader=None,
|
||||
encoding='utf-8', populate=True):
|
||||
'''
|
||||
Create an OEBBook.
|
||||
'''
|
||||
from calibre.ebooks.oeb.base import OEBBook
|
||||
html_preprocessor = HTMLPreProcessor(input_plugin.heuristics,
|
||||
opts.enable_heuristics, opts)
|
||||
html_preprocessor = HTMLPreProcessor(log, opts)
|
||||
if not encoding:
|
||||
encoding = None
|
||||
oeb = OEBBook(log, html_preprocessor,
|
||||
|
@ -397,10 +397,8 @@ class HTMLPreProcessor(object):
|
||||
(re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
|
||||
lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
|
||||
]
|
||||
def __init__(self, input_plugin_preprocess, plugin_preprocess,
|
||||
extra_opts=None):
|
||||
self.input_plugin_preprocess = input_plugin_preprocess
|
||||
self.plugin_preprocess = plugin_preprocess
|
||||
def __init__(self, log=None, extra_opts=None):
|
||||
self.log = log
|
||||
self.extra_opts = extra_opts
|
||||
|
||||
def is_baen(self, src):
|
||||
@ -542,8 +540,10 @@ class HTMLPreProcessor(object):
|
||||
unidecoder = Unidecoder()
|
||||
html = unidecoder.decode(html)
|
||||
|
||||
if self.plugin_preprocess:
|
||||
html = self.input_plugin_preprocess(self.extra_opts, html)
|
||||
if getattr(self.extra_opts, 'enable_heuristics', False):
|
||||
from calibre.ebooks.conversion.utils import HeuristicProcessor
|
||||
preprocessor = HeuristicProcessor(self.extra_opts, self.log)
|
||||
html = preprocessor(html)
|
||||
|
||||
if getattr(self.extra_opts, 'smarten_punctuation', False):
|
||||
html = self.smarten_punctuation(html)
|
||||
|
@ -11,7 +11,7 @@ from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
|
||||
from calibre.utils.logging import default_log
|
||||
from calibre.utils.wordcount import get_wordcount_obj
|
||||
|
||||
class PreProcessor(object):
|
||||
class HeuristicProcessor(object):
|
||||
|
||||
def __init__(self, extra_opts=None, log=None):
|
||||
self.log = default_log if log is None else log
|
||||
@ -366,7 +366,7 @@ class PreProcessor(object):
|
||||
|
||||
|
||||
def __call__(self, html):
|
||||
self.log("********* Preprocessing HTML *********")
|
||||
self.log("********* Heuristic processing HTML *********")
|
||||
|
||||
# Count the words in the document to estimate how many chapters to look for and whether
|
||||
# other types of processing are attempted
|
||||
|
@ -24,7 +24,6 @@ from calibre.constants import islinux, isfreebsd, iswindows
|
||||
from calibre import unicode_path
|
||||
from calibre.utils.localization import get_lang
|
||||
from calibre.utils.filenames import ascii_filename
|
||||
from calibre.ebooks.conversion.utils import PreProcessor
|
||||
|
||||
class Link(object):
|
||||
'''
|
||||
@ -485,9 +484,3 @@ class HTMLInput(InputFormatPlugin):
|
||||
self.log.exception('Failed to read CSS file: %r'%link)
|
||||
return (None, None)
|
||||
return (None, raw)
|
||||
|
||||
def heuristics(self, options, html):
|
||||
self.options = options
|
||||
preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
|
||||
return preprocessor(html)
|
||||
|
||||
|
@ -7,7 +7,7 @@ __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
from calibre.ebooks.conversion.utils import PreProcessor
|
||||
from calibre.ebooks.conversion.utils import HeuristicProcessor
|
||||
|
||||
|
||||
class LITInput(InputFormatPlugin):
|
||||
@ -51,10 +51,3 @@ class LITInput(InputFormatPlugin):
|
||||
for elem in body:
|
||||
ne = copy.deepcopy(elem)
|
||||
pre.append(ne)
|
||||
|
||||
|
||||
def heuristics(self, options, html):
|
||||
self.options = options
|
||||
preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
|
||||
return preprocessor(html)
|
||||
|
||||
|
@ -12,7 +12,6 @@ from copy import deepcopy
|
||||
from lxml import etree
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
from calibre.ebooks.conversion.utils import PreProcessor
|
||||
from calibre import guess_type
|
||||
|
||||
class Canvas(etree.XSLTExtension):
|
||||
@ -419,11 +418,3 @@ class LRFInput(InputFormatPlugin):
|
||||
f.write(result)
|
||||
styles.write()
|
||||
return os.path.abspath('content.opf')
|
||||
|
||||
def heuristics(self, options, html):
|
||||
self.options = options
|
||||
preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
|
||||
return preprocessor(html)
|
||||
|
||||
|
||||
|
||||
|
@ -9,7 +9,6 @@ import os
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
from calibre.ebooks.pdb.header import PdbHeaderReader
|
||||
from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader
|
||||
from calibre.ebooks.conversion.utils import PreProcessor
|
||||
|
||||
class PDBInput(InputFormatPlugin):
|
||||
|
||||
|
@ -7,7 +7,6 @@ import os, glob, re, textwrap
|
||||
from lxml import etree
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
from calibre.ebooks.conversion.utils import PreProcessor
|
||||
|
||||
border_style_map = {
|
||||
'single' : 'solid',
|
||||
|
Loading…
x
Reference in New Issue
Block a user