mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Enable heuristic processing over the entire conversion pipeline when the option is enabled.
This commit is contained in:
parent
d6256ef452
commit
64796696ae
@ -160,18 +160,6 @@ class InputFormatPlugin(Plugin):
|
|||||||
'''
|
'''
|
||||||
raise NotImplementedError()
|
raise NotImplementedError()
|
||||||
|
|
||||||
def heuristics(self, opts, html):
|
|
||||||
'''
|
|
||||||
This method is called by the conversion pipeline on all HTML before it
|
|
||||||
is parsed. It is meant to be used to do any required preprocessing on
|
|
||||||
the HTML, like removing hard line breaks, etc.
|
|
||||||
|
|
||||||
:param html: A unicode string
|
|
||||||
:return: A unicode string
|
|
||||||
'''
|
|
||||||
return html
|
|
||||||
|
|
||||||
|
|
||||||
def convert(self, stream, options, file_ext, log, accelerators):
|
def convert(self, stream, options, file_ext, log, accelerators):
|
||||||
'''
|
'''
|
||||||
This method must be implemented in sub-classes. It must return
|
This method must be implemented in sub-classes. It must return
|
||||||
|
@ -899,7 +899,6 @@ OptionRecommendation(name='sr3_replace',
|
|||||||
self.opts_to_mi(self.user_metadata)
|
self.opts_to_mi(self.user_metadata)
|
||||||
if not hasattr(self.oeb, 'manifest'):
|
if not hasattr(self.oeb, 'manifest'):
|
||||||
self.oeb = create_oebbook(self.log, self.oeb, self.opts,
|
self.oeb = create_oebbook(self.log, self.oeb, self.opts,
|
||||||
self.input_plugin,
|
|
||||||
encoding=self.input_plugin.output_encoding)
|
encoding=self.input_plugin.output_encoding)
|
||||||
self.input_plugin.postprocess_book(self.oeb, self.opts, self.log)
|
self.input_plugin.postprocess_book(self.oeb, self.opts, self.log)
|
||||||
self.opts.is_image_collection = self.input_plugin.is_image_collection
|
self.opts.is_image_collection = self.input_plugin.is_image_collection
|
||||||
@ -1009,14 +1008,13 @@ OptionRecommendation(name='sr3_replace',
|
|||||||
self.log(self.output_fmt.upper(), 'output written to', self.output)
|
self.log(self.output_fmt.upper(), 'output written to', self.output)
|
||||||
self.flush()
|
self.flush()
|
||||||
|
|
||||||
def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None,
|
def create_oebbook(log, path_or_stream, opts, reader=None,
|
||||||
encoding='utf-8', populate=True):
|
encoding='utf-8', populate=True):
|
||||||
'''
|
'''
|
||||||
Create an OEBBook.
|
Create an OEBBook.
|
||||||
'''
|
'''
|
||||||
from calibre.ebooks.oeb.base import OEBBook
|
from calibre.ebooks.oeb.base import OEBBook
|
||||||
html_preprocessor = HTMLPreProcessor(input_plugin.heuristics,
|
html_preprocessor = HTMLPreProcessor(log, opts)
|
||||||
opts.enable_heuristics, opts)
|
|
||||||
if not encoding:
|
if not encoding:
|
||||||
encoding = None
|
encoding = None
|
||||||
oeb = OEBBook(log, html_preprocessor,
|
oeb = OEBBook(log, html_preprocessor,
|
||||||
|
@ -397,10 +397,8 @@ class HTMLPreProcessor(object):
|
|||||||
(re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
|
(re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
|
||||||
lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
|
lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
|
||||||
]
|
]
|
||||||
def __init__(self, input_plugin_preprocess, plugin_preprocess,
|
def __init__(self, log=None, extra_opts=None):
|
||||||
extra_opts=None):
|
self.log = log
|
||||||
self.input_plugin_preprocess = input_plugin_preprocess
|
|
||||||
self.plugin_preprocess = plugin_preprocess
|
|
||||||
self.extra_opts = extra_opts
|
self.extra_opts = extra_opts
|
||||||
|
|
||||||
def is_baen(self, src):
|
def is_baen(self, src):
|
||||||
@ -542,8 +540,10 @@ class HTMLPreProcessor(object):
|
|||||||
unidecoder = Unidecoder()
|
unidecoder = Unidecoder()
|
||||||
html = unidecoder.decode(html)
|
html = unidecoder.decode(html)
|
||||||
|
|
||||||
if self.plugin_preprocess:
|
if getattr(self.extra_opts, 'enable_heuristics', False):
|
||||||
html = self.input_plugin_preprocess(self.extra_opts, html)
|
from calibre.ebooks.conversion.utils import HeuristicProcessor
|
||||||
|
preprocessor = HeuristicProcessor(self.extra_opts, self.log)
|
||||||
|
html = preprocessor(html)
|
||||||
|
|
||||||
if getattr(self.extra_opts, 'smarten_punctuation', False):
|
if getattr(self.extra_opts, 'smarten_punctuation', False):
|
||||||
html = self.smarten_punctuation(html)
|
html = self.smarten_punctuation(html)
|
||||||
|
@ -11,7 +11,7 @@ from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
|
|||||||
from calibre.utils.logging import default_log
|
from calibre.utils.logging import default_log
|
||||||
from calibre.utils.wordcount import get_wordcount_obj
|
from calibre.utils.wordcount import get_wordcount_obj
|
||||||
|
|
||||||
class PreProcessor(object):
|
class HeuristicProcessor(object):
|
||||||
|
|
||||||
def __init__(self, extra_opts=None, log=None):
|
def __init__(self, extra_opts=None, log=None):
|
||||||
self.log = default_log if log is None else log
|
self.log = default_log if log is None else log
|
||||||
@ -366,7 +366,7 @@ class PreProcessor(object):
|
|||||||
|
|
||||||
|
|
||||||
def __call__(self, html):
|
def __call__(self, html):
|
||||||
self.log("********* Preprocessing HTML *********")
|
self.log("********* Heuristic processing HTML *********")
|
||||||
|
|
||||||
# Count the words in the document to estimate how many chapters to look for and whether
|
# Count the words in the document to estimate how many chapters to look for and whether
|
||||||
# other types of processing are attempted
|
# other types of processing are attempted
|
||||||
|
@ -24,7 +24,6 @@ from calibre.constants import islinux, isfreebsd, iswindows
|
|||||||
from calibre import unicode_path
|
from calibre import unicode_path
|
||||||
from calibre.utils.localization import get_lang
|
from calibre.utils.localization import get_lang
|
||||||
from calibre.utils.filenames import ascii_filename
|
from calibre.utils.filenames import ascii_filename
|
||||||
from calibre.ebooks.conversion.utils import PreProcessor
|
|
||||||
|
|
||||||
class Link(object):
|
class Link(object):
|
||||||
'''
|
'''
|
||||||
@ -485,9 +484,3 @@ class HTMLInput(InputFormatPlugin):
|
|||||||
self.log.exception('Failed to read CSS file: %r'%link)
|
self.log.exception('Failed to read CSS file: %r'%link)
|
||||||
return (None, None)
|
return (None, None)
|
||||||
return (None, raw)
|
return (None, raw)
|
||||||
|
|
||||||
def heuristics(self, options, html):
|
|
||||||
self.options = options
|
|
||||||
preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
|
|
||||||
return preprocessor(html)
|
|
||||||
|
|
||||||
|
@ -7,7 +7,7 @@ __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
|||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
from calibre.customize.conversion import InputFormatPlugin
|
from calibre.customize.conversion import InputFormatPlugin
|
||||||
from calibre.ebooks.conversion.utils import PreProcessor
|
from calibre.ebooks.conversion.utils import HeuristicProcessor
|
||||||
|
|
||||||
|
|
||||||
class LITInput(InputFormatPlugin):
|
class LITInput(InputFormatPlugin):
|
||||||
@ -51,10 +51,3 @@ class LITInput(InputFormatPlugin):
|
|||||||
for elem in body:
|
for elem in body:
|
||||||
ne = copy.deepcopy(elem)
|
ne = copy.deepcopy(elem)
|
||||||
pre.append(ne)
|
pre.append(ne)
|
||||||
|
|
||||||
|
|
||||||
def heuristics(self, options, html):
|
|
||||||
self.options = options
|
|
||||||
preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
|
|
||||||
return preprocessor(html)
|
|
||||||
|
|
||||||
|
@ -12,7 +12,6 @@ from copy import deepcopy
|
|||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
|
||||||
from calibre.customize.conversion import InputFormatPlugin
|
from calibre.customize.conversion import InputFormatPlugin
|
||||||
from calibre.ebooks.conversion.utils import PreProcessor
|
|
||||||
from calibre import guess_type
|
from calibre import guess_type
|
||||||
|
|
||||||
class Canvas(etree.XSLTExtension):
|
class Canvas(etree.XSLTExtension):
|
||||||
@ -419,11 +418,3 @@ class LRFInput(InputFormatPlugin):
|
|||||||
f.write(result)
|
f.write(result)
|
||||||
styles.write()
|
styles.write()
|
||||||
return os.path.abspath('content.opf')
|
return os.path.abspath('content.opf')
|
||||||
|
|
||||||
def heuristics(self, options, html):
|
|
||||||
self.options = options
|
|
||||||
preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
|
|
||||||
return preprocessor(html)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -9,7 +9,6 @@ import os
|
|||||||
from calibre.customize.conversion import InputFormatPlugin
|
from calibre.customize.conversion import InputFormatPlugin
|
||||||
from calibre.ebooks.pdb.header import PdbHeaderReader
|
from calibre.ebooks.pdb.header import PdbHeaderReader
|
||||||
from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader
|
from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader
|
||||||
from calibre.ebooks.conversion.utils import PreProcessor
|
|
||||||
|
|
||||||
class PDBInput(InputFormatPlugin):
|
class PDBInput(InputFormatPlugin):
|
||||||
|
|
||||||
|
@ -7,7 +7,6 @@ import os, glob, re, textwrap
|
|||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
|
||||||
from calibre.customize.conversion import InputFormatPlugin
|
from calibre.customize.conversion import InputFormatPlugin
|
||||||
from calibre.ebooks.conversion.utils import PreProcessor
|
|
||||||
|
|
||||||
border_style_map = {
|
border_style_map = {
|
||||||
'single' : 'solid',
|
'single' : 'solid',
|
||||||
|
Loading…
x
Reference in New Issue
Block a user