Add a --preprocess-html option

This commit is contained in:
Kovid Goyal 2009-05-02 12:40:29 -07:00
parent 8be2541738
commit dc0e0f26a1
7 changed files with 41 additions and 12 deletions

View File

@ -149,6 +149,18 @@ class InputFormatPlugin(Plugin):
''' '''
raise NotImplementedError() raise NotImplementedError()
def preprocess_html(self, html):
    '''
    This method is called by the conversion pipeline on HTML before it is
    parsed, but only when the ``preprocess_html`` conversion option is
    enabled. It is meant to be used to do any required preprocessing on
    the HTML, like removing hard line breaks, etc.

    The default implementation returns the HTML unchanged; input plugins
    that need preprocessing may override it.

    :param html: A unicode string
    :return: A unicode string
    '''
    return html
def convert(self, stream, options, file_ext, log, accelerators): def convert(self, stream, options, file_ext, log, accelerators):
''' '''
This method must be implemented in sub-classes. It must return This method must be implemented in sub-classes. It must return

View File

@ -129,6 +129,7 @@ def add_pipeline_options(parser, plumber):
'dont_split_on_page_breaks', 'chapter', 'chapter_mark', 'dont_split_on_page_breaks', 'chapter', 'chapter_mark',
'prefer_metadata_cover', 'remove_first_image', 'prefer_metadata_cover', 'remove_first_image',
'insert_metadata', 'page_breaks_before', 'insert_metadata', 'page_breaks_before',
'preprocess_html',
] ]
), ),

View File

@ -312,6 +312,14 @@ OptionRecommendation(name='insert_metadata',
) )
), ),
OptionRecommendation(name='preprocess_html',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Attempt to detect and correct hard line breaks and other '
'problems in the source file. This may make things worse, so use '
'with care.'
)
),
OptionRecommendation(name='read_metadata_from_opf', OptionRecommendation(name='read_metadata_from_opf',
recommended_value=None, level=OptionRecommendation.LOW, recommended_value=None, level=OptionRecommendation.LOW,
@ -580,7 +588,8 @@ OptionRecommendation(name='list_recipes',
self.log('Debug input called, aborting the rest of the pipeline.') self.log('Debug input called, aborting the rest of the pipeline.')
return return
if not hasattr(self.oeb, 'manifest'): if not hasattr(self.oeb, 'manifest'):
self.oeb = create_oebbook(self.log, self.oeb, self.opts) self.oeb = create_oebbook(self.log, self.oeb, self.opts,
self.input_plugin)
pr = CompositeProgressReporter(0.34, 0.67, self.ui_reporter) pr = CompositeProgressReporter(0.34, 0.67, self.ui_reporter)
pr(0., _('Running transforms on ebook...')) pr(0., _('Running transforms on ebook...'))
@ -652,12 +661,13 @@ OptionRecommendation(name='list_recipes',
self.opts, self.log) self.opts, self.log)
self.ui_reporter(1.) self.ui_reporter(1.)
def create_oebbook(log, path_or_stream, opts, reader=None): def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None):
''' '''
Create an OEBBook. Create an OEBBook.
''' '''
from calibre.ebooks.oeb.base import OEBBook from calibre.ebooks.oeb.base import OEBBook
html_preprocessor = HTMLPreProcessor() html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html,
opts.preprocess_html)
oeb = OEBBook(log, html_preprocessor=html_preprocessor, oeb = OEBBook(log, html_preprocessor=html_preprocessor,
pretty_print=opts.pretty_print) pretty_print=opts.pretty_print)
# Read OEB Book into OEBBook # Read OEB Book into OEBBook

View File

@ -151,6 +151,9 @@ class HTMLPreProcessor(object):
(re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL), (re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)), lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
] ]
def __init__(self, input_plugin_preprocess, plugin_preprocess):
self.input_plugin_preprocess = input_plugin_preprocess
self.plugin_preprocess = plugin_preprocess
def is_baen(self, src): def is_baen(self, src):
return re.compile(r'<meta\s+name="Publisher"\s+content=".*?Baen.*?"', return re.compile(r'<meta\s+name="Publisher"\s+content=".*?Baen.*?"',
@ -192,5 +195,8 @@ class HTMLPreProcessor(object):
html = XMLDECL_RE.sub('', html) html = XMLDECL_RE.sub('', html)
if self.plugin_preprocess:
html = self.input_plugin_preprocess(html)
return html return html

View File

@ -288,7 +288,7 @@ class HTMLInput(InputFormatPlugin):
return opfpath return opfpath
from calibre.ebooks.conversion.plumber import create_oebbook from calibre.ebooks.conversion.plumber import create_oebbook
oeb = create_oebbook(log, opfpath, opts) oeb = create_oebbook(log, opfpath, opts, self)
from calibre.ebooks.oeb.transforms.package import Package from calibre.ebooks.oeb.transforms.package import Package
Package(os.getcwdu())(oeb, opts) Package(os.getcwdu())(oeb, opts)

View File

@ -19,6 +19,6 @@ class LITInput(InputFormatPlugin):
accelerators): accelerators):
from calibre.ebooks.lit.reader import LitReader from calibre.ebooks.lit.reader import LitReader
from calibre.ebooks.conversion.plumber import create_oebbook from calibre.ebooks.conversion.plumber import create_oebbook
return create_oebbook(log, stream, options, reader=LitReader) return create_oebbook(log, stream, options, self, reader=LitReader)

View File

@ -1506,7 +1506,7 @@ class OEBBook(object):
COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]') COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]')
def __init__(self, logger, def __init__(self, logger,
html_preprocessor=HTMLPreProcessor(), html_preprocessor,
css_preprocessor=CSSPreProcessor(), css_preprocessor=CSSPreProcessor(),
encoding='utf-8', pretty_print=False): encoding='utf-8', pretty_print=False):
"""Create empty book. Arguments: """Create empty book. Arguments: