Add a --preprocess-html option

This commit is contained in:
Kovid Goyal 2009-05-02 12:40:29 -07:00
parent 8be2541738
commit dc0e0f26a1
7 changed files with 41 additions and 12 deletions

View File

@ -149,6 +149,18 @@ class InputFormatPlugin(Plugin):
'''
raise NotImplementedError()
def preprocess_html(self, html):
    '''
    Hook invoked by the conversion pipeline on every piece of HTML before
    that HTML is parsed. It exists so that any clean-up the markup needs
    (removing hard line breaks, for example) can be performed first.
    This implementation hands the markup back untouched.

    :param html: A unicode string
    :return: A unicode string
    '''
    return html
def convert(self, stream, options, file_ext, log, accelerators):
'''
This method must be implemented in sub-classes. It must return

View File

@ -129,6 +129,7 @@ def add_pipeline_options(parser, plumber):
'dont_split_on_page_breaks', 'chapter', 'chapter_mark',
'prefer_metadata_cover', 'remove_first_image',
'insert_metadata', 'page_breaks_before',
'preprocess_html',
]
),

View File

@ -312,6 +312,14 @@ OptionRecommendation(name='insert_metadata',
)
),
OptionRecommendation(name='preprocess_html',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Attempt to detect and correct hard line breaks and other '
'problems in the source file. This may make things worse, so use '
'with care.'
)
),
OptionRecommendation(name='read_metadata_from_opf',
recommended_value=None, level=OptionRecommendation.LOW,
@ -580,7 +588,8 @@ OptionRecommendation(name='list_recipes',
self.log('Debug input called, aborting the rest of the pipeline.')
return
if not hasattr(self.oeb, 'manifest'):
self.oeb = create_oebbook(self.log, self.oeb, self.opts)
self.oeb = create_oebbook(self.log, self.oeb, self.opts,
self.input_plugin)
pr = CompositeProgressReporter(0.34, 0.67, self.ui_reporter)
pr(0., _('Running transforms on ebook...'))
@ -652,12 +661,13 @@ OptionRecommendation(name='list_recipes',
self.opts, self.log)
self.ui_reporter(1.)
def create_oebbook(log, path_or_stream, opts, reader=None):
def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None):
'''
Create an OEBBook.
'''
from calibre.ebooks.oeb.base import OEBBook
html_preprocessor = HTMLPreProcessor()
html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html,
opts.preprocess_html)
oeb = OEBBook(log, html_preprocessor=html_preprocessor,
pretty_print=opts.pretty_print)
# Read OEB Book into OEBBook

View File

@ -151,6 +151,9 @@ class HTMLPreProcessor(object):
(re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
]
def __init__(self, input_plugin_preprocess, plugin_preprocess):
self.input_plugin_preprocess = input_plugin_preprocess
self.plugin_preprocess = plugin_preprocess
def is_baen(self, src):
return re.compile(r'<meta\s+name="Publisher"\s+content=".*?Baen.*?"',
@ -192,5 +195,8 @@ class HTMLPreProcessor(object):
html = XMLDECL_RE.sub('', html)
if self.plugin_preprocess:
html = self.input_plugin_preprocess(html)
return html

View File

@ -288,7 +288,7 @@ class HTMLInput(InputFormatPlugin):
return opfpath
from calibre.ebooks.conversion.plumber import create_oebbook
oeb = create_oebbook(log, opfpath, opts)
oeb = create_oebbook(log, opfpath, opts, self)
from calibre.ebooks.oeb.transforms.package import Package
Package(os.getcwdu())(oeb, opts)

View File

@ -19,6 +19,6 @@ class LITInput(InputFormatPlugin):
accelerators):
from calibre.ebooks.lit.reader import LitReader
from calibre.ebooks.conversion.plumber import create_oebbook
return create_oebbook(log, stream, options, reader=LitReader)
return create_oebbook(log, stream, options, self, reader=LitReader)

View File

@ -1506,7 +1506,7 @@ class OEBBook(object):
COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]')
def __init__(self, logger,
html_preprocessor=HTMLPreProcessor(),
html_preprocessor,
css_preprocessor=CSSPreProcessor(),
encoding='utf-8', pretty_print=False):
"""Create empty book. Arguments: