Add a --preprocess-html option

This commit is contained in:
Kovid Goyal 2009-05-02 12:40:29 -07:00
parent 8be2541738
commit dc0e0f26a1
7 changed files with 41 additions and 12 deletions

View File

@ -149,6 +149,18 @@ class InputFormatPlugin(Plugin):
''' '''
raise NotImplementedError() raise NotImplementedError()
def preprocess_html(self, html):
'''
This method is called by the conversion pipeline on all HTML before it
is parsed. It is meant to be used to do any required preprocessing on
the HTML, like removing hard line breaks, etc.
:param html: A unicode string
:return: A unicode string
'''
return html
def convert(self, stream, options, file_ext, log, accelerators): def convert(self, stream, options, file_ext, log, accelerators):
''' '''
This method must be implemented in sub-classes. It must return This method must be implemented in sub-classes. It must return

View File

@ -129,6 +129,7 @@ def add_pipeline_options(parser, plumber):
'dont_split_on_page_breaks', 'chapter', 'chapter_mark', 'dont_split_on_page_breaks', 'chapter', 'chapter_mark',
'prefer_metadata_cover', 'remove_first_image', 'prefer_metadata_cover', 'remove_first_image',
'insert_metadata', 'page_breaks_before', 'insert_metadata', 'page_breaks_before',
'preprocess_html',
] ]
), ),

View File

@ -312,6 +312,14 @@ OptionRecommendation(name='insert_metadata',
) )
), ),
OptionRecommendation(name='preprocess_html',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Attempt to detect and correct hard line breaks and other '
'problems in the source file. This may make things worse, so use '
'with care.'
)
),
OptionRecommendation(name='read_metadata_from_opf', OptionRecommendation(name='read_metadata_from_opf',
recommended_value=None, level=OptionRecommendation.LOW, recommended_value=None, level=OptionRecommendation.LOW,
@ -580,7 +588,8 @@ OptionRecommendation(name='list_recipes',
self.log('Debug input called, aborting the rest of the pipeline.') self.log('Debug input called, aborting the rest of the pipeline.')
return return
if not hasattr(self.oeb, 'manifest'): if not hasattr(self.oeb, 'manifest'):
self.oeb = create_oebbook(self.log, self.oeb, self.opts) self.oeb = create_oebbook(self.log, self.oeb, self.opts,
self.input_plugin)
pr = CompositeProgressReporter(0.34, 0.67, self.ui_reporter) pr = CompositeProgressReporter(0.34, 0.67, self.ui_reporter)
pr(0., _('Running transforms on ebook...')) pr(0., _('Running transforms on ebook...'))
@ -652,12 +661,13 @@ OptionRecommendation(name='list_recipes',
self.opts, self.log) self.opts, self.log)
self.ui_reporter(1.) self.ui_reporter(1.)
def create_oebbook(log, path_or_stream, opts, reader=None): def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None):
''' '''
Create an OEBBook. Create an OEBBook.
''' '''
from calibre.ebooks.oeb.base import OEBBook from calibre.ebooks.oeb.base import OEBBook
html_preprocessor = HTMLPreProcessor() html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html,
opts.preprocess_html)
oeb = OEBBook(log, html_preprocessor=html_preprocessor, oeb = OEBBook(log, html_preprocessor=html_preprocessor,
pretty_print=opts.pretty_print) pretty_print=opts.pretty_print)
# Read OEB Book into OEBBook # Read OEB Book into OEBBook

View File

@ -26,16 +26,16 @@ def sanitize_head(match):
def chap_head(match): def chap_head(match):
chap = match.group('chap') chap = match.group('chap')
title = match.group('title') title = match.group('title')
if not title: if not title:
return '<h1>'+chap+'</h1><br/>\n' return '<h1>'+chap+'</h1><br/>\n'
else: else:
return '<h1>'+chap+'<br/>\n'+title+'</h1><br/>\n' return '<h1>'+chap+'<br/>\n'+title+'</h1><br/>\n'
def wrap_lines(match): def wrap_lines(match):
ital = match.group('ital') ital = match.group('ital')
if not ital: if not ital:
return ' ' return ' '
else: else:
return ital+' ' return ital+' '
def line_length(raw, percent): def line_length(raw, percent):
@ -106,7 +106,7 @@ class HTMLPreProcessor(object):
(re.compile(u'¨\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Ï'), (re.compile(u'¨\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Ï'),
(re.compile(u'¨\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'ä'), (re.compile(u'¨\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'ä'),
(re.compile(u'¨\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Ä'), (re.compile(u'¨\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Ä'),
# Remove page links # Remove page links
(re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''), (re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''),
# Remove <hr> tags # Remove <hr> tags
@ -151,6 +151,9 @@ class HTMLPreProcessor(object):
(re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL), (re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)), lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
] ]
def __init__(self, input_plugin_preprocess, plugin_preprocess):
self.input_plugin_preprocess = input_plugin_preprocess
self.plugin_preprocess = plugin_preprocess
def is_baen(self, src): def is_baen(self, src):
return re.compile(r'<meta\s+name="Publisher"\s+content=".*?Baen.*?"', return re.compile(r'<meta\s+name="Publisher"\s+content=".*?Baen.*?"',
@ -175,7 +178,7 @@ class HTMLPreProcessor(object):
# Un wrap using punctuation # Un wrap using punctuation
(re.compile(r'(?<=.{%i}[a-z,;:-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?[\w\d])' % line_length(html, .4), re.UNICODE), wrap_lines), (re.compile(r'(?<=.{%i}[a-z,;:-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?[\w\d])' % line_length(html, .4), re.UNICODE), wrap_lines),
] ]
rules = self.PDFTOHTML + line_length_rules rules = self.PDFTOHTML + line_length_rules
else: else:
rules = [] rules = []
@ -192,5 +195,8 @@ class HTMLPreProcessor(object):
html = XMLDECL_RE.sub('', html) html = XMLDECL_RE.sub('', html)
if self.plugin_preprocess:
html = self.input_plugin_preprocess(html)
return html return html

View File

@ -288,7 +288,7 @@ class HTMLInput(InputFormatPlugin):
return opfpath return opfpath
from calibre.ebooks.conversion.plumber import create_oebbook from calibre.ebooks.conversion.plumber import create_oebbook
oeb = create_oebbook(log, opfpath, opts) oeb = create_oebbook(log, opfpath, opts, self)
from calibre.ebooks.oeb.transforms.package import Package from calibre.ebooks.oeb.transforms.package import Package
Package(os.getcwdu())(oeb, opts) Package(os.getcwdu())(oeb, opts)

View File

@ -19,6 +19,6 @@ class LITInput(InputFormatPlugin):
accelerators): accelerators):
from calibre.ebooks.lit.reader import LitReader from calibre.ebooks.lit.reader import LitReader
from calibre.ebooks.conversion.plumber import create_oebbook from calibre.ebooks.conversion.plumber import create_oebbook
return create_oebbook(log, stream, options, reader=LitReader) return create_oebbook(log, stream, options, self, reader=LitReader)

View File

@ -1506,7 +1506,7 @@ class OEBBook(object):
COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]') COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]')
def __init__(self, logger, def __init__(self, logger,
html_preprocessor=HTMLPreProcessor(), html_preprocessor,
css_preprocessor=CSSPreProcessor(), css_preprocessor=CSSPreProcessor(),
encoding='utf-8', pretty_print=False): encoding='utf-8', pretty_print=False):
"""Create empty book. Arguments: """Create empty book. Arguments: