mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Add a --preprocess-html option
This commit is contained in:
parent
8be2541738
commit
dc0e0f26a1
@ -149,6 +149,18 @@ class InputFormatPlugin(Plugin):
|
||||
'''
|
||||
raise NotImplementedError()
|
||||
|
||||
def preprocess_html(self, html):
|
||||
'''
|
||||
This method is called by the conversion pipeline on all HTML before it
|
||||
is parsed. It is meant to be used to do any required preprocessing on
|
||||
the HTML, like removing hard line breaks, etc.
|
||||
|
||||
:param html: A unicode string
|
||||
:return: A unicode string
|
||||
'''
|
||||
return html
|
||||
|
||||
|
||||
def convert(self, stream, options, file_ext, log, accelerators):
|
||||
'''
|
||||
This method must be implemented in sub-classes. It must return
|
||||
|
@ -129,6 +129,7 @@ def add_pipeline_options(parser, plumber):
|
||||
'dont_split_on_page_breaks', 'chapter', 'chapter_mark',
|
||||
'prefer_metadata_cover', 'remove_first_image',
|
||||
'insert_metadata', 'page_breaks_before',
|
||||
'preprocess_html',
|
||||
]
|
||||
),
|
||||
|
||||
|
@ -312,6 +312,14 @@ OptionRecommendation(name='insert_metadata',
|
||||
)
|
||||
),
|
||||
|
||||
OptionRecommendation(name='preprocess_html',
|
||||
recommended_value=False, level=OptionRecommendation.LOW,
|
||||
help=_('Attempt to detect and correct hard line breaks and other '
|
||||
'problems in the source file. This may make things worse, so use '
|
||||
'with care.'
|
||||
)
|
||||
),
|
||||
|
||||
|
||||
OptionRecommendation(name='read_metadata_from_opf',
|
||||
recommended_value=None, level=OptionRecommendation.LOW,
|
||||
@ -580,7 +588,8 @@ OptionRecommendation(name='list_recipes',
|
||||
self.log('Debug input called, aborting the rest of the pipeline.')
|
||||
return
|
||||
if not hasattr(self.oeb, 'manifest'):
|
||||
self.oeb = create_oebbook(self.log, self.oeb, self.opts)
|
||||
self.oeb = create_oebbook(self.log, self.oeb, self.opts,
|
||||
self.input_plugin)
|
||||
pr = CompositeProgressReporter(0.34, 0.67, self.ui_reporter)
|
||||
pr(0., _('Running transforms on ebook...'))
|
||||
|
||||
@ -652,12 +661,13 @@ OptionRecommendation(name='list_recipes',
|
||||
self.opts, self.log)
|
||||
self.ui_reporter(1.)
|
||||
|
||||
def create_oebbook(log, path_or_stream, opts, reader=None):
|
||||
def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None):
|
||||
'''
|
||||
Create an OEBBook.
|
||||
'''
|
||||
from calibre.ebooks.oeb.base import OEBBook
|
||||
html_preprocessor = HTMLPreProcessor()
|
||||
html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html,
|
||||
opts.preprocess_html)
|
||||
oeb = OEBBook(log, html_preprocessor=html_preprocessor,
|
||||
pretty_print=opts.pretty_print)
|
||||
# Read OEB Book into OEBBook
|
||||
|
@ -26,16 +26,16 @@ def sanitize_head(match):
|
||||
def chap_head(match):
|
||||
chap = match.group('chap')
|
||||
title = match.group('title')
|
||||
if not title:
|
||||
if not title:
|
||||
return '<h1>'+chap+'</h1><br/>\n'
|
||||
else:
|
||||
else:
|
||||
return '<h1>'+chap+'<br/>\n'+title+'</h1><br/>\n'
|
||||
|
||||
def wrap_lines(match):
|
||||
ital = match.group('ital')
|
||||
if not ital:
|
||||
if not ital:
|
||||
return ' '
|
||||
else:
|
||||
else:
|
||||
return ital+' '
|
||||
|
||||
def line_length(raw, percent):
|
||||
@ -106,7 +106,7 @@ class HTMLPreProcessor(object):
|
||||
(re.compile(u'¨\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Ï'),
|
||||
(re.compile(u'¨\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'ä'),
|
||||
(re.compile(u'¨\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Ä'),
|
||||
|
||||
|
||||
# Remove page links
|
||||
(re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''),
|
||||
# Remove <hr> tags
|
||||
@ -151,6 +151,9 @@ class HTMLPreProcessor(object):
|
||||
(re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
|
||||
lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
|
||||
]
|
||||
def __init__(self, input_plugin_preprocess, plugin_preprocess):
|
||||
self.input_plugin_preprocess = input_plugin_preprocess
|
||||
self.plugin_preprocess = plugin_preprocess
|
||||
|
||||
def is_baen(self, src):
|
||||
return re.compile(r'<meta\s+name="Publisher"\s+content=".*?Baen.*?"',
|
||||
@ -175,7 +178,7 @@ class HTMLPreProcessor(object):
|
||||
# Un wrap using punctuation
|
||||
(re.compile(r'(?<=.{%i}[a-z,;:-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?[\w\d])' % line_length(html, .4), re.UNICODE), wrap_lines),
|
||||
]
|
||||
|
||||
|
||||
rules = self.PDFTOHTML + line_length_rules
|
||||
else:
|
||||
rules = []
|
||||
@ -192,5 +195,8 @@ class HTMLPreProcessor(object):
|
||||
|
||||
html = XMLDECL_RE.sub('', html)
|
||||
|
||||
if self.plugin_preprocess:
|
||||
html = self.input_plugin_preprocess(html)
|
||||
|
||||
return html
|
||||
|
||||
|
@ -288,7 +288,7 @@ class HTMLInput(InputFormatPlugin):
|
||||
return opfpath
|
||||
|
||||
from calibre.ebooks.conversion.plumber import create_oebbook
|
||||
oeb = create_oebbook(log, opfpath, opts)
|
||||
oeb = create_oebbook(log, opfpath, opts, self)
|
||||
|
||||
from calibre.ebooks.oeb.transforms.package import Package
|
||||
Package(os.getcwdu())(oeb, opts)
|
||||
|
@ -19,6 +19,6 @@ class LITInput(InputFormatPlugin):
|
||||
accelerators):
|
||||
from calibre.ebooks.lit.reader import LitReader
|
||||
from calibre.ebooks.conversion.plumber import create_oebbook
|
||||
return create_oebbook(log, stream, options, reader=LitReader)
|
||||
return create_oebbook(log, stream, options, self, reader=LitReader)
|
||||
|
||||
|
||||
|
@ -1506,7 +1506,7 @@ class OEBBook(object):
|
||||
COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]')
|
||||
|
||||
def __init__(self, logger,
|
||||
html_preprocessor=HTMLPreProcessor(),
|
||||
html_preprocessor,
|
||||
css_preprocessor=CSSPreProcessor(),
|
||||
encoding='utf-8', pretty_print=False):
|
||||
"""Create empty book. Arguments:
|
||||
|
Loading…
x
Reference in New Issue
Block a user