mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Add a --preprocess-html option
This commit is contained in:
parent
8be2541738
commit
dc0e0f26a1
@ -149,6 +149,18 @@ class InputFormatPlugin(Plugin):
|
|||||||
'''
|
'''
|
||||||
raise NotImplementedError()
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
def preprocess_html(self, html):
|
||||||
|
'''
|
||||||
|
This method is called by the conversion pipeline on all HTML before it
|
||||||
|
is parsed. It is meant to be used to do any required preprocessing on
|
||||||
|
the HTML, like removing hard line breaks, etc.
|
||||||
|
|
||||||
|
:param html: A unicode string
|
||||||
|
:return: A unicode string
|
||||||
|
'''
|
||||||
|
return html
|
||||||
|
|
||||||
|
|
||||||
def convert(self, stream, options, file_ext, log, accelerators):
|
def convert(self, stream, options, file_ext, log, accelerators):
|
||||||
'''
|
'''
|
||||||
This method must be implemented in sub-classes. It must return
|
This method must be implemented in sub-classes. It must return
|
||||||
|
@ -129,6 +129,7 @@ def add_pipeline_options(parser, plumber):
|
|||||||
'dont_split_on_page_breaks', 'chapter', 'chapter_mark',
|
'dont_split_on_page_breaks', 'chapter', 'chapter_mark',
|
||||||
'prefer_metadata_cover', 'remove_first_image',
|
'prefer_metadata_cover', 'remove_first_image',
|
||||||
'insert_metadata', 'page_breaks_before',
|
'insert_metadata', 'page_breaks_before',
|
||||||
|
'preprocess_html',
|
||||||
]
|
]
|
||||||
),
|
),
|
||||||
|
|
||||||
|
@ -312,6 +312,14 @@ OptionRecommendation(name='insert_metadata',
|
|||||||
)
|
)
|
||||||
),
|
),
|
||||||
|
|
||||||
|
OptionRecommendation(name='preprocess_html',
|
||||||
|
recommended_value=False, level=OptionRecommendation.LOW,
|
||||||
|
help=_('Attempt to detect and correct hard line breaks and other '
|
||||||
|
'problems in the source file. This may make things worse, so use '
|
||||||
|
'with care.'
|
||||||
|
)
|
||||||
|
),
|
||||||
|
|
||||||
|
|
||||||
OptionRecommendation(name='read_metadata_from_opf',
|
OptionRecommendation(name='read_metadata_from_opf',
|
||||||
recommended_value=None, level=OptionRecommendation.LOW,
|
recommended_value=None, level=OptionRecommendation.LOW,
|
||||||
@ -580,7 +588,8 @@ OptionRecommendation(name='list_recipes',
|
|||||||
self.log('Debug input called, aborting the rest of the pipeline.')
|
self.log('Debug input called, aborting the rest of the pipeline.')
|
||||||
return
|
return
|
||||||
if not hasattr(self.oeb, 'manifest'):
|
if not hasattr(self.oeb, 'manifest'):
|
||||||
self.oeb = create_oebbook(self.log, self.oeb, self.opts)
|
self.oeb = create_oebbook(self.log, self.oeb, self.opts,
|
||||||
|
self.input_plugin)
|
||||||
pr = CompositeProgressReporter(0.34, 0.67, self.ui_reporter)
|
pr = CompositeProgressReporter(0.34, 0.67, self.ui_reporter)
|
||||||
pr(0., _('Running transforms on ebook...'))
|
pr(0., _('Running transforms on ebook...'))
|
||||||
|
|
||||||
@ -652,12 +661,13 @@ OptionRecommendation(name='list_recipes',
|
|||||||
self.opts, self.log)
|
self.opts, self.log)
|
||||||
self.ui_reporter(1.)
|
self.ui_reporter(1.)
|
||||||
|
|
||||||
def create_oebbook(log, path_or_stream, opts, reader=None):
|
def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None):
|
||||||
'''
|
'''
|
||||||
Create an OEBBook.
|
Create an OEBBook.
|
||||||
'''
|
'''
|
||||||
from calibre.ebooks.oeb.base import OEBBook
|
from calibre.ebooks.oeb.base import OEBBook
|
||||||
html_preprocessor = HTMLPreProcessor()
|
html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html,
|
||||||
|
opts.preprocess_html)
|
||||||
oeb = OEBBook(log, html_preprocessor=html_preprocessor,
|
oeb = OEBBook(log, html_preprocessor=html_preprocessor,
|
||||||
pretty_print=opts.pretty_print)
|
pretty_print=opts.pretty_print)
|
||||||
# Read OEB Book into OEBBook
|
# Read OEB Book into OEBBook
|
||||||
|
@ -26,16 +26,16 @@ def sanitize_head(match):
|
|||||||
def chap_head(match):
|
def chap_head(match):
|
||||||
chap = match.group('chap')
|
chap = match.group('chap')
|
||||||
title = match.group('title')
|
title = match.group('title')
|
||||||
if not title:
|
if not title:
|
||||||
return '<h1>'+chap+'</h1><br/>\n'
|
return '<h1>'+chap+'</h1><br/>\n'
|
||||||
else:
|
else:
|
||||||
return '<h1>'+chap+'<br/>\n'+title+'</h1><br/>\n'
|
return '<h1>'+chap+'<br/>\n'+title+'</h1><br/>\n'
|
||||||
|
|
||||||
def wrap_lines(match):
|
def wrap_lines(match):
|
||||||
ital = match.group('ital')
|
ital = match.group('ital')
|
||||||
if not ital:
|
if not ital:
|
||||||
return ' '
|
return ' '
|
||||||
else:
|
else:
|
||||||
return ital+' '
|
return ital+' '
|
||||||
|
|
||||||
def line_length(raw, percent):
|
def line_length(raw, percent):
|
||||||
@ -106,7 +106,7 @@ class HTMLPreProcessor(object):
|
|||||||
(re.compile(u'¨\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Ï'),
|
(re.compile(u'¨\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Ï'),
|
||||||
(re.compile(u'¨\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'ä'),
|
(re.compile(u'¨\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'ä'),
|
||||||
(re.compile(u'¨\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Ä'),
|
(re.compile(u'¨\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Ä'),
|
||||||
|
|
||||||
# Remove page links
|
# Remove page links
|
||||||
(re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''),
|
(re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''),
|
||||||
# Remove <hr> tags
|
# Remove <hr> tags
|
||||||
@ -151,6 +151,9 @@ class HTMLPreProcessor(object):
|
|||||||
(re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
|
(re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
|
||||||
lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
|
lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
|
||||||
]
|
]
|
||||||
|
def __init__(self, input_plugin_preprocess, plugin_preprocess):
|
||||||
|
self.input_plugin_preprocess = input_plugin_preprocess
|
||||||
|
self.plugin_preprocess = plugin_preprocess
|
||||||
|
|
||||||
def is_baen(self, src):
|
def is_baen(self, src):
|
||||||
return re.compile(r'<meta\s+name="Publisher"\s+content=".*?Baen.*?"',
|
return re.compile(r'<meta\s+name="Publisher"\s+content=".*?Baen.*?"',
|
||||||
@ -175,7 +178,7 @@ class HTMLPreProcessor(object):
|
|||||||
# Un wrap using punctuation
|
# Un wrap using punctuation
|
||||||
(re.compile(r'(?<=.{%i}[a-z,;:-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?[\w\d])' % line_length(html, .4), re.UNICODE), wrap_lines),
|
(re.compile(r'(?<=.{%i}[a-z,;:-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?[\w\d])' % line_length(html, .4), re.UNICODE), wrap_lines),
|
||||||
]
|
]
|
||||||
|
|
||||||
rules = self.PDFTOHTML + line_length_rules
|
rules = self.PDFTOHTML + line_length_rules
|
||||||
else:
|
else:
|
||||||
rules = []
|
rules = []
|
||||||
@ -192,5 +195,8 @@ class HTMLPreProcessor(object):
|
|||||||
|
|
||||||
html = XMLDECL_RE.sub('', html)
|
html = XMLDECL_RE.sub('', html)
|
||||||
|
|
||||||
|
if self.plugin_preprocess:
|
||||||
|
html = self.input_plugin_preprocess(html)
|
||||||
|
|
||||||
return html
|
return html
|
||||||
|
|
||||||
|
@ -288,7 +288,7 @@ class HTMLInput(InputFormatPlugin):
|
|||||||
return opfpath
|
return opfpath
|
||||||
|
|
||||||
from calibre.ebooks.conversion.plumber import create_oebbook
|
from calibre.ebooks.conversion.plumber import create_oebbook
|
||||||
oeb = create_oebbook(log, opfpath, opts)
|
oeb = create_oebbook(log, opfpath, opts, self)
|
||||||
|
|
||||||
from calibre.ebooks.oeb.transforms.package import Package
|
from calibre.ebooks.oeb.transforms.package import Package
|
||||||
Package(os.getcwdu())(oeb, opts)
|
Package(os.getcwdu())(oeb, opts)
|
||||||
|
@ -19,6 +19,6 @@ class LITInput(InputFormatPlugin):
|
|||||||
accelerators):
|
accelerators):
|
||||||
from calibre.ebooks.lit.reader import LitReader
|
from calibre.ebooks.lit.reader import LitReader
|
||||||
from calibre.ebooks.conversion.plumber import create_oebbook
|
from calibre.ebooks.conversion.plumber import create_oebbook
|
||||||
return create_oebbook(log, stream, options, reader=LitReader)
|
return create_oebbook(log, stream, options, self, reader=LitReader)
|
||||||
|
|
||||||
|
|
||||||
|
@ -1506,7 +1506,7 @@ class OEBBook(object):
|
|||||||
COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]')
|
COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]')
|
||||||
|
|
||||||
def __init__(self, logger,
|
def __init__(self, logger,
|
||||||
html_preprocessor=HTMLPreProcessor(),
|
html_preprocessor,
|
||||||
css_preprocessor=CSSPreProcessor(),
|
css_preprocessor=CSSPreProcessor(),
|
||||||
encoding='utf-8', pretty_print=False):
|
encoding='utf-8', pretty_print=False):
|
||||||
"""Create empty book. Arguments:
|
"""Create empty book. Arguments:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user