mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-07 10:14:46 -04:00
EPUB output: handle <span> tags inside <head> tags. Also correctly handle <img> tags inside <a> tags when the image that is being linked to is missing.
This commit is contained in:
parent
e14b37bad2
commit
a38b1bf0a6
@ -55,7 +55,7 @@ content = functools.partial(os.path.join, u'content')
|
||||
|
||||
def remove_bad_link(element, attribute, link, pos):
|
||||
if attribute is not None:
|
||||
if element.tag in ['link', 'img']:
|
||||
if element.tag in ['link']:
|
||||
element.getparent().remove(element)
|
||||
else:
|
||||
element.set(attribute, '')
|
||||
|
@ -314,10 +314,22 @@ def opf_traverse(opf_reader, verbose=0, encoding=None):
|
||||
|
||||
|
||||
convert_entities = functools.partial(entity_to_unicode, exceptions=['quot', 'apos', 'lt', 'gt', 'amp'])
|
||||
# Matches one <span>...</span> element, case-insensitively and across line
# breaks. The match is non-greedy, so for nested spans it ends at the first
# closing tag it meets.
_span_pat = re.compile(r'<span.*?</span>', re.DOTALL|re.IGNORECASE)


def sanitize_head(match):
    '''
    Rebuild a ``<head>`` element with all ``<span>`` elements stripped out.

    Used as the replacement paired with the ``<head>`` pattern in
    ``PreProcessor.PREPROCESS``.

    :param match: Regular expression match object whose group 1 holds the
                  markup found between the opening and closing head tags.
    :return: The string ``<head>``, a newline, the group 1 markup with every
             ``<span>...</span>`` removed, a newline and ``</head>``. Only
             group 1 is reused, so the opening tag is always emitted as a
             bare ``<head>``.
    '''
    x = match.group(1)
    # Drop each span together with its contents
    x = _span_pat.sub('', x)
    return '<head>\n'+x+'\n</head>'
|
||||
|
||||
class PreProcessor(object):
|
||||
PREPROCESS = [
|
||||
# Some idiotic HTML generators (Frontpage I'm looking at you)
|
||||
# Put all sorts of crap into <head>. This messes up lxml
|
||||
(re.compile(r'<head[^>]*>(.*?)</head>', re.IGNORECASE|re.DOTALL),
|
||||
sanitize_head),
|
||||
# Convert all entities, since lxml doesn't handle them well
|
||||
(re.compile(r'&(\S+?);'), convert_entities),
|
||||
|
||||
]
|
||||
|
||||
# Fix pdftohtml markup
|
||||
@ -875,7 +887,7 @@ def option_parser():
|
||||
%prog [options] file.html|opf
|
||||
|
||||
Follow all links in an HTML file and collect them into the specified directory.
|
||||
Also collects any references resources like images, stylesheets, scripts, etc.
|
||||
Also collects any resources like images, stylesheets, scripts, etc.
|
||||
If an OPF file is specified instead, the list of files in its <spine> element
|
||||
is used.
|
||||
'''))
|
||||
|
Loading…
x
Reference in New Issue
Block a user