From a38b1bf0a6e0623f9b334938558456495c266ece Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 31 Dec 2008 13:27:07 -0800 Subject: [PATCH] EPUB output: handle <span> tags inside <head> tags. Also correctly handle <img> tags inside <a> tags when the image that is being linked to is missing. --- src/calibre/ebooks/epub/from_html.py | 2 +- src/calibre/ebooks/html.py | 14 +++++++++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/epub/from_html.py b/src/calibre/ebooks/epub/from_html.py index a54697a214..1832d75ab3 100644 --- a/src/calibre/ebooks/epub/from_html.py +++ b/src/calibre/ebooks/epub/from_html.py @@ -55,7 +55,7 @@ content = functools.partial(os.path.join, u'content') def remove_bad_link(element, attribute, link, pos): if attribute is not None: - if element.tag in ['link', 'img']: + if element.tag in ['link']: element.getparent().remove(element) else: element.set(attribute, '') diff --git a/src/calibre/ebooks/html.py b/src/calibre/ebooks/html.py index 4b384a6b18..0fc9c10e25 100644 --- a/src/calibre/ebooks/html.py +++ b/src/calibre/ebooks/html.py @@ -314,10 +314,22 @@ def opf_traverse(opf_reader, verbose=0, encoding=None): convert_entities = functools.partial(entity_to_unicode, exceptions=['quot', 'apos', 'lt', 'gt', 'amp']) +_span_pat = re.compile('<span.*?</span>', re.DOTALL|re.IGNORECASE) + +def sanitize_head(match): + x = match.group(1) + x = _span_pat.sub('', x) + return '<head>\n'+x+'\n</head>' + class PreProcessor(object): PREPROCESS = [ + # Some idiotic HTML generators (Frontpage I'm looking at you) + # Put all sorts of crap into <head>. This messes up lxml + (re.compile(r'<head[^>]*>(.*?)</head>', re.IGNORECASE|re.DOTALL), + sanitize_head), # Convert all entities, since lxml doesn't handle them well (re.compile(r'&(\S+?);'), convert_entities), + ] # Fix pdftohtml markup @@ -875,7 +887,7 @@ def option_parser(): %prog [options] file.html|opf Follow all links in an HTML file and collect them into the specified directory. 
-Also collects any references resources like images, stylesheets, scripts, etc. +Also collects any resources like images, stylesheets, scripts, etc. If an OPF file is specified instead, the list of files in its <spine> element is used. '''))