EPUB output: handle <span> tags inside <head> tags. Also correctly handle <img> tags inside <a> tags when the image that is being linked to is missing.

2025-07-07 10:14:46 -04:00 · 2008-12-31 13:27:07 -08:00 · 2008-12-31 13:27:07 -08:00 · a38b1bf0a6
commit a38b1bf0a6
parent e14b37bad2
2 changed files with 14 additions and 2 deletions
--- a/src/calibre/ebooks/epub/from_html.py
+++ b/src/calibre/ebooks/epub/from_html.py
@ -55,7 +55,7 @@ content = functools.partial(os.path.join, u'content')

 def remove_bad_link(element, attribute, link, pos):
    if attribute is not None:
-        if element.tag in ['link', 'img']:
+        if element.tag in ['link']:
            element.getparent().remove(element)
        else:
            element.set(attribute, '')
--- a/src/calibre/ebooks/html.py
+++ b/src/calibre/ebooks/html.py
@ -314,10 +314,22 @@ def opf_traverse(opf_reader, verbose=0, encoding=None):
            

 convert_entities = functools.partial(entity_to_unicode, exceptions=['quot', 'apos', 'lt', 'gt', 'amp'])
+_span_pat = re.compile('<span.*?</span>', re.DOTALL|re.IGNORECASE)
+
+def sanitize_head(match):
+    x = match.group(1)
+    x = _span_pat.sub('', x)
+    return '<head>\n'+x+'\n</head>'
+    
 class PreProcessor(object):
    PREPROCESS = [
+                  # Some idiotic HTML generators (Frontpage I'm looking at you)
+                  # Put all sorts of crap into <head>. This messes up lxml
+                  (re.compile(r'<head[^>]*>(.*?)</head>', re.IGNORECASE|re.DOTALL), 
+                   sanitize_head),
                  # Convert all entities, since lxml doesn't handle them well
                  (re.compile(r'&(\S+?);'), convert_entities),
+                  
                  ]
                     
    # Fix pdftohtml markup
@ -875,7 +887,7 @@ def option_parser():
 %prog [options] file.html|opf

 Follow all links in an HTML file and collect them into the specified directory.
-Also collects any references resources like images, stylesheets, scripts, etc. 
+Also collects any resources like images, stylesheets, scripts, etc. 
 If an OPF file is specified instead, the list of files in its <spine> element
 is used.
 '''))