diff --git a/src/calibre/ebooks/epub/from_html.py b/src/calibre/ebooks/epub/from_html.py
index a54697a214..1832d75ab3 100644
--- a/src/calibre/ebooks/epub/from_html.py
+++ b/src/calibre/ebooks/epub/from_html.py
@@ -55,7 +55,7 @@ content = functools.partial(os.path.join, u'content')
def remove_bad_link(element, attribute, link, pos):
if attribute is not None:
- if element.tag in ['link', 'img']:
+ if element.tag in ['link']:
element.getparent().remove(element)
else:
element.set(attribute, '')
diff --git a/src/calibre/ebooks/html.py b/src/calibre/ebooks/html.py
index 4b384a6b18..0fc9c10e25 100644
--- a/src/calibre/ebooks/html.py
+++ b/src/calibre/ebooks/html.py
@@ -314,10 +314,22 @@ def opf_traverse(opf_reader, verbose=0, encoding=None):
convert_entities = functools.partial(entity_to_unicode, exceptions=['quot', 'apos', 'lt', 'gt', 'amp'])
+_span_pat = re.compile('<span.*?</span>', re.DOTALL|re.IGNORECASE)  # one whole <span>...</span> run, newlines included
+
+def sanitize_head(match):  # re.sub() callback: rebuild <head> with all <span> runs removed
+    x = match.group(1)  # text captured between the opening and closing head tags
+    x = _span_pat.sub('', x)
+    return '<head>\n'+x+'\n</head>'  # re-wrap in a bare <head>; attributes of the original tag are dropped
+
class PreProcessor(object):
PREPROCESS = [
+        # Some idiotic HTML generators (Frontpage I'm looking at you)
+        # Put all sorts of crap into <head>. This messes up lxml
+        (re.compile(r'<head[^>]*>(.*?)</head>', re.IGNORECASE|re.DOTALL),
+         sanitize_head),
# Convert all entities, since lxml doesn't handle them well
(re.compile(r'&(\S+?);'), convert_entities),
+
]
# Fix pdftohtml markup
@@ -875,7 +887,7 @@ def option_parser():
%prog [options] file.html|opf
Follow all links in an HTML file and collect them into the specified directory.
-Also collects any references resources like images, stylesheets, scripts, etc.
+Also collects any resources like images, stylesheets, scripts, etc.
If an OPF file is specified instead, the list of files in its <spine> element
is used.
'''))