diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index dd1322c8a0..ba5ee0da7f 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -46,7 +46,7 @@ _ligpat = re.compile('|'.join(LIGATURES))
 
 
 def sanitize_head(match):
-    x = match.group(1)
+    x = match.group(1).strip()
     x = _span_pat.sub('', x)
     return '<head>\n%s\n</head>' % x
 
@@ -380,8 +380,7 @@ def html_preprocess_rules():
         (re.compile(r'\s{10000,}'), ''),
         # Some idiotic HTML generators (Frontpage I'm looking at you)
         # Put all sorts of crap into <head>. This messes up lxml
-        (re.compile(r'<head[^>]*>\n*(.*?)\n*</head>', re.IGNORECASE|re.DOTALL),
-         sanitize_head),
+        (re.compile(r'<head[^>]*>(.*?)</head>', re.IGNORECASE|re.DOTALL), sanitize_head),
         # Convert all entities, since lxml doesn't handle them well
         (re.compile(r'&(\S+?);'), convert_entities),
         # Remove the