From 235b7e38c197ba4a3c17531e516610af8795e348 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 23 Nov 2021 21:26:18 +0530 Subject: [PATCH] Fix inefficient regex that slows down a lot with certain input. Fixes #1951979 [Private bug](https://bugs.launchpad.net/calibre/+bug/1951979) --- src/calibre/ebooks/conversion/preprocess.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index dd1322c8a0..ba5ee0da7f 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -46,7 +46,7 @@ _ligpat = re.compile('|'.join(LIGATURES)) def sanitize_head(match): - x = match.group(1) + x = match.group(1).strip() x = _span_pat.sub('', x) return '<head>\n%s\n</head>' % x @@ -380,8 +380,7 @@ def html_preprocess_rules(): (re.compile(r'\s{10000,}'), ''), # Some idiotic HTML generators (Frontpage I'm looking at you) # Put all sorts of crap into <head>. This messes up lxml - (re.compile(r'<head[^>]*>\n*(.*?)\n*</head>', re.IGNORECASE|re.DOTALL), - sanitize_head), + (re.compile(r'<head[^>]*>(.*?)</head>', re.IGNORECASE|re.DOTALL), sanitize_head), # Convert all entities, since lxml doesn't handle them well (re.compile(r'&(\S+?);'), convert_entities), # Remove the