Fix an inefficient regex that becomes extremely slow on certain input. Fixes #1951979 [Private bug](https://bugs.launchpad.net/calibre/+bug/1951979)

This commit is contained in:
Kovid Goyal 2021-11-23 21:26:18 +05:30
parent 1e12201376
commit 235b7e38c1
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@@ -46,7 +46,7 @@ _ligpat = re.compile('|'.join(LIGATURES))
 def sanitize_head(match):
-    x = match.group(1)
+    x = match.group(1).strip()
     x = _span_pat.sub('', x)
     return '<head>\n%s\n</head>' % x
@@ -380,8 +380,7 @@ def html_preprocess_rules():
(re.compile(r'\s{10000,}'), ''),
# Some idiotic HTML generators (Frontpage I'm looking at you)
# Put all sorts of crap into <head>. This messes up lxml
-(re.compile(r'<head[^>]*>\n*(.*?)\n*</head>', re.IGNORECASE|re.DOTALL),
-sanitize_head),
+(re.compile(r'<head[^>]*>(.*?)</head>', re.IGNORECASE|re.DOTALL), sanitize_head),
# Convert all entities, since lxml doesn't handle them well
(re.compile(r'&(\S+?);'), convert_entities),
# Remove the <![if/endif tags inserted by everybody's darling, MS Word