mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Conversion pipeline: Strip out large blocks of contiguous whitespace (runs of 10000 or more whitespace characters), as these slow down the conversion process and are almost always indicative of an error in the input document.
This commit is contained in:
parent
08f5775f65
commit
59d9e15580
@ -303,6 +303,9 @@ class CSSPreProcessor(object):
|
||||
class HTMLPreProcessor(object):
|
||||
|
||||
PREPROCESS = [
|
||||
# Remove huge block of contiguous spaces as they slow down
|
||||
# the following regexes pretty badly
|
||||
(re.compile(r'\s{10000,}'), lambda m: ''),
|
||||
# Some idiotic HTML generators (Frontpage I'm looking at you)
|
||||
# Put all sorts of crap into <head>. This messes up lxml
|
||||
(re.compile(r'<head[^>]*>\n*(.*?)\n*</head>', re.IGNORECASE|re.DOTALL),
|
||||
|
Loading…
x
Reference in New Issue
Block a user