From 59d9e1558004c53be8fc31b2f1838b1389587d91 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 18 Jul 2011 10:53:31 -0600 Subject: [PATCH] Conversion pipeline: Strip out large blocks of contiguous space (more than 10000 contiguous blanks) as these slow down the conversion process and are almost always indicative of an error in the input document. --- src/calibre/ebooks/conversion/preprocess.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 885d0621e0..751d4f8cd6 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -303,6 +303,9 @@ class CSSPreProcessor(object): class HTMLPreProcessor(object): PREPROCESS = [ + # Remove huge block of contiguous spaces as they slow down + # the following regexes pretty badly + (re.compile(r'\s{10000,}'), lambda m: ''), # Some idiotic HTML generators (Frontpage I'm looking at you) # Put all sorts of crap into <head>. This messes up lxml (re.compile(r'<head[^>]*>\n*(.*?)\n*</head>', re.IGNORECASE|re.DOTALL),