diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index b3eb102fc6..c597e0021d 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -372,12 +372,13 @@ def accent_regex(accent_maps, letter_before=False):
     return pat, sub
 
 
-class HTMLPreProcessor(object):
-
-    PREPROCESS = [
+def html_preprocess_rules():
+    ans = getattr(html_preprocess_rules, 'ans', None)
+    if ans is None:
+        ans = html_preprocess_rules.ans = [
                   # Remove huge block of contiguous spaces as they slow down
                   # the following regexes pretty badly
-                  (re.compile(r'\s{10000,}'), lambda m: ''),
+                  (re.compile(r'\s{10000,}'), ''),
                   # Some idiotic HTML generators (Frontpage I'm looking at you)
                   # Put all sorts of crap into <head>. This messes up lxml
                   (re.compile(r'<head[^>]*>\n*(.*?)\n*</head>', re.IGNORECASE|re.DOTALL),
@@ -385,12 +386,15 @@ class HTMLPreProcessor(object):
                   # Convert all entities, since lxml doesn't handle them well
                   (re.compile(r'&(\S+?);'), convert_entities),
                   # Remove the <![if/endif tags inserted by everybody's darling, MS Word
-                  (re.compile(r'</{0,1}!\[(end){0,1}if\]{0,1}>', re.IGNORECASE),
-                   lambda match: ''),
+                  (re.compile(r'</{0,1}!\[(end){0,1}if\]{0,1}>', re.IGNORECASE), ''),
                  ]
+    return ans
 
-    # Fix pdftohtml markup
-    PDFTOHTML = [
+
+def pdftohtml_rules():
+    ans = getattr(pdftohtml_rules, 'ans', None)
+    if ans is None:
+        ans = pdftohtml_rules.ans = [
                   accent_regex({
                       '¨': 'aAeEiIoOuU:äÄëËïÏöÖüÜ',
                       '`': 'aAeEiIoOuU:àÀèÈìÌòÒùÙ',
@@ -428,9 +432,13 @@ class HTMLPreProcessor(object):
                   (re.compile(r'(?<!“)<i>'), ' <i>'),
                   (re.compile(r'</i>(?=\w)'), '</i> '),
                  ]
+    return ans
 
-    # Fix Book Designer markup
-    BOOK_DESIGNER = [
+
+def book_designer_rules():
+    ans = getattr(book_designer_rules, 'ans', None)
+    if ans is None:
+        ans = book_designer_rules.ans = [
                      # HR
                      (re.compile('<hr>', re.IGNORECASE),
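The change above replaces class-level rule lists, which compiled every regex at import time, with module-level functions that build their list on first call and memoize it as an attribute on the function object, so the re.compile() cost is paid only when a conversion actually uses the rules. A minimal sketch of that lazy-caching idiom follows; demo_rules() and its two patterns are made up for illustration and are not calibre's real rules.

import re


def demo_rules():
    # Build the (pattern, replacement) list only on the first call, then
    # memoize it as an attribute on the function object itself; subsequent
    # calls return the cached list without recompiling any regexes.
    ans = getattr(demo_rules, 'ans', None)
    if ans is None:
        ans = demo_rules.ans = [
            # Illustrative rules only, not calibre's actual ones
            (re.compile(r'\s{2,}'), ' '),
            (re.compile(r'<br\s*/?>', re.IGNORECASE), '\n'),
        ]
    return ans


if __name__ == '__main__':
    rules = demo_rules()
    text = 'Hello   <BR/>world'
    for pat, repl in rules:
        text = pat.sub(repl, text)
    print(repr(text))              # 'Hello \nworld'
    print(demo_rules() is rules)   # True: the list is built exactly once

Caching on the function object itself, rather than in a module-level global, keeps the cache next to the code that owns it and leaves nothing expensive on the module's import path.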