From 2014e6520e3b9b18a7e7a733deab6ceba9096072 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 18 Oct 2019 20:29:28 +0530 Subject: [PATCH] ... --- src/calibre/ebooks/conversion/preprocess.py | 28 +++++++++++---------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 061d8ae2b1..4cc3521c1d 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -347,19 +347,19 @@ class CSSPreProcessor(object): class HTMLPreProcessor(object): PREPROCESS = [ - # Remove huge block of contiguous spaces as they slow down - # the following regexes pretty badly - (re.compile(r'\s{10000,}'), lambda m: ''), - # Some idiotic HTML generators (Frontpage I'm looking at you) - # Put all sorts of crap into . This messes up lxml - (re.compile(r']*>\n*(.*?)\n*', re.IGNORECASE|re.DOTALL), - sanitize_head), - # Convert all entities, since lxml doesn't handle them well - (re.compile(r'&(\S+?);'), convert_entities), - # Remove the ', re.IGNORECASE), - lambda match: ''), - ] + # Remove huge block of contiguous spaces as they slow down + # the following regexes pretty badly + (re.compile(r'\s{10000,}'), lambda m: ''), + # Some idiotic HTML generators (Frontpage I'm looking at you) + # Put all sorts of crap into . This messes up lxml + (re.compile(r']*>\n*(.*?)\n*', re.IGNORECASE|re.DOTALL), + sanitize_head), + # Convert all entities, since lxml doesn't handle them well + (re.compile(r'&(\S+?);'), convert_entities), + # Remove the ', re.IGNORECASE), + lambda match: ''), + ] # Fix pdftohtml markup PDFTOHTML = [ @@ -636,7 +636,9 @@ class HTMLPreProcessor(object): for rule in rules + end_rules: try: + print(rule[0].pattern) html = rule[0].sub(rule[1], html) + print(222222222222) except Exception as e: if rule in user_sr_rules: self.log.error(