This commit is contained in:
Kovid Goyal 2019-10-18 20:29:28 +05:30
parent 1b0efe04d9
commit 2014e6520e
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -347,19 +347,19 @@ class CSSPreProcessor(object):
class HTMLPreProcessor(object): class HTMLPreProcessor(object):
PREPROCESS = [ PREPROCESS = [
# Remove huge blocks of contiguous spaces, as they slow down # Remove huge blocks of contiguous spaces, as they slow down
# the following regexes pretty badly # the following regexes pretty badly
(re.compile(r'\s{10000,}'), lambda m: ''), (re.compile(r'\s{10000,}'), lambda m: ''),
# Some idiotic HTML generators (Frontpage, I'm looking at you) # Some idiotic HTML generators (Frontpage, I'm looking at you)
# put all sorts of crap into <head>. This messes up lxml # put all sorts of crap into <head>. This messes up lxml
(re.compile(r'<head[^>]*>\n*(.*?)\n*</head>', re.IGNORECASE|re.DOTALL), (re.compile(r'<head[^>]*>\n*(.*?)\n*</head>', re.IGNORECASE|re.DOTALL),
sanitize_head), sanitize_head),
# Convert all entities, since lxml doesn't handle them well # Convert all entities, since lxml doesn't handle them well
(re.compile(r'&(\S+?);'), convert_entities), (re.compile(r'&(\S+?);'), convert_entities),
# Remove the <![if/endif tags inserted by everybody's darling, MS Word # Remove the <![if/endif tags inserted by everybody's darling, MS Word
(re.compile(r'</{0,1}!\[(end){0,1}if\]{0,1}>', re.IGNORECASE), (re.compile(r'</{0,1}!\[(end){0,1}if\]{0,1}>', re.IGNORECASE),
lambda match: ''), lambda match: ''),
] ]
# Fix pdftohtml markup # Fix pdftohtml markup
PDFTOHTML = [ PDFTOHTML = [
@ -636,7 +636,9 @@ class HTMLPreProcessor(object):
for rule in rules + end_rules: for rule in rules + end_rules:
try: try:
print(rule[0].pattern)
html = rule[0].sub(rule[1], html) html = rule[0].sub(rule[1], html)
print(222222222222)
except Exception as e: except Exception as e:
if rule in user_sr_rules: if rule in user_sr_rules:
self.log.error( self.log.error(