Make the regexps used for pre-processing pdftohtml output more efficient

This commit is contained in:
Kovid Goyal 2019-10-19 10:03:21 +05:30
parent 2014e6520e
commit e5dc6b4002
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -344,6 +344,34 @@ class CSSPreProcessor(object):
return '\n'.join(ans) return '\n'.join(ans)
def accent_regex(accent_maps, letter_before=False):
accent_cat = set()
letters = set()
for accent in tuple(accent_maps):
accent_cat.add(accent)
k, v = accent_maps[accent].split(':', 1)
if len(k) != len(v):
raise ValueError('Invalid mapping for: {} -> {}'.format(k, v))
accent_maps[accent] = lmap = dict(zip(k, v))
letters |= set(lmap)
if letter_before:
args = ''.join(letters), ''.join(accent_cat)
accent_group, letter_group = 2, 1
else:
args = ''.join(accent_cat), ''.join(letters)
accent_group, letter_group = 1, 2
pat = re.compile(r'([{}])\s*(?:<br[^>]*>){{0,1}}\s*([{}])'.format(*args), re.UNICODE)
def sub(m):
lmap = accent_maps[m.group(accent_group)]
return lmap.get(m.group(letter_group)) or m.group()
return pat, sub
class HTMLPreProcessor(object): class HTMLPreProcessor(object):
PREPROCESS = [ PREPROCESS = [
@ -363,116 +391,19 @@ class HTMLPreProcessor(object):
# Fix pdftohtml markup # Fix pdftohtml markup
PDFTOHTML = [ PDFTOHTML = [
# Fix umlauts accent_regex({
(re.compile(r'¨\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: 'ä'), '¨': 'aAeEiIoOuU:äÄëËïÏöÖüÜ',
(re.compile(r'¨\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: 'Ä'), '`': 'aAeEiIoOuU:àÀèÈìÌòÒùÙ',
(re.compile(r'¨\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: 'ë'), '´': 'aAcCeEiIlLoOnNrRsSuUzZ:áÁćĆéÉíÍĺĹóÓńŃŕŔśŚúÚźŹ',
(re.compile(r'¨\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: 'Ë'), 'ˆ': 'aAeEiIoOuU:âÂêÊîÎôÔûÛ',
(re.compile(r'¨\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: 'ï'), '¸': 'cC:çÇ',
(re.compile(r'¨\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: 'Ï'), '˛': 'aAeE:ąĄęĘ',
(re.compile(r'¨\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: 'ö'), '˙': 'zZ:żŻ',
(re.compile(r'¨\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: 'Ö'), 'ˇ': 'cCdDeElLnNrRsStTzZ:čČďĎěĚľĽňŇřŘšŠťŤžŽ',
(re.compile(r'¨\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: 'ü'), '°': 'uU:ůŮ',
(re.compile(r'¨\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: 'Ü'), }),
# Fix accents accent_regex({'`': 'aAeEiIoOuU:àÀèÈìÌòÒùÙ'}, letter_before=True),
# `
(re.compile(r'`\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: 'à'),
(re.compile(r'`\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: 'À'),
(re.compile(r'`\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: 'è'),
(re.compile(r'`\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: 'È'),
(re.compile(r'`\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: 'ì'),
(re.compile(r'`\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: 'Ì'),
(re.compile(r'`\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: 'ò'),
(re.compile(r'`\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: 'Ò'),
(re.compile(r'`\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: 'ù'),
(re.compile(r'`\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: 'Ù'),
# ` with letter before
(re.compile(r'a\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'à'),
(re.compile(r'A\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'À'),
(re.compile(r'e\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'è'),
(re.compile(r'E\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'È'),
(re.compile(r'i\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'ì'),
(re.compile(r'I\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'Ì'),
(re.compile(r'o\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'ò'),
(re.compile(r'O\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'Ò'),
(re.compile(r'u\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'ù'),
(re.compile(r'U\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'Ù'),
# ´
(re.compile(r'´\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: 'á'),
(re.compile(r'´\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: 'Á'),
(re.compile(r'´\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: 'ć'),
(re.compile(r'´\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: 'Ć'),
(re.compile(r'´\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: 'é'),
(re.compile(r'´\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: 'É'),
(re.compile(r'´\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: 'í'),
(re.compile(r'´\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: 'Í'),
(re.compile(r'´\s*(<br.*?>)*\s*l', re.UNICODE), lambda match: 'ĺ'),
(re.compile(r'´\s*(<br.*?>)*\s*L', re.UNICODE), lambda match: 'Ĺ'),
(re.compile(r'´\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: 'ó'),
(re.compile(r'´\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: 'Ó'),
(re.compile(r'´\s*(<br.*?>)*\s*n', re.UNICODE), lambda match: 'ń'),
(re.compile(r'´\s*(<br.*?>)*\s*N', re.UNICODE), lambda match: 'Ń'),
(re.compile(r'´\s*(<br.*?>)*\s*r', re.UNICODE), lambda match: 'ŕ'),
(re.compile(r'´\s*(<br.*?>)*\s*R', re.UNICODE), lambda match: 'Ŕ'),
(re.compile(r'´\s*(<br.*?>)*\s*s', re.UNICODE), lambda match: 'ś'),
(re.compile(r'´\s*(<br.*?>)*\s*S', re.UNICODE), lambda match: 'Ś'),
(re.compile(r'´\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: 'ú'),
(re.compile(r'´\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: 'Ú'),
(re.compile(r'´\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: 'ź'),
(re.compile(r'´\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: 'Ź'),
# ˆ
(re.compile(r'ˆ\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: 'â'),
(re.compile(r'ˆ\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: 'Â'),
(re.compile(r'ˆ\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: 'ê'),
(re.compile(r'ˆ\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: 'Ê'),
(re.compile(r'ˆ\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: 'î'),
(re.compile(r'ˆ\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: 'Î'),
(re.compile(r'ˆ\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: 'ô'),
(re.compile(r'ˆ\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: 'Ô'),
(re.compile(r'ˆ\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: 'û'),
(re.compile(r'ˆ\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: 'Û'),
# ¸
(re.compile(r'¸\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: 'ç'),
(re.compile(r'¸\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: 'Ç'),
# ˛
(re.compile(r'\s*˛\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: 'ą'),
(re.compile(r'\s*˛\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: 'Ą'),
(re.compile(r'˛\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: 'ę'),
(re.compile(r'˛\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: 'Ę'),
# ˙
(re.compile(r'˙\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: 'ż'),
(re.compile(r'˙\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: 'Ż'),
# ˇ
(re.compile(r'ˇ\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: 'č'),
(re.compile(r'ˇ\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: 'Č'),
(re.compile(r'ˇ\s*(<br.*?>)*\s*d', re.UNICODE), lambda match: 'ď'),
(re.compile(r'ˇ\s*(<br.*?>)*\s*D', re.UNICODE), lambda match: 'Ď'),
(re.compile(r'ˇ\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: 'ě'),
(re.compile(r'ˇ\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: 'Ě'),
(re.compile(r'ˇ\s*(<br.*?>)*\s*l', re.UNICODE), lambda match: 'ľ'),
(re.compile(r'ˇ\s*(<br.*?>)*\s*L', re.UNICODE), lambda match: 'Ľ'),
(re.compile(r'ˇ\s*(<br.*?>)*\s*n', re.UNICODE), lambda match: 'ň'),
(re.compile(r'ˇ\s*(<br.*?>)*\s*N', re.UNICODE), lambda match: 'Ň'),
(re.compile(r'ˇ\s*(<br.*?>)*\s*r', re.UNICODE), lambda match: 'ř'),
(re.compile(r'ˇ\s*(<br.*?>)*\s*R', re.UNICODE), lambda match: 'Ř'),
(re.compile(r'ˇ\s*(<br.*?>)*\s*s', re.UNICODE), lambda match: 'š'),
(re.compile(r'ˇ\s*(<br.*?>)*\s*S', re.UNICODE), lambda match: 'Š'),
(re.compile(r'ˇ\s*(<br.*?>)*\s*t', re.UNICODE), lambda match: 'ť'),
(re.compile(r'ˇ\s*(<br.*?>)*\s*T', re.UNICODE), lambda match: 'Ť'),
(re.compile(r'ˇ\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: 'ž'),
(re.compile(r'ˇ\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: 'Ž'),
# °
(re.compile(r'°\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: 'ů'),
(re.compile(r'°\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: 'Ů'),
# If pdf printed from a browser then the header/footer has a reliable pattern # If pdf printed from a browser then the header/footer has a reliable pattern
(re.compile(r'((?<=</a>)\s*file:/{2,4}[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''), (re.compile(r'((?<=</a>)\s*file:/{2,4}[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),
@ -481,21 +412,21 @@ class HTMLPreProcessor(object):
(re.compile(r'<br>\s*(?P<break>([*#•✦=] *){3,})\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group('break') + '</p>'), (re.compile(r'<br>\s*(?P<break>([*#•✦=] *){3,})\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group('break') + '</p>'),
# Remove <hr> tags # Remove <hr> tags
(re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: ''), (re.compile(r'<hr.*?>', re.IGNORECASE), ''),
# Remove gray background # Remove gray background
(re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'), (re.compile(r'<BODY[^<>]+>'), '<BODY>'),
# Convert line breaks to paragraphs # Convert line breaks to paragraphs
(re.compile(r'<br[^>]*>\s*'), lambda match : '</p>\n<p>'), (re.compile(r'<br[^>]*>\s*'), '</p>\n<p>'),
(re.compile(r'<body[^>]*>\s*'), lambda match : '<body>\n<p>'), (re.compile(r'<body[^>]*>\s*'), '<body>\n<p>'),
(re.compile(r'\s*</body>'), lambda match : '</p>\n</body>'), (re.compile(r'\s*</body>'), '</p>\n</body>'),
# Clean up spaces # Clean up spaces
(re.compile(r'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '), (re.compile(r'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), ' '),
# Add space before and after italics # Add space before and after italics
(re.compile(r'(?<!“)<i>'), lambda match: ' <i>'), (re.compile(r'(?<!“)<i>'), ' <i>'),
(re.compile(r'</i>(?=\w)'), lambda match: '</i> '), (re.compile(r'</i>(?=\w)'), '</i> '),
] ]
# Fix Book Designer markup # Fix Book Designer markup
@ -636,9 +567,7 @@ class HTMLPreProcessor(object):
for rule in rules + end_rules: for rule in rules + end_rules:
try: try:
print(rule[0].pattern)
html = rule[0].sub(rule[1], html) html = rule[0].sub(rule[1], html)
print(222222222222)
except Exception as e: except Exception as e:
if rule in user_sr_rules: if rule in user_sr_rules:
self.log.error( self.log.error(