Make the regexps used for pre-processing pdftohtml output more efficient

2025-07-09 03:04:10 -04:00 · 2019-10-19 10:03:21 +05:30 · 2019-10-19 10:03:21 +05:30 · e5dc6b4002
commit e5dc6b4002
parent 2014e6520e
1 changed files with 71 additions and 142 deletions
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@ -344,6 +344,34 @@ class CSSPreProcessor(object):
        return '\n'.join(ans)
 def accent_regex(accent_maps, letter_before=False):
    accent_cat = set()
    letters = set()
    for accent in tuple(accent_maps):
        accent_cat.add(accent)
        k, v = accent_maps[accent].split(':', 1)
        if len(k) != len(v):
            raise ValueError('Invalid mapping for: {} -> {}'.format(k, v))
        accent_maps[accent] = lmap = dict(zip(k, v))
        letters |= set(lmap)
    if letter_before:
        args = ''.join(letters), ''.join(accent_cat)
        accent_group, letter_group = 2, 1
    else:
        args = ''.join(accent_cat), ''.join(letters)
        accent_group, letter_group = 1, 2
    pat = re.compile(r'([{}])\s*(?:<br[^>]*>){{0,1}}\s*([{}])'.format(*args), re.UNICODE)
    def sub(m):
        lmap = accent_maps[m.group(accent_group)]
        return lmap.get(m.group(letter_group)) or m.group()
    return pat, sub
 class HTMLPreProcessor(object):
    PREPROCESS = [
@ -363,116 +391,19 @@ class HTMLPreProcessor(object):
    # Fix pdftohtml markup
    PDFTOHTML  = [
-                  # Fix umlauts
+        accent_regex({
-                  (re.compile(r'¨\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: 'ä'),
+            '¨': 'aAeEiIoOuU:äÄëËïÏöÖüÜ',
-                  (re.compile(r'¨\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: 'Ä'),
+            '`': 'aAeEiIoOuU:àÀèÈìÌòÒùÙ',
-                  (re.compile(r'¨\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: 'ë'),
+            '´': 'aAcCeEiIlLoOnNrRsSuUzZ:áÁćĆéÉíÍĺĹóÓńŃŕŔśŚúÚźŹ',
-                  (re.compile(r'¨\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: 'Ë'),
+            'ˆ': 'aAeEiIoOuU:âÂêÊîÎôÔûÛ',
-                  (re.compile(r'¨\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: 'ï'),
+            '¸': 'cC:çÇ',
-                  (re.compile(r'¨\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: 'Ï'),
+            '˛': 'aAeE:ąĄęĘ',
-                  (re.compile(r'¨\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: 'ö'),
+            '˙': 'zZ:żŻ',
-                  (re.compile(r'¨\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: 'Ö'),
+            'ˇ': 'cCdDeElLnNrRsStTzZ:čČďĎěĚľĽňŇřŘšŠťŤžŽ',
-                  (re.compile(r'¨\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: 'ü'),
+            '°': 'uU:ůŮ',
-                  (re.compile(r'¨\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: 'Ü'),
+        }),
-                  # Fix accents
+        accent_regex({'`': 'aAeEiIoOuU:àÀèÈìÌòÒùÙ'}, letter_before=True),
                  # `
                  (re.compile(r'`\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: 'à'),
                  (re.compile(r'`\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: 'À'),
                  (re.compile(r'`\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: 'è'),
                  (re.compile(r'`\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: 'È'),
                  (re.compile(r'`\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: 'ì'),
                  (re.compile(r'`\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: 'Ì'),
                  (re.compile(r'`\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: 'ò'),
                  (re.compile(r'`\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: 'Ò'),
                  (re.compile(r'`\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: 'ù'),
                  (re.compile(r'`\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: 'Ù'),
                  # ` with letter before
                  (re.compile(r'a\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'à'),
                  (re.compile(r'A\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'À'),
                  (re.compile(r'e\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'è'),
                  (re.compile(r'E\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'È'),
                  (re.compile(r'i\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'ì'),
                  (re.compile(r'I\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'Ì'),
                  (re.compile(r'o\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'ò'),
                  (re.compile(r'O\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'Ò'),
                  (re.compile(r'u\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'ù'),
                  (re.compile(r'U\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'Ù'),
                  # ´
                  (re.compile(r'´\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: 'á'),
                  (re.compile(r'´\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: 'Á'),
                  (re.compile(r'´\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: 'ć'),
                  (re.compile(r'´\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: 'Ć'),
                  (re.compile(r'´\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: 'é'),
                  (re.compile(r'´\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: 'É'),
                  (re.compile(r'´\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: 'í'),
                  (re.compile(r'´\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: 'Í'),
                  (re.compile(r'´\s*(<br.*?>)*\s*l', re.UNICODE), lambda match: 'ĺ'),
                  (re.compile(r'´\s*(<br.*?>)*\s*L', re.UNICODE), lambda match: 'Ĺ'),
                  (re.compile(r'´\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: 'ó'),
                  (re.compile(r'´\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: 'Ó'),
                  (re.compile(r'´\s*(<br.*?>)*\s*n', re.UNICODE), lambda match: 'ń'),
                  (re.compile(r'´\s*(<br.*?>)*\s*N', re.UNICODE), lambda match: 'Ń'),
                  (re.compile(r'´\s*(<br.*?>)*\s*r', re.UNICODE), lambda match: 'ŕ'),
                  (re.compile(r'´\s*(<br.*?>)*\s*R', re.UNICODE), lambda match: 'Ŕ'),
                  (re.compile(r'´\s*(<br.*?>)*\s*s', re.UNICODE), lambda match: 'ś'),
                  (re.compile(r'´\s*(<br.*?>)*\s*S', re.UNICODE), lambda match: 'Ś'),
                  (re.compile(r'´\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: 'ú'),
                  (re.compile(r'´\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: 'Ú'),
                  (re.compile(r'´\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: 'ź'),
                  (re.compile(r'´\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: 'Ź'),
                  # ˆ
                  (re.compile(r'ˆ\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: 'â'),
                  (re.compile(r'ˆ\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: 'Â'),
                  (re.compile(r'ˆ\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: 'ê'),
                  (re.compile(r'ˆ\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: 'Ê'),
                  (re.compile(r'ˆ\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: 'î'),
                  (re.compile(r'ˆ\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: 'Î'),
                  (re.compile(r'ˆ\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: 'ô'),
                  (re.compile(r'ˆ\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: 'Ô'),
                  (re.compile(r'ˆ\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: 'û'),
                  (re.compile(r'ˆ\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: 'Û'),
                  # ¸
                  (re.compile(r'¸\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: 'ç'),
                  (re.compile(r'¸\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: 'Ç'),
                  # ˛
                  (re.compile(r'\s*˛\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: 'ą'),
                  (re.compile(r'\s*˛\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: 'Ą'),
                  (re.compile(r'˛\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: 'ę'),
                  (re.compile(r'˛\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: 'Ę'),
                  # ˙
                  (re.compile(r'˙\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: 'ż'),
                  (re.compile(r'˙\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: 'Ż'),
                  # ˇ
                  (re.compile(r'ˇ\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: 'č'),
                  (re.compile(r'ˇ\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: 'Č'),
                  (re.compile(r'ˇ\s*(<br.*?>)*\s*d', re.UNICODE), lambda match: 'ď'),
                  (re.compile(r'ˇ\s*(<br.*?>)*\s*D', re.UNICODE), lambda match: 'Ď'),
                  (re.compile(r'ˇ\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: 'ě'),
                  (re.compile(r'ˇ\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: 'Ě'),
                  (re.compile(r'ˇ\s*(<br.*?>)*\s*l', re.UNICODE), lambda match: 'ľ'),
                  (re.compile(r'ˇ\s*(<br.*?>)*\s*L', re.UNICODE), lambda match: 'Ľ'),
                  (re.compile(r'ˇ\s*(<br.*?>)*\s*n', re.UNICODE), lambda match: 'ň'),
                  (re.compile(r'ˇ\s*(<br.*?>)*\s*N', re.UNICODE), lambda match: 'Ň'),
                  (re.compile(r'ˇ\s*(<br.*?>)*\s*r', re.UNICODE), lambda match: 'ř'),
                  (re.compile(r'ˇ\s*(<br.*?>)*\s*R', re.UNICODE), lambda match: 'Ř'),
                  (re.compile(r'ˇ\s*(<br.*?>)*\s*s', re.UNICODE), lambda match: 'š'),
                  (re.compile(r'ˇ\s*(<br.*?>)*\s*S', re.UNICODE), lambda match: 'Š'),
                  (re.compile(r'ˇ\s*(<br.*?>)*\s*t', re.UNICODE), lambda match: 'ť'),
                  (re.compile(r'ˇ\s*(<br.*?>)*\s*T', re.UNICODE), lambda match: 'Ť'),
                  (re.compile(r'ˇ\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: 'ž'),
                  (re.compile(r'ˇ\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: 'Ž'),
                  # °
                  (re.compile(r'°\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: 'ů'),
                  (re.compile(r'°\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: 'Ů'),
        # If pdf printed from a browser then the header/footer has a reliable pattern
        (re.compile(r'((?<=</a>)\s*file:/{2,4}[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),
@ -481,21 +412,21 @@ class HTMLPreProcessor(object):
        (re.compile(r'<br>\s*(?P<break>([*#•✦=] *){3,})\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group('break') + '</p>'),
        # Remove <hr> tags
-                  (re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: ''),
+        (re.compile(r'<hr.*?>', re.IGNORECASE), ''),
        # Remove gray background
-                  (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),
+        (re.compile(r'<BODY[^<>]+>'), '<BODY>'),
        # Convert line breaks to paragraphs
-                  (re.compile(r'<br[^>]*>\s*'), lambda match : '</p>\n<p>'),
+        (re.compile(r'<br[^>]*>\s*'), '</p>\n<p>'),
-                  (re.compile(r'<body[^>]*>\s*'), lambda match : '<body>\n<p>'),
+        (re.compile(r'<body[^>]*>\s*'), '<body>\n<p>'),
-                  (re.compile(r'\s*</body>'), lambda match : '</p>\n</body>'),
+        (re.compile(r'\s*</body>'), '</p>\n</body>'),
        # Clean up spaces
-                  (re.compile(r'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
+        (re.compile(r'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), ' '),
        # Add space before and after italics
-                  (re.compile(r'(?<!“)<i>'), lambda match: ' <i>'),
+        (re.compile(r'(?<!“)<i>'), ' <i>'),
-                  (re.compile(r'</i>(?=\w)'), lambda match: '</i> '),
+        (re.compile(r'</i>(?=\w)'), '</i> '),
    ]
    # Fix Book Designer markup
@ -636,9 +567,7 @@ class HTMLPreProcessor(object):
        for rule in rules + end_rules:
            try:
                print(rule[0].pattern)
                html = rule[0].sub(rule[1], html)
                print(222222222222)
            except Exception as e:
                if rule in user_sr_rules:
                    self.log.error(