From e5dc6b40023a465221f0eb35ae7b3df701a5f688 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 19 Oct 2019 10:03:21 +0530 Subject: [PATCH] Make the regexps used for pre-processing pdftohtml output more efficient --- src/calibre/ebooks/conversion/preprocess.py | 213 +++++++------------- 1 file changed, 71 insertions(+), 142 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 4cc3521c1d..b3eb102fc6 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -344,6 +344,34 @@ class CSSPreProcessor(object): return '\n'.join(ans) +def accent_regex(accent_maps, letter_before=False): + accent_cat = set() + letters = set() + + for accent in tuple(accent_maps): + accent_cat.add(accent) + k, v = accent_maps[accent].split(':', 1) + if len(k) != len(v): + raise ValueError('Invalid mapping for: {} -> {}'.format(k, v)) + accent_maps[accent] = lmap = dict(zip(k, v)) + letters |= set(lmap) + + if letter_before: + args = ''.join(letters), ''.join(accent_cat) + accent_group, letter_group = 2, 1 + else: + args = ''.join(accent_cat), ''.join(letters) + accent_group, letter_group = 1, 2 + + pat = re.compile(r'([{}])\s*(?:]*>){{0,1}}\s*([{}])'.format(*args), re.UNICODE) + + def sub(m): + lmap = accent_maps[m.group(accent_group)] + return lmap.get(m.group(letter_group)) or m.group() + + return pat, sub + + class HTMLPreProcessor(object): PREPROCESS = [ @@ -363,156 +391,59 @@ class HTMLPreProcessor(object): # Fix pdftohtml markup PDFTOHTML = [ - # Fix umlauts - (re.compile(r'¨\s*()*\s*a', re.UNICODE), lambda match: 'ä'), - (re.compile(r'¨\s*()*\s*A', re.UNICODE), lambda match: 'Ä'), - (re.compile(r'¨\s*()*\s*e', re.UNICODE), lambda match: 'ë'), - (re.compile(r'¨\s*()*\s*E', re.UNICODE), lambda match: 'Ë'), - (re.compile(r'¨\s*()*\s*i', re.UNICODE), lambda match: 'ï'), - (re.compile(r'¨\s*()*\s*I', re.UNICODE), lambda match: 'Ï'), - (re.compile(r'¨\s*()*\s*o', re.UNICODE), lambda match: 'ö'), - (re.compile(r'¨\s*()*\s*O', re.UNICODE), lambda match: 'Ö'), - (re.compile(r'¨\s*()*\s*u', re.UNICODE), lambda match: 'ü'), - (re.compile(r'¨\s*()*\s*U', re.UNICODE), lambda match: 'Ü'), + accent_regex({ + '¨': 'aAeEiIoOuU:äÄëËïÏöÖüÜ', + '`': 'aAeEiIoOuU:àÀèÈìÌòÒùÙ', + '´': 'aAcCeEiIlLoOnNrRsSuUzZ:áÁćĆéÉíÍĺĹóÓńŃŕŔśŚúÚźŹ', + 'ˆ': 'aAeEiIoOuU:âÂêÊîÎôÔûÛ', + '¸': 'cC:çÇ', + '˛': 'aAeE:ąĄęĘ', + '˙': 'zZ:żŻ', + 'ˇ': 'cCdDeElLnNrRsStTzZ:čČďĎěĚľĽňŇřŘšŠťŤžŽ', + '°': 'uU:ůŮ', + }), - # Fix accents - # ` - (re.compile(r'`\s*()*\s*a', re.UNICODE), lambda match: 'à'), - (re.compile(r'`\s*()*\s*A', re.UNICODE), lambda match: 'À'), - (re.compile(r'`\s*()*\s*e', re.UNICODE), lambda match: 'è'), - (re.compile(r'`\s*()*\s*E', re.UNICODE), lambda match: 'È'), - (re.compile(r'`\s*()*\s*i', re.UNICODE), lambda match: 'ì'), - (re.compile(r'`\s*()*\s*I', re.UNICODE), lambda match: 'Ì'), - (re.compile(r'`\s*()*\s*o', re.UNICODE), lambda match: 'ò'), - (re.compile(r'`\s*()*\s*O', re.UNICODE), lambda match: 'Ò'), - (re.compile(r'`\s*()*\s*u', re.UNICODE), lambda match: 'ù'), - (re.compile(r'`\s*()*\s*U', re.UNICODE), lambda match: 'Ù'), + accent_regex({'`': 'aAeEiIoOuU:àÀèÈìÌòÒùÙ'}, letter_before=True), - # ` with letter before - (re.compile(r'a\s*()*\s*`', re.UNICODE), lambda match: 'à'), - (re.compile(r'A\s*()*\s*`', re.UNICODE), lambda match: 'À'), - (re.compile(r'e\s*()*\s*`', re.UNICODE), lambda match: 'è'), - (re.compile(r'E\s*()*\s*`', re.UNICODE), lambda match: 'È'), - (re.compile(r'i\s*()*\s*`', re.UNICODE), lambda match: 'ì'), - (re.compile(r'I\s*()*\s*`', re.UNICODE), lambda match: 'Ì'), - (re.compile(r'o\s*()*\s*`', re.UNICODE), lambda match: 'ò'), - (re.compile(r'O\s*()*\s*`', re.UNICODE), lambda match: 'Ò'), - (re.compile(r'u\s*()*\s*`', re.UNICODE), lambda match: 'ù'), - (re.compile(r'U\s*()*\s*`', re.UNICODE), lambda match: 'Ù'), + # If pdf printed from a browser then the header/footer has a reliable pattern + (re.compile(r'((?<=)\s*file:/{2,4}[A-Z].*
|file:////?[A-Z].*
(?=\s*
))', re.IGNORECASE), lambda match: ''), - # ´ - (re.compile(r'´\s*()*\s*a', re.UNICODE), lambda match: 'á'), - (re.compile(r'´\s*()*\s*A', re.UNICODE), lambda match: 'Á'), - (re.compile(r'´\s*()*\s*c', re.UNICODE), lambda match: 'ć'), - (re.compile(r'´\s*()*\s*C', re.UNICODE), lambda match: 'Ć'), - (re.compile(r'´\s*()*\s*e', re.UNICODE), lambda match: 'é'), - (re.compile(r'´\s*()*\s*E', re.UNICODE), lambda match: 'É'), - (re.compile(r'´\s*()*\s*i', re.UNICODE), lambda match: 'í'), - (re.compile(r'´\s*()*\s*I', re.UNICODE), lambda match: 'Í'), - (re.compile(r'´\s*()*\s*l', re.UNICODE), lambda match: 'ĺ'), - (re.compile(r'´\s*()*\s*L', re.UNICODE), lambda match: 'Ĺ'), - (re.compile(r'´\s*()*\s*o', re.UNICODE), lambda match: 'ó'), - (re.compile(r'´\s*()*\s*O', re.UNICODE), lambda match: 'Ó'), - (re.compile(r'´\s*()*\s*n', re.UNICODE), lambda match: 'ń'), - (re.compile(r'´\s*()*\s*N', re.UNICODE), lambda match: 'Ń'), - (re.compile(r'´\s*()*\s*r', re.UNICODE), lambda match: 'ŕ'), - (re.compile(r'´\s*()*\s*R', re.UNICODE), lambda match: 'Ŕ'), - (re.compile(r'´\s*()*\s*s', re.UNICODE), lambda match: 'ś'), - (re.compile(r'´\s*()*\s*S', re.UNICODE), lambda match: 'Ś'), - (re.compile(r'´\s*()*\s*u', re.UNICODE), lambda match: 'ú'), - (re.compile(r'´\s*()*\s*U', re.UNICODE), lambda match: 'Ú'), - (re.compile(r'´\s*()*\s*z', re.UNICODE), lambda match: 'ź'), - (re.compile(r'´\s*()*\s*Z', re.UNICODE), lambda match: 'Ź'), + # Center separator lines + (re.compile(r'
\s*(?P([*#•✦=] *){3,})\s*
'), lambda match: '

\n

' + match.group('break') + '

'), - # ˆ - (re.compile(r'ˆ\s*()*\s*a', re.UNICODE), lambda match: 'â'), - (re.compile(r'ˆ\s*()*\s*A', re.UNICODE), lambda match: 'Â'), - (re.compile(r'ˆ\s*()*\s*e', re.UNICODE), lambda match: 'ê'), - (re.compile(r'ˆ\s*()*\s*E', re.UNICODE), lambda match: 'Ê'), - (re.compile(r'ˆ\s*()*\s*i', re.UNICODE), lambda match: 'î'), - (re.compile(r'ˆ\s*()*\s*I', re.UNICODE), lambda match: 'Î'), - (re.compile(r'ˆ\s*()*\s*o', re.UNICODE), lambda match: 'ô'), - (re.compile(r'ˆ\s*()*\s*O', re.UNICODE), lambda match: 'Ô'), - (re.compile(r'ˆ\s*()*\s*u', re.UNICODE), lambda match: 'û'), - (re.compile(r'ˆ\s*()*\s*U', re.UNICODE), lambda match: 'Û'), + # Remove
tags + (re.compile(r'', re.IGNORECASE), ''), - # ¸ - (re.compile(r'¸\s*()*\s*c', re.UNICODE), lambda match: 'ç'), - (re.compile(r'¸\s*()*\s*C', re.UNICODE), lambda match: 'Ç'), + # Remove gray background + (re.compile(r']+>'), ''), - # ˛ - (re.compile(r'\s*˛\s*()*\s*a', re.UNICODE), lambda match: 'ą'), - (re.compile(r'\s*˛\s*()*\s*A', re.UNICODE), lambda match: 'Ą'), - (re.compile(r'˛\s*()*\s*e', re.UNICODE), lambda match: 'ę'), - (re.compile(r'˛\s*()*\s*E', re.UNICODE), lambda match: 'Ę'), + # Convert line breaks to paragraphs + (re.compile(r']*>\s*'), '

\n

'), + (re.compile(r']*>\s*'), '\n

'), + (re.compile(r'\s*'), '

\n'), - # ˙ - (re.compile(r'˙\s*()*\s*z', re.UNICODE), lambda match: 'ż'), - (re.compile(r'˙\s*()*\s*Z', re.UNICODE), lambda match: 'Ż'), - - # ˇ - (re.compile(r'ˇ\s*()*\s*c', re.UNICODE), lambda match: 'č'), - (re.compile(r'ˇ\s*()*\s*C', re.UNICODE), lambda match: 'Č'), - (re.compile(r'ˇ\s*()*\s*d', re.UNICODE), lambda match: 'ď'), - (re.compile(r'ˇ\s*()*\s*D', re.UNICODE), lambda match: 'Ď'), - (re.compile(r'ˇ\s*()*\s*e', re.UNICODE), lambda match: 'ě'), - (re.compile(r'ˇ\s*()*\s*E', re.UNICODE), lambda match: 'Ě'), - (re.compile(r'ˇ\s*()*\s*l', re.UNICODE), lambda match: 'ľ'), - (re.compile(r'ˇ\s*()*\s*L', re.UNICODE), lambda match: 'Ľ'), - (re.compile(r'ˇ\s*()*\s*n', re.UNICODE), lambda match: 'ň'), - (re.compile(r'ˇ\s*()*\s*N', re.UNICODE), lambda match: 'Ň'), - (re.compile(r'ˇ\s*()*\s*r', re.UNICODE), lambda match: 'ř'), - (re.compile(r'ˇ\s*()*\s*R', re.UNICODE), lambda match: 'Ř'), - (re.compile(r'ˇ\s*()*\s*s', re.UNICODE), lambda match: 'š'), - (re.compile(r'ˇ\s*()*\s*S', re.UNICODE), lambda match: 'Š'), - (re.compile(r'ˇ\s*()*\s*t', re.UNICODE), lambda match: 'ť'), - (re.compile(r'ˇ\s*()*\s*T', re.UNICODE), lambda match: 'Ť'), - (re.compile(r'ˇ\s*()*\s*z', re.UNICODE), lambda match: 'ž'), - (re.compile(r'ˇ\s*()*\s*Z', re.UNICODE), lambda match: 'Ž'), - - # ° - (re.compile(r'°\s*()*\s*u', re.UNICODE), lambda match: 'ů'), - (re.compile(r'°\s*()*\s*U', re.UNICODE), lambda match: 'Ů'), - - # If pdf printed from a browser then the header/footer has a reliable pattern - (re.compile(r'((?<=)\s*file:/{2,4}[A-Z].*
|file:////?[A-Z].*
(?=\s*
))', re.IGNORECASE), lambda match: ''), - - # Center separator lines - (re.compile(r'
\s*(?P([*#•✦=] *){3,})\s*
'), lambda match: '

\n

' + match.group('break') + '

'), - - # Remove
tags - (re.compile(r'', re.IGNORECASE), lambda match: ''), - - # Remove gray background - (re.compile(r']+>'), lambda match : ''), - - # Convert line breaks to paragraphs - (re.compile(r']*>\s*'), lambda match : '

\n

'), - (re.compile(r']*>\s*'), lambda match : '\n

'), - (re.compile(r'\s*'), lambda match : '

\n'), - - # Clean up spaces - (re.compile(r'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '), - # Add space before and after italics - (re.compile(r'(?'), lambda match: ' '), - (re.compile(r'(?=\w)'), lambda match: ' '), - ] + # Clean up spaces + (re.compile(r'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), ' '), + # Add space before and after italics + (re.compile(r'(?'), ' '), + (re.compile(r'(?=\w)'), ' '), + ] # Fix Book Designer markup BOOK_DESIGNER = [ - # HR - (re.compile('
', re.IGNORECASE), - lambda match : ' '), - # Create header tags - (re.compile(r'<]*?id=BookTitle[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?', re.IGNORECASE), - lambda match : '

%s

'%(match.group(2) if match.group(2) else 'center', match.group(3))), - (re.compile(r'<]*?id=BookAuthor[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?', re.IGNORECASE), - lambda match : '

%s

'%(match.group(2) if match.group(2) else 'center', match.group(3))), - (re.compile('<]*?id=title[^><]*?>(.*?)', re.IGNORECASE|re.DOTALL), - lambda match : '

%s

'%(match.group(1),)), - (re.compile('<]*?id=subtitle[^><]*?>(.*?)', re.IGNORECASE|re.DOTALL), - lambda match : '

%s

'%(match.group(1),)), - ] + # HR + (re.compile('
', re.IGNORECASE), + lambda match : ' '), + # Create header tags + (re.compile(r'<]*?id=BookTitle[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?', re.IGNORECASE), + lambda match : '

%s

'%(match.group(2) if match.group(2) else 'center', match.group(3))), + (re.compile(r'<]*?id=BookAuthor[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?', re.IGNORECASE), + lambda match : '

%s

'%(match.group(2) if match.group(2) else 'center', match.group(3))), + (re.compile('<]*?id=title[^><]*?>(.*?)', re.IGNORECASE|re.DOTALL), + lambda match : '

%s

'%(match.group(1),)), + (re.compile('<]*?id=subtitle[^><]*?>(.*?)', re.IGNORECASE|re.DOTALL), + lambda match : '

%s

'%(match.group(1),)), + ] def __init__(self, log=None, extra_opts=None, regex_wizard_callback=None): self.log = log @@ -636,9 +567,7 @@ class HTMLPreProcessor(object): for rule in rules + end_rules: try: - print(rule[0].pattern) html = rule[0].sub(rule[1], html) - print(222222222222) except Exception as e: if rule in user_sr_rules: self.log.error(