From e5dc6b40023a465221f0eb35ae7b3df701a5f688 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sat, 19 Oct 2019 10:03:21 +0530
Subject: [PATCH] Make the regexps used for pre-processing pdftohtml output
more efficient
---
src/calibre/ebooks/conversion/preprocess.py | 213 +++++++-------------
1 file changed, 71 insertions(+), 142 deletions(-)
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 4cc3521c1d..b3eb102fc6 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -344,6 +344,34 @@ class CSSPreProcessor(object):
return '\n'.join(ans)
+def accent_regex(accent_maps, letter_before=False):
+ accent_cat = set()
+ letters = set()
+
+ for accent in tuple(accent_maps):
+ accent_cat.add(accent)
+ k, v = accent_maps[accent].split(':', 1)
+ if len(k) != len(v):
+ raise ValueError('Invalid mapping for: {} -> {}'.format(k, v))
+ accent_maps[accent] = lmap = dict(zip(k, v))
+ letters |= set(lmap)
+
+ if letter_before:
+ args = ''.join(letters), ''.join(accent_cat)
+ accent_group, letter_group = 2, 1
+ else:
+ args = ''.join(accent_cat), ''.join(letters)
+ accent_group, letter_group = 1, 2
+
+ pat = re.compile(r'([{}])\s*(?:<br[^>]*>){{0,1}}\s*([{}])'.format(*args), re.UNICODE)
+
+ def sub(m):
+ lmap = accent_maps[m.group(accent_group)]
+ return lmap.get(m.group(letter_group)) or m.group()
+
+ return pat, sub
+
+
class HTMLPreProcessor(object):
PREPROCESS = [
@@ -363,156 +391,59 @@ class HTMLPreProcessor(object):
# Fix pdftohtml markup
PDFTOHTML = [
- # Fix umlauts
- (re.compile(r'¨\s*()*\s*a', re.UNICODE), lambda match: 'ä'),
- (re.compile(r'¨\s*()*\s*A', re.UNICODE), lambda match: 'Ä'),
- (re.compile(r'¨\s*()*\s*e', re.UNICODE), lambda match: 'ë'),
- (re.compile(r'¨\s*()*\s*E', re.UNICODE), lambda match: 'Ë'),
- (re.compile(r'¨\s*()*\s*i', re.UNICODE), lambda match: 'ï'),
- (re.compile(r'¨\s*()*\s*I', re.UNICODE), lambda match: 'Ï'),
- (re.compile(r'¨\s*()*\s*o', re.UNICODE), lambda match: 'ö'),
- (re.compile(r'¨\s*()*\s*O', re.UNICODE), lambda match: 'Ö'),
- (re.compile(r'¨\s*()*\s*u', re.UNICODE), lambda match: 'ü'),
- (re.compile(r'¨\s*()*\s*U', re.UNICODE), lambda match: 'Ü'),
+ accent_regex({
+ '¨': 'aAeEiIoOuU:äÄëËïÏöÖüÜ',
+ '`': 'aAeEiIoOuU:àÀèÈìÌòÒùÙ',
+ '´': 'aAcCeEiIlLoOnNrRsSuUzZ:áÁćĆéÉíÍĺĹóÓńŃŕŔśŚúÚźŹ',
+ 'ˆ': 'aAeEiIoOuU:âÂêÊîÎôÔûÛ',
+ '¸': 'cC:çÇ',
+ '˛': 'aAeE:ąĄęĘ',
+ '˙': 'zZ:żŻ',
+ 'ˇ': 'cCdDeElLnNrRsStTzZ:čČďĎěĚľĽňŇřŘšŠťŤžŽ',
+ '°': 'uU:ůŮ',
+ }),
- # Fix accents
- # `
- (re.compile(r'`\s*()*\s*a', re.UNICODE), lambda match: 'à'),
- (re.compile(r'`\s*()*\s*A', re.UNICODE), lambda match: 'À'),
- (re.compile(r'`\s*()*\s*e', re.UNICODE), lambda match: 'è'),
- (re.compile(r'`\s*()*\s*E', re.UNICODE), lambda match: 'È'),
- (re.compile(r'`\s*()*\s*i', re.UNICODE), lambda match: 'ì'),
- (re.compile(r'`\s*()*\s*I', re.UNICODE), lambda match: 'Ì'),
- (re.compile(r'`\s*()*\s*o', re.UNICODE), lambda match: 'ò'),
- (re.compile(r'`\s*()*\s*O', re.UNICODE), lambda match: 'Ò'),
- (re.compile(r'`\s*()*\s*u', re.UNICODE), lambda match: 'ù'),
- (re.compile(r'`\s*()*\s*U', re.UNICODE), lambda match: 'Ù'),
+ accent_regex({'`': 'aAeEiIoOuU:àÀèÈìÌòÒùÙ'}, letter_before=True),
- # ` with letter before
- (re.compile(r'a\s*()*\s*`', re.UNICODE), lambda match: 'à'),
- (re.compile(r'A\s*()*\s*`', re.UNICODE), lambda match: 'À'),
- (re.compile(r'e\s*()*\s*`', re.UNICODE), lambda match: 'è'),
- (re.compile(r'E\s*()*\s*`', re.UNICODE), lambda match: 'È'),
- (re.compile(r'i\s*()*\s*`', re.UNICODE), lambda match: 'ì'),
- (re.compile(r'I\s*()*\s*`', re.UNICODE), lambda match: 'Ì'),
- (re.compile(r'o\s*()*\s*`', re.UNICODE), lambda match: 'ò'),
- (re.compile(r'O\s*()*\s*`', re.UNICODE), lambda match: 'Ò'),
- (re.compile(r'u\s*()*\s*`', re.UNICODE), lambda match: 'ù'),
- (re.compile(r'U\s*()*\s*`', re.UNICODE), lambda match: 'Ù'),
+ # If pdf printed from a browser then the header/footer has a reliable pattern
+ (re.compile(r'((?<=</a>)\s*file:/{2,4}[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),
- # ´
- (re.compile(r'´\s*()*\s*a', re.UNICODE), lambda match: 'á'),
- (re.compile(r'´\s*()*\s*A', re.UNICODE), lambda match: 'Á'),
- (re.compile(r'´\s*()*\s*c', re.UNICODE), lambda match: 'ć'),
- (re.compile(r'´\s*()*\s*C', re.UNICODE), lambda match: 'Ć'),
- (re.compile(r'´\s*()*\s*e', re.UNICODE), lambda match: 'é'),
- (re.compile(r'´\s*()*\s*E', re.UNICODE), lambda match: 'É'),
- (re.compile(r'´\s*()*\s*i', re.UNICODE), lambda match: 'í'),
- (re.compile(r'´\s*()*\s*I', re.UNICODE), lambda match: 'Í'),
- (re.compile(r'´\s*()*\s*l', re.UNICODE), lambda match: 'ĺ'),
- (re.compile(r'´\s*()*\s*L', re.UNICODE), lambda match: 'Ĺ'),
- (re.compile(r'´\s*()*\s*o', re.UNICODE), lambda match: 'ó'),
- (re.compile(r'´\s*()*\s*O', re.UNICODE), lambda match: 'Ó'),
- (re.compile(r'´\s*()*\s*n', re.UNICODE), lambda match: 'ń'),
- (re.compile(r'´\s*()*\s*N', re.UNICODE), lambda match: 'Ń'),
- (re.compile(r'´\s*()*\s*r', re.UNICODE), lambda match: 'ŕ'),
- (re.compile(r'´\s*()*\s*R', re.UNICODE), lambda match: 'Ŕ'),
- (re.compile(r'´\s*()*\s*s', re.UNICODE), lambda match: 'ś'),
- (re.compile(r'´\s*()*\s*S', re.UNICODE), lambda match: 'Ś'),
- (re.compile(r'´\s*()*\s*u', re.UNICODE), lambda match: 'ú'),
- (re.compile(r'´\s*()*\s*U', re.UNICODE), lambda match: 'Ú'),
- (re.compile(r'´\s*()*\s*z', re.UNICODE), lambda match: 'ź'),
- (re.compile(r'´\s*()*\s*Z', re.UNICODE), lambda match: 'Ź'),
+ # Center separator lines
+ (re.compile(r'<br>\s*(?P<break>([*#•✦=] *){3,})\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group('break') + '</p>'),
- # ˆ
- (re.compile(r'ˆ\s*()*\s*a', re.UNICODE), lambda match: 'â'),
- (re.compile(r'ˆ\s*()*\s*A', re.UNICODE), lambda match: 'Â'),
- (re.compile(r'ˆ\s*()*\s*e', re.UNICODE), lambda match: 'ê'),
- (re.compile(r'ˆ\s*()*\s*E', re.UNICODE), lambda match: 'Ê'),
- (re.compile(r'ˆ\s*()*\s*i', re.UNICODE), lambda match: 'î'),
- (re.compile(r'ˆ\s*()*\s*I', re.UNICODE), lambda match: 'Î'),
- (re.compile(r'ˆ\s*()*\s*o', re.UNICODE), lambda match: 'ô'),
- (re.compile(r'ˆ\s*()*\s*O', re.UNICODE), lambda match: 'Ô'),
- (re.compile(r'ˆ\s*()*\s*u', re.UNICODE), lambda match: 'û'),
- (re.compile(r'ˆ\s*()*\s*U', re.UNICODE), lambda match: 'Û'),
+ # Remove <hr> tags
+ (re.compile(r'<hr.*?>', re.IGNORECASE), ''),
- # ¸
- (re.compile(r'¸\s*()*\s*c', re.UNICODE), lambda match: 'ç'),
- (re.compile(r'¸\s*()*\s*C', re.UNICODE), lambda match: 'Ç'),
+ # Remove gray background
+ (re.compile(r'<BODY[^<>]+>'), '<BODY>'),
- # ˛
- (re.compile(r'\s*˛\s*()*\s*a', re.UNICODE), lambda match: 'ą'),
- (re.compile(r'\s*˛\s*()*\s*A', re.UNICODE), lambda match: 'Ą'),
- (re.compile(r'˛\s*()*\s*e', re.UNICODE), lambda match: 'ę'),
- (re.compile(r'˛\s*()*\s*E', re.UNICODE), lambda match: 'Ę'),
+ # Convert line breaks to paragraphs
+ (re.compile(r'<br[^>]*>\s*'), '</p>\n<p>'),
+ (re.compile(r'<body[^>]*>\s*'), '<body>\n<p>'),
+ (re.compile(r'\s*</body>'), '</p>\n</body>'),
- # ˙
- (re.compile(r'˙\s*()*\s*z', re.UNICODE), lambda match: 'ż'),
- (re.compile(r'˙\s*()*\s*Z', re.UNICODE), lambda match: 'Ż'),
-
- # ˇ
- (re.compile(r'ˇ\s*()*\s*c', re.UNICODE), lambda match: 'č'),
- (re.compile(r'ˇ\s*()*\s*C', re.UNICODE), lambda match: 'Č'),
- (re.compile(r'ˇ\s*()*\s*d', re.UNICODE), lambda match: 'ď'),
- (re.compile(r'ˇ\s*()*\s*D', re.UNICODE), lambda match: 'Ď'),
- (re.compile(r'ˇ\s*()*\s*e', re.UNICODE), lambda match: 'ě'),
- (re.compile(r'ˇ\s*()*\s*E', re.UNICODE), lambda match: 'Ě'),
- (re.compile(r'ˇ\s*()*\s*l', re.UNICODE), lambda match: 'ľ'),
- (re.compile(r'ˇ\s*()*\s*L', re.UNICODE), lambda match: 'Ľ'),
- (re.compile(r'ˇ\s*()*\s*n', re.UNICODE), lambda match: 'ň'),
- (re.compile(r'ˇ\s*()*\s*N', re.UNICODE), lambda match: 'Ň'),
- (re.compile(r'ˇ\s*()*\s*r', re.UNICODE), lambda match: 'ř'),
- (re.compile(r'ˇ\s*()*\s*R', re.UNICODE), lambda match: 'Ř'),
- (re.compile(r'ˇ\s*()*\s*s', re.UNICODE), lambda match: 'š'),
- (re.compile(r'ˇ\s*()*\s*S', re.UNICODE), lambda match: 'Š'),
- (re.compile(r'ˇ\s*()*\s*t', re.UNICODE), lambda match: 'ť'),
- (re.compile(r'ˇ\s*()*\s*T', re.UNICODE), lambda match: 'Ť'),
- (re.compile(r'ˇ\s*()*\s*z', re.UNICODE), lambda match: 'ž'),
- (re.compile(r'ˇ\s*()*\s*Z', re.UNICODE), lambda match: 'Ž'),
-
- # °
- (re.compile(r'°\s*()*\s*u', re.UNICODE), lambda match: 'ů'),
- (re.compile(r'°\s*()*\s*U', re.UNICODE), lambda match: 'Ů'),
-
- # If pdf printed from a browser then the header/footer has a reliable pattern
- (re.compile(r'((?<=)\s*file:/{2,4}[A-Z].*
|file:////?[A-Z].*
(?=\s*
))', re.IGNORECASE), lambda match: ''),
-
- # Center separator lines
- (re.compile(r'
\s*(?P([*#•✦=] *){3,})\s*
'), lambda match: '\n
' + match.group('break') + '
'),
-
- # Remove
tags
- (re.compile(r'', re.IGNORECASE), lambda match: ''),
-
- # Remove gray background
- (re.compile(r']+>'), lambda match : ''),
-
- # Convert line breaks to paragraphs
- (re.compile(r'
]*>\s*'), lambda match : '\n'),
- (re.compile(r'
]*>\s*'), lambda match : '\n'),
- (re.compile(r'\s*'), lambda match : '
\n'),
-
- # Clean up spaces
- (re.compile(r'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
- # Add space before and after italics
- (re.compile(r'(?'), lambda match: ' '),
- (re.compile(r'(?=\w)'), lambda match: ' '),
- ]
+ # Clean up spaces
+ (re.compile(r'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), ' '),
+ # Add space before and after italics
+ (re.compile(r'(?<!“)<i>'), ' <i>'),
+ (re.compile(r'</i>(?=\w)'), '</i> '),
+ ]
# Fix Book Designer markup
BOOK_DESIGNER = [
- # HR
- (re.compile('
', re.IGNORECASE),
- lambda match : ' '),
- # Create header tags
- (re.compile(r'<]*?id=BookTitle[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?
', re.IGNORECASE),
- lambda match : '%s
'%(match.group(2) if match.group(2) else 'center', match.group(3))),
- (re.compile(r'<]*?id=BookAuthor[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?
', re.IGNORECASE),
- lambda match : '%s
'%(match.group(2) if match.group(2) else 'center', match.group(3))),
- (re.compile('<]*?id=title[^><]*?>(.*?)', re.IGNORECASE|re.DOTALL),
- lambda match : '%s
'%(match.group(1),)),
- (re.compile('<]*?id=subtitle[^><]*?>(.*?)', re.IGNORECASE|re.DOTALL),
- lambda match : '%s
'%(match.group(1),)),
- ]
+ # HR
+ (re.compile('
', re.IGNORECASE),
+ lambda match : ' '),
+ # Create header tags
+ (re.compile(r'<]*?id=BookTitle[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?
', re.IGNORECASE),
+ lambda match : '%s
'%(match.group(2) if match.group(2) else 'center', match.group(3))),
+ (re.compile(r'<]*?id=BookAuthor[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?
', re.IGNORECASE),
+ lambda match : '%s
'%(match.group(2) if match.group(2) else 'center', match.group(3))),
+ (re.compile('<]*?id=title[^><]*?>(.*?)', re.IGNORECASE|re.DOTALL),
+ lambda match : '%s
'%(match.group(1),)),
+ (re.compile('<]*?id=subtitle[^><]*?>(.*?)', re.IGNORECASE|re.DOTALL),
+ lambda match : '%s
'%(match.group(1),)),
+ ]
def __init__(self, log=None, extra_opts=None, regex_wizard_callback=None):
self.log = log
@@ -636,9 +567,7 @@ class HTMLPreProcessor(object):
for rule in rules + end_rules:
try:
- print(rule[0].pattern)
html = rule[0].sub(rule[1], html)
- print(222222222222)
except Exception as e:
if rule in user_sr_rules:
self.log.error(