mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Make the regexps used for pre-processing pdftohtml output more efficient
This commit is contained in:
parent
2014e6520e
commit
e5dc6b4002
@ -344,6 +344,34 @@ class CSSPreProcessor(object):
|
||||
return '\n'.join(ans)
|
||||
|
||||
|
||||
def accent_regex(accent_maps, letter_before=False):
|
||||
accent_cat = set()
|
||||
letters = set()
|
||||
|
||||
for accent in tuple(accent_maps):
|
||||
accent_cat.add(accent)
|
||||
k, v = accent_maps[accent].split(':', 1)
|
||||
if len(k) != len(v):
|
||||
raise ValueError('Invalid mapping for: {} -> {}'.format(k, v))
|
||||
accent_maps[accent] = lmap = dict(zip(k, v))
|
||||
letters |= set(lmap)
|
||||
|
||||
if letter_before:
|
||||
args = ''.join(letters), ''.join(accent_cat)
|
||||
accent_group, letter_group = 2, 1
|
||||
else:
|
||||
args = ''.join(accent_cat), ''.join(letters)
|
||||
accent_group, letter_group = 1, 2
|
||||
|
||||
pat = re.compile(r'([{}])\s*(?:<br[^>]*>){{0,1}}\s*([{}])'.format(*args), re.UNICODE)
|
||||
|
||||
def sub(m):
|
||||
lmap = accent_maps[m.group(accent_group)]
|
||||
return lmap.get(m.group(letter_group)) or m.group()
|
||||
|
||||
return pat, sub
|
||||
|
||||
|
||||
class HTMLPreProcessor(object):
|
||||
|
||||
PREPROCESS = [
|
||||
@ -363,156 +391,59 @@ class HTMLPreProcessor(object):
|
||||
|
||||
# Fix pdftohtml markup
|
||||
PDFTOHTML = [
|
||||
# Fix umlauts
|
||||
(re.compile(r'¨\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: 'ä'),
|
||||
(re.compile(r'¨\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: 'Ä'),
|
||||
(re.compile(r'¨\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: 'ë'),
|
||||
(re.compile(r'¨\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: 'Ë'),
|
||||
(re.compile(r'¨\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: 'ï'),
|
||||
(re.compile(r'¨\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: 'Ï'),
|
||||
(re.compile(r'¨\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: 'ö'),
|
||||
(re.compile(r'¨\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: 'Ö'),
|
||||
(re.compile(r'¨\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: 'ü'),
|
||||
(re.compile(r'¨\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: 'Ü'),
|
||||
accent_regex({
|
||||
'¨': 'aAeEiIoOuU:äÄëËïÏöÖüÜ',
|
||||
'`': 'aAeEiIoOuU:àÀèÈìÌòÒùÙ',
|
||||
'´': 'aAcCeEiIlLoOnNrRsSuUzZ:áÁćĆéÉíÍĺĹóÓńŃŕŔśŚúÚźŹ',
|
||||
'ˆ': 'aAeEiIoOuU:âÂêÊîÎôÔûÛ',
|
||||
'¸': 'cC:çÇ',
|
||||
'˛': 'aAeE:ąĄęĘ',
|
||||
'˙': 'zZ:żŻ',
|
||||
'ˇ': 'cCdDeElLnNrRsStTzZ:čČďĎěĚľĽňŇřŘšŠťŤžŽ',
|
||||
'°': 'uU:ůŮ',
|
||||
}),
|
||||
|
||||
# Fix accents
|
||||
# `
|
||||
(re.compile(r'`\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: 'à'),
|
||||
(re.compile(r'`\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: 'À'),
|
||||
(re.compile(r'`\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: 'è'),
|
||||
(re.compile(r'`\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: 'È'),
|
||||
(re.compile(r'`\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: 'ì'),
|
||||
(re.compile(r'`\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: 'Ì'),
|
||||
(re.compile(r'`\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: 'ò'),
|
||||
(re.compile(r'`\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: 'Ò'),
|
||||
(re.compile(r'`\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: 'ù'),
|
||||
(re.compile(r'`\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: 'Ù'),
|
||||
accent_regex({'`': 'aAeEiIoOuU:àÀèÈìÌòÒùÙ'}, letter_before=True),
|
||||
|
||||
# ` with letter before
|
||||
(re.compile(r'a\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'à'),
|
||||
(re.compile(r'A\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'À'),
|
||||
(re.compile(r'e\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'è'),
|
||||
(re.compile(r'E\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'È'),
|
||||
(re.compile(r'i\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'ì'),
|
||||
(re.compile(r'I\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'Ì'),
|
||||
(re.compile(r'o\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'ò'),
|
||||
(re.compile(r'O\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'Ò'),
|
||||
(re.compile(r'u\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'ù'),
|
||||
(re.compile(r'U\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'Ù'),
|
||||
# If pdf printed from a browser then the header/footer has a reliable pattern
|
||||
(re.compile(r'((?<=</a>)\s*file:/{2,4}[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),
|
||||
|
||||
# ´
|
||||
(re.compile(r'´\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: 'á'),
|
||||
(re.compile(r'´\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: 'Á'),
|
||||
(re.compile(r'´\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: 'ć'),
|
||||
(re.compile(r'´\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: 'Ć'),
|
||||
(re.compile(r'´\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: 'é'),
|
||||
(re.compile(r'´\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: 'É'),
|
||||
(re.compile(r'´\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: 'í'),
|
||||
(re.compile(r'´\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: 'Í'),
|
||||
(re.compile(r'´\s*(<br.*?>)*\s*l', re.UNICODE), lambda match: 'ĺ'),
|
||||
(re.compile(r'´\s*(<br.*?>)*\s*L', re.UNICODE), lambda match: 'Ĺ'),
|
||||
(re.compile(r'´\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: 'ó'),
|
||||
(re.compile(r'´\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: 'Ó'),
|
||||
(re.compile(r'´\s*(<br.*?>)*\s*n', re.UNICODE), lambda match: 'ń'),
|
||||
(re.compile(r'´\s*(<br.*?>)*\s*N', re.UNICODE), lambda match: 'Ń'),
|
||||
(re.compile(r'´\s*(<br.*?>)*\s*r', re.UNICODE), lambda match: 'ŕ'),
|
||||
(re.compile(r'´\s*(<br.*?>)*\s*R', re.UNICODE), lambda match: 'Ŕ'),
|
||||
(re.compile(r'´\s*(<br.*?>)*\s*s', re.UNICODE), lambda match: 'ś'),
|
||||
(re.compile(r'´\s*(<br.*?>)*\s*S', re.UNICODE), lambda match: 'Ś'),
|
||||
(re.compile(r'´\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: 'ú'),
|
||||
(re.compile(r'´\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: 'Ú'),
|
||||
(re.compile(r'´\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: 'ź'),
|
||||
(re.compile(r'´\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: 'Ź'),
|
||||
# Center separator lines
|
||||
(re.compile(r'<br>\s*(?P<break>([*#•✦=] *){3,})\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group('break') + '</p>'),
|
||||
|
||||
# ˆ
|
||||
(re.compile(r'ˆ\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: 'â'),
|
||||
(re.compile(r'ˆ\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: 'Â'),
|
||||
(re.compile(r'ˆ\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: 'ê'),
|
||||
(re.compile(r'ˆ\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: 'Ê'),
|
||||
(re.compile(r'ˆ\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: 'î'),
|
||||
(re.compile(r'ˆ\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: 'Î'),
|
||||
(re.compile(r'ˆ\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: 'ô'),
|
||||
(re.compile(r'ˆ\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: 'Ô'),
|
||||
(re.compile(r'ˆ\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: 'û'),
|
||||
(re.compile(r'ˆ\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: 'Û'),
|
||||
# Remove <hr> tags
|
||||
(re.compile(r'<hr.*?>', re.IGNORECASE), ''),
|
||||
|
||||
# ¸
|
||||
(re.compile(r'¸\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: 'ç'),
|
||||
(re.compile(r'¸\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: 'Ç'),
|
||||
# Remove gray background
|
||||
(re.compile(r'<BODY[^<>]+>'), '<BODY>'),
|
||||
|
||||
# ˛
|
||||
(re.compile(r'\s*˛\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: 'ą'),
|
||||
(re.compile(r'\s*˛\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: 'Ą'),
|
||||
(re.compile(r'˛\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: 'ę'),
|
||||
(re.compile(r'˛\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: 'Ę'),
|
||||
# Convert line breaks to paragraphs
|
||||
(re.compile(r'<br[^>]*>\s*'), '</p>\n<p>'),
|
||||
(re.compile(r'<body[^>]*>\s*'), '<body>\n<p>'),
|
||||
(re.compile(r'\s*</body>'), '</p>\n</body>'),
|
||||
|
||||
# ˙
|
||||
(re.compile(r'˙\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: 'ż'),
|
||||
(re.compile(r'˙\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: 'Ż'),
|
||||
|
||||
# ˇ
|
||||
(re.compile(r'ˇ\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: 'č'),
|
||||
(re.compile(r'ˇ\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: 'Č'),
|
||||
(re.compile(r'ˇ\s*(<br.*?>)*\s*d', re.UNICODE), lambda match: 'ď'),
|
||||
(re.compile(r'ˇ\s*(<br.*?>)*\s*D', re.UNICODE), lambda match: 'Ď'),
|
||||
(re.compile(r'ˇ\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: 'ě'),
|
||||
(re.compile(r'ˇ\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: 'Ě'),
|
||||
(re.compile(r'ˇ\s*(<br.*?>)*\s*l', re.UNICODE), lambda match: 'ľ'),
|
||||
(re.compile(r'ˇ\s*(<br.*?>)*\s*L', re.UNICODE), lambda match: 'Ľ'),
|
||||
(re.compile(r'ˇ\s*(<br.*?>)*\s*n', re.UNICODE), lambda match: 'ň'),
|
||||
(re.compile(r'ˇ\s*(<br.*?>)*\s*N', re.UNICODE), lambda match: 'Ň'),
|
||||
(re.compile(r'ˇ\s*(<br.*?>)*\s*r', re.UNICODE), lambda match: 'ř'),
|
||||
(re.compile(r'ˇ\s*(<br.*?>)*\s*R', re.UNICODE), lambda match: 'Ř'),
|
||||
(re.compile(r'ˇ\s*(<br.*?>)*\s*s', re.UNICODE), lambda match: 'š'),
|
||||
(re.compile(r'ˇ\s*(<br.*?>)*\s*S', re.UNICODE), lambda match: 'Š'),
|
||||
(re.compile(r'ˇ\s*(<br.*?>)*\s*t', re.UNICODE), lambda match: 'ť'),
|
||||
(re.compile(r'ˇ\s*(<br.*?>)*\s*T', re.UNICODE), lambda match: 'Ť'),
|
||||
(re.compile(r'ˇ\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: 'ž'),
|
||||
(re.compile(r'ˇ\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: 'Ž'),
|
||||
|
||||
# °
|
||||
(re.compile(r'°\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: 'ů'),
|
||||
(re.compile(r'°\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: 'Ů'),
|
||||
|
||||
# If pdf printed from a browser then the header/footer has a reliable pattern
|
||||
(re.compile(r'((?<=</a>)\s*file:/{2,4}[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),
|
||||
|
||||
# Center separator lines
|
||||
(re.compile(r'<br>\s*(?P<break>([*#•✦=] *){3,})\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group('break') + '</p>'),
|
||||
|
||||
# Remove <hr> tags
|
||||
(re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: ''),
|
||||
|
||||
# Remove gray background
|
||||
(re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),
|
||||
|
||||
# Convert line breaks to paragraphs
|
||||
(re.compile(r'<br[^>]*>\s*'), lambda match : '</p>\n<p>'),
|
||||
(re.compile(r'<body[^>]*>\s*'), lambda match : '<body>\n<p>'),
|
||||
(re.compile(r'\s*</body>'), lambda match : '</p>\n</body>'),
|
||||
|
||||
# Clean up spaces
|
||||
(re.compile(r'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
|
||||
# Add space before and after italics
|
||||
(re.compile(r'(?<!“)<i>'), lambda match: ' <i>'),
|
||||
(re.compile(r'</i>(?=\w)'), lambda match: '</i> '),
|
||||
]
|
||||
# Clean up spaces
|
||||
(re.compile(r'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), ' '),
|
||||
# Add space before and after italics
|
||||
(re.compile(r'(?<!“)<i>'), ' <i>'),
|
||||
(re.compile(r'</i>(?=\w)'), '</i> '),
|
||||
]
|
||||
|
||||
# Fix Book Designer markup
|
||||
BOOK_DESIGNER = [
|
||||
# HR
|
||||
(re.compile('<hr>', re.IGNORECASE),
|
||||
lambda match : '<span style="page-break-after:always"> </span>'),
|
||||
# Create header tags
|
||||
(re.compile(r'<h2[^><]*?id=BookTitle[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE),
|
||||
lambda match : '<h1 id="BookTitle" align="%s">%s</h1>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
|
||||
(re.compile(r'<h2[^><]*?id=BookAuthor[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE),
|
||||
lambda match : '<h2 id="BookAuthor" align="%s">%s</h2>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
|
||||
(re.compile('<span[^><]*?id=title[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
|
||||
lambda match : '<h2 class="title">%s</h2>'%(match.group(1),)),
|
||||
(re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
|
||||
lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
|
||||
]
|
||||
# HR
|
||||
(re.compile('<hr>', re.IGNORECASE),
|
||||
lambda match : '<span style="page-break-after:always"> </span>'),
|
||||
# Create header tags
|
||||
(re.compile(r'<h2[^><]*?id=BookTitle[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE),
|
||||
lambda match : '<h1 id="BookTitle" align="%s">%s</h1>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
|
||||
(re.compile(r'<h2[^><]*?id=BookAuthor[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE),
|
||||
lambda match : '<h2 id="BookAuthor" align="%s">%s</h2>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
|
||||
(re.compile('<span[^><]*?id=title[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
|
||||
lambda match : '<h2 class="title">%s</h2>'%(match.group(1),)),
|
||||
(re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
|
||||
lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
|
||||
]
|
||||
|
||||
def __init__(self, log=None, extra_opts=None, regex_wizard_callback=None):
|
||||
self.log = log
|
||||
@ -636,9 +567,7 @@ class HTMLPreProcessor(object):
|
||||
|
||||
for rule in rules + end_rules:
|
||||
try:
|
||||
print(rule[0].pattern)
|
||||
html = rule[0].sub(rule[1], html)
|
||||
print(222222222222)
|
||||
except Exception as e:
|
||||
if rule in user_sr_rules:
|
||||
self.log.error(
|
||||
|
Loading…
x
Reference in New Issue
Block a user