mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Make the regexps used for pre-processing pdftohtml output more efficient
This commit is contained in:
parent
2014e6520e
commit
e5dc6b4002
@ -344,6 +344,34 @@ class CSSPreProcessor(object):
|
|||||||
return '\n'.join(ans)
|
return '\n'.join(ans)
|
||||||
|
|
||||||
|
|
||||||
|
def accent_regex(accent_maps, letter_before=False):
    """Build one regex plus substitution callback that merges a detached
    accent character with its neighbouring letter.

    :param accent_maps: dict mapping an accent character to a string of the
        form ``'<letters>:<accented letters>'``. Both halves must have the
        same length; the i-th letter maps to the i-th accented letter.
        The dict is only read, never modified.
    :param letter_before: when True the pattern matches ``letter ... accent``
        instead of the default ``accent ... letter``.
    :return: a ``(pattern, sub)`` tuple suitable for
        ``pattern.sub(sub, text)``. Accent/letter pairs that have no entry in
        the mapping are left untouched.
    :raises ValueError: if the two halves of a mapping differ in length.
    """
    # Parse into a private table so the caller's dict is not mutated; this
    # keeps the function safe to call more than once with the same mapping.
    parsed = {}
    letters = set()
    for accent, spec in accent_maps.items():
        k, v = spec.split(':', 1)
        if len(k) != len(v):
            raise ValueError('Invalid mapping for: {} -> {}'.format(k, v))
        lmap = dict(zip(k, v))
        parsed[accent] = lmap
        letters.update(lmap)

    def char_class(chars):
        # Escape every character so that '^', ']', '-' and '\\' cannot change
        # the meaning of the character class; sort so that the generated
        # pattern is deterministic.
        return ''.join(re.escape(c) for c in sorted(chars))

    accent_chars = char_class(parsed)
    letter_chars = char_class(letters)

    if letter_before:
        args = letter_chars, accent_chars
        accent_group, letter_group = 2, 1
    else:
        args = accent_chars, letter_chars
        accent_group, letter_group = 1, 2

    # Optional whitespace and at most one <br> tag may separate the pair.
    pat = re.compile(r'([{}])\s*(?:<br[^>]*>)?\s*([{}])'.format(*args), re.UNICODE)

    def sub(m):
        lmap = parsed[m.group(accent_group)]
        # The combined pattern can match an accent next to a letter that only
        # some other accent supports; return the text unchanged in that case.
        return lmap.get(m.group(letter_group)) or m.group()

    return pat, sub
|
||||||
|
|
||||||
|
|
||||||
class HTMLPreProcessor(object):
|
class HTMLPreProcessor(object):
|
||||||
|
|
||||||
PREPROCESS = [
|
PREPROCESS = [
|
||||||
@ -363,116 +391,19 @@ class HTMLPreProcessor(object):
|
|||||||
|
|
||||||
# Fix pdftohtml markup
|
# Fix pdftohtml markup
|
||||||
PDFTOHTML = [
|
PDFTOHTML = [
|
||||||
# Fix umlauts
|
accent_regex({
|
||||||
(re.compile(r'¨\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: 'ä'),
|
'¨': 'aAeEiIoOuU:äÄëËïÏöÖüÜ',
|
||||||
(re.compile(r'¨\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: 'Ä'),
|
'`': 'aAeEiIoOuU:àÀèÈìÌòÒùÙ',
|
||||||
(re.compile(r'¨\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: 'ë'),
|
'´': 'aAcCeEiIlLoOnNrRsSuUzZ:áÁćĆéÉíÍĺĹóÓńŃŕŔśŚúÚźŹ',
|
||||||
(re.compile(r'¨\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: 'Ë'),
|
'ˆ': 'aAeEiIoOuU:âÂêÊîÎôÔûÛ',
|
||||||
(re.compile(r'¨\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: 'ï'),
|
'¸': 'cC:çÇ',
|
||||||
(re.compile(r'¨\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: 'Ï'),
|
'˛': 'aAeE:ąĄęĘ',
|
||||||
(re.compile(r'¨\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: 'ö'),
|
'˙': 'zZ:żŻ',
|
||||||
(re.compile(r'¨\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: 'Ö'),
|
'ˇ': 'cCdDeElLnNrRsStTzZ:čČďĎěĚľĽňŇřŘšŠťŤžŽ',
|
||||||
(re.compile(r'¨\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: 'ü'),
|
'°': 'uU:ůŮ',
|
||||||
(re.compile(r'¨\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: 'Ü'),
|
}),
|
||||||
|
|
||||||
# Fix accents
|
accent_regex({'`': 'aAeEiIoOuU:àÀèÈìÌòÒùÙ'}, letter_before=True),
|
||||||
# `
|
|
||||||
(re.compile(r'`\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: 'à'),
|
|
||||||
(re.compile(r'`\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: 'À'),
|
|
||||||
(re.compile(r'`\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: 'è'),
|
|
||||||
(re.compile(r'`\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: 'È'),
|
|
||||||
(re.compile(r'`\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: 'ì'),
|
|
||||||
(re.compile(r'`\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: 'Ì'),
|
|
||||||
(re.compile(r'`\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: 'ò'),
|
|
||||||
(re.compile(r'`\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: 'Ò'),
|
|
||||||
(re.compile(r'`\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: 'ù'),
|
|
||||||
(re.compile(r'`\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: 'Ù'),
|
|
||||||
|
|
||||||
# ` with letter before
|
|
||||||
(re.compile(r'a\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'à'),
|
|
||||||
(re.compile(r'A\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'À'),
|
|
||||||
(re.compile(r'e\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'è'),
|
|
||||||
(re.compile(r'E\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'È'),
|
|
||||||
(re.compile(r'i\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'ì'),
|
|
||||||
(re.compile(r'I\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'Ì'),
|
|
||||||
(re.compile(r'o\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'ò'),
|
|
||||||
(re.compile(r'O\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'Ò'),
|
|
||||||
(re.compile(r'u\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'ù'),
|
|
||||||
(re.compile(r'U\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'Ù'),
|
|
||||||
|
|
||||||
# ´
|
|
||||||
(re.compile(r'´\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: 'á'),
|
|
||||||
(re.compile(r'´\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: 'Á'),
|
|
||||||
(re.compile(r'´\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: 'ć'),
|
|
||||||
(re.compile(r'´\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: 'Ć'),
|
|
||||||
(re.compile(r'´\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: 'é'),
|
|
||||||
(re.compile(r'´\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: 'É'),
|
|
||||||
(re.compile(r'´\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: 'í'),
|
|
||||||
(re.compile(r'´\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: 'Í'),
|
|
||||||
(re.compile(r'´\s*(<br.*?>)*\s*l', re.UNICODE), lambda match: 'ĺ'),
|
|
||||||
(re.compile(r'´\s*(<br.*?>)*\s*L', re.UNICODE), lambda match: 'Ĺ'),
|
|
||||||
(re.compile(r'´\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: 'ó'),
|
|
||||||
(re.compile(r'´\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: 'Ó'),
|
|
||||||
(re.compile(r'´\s*(<br.*?>)*\s*n', re.UNICODE), lambda match: 'ń'),
|
|
||||||
(re.compile(r'´\s*(<br.*?>)*\s*N', re.UNICODE), lambda match: 'Ń'),
|
|
||||||
(re.compile(r'´\s*(<br.*?>)*\s*r', re.UNICODE), lambda match: 'ŕ'),
|
|
||||||
(re.compile(r'´\s*(<br.*?>)*\s*R', re.UNICODE), lambda match: 'Ŕ'),
|
|
||||||
(re.compile(r'´\s*(<br.*?>)*\s*s', re.UNICODE), lambda match: 'ś'),
|
|
||||||
(re.compile(r'´\s*(<br.*?>)*\s*S', re.UNICODE), lambda match: 'Ś'),
|
|
||||||
(re.compile(r'´\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: 'ú'),
|
|
||||||
(re.compile(r'´\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: 'Ú'),
|
|
||||||
(re.compile(r'´\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: 'ź'),
|
|
||||||
(re.compile(r'´\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: 'Ź'),
|
|
||||||
|
|
||||||
# ˆ
|
|
||||||
(re.compile(r'ˆ\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: 'â'),
|
|
||||||
(re.compile(r'ˆ\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: 'Â'),
|
|
||||||
(re.compile(r'ˆ\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: 'ê'),
|
|
||||||
(re.compile(r'ˆ\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: 'Ê'),
|
|
||||||
(re.compile(r'ˆ\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: 'î'),
|
|
||||||
(re.compile(r'ˆ\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: 'Î'),
|
|
||||||
(re.compile(r'ˆ\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: 'ô'),
|
|
||||||
(re.compile(r'ˆ\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: 'Ô'),
|
|
||||||
(re.compile(r'ˆ\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: 'û'),
|
|
||||||
(re.compile(r'ˆ\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: 'Û'),
|
|
||||||
|
|
||||||
# ¸
|
|
||||||
(re.compile(r'¸\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: 'ç'),
|
|
||||||
(re.compile(r'¸\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: 'Ç'),
|
|
||||||
|
|
||||||
# ˛
|
|
||||||
(re.compile(r'\s*˛\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: 'ą'),
|
|
||||||
(re.compile(r'\s*˛\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: 'Ą'),
|
|
||||||
(re.compile(r'˛\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: 'ę'),
|
|
||||||
(re.compile(r'˛\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: 'Ę'),
|
|
||||||
|
|
||||||
# ˙
|
|
||||||
(re.compile(r'˙\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: 'ż'),
|
|
||||||
(re.compile(r'˙\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: 'Ż'),
|
|
||||||
|
|
||||||
# ˇ
|
|
||||||
(re.compile(r'ˇ\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: 'č'),
|
|
||||||
(re.compile(r'ˇ\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: 'Č'),
|
|
||||||
(re.compile(r'ˇ\s*(<br.*?>)*\s*d', re.UNICODE), lambda match: 'ď'),
|
|
||||||
(re.compile(r'ˇ\s*(<br.*?>)*\s*D', re.UNICODE), lambda match: 'Ď'),
|
|
||||||
(re.compile(r'ˇ\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: 'ě'),
|
|
||||||
(re.compile(r'ˇ\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: 'Ě'),
|
|
||||||
(re.compile(r'ˇ\s*(<br.*?>)*\s*l', re.UNICODE), lambda match: 'ľ'),
|
|
||||||
(re.compile(r'ˇ\s*(<br.*?>)*\s*L', re.UNICODE), lambda match: 'Ľ'),
|
|
||||||
(re.compile(r'ˇ\s*(<br.*?>)*\s*n', re.UNICODE), lambda match: 'ň'),
|
|
||||||
(re.compile(r'ˇ\s*(<br.*?>)*\s*N', re.UNICODE), lambda match: 'Ň'),
|
|
||||||
(re.compile(r'ˇ\s*(<br.*?>)*\s*r', re.UNICODE), lambda match: 'ř'),
|
|
||||||
(re.compile(r'ˇ\s*(<br.*?>)*\s*R', re.UNICODE), lambda match: 'Ř'),
|
|
||||||
(re.compile(r'ˇ\s*(<br.*?>)*\s*s', re.UNICODE), lambda match: 'š'),
|
|
||||||
(re.compile(r'ˇ\s*(<br.*?>)*\s*S', re.UNICODE), lambda match: 'Š'),
|
|
||||||
(re.compile(r'ˇ\s*(<br.*?>)*\s*t', re.UNICODE), lambda match: 'ť'),
|
|
||||||
(re.compile(r'ˇ\s*(<br.*?>)*\s*T', re.UNICODE), lambda match: 'Ť'),
|
|
||||||
(re.compile(r'ˇ\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: 'ž'),
|
|
||||||
(re.compile(r'ˇ\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: 'Ž'),
|
|
||||||
|
|
||||||
# °
|
|
||||||
(re.compile(r'°\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: 'ů'),
|
|
||||||
(re.compile(r'°\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: 'Ů'),
|
|
||||||
|
|
||||||
# If pdf printed from a browser then the header/footer has a reliable pattern
|
# If pdf printed from a browser then the header/footer has a reliable pattern
|
||||||
(re.compile(r'((?<=</a>)\s*file:/{2,4}[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),
|
(re.compile(r'((?<=</a>)\s*file:/{2,4}[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),
|
||||||
@ -481,21 +412,21 @@ class HTMLPreProcessor(object):
|
|||||||
(re.compile(r'<br>\s*(?P<break>([*#•✦=] *){3,})\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group('break') + '</p>'),
|
(re.compile(r'<br>\s*(?P<break>([*#•✦=] *){3,})\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group('break') + '</p>'),
|
||||||
|
|
||||||
# Remove <hr> tags
|
# Remove <hr> tags
|
||||||
(re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: ''),
|
(re.compile(r'<hr.*?>', re.IGNORECASE), ''),
|
||||||
|
|
||||||
# Remove gray background
|
# Remove gray background
|
||||||
(re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),
|
(re.compile(r'<BODY[^<>]+>'), '<BODY>'),
|
||||||
|
|
||||||
# Convert line breaks to paragraphs
|
# Convert line breaks to paragraphs
|
||||||
(re.compile(r'<br[^>]*>\s*'), lambda match : '</p>\n<p>'),
|
(re.compile(r'<br[^>]*>\s*'), '</p>\n<p>'),
|
||||||
(re.compile(r'<body[^>]*>\s*'), lambda match : '<body>\n<p>'),
|
(re.compile(r'<body[^>]*>\s*'), '<body>\n<p>'),
|
||||||
(re.compile(r'\s*</body>'), lambda match : '</p>\n</body>'),
|
(re.compile(r'\s*</body>'), '</p>\n</body>'),
|
||||||
|
|
||||||
# Clean up spaces
|
# Clean up spaces
|
||||||
(re.compile(r'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
|
(re.compile(r'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), ' '),
|
||||||
# Add space before and after italics
|
# Add space before and after italics
|
||||||
(re.compile(r'(?<!“)<i>'), lambda match: ' <i>'),
|
(re.compile(r'(?<!“)<i>'), ' <i>'),
|
||||||
(re.compile(r'</i>(?=\w)'), lambda match: '</i> '),
|
(re.compile(r'</i>(?=\w)'), '</i> '),
|
||||||
]
|
]
|
||||||
|
|
||||||
# Fix Book Designer markup
|
# Fix Book Designer markup
|
||||||
@ -636,9 +567,7 @@ class HTMLPreProcessor(object):
|
|||||||
|
|
||||||
for rule in rules + end_rules:
|
for rule in rules + end_rules:
|
||||||
try:
|
try:
|
||||||
print(rule[0].pattern)
|
|
||||||
html = rule[0].sub(rule[1], html)
|
html = rule[0].sub(rule[1], html)
|
||||||
print(222222222222)
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
if rule in user_sr_rules:
|
if rule in user_sr_rules:
|
||||||
self.log.error(
|
self.log.error(
|
||||||
|
Loading…
x
Reference in New Issue
Block a user