mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
pdftohtml processing: better line re-wrapping
This commit is contained in:
parent
2c3e8cccb8
commit
f7ec532d57
@ -48,6 +48,8 @@ class HTMLPreProcessor(object):
|
||||
|
||||
# Fix pdftohtml markup
|
||||
PDFTOHTML = [
|
||||
# Remove page links
|
||||
(re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''),
|
||||
# Replace <hr> tags with line breaks
|
||||
(re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<br />'),
|
||||
# Remove page numbers
|
||||
@ -69,6 +71,12 @@ class HTMLPreProcessor(object):
|
||||
|
||||
# Convert <br> tags to <p> so paragraphs display better
|
||||
(re.compile(r'<br.*?>'), lambda match : '<p>'),
|
||||
|
||||
# Re-wrap lines: join words separated only by a <p> tag with a single space
|
||||
(re.compile(r'(?<=\w)\s*</i>\s*<p.*?>\s*<i>\s*(?=\w)'), lambda match: ' '),
|
||||
(re.compile(r'(?<=\w)\s*<p.*?>\s*(?=\w)', re.UNICODE), lambda match: ' '),
|
||||
# Clean up spaces
|
||||
(re.compile(ru'(?<=\.|,|:|;|\?|!|”|"|\')[\s^ ]*(?=<)'), lambda match: ' '),
|
||||
]
|
||||
|
||||
# Fix Book Designer markup
|
||||
|
Loading…
x
Reference in New Issue
Block a user