diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index bb8ee90364..3fbbb47d13 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -48,6 +48,8 @@ class HTMLPreProcessor(object): # Fix pdftohtml markup PDFTOHTML = [ + # Remove page links + (re.compile(r'', re.IGNORECASE), lambda match: ''), # Remove
tags (re.compile(r'', re.IGNORECASE), lambda match: '
'), # Remove page numbers @@ -69,6 +71,12 @@ class HTMLPreProcessor(object): # Have paragraphs show better (re.compile(r''), lambda match : '

'), + + # Re wrap lines + (re.compile(r'(?<=\w)\s*\s*\s*\s*(?=\w)'), lambda match: ' '), + (re.compile(r'(?<=\w)\s*\s*(?=\w)', re.UNICODE), lambda match: ' '), + # Clean up spaces + (re.compile(ur'(?<=\.|,|:|;|\?|!|”|"|\')[\s^ ]*(?=<)'), lambda match: ' '), ] # Fix Book Designer markup