From f7ec532d578f874bf915c5f2bbd5077e667c074c Mon Sep 17 00:00:00 2001
From: John Schember
Date: Thu, 9 Apr 2009 17:31:07 -0400
Subject: [PATCH] pdftohtml processing: better line re-wrapping

---
 src/calibre/ebooks/conversion/preprocess.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index bb8ee90364..3fbbb47d13 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -48,6 +48,8 @@ class HTMLPreProcessor(object):
 
     # Fix pdftohtml markup
     PDFTOHTML = [
+                  # Remove page links
+                  (re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''),
                   # Remove <hr> tags
                   (re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<br />'),
                   # Remove page numbers
@@ -69,6 +71,12 @@ class HTMLPreProcessor(object):
 
                   # Have paragraphs show better
                   (re.compile(r'<br.*?>'), lambda match : '<p>'),
+
+                  # Re wrap lines
+                  (re.compile(r'(?<=\w)\s*</(i|b|u)>\s*<p.*?>\s*<(i|b|u)>\s*(?=\w)'), lambda match: ' '),
+                  (re.compile(r'(?<=\w)\s*<p.*?>\s*(?=\w)', re.UNICODE), lambda match: ' '),
+                  # Clean up spaces
+                  (re.compile(ru'(?<=\.|,|:|;|\?|!|”|"|\')[\s^ ]*(?=<)'), lambda match: ' '),
                  ]
 
     # Fix Book Designer markup