From ab9c4b39f0d1aa9dc041c9ceefbba3412bcd56b7 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 14 Jun 2009 20:33:55 -0400 Subject: [PATCH] PDF preprocessing rule additions. --- src/calibre/ebooks/conversion/preprocess.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 2dc404e586..1cbec251e3 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -130,7 +130,11 @@ class HTMLPreProcessor(object): # Have paragraphs show better (re.compile(r''), lambda match : '

'), # Clean up spaces - (re.compile(u'(?<=[\.,:;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '), + (re.compile(u'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '), + # Connect paragraphs split by - + (re.compile(u'(?<=[^\s][-–])[\s]*(

)*[\s]*(

)*\s*(?=[^\s])'), lambda match: ''), + # Remove - that splits words + (re.compile(u'(?<=[^\s])[-–]+(?=[^\s])'), lambda match: ''), # Add space before and after italics (re.compile(u'(?'), lambda match: ' '), (re.compile(r'(?=\w)'), lambda match: ' '),