From ab9c4b39f0d1aa9dc041c9ceefbba3412bcd56b7 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sun, 14 Jun 2009 20:33:55 -0400
Subject: [PATCH] PDF preprocessing rule additions.

---
 src/calibre/ebooks/conversion/preprocess.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 2dc404e586..1cbec251e3 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -130,7 +130,11 @@ class HTMLPreProcessor(object):
                   # Have paragraphs show better
                   (re.compile(r'<br.*?>'), lambda match : '<p>'),
                   # Clean up spaces
-                  (re.compile(u'(?<=[\.,:;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
+                  (re.compile(u'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
+                  # Join text across whitespace/</p>/<p> that follows a trailing - or –
+                  (re.compile(u'(?<=[^\s][-–])[\s]*(</p>)*[\s]*(<p>)*\s*(?=[^\s])'), lambda match: ''),
+                  # Remove any run of - or – between two non-space chars (also hits real hyphenated words)
+                  (re.compile(u'(?<=[^\s])[-–]+(?=[^\s])'), lambda match: ''),
                   # Add space before and after italics
                   (re.compile(u'(?<!“)<i>'), lambda match: ' <i>'),
                   (re.compile(r'</i>(?=\w)'), lambda match: '</i> '),