diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index da652c1a38..973e508746 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -5,8 +5,7 @@ __license__   = 'GPL v3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
-import functools
-import re
+import functools, re
 
 from calibre import entity_to_unicode
 
@@ -73,7 +72,7 @@ def line_length(format, raw, percent):
     '''
     raw = raw.replace('&nbsp;', ' ')
     if format == 'html':
-    	linere = re.compile('(?<=<p).*?(?=</p>)', re.DOTALL)
+        linere = re.compile('(?<=<p).*?(?=</p>)', re.DOTALL)
     elif format == 'pdf':
         linere = re.compile('(?<=<br>).*?(?=<br>)', re.DOTALL)
     lines = linere.findall(raw)
@@ -205,9 +204,6 @@ class HTMLPreProcessor(object):
                   # Remove gray background
                   (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),
 
-                  # Remove non breaking spaces
-                  (re.compile(ur'\u00a0'), lambda match : ' '),
-
                   # Detect Chapters to match default XPATH in GUI
                   (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>(<(i|b)>(<(i|b)>)?)?(.?Chapter|Epilogue|Prologue|Book|Part)\s*([\d\w-]+(\s\w+)?)?(</(i|b)>(</(i|b)>)?)?)</?(br|p)[^>]*>\s*(?P<title>(<(i|b)>)?\s*\w+(\s*\w+)?\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?', re.IGNORECASE), chap_head),
                   (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>([A-Z \'"!]{5,})\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head),
@@ -254,20 +250,27 @@ class HTMLPreProcessor(object):
     def is_pdftohtml(self, src):
         return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
 
-    def __call__(self, html, remove_special_chars=None):
+    def __call__(self, html, remove_special_chars=None,
+            get_preprocess_html=False):
         if remove_special_chars is not None:
             html = remove_special_chars.sub('', html)
         html = html.replace('\0', '')
+        is_pdftohtml = self.is_pdftohtml(html)
         if self.is_baen(html):
             rules = []
         elif self.is_book_designer(html):
             rules = self.BOOK_DESIGNER
-        elif self.is_pdftohtml(html):
+        elif is_pdftohtml:
             rules = self.PDFTOHTML
         else:
             rules = []
 
-        if not self.extra_opts.keep_ligatures:
+        start_rules = []
+        if is_pdftohtml:
+            # Replace non-breaking spaces (U+00A0) with ordinary spaces
+            start_rules.append((re.compile(ur'\u00a0'), lambda match : ' '))
+
+        if not getattr(self.extra_opts, 'keep_ligatures', False):
             html = _ligpat.sub(lambda m:LIGATURES[m.group()], html)
 
         end_rules = []
@@ -299,9 +302,35 @@ class HTMLPreProcessor(object):
                     (re.compile(r'(?<=.{%i}[a-z\.,;:)\-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines),
                 )
 
-        for rule in self.PREPROCESS + rules + end_rules:
+        for rule in self.PREPROCESS + start_rules:
             html = rule[0].sub(rule[1], html)
 
+        if get_preprocess_html:
+            return html
+
+        def dump(raw, where):
+            import os
+            dp = getattr(self.extra_opts, 'debug_pipeline', None)
+            if dp and os.path.exists(dp):
+                odir = os.path.join(dp, 'input')
+                if os.path.exists(odir):
+                    odir = os.path.join(odir, where)
+                    if not os.path.exists(odir):
+                        os.makedirs(odir)
+                    name, i = None, 0
+                    while not name or os.path.exists(os.path.join(odir, name)):
+                        i += 1
+                        name = '%04d.html'%i
+                    with open(os.path.join(odir, name), 'wb') as f:
+                        f.write(raw.encode('utf-8'))
+
+        #dump(html, 'pre-preprocess')
+
+        for rule in rules + end_rules:
+            html = rule[0].sub(rule[1], html)
+
+        #dump(html, 'post-preprocess')
+
         # Handle broken XHTML w/ SVG (ugh)
         if 'svg:' in html and SVG_NS not in html:
             html = html.replace(
diff --git a/src/calibre/gui2/convert/regex_builder.py b/src/calibre/gui2/convert/regex_builder.py
index b10772b86c..c5c8d84a88 100644
--- a/src/calibre/gui2/convert/regex_builder.py
+++ b/src/calibre/gui2/convert/regex_builder.py
@@ -14,7 +14,7 @@ from calibre.gui2.convert.regex_builder_ui import Ui_RegexBuilder
 from calibre.gui2.convert.xexp_edit_ui import Ui_Form as Ui_Edit
 from calibre.gui2 import error_dialog, choose_files
 from calibre.ebooks.oeb.iterator import EbookIterator
-from calibre.ebooks.conversion.preprocess import convert_entities
+from calibre.ebooks.conversion.preprocess import HTMLPreProcessor
 from calibre.gui2.dialogs.choose_format import ChooseFormatDialog
 
 class RegexBuilder(QDialog, Ui_RegexBuilder):
@@ -91,10 +91,10 @@ class RegexBuilder(QDialog, Ui_RegexBuilder):
         self.iterator = EbookIterator(pathtoebook)
         self.iterator.__enter__(only_input_plugin=True)
         text = [u'']
-        ent_pat = re.compile(r'&(\S+?);')
+        preprocessor = HTMLPreProcessor(None, False)
         for path in self.iterator.spine:
             html = open(path, 'rb').read().decode('utf-8', 'replace')
-            html = ent_pat.sub(convert_entities, html)
+            html = preprocessor(html, get_preprocess_html=True)
             text.append(html)
         self.preview.setPlainText('\n---\n'.join(text))