From aafc038b177a5555b890cd7380aeff72da26e082 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 5 Feb 2019 13:21:58 +0530
Subject: [PATCH] Move replacement of nbsp in pdftohtml output from pipeline to
 input plugin. Makes viewing PDF in the viewer a bit better. See #1814626
 (Text goes over to other page)

---
 src/calibre/ebooks/conversion/preprocess.py | 3 ---
 src/calibre/ebooks/pdf/pdftohtml.py         | 1 +
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 58a7dc6c25..d92cc2aa1f 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -531,9 +531,6 @@ class HTMLPreProcessor(object):
             rules = []
 
         start_rules = []
-        if is_pdftohtml:
-            # Remove non breaking spaces
-            start_rules.append((re.compile(unicode(r'\u00a0')), lambda match : ' '))
 
         if not getattr(self.extra_opts, 'keep_ligatures', False):
             html = _ligpat.sub(lambda m:LIGATURES[m.group()], html)
diff --git a/src/calibre/ebooks/pdf/pdftohtml.py b/src/calibre/ebooks/pdf/pdftohtml.py
index 44da4e8095..1cbf66daef 100644
--- a/src/calibre/ebooks/pdf/pdftohtml.py
+++ b/src/calibre/ebooks/pdf/pdftohtml.py
@@ -107,6 +107,7 @@ def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
                 raw = re.sub(r'<a id="(\d+)"', r'<a id="p\1"', raw, flags=re.I)
                 raw = re.sub(r'<a href="index.html#(\d+)"', r'<a href="#p\1"', raw, flags=re.I)
                 raw = replace_entities(raw)
+                raw = raw.replace('\u00a0', ' ')
 
                 i.write(raw.encode('utf-8'))