From aafc038b177a5555b890cd7380aeff72da26e082 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 5 Feb 2019 13:21:58 +0530 Subject: [PATCH] Move replacement of nbsp in pdftohtml output from pipeline to input plugin. Makes viewing PDF in the viewer a bit better. See #1814626 (Text goes over to other page) --- src/calibre/ebooks/conversion/preprocess.py | 3 --- src/calibre/ebooks/pdf/pdftohtml.py | 1 + 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 58a7dc6c25..d92cc2aa1f 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -531,9 +531,6 @@ class HTMLPreProcessor(object): rules = [] start_rules = [] - if is_pdftohtml: - # Remove non breaking spaces - start_rules.append((re.compile(unicode(r'\u00a0')), lambda match : ' ')) if not getattr(self.extra_opts, 'keep_ligatures', False): html = _ligpat.sub(lambda m:LIGATURES[m.group()], html) diff --git a/src/calibre/ebooks/pdf/pdftohtml.py b/src/calibre/ebooks/pdf/pdftohtml.py index 44da4e8095..1cbf66daef 100644 --- a/src/calibre/ebooks/pdf/pdftohtml.py +++ b/src/calibre/ebooks/pdf/pdftohtml.py @@ -107,6 +107,7 @@ def pdftohtml(output_dir, pdf_path, no_images, as_xml=False): raw = re.sub(r'