Move the replacement of non-breaking spaces (nbsp) in pdftohtml output from the conversion pipeline to the PDF input plugin. This makes viewing PDFs in the viewer a bit better. See #1814626 (Text goes over to other page)

2025-08-30 23:00:21 -04:00 · 2019-02-05 13:21:58 +05:30 · 2019-02-05 13:21:58 +05:30 · aafc038b17
commit aafc038b17
parent 0c815cd06d
2 changed files with 1 addition and 3 deletions
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -531,9 +531,6 @@ class HTMLPreProcessor(object):
            rules = []
        start_rules = []
        if is_pdftohtml:
            # Remove non breaking spaces
            start_rules.append((re.compile(unicode(r'\u00a0')), lambda match : ' '))
        if not getattr(self.extra_opts, 'keep_ligatures', False):
            html = _ligpat.sub(lambda m:LIGATURES[m.group()], html)
--- a/src/calibre/ebooks/pdf/pdftohtml.py
+++ b/src/calibre/ebooks/pdf/pdftohtml.py
@@ -107,6 +107,7 @@ def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
                raw = re.sub(r'<a id="(\d+)"', r'<a id="p\1"', raw, flags=re.I)
                raw = re.sub(r'<a href="index.html#(\d+)"', r'<a href="#p\1"', raw, flags=re.I)
                raw = replace_entities(raw)
                raw = raw.replace('\u00a0', ' ')
                i.write(raw.encode('utf-8'))