diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 92864f8e6f..3ae43be80f 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -664,6 +664,9 @@ class HTMLPreProcessor(object): preprocessor = HeuristicProcessor(self.extra_opts, self.log) html = preprocessor(html) + if is_pdftohtml: + html = html.replace('', '') + if getattr(self.extra_opts, 'smarten_punctuation', False): html = smarten_punctuation(html, self.log) diff --git a/src/calibre/ebooks/pdf/pdftohtml.py b/src/calibre/ebooks/pdf/pdftohtml.py index 69b7724147..b4f073312c 100644 --- a/src/calibre/ebooks/pdf/pdftohtml.py +++ b/src/calibre/ebooks/pdf/pdftohtml.py @@ -100,7 +100,7 @@ def pdftohtml(output_dir, pdf_path, no_images, as_xml=False): with lopen(index, 'r+b') as i: raw = i.read() raw = flip_images(raw) - raw = '\n' + raw + raw = raw.replace('\n = 0.20 output self closing
tags, this @@ -109,9 +109,6 @@ def pdftohtml(output_dir, pdf_path, no_images, as_xml=False): raw = re.sub(br'. The - # background color is incorrect - raw = re.sub(b'', b'', raw) i.write(raw)