PDF Input: Fix a regression in 3.24 that caused conversion of PDF to be significantly worse. Fixes #1775984 [pdf to epub conversion worsen from 3.23 to 3.24](https://bugs.launchpad.net/calibre/+bug/1775984)

The magic cookie that the conversion pipeline uses to detect pdftohtml output was being stripped by the change made to handle private entities.
2025-07-09 03:04:10 -04:00 · 2018-06-10 08:02:21 +05:30 · 2018-06-10 08:02:21 +05:30 · 10aea19a9f
commit 10aea19a9f
parent 2a5c7c098b
2 changed files with 4 additions and 4 deletions
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -664,6 +664,9 @@ class HTMLPreProcessor(object):
            preprocessor = HeuristicProcessor(self.extra_opts, self.log)
            html = preprocessor(html)
        if is_pdftohtml:
            html = html.replace('<!-- created by calibre\'s pdftohtml -->', '')
        if getattr(self.extra_opts, 'smarten_punctuation', False):
            html = smarten_punctuation(html, self.log)
--- a/src/calibre/ebooks/pdf/pdftohtml.py
+++ b/src/calibre/ebooks/pdf/pdftohtml.py
@@ -100,7 +100,7 @@ def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
            with lopen(index, 'r+b') as i:
                raw = i.read()
                raw = flip_images(raw)
-                raw = '<!-- created by calibre\'s pdftohtml -->\n' + raw
+                raw = raw.replace('<head', '<!-- created by calibre\'s pdftohtml -->\n  <head', 1)
                i.seek(0)
                i.truncate()
                # versions of pdftohtml >= 0.20 output self closing <br> tags, this
@@ -109,9 +109,6 @@ def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
                raw = re.sub(br'<a\s+name=(\d+)', br'<a id="\1"', raw, flags=re.I)
                raw = re.sub(br'<a id="(\d+)"', br'<a id="p\1"', raw, flags=re.I)
                raw = re.sub(br'<a href="index.html#(\d+)"', br'<a href="#p\1"', raw, flags=re.I)
                # pdftohtml adds link and background colors on <body>. The
                # background color is incorrect
                raw = re.sub(b'<body .+?>', b'<body>', raw)
                i.write(raw)