PDF Input: Fix a regression in 3.24 that caused conversion of PDF to be significantly worse. Fixes #1775984 [pdf to epub conversion worsen from 3.23 to 3.24](https://bugs.launchpad.net/calibre/+bug/1775984)

The magic cookie that the conversion pipeline used to detect pdftohtml output
was being stripped by the change to handle private entities.
This commit is contained in:
Kovid Goyal 2018-06-10 08:02:21 +05:30
parent 2a5c7c098b
commit 10aea19a9f
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 4 additions and 4 deletions

View File

@@ -664,6 +664,9 @@ class HTMLPreProcessor(object):
preprocessor = HeuristicProcessor(self.extra_opts, self.log) preprocessor = HeuristicProcessor(self.extra_opts, self.log)
html = preprocessor(html) html = preprocessor(html)
if is_pdftohtml:
html = html.replace('<!-- created by calibre\'s pdftohtml -->', '')
if getattr(self.extra_opts, 'smarten_punctuation', False): if getattr(self.extra_opts, 'smarten_punctuation', False):
html = smarten_punctuation(html, self.log) html = smarten_punctuation(html, self.log)

View File

@@ -100,7 +100,7 @@ def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
with lopen(index, 'r+b') as i: with lopen(index, 'r+b') as i:
raw = i.read() raw = i.read()
raw = flip_images(raw) raw = flip_images(raw)
raw = '<!-- created by calibre\'s pdftohtml -->\n' + raw raw = raw.replace('<head', '<!-- created by calibre\'s pdftohtml -->\n <head', 1)
i.seek(0) i.seek(0)
i.truncate() i.truncate()
# versions of pdftohtml >= 0.20 output self closing <br> tags, this # versions of pdftohtml >= 0.20 output self closing <br> tags, this
@@ -109,9 +109,6 @@ def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
raw = re.sub(br'<a\s+name=(\d+)', br'<a id="\1"', raw, flags=re.I) raw = re.sub(br'<a\s+name=(\d+)', br'<a id="\1"', raw, flags=re.I)
raw = re.sub(br'<a id="(\d+)"', br'<a id="p\1"', raw, flags=re.I) raw = re.sub(br'<a id="(\d+)"', br'<a id="p\1"', raw, flags=re.I)
raw = re.sub(br'<a href="index.html#(\d+)"', br'<a href="#p\1"', raw, flags=re.I) raw = re.sub(br'<a href="index.html#(\d+)"', br'<a href="#p\1"', raw, flags=re.I)
# pdftohtml adds link and background colors on <body>. The
# background color is incorrect
raw = re.sub(b'<body .+?>', b'<body>', raw)
i.write(raw) i.write(raw)