mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
PDF Input: Fix a regression in 3.24 that caused the quality of PDF conversion to become significantly worse. Fixes #1775984 [pdf to epub conversion worsen from 3.23 to 3.24](https://bugs.launchpad.net/calibre/+bug/1775984)
The magic cookie (the `<!-- created by calibre's pdftohtml -->` comment) that the conversion pipeline uses to detect pdftohtml output was being stripped by the change made to handle private entities.
This commit is contained in:
parent
2a5c7c098b
commit
10aea19a9f
@ -664,6 +664,9 @@ class HTMLPreProcessor(object):
|
|||||||
preprocessor = HeuristicProcessor(self.extra_opts, self.log)
|
preprocessor = HeuristicProcessor(self.extra_opts, self.log)
|
||||||
html = preprocessor(html)
|
html = preprocessor(html)
|
||||||
|
|
||||||
|
if is_pdftohtml:
|
||||||
|
html = html.replace('<!-- created by calibre\'s pdftohtml -->', '')
|
||||||
|
|
||||||
if getattr(self.extra_opts, 'smarten_punctuation', False):
|
if getattr(self.extra_opts, 'smarten_punctuation', False):
|
||||||
html = smarten_punctuation(html, self.log)
|
html = smarten_punctuation(html, self.log)
|
||||||
|
|
||||||
|
@ -100,7 +100,7 @@ def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
|
|||||||
with lopen(index, 'r+b') as i:
|
with lopen(index, 'r+b') as i:
|
||||||
raw = i.read()
|
raw = i.read()
|
||||||
raw = flip_images(raw)
|
raw = flip_images(raw)
|
||||||
raw = '<!-- created by calibre\'s pdftohtml -->\n' + raw
|
raw = raw.replace('<head', '<!-- created by calibre\'s pdftohtml -->\n <head', 1)
|
||||||
i.seek(0)
|
i.seek(0)
|
||||||
i.truncate()
|
i.truncate()
|
||||||
# versions of pdftohtml >= 0.20 output self closing <br> tags, this
|
# versions of pdftohtml >= 0.20 output self closing <br> tags, this
|
||||||
@ -109,9 +109,6 @@ def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
|
|||||||
raw = re.sub(br'<a\s+name=(\d+)', br'<a id="\1"', raw, flags=re.I)
|
raw = re.sub(br'<a\s+name=(\d+)', br'<a id="\1"', raw, flags=re.I)
|
||||||
raw = re.sub(br'<a id="(\d+)"', br'<a id="p\1"', raw, flags=re.I)
|
raw = re.sub(br'<a id="(\d+)"', br'<a id="p\1"', raw, flags=re.I)
|
||||||
raw = re.sub(br'<a href="index.html#(\d+)"', br'<a href="#p\1"', raw, flags=re.I)
|
raw = re.sub(br'<a href="index.html#(\d+)"', br'<a href="#p\1"', raw, flags=re.I)
|
||||||
# pdftohtml adds link and background colors on <body>. The
|
|
||||||
# background color is incorrect
|
|
||||||
raw = re.sub(b'<body .+?>', b'<body>', raw)
|
|
||||||
|
|
||||||
i.write(raw)
|
i.write(raw)
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user