PDF Input: Add support for PDF outlines (bookmarks). Fixes #1524522 [Private bug](https://bugs.launchpad.net/calibre/+bug/1524522)

This commit is contained in:
Kovid Goyal 2015-12-10 18:50:47 +05:30
parent d727310212
commit 090f8e4e5f
3 changed files with 47 additions and 7 deletions

View File

@ -67,5 +67,12 @@ class PDFInput(InputFormatPlugin):
log.debug('Rendering manifest...')
with open(u'metadata.opf', 'wb') as opffile:
opf.render(opffile)
if os.path.exists(u'toc.ncx'):
ncxid = opf.manifest.id_for_path('toc.ncx')
if ncxid:
with open(u'metadata.opf', 'r+b') as f:
raw = f.read().replace(b'<spine', b'<spine toc="%s"' % bytes(ncxid))
f.seek(0)
f.write(raw)
return os.path.join(os.getcwdu(), u'metadata.opf')

View File

@ -460,8 +460,6 @@ class HTMLPreProcessor(object):
# Center separator lines
(re.compile(u'<br>\s*(?P<break>([*#•✦=] *){3,})\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group('break') + '</p>'),
# Remove page links
(re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''),
# Remove <hr> tags
(re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: ''),

View File

@ -85,10 +85,6 @@ def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
logf.flush()
logf.close()
out = open(logf.name, 'rb').read().strip()
try:
os.remove(pdfsrc)
except:
pass
if ret != 0:
raise ConversionError(b'return code: %d\n%s' % (ret, out))
if out:
@ -106,7 +102,46 @@ def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
i.truncate()
# versions of pdftohtml >= 0.20 output self closing <br> tags, this
# breaks the pdf heuristics regexps, so replace them
i.write(raw.replace(b'<br/>', b'<br>'))
raw = raw.replace(b'<br/>', b'<br>')
raw = re.sub(br'<a\s+name=(\d+)', br'<a id="\1"', raw, flags=re.I)
i.write(raw)
cmd = [exe, b'-f', b'1', '-l', '1', b'-xml', b'-i', b'-enc', b'UTF-8', b'-noframes', b'-p', b'-nomerge',
b'-nodrm', b'-q', b'-stdout', a(pdfsrc)]
p = popen(cmd, stdout=subprocess.PIPE)
raw = p.stdout.read().strip()
if p.wait() == 0 and raw:
parse_outline(raw, output_dir)
if isbsd:
cmd.remove(b'-nodrm')
try:
os.remove(pdfsrc)
except:
pass
def parse_outline(raw, output_dir):
    '''
    Convert the PDF outline (bookmarks) found in the XML output of
    pdftohtml into a ``toc.ncx`` file inside output_dir.

    :param raw: The XML document produced by ``pdftohtml -xml -stdout``,
                as a bytestring.
    :param output_dir: Directory in which ``toc.ncx`` is created.

    If the document contains no ``<outline>`` element, no file is written.
    Only the first ``<outline>`` in document order is used as the root of
    the Table of Contents; outlines nested inside it become sub-levels.
    '''
    from lxml import etree
    outline = etree.fromstring(raw).xpath('(//outline)[1]')
    if not outline:
        return
    from calibre.ebooks.oeb.polish.toc import TOC, create_ncx

    def process_node(node, toc):
        # Walk only the *direct* children. Nested <outline> elements are
        # handled by the recursive call below, so iterating over all
        # descendants here would add every nested entry more than once:
        # flattened into this level and again under its real parent.
        for child in node.iterchildren('*'):
            if child.tag == 'outline':
                # A nested outline holds the sub-entries of the entry that
                # precedes it; if there is no preceding entry, attach the
                # sub-entries to the current level instead.
                parent = toc.children[-1] if toc.children else toc
                process_node(child, parent)
            elif child.text:
                # Entries without a title are skipped, they cannot be
                # displayed in a Table of Contents.
                # NOTE(review): the page number is passed as the third
                # argument of TOC.add(), presumably the link fragment, to
                # match the numeric ids given to the page anchors in
                # index.html -- confirm against TOC.add().
                page = child.get('page', '1')
                toc.add(child.text, 'index.html', page)

    toc = TOC()
    process_node(outline[0], toc)
    root = create_ncx(toc, (lambda x:x), 'pdftohtml', 'en', 'pdftohtml')
    with open(os.path.join(output_dir, 'toc.ncx'), 'wb') as f:
        f.write(etree.tostring(root, pretty_print=True, with_tail=False, encoding='utf-8', xml_declaration=True))
def flip_image(img, flip):
from calibre.utils.magick import Image