From beb97277570be7393a0458e8d227bda1a8d403e4 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Thu, 3 Oct 2019 18:12:45 +0530
Subject: [PATCH] Add a function to get the outline from a PDF file

---
Review notes (git am ignores the text between the "---" line and the diff):
* The Python get_outline() now returns [] instead of raising TypeError when
  PDFDoc.get_outline() returns None for a PDF that has no outline.
* convert_outline() loops over sibling items instead of recursing on Next().
* The C++ get_outline() holds its result in a pyunique_ptr and no longer
  carries an unreachable NULL check.
* Indentation and blank lines were reconstructed from the hunk line counts,
  and the post-image blob ids on the "index" lines predate these changes:
  run "git apply --check" against the tree before applying.

 src/calibre/utils/podofo/__init__.py  | 25 ++++++++++-
 src/calibre/utils/podofo/doc.cpp      |  3 ++
 src/calibre/utils/podofo/global.h     |  1 +
 src/calibre/utils/podofo/outlines.cpp | 62 ++++++++++++++++++++++++++-
 4 files changed, 88 insertions(+), 3 deletions(-)

diff --git a/src/calibre/utils/podofo/__init__.py b/src/calibre/utils/podofo/__init__.py
index c036eb8a23..2402dc8216 100644
--- a/src/calibre/utils/podofo/__init__.py
+++ b/src/calibre/utils/podofo/__init__.py
@@ -5,6 +5,7 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 
 import os
 import shutil
+import sys
 
 from calibre.constants import plugins, preferred_encoding
 from calibre.ebooks.metadata import authors_to_string
@@ -115,6 +116,29 @@ def get_xmp_metadata(path):
     return p.get_xmp_metadata()
 
 
+def get_outline(path=None):
+    '''
+    Return the list of top-level outline (bookmark) nodes of a PDF file.
+
+    path defaults to the last command line argument. Every node is a dict
+    with a 'title', a list of 'children' nodes and, when the item has a
+    destination, a 'dest' dict with 'page', 'top', 'left' and 'zoom' keys.
+    An empty list is returned when the PDF has no outline.
+    '''
+    if path is None:
+        path = sys.argv[-1]
+    podofo = get_podofo()
+    p = podofo.PDFDoc()
+    with open(path, 'rb') as f:
+        raw = f.read()
+    p.load(raw)
+    outline = p.get_outline()
+    # The C++ get_outline() returns None when the PDF has no outline
+    if outline is None:
+        return []
+    return outline['children']
+
+
 def get_image_count(path):
     podofo = get_podofo()
     p = podofo.PDFDoc()
@@ -214,5 +238,4 @@ def test_podofo():
 
 
 if __name__ == '__main__':
-    import sys
     get_xmp_metadata(sys.argv[-1])
diff --git a/src/calibre/utils/podofo/doc.cpp b/src/calibre/utils/podofo/doc.cpp
index 013b588cd0..f788eaefeb 100644
--- a/src/calibre/utils/podofo/doc.cpp
+++ b/src/calibre/utils/podofo/doc.cpp
@@ -752,6 +752,9 @@ static PyMethodDef PDFDoc_methods[] = {
     {"create_outline", (PyCFunction)py_create_outline, METH_VARARGS,
      "create_outline(title, pagenum) -> Create an outline, return the first outline item."
     },
+    {"get_outline", (PyCFunction)py_get_outline, METH_NOARGS,
+     "get_outline() -> Get the outline if any in the PDF file."
+    },
     {"get_xmp_metadata", (PyCFunction)PDFDoc_get_xmp_metadata, METH_VARARGS,
      "get_xmp_metadata(raw) -> Get the XMP metadata as raw bytes"
     },
diff --git a/src/calibre/utils/podofo/global.h b/src/calibre/utils/podofo/global.h
index c5ff9be795..09b8f1b02b 100644
--- a/src/calibre/utils/podofo/global.h
+++ b/src/calibre/utils/podofo/global.h
@@ -103,6 +103,7 @@ PyObject* py_dedup_type3_fonts(PDFDoc *self, PyObject *args);
 PyObject* py_impose(PDFDoc *self, PyObject *args);
 PyObject* py_dedup_images(PDFDoc *self, PyObject *args);
 PyObject* py_create_outline(PDFDoc *self, PyObject *args);
+PyObject* py_get_outline(PDFDoc *self, PyObject *args);
 
 }
 }
diff --git a/src/calibre/utils/podofo/outlines.cpp b/src/calibre/utils/podofo/outlines.cpp
index be7cc7352d..674cf5d432 100644
--- a/src/calibre/utils/podofo/outlines.cpp
+++ b/src/calibre/utils/podofo/outlines.cpp
@@ -9,7 +9,6 @@
 
 using namespace pdf;
 
-// create_outline() {{{
 static PyObject *
 create_outline(PDFDoc *self, PyObject *args) {
     PDFOutlineItem *ans;
@@ -53,6 +52,65 @@ error:
     Py_XDECREF(ans);
     return NULL;
 
-} // }}}
+}
+
+// Create an empty outline node: a dict whose "children" key maps to a new list.
+static PyObject*
+create_outline_node() {
+    pyunique_ptr ans(PyDict_New());
+    if (!ans) return NULL;
+    pyunique_ptr children(PyList_New(0));
+    if (!children) return NULL;
+    if (PyDict_SetItemString(ans.get(), "children", children.get()) != 0) return NULL;
+    return ans.release();
+}
+
+// Append a node for item and for each of its following siblings to the
+// "children" list of parent, recursing into the descendants of every item.
+// Stops at the first failure; the caller detects it with PyErr_Occurred().
+static void
+convert_outline(PDFDoc *self, PyObject *parent, PdfOutlineItem *item) {
+    // Borrowed reference, the list is kept alive by parent
+    PyObject *children = PyDict_GetItemString(parent, "children");
+    if (!children) { PyErr_SetString(PyExc_KeyError, "children"); return; }
+    // Walk the siblings in a loop so that native stack usage grows with the
+    // nesting depth of the outline and not with its total number of items
+    for (; item; item = item->Next()) {
+        pyunique_ptr title(podofo_convert_pdfstring(item->GetTitle()));
+        if (!title) return;
+        pyunique_ptr node(create_outline_node());
+        if (!node) return;
+        if (PyDict_SetItemString(node.get(), "title", title.get()) != 0) return;
+        PdfDestination* dest = item->GetDestination(self->doc);
+        if (dest) {
+            PdfPage *page = dest->GetPage(self->doc);
+            // -1 is reported when GetPage() yields no page for the destination
+            long pnum = page ? page->GetPageNumber() : -1;
+            pyunique_ptr d(Py_BuildValue("{sl sd sd sd}", "page", pnum, "top", dest->GetTop(), "left", dest->GetLeft(), "zoom", dest->GetZoom()));
+            if (!d) return;
+            if (PyDict_SetItemString(node.get(), "dest", d.get()) != 0) return;
+        }
+        if (PyList_Append(children, node.get()) != 0) return;
+
+        if (item->First()) {
+            convert_outline(self, node.get(), item->First());
+            if (PyErr_Occurred()) return;
+        }
+    }
+}
+
+// get_outline() -> root outline node (a dict), or None if the PDF has no outline.
+static PyObject *
+get_outline(PDFDoc *self, PyObject *args) {
+    PdfOutlines *root = self->doc->GetOutlines(PoDoFo::ePdfDontCreateObject);
+    if (!root || !root->First()) Py_RETURN_NONE;
+    // Held in a pyunique_ptr, like every other object here, for cleanup on early exit
+    pyunique_ptr ans(create_outline_node());
+    if (!ans) return NULL;
+    convert_outline(self, ans.get(), root->First());
+    if (PyErr_Occurred()) return NULL;
+    return ans.release();
+}
 
 PYWRAP(create_outline)
+PYWRAP(get_outline)