Code to extract anchor positions from PDF files generated by the web engine

2025-07-08 18:54:09 -04:00 · 2019-07-07 08:14:10 +05:30 · 2019-07-07 08:14:10 +05:30 · f4941219b9
commit f4941219b9
parent c8619893e0
2 changed files with 99 additions and 0 deletions
--- a/src/calibre/ebooks/pdf/develop.py
+++ b/src/calibre/ebooks/pdf/develop.py
@ -0,0 +1,46 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+# License: GPL v3 Copyright: 2019, Kovid Goyal <kovid at kovidgoyal.net>
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import sys
+
+from PyQt5.Qt import QApplication, QUrl, QPageLayout, QPageSize, QMarginsF
+from PyQt5.QtWebEngineWidgets import QWebEnginePage
+
+from calibre.gui2 import load_builtin_fonts, must_use_qt
+from calibre.utils.podofo import get_podofo
+
+# Hard-coded path that Renderer.print_finished() writes the generated PDF to
+# and that main() reports once the event loop has exited.
+OUTPUT = '/t/dev.pdf'
+
+
+class Renderer(QWebEnginePage):
+
+    def do_print(self, ok):
+        p = QPageLayout(QPageSize(QPageSize(QPageSize.A6)), QPageLayout.Portrait, QMarginsF(10, 10, 10, 10))
+        self.printToPdf(self.print_finished, p)
+
+    def print_finished(self, pdf_data):
+        with open(OUTPUT, 'wb') as f:
+            f.write(pdf_data)
+        QApplication.instance().exit(0)
+        podofo = get_podofo()
+        doc = podofo.PDFDoc()
+        doc.load(pdf_data)
+        from pprint import pprint
+        pprint(doc.extract_anchors())
+
+
+def main():
+    must_use_qt()
+    load_builtin_fonts()
+    renderer = Renderer()
+    renderer.setUrl(QUrl.fromLocalFile(sys.argv[-1]))
+    renderer.loadFinished.connect(renderer.do_print)
+    QApplication.instance().exec_()
+    print('Output written to:', OUTPUT)
+
+
+# Run main() only when this file is executed as a script, not when imported
+if __name__ == '__main__':
+    main()
--- a/src/calibre/utils/podofo/doc.cpp
+++ b/src/calibre/utils/podofo/doc.cpp
@ -6,6 +6,7 @@
 */

 #include "global.h"
+#include <iostream>

 using namespace pdf;

@ -348,6 +349,55 @@ error:

 } // }}}

+// extract_anchors() {{{
+// Build a Python list describing the named destinations found in the /Dests
+// dictionary referenced from the document catalog. Every list item is a tuple
+// of (anchor name, page number, left, top, zoom). Only destination arrays of
+// the form [page /XYZ left top zoom] are reported, all other entries are
+// skipped. The args parameter is not used.
+// Returns a new reference to the list, or NULL with a Python exception set.
+static PyObject *
+PDFDoc_extract_anchors(PDFDoc *self, PyObject *args) {
+    const PdfObject* catalog = NULL;
+    PyObject *ans = PyList_New(0);
+    // PyList_New() has already set an exception on failure and appending to
+    // a NULL list would crash
+    if (ans == NULL) return NULL;
+    try {
+        if ((catalog = self->doc->GetCatalog()) != NULL) {
+            const PdfObject *dests_ref = catalog->GetDictionary().GetKey("Dests");
+            PdfPagesTree *tree = self->doc->GetPagesTree();
+            if (tree && dests_ref && dests_ref->IsReference()) {
+                const PdfObject *dests_obj = self->doc->GetObjects().GetObject(dests_ref->GetReference());
+                if (dests_obj && dests_obj->IsDictionary()) {
+                    const PdfDictionary &dests = dests_obj->GetDictionary();
+                    const TKeyMap &keys = dests.GetKeys();
+                    for (TCIKeyMap itres = keys.begin(); itres != keys.end(); ++itres) {
+                        if (!itres->second->IsArray()) continue;
+                        const PdfArray &dest = itres->second->GetArray();
+                        // see section 8.2 of PDF spec for different types of destination arrays
+                        // but chromium apparently generates only [page /XYZ left top zoom] type arrays
+                        if (dest.GetSize() < 5) continue;
+                        // skip entries whose page is not an indirect reference instead
+                        // of letting a single odd entry fail the whole extraction
+                        if (!dest[0].IsReference()) continue;
+                        if (!dest[1].IsName() || !(dest[1].GetName().GetName() == "XYZ")) continue;
+                        const PdfPage *page = tree->GetPage(dest[0].GetReference());
+                        if (!page) continue;
+                        unsigned int pagenum = page->GetPageNumber();
+                        double left = dest[2].GetReal(), top = dest[3].GetReal();
+                        long long zoom = dest[4].GetNumber();
+                        const std::string &anchor = itres->first.GetName();
+                        // "N" hands the reference to the decoded name over to the tuple
+                        PyObject *tuple = Py_BuildValue("NIddL", PyUnicode_DecodeUTF8(anchor.c_str(), anchor.length(), "replace"), pagenum, left, top, zoom);
+                        if (!tuple) break;
+                        int ret = PyList_Append(ans, tuple);
+                        Py_DECREF(tuple);
+                        if (ret != 0) break;
+                    }
+                }
+            }
+        }
+    } catch(const PdfError & err) {
+        podofo_set_exception(err);
+        Py_CLEAR(ans);
+        return NULL;
+    } catch (...) {
+        PyErr_SetString(PyExc_ValueError, "An unknown error occurred while trying to extract the anchors");
+        Py_CLEAR(ans);
+        return NULL;
+    }
+    // the loop above is left early when building or appending a tuple failed
+    if (PyErr_Occurred()) { Py_CLEAR(ans); return NULL; }
+    return ans;
+} // }}}
+
 // Properties {{{

 static PyObject *
@ -570,6 +620,9 @@ static PyMethodDef PDFDoc_methods[] = {
    {"image_count", (PyCFunction)PDFDoc_image_count, METH_VARARGS,
     "image_count() -> Number of images in the PDF."
    },
+    {"extract_anchors", (PyCFunction)PDFDoc_extract_anchors, METH_VARARGS,
+     "extract_anchors() -> Extract information about links in the document."
+    },
    {"delete_page", (PyCFunction)PDFDoc_delete_page, METH_VARARGS,
     "delete_page(page_num) -> Delete the specified page from the pdf (0 is the first page)."
    },