diff --git a/src/calibre/ebooks/pdf/develop.py b/src/calibre/ebooks/pdf/develop.py
new file mode 100644
index 0000000000..4376c7eee6
--- /dev/null
+++ b/src/calibre/ebooks/pdf/develop.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+# License: GPL v3 Copyright: 2019, Kovid Goyal
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import sys
+
+from PyQt5.Qt import QApplication, QUrl, QPageLayout, QPageSize, QMarginsF
+from PyQt5.QtWebEngineWidgets import QWebEnginePage
+
+from calibre.gui2 import load_builtin_fonts, must_use_qt
+from calibre.utils.podofo import get_podofo
+
+# Path the rendered PDF is written to
+OUTPUT = '/t/dev.pdf'
+
+
+class Renderer(QWebEnginePage):
+    'Web page that prints itself to OUTPUT and dumps the anchors found in the resulting PDF'
+
+    def do_print(self, ok):
+        'Slot for loadFinished: print the page as A6 portrait PDF (the ok flag is not checked)'
+        p = QPageLayout(QPageSize(QPageSize.A6), QPageLayout.Portrait, QMarginsF(10, 10, 10, 10))
+        self.printToPdf(self.print_finished, p)
+
+    def print_finished(self, pdf_data):
+        'Callback for printToPdf: save the PDF to OUTPUT, request event loop exit and pretty-print the extracted anchors'
+        with open(OUTPUT, 'wb') as f:
+            f.write(pdf_data)
+        QApplication.instance().exit(0)
+        podofo = get_podofo()
+        doc = podofo.PDFDoc()
+        doc.load(pdf_data)
+        from pprint import pprint
+        pprint(doc.extract_anchors())
+
+
+def main():
+    'Render the local file named by the last command line argument to OUTPUT'
+    must_use_qt()
+    load_builtin_fonts()
+    renderer = Renderer()
+    renderer.setUrl(QUrl.fromLocalFile(sys.argv[-1]))
+    renderer.loadFinished.connect(renderer.do_print)
+    QApplication.instance().exec_()
+    print('Output written to:', OUTPUT)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/src/calibre/utils/podofo/doc.cpp b/src/calibre/utils/podofo/doc.cpp
index 0a1e71a820..da0f6161c2 100644
--- a/src/calibre/utils/podofo/doc.cpp
+++ b/src/calibre/utils/podofo/doc.cpp
@@ -6,6 +6,7 @@
  */
 
 #include "global.h"
+#include <string>
 
 using namespace pdf;
 
@@ -348,6 +349,61 @@ error:
 
 } // }}}
 
+// extract_anchors() {{{
+// Build a list of (name, page number, left, top, zoom) tuples, one for each
+// [page /XYZ left top zoom] entry in the catalog's /Dests dictionary.
+// Returns the list, or NULL with a Python exception set on failure.
+static PyObject *
+PDFDoc_extract_anchors(PDFDoc *self, PyObject *args) {
+    const PdfObject* catalog = NULL;
+    PyObject *ans = PyList_New(0);
+    if (ans == NULL) return NULL;
+    try {
+        if ((catalog = self->doc->GetCatalog()) != NULL) {
+            const PdfObject *dests_ref = catalog->GetDictionary().GetKey("Dests");
+            PdfPagesTree *tree = self->doc->GetPagesTree();
+            if (dests_ref && dests_ref->IsReference()) {
+                const PdfObject *dests_obj = self->doc->GetObjects().GetObject(dests_ref->GetReference());
+                if (dests_obj && dests_obj->IsDictionary()) {
+                    const PdfDictionary &dests = dests_obj->GetDictionary();
+                    const TKeyMap &keys = dests.GetKeys();
+                    for (TCIKeyMap itres = keys.begin(); itres != keys.end(); ++itres) {
+                        if (itres->second->IsArray()) {
+                            const PdfArray &dest = itres->second->GetArray();
+                            // see section 8.2 of PDF spec for different types of destination arrays
+                            // but chromium apparently generates only [page /XYZ left top zoom] type arrays
+                            // entries whose page is not an indirect reference are skipped like any other unsupported form
+                            if (dest.GetSize() > 4 && dest[0].IsReference() && dest[1].IsName() && dest[1].GetName().GetName() == "XYZ") {
+                                const PdfPage *page = tree->GetPage(dest[0].GetReference());
+                                if (page) {
+                                    unsigned int pagenum = page->GetPageNumber();
+                                    double left = dest[2].GetReal(), top = dest[3].GetReal();
+                                    long long zoom = dest[4].GetNumber();
+                                    const std::string &anchor = itres->first.GetName();
+                                    PyObject *tuple = Py_BuildValue("NIddL", PyUnicode_DecodeUTF8(anchor.c_str(), anchor.length(), "replace"), pagenum, left, top, zoom);
+                                    if (!tuple) { break; }
+                                    else { int ret = PyList_Append(ans, tuple); Py_DECREF(tuple); if (ret != 0) break; }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    } catch(const PdfError & err) {
+        podofo_set_exception(err);
+        Py_CLEAR(ans);
+        return NULL;
+    } catch (...) {
+        PyErr_SetString(PyExc_ValueError, "An unknown error occurred while trying to extract the anchors");
+        Py_CLEAR(ans);
+        return NULL;
+    }
+    // the loop is left early, with the Python error set, when building or appending a tuple fails
+    if (PyErr_Occurred()) { Py_CLEAR(ans); return NULL; }
+    return ans;
+} // }}}
+
 // Properties {{{
 
 static PyObject *
@@ -570,6 +626,9 @@ static PyMethodDef PDFDoc_methods[] = {
     {"image_count", (PyCFunction)PDFDoc_image_count, METH_VARARGS,
      "image_count() -> Number of images in the PDF."
     },
+    {"extract_anchors", (PyCFunction)PDFDoc_extract_anchors, METH_VARARGS,
+     "extract_anchors() -> List of (name, page_number, left, top, zoom) tuples for the named destinations in the document."
+    },
     {"delete_page", (PyCFunction)PDFDoc_delete_page, METH_VARARGS,
      "delete_page(page_num) -> Delete the specified page from the pdf (0 is the first page)."
     },