Code to extract anchor positions from PDF files generated by web engine

This commit is contained in:
Kovid Goyal 2019-07-07 08:14:10 +05:30
parent c8619893e0
commit f4941219b9
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 99 additions and 0 deletions

View File

@ -0,0 +1,46 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
# License: GPL v3 Copyright: 2019, Kovid Goyal <kovid at kovidgoyal.net>
from __future__ import absolute_import, division, print_function, unicode_literals
import sys
from PyQt5.Qt import QApplication, QUrl, QPageLayout, QPageSize, QMarginsF
from PyQt5.QtWebEngineWidgets import QWebEnginePage
from calibre.gui2 import load_builtin_fonts, must_use_qt
from calibre.utils.podofo import get_podofo
# Path of the PDF file written by Renderer.print_finished() and reported by main()
OUTPUT = '/t/dev.pdf'
class Renderer(QWebEnginePage):
    """Web engine page that prints itself to PDF and dumps the anchors found in the result."""

    def do_print(self, ok):
        """Start printing the page to PDF; connected to loadFinished by main().

        ``ok`` is accepted because the signal supplies it, but it is not consulted.
        """
        page_size = QPageSize(QPageSize(QPageSize.A6))
        margins = QMarginsF(10, 10, 10, 10)
        layout = QPageLayout(page_size, QPageLayout.Portrait, margins)
        # print_finished is handed to printToPdf as the callback that receives the PDF data
        self.printToPdf(self.print_finished, layout)

    def print_finished(self, pdf_data):
        """Write ``pdf_data`` to OUTPUT, ask the application to exit and pretty-print the anchors."""
        with open(OUTPUT, 'wb') as out:
            out.write(pdf_data)
        # Request exit with status 0; the remaining statements of this callback still run
        QApplication.instance().exit(0)
        document = get_podofo().PDFDoc()
        document.load(pdf_data)
        from pprint import pprint
        pprint(document.extract_anchors())
def main():
    """Load the local file named by the last command-line argument and print it to PDF."""
    must_use_qt()
    load_builtin_fonts()
    page = Renderer()
    source = QUrl.fromLocalFile(sys.argv[-1])
    page.setUrl(source)
    # Printing is started by do_print once the page reports that loading has finished
    page.loadFinished.connect(page.do_print)
    app = QApplication.instance()
    app.exec_()
    print('Output written to:', OUTPUT)
# Run main() only when this file is executed as a script, not when it is imported
if __name__ == '__main__':
    main()

View File

@ -6,6 +6,7 @@
*/
#include "global.h"
#include <iostream>
using namespace pdf;
@ -348,6 +349,55 @@ error:
} // }}}
// extract_anchors() {{{
/*
 * extract_anchors() -> list of (name, page number, left, top, zoom) tuples.
 *
 * Reads the dictionary referenced by the /Dests key of the document catalog
 * and emits one tuple per entry whose value is a destination array of the
 * form [page /XYZ left top zoom]. Entries of any other shape, and entries
 * whose page cannot be found in the pages tree, are skipped.
 *
 * Tuple members: the destination name decoded as UTF-8 (invalid bytes are
 * replaced), the value of PdfPage::GetPageNumber() for the target page, the
 * left and top values as floats and the zoom value as an integer.
 *
 * Returns a new reference to the list (empty when the document has no usable
 * /Dests dictionary). Returns NULL with a Python exception set when the list
 * cannot be allocated, when PoDoFo throws, or when building/appending a tuple
 * fails. The args parameter is unused.
 */
static PyObject *
PDFDoc_extract_anchors(PDFDoc *self, PyObject *args) {
    PyObject *ans = PyList_New(0);
    // Without this check a failed allocation would be passed on to PyList_Append()
    if (ans == NULL) return NULL;

    try {
        const PdfObject *catalog = self->doc->GetCatalog();
        if (catalog != NULL) {
            const PdfObject *dests_ref = catalog->GetDictionary().GetKey("Dests");
            PdfPagesTree *tree = self->doc->GetPagesTree();
            // tree is dereferenced in the loop below, so it must be non-NULL as well
            if (tree != NULL && dests_ref && dests_ref->IsReference()) {
                const PdfObject *dests_obj = self->doc->GetObjects().GetObject(dests_ref->GetReference());
                if (dests_obj && dests_obj->IsDictionary()) {
                    const TKeyMap &keys = dests_obj->GetDictionary().GetKeys();
                    for (TCIKeyMap itres = keys.begin(); itres != keys.end(); ++itres) {
                        if (!itres->second->IsArray()) continue;
                        const PdfArray &dest = itres->second->GetArray();
                        // see section 8.2 of PDF spec for different types of destination arrays
                        // but chromium apparently generates only [page /XYZ left top zoom] type arrays
                        if (dest.GetSize() < 5 || !dest[1].IsName() || dest[1].GetName().GetName() != "XYZ") continue;
                        const PdfPage *page = tree->GetPage(dest[0].GetReference());
                        if (!page) continue;

                        unsigned int pagenum = page->GetPageNumber();
                        double left = dest[2].GetReal(), top = dest[3].GetReal();
                        long long zoom = dest[4].GetNumber();
                        const std::string &anchor = itres->first.GetName();
                        // "N" hands the reference to the freshly decoded name over to the tuple
                        PyObject *tuple = Py_BuildValue("NIddL", PyUnicode_DecodeUTF8(anchor.c_str(), anchor.length(), "replace"), pagenum, left, top, zoom);
                        if (!tuple) break;
                        // PyList_Append() takes its own reference, so ours is dropped either way
                        int ret = PyList_Append(ans, tuple);
                        Py_DECREF(tuple);
                        if (ret != 0) break;
                    }
                }
            }
        }
    } catch(const PdfError & err) {
        podofo_set_exception(err);
        Py_CLEAR(ans);
        return NULL;
    } catch (...) {
        PyErr_SetString(PyExc_ValueError, "An unknown error occurred while trying to extract anchors");
        Py_CLEAR(ans);
        return NULL;
    }
    // The loop leaves via break when a Python call failed; report that failure here
    if (PyErr_Occurred()) { Py_CLEAR(ans); return NULL; }
    return ans;
} // }}}
// Properties {{{
static PyObject *
@ -570,6 +620,9 @@ static PyMethodDef PDFDoc_methods[] = {
{"image_count", (PyCFunction)PDFDoc_image_count, METH_VARARGS,
"image_count() -> Number of images in the PDF."
},
{"extract_anchors", (PyCFunction)PDFDoc_extract_anchors, METH_VARARGS,
"extract_anchors() -> Extract information about links in the document."
},
{"delete_page", (PyCFunction)PDFDoc_delete_page, METH_VARARGS,
"delete_page(page_num) -> Delete the specified page from the pdf (0 is the first page)."
},