mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 18:54:09 -04:00
Code to extract anchor positions from PDF files generated by web engine
This commit is contained in:
parent
c8619893e0
commit
f4941219b9
46
src/calibre/ebooks/pdf/develop.py
Normal file
46
src/calibre/ebooks/pdf/develop.py
Normal file
@ -0,0 +1,46 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=utf-8
|
||||
# License: GPL v3 Copyright: 2019, Kovid Goyal <kovid at kovidgoyal.net>
|
||||
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import sys
|
||||
|
||||
from PyQt5.Qt import QApplication, QUrl, QPageLayout, QPageSize, QMarginsF
|
||||
from PyQt5.QtWebEngineWidgets import QWebEnginePage
|
||||
|
||||
from calibre.gui2 import load_builtin_fonts, must_use_qt
|
||||
from calibre.utils.podofo import get_podofo
|
||||
|
||||
# Path that Renderer.print_finished() writes the generated PDF to
OUTPUT = '/t/dev.pdf'
|
||||
|
||||
|
||||
class Renderer(QWebEnginePage):
    '''
    Web engine page that prints itself to PDF, saves the PDF to OUTPUT and
    pretty-prints the anchors that podofo extracts from the generated data.
    '''

    def do_print(self, ok):
        '''
        Start printing the page to PDF. Intended to be connected to the
        loadFinished signal.

        :param ok: False if loading the page failed, in which case nothing is
                   printed and the application is asked to exit with code 1.
        '''
        if not ok:
            print('Failed to load the input document', file=sys.stderr)
            QApplication.instance().exit(1)
            return
        p = QPageLayout(QPageSize(QPageSize.A6), QPageLayout.Portrait, QMarginsF(10, 10, 10, 10))
        self.printToPdf(self.print_finished, p)

    def print_finished(self, pdf_data):
        '''
        Callback for printToPdf(): write the PDF to OUTPUT, ask the
        application to exit and print the anchors found in the PDF.

        :param pdf_data: The generated PDF data. Empty data is treated as a
                         printing failure: nothing is written and the
                         application is asked to exit with code 1.
        '''
        if len(pdf_data) == 0:
            print('Failed to render the document to PDF', file=sys.stderr)
            QApplication.instance().exit(1)
            return
        with open(OUTPUT, 'wb') as f:
            f.write(pdf_data)
        QApplication.instance().exit(0)
        podofo = get_podofo()
        doc = podofo.PDFDoc()
        doc.load(pdf_data)
        from pprint import pprint
        pprint(doc.extract_anchors())
|
||||
|
||||
|
||||
def main():
    '''
    Render the file named by the last command line argument to PDF using a
    Renderer and run the Qt event loop until the renderer asks it to exit.

    :return: The exit code of the Qt event loop.
    '''
    # Local import so this function does not depend on the module's import block
    import os
    must_use_qt()
    load_builtin_fonts()
    renderer = Renderer()
    # Connect before starting the load so the finished signal cannot be missed
    renderer.loadFinished.connect(renderer.do_print)
    # QUrl.fromLocalFile() needs an absolute path to produce an absolute file: URL
    renderer.setUrl(QUrl.fromLocalFile(os.path.abspath(sys.argv[-1])))
    rc = QApplication.instance().exec_()
    if rc == 0:
        print('Output written to:', OUTPUT)
    return rc


if __name__ == '__main__':
    sys.exit(main())
|
@ -6,6 +6,7 @@
|
||||
*/
|
||||
|
||||
#include "global.h"
|
||||
#include <iostream>
|
||||
|
||||
using namespace pdf;
|
||||
|
||||
@ -348,6 +349,55 @@ error:
|
||||
|
||||
} // }}}
|
||||
|
||||
// extract_links() {{{
|
||||
static PyObject *
|
||||
PDFDoc_extract_anchors(PDFDoc *self, PyObject *args) {
|
||||
const PdfObject* catalog = NULL;
|
||||
PyObject *ans = PyList_New(0);
|
||||
try {
|
||||
if ((catalog = self->doc->GetCatalog()) != NULL) {
|
||||
const PdfObject *dests_ref = catalog->GetDictionary().GetKey("Dests");
|
||||
PdfPagesTree *tree = self->doc->GetPagesTree();
|
||||
if (dests_ref && dests_ref->IsReference()) {
|
||||
const PdfObject *dests_obj = self->doc->GetObjects().GetObject(dests_ref->GetReference());
|
||||
if (dests_obj && dests_obj->IsDictionary()) {
|
||||
const PdfDictionary &dests = dests_obj->GetDictionary();
|
||||
const TKeyMap &keys = dests.GetKeys();
|
||||
for (TCIKeyMap itres = keys.begin(); itres != keys.end(); ++itres) {
|
||||
if (itres->second->IsArray()) {
|
||||
const PdfArray &dest = itres->second->GetArray();
|
||||
// see section 8.2 of PDF spec for different types of destination arrays
|
||||
// but chromium apparently generates only [page /XYZ left top zoom] type arrays
|
||||
if (dest.GetSize() > 4 && dest[1].IsName() && dest[1].GetName().GetName() == "XYZ") {
|
||||
const PdfPage *page = tree->GetPage(dest[0].GetReference());
|
||||
if (page) {
|
||||
unsigned int pagenum = page->GetPageNumber();
|
||||
double left = dest[2].GetReal(), top = dest[3].GetReal();
|
||||
long long zoom = dest[4].GetNumber();
|
||||
const std::string &anchor = itres->first.GetName();
|
||||
PyObject *tuple = Py_BuildValue("NIddL", PyUnicode_DecodeUTF8(anchor.c_str(), anchor.length(), "replace"), pagenum, left, top, zoom);
|
||||
if (!tuple) { break; }
|
||||
else { int ret = PyList_Append(ans, tuple); Py_DECREF(tuple); if (ret != 0) break; }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch(const PdfError & err) {
|
||||
podofo_set_exception(err);
|
||||
Py_CLEAR(ans);
|
||||
return NULL;
|
||||
} catch (...) {
|
||||
PyErr_SetString(PyExc_ValueError, "An unknown error occurred while trying to set the box");
|
||||
Py_CLEAR(ans);
|
||||
return NULL;
|
||||
}
|
||||
if (PyErr_Occurred()) { Py_CLEAR(ans); return NULL; }
|
||||
return ans;
|
||||
} // }}}
|
||||
|
||||
// Properties {{{
|
||||
|
||||
static PyObject *
|
||||
@ -570,6 +620,9 @@ static PyMethodDef PDFDoc_methods[] = {
|
||||
{"image_count", (PyCFunction)PDFDoc_image_count, METH_VARARGS,
|
||||
"image_count() -> Number of images in the PDF."
|
||||
},
|
||||
{"extract_anchors", (PyCFunction)PDFDoc_extract_anchors, METH_VARARGS,
|
||||
"extract_anchors() -> Extract information about links in the document."
|
||||
},
|
||||
{"delete_page", (PyCFunction)PDFDoc_delete_page, METH_VARARGS,
|
||||
"delete_page(page_num) -> Delete the specified page from the pdf (0 is the first page)."
|
||||
},
|
||||
|
Loading…
x
Reference in New Issue
Block a user