Get anchor locations from PDF segments

This commit is contained in:
Kovid Goyal 2019-07-10 20:14:41 +05:30
parent 600f71fda6
commit 0cebe16938
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 111 additions and 8 deletions

View File

@ -7,6 +7,7 @@ from __future__ import absolute_import, division, print_function, unicode_litera
import json
import os
import signal
from collections import namedtuple
from io import BytesIO
from PyQt5.Qt import QApplication, QMarginsF, QPageLayout, QTimer, QUrl
@ -14,8 +15,10 @@ from PyQt5.QtWebEngineWidgets import QWebEnginePage
from calibre.constants import iswindows
from calibre.ebooks.metadata.xmp import metadata_to_xmp_packet
from calibre.ebooks.oeb.base import XHTML
from calibre.ebooks.oeb.polish.container import Container as ContainerBase
from calibre.ebooks.oeb.polish.split import merge_html
from calibre.ebooks.oeb.polish.toc import get_toc
from calibre.ebooks.pdf.image_writer import (
Image, PDFMetadata, draw_image_page, get_page_layout
)
@ -24,7 +27,8 @@ from calibre.gui2 import setup_unix_signals
from calibre.gui2.webengine import secure_webengine
from calibre.utils.logging import default_log
from calibre.utils.podofo import get_podofo, set_metadata_implementation
from polyglot.builtins import range
from calibre.utils.short_uuid import uuid4
from polyglot.builtins import iteritems, range
OK, LOAD_FAILED, KILL_SIGNAL = range(0, 3)
@ -188,15 +192,110 @@ def render_name(container, name, margins, renderer, page_layout):
return pdf_doc
def add_anchors_markup(root, uuid, anchors):
    """Append a block of helper links to the last child of ``root``.

    A ``<div>`` with ``id=uuid`` that forces a page break before itself is
    added to ``root[-1]`` (presumably the ``<body>`` element -- confirm
    against callers). Inside it goes one ``<a href="#anchor">`` per entry of
    ``anchors``, labelled with its enumeration index, followed by a final
    link labelled ``top`` that targets the div itself. Every link is
    followed by a single space.

    :param root: parsed document tree whose last child receives the div
    :param uuid: identifier used both as the div id and the ``top`` target
    :param anchors: iterable of fragment identifiers to link to
    """
    def append_link(parent, target, label):
        # Build one <a href="#target">label</a> followed by a space.
        link = parent.makeelement(XHTML('a'), href='#' + target)
        link.text = label
        link.tail = ' '
        parent.append(link)

    body = root[-1]
    holder = body.makeelement(XHTML('div'), id=uuid, style='page-break-before: always')
    body.append(holder)
    for index, fragment in enumerate(anchors):
        append_link(holder, fragment, '{}'.format(index))
    # Self-referencing link placed after all the anchor links.
    append_link(holder, uuid, 'top')
def add_toc_links(container, toc, margin_groups):
    """Add a links block for the ToC targets to the first file of each group.

    Collects, per destination file, the set of fragments referenced by the
    descendants of ``toc`` (entries lacking either ``dest`` or ``frag`` are
    ignored). Then, for every group in ``margin_groups``, the name stored at
    ``group[0][0]`` has the links block appended via ``add_anchors_markup``
    and is marked dirty in the container.

    :return: the freshly generated uuid shared by all the added blocks
    """
    uuid = uuid4()

    # destination file name -> set of fragments the ToC points at
    frags_by_name = {}
    for entry in toc.iterdescendants():
        if not (entry.dest and entry.frag):
            continue
        frags_by_name.setdefault(entry.dest, set()).add(entry.frag)

    for group in margin_groups:
        first_item = group[0]
        name = first_item[0]
        wanted = frags_by_name.get(name, set())
        add_anchors_markup(container.parsed(name), uuid, wanted)
        container.dirty(name)

    return uuid
def make_anchors_unique(container):
    """Give every ``id`` in the spine a book-wide unique value and fix links.

    First pass: for each spine file, every element carrying an ``id``
    attribute is renamed to ``a<N>``, where N is a running counter over all
    such elements, and the ``(file name, old id) -> new id`` pair is
    recorded. If a file contains the same id more than once, only the first
    occurrence is renamed and recorded.

    Second pass: for each name in ``container.mime_map``, links are passed
    through ``container.replace_links`` so that fragments pointing at a
    renamed id are updated. URLs without a fragment, URLs that do not
    resolve to a container name and fragments that were not renamed are
    returned unchanged.

    :param container: object providing ``spine_names``, ``parsed``,
        ``mime_map``, ``href_to_name`` and ``replace_links``
    """
    mapping = {}  # (file name, original id) -> new unique id
    count = 0
    base = None  # name of the file whose links are currently being rewritten

    def replacer(url):
        if not url or '#' not in url:
            return url
        if url.startswith('#'):
            # Same-file link. ``base`` is already a container name, so it is
            # used directly: resolving it again through href_to_name() would
            # treat it as an href relative to its own directory and produce
            # the wrong name for any file that lives in a sub-directory.
            href, frag = base, url[1:]
            name = base
        else:
            href, frag = url.partition('#')[::2]
            name = container.href_to_name(href, base)
        if not name:
            return url
        new_frag = mapping.get((name, frag))
        if new_frag is None:
            return url
        # Signal that at least one link in this file was changed.
        replacer.replaced = True
        if url.startswith('#'):
            return '#' + new_frag
        return href + '#' + new_frag

    # NOTE(review): the trees modified below are not marked dirty here;
    # confirm that callers dirty/commit them as needed.
    for spine_name, is_linear in container.spine_names:
        root = container.parsed(spine_name)
        for elem in root.xpath('//*[@id]'):
            # The counter advances for every element, renamed or not.
            count += 1
            key = spine_name, elem.get('id')
            if key not in mapping:
                new_id = mapping[key] = 'a{}'.format(count)
                elem.set('id', new_id)

    for name in container.mime_map:
        base = name
        replacer.replaced = False
        container.replace_links(name, replacer)
# Position of a named destination: page number plus left/top/zoom values.
AnchorLocation = namedtuple('AnchorLocation', 'pagenum left top zoom')


def get_anchor_locations(pdf_doc, first_page_num, toc_uuid):
    """Return the anchor locations of ``pdf_doc`` and strip its links pages.

    The anchors are read with ``pdf_doc.extract_anchors()``. The entry for
    ``toc_uuid`` is removed from them and its page number marks the first
    page to drop: that page and everything after it are deleted from
    ``pdf_doc``. The remaining anchors are returned with their page numbers
    shifted by ``first_page_num - 1``.

    :param pdf_doc: object providing ``extract_anchors``, ``page_count`` and
        ``delete_page``
    :param first_page_num: page number to assign to page 1 of ``pdf_doc``
    :param toc_uuid: anchor name marking the start of the pages to delete
    :return: dict mapping anchor name to :class:`AnchorLocation`
    :raises KeyError: if ``toc_uuid`` is not among the extracted anchors
    """
    destinations = pdf_doc.extract_anchors()
    toc_pagenum = destinations.pop(toc_uuid)[0]

    # Delete from the last page backwards so the remaining indices stay
    # valid. delete_page() is given page number minus one.
    page_number = pdf_doc.page_count()
    while page_number >= toc_pagenum:
        pdf_doc.delete_page(page_number - 1)
        page_number -= 1

    def shifted(loc):
        # Copy the location and offset only its page-number field.
        fields = list(loc)
        fields[0] += first_page_num - 1
        return AnchorLocation(*fields)

    return {anchor: shifted(destinations[anchor]) for anchor in destinations}
def convert(opf_path, opts, metadata=None, output_path=None, log=default_log, cover_data=None):
container = Container(opf_path, log)
make_anchors_unique(container)
margin_groups = create_margin_groups(container)
toc = get_toc(container)
toc_uuid = add_toc_links(container, toc, margin_groups)
container.commit()
renderer = Renderer(opts)
page_layout = get_page_layout(opts)
pdf_doc = None
anchor_locations = {}
num_pages = 0
for group in margin_groups:
doc = render_name(container, group[0][0], group[0][1], renderer, page_layout)
name, margins = group[0]
doc = render_name(container, name, margins, renderer, page_layout)
anchor_locations.update(get_anchor_locations(doc, num_pages + 1, toc_uuid))
num_pages += doc.page_count()
if pdf_doc is None:
pdf_doc = doc
else:

View File

@ -181,7 +181,7 @@ PDFDoc_image_count(PDFDoc *self, PyObject *args) {
return Py_BuildValue("i", count);
} // }}}
// delete_page {{{
// delete_page() {{{
static PyObject *
PDFDoc_delete_page(PDFDoc *self, PyObject *args) {
int num = 0;
@ -367,11 +367,12 @@ error:
} // }}}
// extract_links() {{{
// extract_anchors() {{{
static PyObject *
PDFDoc_extract_anchors(PDFDoc *self, PyObject *args) {
const PdfObject* catalog = NULL;
PyObject *ans = PyList_New(0);
PyObject *ans = PyDict_New();
if (ans == NULL) return NULL;
try {
if ((catalog = self->doc->GetCatalog()) != NULL) {
const PdfObject *dests_ref = catalog->GetDictionary().GetKey("Dests");
@ -393,9 +394,12 @@ PDFDoc_extract_anchors(PDFDoc *self, PyObject *args) {
double left = dest[2].GetReal(), top = dest[3].GetReal();
long long zoom = dest[4].GetNumber();
const std::string &anchor = itres->first.GetName();
PyObject *tuple = Py_BuildValue("NIddL", PyUnicode_DecodeUTF8(anchor.c_str(), anchor.length(), "replace"), pagenum, left, top, zoom);
if (!tuple) { break; }
else { int ret = PyList_Append(ans, tuple); Py_DECREF(tuple); if (ret != 0) break; }
PyObject *key = PyUnicode_DecodeUTF8(anchor.c_str(), anchor.length(), "replace");
PyObject *tuple = Py_BuildValue("IddL", pagenum, left, top, zoom);
if (!tuple || !key) { break; }
int ret = PyDict_SetItem(ans, key, tuple);
Py_DECREF(key); Py_DECREF(tuple);
if (ret != 0) break;
}
}
}