mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
Get anchor locations from PDF segments
This commit is contained in:
parent
600f71fda6
commit
0cebe16938
@ -7,6 +7,7 @@ from __future__ import absolute_import, division, print_function, unicode_litera
|
||||
import json
|
||||
import os
|
||||
import signal
|
||||
from collections import namedtuple
|
||||
from io import BytesIO
|
||||
|
||||
from PyQt5.Qt import QApplication, QMarginsF, QPageLayout, QTimer, QUrl
|
||||
@ -14,8 +15,10 @@ from PyQt5.QtWebEngineWidgets import QWebEnginePage
|
||||
|
||||
from calibre.constants import iswindows
|
||||
from calibre.ebooks.metadata.xmp import metadata_to_xmp_packet
|
||||
from calibre.ebooks.oeb.base import XHTML
|
||||
from calibre.ebooks.oeb.polish.container import Container as ContainerBase
|
||||
from calibre.ebooks.oeb.polish.split import merge_html
|
||||
from calibre.ebooks.oeb.polish.toc import get_toc
|
||||
from calibre.ebooks.pdf.image_writer import (
|
||||
Image, PDFMetadata, draw_image_page, get_page_layout
|
||||
)
|
||||
@ -24,7 +27,8 @@ from calibre.gui2 import setup_unix_signals
|
||||
from calibre.gui2.webengine import secure_webengine
|
||||
from calibre.utils.logging import default_log
|
||||
from calibre.utils.podofo import get_podofo, set_metadata_implementation
|
||||
from polyglot.builtins import range
|
||||
from calibre.utils.short_uuid import uuid4
|
||||
from polyglot.builtins import iteritems, range
|
||||
|
||||
OK, LOAD_FAILED, KILL_SIGNAL = range(0, 3)
|
||||
|
||||
@ -188,15 +192,110 @@ def render_name(container, name, margins, renderer, page_layout):
|
||||
return pdf_doc
|
||||
|
||||
|
||||
def add_anchors_markup(root, uuid, anchors):
|
||||
body = root[-1]
|
||||
div = body.makeelement(XHTML('div'), id=uuid, style='page-break-before: always')
|
||||
body.append(div)
|
||||
for i, anchor in enumerate(anchors):
|
||||
div.append(div.makeelement(XHTML('a'), href='#' + anchor))
|
||||
div[-1].text = '{}'.format(i)
|
||||
div[-1].tail = ' '
|
||||
div.append(div.makeelement(XHTML('a'), href='#' + uuid))
|
||||
div[-1].text = 'top'
|
||||
div[-1].tail = ' '
|
||||
|
||||
|
||||
def add_toc_links(container, toc, margin_groups):
|
||||
uuid = uuid4()
|
||||
name_anchor_map = {}
|
||||
for item in toc.iterdescendants():
|
||||
if item.dest and item.frag:
|
||||
anchors = name_anchor_map.setdefault(item.dest, set())
|
||||
anchors.add(item.frag)
|
||||
for group in margin_groups:
|
||||
name = group[0][0]
|
||||
anchors = name_anchor_map.get(name, set())
|
||||
add_anchors_markup(container.parsed(name), uuid, anchors)
|
||||
container.dirty(name)
|
||||
return uuid
|
||||
|
||||
|
||||
def make_anchors_unique(container):
|
||||
mapping = {}
|
||||
count = 0
|
||||
base = None
|
||||
|
||||
def replacer(url):
|
||||
if not url:
|
||||
return url
|
||||
if '#' not in url:
|
||||
return url
|
||||
if url.startswith('#'):
|
||||
href, frag = base, url[1:]
|
||||
else:
|
||||
href, frag = url.partition('#')[::2]
|
||||
name = container.href_to_name(href, base)
|
||||
if not name:
|
||||
return url
|
||||
key = name, frag
|
||||
new_frag = mapping.get(key)
|
||||
if new_frag is None:
|
||||
return url
|
||||
replacer.replaced = True
|
||||
if url.startswith('#'):
|
||||
return '#' + new_frag
|
||||
return href + '#' + new_frag
|
||||
|
||||
for spine_name, is_linear in container.spine_names:
|
||||
root = container.parsed(spine_name)
|
||||
for elem in root.xpath('//*[@id]'):
|
||||
count += 1
|
||||
key = spine_name, elem.get('id')
|
||||
if key not in mapping:
|
||||
new_id = mapping[key] = 'a{}'.format(count)
|
||||
elem.set('id', new_id)
|
||||
|
||||
for name in container.mime_map:
|
||||
base = name
|
||||
replacer.replaced = False
|
||||
container.replace_links(name, replacer)
|
||||
|
||||
|
||||
AnchorLocation = namedtuple('AnchorLocation', 'pagenum left top zoom')
|
||||
|
||||
|
||||
def get_anchor_locations(pdf_doc, first_page_num, toc_uuid):
|
||||
ans = {}
|
||||
anchors = pdf_doc.extract_anchors()
|
||||
toc_pagenum = anchors.pop(toc_uuid)[0]
|
||||
for r in range(pdf_doc.page_count(), toc_pagenum - 1, -1):
|
||||
pdf_doc.delete_page(r - 1)
|
||||
for anchor, loc in iteritems(anchors):
|
||||
loc = list(loc)
|
||||
loc[0] += first_page_num - 1
|
||||
ans[anchor] = AnchorLocation(*loc)
|
||||
return ans
|
||||
|
||||
|
||||
def convert(opf_path, opts, metadata=None, output_path=None, log=default_log, cover_data=None):
|
||||
container = Container(opf_path, log)
|
||||
make_anchors_unique(container)
|
||||
margin_groups = create_margin_groups(container)
|
||||
toc = get_toc(container)
|
||||
toc_uuid = add_toc_links(container, toc, margin_groups)
|
||||
container.commit()
|
||||
|
||||
renderer = Renderer(opts)
|
||||
page_layout = get_page_layout(opts)
|
||||
pdf_doc = None
|
||||
anchor_locations = {}
|
||||
num_pages = 0
|
||||
for group in margin_groups:
|
||||
doc = render_name(container, group[0][0], group[0][1], renderer, page_layout)
|
||||
name, margins = group[0]
|
||||
doc = render_name(container, name, margins, renderer, page_layout)
|
||||
anchor_locations.update(get_anchor_locations(doc, num_pages + 1, toc_uuid))
|
||||
num_pages += doc.page_count()
|
||||
|
||||
if pdf_doc is None:
|
||||
pdf_doc = doc
|
||||
else:
|
||||
|
@ -181,7 +181,7 @@ PDFDoc_image_count(PDFDoc *self, PyObject *args) {
|
||||
return Py_BuildValue("i", count);
|
||||
} // }}}
|
||||
|
||||
// delete_page {{{
|
||||
// delete_page() {{{
|
||||
static PyObject *
|
||||
PDFDoc_delete_page(PDFDoc *self, PyObject *args) {
|
||||
int num = 0;
|
||||
@ -367,11 +367,12 @@ error:
|
||||
|
||||
} // }}}
|
||||
|
||||
// extract_links() {{{
|
||||
// extract_anchors() {{{
|
||||
static PyObject *
|
||||
PDFDoc_extract_anchors(PDFDoc *self, PyObject *args) {
|
||||
const PdfObject* catalog = NULL;
|
||||
PyObject *ans = PyList_New(0);
|
||||
PyObject *ans = PyDict_New();
|
||||
if (ans == NULL) return NULL;
|
||||
try {
|
||||
if ((catalog = self->doc->GetCatalog()) != NULL) {
|
||||
const PdfObject *dests_ref = catalog->GetDictionary().GetKey("Dests");
|
||||
@ -393,9 +394,12 @@ PDFDoc_extract_anchors(PDFDoc *self, PyObject *args) {
|
||||
double left = dest[2].GetReal(), top = dest[3].GetReal();
|
||||
long long zoom = dest[4].GetNumber();
|
||||
const std::string &anchor = itres->first.GetName();
|
||||
PyObject *tuple = Py_BuildValue("NIddL", PyUnicode_DecodeUTF8(anchor.c_str(), anchor.length(), "replace"), pagenum, left, top, zoom);
|
||||
if (!tuple) { break; }
|
||||
else { int ret = PyList_Append(ans, tuple); Py_DECREF(tuple); if (ret != 0) break; }
|
||||
PyObject *key = PyUnicode_DecodeUTF8(anchor.c_str(), anchor.length(), "replace");
|
||||
PyObject *tuple = Py_BuildValue("IddL", pagenum, left, top, zoom);
|
||||
if (!tuple || !key) { break; }
|
||||
int ret = PyDict_SetItem(ans, key, tuple);
|
||||
Py_DECREF(key); Py_DECREF(tuple);
|
||||
if (ret != 0) break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user