Get document links working when PDF is generated in parts

2025-08-11 09:13:57 -04:00 · 2019-07-11 17:27:32 +05:30 · 2019-07-11 17:27:32 +05:30 · f0584b8fdb
commit f0584b8fdb
parent 7b03c7567c
4 changed files with 159 additions and 45 deletions
--- a/src/calibre/ebooks/oeb/polish/container.py
+++ b/src/calibre/ebooks/oeb/polish/container.py
@ -439,13 +439,17 @@ class Container(ContainerBase):  # {{{
        using the :class:`LinkReplacer` and :class:`LinkRebaser` classes. '''
        media_type = self.mime_map.get(name, guess_type(name))
        if name == self.opf_name:
            replace_func.file_type = 'opf'
            for elem in self.opf_xpath('//*[@href]'):
                elem.set('href', replace_func(elem.get('href')))
        elif media_type.lower() in OEB_DOCS:
            replace_func.file_type = 'text'
            rewrite_links(self.parsed(name), replace_func)
        elif media_type.lower() in OEB_STYLES:
            replace_func.file_type = 'style'
            replaceUrls(self.parsed(name), replace_func)
        elif media_type.lower() == guess_type('toc.ncx'):
            replace_func.file_type = 'ncx'
            for elem in self.parsed(name).xpath('//*[@src]'):
                elem.set('src', replace_func(elem.get('src')))
--- a/src/calibre/ebooks/oeb/polish/toc.py
+++ b/src/calibre/ebooks/oeb/polish/toc.py
@ -110,7 +110,7 @@ class TOC(object):
        return ans
    def __str__(self):
-        return b'\n'.join([x.encode('utf-8') for x in self.get_lines()])
+        return '\n'.join(self.get_lines())
    def to_dict(self, node_counter=None):
        ans = {
--- a/src/calibre/ebooks/pdf/html_writer.py
+++ b/src/calibre/ebooks/pdf/html_writer.py
@ -29,6 +29,7 @@ from calibre.utils.logging import default_log
 from calibre.utils.podofo import get_podofo, set_metadata_implementation
 from calibre.utils.short_uuid import uuid4
 from polyglot.builtins import iteritems, range
 from polyglot.urllib import urlparse
 OK, LOAD_FAILED, KILL_SIGNAL = range(0, 3)
@ -205,15 +206,12 @@ def add_anchors_markup(root, uuid, anchors):
    div[-1].tail = ' '
-def add_toc_links(container, toc, margin_groups):
+def add_all_links(container, margin_groups):
    # TODO: Change this to work for all anchors so it can be used to fix
    # arbitrary links
    uuid = uuid4()
    name_anchor_map = {}
-    for item in toc.iterdescendants():
+    for name, is_linear in container.spine_names:
-        if item.dest and item.frag:
+        root = container.parsed(name)
-            anchors = name_anchor_map.setdefault(item.dest, set())
+        name_anchor_map[name] = frozenset(root.xpath('//*/@id'))
            anchors.add(item.frag)
    for group in margin_groups:
        name = group[0][0]
        anchors = name_anchor_map.get(name, set())
@ -226,12 +224,15 @@ def make_anchors_unique(container):
    mapping = {}
    count = 0
    base = None
    spine_names = set()
    def replacer(url):
        if replacer.file_type != 'text':
            return url
        if not url:
            return url
        if '#' not in url:
-            return url
+            url += '#'
        if url.startswith('#'):
            href, frag = base, url[1:]
        else:
@ -239,16 +240,21 @@ def make_anchors_unique(container):
        name = container.href_to_name(href, base)
        if not name:
            return url
        if not frag and name in spine_names:
            replacer.replaced = True
            return 'https://calibre-pdf-anchor.n#' + name
        key = name, frag
        new_frag = mapping.get(key)
        if new_frag is None:
            return url
        replacer.replaced = True
        return 'https://calibre-pdf-anchor.a#' + new_frag
        if url.startswith('#'):
            return '#' + new_frag
        return href + '#' + new_frag
    for spine_name, is_linear in container.spine_names:
        spine_names.add(spine_name)
        root = container.parsed(spine_name)
        for elem in root.xpath('//*[@id]'):
            count += 1
@ -279,23 +285,48 @@ def get_anchor_locations(pdf_doc, first_page_num, toc_uuid):
    return ans
 def fix_links(pdf_doc, anchor_locations, name_page_numbers, mark_links, log):
    '''
    Resolve the placeholder ``https://calibre-pdf-anchor.*`` link URLs in
    *pdf_doc* by handing ``pdf_doc.alter_links()`` a callback.

    :param pdf_doc: Object exposing ``alter_links(callback, mark_links)``.
    :param anchor_locations: Mapping looked up with the URL fragment for
        ``calibre-pdf-anchor.a`` links; the stored value is returned as is.
    :param name_page_numbers: Mapping looked up with the URL fragment for
        ``calibre-pdf-anchor.n`` links; the stored value becomes the first
        field of an ``AnchorLocation`` whose other fields are 0.
    :param mark_links: Passed through unchanged to ``alter_links()``.
    :param log: Object with a ``warn()`` method, used for unresolved links.
    '''
    anchor_host = 'calibre-pdf-anchor.a'
    name_host = 'calibre-pdf-anchor.n'
    def replace_link(url):
        # Returns the new destination for url, or None to leave it alone.
        parts = urlparse(url)
        if parts.scheme != 'https' or parts.netloc not in (anchor_host, name_host):
            return None
        target = parts.fragment
        if parts.netloc == anchor_host:
            found = anchor_locations.get(target)
            if found is None:
                log.warn('Anchor location for link to {} not found'.format(target))
            return found
        # Only the name host can reach this point.
        page_number = name_page_numbers.get(target)
        if page_number is None:
            log.warn('Anchor location for link to {} not found'.format(target))
            return None
        return AnchorLocation(page_number, 0, 0, 0)
    pdf_doc.alter_links(replace_link, mark_links)
 def convert(opf_path, opts, metadata=None, output_path=None, log=default_log, cover_data=None):
    container = Container(opf_path, log)
    make_anchors_unique(container)
    margin_groups = create_margin_groups(container)
    links_page_uuid = add_all_links(container, margin_groups)
    toc = get_toc(container)
-    toc_uuid = add_toc_links(container, toc, margin_groups)
+    (toc)
    container.commit()
    renderer = Renderer(opts)
    page_layout = get_page_layout(opts)
    pdf_doc = None
    anchor_locations = {}
    name_page_numbers = {}
    num_pages = 0
    for group in margin_groups:
        name, margins = group[0]
        name_page_numbers[name] = num_pages + 1
        doc = render_name(container, name, margins, renderer, page_layout)
-        anchor_locations.update(get_anchor_locations(doc, num_pages + 1, toc_uuid))
+        anchor_locations.update(get_anchor_locations(doc, num_pages + 1, links_page_uuid))
        num_pages += doc.page_count()
        if pdf_doc is None:
@ -303,7 +334,7 @@ def convert(opf_path, opts, metadata=None, output_path=None, log=default_log, co
        else:
            pdf_doc.append(doc)
-    # TODO: Fix links using anchor_locations
+    fix_links(pdf_doc, anchor_locations, name_page_numbers, opts.pdf_mark_links, log)
    if cover_data:
        add_cover(pdf_doc, cover_data, page_layout, opts)
--- a/src/calibre/utils/podofo/doc.cpp
+++ b/src/calibre/utils/podofo/doc.cpp
@ -420,6 +420,82 @@ PDFDoc_extract_anchors(PDFDoc *self, PyObject *args) {
    return ans;
 } // }}}
 // alter_links() {{{
 template<typename T>
 static inline bool
 dictionary_has_key_name(PdfDictionary &d, T key, const char *name) {
    // True only when d has an entry under key, that entry is a Name object,
    // and the name it holds compares equal to the given C string.
    const PdfObject *entry = d.GetKey(key);
    if (!entry || !entry->IsName()) return false;
    return entry->GetName().GetName() == name;
 }
 static PyObject *
 PDFDoc_alter_links(PDFDoc *self, PyObject *args) {
    // alter_links(callback, mark_links) -> number of links that were rewritten.
    //
    // Walks every object in the document looking for Link annotations whose
    // action is a URI action. For each one, callback(uri) is invoked with the
    // URI decoded as UTF-8 (undecodable bytes replaced). If the callback
    // returns a 4-tuple (pagenum, left, top, zoom), with pagenum 1-based, the
    // URI action is replaced by an XYZ destination on that page. Any other
    // return value leaves the link untouched. When mark_links is true every
    // Link annotation also gets Border and C entries.
    //
    // Returns NULL with a Python exception set if argument parsing fails, the
    // callback raises, the returned 4-tuple has the wrong item types, the page
    // number does not exist, or PoDoFo reports an error.
    int count = 0;
    static const PdfName XYZ("XYZ");
    PyObject *alter_callback, *py_mark_links;
    if (!PyArg_ParseTuple(args, "OO", &alter_callback, &py_mark_links)) return NULL;
    // PyObject_IsTrue() returns -1 on failure; propagate instead of treating it as true.
    const int truthy = PyObject_IsTrue(py_mark_links);
    if (truthy < 0) return NULL;
    const bool mark_links = truthy > 0;
    try {
        PdfArray border, link_color;
        border.push_back((PoDoFo::pdf_int64)16); border.push_back((PoDoFo::pdf_int64)16); border.push_back((PoDoFo::pdf_int64)1);
        link_color.push_back(1.); link_color.push_back(0.); link_color.push_back(0.);
        for (TCIVecObjects it = self->doc->GetObjects().begin(); it != self->doc->GetObjects().end(); it++) {
            if (!(*it)->IsDictionary()) continue;
            PdfDictionary &link = (*it)->GetDictionary();
            if (!dictionary_has_key_name(link, PdfName::KeyType, "Annot") || !dictionary_has_key_name(link, PdfName::KeySubtype, "Link")) continue;
            if (mark_links) {
                link.AddKey("Border", border);
                link.AddKey("C", link_color);
            }
            if (!link.HasKey("A") || !link.GetKey("A")->IsDictionary()) continue;
            PdfDictionary &A = link.GetKey("A")->GetDictionary();
            if (!dictionary_has_key_name(A, PdfName::KeyType, "Action") || !dictionary_has_key_name(A, "S", "URI")) continue;
            PdfObject *uo = A.GetKey("URI");
            if (!uo || !uo->IsString()) continue;
            const std::string &uri = uo->GetString().GetStringUtf8();
            // Own the argument explicitly so it is released after the call
            // (building a throw-away args tuple inline leaked it), and bail
            // out before calling if the decode itself failed.
            PyObject *py_uri = PyUnicode_DecodeUTF8(uri.c_str(), uri.length(), "replace");
            if (!py_uri) return NULL;
            PyObject *ret = PyObject_CallFunctionObjArgs(alter_callback, py_uri, NULL);
            Py_DECREF(py_uri);
            if (!ret) return NULL;
            int pagenum = 0; double left = 0, top = 0; long long zoom = 0;
            bool have_dest = false;
            if (PyTuple_Check(ret) && PyTuple_GET_SIZE(ret) == 4) {
                // A 4-tuple with wrong item types is an error in the callback:
                // propagate it rather than continuing with an exception pending.
                if (!PyArg_ParseTuple(ret, "iddL", &pagenum, &left, &top, &zoom)) { Py_DECREF(ret); return NULL; }
                have_dest = true;
            }
            // All values are now copied into C locals, so release the result
            // before any PoDoFo call that could throw.
            Py_DECREF(ret);
            if (!have_dest) continue;
            PdfPage *page = NULL;
            try {
                page = self->doc->GetPage(pagenum - 1);
            } catch(const PdfError &) {
                PyErr_Format(PyExc_ValueError, "No page number %d in the PDF file", pagenum);
                return NULL;
            }
            if (!page) continue;
            const PdfReference &pageref = page->GetObject()->Reference();
            PdfArray dest;
            dest.push_back(pageref);
            dest.push_back(XYZ);
            dest.push_back(left);
            dest.push_back(top);
            dest.push_back((PoDoFo::pdf_int64)zoom);
            // Removing "A" invalidates the A and uri references; neither is used below.
            link.RemoveKey("A");
            link.AddKey("Dest", dest);
            count++;
        }
    } catch(const PdfError & err) {
        podofo_set_exception(err);
        return NULL;
    }
    return Py_BuildValue("i", count);
 } // }}}
 // Properties {{{
 static PyObject *
@ -645,6 +721,9 @@ static PyMethodDef PDFDoc_methods[] = {
    {"extract_anchors", (PyCFunction)PDFDoc_extract_anchors, METH_VARARGS,
     "extract_anchors() -> Extract information about links in the document."
    },
    {"alter_links", (PyCFunction)PDFDoc_alter_links, METH_VARARGS,
     "alter_links() -> Change links in the document."
    },
    {"delete_page", (PyCFunction)PDFDoc_delete_page, METH_VARARGS,
     "delete_page(page_num) -> Delete the specified page from the pdf (0 is the first page)."
    },