Get document links working when PDF is generated in parts

2025-07-08 10:44:09 -04:00 · 2019-07-11 17:27:32 +05:30 · 2019-07-11 17:27:32 +05:30 · f0584b8fdb
commit f0584b8fdb
parent 7b03c7567c
4 changed files with 159 additions and 45 deletions
--- a/src/calibre/ebooks/oeb/polish/container.py
+++ b/src/calibre/ebooks/oeb/polish/container.py
@ -439,13 +439,17 @@ class Container(ContainerBase):  # {{{
        using the :class:`LinkReplacer` and :class:`LinkRebaser` classes. '''
        media_type = self.mime_map.get(name, guess_type(name))
        if name == self.opf_name:
+            replace_func.file_type = 'opf'
            for elem in self.opf_xpath('//*[@href]'):
                elem.set('href', replace_func(elem.get('href')))
        elif media_type.lower() in OEB_DOCS:
+            replace_func.file_type = 'text'
            rewrite_links(self.parsed(name), replace_func)
        elif media_type.lower() in OEB_STYLES:
+            replace_func.file_type = 'style'
            replaceUrls(self.parsed(name), replace_func)
        elif media_type.lower() == guess_type('toc.ncx'):
+            replace_func.file_type = 'ncx'
            for elem in self.parsed(name).xpath('//*[@src]'):
                elem.set('src', replace_func(elem.get('src')))

--- a/src/calibre/ebooks/oeb/polish/toc.py
+++ b/src/calibre/ebooks/oeb/polish/toc.py
@ -110,7 +110,7 @@ class TOC(object):
        return ans

    def __str__(self):
-        return b'\n'.join([x.encode('utf-8') for x in self.get_lines()])
+        return '\n'.join(self.get_lines())

    def to_dict(self, node_counter=None):
        ans = {
--- a/src/calibre/ebooks/pdf/html_writer.py
+++ b/src/calibre/ebooks/pdf/html_writer.py
@ -29,6 +29,7 @@ from calibre.utils.logging import default_log
 from calibre.utils.podofo import get_podofo, set_metadata_implementation
 from calibre.utils.short_uuid import uuid4
 from polyglot.builtins import iteritems, range
+from polyglot.urllib import urlparse

 OK, LOAD_FAILED, KILL_SIGNAL = range(0, 3)

@ -205,15 +206,12 @@ def add_anchors_markup(root, uuid, anchors):
    div[-1].tail = ' '


-def add_toc_links(container, toc, margin_groups):
-    # TODO: Change this to work for all anchors so it can be used to fix
-    # arbitrary links
+def add_all_links(container, margin_groups):
    uuid = uuid4()
    name_anchor_map = {}
-    for item in toc.iterdescendants():
-        if item.dest and item.frag:
-            anchors = name_anchor_map.setdefault(item.dest, set())
-            anchors.add(item.frag)
+    for name, is_linear in container.spine_names:
+        root = container.parsed(name)
+        name_anchor_map[name] = frozenset(root.xpath('//*/@id'))
    for group in margin_groups:
        name = group[0][0]
        anchors = name_anchor_map.get(name, set())
@ -226,12 +224,15 @@ def make_anchors_unique(container):
    mapping = {}
    count = 0
    base = None
+    spine_names = set()

    def replacer(url):
+        if replacer.file_type != 'text':
+            return url
        if not url:
            return url
        if '#' not in url:
-            return url
+            url += '#'
        if url.startswith('#'):
            href, frag = base, url[1:]
        else:
@ -239,16 +240,21 @@ def make_anchors_unique(container):
        name = container.href_to_name(href, base)
        if not name:
            return url
+        if not frag and name in spine_names:
+            replacer.replaced = True
+            return 'https://calibre-pdf-anchor.n#' + name
        key = name, frag
        new_frag = mapping.get(key)
        if new_frag is None:
            return url
        replacer.replaced = True
+        return 'https://calibre-pdf-anchor.a#' + new_frag
        if url.startswith('#'):
            return '#' + new_frag
        return href + '#' + new_frag

    for spine_name, is_linear in container.spine_names:
+        spine_names.add(spine_name)
        root = container.parsed(spine_name)
        for elem in root.xpath('//*[@id]'):
            count += 1
@ -279,23 +285,48 @@ def get_anchor_locations(pdf_doc, first_page_num, toc_uuid):
    return ans


+def fix_links(pdf_doc, anchor_locations, name_page_numbers, mark_links, log):
+
+    def replace_link(url):
+        purl = urlparse(url)
+        if purl.scheme != 'https' or purl.netloc not in ('calibre-pdf-anchor.a', 'calibre-pdf-anchor.n'):
+            return
+        loc = None
+        if purl.netloc == 'calibre-pdf-anchor.a':
+            loc = anchor_locations.get(purl.fragment)
+            if loc is None:
+                log.warn('Anchor location for link to {} not found'.format(purl.fragment))
+        else:
+            pnum = name_page_numbers.get(purl.fragment)
+            if pnum is None:
+                log.warn('Anchor location for link to {} not found'.format(purl.fragment))
+            else:
+                loc = AnchorLocation(pnum, 0, 0, 0)
+        return loc
+
+    pdf_doc.alter_links(replace_link, mark_links)
+
+
 def convert(opf_path, opts, metadata=None, output_path=None, log=default_log, cover_data=None):
    container = Container(opf_path, log)
    make_anchors_unique(container)
    margin_groups = create_margin_groups(container)
+    links_page_uuid = add_all_links(container, margin_groups)
    toc = get_toc(container)
-    toc_uuid = add_toc_links(container, toc, margin_groups)
+    (toc)
    container.commit()

    renderer = Renderer(opts)
    page_layout = get_page_layout(opts)
    pdf_doc = None
    anchor_locations = {}
+    name_page_numbers = {}
    num_pages = 0
    for group in margin_groups:
        name, margins = group[0]
+        name_page_numbers[name] = num_pages + 1
        doc = render_name(container, name, margins, renderer, page_layout)
-        anchor_locations.update(get_anchor_locations(doc, num_pages + 1, toc_uuid))
+        anchor_locations.update(get_anchor_locations(doc, num_pages + 1, links_page_uuid))
        num_pages += doc.page_count()

        if pdf_doc is None:
@ -303,7 +334,7 @@ def convert(opf_path, opts, metadata=None, output_path=None, log=default_log, co
        else:
            pdf_doc.append(doc)

-    # TODO: Fix links using anchor_locations
+    fix_links(pdf_doc, anchor_locations, name_page_numbers, opts.pdf_mark_links, log)

    if cover_data:
        add_cover(pdf_doc, cover_data, page_layout, opts)
--- a/src/calibre/utils/podofo/doc.cpp
+++ b/src/calibre/utils/podofo/doc.cpp
@ -374,39 +374,39 @@ PDFDoc_extract_anchors(PDFDoc *self, PyObject *args) {
    PyObject *ans = PyDict_New();
 	if (ans == NULL) return NULL;
    try {
-            if ((catalog = self->doc->GetCatalog()) != NULL) {
-                const PdfObject *dests_ref = catalog->GetDictionary().GetKey("Dests");
-                PdfPagesTree *tree = self->doc->GetPagesTree();
-                if (dests_ref && dests_ref->IsReference()) {
-                    const PdfObject *dests_obj = self->doc->GetObjects().GetObject(dests_ref->GetReference());
-                    if (dests_obj && dests_obj->IsDictionary()) {
-                        const PdfDictionary &dests = dests_obj->GetDictionary();
-                        const TKeyMap &keys = dests.GetKeys();
-                        for (TCIKeyMap itres = keys.begin(); itres != keys.end(); ++itres) {
-                            if (itres->second->IsArray()) {
-                                const PdfArray &dest = itres->second->GetArray();
-                                // see section 8.2 of PDF spec for different types of destination arrays
-                                // but chromium apparently generates only [page /XYZ left top zoom] type arrays
-                                if (dest.GetSize() > 4 && dest[1].IsName() && dest[1].GetName().GetName() == "XYZ") {
-                                    const PdfPage *page = tree->GetPage(dest[0].GetReference());
-                                    if (page) {
-                                        unsigned int pagenum = page->GetPageNumber();
-                                        double left = dest[2].GetReal(), top = dest[3].GetReal();
-                                        long long zoom = dest[4].GetNumber();
-                                        const std::string &anchor = itres->first.GetName();
-										PyObject *key = PyUnicode_DecodeUTF8(anchor.c_str(), anchor.length(), "replace");
-                                        PyObject *tuple = Py_BuildValue("IddL", pagenum, left, top, zoom);
-                                        if (!tuple || !key) { break; }
-										int ret = PyDict_SetItem(ans, key, tuple);
-										Py_DECREF(key); Py_DECREF(tuple);
-										if (ret != 0) break;
-                                    }
-                                }
-                            }
-                        }
-                    }
-                }
-            }
+		if ((catalog = self->doc->GetCatalog()) != NULL) {
+			const PdfObject *dests_ref = catalog->GetDictionary().GetKey("Dests");
+			PdfPagesTree *tree = self->doc->GetPagesTree();
+			if (dests_ref && dests_ref->IsReference()) {
+				const PdfObject *dests_obj = self->doc->GetObjects().GetObject(dests_ref->GetReference());
+				if (dests_obj && dests_obj->IsDictionary()) {
+					const PdfDictionary &dests = dests_obj->GetDictionary();
+					const TKeyMap &keys = dests.GetKeys();
+					for (TCIKeyMap itres = keys.begin(); itres != keys.end(); ++itres) {
+						if (itres->second->IsArray()) {
+							const PdfArray &dest = itres->second->GetArray();
+							// see section 8.2 of PDF spec for different types of destination arrays
+							// but chromium apparently generates only [page /XYZ left top zoom] type arrays
+							if (dest.GetSize() > 4 && dest[1].IsName() && dest[1].GetName().GetName() == "XYZ") {
+								const PdfPage *page = tree->GetPage(dest[0].GetReference());
+								if (page) {
+									unsigned int pagenum = page->GetPageNumber();
+									double left = dest[2].GetReal(), top = dest[3].GetReal();
+									long long zoom = dest[4].GetNumber();
+									const std::string &anchor = itres->first.GetName();
+									PyObject *key = PyUnicode_DecodeUTF8(anchor.c_str(), anchor.length(), "replace");
+									PyObject *tuple = Py_BuildValue("IddL", pagenum, left, top, zoom);
+									if (!tuple || !key) { break; }
+									int ret = PyDict_SetItem(ans, key, tuple);
+									Py_DECREF(key); Py_DECREF(tuple);
+									if (ret != 0) break;
+								}
+							}
+						}
+					}
+				}
+			}
+		}
    } catch(const PdfError & err) {
        podofo_set_exception(err);
        Py_CLEAR(ans);
@ -420,6 +420,82 @@ PDFDoc_extract_anchors(PDFDoc *self, PyObject *args) {
    return ans;
 } // }}}

+// alter_links() {{{
+
+// Report whether dictionary d has, under key, a PDF name object whose text
+// equals name. T is whatever key type PdfDictionary::GetKey() accepts.
+template<typename T>
+static inline bool
+dictionary_has_key_name(PdfDictionary &d, T key, const char *name) {
+	const PdfObject *entry = d.GetKey(key);
+	return entry != NULL && entry->IsName() && entry->GetName().GetName() == name;
+}
+
+
+// alter_links(callback, mark_links) -> number of links rewritten
+//
+// Walks every object in the document and, for each /Annot /Link dictionary
+// that carries a URI action, calls callback(uri). When the callback returns a
+// 4-tuple (pagenum, left, top, zoom) the URI action is replaced by an
+// explicit /XYZ destination on that (1-based) page; any other return value
+// leaves the link untouched. If mark_links is true, every link annotation
+// gets a visible red border.
+//
+// Returns NULL with a Python exception set if argument parsing fails, the
+// callback raises, the returned tuple has the wrong element types, the page
+// number does not exist, or PoDoFo reports an error.
+static PyObject *
+PDFDoc_alter_links(PDFDoc *self, PyObject *args) {
+	int count = 0;
+	static const PdfName XYZ("XYZ");
+	PyObject *alter_callback, *py_mark_links;
+	if (!PyArg_ParseTuple(args, "OO", &alter_callback, &py_mark_links)) return NULL;
+	// PyObject_IsTrue() returns -1 with an exception set when truth testing fails
+	const int truth = PyObject_IsTrue(py_mark_links);
+	if (truth < 0) return NULL;
+	const bool mark_links = truth > 0;
+	try {
+		PdfArray border, link_color;
+		border.push_back((PoDoFo::pdf_int64)16); border.push_back((PoDoFo::pdf_int64)16); border.push_back((PoDoFo::pdf_int64)1);
+		link_color.push_back(1.); link_color.push_back(0.); link_color.push_back(0.);
+		for (TCIVecObjects it = self->doc->GetObjects().begin(); it != self->doc->GetObjects().end(); it++) {
+			if (!(*it)->IsDictionary()) continue;
+			PdfDictionary &link = (*it)->GetDictionary();
+			if (!dictionary_has_key_name(link, PdfName::KeyType, "Annot") || !dictionary_has_key_name(link, PdfName::KeySubtype, "Link")) continue;
+			if (mark_links) {
+				link.AddKey("Border", border);
+				link.AddKey("C", link_color);
+			}
+			if (!link.HasKey("A") || !link.GetKey("A")->IsDictionary()) continue;
+			PdfDictionary &A = link.GetKey("A")->GetDictionary();
+			if (!dictionary_has_key_name(A, PdfName::KeyType, "Action") || !dictionary_has_key_name(A, "S", "URI")) continue;
+			PdfObject *uo = A.GetKey("URI");
+			if (!uo || !uo->IsString()) continue;
+
+			const std::string &uri = uo->GetString().GetStringUtf8();
+			PyObject *py_uri = PyUnicode_DecodeUTF8(uri.c_str(), uri.length(), "replace");
+			if (!py_uri) return NULL;
+			// The call does not steal py_uri, so release it ourselves
+			PyObject *ret = PyObject_CallFunctionObjArgs(alter_callback, py_uri, NULL);
+			Py_DECREF(py_uri);
+			if (!ret) return NULL;
+
+			// Copy the values out and drop ret before touching PoDoFo again,
+			// so a PdfError thrown below cannot leak the reference
+			int pagenum = 0; double left = 0, top = 0; long long zoom = 0;
+			const bool is_dest = PyTuple_Check(ret) && PyTuple_GET_SIZE(ret) == 4;
+			const bool parsed = is_dest && PyArg_ParseTuple(ret, "iddL", &pagenum, &left, &top, &zoom);
+			Py_DECREF(ret);
+			if (!is_dest) continue;
+			// A 4-tuple of the wrong types: PyArg_ParseTuple has set the exception
+			if (!parsed) return NULL;
+
+			PdfPage *page = NULL;
+			try {
+				page = self->doc->GetPage(pagenum - 1);
+			} catch(const PdfError &) {
+				PyErr_Format(PyExc_ValueError, "No page number %d in the PDF file", pagenum);
+				return NULL;
+			}
+			if (page) {
+				const PdfReference &pageref = page->GetObject()->Reference();
+				PdfArray dest;
+				dest.push_back(pageref);
+				dest.push_back(XYZ);
+				dest.push_back(left);
+				dest.push_back(top);
+				dest.push_back((PoDoFo::pdf_int64)zoom);
+				// A and uo dangle after this; neither is used again
+				link.RemoveKey("A");
+				link.AddKey("Dest", dest);
+				count++;
+			}
+		}
+	} catch(const PdfError & err) {
+		podofo_set_exception(err);
+		return NULL;
+	}
+	return Py_BuildValue("i", count);
+} // }}}
+
 // Properties {{{

 static PyObject *
@ -645,6 +721,9 @@ static PyMethodDef PDFDoc_methods[] = {
    {"extract_anchors", (PyCFunction)PDFDoc_extract_anchors, METH_VARARGS,
     "extract_anchors() -> Extract information about links in the document."
    },
+    {"alter_links", (PyCFunction)PDFDoc_alter_links, METH_VARARGS,
+     "alter_links() -> Change links in the document."
+    },
    {"delete_page", (PyCFunction)PDFDoc_delete_page, METH_VARARGS,
     "delete_page(page_num) -> Delete the specified page from the pdf (0 is the first page)."
    },