diff --git a/src/calibre/ebooks/oeb/polish/container.py b/src/calibre/ebooks/oeb/polish/container.py
index 56a2bde352..0fdc11309f 100644
--- a/src/calibre/ebooks/oeb/polish/container.py
+++ b/src/calibre/ebooks/oeb/polish/container.py
@@ -439,13 +439,17 @@ class Container(ContainerBase):  # {{{
         using the :class:`LinkReplacer` and :class:`LinkRebaser` classes. '''
         media_type = self.mime_map.get(name, guess_type(name))
         if name == self.opf_name:
+            replace_func.file_type = 'opf'
             for elem in self.opf_xpath('//*[@href]'):
                 elem.set('href', replace_func(elem.get('href')))
         elif media_type.lower() in OEB_DOCS:
+            replace_func.file_type = 'text'
             rewrite_links(self.parsed(name), replace_func)
         elif media_type.lower() in OEB_STYLES:
+            replace_func.file_type = 'style'
             replaceUrls(self.parsed(name), replace_func)
         elif media_type.lower() == guess_type('toc.ncx'):
+            replace_func.file_type = 'ncx'
             for elem in self.parsed(name).xpath('//*[@src]'):
                 elem.set('src', replace_func(elem.get('src')))
 
diff --git a/src/calibre/ebooks/oeb/polish/toc.py b/src/calibre/ebooks/oeb/polish/toc.py
index 1e7b224070..fb1a899bed 100644
--- a/src/calibre/ebooks/oeb/polish/toc.py
+++ b/src/calibre/ebooks/oeb/polish/toc.py
@@ -110,7 +110,7 @@ class TOC(object):
         return ans
 
     def __str__(self):
-        return b'\n'.join([x.encode('utf-8') for x in self.get_lines()])
+        return '\n'.join(self.get_lines())
 
     def to_dict(self, node_counter=None):
         ans = {
diff --git a/src/calibre/ebooks/pdf/html_writer.py b/src/calibre/ebooks/pdf/html_writer.py
index 1cf6824856..f6ea2d2c07 100644
--- a/src/calibre/ebooks/pdf/html_writer.py
+++ b/src/calibre/ebooks/pdf/html_writer.py
@@ -29,6 +29,7 @@ from calibre.utils.logging import default_log
 from calibre.utils.podofo import get_podofo, set_metadata_implementation
 from calibre.utils.short_uuid import uuid4
 from polyglot.builtins import iteritems, range
+from polyglot.urllib import urlparse
 
 OK, LOAD_FAILED, KILL_SIGNAL = range(0, 3)
 
@@ -205,15 +206,12 @@ def add_anchors_markup(root, uuid, anchors):
     div[-1].tail = ' '
 
 
-def add_toc_links(container, toc, margin_groups):
-    # TODO: Change this to work for all anchors so it can be used to fix
-    # arbitrary links
+def add_all_links(container, margin_groups):
     uuid = uuid4()
     name_anchor_map = {}
-    for item in toc.iterdescendants():
-        if item.dest and item.frag:
-            anchors = name_anchor_map.setdefault(item.dest, set())
-            anchors.add(item.frag)
+    for name, is_linear in container.spine_names:
+        root = container.parsed(name)
+        name_anchor_map[name] = frozenset(root.xpath('//*/@id'))
     for group in margin_groups:
         name = group[0][0]
         anchors = name_anchor_map.get(name, set())
@@ -226,12 +224,15 @@ def make_anchors_unique(container):
     mapping = {}
     count = 0
     base = None
+    spine_names = set()
 
     def replacer(url):
+        if replacer.file_type != 'text':
+            return url
         if not url:
             return url
         if '#' not in url:
-            return url
+            url += '#'
         if url.startswith('#'):
             href, frag = base, url[1:]
         else:
@@ -239,16 +240,18 @@ def make_anchors_unique(container):
         name = container.href_to_name(href, base)
         if not name:
             return url
+        if not frag and name in spine_names:
+            replacer.replaced = True
+            return 'https://calibre-pdf-anchor.n#' + name
         key = name, frag
         new_frag = mapping.get(key)
         if new_frag is None:
             return url
         replacer.replaced = True
-        if url.startswith('#'):
-            return '#' + new_frag
-        return href + '#' + new_frag
+        return 'https://calibre-pdf-anchor.a#' + new_frag
 
     for spine_name, is_linear in container.spine_names:
+        spine_names.add(spine_name)
         root = container.parsed(spine_name)
         for elem in root.xpath('//*[@id]'):
             count += 1
@@ -279,23 +282,61 @@ def get_anchor_locations(pdf_doc, first_page_num, toc_uuid):
     return ans
 
 
+def fix_links(pdf_doc, anchor_locations, name_page_numbers, mark_links, log):
+    '''
+    Turn the placeholder URLs written by make_anchors_unique() into internal
+    destinations in pdf_doc.
+
+    URLs with the host calibre-pdf-anchor.a have an anchor as their fragment,
+    which is looked up in anchor_locations. URLs with the host
+    calibre-pdf-anchor.n have a file name as their fragment, which is looked
+    up in name_page_numbers and mapped to AnchorLocation(pnum, 0, 0, 0). A
+    warning is logged for every target that cannot be found. mark_links is
+    passed through to pdf_doc.alter_links().
+    '''
+
+    def replace_link(url):
+        # Returns None for URLs that are not placeholders and for placeholders
+        # that cannot be resolved
+        purl = urlparse(url)
+        if purl.scheme != 'https' or purl.netloc not in ('calibre-pdf-anchor.a', 'calibre-pdf-anchor.n'):
+            return
+        loc = None
+        if purl.netloc == 'calibre-pdf-anchor.a':
+            loc = anchor_locations.get(purl.fragment)
+            if loc is None:
+                log.warn('Anchor location for link to {} not found'.format(purl.fragment))
+        else:
+            pnum = name_page_numbers.get(purl.fragment)
+            if pnum is None:
+                log.warn('Anchor location for link to {} not found'.format(purl.fragment))
+            else:
+                loc = AnchorLocation(pnum, 0, 0, 0)
+        return loc
+
+    pdf_doc.alter_links(replace_link, mark_links)
+
+
 def convert(opf_path, opts, metadata=None, output_path=None, log=default_log, cover_data=None):
     container = Container(opf_path, log)
     make_anchors_unique(container)
     margin_groups = create_margin_groups(container)
+    links_page_uuid = add_all_links(container, margin_groups)
     toc = get_toc(container)
-    toc_uuid = add_toc_links(container, toc, margin_groups)
+    (toc)  # NOTE(review): no-op expression; presumably only keeps toc from being reported as unused - confirm
     container.commit()
 
     renderer = Renderer(opts)
     page_layout = get_page_layout(opts)
     pdf_doc = None
     anchor_locations = {}
+    name_page_numbers = {}
     num_pages = 0
     for group in margin_groups:
         name, margins = group[0]
+        name_page_numbers[name] = num_pages + 1
         doc = render_name(container, name, margins, renderer, page_layout)
-        anchor_locations.update(get_anchor_locations(doc, num_pages + 1, toc_uuid))
+        anchor_locations.update(get_anchor_locations(doc, num_pages + 1, links_page_uuid))
         num_pages += doc.page_count()
 
         if pdf_doc is None:
@@ -303,7 +344,7 @@ def convert(opf_path, opts, metadata=None, output_path=None, log=default_log, co
         else:
             pdf_doc.append(doc)
 
-    # TODO: Fix links using anchor_locations
+    fix_links(pdf_doc, anchor_locations, name_page_numbers, opts.pdf_mark_links, log)
 
     if cover_data:
         add_cover(pdf_doc, cover_data, page_layout, opts)
diff --git a/src/calibre/utils/podofo/doc.cpp b/src/calibre/utils/podofo/doc.cpp
index e62d42d8b5..d638c02927 100644
--- a/src/calibre/utils/podofo/doc.cpp
+++ b/src/calibre/utils/podofo/doc.cpp
@@ -374,39 +374,39 @@ PDFDoc_extract_anchors(PDFDoc *self, PyObject *args) {
     PyObject *ans = PyDict_New();
     if (ans == NULL) return NULL;
     try {
-		if ((catalog = self->doc->GetCatalog()) != NULL) {
-			const PdfObject *dests_ref = catalog->GetDictionary().GetKey("Dests");
-			PdfPagesTree *tree = self->doc->GetPagesTree();
-			if (dests_ref && dests_ref->IsReference()) {
-				const PdfObject *dests_obj = self->doc->GetObjects().GetObject(dests_ref->GetReference());
-				if (dests_obj && dests_obj->IsDictionary()) {
-					const PdfDictionary &dests = dests_obj->GetDictionary();
-					const TKeyMap &keys = dests.GetKeys();
-					for (TCIKeyMap itres = keys.begin(); itres != keys.end(); ++itres) {
-						if (itres->second->IsArray()) {
-							const PdfArray &dest = itres->second->GetArray();
-							// see section 8.2 of PDF spec for different types of destination arrays
-							// but chromium apparently generates only [page /XYZ left top zoom] type arrays
-							if (dest.GetSize() > 4 && dest[1].IsName() && dest[1].GetName().GetName() == "XYZ") {
-								const PdfPage *page = tree->GetPage(dest[0].GetReference());
-								if (page) {
-									unsigned int pagenum = page->GetPageNumber();
-									double left = dest[2].GetReal(), top = dest[3].GetReal();
-									long long zoom = dest[4].GetNumber();
-									const std::string &anchor = itres->first.GetName();
-									PyObject *key = PyUnicode_DecodeUTF8(anchor.c_str(), anchor.length(), "replace");
-									PyObject *tuple = Py_BuildValue("IddL", pagenum, left, top, zoom);
-									if (!tuple || !key) { break; }
-									int ret = PyDict_SetItem(ans, key, tuple);
-									Py_DECREF(key); Py_DECREF(tuple);
-									if (ret != 0) break;
-								}
-							}
-						}
-					}
-				}
-			}
-		}
+        if ((catalog = self->doc->GetCatalog()) != NULL) {
+            const PdfObject *dests_ref = catalog->GetDictionary().GetKey("Dests");
+            PdfPagesTree *tree = self->doc->GetPagesTree();
+            if (dests_ref && dests_ref->IsReference()) {
+                const PdfObject *dests_obj = self->doc->GetObjects().GetObject(dests_ref->GetReference());
+                if (dests_obj && dests_obj->IsDictionary()) {
+                    const PdfDictionary &dests = dests_obj->GetDictionary();
+                    const TKeyMap &keys = dests.GetKeys();
+                    for (TCIKeyMap itres = keys.begin(); itres != keys.end(); ++itres) {
+                        if (itres->second->IsArray()) {
+                            const PdfArray &dest = itres->second->GetArray();
+                            // see section 8.2 of PDF spec for different types of destination arrays
+                            // but chromium apparently generates only [page /XYZ left top zoom] type arrays
+                            if (dest.GetSize() > 4 && dest[1].IsName() && dest[1].GetName().GetName() == "XYZ") {
+                                const PdfPage *page = tree->GetPage(dest[0].GetReference());
+                                if (page) {
+                                    unsigned int pagenum = page->GetPageNumber();
+                                    double left = dest[2].GetReal(), top = dest[3].GetReal();
+                                    long long zoom = dest[4].GetNumber();
+                                    const std::string &anchor = itres->first.GetName();
+                                    PyObject *key = PyUnicode_DecodeUTF8(anchor.c_str(), anchor.length(), "replace");
+                                    PyObject *tuple = Py_BuildValue("IddL", pagenum, left, top, zoom);
+                                    if (!tuple || !key) { break; }
+                                    int ret = PyDict_SetItem(ans, key, tuple);
+                                    Py_DECREF(key); Py_DECREF(tuple);
+                                    if (ret != 0) break;
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
     } catch(const PdfError & err) {
         podofo_set_exception(err);
         Py_CLEAR(ans);
@@ -420,6 +420,102 @@ PDFDoc_extract_anchors(PDFDoc *self, PyObject *args) {
     return ans;
 } // }}}
 
+// alter_links() {{{
+
+// Return true if d[key] exists, is a PDF name and that name equals `name`.
+template<typename T>
+static inline bool
+dictionary_has_key_name(PdfDictionary &d, T key, const char *name) {
+    const PdfObject *val = d.GetKey(key);
+    if (val && val->IsName() && val->GetName().GetName() == name) return true;
+    return false;
+}
+
+
+// alter_links(callback, mark_links) -> number of links that were redirected.
+//
+// callback(uri) is called for every link annotation whose action is a URI
+// action. If it returns a 4-tuple (pagenum, left, top, zoom) the action is
+// replaced by the destination [page /XYZ left top zoom], pagenum being
+// 1-based; any other return value leaves the link unchanged. If mark_links
+// is true, every link annotation additionally gets /Border [16 16 1] and
+// /C [1 0 0].
+static PyObject *
+PDFDoc_alter_links(PDFDoc *self, PyObject *args) {
+    int count = 0;
+    static const PdfName XYZ("XYZ");
+    PyObject *alter_callback, *py_mark_links;
+    if (!PyArg_ParseTuple(args, "OO", &alter_callback, &py_mark_links)) return NULL;
+    // PyObject_IsTrue() returns -1 on error, which must not be treated as true
+    const int mark_links = PyObject_IsTrue(py_mark_links);
+    if (mark_links < 0) return NULL;
+    try {
+        PdfArray border, link_color;
+        border.push_back((PoDoFo::pdf_int64)16); border.push_back((PoDoFo::pdf_int64)16); border.push_back((PoDoFo::pdf_int64)1);
+        link_color.push_back(1.); link_color.push_back(0.); link_color.push_back(0.);
+        for(TCIVecObjects it = self->doc->GetObjects().begin(); it != self->doc->GetObjects().end(); it++) {
+            if((*it)->IsDictionary()) {
+                PdfDictionary &link = (*it)->GetDictionary();
+                if (dictionary_has_key_name(link, PdfName::KeyType, "Annot") && dictionary_has_key_name(link, PdfName::KeySubtype, "Link")) {
+                    if (mark_links) {
+                        link.AddKey("Border", border);
+                        link.AddKey("C", link_color);
+                    }
+                    if (link.HasKey("A") && link.GetKey("A")->IsDictionary()) {
+                        PdfDictionary &A = link.GetKey("A")->GetDictionary();
+                        if (dictionary_has_key_name(A, PdfName::KeyType, "Action") && dictionary_has_key_name(A, "S", "URI")) {
+                            PdfObject *uo = A.GetKey("URI");
+                            if (uo && uo->IsString()) {
+                                const std::string &uri = uo->GetString().GetStringUtf8();
+                                // The argument tuple is owned by us and has to be released after the call
+                                PyObject *cb_args = Py_BuildValue("(N)", PyUnicode_DecodeUTF8(uri.c_str(), uri.length(), "replace"));
+                                if (!cb_args) { return NULL; }
+                                PyObject *ret = PyObject_CallObject(alter_callback, cb_args);
+                                Py_DECREF(cb_args);
+                                if (!ret) { return NULL; }
+                                if (PyTuple_Check(ret) && PyTuple_GET_SIZE(ret) == 4) {
+                                    int pagenum; double left, top; long long zoom;
+                                    if (PyArg_ParseTuple(ret, "iddL", &pagenum, &left, &top, &zoom)) {
+                                        PdfPage *page = NULL;
+                                        try {
+                                            page = self->doc->GetPage(pagenum - 1);
+                                        } catch(const PdfError &err) {
+                                            PyErr_Format(PyExc_ValueError, "No page number %d in the PDF file", pagenum);
+                                            Py_DECREF(ret);
+                                            return NULL;
+                                        }
+                                        if (page) {
+                                            const PdfReference &pageref = page->GetObject()->Reference();
+                                            PdfArray dest;
+                                            dest.push_back(pageref);
+                                            dest.push_back(XYZ);
+                                            dest.push_back(left);
+                                            dest.push_back(top);
+                                            dest.push_back((PoDoFo::pdf_int64)zoom);
+                                            link.RemoveKey("A");
+                                            link.AddKey("Dest", dest);
+                                            count++;
+                                        }
+                                    } else {
+                                        // The tuple has elements of the wrong type, propagate the error
+                                        Py_DECREF(ret);
+                                        return NULL;
+                                    }
+                                }
+                                Py_DECREF(ret);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    } catch(const PdfError & err) {
+        podofo_set_exception(err);
+        return NULL;
+    }
+    return Py_BuildValue("i", count);
+} // }}}
+
 // Properties {{{
 
 static PyObject *
@@ -645,6 +741,9 @@ static PyMethodDef PDFDoc_methods[] = {
     {"extract_anchors", (PyCFunction)PDFDoc_extract_anchors, METH_VARARGS,
      "extract_anchors() -> Extract information about links in the document."
     },
+    {"alter_links", (PyCFunction)PDFDoc_alter_links, METH_VARARGS,
+     "alter_links() -> Change links in the document."
+    },
     {"delete_page", (PyCFunction)PDFDoc_delete_page, METH_VARARGS,
      "delete_page(page_num) -> Delete the specified page from the pdf (0 is the first page)."
     },