Get document links working when PDF is generated in parts

This commit is contained in:
Kovid Goyal 2019-07-11 17:27:32 +05:30
parent 7b03c7567c
commit f0584b8fdb
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
4 changed files with 159 additions and 45 deletions

View File

@ -439,13 +439,17 @@ class Container(ContainerBase): # {{{
using the :class:`LinkReplacer` and :class:`LinkRebaser` classes. ''' using the :class:`LinkReplacer` and :class:`LinkRebaser` classes. '''
media_type = self.mime_map.get(name, guess_type(name)) media_type = self.mime_map.get(name, guess_type(name))
if name == self.opf_name: if name == self.opf_name:
replace_func.file_type = 'opf'
for elem in self.opf_xpath('//*[@href]'): for elem in self.opf_xpath('//*[@href]'):
elem.set('href', replace_func(elem.get('href'))) elem.set('href', replace_func(elem.get('href')))
elif media_type.lower() in OEB_DOCS: elif media_type.lower() in OEB_DOCS:
replace_func.file_type = 'text'
rewrite_links(self.parsed(name), replace_func) rewrite_links(self.parsed(name), replace_func)
elif media_type.lower() in OEB_STYLES: elif media_type.lower() in OEB_STYLES:
replace_func.file_type = 'style'
replaceUrls(self.parsed(name), replace_func) replaceUrls(self.parsed(name), replace_func)
elif media_type.lower() == guess_type('toc.ncx'): elif media_type.lower() == guess_type('toc.ncx'):
replace_func.file_type = 'ncx'
for elem in self.parsed(name).xpath('//*[@src]'): for elem in self.parsed(name).xpath('//*[@src]'):
elem.set('src', replace_func(elem.get('src'))) elem.set('src', replace_func(elem.get('src')))

View File

@ -110,7 +110,7 @@ class TOC(object):
return ans return ans
def __str__(self): def __str__(self):
return b'\n'.join([x.encode('utf-8') for x in self.get_lines()]) return '\n'.join(self.get_lines())
def to_dict(self, node_counter=None): def to_dict(self, node_counter=None):
ans = { ans = {

View File

@ -29,6 +29,7 @@ from calibre.utils.logging import default_log
from calibre.utils.podofo import get_podofo, set_metadata_implementation from calibre.utils.podofo import get_podofo, set_metadata_implementation
from calibre.utils.short_uuid import uuid4 from calibre.utils.short_uuid import uuid4
from polyglot.builtins import iteritems, range from polyglot.builtins import iteritems, range
from polyglot.urllib import urlparse
OK, LOAD_FAILED, KILL_SIGNAL = range(0, 3) OK, LOAD_FAILED, KILL_SIGNAL = range(0, 3)
@ -205,15 +206,12 @@ def add_anchors_markup(root, uuid, anchors):
div[-1].tail = ' ' div[-1].tail = ' '
def add_toc_links(container, toc, margin_groups): def add_all_links(container, margin_groups):
# TODO: Change this to work for all anchors so it can be used to fix
# arbitrary links
uuid = uuid4() uuid = uuid4()
name_anchor_map = {} name_anchor_map = {}
for item in toc.iterdescendants(): for name, is_linear in container.spine_names:
if item.dest and item.frag: root = container.parsed(name)
anchors = name_anchor_map.setdefault(item.dest, set()) name_anchor_map[name] = frozenset(root.xpath('//*/@id'))
anchors.add(item.frag)
for group in margin_groups: for group in margin_groups:
name = group[0][0] name = group[0][0]
anchors = name_anchor_map.get(name, set()) anchors = name_anchor_map.get(name, set())
@ -226,12 +224,15 @@ def make_anchors_unique(container):
mapping = {} mapping = {}
count = 0 count = 0
base = None base = None
spine_names = set()
def replacer(url): def replacer(url):
if replacer.file_type != 'text':
return url
if not url: if not url:
return url return url
if '#' not in url: if '#' not in url:
return url url += '#'
if url.startswith('#'): if url.startswith('#'):
href, frag = base, url[1:] href, frag = base, url[1:]
else: else:
@ -239,16 +240,21 @@ def make_anchors_unique(container):
name = container.href_to_name(href, base) name = container.href_to_name(href, base)
if not name: if not name:
return url return url
if not frag and name in spine_names:
replacer.replaced = True
return 'https://calibre-pdf-anchor.n#' + name
key = name, frag key = name, frag
new_frag = mapping.get(key) new_frag = mapping.get(key)
if new_frag is None: if new_frag is None:
return url return url
replacer.replaced = True replacer.replaced = True
return 'https://calibre-pdf-anchor.a#' + new_frag
if url.startswith('#'): if url.startswith('#'):
return '#' + new_frag return '#' + new_frag
return href + '#' + new_frag return href + '#' + new_frag
for spine_name, is_linear in container.spine_names: for spine_name, is_linear in container.spine_names:
spine_names.add(spine_name)
root = container.parsed(spine_name) root = container.parsed(spine_name)
for elem in root.xpath('//*[@id]'): for elem in root.xpath('//*[@id]'):
count += 1 count += 1
@ -279,23 +285,48 @@ def get_anchor_locations(pdf_doc, first_page_num, toc_uuid):
return ans return ans
def fix_links(pdf_doc, anchor_locations, name_page_numbers, mark_links, log):
    """Rewrite calibre's placeholder link URLs in ``pdf_doc``.

    Links whose URL is ``https://calibre-pdf-anchor.a#<anchor>`` are resolved
    through ``anchor_locations``; links whose URL is
    ``https://calibre-pdf-anchor.n#<name>`` are resolved through
    ``name_page_numbers``. Every other URL is left alone.

    :param pdf_doc: object exposing ``alter_links(callback, mark_links)``;
        the callback is invoked with each link URL.
    :param anchor_locations: mapping of anchor id -> location object.
    :param name_page_numbers: mapping of file name -> page number on which
        that file starts.
    :param mark_links: passed through unchanged to ``pdf_doc.alter_links``.
    :param log: logger; only its ``warn()`` method is used.
    """
    anchor_host = 'calibre-pdf-anchor.a'
    name_host = 'calibre-pdf-anchor.n'

    def replace_link(url):
        # Returns the new location for ``url`` or None to leave it untouched.
        purl = urlparse(url)
        if purl.scheme != 'https' or purl.netloc not in (anchor_host, name_host):
            return None
        target = purl.fragment
        if purl.netloc == anchor_host:
            loc = anchor_locations.get(target)
            if loc is None:
                log.warn('Anchor location for link to {} not found'.format(target))
            return loc
        # Link to a whole file: point it at the page where the file starts.
        pnum = name_page_numbers.get(target)
        if pnum is None:
            # Distinct message so the log shows a file lookup failed, not an
            # anchor lookup.
            log.warn('Page number for link to file {} not found'.format(target))
            return None
        # NOTE(review): the remaining three fields are presumably left, top
        # and zoom (that is how alter_links parses the tuple) -- confirm
        # against the AnchorLocation definition.
        return AnchorLocation(pnum, 0, 0, 0)

    pdf_doc.alter_links(replace_link, mark_links)
def convert(opf_path, opts, metadata=None, output_path=None, log=default_log, cover_data=None): def convert(opf_path, opts, metadata=None, output_path=None, log=default_log, cover_data=None):
container = Container(opf_path, log) container = Container(opf_path, log)
make_anchors_unique(container) make_anchors_unique(container)
margin_groups = create_margin_groups(container) margin_groups = create_margin_groups(container)
links_page_uuid = add_all_links(container, margin_groups)
toc = get_toc(container) toc = get_toc(container)
toc_uuid = add_toc_links(container, toc, margin_groups) (toc)
container.commit() container.commit()
renderer = Renderer(opts) renderer = Renderer(opts)
page_layout = get_page_layout(opts) page_layout = get_page_layout(opts)
pdf_doc = None pdf_doc = None
anchor_locations = {} anchor_locations = {}
name_page_numbers = {}
num_pages = 0 num_pages = 0
for group in margin_groups: for group in margin_groups:
name, margins = group[0] name, margins = group[0]
name_page_numbers[name] = num_pages + 1
doc = render_name(container, name, margins, renderer, page_layout) doc = render_name(container, name, margins, renderer, page_layout)
anchor_locations.update(get_anchor_locations(doc, num_pages + 1, toc_uuid)) anchor_locations.update(get_anchor_locations(doc, num_pages + 1, links_page_uuid))
num_pages += doc.page_count() num_pages += doc.page_count()
if pdf_doc is None: if pdf_doc is None:
@ -303,7 +334,7 @@ def convert(opf_path, opts, metadata=None, output_path=None, log=default_log, co
else: else:
pdf_doc.append(doc) pdf_doc.append(doc)
# TODO: Fix links using anchor_locations fix_links(pdf_doc, anchor_locations, name_page_numbers, opts.pdf_mark_links, log)
if cover_data: if cover_data:
add_cover(pdf_doc, cover_data, page_layout, opts) add_cover(pdf_doc, cover_data, page_layout, opts)

View File

@ -420,6 +420,82 @@ PDFDoc_extract_anchors(PDFDoc *self, PyObject *args) {
return ans; return ans;
} // }}} } // }}}
// alter_links() {{{
// Report whether dictionary `d` holds, under `key`, a PDF name object whose
// value equals `name`. A missing key or a non-name value yields false.
template<typename T>
static inline bool
dictionary_has_key_name(PdfDictionary &d, T key, const char *name) {
    const PdfObject *entry = d.GetKey(key);
    if (!entry || !entry->IsName()) return false;
    return entry->GetName().GetName() == name;
}
// alter_links(callback, mark_links) -> number of links rewritten.
//
// Walks every object in the document looking for link annotations
// (/Type /Annot, /Subtype /Link). When mark_links is true each such
// annotation gets a /Border and a /C (colour) entry. For annotations whose
// /A action is a URI action, `callback` is called with the URI as a unicode
// string. If it returns a 4-tuple (pagenum, left, top, zoom) the URI action
// is replaced by an explicit /XYZ destination on that (1-based) page; any
// other return value leaves the link unchanged.
//
// Returns NULL with a Python exception set if the callback raises, the
// returned 4-tuple has the wrong element types, the page does not exist, or
// PoDoFo reports an error.
static PyObject *
PDFDoc_alter_links(PDFDoc *self, PyObject *args) {
    int count = 0;
    static const PdfName XYZ("XYZ");
    PyObject *alter_callback, *py_mark_links;
    if (!PyArg_ParseTuple(args, "OO", &alter_callback, &py_mark_links)) return NULL;
    // PyObject_IsTrue() returns -1 on failure; do not silently treat that as true.
    const int mark_links = PyObject_IsTrue(py_mark_links);
    if (mark_links < 0) return NULL;
    try {
        PdfArray border, link_color;
        border.push_back((PoDoFo::pdf_int64)16); border.push_back((PoDoFo::pdf_int64)16); border.push_back((PoDoFo::pdf_int64)1);
        link_color.push_back(1.); link_color.push_back(0.); link_color.push_back(0.);

        for (TCIVecObjects it = self->doc->GetObjects().begin(); it != self->doc->GetObjects().end(); it++) {
            if (!(*it)->IsDictionary()) continue;
            PdfDictionary &link = (*it)->GetDictionary();
            if (!dictionary_has_key_name(link, PdfName::KeyType, "Annot") || !dictionary_has_key_name(link, PdfName::KeySubtype, "Link")) continue;
            if (mark_links) {
                link.AddKey("Border", border);
                link.AddKey("C", link_color);
            }
            if (!link.HasKey("A") || !link.GetKey("A")->IsDictionary()) continue;
            PdfDictionary &A = link.GetKey("A")->GetDictionary();
            if (!dictionary_has_key_name(A, PdfName::KeyType, "Action") || !dictionary_has_key_name(A, "S", "URI")) continue;
            PdfObject *uo = A.GetKey("URI");
            if (!uo || !uo->IsString()) continue;

            // Copy the URI: the action dictionary that owns it may be removed below.
            const std::string uri = uo->GetString().GetStringUtf8();
            PyObject *py_uri = PyUnicode_DecodeUTF8(uri.c_str(), (Py_ssize_t)uri.length(), "replace");
            if (!py_uri) return NULL;
            // The call does not steal py_uri, so release it ourselves
            // (the original leaked its argument tuple on every link).
            PyObject *ret = PyObject_CallFunctionObjArgs(alter_callback, py_uri, (PyObject *)NULL);
            Py_DECREF(py_uri);
            if (!ret) return NULL;

            if (PyTuple_Check(ret) && PyTuple_GET_SIZE(ret) == 4) {
                int pagenum; double left, top; long long zoom;
                if (!PyArg_ParseTuple(ret, "iddL", &pagenum, &left, &top, &zoom)) {
                    // Wrong element types: propagate the error instead of
                    // continuing with a pending Python exception.
                    Py_DECREF(ret);
                    return NULL;
                }
                PdfPage *page = NULL;
                try {
                    page = self->doc->GetPage(pagenum - 1);
                } catch (const PdfError &) {
                    PyErr_Format(PyExc_ValueError, "No page number %d in the PDF file", pagenum);
                    Py_DECREF(ret);
                    return NULL;
                }
                if (page) {
                    const PdfReference &pageref = page->GetObject()->Reference();
                    PdfArray dest;
                    dest.push_back(pageref);
                    dest.push_back(XYZ);
                    dest.push_back(left);
                    dest.push_back(top);
                    dest.push_back((PoDoFo::pdf_int64)zoom);
                    link.RemoveKey("A");
                    link.AddKey("Dest", dest);
                    count++;
                }
            }
            Py_DECREF(ret);
        }
    } catch (const PdfError &err) {
        podofo_set_exception(err);
        return NULL;
    }
    return Py_BuildValue("i", count);
} // }}}
// Properties {{{ // Properties {{{
static PyObject * static PyObject *
@ -645,6 +721,9 @@ static PyMethodDef PDFDoc_methods[] = {
{"extract_anchors", (PyCFunction)PDFDoc_extract_anchors, METH_VARARGS, {"extract_anchors", (PyCFunction)PDFDoc_extract_anchors, METH_VARARGS,
"extract_anchors() -> Extract information about links in the document." "extract_anchors() -> Extract information about links in the document."
}, },
{"alter_links", (PyCFunction)PDFDoc_alter_links, METH_VARARGS,
"alter_links() -> Change links in the document."
},
{"delete_page", (PyCFunction)PDFDoc_delete_page, METH_VARARGS, {"delete_page", (PyCFunction)PDFDoc_delete_page, METH_VARARGS,
"delete_page(page_num) -> Delete the specified page from the pdf (0 is the first page)." "delete_page(page_num) -> Delete the specified page from the pdf (0 is the first page)."
}, },