mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 18:54:09 -04:00
Get document links working when PDF is generated in parts
This commit is contained in:
parent
7b03c7567c
commit
f0584b8fdb
@ -439,13 +439,17 @@ class Container(ContainerBase): # {{{
|
|||||||
using the :class:`LinkReplacer` and :class:`LinkRebaser` classes. '''
|
using the :class:`LinkReplacer` and :class:`LinkRebaser` classes. '''
|
||||||
media_type = self.mime_map.get(name, guess_type(name))
|
media_type = self.mime_map.get(name, guess_type(name))
|
||||||
if name == self.opf_name:
|
if name == self.opf_name:
|
||||||
|
replace_func.file_type = 'opf'
|
||||||
for elem in self.opf_xpath('//*[@href]'):
|
for elem in self.opf_xpath('//*[@href]'):
|
||||||
elem.set('href', replace_func(elem.get('href')))
|
elem.set('href', replace_func(elem.get('href')))
|
||||||
elif media_type.lower() in OEB_DOCS:
|
elif media_type.lower() in OEB_DOCS:
|
||||||
|
replace_func.file_type = 'text'
|
||||||
rewrite_links(self.parsed(name), replace_func)
|
rewrite_links(self.parsed(name), replace_func)
|
||||||
elif media_type.lower() in OEB_STYLES:
|
elif media_type.lower() in OEB_STYLES:
|
||||||
|
replace_func.file_type = 'style'
|
||||||
replaceUrls(self.parsed(name), replace_func)
|
replaceUrls(self.parsed(name), replace_func)
|
||||||
elif media_type.lower() == guess_type('toc.ncx'):
|
elif media_type.lower() == guess_type('toc.ncx'):
|
||||||
|
replace_func.file_type = 'ncx'
|
||||||
for elem in self.parsed(name).xpath('//*[@src]'):
|
for elem in self.parsed(name).xpath('//*[@src]'):
|
||||||
elem.set('src', replace_func(elem.get('src')))
|
elem.set('src', replace_func(elem.get('src')))
|
||||||
|
|
||||||
|
@ -110,7 +110,7 @@ class TOC(object):
|
|||||||
return ans
|
return ans
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return b'\n'.join([x.encode('utf-8') for x in self.get_lines()])
|
return '\n'.join(self.get_lines())
|
||||||
|
|
||||||
def to_dict(self, node_counter=None):
|
def to_dict(self, node_counter=None):
|
||||||
ans = {
|
ans = {
|
||||||
|
@ -29,6 +29,7 @@ from calibre.utils.logging import default_log
|
|||||||
from calibre.utils.podofo import get_podofo, set_metadata_implementation
|
from calibre.utils.podofo import get_podofo, set_metadata_implementation
|
||||||
from calibre.utils.short_uuid import uuid4
|
from calibre.utils.short_uuid import uuid4
|
||||||
from polyglot.builtins import iteritems, range
|
from polyglot.builtins import iteritems, range
|
||||||
|
from polyglot.urllib import urlparse
|
||||||
|
|
||||||
OK, LOAD_FAILED, KILL_SIGNAL = range(0, 3)
|
OK, LOAD_FAILED, KILL_SIGNAL = range(0, 3)
|
||||||
|
|
||||||
@ -205,15 +206,12 @@ def add_anchors_markup(root, uuid, anchors):
|
|||||||
div[-1].tail = ' '
|
div[-1].tail = ' '
|
||||||
|
|
||||||
|
|
||||||
def add_toc_links(container, toc, margin_groups):
|
def add_all_links(container, margin_groups):
|
||||||
# TODO: Change this to work for all anchors so it can be used to fix
|
|
||||||
# arbitrary links
|
|
||||||
uuid = uuid4()
|
uuid = uuid4()
|
||||||
name_anchor_map = {}
|
name_anchor_map = {}
|
||||||
for item in toc.iterdescendants():
|
for name, is_linear in container.spine_names:
|
||||||
if item.dest and item.frag:
|
root = container.parsed(name)
|
||||||
anchors = name_anchor_map.setdefault(item.dest, set())
|
name_anchor_map[name] = frozenset(root.xpath('//*/@id'))
|
||||||
anchors.add(item.frag)
|
|
||||||
for group in margin_groups:
|
for group in margin_groups:
|
||||||
name = group[0][0]
|
name = group[0][0]
|
||||||
anchors = name_anchor_map.get(name, set())
|
anchors = name_anchor_map.get(name, set())
|
||||||
@ -226,12 +224,15 @@ def make_anchors_unique(container):
|
|||||||
mapping = {}
|
mapping = {}
|
||||||
count = 0
|
count = 0
|
||||||
base = None
|
base = None
|
||||||
|
spine_names = set()
|
||||||
|
|
||||||
def replacer(url):
|
def replacer(url):
|
||||||
|
if replacer.file_type != 'text':
|
||||||
|
return url
|
||||||
if not url:
|
if not url:
|
||||||
return url
|
return url
|
||||||
if '#' not in url:
|
if '#' not in url:
|
||||||
return url
|
url += '#'
|
||||||
if url.startswith('#'):
|
if url.startswith('#'):
|
||||||
href, frag = base, url[1:]
|
href, frag = base, url[1:]
|
||||||
else:
|
else:
|
||||||
@ -239,16 +240,21 @@ def make_anchors_unique(container):
|
|||||||
name = container.href_to_name(href, base)
|
name = container.href_to_name(href, base)
|
||||||
if not name:
|
if not name:
|
||||||
return url
|
return url
|
||||||
|
if not frag and name in spine_names:
|
||||||
|
replacer.replaced = True
|
||||||
|
return 'https://calibre-pdf-anchor.n#' + name
|
||||||
key = name, frag
|
key = name, frag
|
||||||
new_frag = mapping.get(key)
|
new_frag = mapping.get(key)
|
||||||
if new_frag is None:
|
if new_frag is None:
|
||||||
return url
|
return url
|
||||||
replacer.replaced = True
|
replacer.replaced = True
|
||||||
|
return 'https://calibre-pdf-anchor.a#' + new_frag
|
||||||
if url.startswith('#'):
|
if url.startswith('#'):
|
||||||
return '#' + new_frag
|
return '#' + new_frag
|
||||||
return href + '#' + new_frag
|
return href + '#' + new_frag
|
||||||
|
|
||||||
for spine_name, is_linear in container.spine_names:
|
for spine_name, is_linear in container.spine_names:
|
||||||
|
spine_names.add(spine_name)
|
||||||
root = container.parsed(spine_name)
|
root = container.parsed(spine_name)
|
||||||
for elem in root.xpath('//*[@id]'):
|
for elem in root.xpath('//*[@id]'):
|
||||||
count += 1
|
count += 1
|
||||||
@ -279,23 +285,48 @@ def get_anchor_locations(pdf_doc, first_page_num, toc_uuid):
|
|||||||
return ans
|
return ans
|
||||||
|
|
||||||
|
|
||||||
|
def fix_links(pdf_doc, anchor_locations, name_page_numbers, mark_links, log):
    '''
    Resolve the placeholder link URLs in ``pdf_doc``.

    A callback is handed to ``pdf_doc.alter_links()``. For every URL it is
    given, the callback returns either a location or ``None``:

      * ``https://calibre-pdf-anchor.a#<anchor>`` is looked up in
        ``anchor_locations``.
      * ``https://calibre-pdf-anchor.n#<name>`` is looked up in
        ``name_page_numbers`` and turned into
        ``AnchorLocation(page_number, 0, 0, 0)``.
      * Any other URL yields ``None``.

    :param pdf_doc: Object providing ``alter_links(callback, mark_links)``
    :param anchor_locations: Mapping of anchor to location
    :param name_page_numbers: Mapping of file name to page number
    :param mark_links: Passed through unchanged to ``pdf_doc.alter_links()``
    :param log: Logger, ``log.warn()`` is called for every unresolvable
                placeholder link
    '''
    anchor_host, name_host = 'calibre-pdf-anchor.a', 'calibre-pdf-anchor.n'

    def replace_link(url):
        purl = urlparse(url)
        if purl.scheme != 'https' or purl.netloc not in (anchor_host, name_host):
            return None
        target = purl.fragment
        if purl.netloc == anchor_host:
            loc = anchor_locations.get(target)
            if loc is None:
                log.warn('Anchor location for link to {} not found'.format(target))
            return loc
        # Link to a whole file, it resolves to the page recorded for that file
        pnum = name_page_numbers.get(target)
        if pnum is None:
            # This is a failed page number lookup, not a missing anchor, so
            # say that in the log instead of repeating the anchor message
            log.warn('Page number for link to file {} not found'.format(target))
            return None
        return AnchorLocation(pnum, 0, 0, 0)

    pdf_doc.alter_links(replace_link, mark_links)
|
||||||
|
|
||||||
|
|
||||||
def convert(opf_path, opts, metadata=None, output_path=None, log=default_log, cover_data=None):
|
def convert(opf_path, opts, metadata=None, output_path=None, log=default_log, cover_data=None):
|
||||||
container = Container(opf_path, log)
|
container = Container(opf_path, log)
|
||||||
make_anchors_unique(container)
|
make_anchors_unique(container)
|
||||||
margin_groups = create_margin_groups(container)
|
margin_groups = create_margin_groups(container)
|
||||||
|
links_page_uuid = add_all_links(container, margin_groups)
|
||||||
toc = get_toc(container)
|
toc = get_toc(container)
|
||||||
toc_uuid = add_toc_links(container, toc, margin_groups)
|
(toc)
|
||||||
container.commit()
|
container.commit()
|
||||||
|
|
||||||
renderer = Renderer(opts)
|
renderer = Renderer(opts)
|
||||||
page_layout = get_page_layout(opts)
|
page_layout = get_page_layout(opts)
|
||||||
pdf_doc = None
|
pdf_doc = None
|
||||||
anchor_locations = {}
|
anchor_locations = {}
|
||||||
|
name_page_numbers = {}
|
||||||
num_pages = 0
|
num_pages = 0
|
||||||
for group in margin_groups:
|
for group in margin_groups:
|
||||||
name, margins = group[0]
|
name, margins = group[0]
|
||||||
|
name_page_numbers[name] = num_pages + 1
|
||||||
doc = render_name(container, name, margins, renderer, page_layout)
|
doc = render_name(container, name, margins, renderer, page_layout)
|
||||||
anchor_locations.update(get_anchor_locations(doc, num_pages + 1, toc_uuid))
|
anchor_locations.update(get_anchor_locations(doc, num_pages + 1, links_page_uuid))
|
||||||
num_pages += doc.page_count()
|
num_pages += doc.page_count()
|
||||||
|
|
||||||
if pdf_doc is None:
|
if pdf_doc is None:
|
||||||
@ -303,7 +334,7 @@ def convert(opf_path, opts, metadata=None, output_path=None, log=default_log, co
|
|||||||
else:
|
else:
|
||||||
pdf_doc.append(doc)
|
pdf_doc.append(doc)
|
||||||
|
|
||||||
# TODO: Fix links using anchor_locations
|
fix_links(pdf_doc, anchor_locations, name_page_numbers, opts.pdf_mark_links, log)
|
||||||
|
|
||||||
if cover_data:
|
if cover_data:
|
||||||
add_cover(pdf_doc, cover_data, page_layout, opts)
|
add_cover(pdf_doc, cover_data, page_layout, opts)
|
||||||
|
@ -420,6 +420,82 @@ PDFDoc_extract_anchors(PDFDoc *self, PyObject *args) {
|
|||||||
return ans;
|
return ans;
|
||||||
} // }}}
|
} // }}}
|
||||||
|
|
||||||
|
// alter_links() {{{
|
||||||
|
|
||||||
|
template<typename T>
|
||||||
|
static inline bool
|
||||||
|
dictionary_has_key_name(PdfDictionary &d, T key, const char *name) {
|
||||||
|
const PdfObject *val = d.GetKey(key);
|
||||||
|
if (val && val->IsName() && val->GetName().GetName() == name) return true;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static PyObject *
PDFDoc_alter_links(PDFDoc *self, PyObject *args) {
    // alter_links(callback, mark_links)
    //
    // Walk every object in the document looking for /Annot dictionaries of
    // subtype /Link. When mark_links is true each of them gets a /Border and
    // a /C (color) entry. For links that carry a URI action, callback(uri) is
    // called with the URI decoded as UTF-8. If it returns a 4-tuple
    // (pagenum, left, top, zoom), with pagenum counted from 1, the action is
    // replaced by an explicit /XYZ destination on that page. Any other
    // return value leaves the link's action untouched.
    //
    // Returns the number of links that were given a destination, or NULL
    // with a Python exception set on failure.
    int count = 0;
    static const PdfName XYZ("XYZ");
    PyObject *alter_callback, *py_mark_links;
    if (!PyArg_ParseTuple(args, "OO", &alter_callback, &py_mark_links)) return NULL;
    // PyObject_IsTrue() reports failure as -1, which must not be read as "true"
    const int mark_links = PyObject_IsTrue(py_mark_links);
    if (mark_links < 0) return NULL;
    try {
        PdfArray border, link_color;
        border.push_back((PoDoFo::pdf_int64)16); border.push_back((PoDoFo::pdf_int64)16); border.push_back((PoDoFo::pdf_int64)1);
        link_color.push_back(1.); link_color.push_back(0.); link_color.push_back(0.);
        for (TCIVecObjects it = self->doc->GetObjects().begin(); it != self->doc->GetObjects().end(); it++) {
            if (!(*it)->IsDictionary()) continue;
            PdfDictionary &link = (*it)->GetDictionary();
            if (!dictionary_has_key_name(link, PdfName::KeyType, "Annot") || !dictionary_has_key_name(link, PdfName::KeySubtype, "Link")) continue;
            if (mark_links) {
                link.AddKey("Border", border);
                link.AddKey("C", link_color);
            }
            if (!link.HasKey("A") || !link.GetKey("A")->IsDictionary()) continue;
            PdfDictionary &A = link.GetKey("A")->GetDictionary();
            if (!dictionary_has_key_name(A, PdfName::KeyType, "Action") || !dictionary_has_key_name(A, "S", "URI")) continue;
            PdfObject *uo = A.GetKey("URI");
            if (!uo || !uo->IsString()) continue;

            const std::string &uri = uo->GetString().GetStringUtf8();
            PyObject *py_uri = PyUnicode_DecodeUTF8(uri.c_str(), uri.length(), "replace");
            // Never call the callback if the URI could not be converted
            if (!py_uri) return NULL;
            PyObject *ret = PyObject_CallFunctionObjArgs(alter_callback, py_uri, NULL);
            // The argument is owned by this function, release it whether or
            // not the call succeeded
            Py_DECREF(py_uri);
            if (!ret) return NULL;

            // Anything other than a 4-tuple means: leave this link alone
            if (!PyTuple_Check(ret) || PyTuple_GET_SIZE(ret) != 4) { Py_DECREF(ret); continue; }
            int pagenum; double left, top; long long zoom;
            const bool parsed = PyArg_ParseTuple(ret, "iddL", &pagenum, &left, &top, &zoom) != 0;
            Py_DECREF(ret);
            // A malformed tuple leaves a Python exception set, so it has to
            // be propagated instead of silently continuing with it pending
            if (!parsed) return NULL;

            PdfPage *page = NULL;
            try {
                page = self->doc->GetPage(pagenum - 1);
            } catch(const PdfError &err) {
                PyErr_Format(PyExc_ValueError, "No page number %d in the PDF file", pagenum);
                return NULL;
            }
            if (!page) continue;

            const PdfReference &pageref = page->GetObject()->Reference();
            PdfArray dest;
            dest.push_back(pageref);
            dest.push_back(XYZ);
            dest.push_back(left);
            dest.push_back(top);
            dest.push_back((PoDoFo::pdf_int64)zoom);
            // A is a reference into the entry being removed, it must not be
            // used after this point
            link.RemoveKey("A");
            link.AddKey("Dest", dest);
            count++;
        }
    } catch(const PdfError & err) {
        podofo_set_exception(err);
        return NULL;
    }
    return Py_BuildValue("i", count);
} // }}}
|
||||||
|
|
||||||
// Properties {{{
|
// Properties {{{
|
||||||
|
|
||||||
static PyObject *
|
static PyObject *
|
||||||
@ -645,6 +721,9 @@ static PyMethodDef PDFDoc_methods[] = {
|
|||||||
{"extract_anchors", (PyCFunction)PDFDoc_extract_anchors, METH_VARARGS,
|
{"extract_anchors", (PyCFunction)PDFDoc_extract_anchors, METH_VARARGS,
|
||||||
"extract_anchors() -> Extract information about links in the document."
|
"extract_anchors() -> Extract information about links in the document."
|
||||||
},
|
},
|
||||||
|
{"alter_links", (PyCFunction)PDFDoc_alter_links, METH_VARARGS,
|
||||||
|
"alter_links() -> Change links in the document."
|
||||||
|
},
|
||||||
{"delete_page", (PyCFunction)PDFDoc_delete_page, METH_VARARGS,
|
{"delete_page", (PyCFunction)PDFDoc_delete_page, METH_VARARGS,
|
||||||
"delete_page(page_num) -> Delete the specified page from the pdf (0 is the first page)."
|
"delete_page(page_num) -> Delete the specified page from the pdf (0 is the first page)."
|
||||||
},
|
},
|
||||||
|
Loading…
x
Reference in New Issue
Block a user