Get document links working when PDF is generated in parts

This commit is contained in:
Kovid Goyal 2019-07-11 17:27:32 +05:30
parent 7b03c7567c
commit f0584b8fdb
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
4 changed files with 159 additions and 45 deletions

View File

@ -439,13 +439,17 @@ class Container(ContainerBase): # {{{
using the :class:`LinkReplacer` and :class:`LinkRebaser` classes. '''
media_type = self.mime_map.get(name, guess_type(name))
if name == self.opf_name:
replace_func.file_type = 'opf'
for elem in self.opf_xpath('//*[@href]'):
elem.set('href', replace_func(elem.get('href')))
elif media_type.lower() in OEB_DOCS:
replace_func.file_type = 'text'
rewrite_links(self.parsed(name), replace_func)
elif media_type.lower() in OEB_STYLES:
replace_func.file_type = 'style'
replaceUrls(self.parsed(name), replace_func)
elif media_type.lower() == guess_type('toc.ncx'):
replace_func.file_type = 'ncx'
for elem in self.parsed(name).xpath('//*[@src]'):
elem.set('src', replace_func(elem.get('src')))

View File

@ -110,7 +110,7 @@ class TOC(object):
return ans
def __str__(self):
return b'\n'.join([x.encode('utf-8') for x in self.get_lines()])
return '\n'.join(self.get_lines())
def to_dict(self, node_counter=None):
ans = {

View File

@ -29,6 +29,7 @@ from calibre.utils.logging import default_log
from calibre.utils.podofo import get_podofo, set_metadata_implementation
from calibre.utils.short_uuid import uuid4
from polyglot.builtins import iteritems, range
from polyglot.urllib import urlparse
OK, LOAD_FAILED, KILL_SIGNAL = range(0, 3)
@ -205,15 +206,12 @@ def add_anchors_markup(root, uuid, anchors):
div[-1].tail = ' '
def add_toc_links(container, toc, margin_groups):
# TODO: Change this to work for all anchors so it can be used to fix
# arbitrary links
def add_all_links(container, margin_groups):
uuid = uuid4()
name_anchor_map = {}
for item in toc.iterdescendants():
if item.dest and item.frag:
anchors = name_anchor_map.setdefault(item.dest, set())
anchors.add(item.frag)
for name, is_linear in container.spine_names:
root = container.parsed(name)
name_anchor_map[name] = frozenset(root.xpath('//*/@id'))
for group in margin_groups:
name = group[0][0]
anchors = name_anchor_map.get(name, set())
@ -226,12 +224,15 @@ def make_anchors_unique(container):
mapping = {}
count = 0
base = None
spine_names = set()
def replacer(url):
if replacer.file_type != 'text':
return url
if not url:
return url
if '#' not in url:
return url
url += '#'
if url.startswith('#'):
href, frag = base, url[1:]
else:
@ -239,16 +240,21 @@ def make_anchors_unique(container):
name = container.href_to_name(href, base)
if not name:
return url
if not frag and name in spine_names:
replacer.replaced = True
return 'https://calibre-pdf-anchor.n#' + name
key = name, frag
new_frag = mapping.get(key)
if new_frag is None:
return url
replacer.replaced = True
return 'https://calibre-pdf-anchor.a#' + new_frag
if url.startswith('#'):
return '#' + new_frag
return href + '#' + new_frag
for spine_name, is_linear in container.spine_names:
spine_names.add(spine_name)
root = container.parsed(spine_name)
for elem in root.xpath('//*[@id]'):
count += 1
@ -279,23 +285,48 @@ def get_anchor_locations(pdf_doc, first_page_num, toc_uuid):
return ans
def fix_links(pdf_doc, anchor_locations, name_page_numbers, mark_links, log):
    '''
    Ask ``pdf_doc`` to rewrite its links, resolving the placeholder URLs on the
    ``calibre-pdf-anchor.a`` and ``calibre-pdf-anchor.n`` hosts.

    :param pdf_doc: Object exposing ``alter_links(callback, mark_links)``.
    :param anchor_locations: Mapping looked up with the URL fragment for
        ``calibre-pdf-anchor.a`` links; the mapped value is returned as is.
    :param name_page_numbers: Mapping looked up with the URL fragment for
        ``calibre-pdf-anchor.n`` links; the mapped value is used as the first
        field of an ``AnchorLocation`` whose other fields are zero.
    :param mark_links: Passed through unchanged to ``pdf_doc.alter_links``.
    :param log: Logger; ``log.warn`` is called when a lookup fails.
    '''
    anchor_host = 'calibre-pdf-anchor.a'
    name_host = 'calibre-pdf-anchor.n'

    def replace_link(url):
        # Returns the resolved location, or None when the URL is not one of
        # the placeholders or its target could not be found.
        parts = urlparse(url)
        if parts.scheme != 'https' or parts.netloc not in (anchor_host, name_host):
            return
        target = parts.fragment
        if parts.netloc == anchor_host:
            location = anchor_locations.get(target)
            if location is None:
                log.warn('Anchor location for link to {} not found'.format(target))
            return location
        page_number = name_page_numbers.get(target)
        if page_number is None:
            log.warn('Anchor location for link to {} not found'.format(target))
            return None
        return AnchorLocation(page_number, 0, 0, 0)

    pdf_doc.alter_links(replace_link, mark_links)
def convert(opf_path, opts, metadata=None, output_path=None, log=default_log, cover_data=None):
container = Container(opf_path, log)
make_anchors_unique(container)
margin_groups = create_margin_groups(container)
links_page_uuid = add_all_links(container, margin_groups)
toc = get_toc(container)
toc_uuid = add_toc_links(container, toc, margin_groups)
(toc)
container.commit()
renderer = Renderer(opts)
page_layout = get_page_layout(opts)
pdf_doc = None
anchor_locations = {}
name_page_numbers = {}
num_pages = 0
for group in margin_groups:
name, margins = group[0]
name_page_numbers[name] = num_pages + 1
doc = render_name(container, name, margins, renderer, page_layout)
anchor_locations.update(get_anchor_locations(doc, num_pages + 1, toc_uuid))
anchor_locations.update(get_anchor_locations(doc, num_pages + 1, links_page_uuid))
num_pages += doc.page_count()
if pdf_doc is None:
@ -303,7 +334,7 @@ def convert(opf_path, opts, metadata=None, output_path=None, log=default_log, co
else:
pdf_doc.append(doc)
# TODO: Fix links using anchor_locations
fix_links(pdf_doc, anchor_locations, name_page_numbers, opts.pdf_mark_links, log)
if cover_data:
add_cover(pdf_doc, cover_data, page_layout, opts)

View File

@ -374,39 +374,39 @@ PDFDoc_extract_anchors(PDFDoc *self, PyObject *args) {
PyObject *ans = PyDict_New();
if (ans == NULL) return NULL;
try {
if ((catalog = self->doc->GetCatalog()) != NULL) {
const PdfObject *dests_ref = catalog->GetDictionary().GetKey("Dests");
PdfPagesTree *tree = self->doc->GetPagesTree();
if (dests_ref && dests_ref->IsReference()) {
const PdfObject *dests_obj = self->doc->GetObjects().GetObject(dests_ref->GetReference());
if (dests_obj && dests_obj->IsDictionary()) {
const PdfDictionary &dests = dests_obj->GetDictionary();
const TKeyMap &keys = dests.GetKeys();
for (TCIKeyMap itres = keys.begin(); itres != keys.end(); ++itres) {
if (itres->second->IsArray()) {
const PdfArray &dest = itres->second->GetArray();
// see section 8.2 of PDF spec for different types of destination arrays
// but chromium apparently generates only [page /XYZ left top zoom] type arrays
if (dest.GetSize() > 4 && dest[1].IsName() && dest[1].GetName().GetName() == "XYZ") {
const PdfPage *page = tree->GetPage(dest[0].GetReference());
if (page) {
unsigned int pagenum = page->GetPageNumber();
double left = dest[2].GetReal(), top = dest[3].GetReal();
long long zoom = dest[4].GetNumber();
const std::string &anchor = itres->first.GetName();
PyObject *key = PyUnicode_DecodeUTF8(anchor.c_str(), anchor.length(), "replace");
PyObject *tuple = Py_BuildValue("IddL", pagenum, left, top, zoom);
if (!tuple || !key) { break; }
int ret = PyDict_SetItem(ans, key, tuple);
Py_DECREF(key); Py_DECREF(tuple);
if (ret != 0) break;
}
}
}
}
}
}
}
if ((catalog = self->doc->GetCatalog()) != NULL) {
const PdfObject *dests_ref = catalog->GetDictionary().GetKey("Dests");
PdfPagesTree *tree = self->doc->GetPagesTree();
if (dests_ref && dests_ref->IsReference()) {
const PdfObject *dests_obj = self->doc->GetObjects().GetObject(dests_ref->GetReference());
if (dests_obj && dests_obj->IsDictionary()) {
const PdfDictionary &dests = dests_obj->GetDictionary();
const TKeyMap &keys = dests.GetKeys();
for (TCIKeyMap itres = keys.begin(); itres != keys.end(); ++itres) {
if (itres->second->IsArray()) {
const PdfArray &dest = itres->second->GetArray();
// see section 8.2 of PDF spec for different types of destination arrays
// but chromium apparently generates only [page /XYZ left top zoom] type arrays
if (dest.GetSize() > 4 && dest[1].IsName() && dest[1].GetName().GetName() == "XYZ") {
const PdfPage *page = tree->GetPage(dest[0].GetReference());
if (page) {
unsigned int pagenum = page->GetPageNumber();
double left = dest[2].GetReal(), top = dest[3].GetReal();
long long zoom = dest[4].GetNumber();
const std::string &anchor = itres->first.GetName();
PyObject *key = PyUnicode_DecodeUTF8(anchor.c_str(), anchor.length(), "replace");
PyObject *tuple = Py_BuildValue("IddL", pagenum, left, top, zoom);
if (!tuple || !key) { break; }
int ret = PyDict_SetItem(ans, key, tuple);
Py_DECREF(key); Py_DECREF(tuple);
if (ret != 0) break;
}
}
}
}
}
}
}
} catch(const PdfError & err) {
podofo_set_exception(err);
Py_CLEAR(ans);
@ -420,6 +420,82 @@ PDFDoc_extract_anchors(PDFDoc *self, PyObject *args) {
return ans;
} // }}}
// alter_links() {{{
// Return true only when dictionary `d` has an entry under `key` that is a
// PDF name object whose value equals `name`. A missing entry or an entry of
// any other type yields false.
template<typename T>
static inline bool
dictionary_has_key_name(PdfDictionary &d, T key, const char *name) {
    const PdfObject *entry = d.GetKey(key);
    if (!entry || !entry->IsName()) return false;
    return entry->GetName().GetName() == name;
}
// alter_links(callback, mark_links) -> number of links rewritten.
//
// Walks every object in the document looking for /Annot dictionaries with
// /Subtype /Link. When mark_links is true each such annotation gets a
// /Border of [16 16 1] and a /C (color) of [1 0 0]. For annotations whose /A
// entry is a /URI action, callback(uri) is invoked with the URI decoded as
// UTF-8. If the callback returns a 4-tuple (pagenum, left, top, zoom), with
// pagenum being 1-based, the action is removed and replaced by a
// /Dest [page /XYZ left top zoom] array. Any other return value leaves the
// annotation's action untouched.
//
// Returns NULL with a Python exception set if argument parsing fails, the
// callback raises, the returned 4-tuple has fields of the wrong type, the
// page number does not exist, or PoDoFo reports an error.
static PyObject *
PDFDoc_alter_links(PDFDoc *self, PyObject *args) {
    int count = 0;
    static const PdfName XYZ("XYZ");
    PyObject *alter_callback, *py_mark_links;
    if (!PyArg_ParseTuple(args, "OO", &alter_callback, &py_mark_links)) return NULL;
    // PyObject_IsTrue() returns -1 when evaluating truthiness raised
    const int truth = PyObject_IsTrue(py_mark_links);
    if (truth < 0) return NULL;
    const bool mark_links = truth != 0;

    try {
        PdfArray border, link_color;
        border.push_back((PoDoFo::pdf_int64)16); border.push_back((PoDoFo::pdf_int64)16); border.push_back((PoDoFo::pdf_int64)1);
        link_color.push_back(1.); link_color.push_back(0.); link_color.push_back(0.);

        for (TCIVecObjects it = self->doc->GetObjects().begin(); it != self->doc->GetObjects().end(); ++it) {
            if (!(*it)->IsDictionary()) continue;
            PdfDictionary &link = (*it)->GetDictionary();
            if (!dictionary_has_key_name(link, PdfName::KeyType, "Annot") || !dictionary_has_key_name(link, PdfName::KeySubtype, "Link")) continue;
            if (mark_links) {
                link.AddKey("Border", border);
                link.AddKey("C", link_color);
            }
            if (!link.HasKey("A") || !link.GetKey("A")->IsDictionary()) continue;
            PdfDictionary &A = link.GetKey("A")->GetDictionary();
            if (!dictionary_has_key_name(A, PdfName::KeyType, "Action") || !dictionary_has_key_name(A, "S", "URI")) continue;
            PdfObject *uo = A.GetKey("URI");
            if (!uo || !uo->IsString()) continue;

            const std::string &uri = uo->GetString().GetStringUtf8();
            PyObject *py_uri = PyUnicode_DecodeUTF8(uri.c_str(), uri.length(), "replace");
            if (!py_uri) return NULL;
            // Pass the argument directly so that no temporary argument tuple
            // is left behind after the call
            PyObject *ret = PyObject_CallFunctionObjArgs(alter_callback, py_uri, NULL);
            Py_DECREF(py_uri);
            if (!ret) return NULL;
            if (!PyTuple_Check(ret) || PyTuple_GET_SIZE(ret) != 4) {
                // The callback declined to provide a location for this link
                Py_DECREF(ret);
                continue;
            }

            int pagenum; double left, top; long long zoom;
            const bool parsed = PyArg_ParseTuple(ret, "iddL", &pagenum, &left, &top, &zoom) != 0;
            // Everything needed is now held in C values, release the tuple
            // before calling into PoDoFo, which may throw
            Py_DECREF(ret);
            // Do not continue with a Python exception pending
            if (!parsed) return NULL;

            PdfPage *page = NULL;
            try {
                page = self->doc->GetPage(pagenum - 1);
            } catch(const PdfError &) {
                PyErr_Format(PyExc_ValueError, "No page number %d in the PDF file", pagenum);
                return NULL;
            }
            if (!page) continue;

            const PdfReference &pageref = page->GetObject()->Reference();
            PdfArray dest;
            dest.push_back(pageref);
            dest.push_back(XYZ);
            dest.push_back(left);
            dest.push_back(top);
            dest.push_back((PoDoFo::pdf_int64)zoom);
            // A and uo refer into the action dictionary and must not be used
            // once it has been removed
            link.RemoveKey("A");
            link.AddKey("Dest", dest);
            ++count;
        }
    } catch(const PdfError & err) {
        podofo_set_exception(err);
        return NULL;
    }
    return Py_BuildValue("i", count);
} // }}}
// Properties {{{
static PyObject *
@ -645,6 +721,9 @@ static PyMethodDef PDFDoc_methods[] = {
{"extract_anchors", (PyCFunction)PDFDoc_extract_anchors, METH_VARARGS,
"extract_anchors() -> Extract information about links in the document."
},
{"alter_links", (PyCFunction)PDFDoc_alter_links, METH_VARARGS,
"alter_links() -> Change links in the document."
},
{"delete_page", (PyCFunction)PDFDoc_delete_page, METH_VARARGS,
"delete_page(page_num) -> Delete the specified page from the pdf (0 is the first page)."
},