mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 10:44:09 -04:00
Get document links working when PDF is generated in parts
This commit is contained in:
parent
7b03c7567c
commit
f0584b8fdb
@ -439,13 +439,17 @@ class Container(ContainerBase): # {{{
|
||||
using the :class:`LinkReplacer` and :class:`LinkRebaser` classes. '''
|
||||
media_type = self.mime_map.get(name, guess_type(name))
|
||||
if name == self.opf_name:
|
||||
replace_func.file_type = 'opf'
|
||||
for elem in self.opf_xpath('//*[@href]'):
|
||||
elem.set('href', replace_func(elem.get('href')))
|
||||
elif media_type.lower() in OEB_DOCS:
|
||||
replace_func.file_type = 'text'
|
||||
rewrite_links(self.parsed(name), replace_func)
|
||||
elif media_type.lower() in OEB_STYLES:
|
||||
replace_func.file_type = 'style'
|
||||
replaceUrls(self.parsed(name), replace_func)
|
||||
elif media_type.lower() == guess_type('toc.ncx'):
|
||||
replace_func.file_type = 'ncx'
|
||||
for elem in self.parsed(name).xpath('//*[@src]'):
|
||||
elem.set('src', replace_func(elem.get('src')))
|
||||
|
||||
|
@ -110,7 +110,7 @@ class TOC(object):
|
||||
return ans
|
||||
|
||||
def __str__(self):
|
||||
return b'\n'.join([x.encode('utf-8') for x in self.get_lines()])
|
||||
return '\n'.join(self.get_lines())
|
||||
|
||||
def to_dict(self, node_counter=None):
|
||||
ans = {
|
||||
|
@ -29,6 +29,7 @@ from calibre.utils.logging import default_log
|
||||
from calibre.utils.podofo import get_podofo, set_metadata_implementation
|
||||
from calibre.utils.short_uuid import uuid4
|
||||
from polyglot.builtins import iteritems, range
|
||||
from polyglot.urllib import urlparse
|
||||
|
||||
OK, LOAD_FAILED, KILL_SIGNAL = range(0, 3)
|
||||
|
||||
@ -205,15 +206,12 @@ def add_anchors_markup(root, uuid, anchors):
|
||||
div[-1].tail = ' '
|
||||
|
||||
|
||||
def add_toc_links(container, toc, margin_groups):
|
||||
# TODO: Change this to work for all anchors so it can be used to fix
|
||||
# arbitrary links
|
||||
def add_all_links(container, margin_groups):
|
||||
uuid = uuid4()
|
||||
name_anchor_map = {}
|
||||
for item in toc.iterdescendants():
|
||||
if item.dest and item.frag:
|
||||
anchors = name_anchor_map.setdefault(item.dest, set())
|
||||
anchors.add(item.frag)
|
||||
for name, is_linear in container.spine_names:
|
||||
root = container.parsed(name)
|
||||
name_anchor_map[name] = frozenset(root.xpath('//*/@id'))
|
||||
for group in margin_groups:
|
||||
name = group[0][0]
|
||||
anchors = name_anchor_map.get(name, set())
|
||||
@ -226,12 +224,15 @@ def make_anchors_unique(container):
|
||||
mapping = {}
|
||||
count = 0
|
||||
base = None
|
||||
spine_names = set()
|
||||
|
||||
def replacer(url):
|
||||
if replacer.file_type != 'text':
|
||||
return url
|
||||
if not url:
|
||||
return url
|
||||
if '#' not in url:
|
||||
return url
|
||||
url += '#'
|
||||
if url.startswith('#'):
|
||||
href, frag = base, url[1:]
|
||||
else:
|
||||
@ -239,16 +240,21 @@ def make_anchors_unique(container):
|
||||
name = container.href_to_name(href, base)
|
||||
if not name:
|
||||
return url
|
||||
if not frag and name in spine_names:
|
||||
replacer.replaced = True
|
||||
return 'https://calibre-pdf-anchor.n#' + name
|
||||
key = name, frag
|
||||
new_frag = mapping.get(key)
|
||||
if new_frag is None:
|
||||
return url
|
||||
replacer.replaced = True
|
||||
return 'https://calibre-pdf-anchor.a#' + new_frag
|
||||
if url.startswith('#'):
|
||||
return '#' + new_frag
|
||||
return href + '#' + new_frag
|
||||
|
||||
for spine_name, is_linear in container.spine_names:
|
||||
spine_names.add(spine_name)
|
||||
root = container.parsed(spine_name)
|
||||
for elem in root.xpath('//*[@id]'):
|
||||
count += 1
|
||||
@ -279,23 +285,48 @@ def get_anchor_locations(pdf_doc, first_page_num, toc_uuid):
|
||||
return ans
|
||||
|
||||
|
||||
def fix_links(pdf_doc, anchor_locations, name_page_numbers, mark_links, log):
    """Hand ``pdf_doc.alter_links`` a resolver for calibre placeholder URLs.

    Two placeholder hosts are recognised (both must use the ``https``
    scheme); in each case the URL fragment carries the lookup key:

    * ``calibre-pdf-anchor.a`` -- the fragment is looked up in
      ``anchor_locations`` and the stored value is returned as is.
    * ``calibre-pdf-anchor.n`` -- the fragment is looked up in
      ``name_page_numbers`` and a hit is turned into
      ``AnchorLocation(page_number, 0, 0, 0)``.

    :param pdf_doc: object exposing ``alter_links(callback, mark_links)``
    :param anchor_locations: mapping of fragment -> location
    :param name_page_numbers: mapping of fragment -> page number
    :param mark_links: forwarded unchanged to ``pdf_doc.alter_links``
    :param log: logger; ``log.warn`` is called when a lookup misses
    """
    anchor_host = 'calibre-pdf-anchor.a'
    name_host = 'calibre-pdf-anchor.n'

    def replace_link(url):
        parts = urlparse(url)
        # Anything that is not one of our placeholder URLs is left alone.
        if parts.scheme != 'https' or parts.netloc not in (anchor_host, name_host):
            return None
        target = parts.fragment
        if parts.netloc == anchor_host:
            location = anchor_locations.get(target)
        else:
            page_number = name_page_numbers.get(target)
            location = None if page_number is None else AnchorLocation(page_number, 0, 0, 0)
        if location is None:
            # Both kinds of miss are reported with the same message.
            log.warn('Anchor location for link to {} not found'.format(target))
        return location

    pdf_doc.alter_links(replace_link, mark_links)
|
||||
|
||||
|
||||
def convert(opf_path, opts, metadata=None, output_path=None, log=default_log, cover_data=None):
|
||||
container = Container(opf_path, log)
|
||||
make_anchors_unique(container)
|
||||
margin_groups = create_margin_groups(container)
|
||||
links_page_uuid = add_all_links(container, margin_groups)
|
||||
toc = get_toc(container)
|
||||
toc_uuid = add_toc_links(container, toc, margin_groups)
|
||||
(toc)
|
||||
container.commit()
|
||||
|
||||
renderer = Renderer(opts)
|
||||
page_layout = get_page_layout(opts)
|
||||
pdf_doc = None
|
||||
anchor_locations = {}
|
||||
name_page_numbers = {}
|
||||
num_pages = 0
|
||||
for group in margin_groups:
|
||||
name, margins = group[0]
|
||||
name_page_numbers[name] = num_pages + 1
|
||||
doc = render_name(container, name, margins, renderer, page_layout)
|
||||
anchor_locations.update(get_anchor_locations(doc, num_pages + 1, toc_uuid))
|
||||
anchor_locations.update(get_anchor_locations(doc, num_pages + 1, links_page_uuid))
|
||||
num_pages += doc.page_count()
|
||||
|
||||
if pdf_doc is None:
|
||||
@ -303,7 +334,7 @@ def convert(opf_path, opts, metadata=None, output_path=None, log=default_log, co
|
||||
else:
|
||||
pdf_doc.append(doc)
|
||||
|
||||
# TODO: Fix links using anchor_locations
|
||||
fix_links(pdf_doc, anchor_locations, name_page_numbers, opts.pdf_mark_links, log)
|
||||
|
||||
if cover_data:
|
||||
add_cover(pdf_doc, cover_data, page_layout, opts)
|
||||
|
@ -374,39 +374,39 @@ PDFDoc_extract_anchors(PDFDoc *self, PyObject *args) {
|
||||
PyObject *ans = PyDict_New();
|
||||
if (ans == NULL) return NULL;
|
||||
try {
|
||||
if ((catalog = self->doc->GetCatalog()) != NULL) {
|
||||
const PdfObject *dests_ref = catalog->GetDictionary().GetKey("Dests");
|
||||
PdfPagesTree *tree = self->doc->GetPagesTree();
|
||||
if (dests_ref && dests_ref->IsReference()) {
|
||||
const PdfObject *dests_obj = self->doc->GetObjects().GetObject(dests_ref->GetReference());
|
||||
if (dests_obj && dests_obj->IsDictionary()) {
|
||||
const PdfDictionary &dests = dests_obj->GetDictionary();
|
||||
const TKeyMap &keys = dests.GetKeys();
|
||||
for (TCIKeyMap itres = keys.begin(); itres != keys.end(); ++itres) {
|
||||
if (itres->second->IsArray()) {
|
||||
const PdfArray &dest = itres->second->GetArray();
|
||||
// see section 8.2 of PDF spec for different types of destination arrays
|
||||
// but chromium apparently generates only [page /XYZ left top zoom] type arrays
|
||||
if (dest.GetSize() > 4 && dest[1].IsName() && dest[1].GetName().GetName() == "XYZ") {
|
||||
const PdfPage *page = tree->GetPage(dest[0].GetReference());
|
||||
if (page) {
|
||||
unsigned int pagenum = page->GetPageNumber();
|
||||
double left = dest[2].GetReal(), top = dest[3].GetReal();
|
||||
long long zoom = dest[4].GetNumber();
|
||||
const std::string &anchor = itres->first.GetName();
|
||||
PyObject *key = PyUnicode_DecodeUTF8(anchor.c_str(), anchor.length(), "replace");
|
||||
PyObject *tuple = Py_BuildValue("IddL", pagenum, left, top, zoom);
|
||||
if (!tuple || !key) { break; }
|
||||
int ret = PyDict_SetItem(ans, key, tuple);
|
||||
Py_DECREF(key); Py_DECREF(tuple);
|
||||
if (ret != 0) break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if ((catalog = self->doc->GetCatalog()) != NULL) {
|
||||
const PdfObject *dests_ref = catalog->GetDictionary().GetKey("Dests");
|
||||
PdfPagesTree *tree = self->doc->GetPagesTree();
|
||||
if (dests_ref && dests_ref->IsReference()) {
|
||||
const PdfObject *dests_obj = self->doc->GetObjects().GetObject(dests_ref->GetReference());
|
||||
if (dests_obj && dests_obj->IsDictionary()) {
|
||||
const PdfDictionary &dests = dests_obj->GetDictionary();
|
||||
const TKeyMap &keys = dests.GetKeys();
|
||||
for (TCIKeyMap itres = keys.begin(); itres != keys.end(); ++itres) {
|
||||
if (itres->second->IsArray()) {
|
||||
const PdfArray &dest = itres->second->GetArray();
|
||||
// see section 8.2 of PDF spec for different types of destination arrays
|
||||
// but chromium apparently generates only [page /XYZ left top zoom] type arrays
|
||||
if (dest.GetSize() > 4 && dest[1].IsName() && dest[1].GetName().GetName() == "XYZ") {
|
||||
const PdfPage *page = tree->GetPage(dest[0].GetReference());
|
||||
if (page) {
|
||||
unsigned int pagenum = page->GetPageNumber();
|
||||
double left = dest[2].GetReal(), top = dest[3].GetReal();
|
||||
long long zoom = dest[4].GetNumber();
|
||||
const std::string &anchor = itres->first.GetName();
|
||||
PyObject *key = PyUnicode_DecodeUTF8(anchor.c_str(), anchor.length(), "replace");
|
||||
PyObject *tuple = Py_BuildValue("IddL", pagenum, left, top, zoom);
|
||||
if (!tuple || !key) { break; }
|
||||
int ret = PyDict_SetItem(ans, key, tuple);
|
||||
Py_DECREF(key); Py_DECREF(tuple);
|
||||
if (ret != 0) break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch(const PdfError & err) {
|
||||
podofo_set_exception(err);
|
||||
Py_CLEAR(ans);
|
||||
@ -420,6 +420,82 @@ PDFDoc_extract_anchors(PDFDoc *self, PyObject *args) {
|
||||
return ans;
|
||||
} // }}}
|
||||
|
||||
// alter_links() {{{
|
||||
|
||||
template<typename T>
|
||||
static inline bool
|
||||
dictionary_has_key_name(PdfDictionary &d, T key, const char *name) {
|
||||
const PdfObject *val = d.GetKey(key);
|
||||
if (val && val->IsName() && val->GetName().GetName() == name) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
// alter_links(callback, mark_links) -> number of links retargeted.
//
// Walks every object in the document looking for link annotations
// (/Type /Annot, /Subtype /Link). When mark_links is true each such
// annotation gets a /Border and a red /C colour so it is visible. For
// annotations whose action is a URI action, callback(uri) is invoked with the
// URI decoded as UTF-8 (undecodable bytes replaced). If the callback returns
// a 4-tuple (pagenum, left, top, zoom), with pagenum 1-based, the URI action
// is removed and replaced by an explicit [page /XYZ left top zoom]
// destination. Any other return value leaves the link untouched.
//
// Returns NULL with a Python exception set when argument parsing fails, the
// callback raises, the returned 4-tuple has the wrong element types, the page
// lookup throws, or PoDoFo reports an error.
static PyObject *
PDFDoc_alter_links(PDFDoc *self, PyObject *args) {
    int count = 0;
    static const PdfName XYZ("XYZ");
    PyObject *alter_callback, *py_mark_links;
    if (!PyArg_ParseTuple(args, "OO", &alter_callback, &py_mark_links)) return NULL;
    // PyObject_IsTrue returns -1 on failure; do not let that collapse to "true".
    const int truth = PyObject_IsTrue(py_mark_links);
    if (truth < 0) return NULL;
    const bool mark_links = truth > 0;
    try {
        PdfArray border, link_color;
        border.push_back((PoDoFo::pdf_int64)16); border.push_back((PoDoFo::pdf_int64)16); border.push_back((PoDoFo::pdf_int64)1);
        link_color.push_back(1.); link_color.push_back(0.); link_color.push_back(0.);
        for (TCIVecObjects it = self->doc->GetObjects().begin(); it != self->doc->GetObjects().end(); it++) {
            if (!(*it)->IsDictionary()) continue;
            PdfDictionary &link = (*it)->GetDictionary();
            if (!dictionary_has_key_name(link, PdfName::KeyType, "Annot") || !dictionary_has_key_name(link, PdfName::KeySubtype, "Link")) continue;
            if (mark_links) {
                link.AddKey("Border", border);
                link.AddKey("C", link_color);
            }
            if (!link.HasKey("A") || !link.GetKey("A")->IsDictionary()) continue;
            PdfDictionary &A = link.GetKey("A")->GetDictionary();
            if (!dictionary_has_key_name(A, PdfName::KeyType, "Action") || !dictionary_has_key_name(A, "S", "URI")) continue;
            PdfObject *uo = A.GetKey("URI");
            if (!uo || !uo->IsString()) continue;
            // Copy the URI: the action dictionary that owns it may be removed below.
            const std::string uri(uo->GetString().GetStringUtf8());

            PyObject *py_uri = PyUnicode_DecodeUTF8(uri.c_str(), uri.length(), "replace");
            if (!py_uri) return NULL;
            PyObject *ret = PyObject_CallFunctionObjArgs(alter_callback, py_uri, NULL);
            Py_DECREF(py_uri);
            if (!ret) return NULL;

            // Pull everything we need out of the result and release it straight
            // away, so that no PoDoFo exception below can leak the reference.
            const bool is_location = PyTuple_Check(ret) && PyTuple_GET_SIZE(ret) == 4;
            int pagenum = 0; double left = 0, top = 0; long long zoom = 0;
            const bool parsed = is_location && PyArg_ParseTuple(ret, "iddL", &pagenum, &left, &top, &zoom);
            Py_DECREF(ret);
            if (!is_location) continue;
            // A 4-tuple with the wrong element types: PyArg_ParseTuple has set
            // the exception, so propagate it instead of carrying on.
            if (!parsed) return NULL;

            PdfPage *page = NULL;
            try {
                page = self->doc->GetPage(pagenum - 1);
            } catch(const PdfError &) {
                PyErr_Format(PyExc_ValueError, "No page number %d in the PDF file", pagenum);
                return NULL;
            }
            if (!page) continue;
            const PdfReference &pageref = page->GetObject()->Reference();
            PdfArray dest;
            dest.push_back(pageref);
            dest.push_back(XYZ);
            dest.push_back(left);
            dest.push_back(top);
            dest.push_back((PoDoFo::pdf_int64)zoom);
            link.RemoveKey("A");
            link.AddKey("Dest", dest);
            count++;
        }
    } catch(const PdfError & err) {
        podofo_set_exception(err);
        return NULL;
    }
    return Py_BuildValue("i", count);
} // }}}
|
||||
|
||||
// Properties {{{
|
||||
|
||||
static PyObject *
|
||||
@ -645,6 +721,9 @@ static PyMethodDef PDFDoc_methods[] = {
|
||||
{"extract_anchors", (PyCFunction)PDFDoc_extract_anchors, METH_VARARGS,
|
||||
"extract_anchors() -> Extract information about links in the document."
|
||||
},
|
||||
{"alter_links", (PyCFunction)PDFDoc_alter_links, METH_VARARGS,
|
||||
"alter_links() -> Change links in the document."
|
||||
},
|
||||
{"delete_page", (PyCFunction)PDFDoc_delete_page, METH_VARARGS,
|
||||
"delete_page(page_num) -> Delete the specified page from the pdf (0 is the first page)."
|
||||
},
|
||||
|
Loading…
x
Reference in New Issue
Block a user