From b44426f17072fd3f5fde0768ab4a979b82df5c70 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Mon, 9 Dec 2013 15:54:21 +0530
Subject: [PATCH] Check all links in the book

---
Note: links.py below was edited after export, so the post-image blob id on
its index line no longer matches the content.

 src/calibre/ebooks/oeb/polish/check/links.py | 73 ++++++++++++++++++++
 src/calibre/ebooks/oeb/polish/check/main.py  |  3 +
 src/calibre/ebooks/oeb/polish/container.py   | 12 +++-
 3 files changed, 87 insertions(+), 1 deletion(-)
 create mode 100644 src/calibre/ebooks/oeb/polish/check/links.py

diff --git a/src/calibre/ebooks/oeb/polish/check/links.py b/src/calibre/ebooks/oeb/polish/check/links.py
new file mode 100644
index 0000000000..68b0f75793
--- /dev/null
+++ b/src/calibre/ebooks/oeb/polish/check/links.py
@@ -0,0 +1,73 @@
+#!/usr/bin/env python
+# vim:fileencoding=utf-8
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal '
+
+from collections import defaultdict
+from urlparse import urlparse
+
+from calibre.ebooks.oeb.base import OEB_DOCS, OEB_STYLES
+from calibre.ebooks.oeb.polish.container import guess_type
+from calibre.ebooks.oeb.polish.check.base import BaseError, WARN
+
+class BadLink(BaseError):
+    # Created by check_links() for a link whose target name is not in the book
+
+    HELP = _('The resource pointed to by this link does not exist. You should'
+             ' either fix, or remove the link.')
+    level = WARN
+
+class FileLink(BadLink):
+    # Created by check_links() for a link that uses the file:// URL scheme
+
+    HELP = _('This link uses the file:// URL scheme. This does not work with many ebook readers.'
+             ' Remove the file:// prefix and make sure the link points to a file inside the book.')
+
+class LocalLink(BadLink):
+    # Created by check_links() for an absolute path with no scheme and no host
+
+    HELP = _('This link points to a file outside the book. It will not work if the'
+             ' book is read on any computer other than the one it was created on.'
+             ' Either fix or remove the link.')
+
+def check_links(container):
+    ''' Check the links in every file whose media type is in OEB_DOCS or
+    OEB_STYLES, and in the OPF and NCX files.
+
+    Returns a list of BadLink, FileLink and LocalLink errors, each carrying
+    the file name, line number and column reported by container.iterlinks(). '''
+    # Maps each existing link target to the names of the files linking to it.
+    # Populated here but not read anywhere in this function.
+    links_map = defaultdict(set)
+    xml_types = {guess_type('a.opf'), guess_type('a.ncx')}
+    errors = []
+    a = errors.append
+
+    def fl(x):
+        # Quote the link with repr(), dropping a leading u (unicode literal prefix)
+        x = repr(x)
+        if x.startswith('u'):
+            x = x[1:]
+        return x
+
+    for name, mt in container.mime_map.iteritems():
+        if mt in OEB_DOCS or mt in OEB_STYLES or mt in xml_types:
+            for href, lnum, col in container.iterlinks(name):
+                tname = container.href_to_name(href, name)
+                if tname is not None:
+                    if container.exists(tname):
+                        links_map[tname].add(name)
+                    else:
+                        a(BadLink(_('The linked resource %s does not exist') % fl(href), name, lnum, col))
+                else:
+                    purl = urlparse(href)
+                    if purl.scheme == 'file':
+                        a(FileLink(_('The link %s is a file:// URL') % fl(href), name, lnum, col))
+                    elif not purl.scheme and not purl.netloc and purl.path.startswith('/'):
+                        # A netloc means a //host/path URL, which names a remote host, not a local file
+                        a(LocalLink(_('The link %s points to a file outside the book') % fl(href), name, lnum, col))
+
+    return errors
diff --git a/src/calibre/ebooks/oeb/polish/check/main.py b/src/calibre/ebooks/oeb/polish/check/main.py
index 93e5a65975..f6db7d2e7c 100644
--- a/src/calibre/ebooks/oeb/polish/check/main.py
+++ b/src/calibre/ebooks/oeb/polish/check/main.py
@@ -14,6 +14,7 @@ from calibre.ebooks.oeb.polish.cover import is_raster_image
 from calibre.ebooks.oeb.polish.check.base import run_checkers
 from calibre.ebooks.oeb.polish.check.parsing import check_xml_parsing
 from calibre.ebooks.oeb.polish.check.images import check_raster_images
+from calibre.ebooks.oeb.polish.check.links import check_links
 
 XML_TYPES = frozenset(map(guess_type, ('a.xml', 'a.svg', 'a.opf', 'a.ncx')))
 
@@ -37,6 +38,8 @@ def run_checks(container):
     errors.extend(run_checkers(check_xml_parsing, html_items))
     errors.extend(run_checkers(check_raster_images, raster_images))
 
+    errors += check_links(container)
+
     return errors
 
 def fix_errors(container, errors):
diff --git a/src/calibre/ebooks/oeb/polish/container.py b/src/calibre/ebooks/oeb/polish/container.py
index 5f0ab3c07d..c6adc01c0f 100644
--- a/src/calibre/ebooks/oeb/polish/container.py
+++ b/src/calibre/ebooks/oeb/polish/container.py
@@ -9,6 +9,7 @@ __docformat__ = 'restructuredtext en'
 
 import os, logging, sys, hashlib, uuid, re, shutil
 from collections import defaultdict
+from bisect import bisect
 from io import BytesIO
 from urlparse import urlparse
 from future_builtins import zip
@@ -293,8 +294,17 @@ class Container(object): # {{{
             if get_line_numbers:
                 with self.open(name) as f:
                     raw = self.decode(f.read())
+                    new_lines = tuple(m.start() + 1 for m in re.finditer(r'\n', raw))
+                    def position(pos):
+                        # Map a character offset in raw to (line number, column)
+                        lnum = bisect(new_lines, pos)
+                        # lnum == 0 means pos is on the first line, where
+                        # new_lines[-1] would wrap around to the last line start
+                        line_start = new_lines[lnum - 1] if lnum else 0
+                        return (lnum + 1, pos - line_start)
                     for link, offset in itercsslinks(raw):
-                        yield link, 0, offset
+                        lnum, col = position(offset)
+                        yield link, lnum, col
             else:
                 for link in getUrls(self.parsed(name)):
                     yield link