Check all links in the book

2025-07-09 03:04:10 -04:00 · 2013-12-09 15:54:21 +05:30 · 2013-12-09 15:54:21 +05:30 · b44426f170
commit b44426f170
parent 3b7c0e9ac6
3 changed files with 75 additions and 1 deletions
--- a/src/calibre/ebooks/oeb/polish/check/links.py
+++ b/src/calibre/ebooks/oeb/polish/check/links.py
@ -0,0 +1,61 @@
 #!/usr/bin/env python
 # vim:fileencoding=utf-8
 from __future__ import (unicode_literals, division, absolute_import,
                        print_function)
 __license__ = 'GPL v3'
 __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
 from collections import defaultdict
 from urlparse import urlparse
 from calibre.ebooks.oeb.base import OEB_DOCS, OEB_STYLES
 from calibre.ebooks.oeb.polish.container import guess_type
 from calibre.ebooks.oeb.polish.check.base import BaseError, WARN
 class BadLink(BaseError):
    ''' Error for a link whose target resource does not exist in the book.
    Reported at the WARN level. '''
    level = WARN
    HELP = _('The resource pointed to by this link does not exist.'
             ' You should either fix, or remove the link.')
 class FileLink(BadLink):
    ''' Error for a link that uses the file:// URL scheme. Only the help
    text differs from BadLink. '''
    HELP = _('This link uses the file:// URL scheme.'
             ' This does not work with many ebook readers.'
             ' Remove the file:// prefix and make sure the link points to'
             ' a file inside the book.')
 class LocalLink(BadLink):
    ''' Error for a link that points to an absolute path outside the book.
    Only the help text differs from BadLink. '''
    HELP = _('This link points to a file outside the book.'
             ' It will not work if the book is read on any computer other than'
             ' the one it was created on.'
             ' Either fix or remove the link.')
 def check_links(container):
    '''
    Check the links in every HTML, CSS, OPF and NCX file of the book.

    For each link yielded by ``container.iterlinks(name)``:

      * if ``container.href_to_name()`` maps it to a file name that does not
        exist in the container, a :class:`BadLink` is reported
      * otherwise, if it could not be mapped to a name at all (``None``) and
        it uses the ``file`` scheme, a :class:`FileLink` is reported
      * otherwise, if it has no scheme and an absolute path, a
        :class:`LocalLink` is reported

    :param container: The book container; must provide ``mime_map``,
        ``iterlinks()``, ``href_to_name()`` and ``exists()``.
    :return: A list of error objects, each carrying the message, the name of
        the file containing the link and the line/column of the link.
    '''
    xml_types = {guess_type('a.opf'), guess_type('a.ncx')}
    errors = []
    a = errors.append
    def fl(x):
        # Quote the href for display, dropping the ``u`` prefix that the
        # repr() of a python 2 unicode string carries.
        x = repr(x)
        if x.startswith('u'):
            x = x[1:]
        return x
    for name, mt in container.mime_map.iteritems():
        if not (mt in OEB_DOCS or mt in OEB_STYLES or mt in xml_types):
            # Only these file types are scanned for links
            continue
        for href, lnum, col in container.iterlinks(name):
            tname = container.href_to_name(href, name)
            if tname is not None:
                # The link resolves to a name inside the book, so the only
                # possible problem is that the target is missing.
                if not container.exists(tname):
                    a(BadLink(_('The linked resource %s does not exist') % fl(href), name, lnum, col))
                continue
            purl = urlparse(href)
            if purl.scheme == 'file':
                a(FileLink(_('The link %s is a file:// URL') % fl(href), name, lnum, col))
            elif purl.scheme == '' and purl.path.startswith('/'):
                # file:// URLs were handled above, so only scheme-less
                # absolute paths can reach this branch.
                a(LocalLink(_('The link %s points to a file outside the book') % fl(href), name, lnum, col))
    return errors
--- a/src/calibre/ebooks/oeb/polish/check/main.py
+++ b/src/calibre/ebooks/oeb/polish/check/main.py
@ -14,6 +14,7 @@ from calibre.ebooks.oeb.polish.cover import is_raster_image
 from calibre.ebooks.oeb.polish.check.base import run_checkers
 from calibre.ebooks.oeb.polish.check.parsing import check_xml_parsing
 from calibre.ebooks.oeb.polish.check.images import check_raster_images
 from calibre.ebooks.oeb.polish.check.links import check_links
 XML_TYPES = frozenset(map(guess_type, ('a.xml', 'a.svg', 'a.opf', 'a.ncx')))
@ -37,6 +38,8 @@ def run_checks(container):
    errors.extend(run_checkers(check_xml_parsing, html_items))
    errors.extend(run_checkers(check_raster_images, raster_images))
    errors += check_links(container)
    return errors
 def fix_errors(container, errors):
--- a/src/calibre/ebooks/oeb/polish/container.py
+++ b/src/calibre/ebooks/oeb/polish/container.py
@ -9,6 +9,7 @@ __docformat__ = 'restructuredtext en'
 import os, logging, sys, hashlib, uuid, re, shutil
 from collections import defaultdict
 from bisect import bisect
 from io import BytesIO
 from urlparse import urlparse
 from future_builtins import zip
@ -293,8 +294,17 @@ class Container(object):  # {{{
            if get_line_numbers:
                with self.open(name) as f:
                    raw = self.decode(f.read())
                    new_lines = tuple(m.start() + 1 for m in re.finditer(r'\n', raw))
                    def position(pos):
                        # Convert a character offset into raw into a
                        # (line number, column) pair. Line numbers start at 1,
                        # columns are offsets from the start of the line.
                        # new_lines[i] is the offset of the first character
                        # after the (i+1)-th newline, so bisect() counts the
                        # line starts at or before pos.
                        lnum = bisect(new_lines, pos)
                        if lnum == 0:
                            # pos is on the first line. Indexing with
                            # lnum - 1 == -1 would not raise IndexError, it
                            # would silently use the start of the last line
                            # and produce a bogus column.
                            offset = pos
                        else:
                            offset = pos - new_lines[lnum - 1]
                        return (lnum + 1, offset)
                    for link, offset in itercsslinks(raw):
-                        yield link, 0, offset
+                        lnum, col = position(offset)
                        yield link, lnum, col
            else:
                for link in getUrls(self.parsed(name)):
                    yield link