Check all links in the book

This commit is contained in:
Kovid Goyal 2013-12-09 15:54:21 +05:30
parent 3b7c0e9ac6
commit b44426f170
3 changed files with 75 additions and 1 deletions

View File

@ -0,0 +1,61 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
from collections import defaultdict
from urlparse import urlparse
from calibre.ebooks.oeb.base import OEB_DOCS, OEB_STYLES
from calibre.ebooks.oeb.polish.container import guess_type
from calibre.ebooks.oeb.polish.check.base import BaseError, WARN
class BadLink(BaseError):
    '''Error reported for a link whose target cannot be found in the book.

    Reported at the WARN level.
    '''

    level = WARN
    HELP = _(
        'The resource pointed to by this link does not exist.'
        ' You should either fix, or remove the link.')
class FileLink(BadLink):
    '''Error reported for a link that uses the file:// URL scheme.

    Only the help text differs from BadLink; everything else is inherited.
    '''

    HELP = _(
        'This link uses the file:// URL scheme.'
        ' This does not work with many ebook readers.'
        ' Remove the file:// prefix and make sure the link points to a file'
        ' inside the book.')
class LocalLink(BadLink):
    '''Error reported for a link to an absolute path outside the book.

    Only the help text differs from BadLink; everything else is inherited.
    '''

    HELP = _(
        'This link points to a file outside the book.'
        ' It will not work if the book is read on any computer other than the'
        ' one it was created on.'
        ' Either fix or remove the link.')
def check_links(container):
    '''Check the links in the book and report the ones that will not work.

    Every file in ``container.mime_map`` whose media type is in OEB_DOCS or
    OEB_STYLES, or is the guessed type of an OPF or NCX file, is scanned with
    ``container.iterlinks()``. Three kinds of problems are reported:

    * BadLink: the link maps to a name inside the container
      (``href_to_name()`` returned a name) but that name does not exist.
    * FileLink: the link could not be mapped to a name and uses the
      ``file`` URL scheme.
    * LocalLink: the link could not be mapped to a name, has neither a
      scheme nor a network location, and its path is absolute.

    :param container: The book container to check.
    :return: A list of error objects, one per problematic link, each created
        with the message, the name of the file containing the link, and the
        line number and column of the link.
    '''
    # NOTE(review): links_map (target name -> set of names linking to it) is
    # filled in below but nothing in this function reads it yet.
    links_map = defaultdict(set)
    xml_types = {guess_type('a.opf'), guess_type('a.ncx')}
    errors = []
    a = errors.append

    def fl(x):
        # Format a link for display in a message: use its repr() and drop
        # the leading u that Python 2 puts in front of unicode strings.
        x = repr(x)
        if x.startswith('u'):
            x = x[1:]
        return x

    for name, mt in container.mime_map.iteritems():
        if not (mt in OEB_DOCS or mt in OEB_STYLES or mt in xml_types):
            continue
        for href, lnum, col in container.iterlinks(name):
            tname = container.href_to_name(href, name)
            if tname is not None:
                if container.exists(tname):
                    links_map[tname].add(name)
                else:
                    a(BadLink(_('The linked resource %s does not exist') % fl(href), name, lnum, col))
                continue

            # The link could not be mapped to a name in the container, so
            # inspect the URL itself.
            purl = urlparse(href)
            if purl.scheme == 'file':
                a(FileLink(_('The link %s is a file:// URL') % fl(href), name, lnum, col))
            elif not purl.scheme and not purl.netloc and purl.path.startswith('/'):
                # Requiring an empty netloc keeps scheme-less network URLs
                # such as //example.com/x from being reported as files on
                # the local computer.
                a(LocalLink(_('The link %s points to a file outside the book') % fl(href), name, lnum, col))
    return errors

View File

@ -14,6 +14,7 @@ from calibre.ebooks.oeb.polish.cover import is_raster_image
from calibre.ebooks.oeb.polish.check.base import run_checkers from calibre.ebooks.oeb.polish.check.base import run_checkers
from calibre.ebooks.oeb.polish.check.parsing import check_xml_parsing from calibre.ebooks.oeb.polish.check.parsing import check_xml_parsing
from calibre.ebooks.oeb.polish.check.images import check_raster_images from calibre.ebooks.oeb.polish.check.images import check_raster_images
from calibre.ebooks.oeb.polish.check.links import check_links
XML_TYPES = frozenset(map(guess_type, ('a.xml', 'a.svg', 'a.opf', 'a.ncx'))) XML_TYPES = frozenset(map(guess_type, ('a.xml', 'a.svg', 'a.opf', 'a.ncx')))
@ -37,6 +38,8 @@ def run_checks(container):
errors.extend(run_checkers(check_xml_parsing, html_items)) errors.extend(run_checkers(check_xml_parsing, html_items))
errors.extend(run_checkers(check_raster_images, raster_images)) errors.extend(run_checkers(check_raster_images, raster_images))
errors += check_links(container)
return errors return errors
def fix_errors(container, errors): def fix_errors(container, errors):

View File

@ -9,6 +9,7 @@ __docformat__ = 'restructuredtext en'
import os, logging, sys, hashlib, uuid, re, shutil import os, logging, sys, hashlib, uuid, re, shutil
from collections import defaultdict from collections import defaultdict
from bisect import bisect
from io import BytesIO from io import BytesIO
from urlparse import urlparse from urlparse import urlparse
from future_builtins import zip from future_builtins import zip
@ -293,8 +294,17 @@ class Container(object): # {{{
if get_line_numbers: if get_line_numbers:
with self.open(name) as f: with self.open(name) as f:
raw = self.decode(f.read()) raw = self.decode(f.read())
new_lines = tuple(m.start() + 1 for m in re.finditer(r'\n', raw))
def position(pos):
    # Convert a character offset into the decoded text to a pair of
    # (1-based line number, 0-based offset from the start of that line).
    # new_lines holds the offset of the first character after each newline.
    lnum = bisect(new_lines, pos)
    # lnum == 0 means pos lies before every recorded line start, i.e. on the
    # first line. Indexing new_lines[lnum - 1] in that case would wrap around
    # to the last line start rather than raise IndexError, producing a bogus
    # column, so the first line is handled explicitly.
    line_start = new_lines[lnum - 1] if lnum else 0
    return (lnum + 1, pos - line_start)
for link, offset in itercsslinks(raw): for link, offset in itercsslinks(raw):
yield link, 0, offset lnum, col = position(offset)
yield link, lnum, col
else: else:
for link in getUrls(self.parsed(name)): for link in getUrls(self.parsed(name)):
yield link yield link