Edit book: Add checks for duplicate ids in HTML/OPF/NCX files

This commit is contained in:
Kovid Goyal 2014-01-15 11:14:00 +05:30
parent e2647b735f
commit 4cf26dd397
2 changed files with 44 additions and 2 deletions

View File

@ -13,7 +13,7 @@ from calibre.ebooks.oeb.polish.utils import guess_type
from calibre.ebooks.oeb.polish.cover import is_raster_image from calibre.ebooks.oeb.polish.cover import is_raster_image
from calibre.ebooks.oeb.polish.check.base import run_checkers from calibre.ebooks.oeb.polish.check.base import run_checkers
from calibre.ebooks.oeb.polish.check.parsing import ( from calibre.ebooks.oeb.polish.check.parsing import (
check_filenames, check_xml_parsing, check_css_parsing, fix_style_tag, check_html_size) check_filenames, check_xml_parsing, check_css_parsing, fix_style_tag, check_html_size, check_ids)
from calibre.ebooks.oeb.polish.check.images import check_raster_images from calibre.ebooks.oeb.polish.check.images import check_raster_images
from calibre.ebooks.oeb.polish.check.links import check_links, check_mimetypes from calibre.ebooks.oeb.polish.check.links import check_links, check_mimetypes
from calibre.ebooks.oeb.polish.check.fonts import check_fonts from calibre.ebooks.oeb.polish.check.fonts import check_fonts
@ -60,6 +60,7 @@ def run_checks(container):
errors += check_links(container) errors += check_links(container)
errors += check_fonts(container) errors += check_fonts(container)
errors += check_filenames(container) errors += check_filenames(container)
errors += check_ids(container)
return errors return errors

View File

@ -14,7 +14,7 @@ import cssutils
from calibre import force_unicode, human_readable, prepare_string_for_xml from calibre import force_unicode, human_readable, prepare_string_for_xml
from calibre.ebooks.html_entities import html5_entities from calibre.ebooks.html_entities import html5_entities
from calibre.ebooks.oeb.polish.pretty import pretty_script_or_style as fix_style_tag from calibre.ebooks.oeb.polish.pretty import pretty_script_or_style as fix_style_tag
from calibre.ebooks.oeb.polish.utils import PositionFinder from calibre.ebooks.oeb.polish.utils import PositionFinder, guess_type
from calibre.ebooks.oeb.polish.check.base import BaseError, WARN, ERROR, INFO from calibre.ebooks.oeb.polish.check.base import BaseError, WARN, ERROR, INFO
from calibre.ebooks.oeb.base import OEB_DOCS, XHTML_NS, urlquote, URL_SAFE from calibre.ebooks.oeb.base import OEB_DOCS, XHTML_NS, urlquote, URL_SAFE
@ -237,6 +237,28 @@ class CSSError(BaseError):
pos_pats = (re.compile(r'\[(\d+):(\d+)'), re.compile(r'(\d+), (\d+)\)')) pos_pats = (re.compile(r'\[(\d+):(\d+)'), re.compile(r'(\d+), (\d+)\)'))
class DuplicateId(BaseError):
    '''
    Error describing an id value that occurs on more than one element of a
    single file.

    Instances record one location per line number they were given and, when
    called with a container, strip the id from every matching element except
    the first one found.
    '''

    # Flag read by code outside this class; presumably tells the error
    # display to use all_locations -- confirm against BaseError.
    has_multiple_locations = True

    INDIVIDUAL_FIX = _(
        'Remove the duplicate ids from all but the first element')

    def __init__(self, name, eid, locs):
        '''
        :param name: Name of the file (as known to the container) in which
            the duplicates were found.
        :param eid: The id value that is duplicated.
        :param locs: Iterable of line numbers of the elements carrying eid.
        '''
        message = _('Duplicate id: %s') % eid
        BaseError.__init__(self, message, name)
        help_template = _(
            'The id {0} is present on more than one element in {1}. This is'
            ' not allowed. Remove the id from all but one of the elements')
        self.HELP = help_template.format(eid, name)
        self.duplicate_id = eid
        # One (file name, line number, None) triple per occurrence, in
        # ascending line order.
        self.all_locations = [(name, line, None) for line in sorted(locs)]

    def __call__(self, container):
        '''
        Remove the duplicated id from all but the first element that carries
        it, mark the file dirty in the container and return True.
        '''
        kept_first = False
        for elem in container.parsed(self.name).xpath('//*[@id]'):
            if elem.get('id') != self.duplicate_id:
                continue
            if kept_first:
                elem.attrib.pop('id')
            else:
                # The first match keeps its id; every later one loses it.
                kept_first = True
        container.dirty(self.name)
        return True
class ErrorHandler(object): class ErrorHandler(object):
' Replacement logger to get useful error/warning info out of cssutils during parsing ' ' Replacement logger to get useful error/warning info out of cssutils during parsing '
@ -289,3 +311,22 @@ def check_filenames(container):
if urlquote(name) != name: if urlquote(name) != name:
errors.append(EscapedName(name)) errors.append(EscapedName(name))
return errors return errors
def check_ids(container):
    '''
    Look for id values used more than once inside a single file.

    Only files whose mime type is one of OEB_DOCS, or the type guessed for
    ``.opf`` / ``.ncx`` file names, are examined. Returns a list with one
    DuplicateId error per (file, duplicated id) pair.
    '''
    found = []
    wanted_types = set(OEB_DOCS) | {guess_type('a.opf'), guess_type('a.ncx')}
    for name, mime in container.mime_map.iteritems():
        if mime not in wanted_types:
            continue
        first_line = {}  # id -> source line of its first occurrence
        repeated = {}    # id -> source lines of every occurrence (2 or more)
        for node in container.parsed(name).xpath('//*[@id]'):
            ident = node.get('id')
            if ident not in first_line:
                first_line[ident] = node.sourceline
                continue
            # Seed the list with the first occurrence the first time a
            # repeat is seen, then record the current one.
            repeated.setdefault(ident, [first_line[ident]]).append(node.sourceline)
        for ident, lines in repeated.iteritems():
            found.append(DuplicateId(name, ident, lines))
    return found