Edit book: Add checks for duplicate ids in HTML/OPF/NCX files

2025-07-09 03:04:10 -04:00 · 2014-01-15 11:14:00 +05:30 · 2014-01-15 11:14:00 +05:30 · 4cf26dd397
commit 4cf26dd397
parent e2647b735f
2 changed files with 44 additions and 2 deletions
--- a/src/calibre/ebooks/oeb/polish/check/main.py
+++ b/src/calibre/ebooks/oeb/polish/check/main.py
@ -13,7 +13,7 @@ from calibre.ebooks.oeb.polish.utils import guess_type
 from calibre.ebooks.oeb.polish.cover import is_raster_image
 from calibre.ebooks.oeb.polish.check.base import run_checkers
 from calibre.ebooks.oeb.polish.check.parsing import (
-    check_filenames, check_xml_parsing, check_css_parsing, fix_style_tag, check_html_size)
+    check_filenames, check_xml_parsing, check_css_parsing, fix_style_tag, check_html_size, check_ids)
 from calibre.ebooks.oeb.polish.check.images import check_raster_images
 from calibre.ebooks.oeb.polish.check.links import check_links, check_mimetypes
 from calibre.ebooks.oeb.polish.check.fonts import check_fonts
@ -60,6 +60,7 @@ def run_checks(container):
    errors += check_links(container)
    errors += check_fonts(container)
    errors += check_filenames(container)
    errors += check_ids(container)
    return errors
--- a/src/calibre/ebooks/oeb/polish/check/parsing.py
+++ b/src/calibre/ebooks/oeb/polish/check/parsing.py
@ -14,7 +14,7 @@ import cssutils
 from calibre import force_unicode, human_readable, prepare_string_for_xml
 from calibre.ebooks.html_entities import html5_entities
 from calibre.ebooks.oeb.polish.pretty import pretty_script_or_style as fix_style_tag
-from calibre.ebooks.oeb.polish.utils import PositionFinder
+from calibre.ebooks.oeb.polish.utils import PositionFinder, guess_type
 from calibre.ebooks.oeb.polish.check.base import BaseError, WARN, ERROR, INFO
 from calibre.ebooks.oeb.base import OEB_DOCS, XHTML_NS, urlquote, URL_SAFE
@ -237,6 +237,28 @@ class CSSError(BaseError):
 pos_pats = (re.compile(r'\[(\d+):(\d+)'), re.compile(r'(\d+), (\d+)\)'))
 class DuplicateId(BaseError):
    ' Error for an id value that is present on more than one element of a single file '
    # This error points at several places in the file, one per offending element
    has_multiple_locations = True
    INDIVIDUAL_FIX = _(
        'Remove the duplicate ids from all but the first element')
    def __init__(self, name, eid, locs):
        ' name: the file containing the duplicates, eid: the repeated id, locs: line numbers of the elements carrying it '
        message = _('Duplicate id: %s') % eid
        BaseError.__init__(self, message, name)
        help_template = _(
            'The id {0} is present on more than one element in {1}. This is'
            ' not allowed. Remove the id from all but one of the elements')
        self.HELP = help_template.format(eid, name)
        # One (file name, line number, None) entry per element, in line order
        locations = []
        for line_number in sorted(locs):
            locations.append((name, line_number, None))
        self.all_locations = locations
        self.duplicate_id = eid
    def __call__(self, container):
        ' Strip the id attribute from every element carrying the duplicated id, except the first one in document order '
        kept_first = False
        for elem in container.parsed(self.name).xpath('//*[@id]'):
            if elem.get('id') != self.duplicate_id:
                continue
            if kept_first:
                elem.attrib.pop('id')
            else:
                # The first matching element keeps its id
                kept_first = True
        container.dirty(self.name)
        return True
 class ErrorHandler(object):
    ' Replacement logger to get useful error/warning info out of cssutils during parsing '
@ -289,3 +311,22 @@ def check_filenames(container):
        if urlquote(name) != name:
            errors.append(EscapedName(name))
    return errors
 def check_ids(container):
    ' Return a list with one DuplicateId error for every id value used more than once inside a single HTML/OPF/NCX file '
    errors = []
    checked_types = set(OEB_DOCS) | {guess_type('a.opf'), guess_type('a.ncx')}
    for name, mt in container.mime_map.iteritems():
        if mt not in checked_types:
            continue
        first_line = {}  # id -> source line of the first element using it
        repeated = {}    # id -> source lines of all elements using it (first one included)
        for elem in container.parsed(name).xpath('//*[@id]'):
            eid = elem.get('id')
            if eid not in first_line:
                first_line[eid] = elem.sourceline
                continue
            # Seed the list with the first occurrence the first time a repeat is found
            repeated.setdefault(eid, [first_line[eid]]).append(elem.sourceline)
        for eid, locs in repeated.iteritems():
            errors.append(DuplicateId(name, eid, locs))
    return errors