Edit book: Add checks for duplicate ids in HTML/OPF/NCX files

2025-07-09 03:04:10 -04:00 · 2014-01-15 11:14:00 +05:30 · 2014-01-15 11:14:00 +05:30 · 4cf26dd397
commit 4cf26dd397
parent e2647b735f
2 changed files with 44 additions and 2 deletions
--- a/src/calibre/ebooks/oeb/polish/check/main.py
+++ b/src/calibre/ebooks/oeb/polish/check/main.py
@ -13,7 +13,7 @@ from calibre.ebooks.oeb.polish.utils import guess_type
 from calibre.ebooks.oeb.polish.cover import is_raster_image
 from calibre.ebooks.oeb.polish.check.base import run_checkers
 from calibre.ebooks.oeb.polish.check.parsing import (
-    check_filenames, check_xml_parsing, check_css_parsing, fix_style_tag, check_html_size)
+    check_filenames, check_xml_parsing, check_css_parsing, fix_style_tag, check_html_size, check_ids)
 from calibre.ebooks.oeb.polish.check.images import check_raster_images
 from calibre.ebooks.oeb.polish.check.links import check_links, check_mimetypes
 from calibre.ebooks.oeb.polish.check.fonts import check_fonts
@ -60,6 +60,7 @@ def run_checks(container):
    errors += check_links(container)
    errors += check_fonts(container)
    errors += check_filenames(container)
+    errors += check_ids(container)

    return errors

--- a/src/calibre/ebooks/oeb/polish/check/parsing.py
+++ b/src/calibre/ebooks/oeb/polish/check/parsing.py
@ -14,7 +14,7 @@ import cssutils
 from calibre import force_unicode, human_readable, prepare_string_for_xml
 from calibre.ebooks.html_entities import html5_entities
 from calibre.ebooks.oeb.polish.pretty import pretty_script_or_style as fix_style_tag
-from calibre.ebooks.oeb.polish.utils import PositionFinder
+from calibre.ebooks.oeb.polish.utils import PositionFinder, guess_type
 from calibre.ebooks.oeb.polish.check.base import BaseError, WARN, ERROR, INFO
 from calibre.ebooks.oeb.base import OEB_DOCS, XHTML_NS, urlquote, URL_SAFE

@ -237,6 +237,28 @@ class CSSError(BaseError):

 pos_pats = (re.compile(r'\[(\d+):(\d+)'), re.compile(r'(\d+), (\d+)\)'))

+class DuplicateId(BaseError):
+    ' Error for an id value that is carried by more than one element in a single file '
+    has_multiple_locations = True  # NOTE(review): presumably tells consumers to read all_locations; consumer not visible here
+
+    INDIVIDUAL_FIX = _(
+        'Remove the duplicate ids from all but the first element')  # describes what __call__ below does
+
+    def __init__(self, name, eid, locs):  # name: file name, eid: the repeated id, locs: line numbers of the elements carrying it
+        BaseError.__init__(self, _('Duplicate id: %s') % eid, name)
+        self.HELP = _(
+            'The id {0} is present on more than one element in {1}. This is'
+            ' not allowed. Remove the id from all but one of the elements').format(eid, name)
+        self.all_locations = [(name, lnum, None) for lnum in sorted(locs)]  # one (name, line, None) tuple per occurrence, ascending by line
+        self.duplicate_id = eid  # remembered so __call__ can find the affected elements again
+
+    def __call__(self, container):  # automatic fix: strip the id from every matching element except the first
+        elems = [e for e in container.parsed(self.name).xpath('//*[@id]') if e.get('id') == self.duplicate_id]
+        for e in elems[1:]:  # the first match keeps its id
+            e.attrib.pop('id')
+        container.dirty(self.name)  # NOTE(review): presumably marks the file as modified; confirm against the container API
+        return True  # NOTE(review): presumably signals that a change was made; confirm against BaseError's contract
+
 class ErrorHandler(object):

    ' Replacement logger to get useful error/warning info out of cssutils during parsing '
@ -289,3 +311,22 @@ def check_filenames(container):
        if urlquote(name) != name:
            errors.append(EscapedName(name))
    return errors
+
+def check_ids(container):  # returns a list with one DuplicateId error per repeated id per checked file
+    errors = []
+    mts = set(OEB_DOCS) | {guess_type('a.opf'), guess_type('a.ncx')}  # mime types to check: OEB_DOCS plus the types guessed for .opf and .ncx names
+    for name, mt in container.mime_map.iteritems():
+        if mt in mts:
+            root = container.parsed(name)
+            seen_ids = {}  # id -> sourceline of the first element that carries it
+            dups = {}  # id -> sourcelines of all elements carrying it (repeated ids only)
+            for elem in root.xpath('//*[@id]'):  # every element that has an id attribute
+                eid = elem.get('id')
+                if eid in seen_ids:
+                    if eid not in dups:
+                        dups[eid] = [seen_ids[eid]]  # seed the list with the first occurrence
+                    dups[eid].append(elem.sourceline)
+                else:
+                    seen_ids[eid] = elem.sourceline
+            errors.extend(DuplicateId(name, eid, locs) for eid, locs in dups.iteritems())  # ids are tracked per file, never across files
+    return errors