Edit Book: Check Book: Add a check for "invalid" ids

Some sad-sack ebook services barf on "invalid" ids. See #1467765 (private bug). They should just accept all valid HTML 5 ids instead of relying on the terrible epubcheck.
2025-07-09 03:04:10 -04:00 · 2015-07-22 18:21:41 +05:30 · 2015-07-22 18:21:41 +05:30 · f46d465475
commit f46d465475
parent f37b91ad49
2 changed files with 78 additions and 1 deletions
--- a/src/calibre/ebooks/oeb/polish/check/parsing.py
+++ b/src/calibre/ebooks/oeb/polish/check/parsing.py
@ -328,6 +328,34 @@ class DuplicateId(BaseError):
        container.dirty(self.name)
        return True
 class InvalidId(BaseError):
    '''
    Error reported for an id that does not follow the restricted id syntax
    spelled out in the HELP text. Calling the error object applies the fix:
    the offending id is swapped for a freshly generated one and links that
    pointed at it are rewritten via replace_ids().
    '''

    INDIVIDUAL_FIX = _(
        'Replace this id with a randomly generated valid id')

    def __init__(self, name, line, eid):
        '''
        :param name: Name of the file containing the id (forwarded to BaseError)
        :param line: Line number of the offending element (forwarded to BaseError)
        :param eid: The id value that was rejected
        '''
        message = _('Invalid id: %s') % eid
        BaseError.__init__(self, message, name, line)
        help_template = _(
            'The id {0} is not a valid id. IDs must start with a letter ([A-Za-z]) and may be'
            ' followed by any number of letters, digits ([0-9]), hyphens ("-"), underscores ("_")'
            ', colons (":"), and periods ("."). This is to ensure maximum compatibility'
            ' with a wide range of devices.')
        self.invalid_id = eid
        self.HELP = help_template.format(eid)

    def __call__(self, container):
        '''
        Replace every occurrence of the invalid id in this file with a new
        random id and update links that referenced the old id.

        :return: True iff at least one element was changed
        '''
        import uuid
        from calibre.ebooks.oeb.polish.replace import replace_ids
        # The leading letter guarantees the generated id starts with [A-Za-z]
        replacement = 'g' + uuid.uuid4().hex
        fixed = False
        for elem in container.parsed(self.name).xpath('//*[@id]'):
            if elem.get('id') != self.invalid_id:
                continue
            elem.set('id', replacement)
            fixed = True
            container.dirty(self.name)
        if fixed:
            # Keep links that pointed at the old id working
            replace_ids(container, {self.name: {self.invalid_id: replacement}})
        return fixed
 class BareTextInBody(BaseError):
    INDIVIDUAL_FIX = _('Wrap the bare text in a p tag')
@ -416,6 +444,8 @@ def check_filenames(container):
            errors.append(EscapedName(name))
    return errors
 # Conservative id syntax: a leading ASCII letter followed by any number of
 # letters, digits, "_", ":", "." or "-". The pattern ends with \Z rather than
 # $ because $ also matches just before a trailing newline, which would let an
 # id such as "abc\n" slip through as valid.
 valid_id = re.compile(r'^[a-zA-Z][a-zA-Z0-9_:.-]*\Z')
 def check_ids(container):
    errors = []
    mts = set(OEB_DOCS) | {guess_type('a.opf'), guess_type('a.ncx')}
@ -432,6 +462,8 @@ def check_ids(container):
                    dups[eid].append(elem.sourceline)
                else:
                    seen_ids[eid] = elem.sourceline
                if eid and valid_id.match(eid) is None:
                    errors.append(InvalidId(name, elem.sourceline, eid))
            errors.extend(DuplicateId(name, eid, locs) for eid, locs in dups.iteritems())
    return errors
--- a/src/calibre/ebooks/oeb/polish/replace.py
+++ b/src/calibre/ebooks/oeb/polish/replace.py
@ -8,7 +8,7 @@ __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 import codecs, shutil, os, posixpath
-from urlparse import urlparse
+from urlparse import urlparse, urlunparse
 from collections import Counter, defaultdict
 from calibre import sanitize_file_name_unicode
@ -46,6 +46,35 @@ class LinkReplacer(object):
            self.replaced = True
        return href
 class IdReplacer(object):
    '''
    Callable that rewrites link fragments according to an id map. It is
    called with a URL and returns the (possibly rewritten) URL; the
    ``replaced`` attribute becomes True once any URL has actually changed.
    '''

    def __init__(self, base, container, id_map):
        '''
        :param base: Name of the file whose links are being processed
        :param container: Object providing href_to_name(url, base)
        :param id_map: A mapping of {name:{old_id:new_id}}
        '''
        self.base = base
        self.container = container
        self.id_map = id_map
        self.replaced = False

    def __call__(self, url):
        if url and url.startswith('#'):
            # Bare fragment: look the id up in the map for the base file
            old_id = url[1:]
            new_id = self.id_map.get(self.base, {}).get(old_id)
            if new_id is not None and new_id != old_id:
                self.replaced = True
                return '#' + new_id
            return url

        # Link to some other resource: resolve it to a name and use the
        # id map registered for that name, if there is one
        target = self.container.href_to_name(url, self.base)
        ids_for_target = self.id_map.get(target) if target else None
        if ids_for_target is None:
            return url

        parts = urlparse(url)
        new_fragment = ids_for_target.get(parts.fragment)
        if new_fragment is None:
            return url

        rewritten = urlunparse(parts._replace(fragment=new_fragment))
        if rewritten != url:
            self.replaced = True
        return rewritten
 class LinkRebaser(object):
    def __init__(self, container, old_name, new_name):
@ -88,6 +117,22 @@ def replace_links(container, link_map, frag_map=lambda name, frag:frag, replace_
        repl = LinkReplacer(name, container, link_map, frag_map)
        container.replace_links(name, repl)
 def replace_ids(container, id_map):
    '''
    Replace all links in the container that pointed to the changed ids.

    :param container: The container to process; every name in
        ``container.mime_map`` is passed to ``container.replace_links``
    :param id_map: A mapping of {name:id_map} where each id_map is a mapping of {old_id:new_id}
    :return: True iff at least one link was changed
    '''
    changed = False
    # Only the names are needed here. Iterating the mapping directly drops
    # the unused media type and avoids the Python 2-only iteritems() call.
    for name in container.mime_map:
        repl = IdReplacer(name, container, id_map)
        container.replace_links(name, repl)
        if repl.replaced:
            changed = True
    return changed
 def smarten_punctuation(container, report):
    from calibre.ebooks.conversion.preprocess import smarten_punctuation
    smartened = False