class InvalidId(BaseError):
    '''
    Check error reported for an element whose id attribute does not match
    the ``valid_id`` pattern. Calling the error object with a container
    applies the automatic fix: the offending id is replaced by a randomly
    generated one and links that pointed at the old id are updated via
    replace_ids().
    '''

    INDIVIDUAL_FIX = _(
        'Replace this id with a randomly generated valid id')

    def __init__(self, name, line, eid):
        '''
        :param name: Name of the file (in the container) that holds the element
        :param line: Source line of the element, passed through to BaseError
        :param eid: The invalid id value
        '''
        BaseError.__init__(self, _('Invalid id: %s') % eid, name, line)
        self.HELP = _(
            'The id {0} is not a valid id. IDs must start with a letter ([A-Za-z]) and may be'
            ' followed by any number of letters, digits ([0-9]), hyphens ("-"), underscores ("_")'
            ', colons (":"), and periods ("."). This is to ensure maximum compatibility'
            ' with a wide range of devices.').format(eid)
        self.invalid_id = eid

    def __call__(self, container):
        '''
        Replace the invalid id in ``self.name`` with a fresh random id.

        :return: True iff at least one element carried the invalid id
        '''
        import uuid
        from calibre.ebooks.oeb.polish.replace import replace_ids
        # The leading letter guarantees the generated id matches valid_id
        # even when the hex digest starts with a digit.
        newid = 'g' + uuid.uuid4().hex
        root = container.parsed(self.name)
        matches = [e for e in root.xpath('//*[@id]') if e.get('id') == self.invalid_id]
        if not matches:
            return False
        for elem in matches:
            elem.set('id', newid)
        # Mark the file as modified once, after all elements were updated
        container.dirty(self.name)
        replace_ids(container, {self.name: {self.invalid_id: newid}})
        return True


# An id must start with an ASCII letter and may continue with ASCII letters,
# digits, "_", ":", "." and "-". The pattern is anchored with \Z rather than $
# because $ also matches just before a trailing newline, which would let an id
# such as "abc\n" pass as valid.
# check_ids() (only partially visible in this view) reports an InvalidId for
# every non-empty id for which valid_id.match() returns None.
valid_id = re.compile(r'^[a-zA-Z][a-zA-Z0-9_:.-]*\Z')
try:
    from urlparse import urlparse, urlunparse
    from urllib import unquote
except ImportError:  # Python 3 moved these into urllib.parse
    from urllib.parse import unquote, urlparse, urlunparse


class IdReplacer(object):
    '''
    Callable that rewrites the fragment of a URL when the id it refers to
    has been renamed. An instance is created per file (``base``) and handed
    to container.replace_links(); ``self.replaced`` becomes True as soon as
    one URL was actually changed.

    :param base: Name of the file whose links are being processed
    :param container: Object providing href_to_name(url, base)
    :param id_map: A mapping of {name:{old_id:new_id}}
    '''

    def __init__(self, base, container, id_map):
        self.base, self.container, self.replaced = base, container, False
        self.id_map = id_map

    @staticmethod
    def _new_fragment(id_map, frag):
        '''
        Return the replacement for ``frag`` from ``id_map`` or None.

        The fragment is tried as written first and, if that fails, in its
        percent-decoded form. Ids that need renaming often contain characters
        (a space, for instance) that appear percent-encoded in links, so a
        link such as ``#my%20id`` must still be found for the id ``my id``.
        '''
        repl = id_map.get(frag)
        if repl is None and '%' in frag:
            repl = id_map.get(unquote(frag))
        return repl

    def __call__(self, url):
        '''
        Return ``url`` with its fragment replaced if it points at a renamed
        id, otherwise return ``url`` unchanged.
        '''
        if url and url.startswith('#'):
            # Pure fragment link: it refers to an id inside the base file
            frag = url[1:]
            repl = self._new_fragment(self.id_map.get(self.base, {}), frag)
            if repl is None or repl == frag:
                return url
            self.replaced = True
            return '#' + repl
        # Link to (possibly) another file: find which file it points to and
        # use the id map of that file, if any.
        name = self.container.href_to_name(url, self.base)
        if not name:
            return url
        id_map = self.id_map.get(name)
        if id_map is None:
            return url
        purl = urlparse(url)
        nfrag = self._new_fragment(id_map, purl.fragment)
        if nfrag is None:
            return url
        href = urlunparse(purl._replace(fragment=nfrag))
        if href != url:
            self.replaced = True
        return href


def replace_ids(container, id_map):
    '''
    Replace all links in the container that pointed to the changed ids.

    :param id_map: A mapping of {name:id_map} where each id_map is a mapping of {old_id:new_id}
    :return: True iff at least one link was changed

    '''
    changed = False
    # Only the file names are needed, the media types are not used
    for name in container.mime_map:
        repl = IdReplacer(name, container, id_map)
        container.replace_links(name, repl)
        changed = changed or repl.replaced
    return changed