Edit Book: Check Book: Add a check for "invalid" ids

Some sad-sack ebook services barf on "invalid" ids. See #1467765 (Private bug)
They should just accept all valid HTML 5 ids instead of relying
on the terrible epubcheck.
This commit is contained in:
Kovid Goyal 2015-07-22 18:21:41 +05:30
parent f37b91ad49
commit f46d465475
2 changed files with 78 additions and 1 deletions

View File

@ -328,6 +328,34 @@ class DuplicateId(BaseError):
container.dirty(self.name) container.dirty(self.name)
return True return True
class InvalidId(BaseError):
    '''
    Check Book error for a single id attribute whose value is considered
    invalid. The automatic fix swaps the id for a randomly generated one and
    updates links that referenced the old value.
    '''

    INDIVIDUAL_FIX = _(
        'Replace this id with a randomly generated valid id')

    def __init__(self, name, line, eid):
        '''
        :param name: Name of the file in which the id was found
        :param line: Line number passed through to BaseError
        :param eid: The offending id value
        '''
        BaseError.__init__(self, _('Invalid id: %s') % eid, name, line)
        help_text = _(
            'The id {0} is not a valid id. IDs must start with a letter ([A-Za-z]) and may be'
            ' followed by any number of letters, digits ([0-9]), hyphens ("-"), underscores ("_")'
            ', colons (":"), and periods ("."). This is to ensure maximum compatibility'
            ' with a wide range of devices.').format(eid)
        self.HELP = help_text
        self.invalid_id = eid

    def __call__(self, container):
        '''
        Apply the fix to ``container``.

        :return: True iff at least one element carried the invalid id and was
                 given the replacement id
        '''
        import uuid
        from calibre.ebooks.oeb.polish.replace import replace_ids
        # Prefixing the hex digest with a letter guarantees the generated id
        # starts with a letter.
        replacement = 'g' + uuid.uuid4().hex
        fixed_any = False
        for elem in container.parsed(self.name).xpath('//*[@id]'):
            if elem.get('id') != self.invalid_id:
                continue
            elem.set('id', replacement)
            fixed_any = True
            container.dirty(self.name)
        if fixed_any:
            # Re-point links that still reference the old id.
            replace_ids(container, {self.name: {self.invalid_id: replacement}})
        return fixed_any
class BareTextInBody(BaseError): class BareTextInBody(BaseError):
INDIVIDUAL_FIX = _('Wrap the bare text in a p tag') INDIVIDUAL_FIX = _('Wrap the bare text in a p tag')
@ -416,6 +444,8 @@ def check_filenames(container):
errors.append(EscapedName(name)) errors.append(EscapedName(name))
return errors return errors
# Conservative id syntax: a leading ASCII letter followed by any number of
# ASCII letters, digits, underscores, colons, periods and hyphens.
# \Z anchors at the true end of the string; a plain $ would also match just
# before a trailing newline and so let an id such as 'abc\n' through.
valid_id = re.compile(r'^[a-zA-Z][a-zA-Z0-9_:.-]*\Z')
def check_ids(container): def check_ids(container):
errors = [] errors = []
mts = set(OEB_DOCS) | {guess_type('a.opf'), guess_type('a.ncx')} mts = set(OEB_DOCS) | {guess_type('a.opf'), guess_type('a.ncx')}
@ -432,6 +462,8 @@ def check_ids(container):
dups[eid].append(elem.sourceline) dups[eid].append(elem.sourceline)
else: else:
seen_ids[eid] = elem.sourceline seen_ids[eid] = elem.sourceline
if eid and valid_id.match(eid) is None:
errors.append(InvalidId(name, elem.sourceline, eid))
errors.extend(DuplicateId(name, eid, locs) for eid, locs in dups.iteritems()) errors.extend(DuplicateId(name, eid, locs) for eid, locs in dups.iteritems())
return errors return errors

View File

@ -8,7 +8,7 @@ __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import codecs, shutil, os, posixpath import codecs, shutil, os, posixpath
from urlparse import urlparse from urlparse import urlparse, urlunparse
from collections import Counter, defaultdict from collections import Counter, defaultdict
from calibre import sanitize_file_name_unicode from calibre import sanitize_file_name_unicode
@ -46,6 +46,35 @@ class LinkReplacer(object):
self.replaced = True self.replaced = True
return href return href
class IdReplacer(object):
    '''
    Callable that rewrites the fragment of a URL when the id it refers to has
    been renamed.

    :param base: Name of the file whose links are being processed; used as the
                 key into ``id_map`` for same-file (``#frag``) links and passed
                 to ``container.href_to_name()`` for all other links
    :param container: Object providing ``href_to_name(url, base)``
    :param id_map: A mapping of {name:{old_id:new_id}}

    After use, ``self.replaced`` is True iff at least one URL was changed.
    '''

    def __init__(self, base, container, id_map):
        self.base, self.container, self.replaced = base, container, False
        self.id_map = id_map

    def __call__(self, url):
        '''
        Return ``url`` with its fragment replaced by the new id, or ``url``
        unchanged when no mapping applies.
        '''
        if not url:
            # None/empty URLs cannot reference an id. Return them untouched
            # instead of handing them to href_to_name()/urlparse().
            return url
        if url.startswith('#'):
            # Link into the file currently being processed
            old_id = url[1:]
            repl = self.id_map.get(self.base, {}).get(old_id)
            if repl is None or repl == old_id:
                return url
            self.replaced = True
            return '#' + repl
        name = self.container.href_to_name(url, self.base)
        if not name:
            return url
        id_map = self.id_map.get(name)
        if id_map is None:
            # No ids were changed in the target file
            return url
        purl = urlparse(url)
        nfrag = id_map.get(purl.fragment)
        if nfrag is None:
            return url
        href = urlunparse(purl._replace(fragment=nfrag))
        if href != url:
            self.replaced = True
        return href
class LinkRebaser(object): class LinkRebaser(object):
def __init__(self, container, old_name, new_name): def __init__(self, container, old_name, new_name):
@ -88,6 +117,22 @@ def replace_links(container, link_map, frag_map=lambda name, frag:frag, replace_
repl = LinkReplacer(name, container, link_map, frag_map) repl = LinkReplacer(name, container, link_map, frag_map)
container.replace_links(name, repl) container.replace_links(name, repl)
def replace_ids(container, id_map):
    '''
    Replace all links in the container that pointed to the changed ids.

    Every name in ``container.mime_map`` is passed through
    ``container.replace_links()`` with an :class:`IdReplacer`.

    :param id_map: A mapping of {name:id_map} where each id_map is a mapping of {old_id:new_id}
    :return: True iff at least one link was changed
    '''
    changed = False
    # Only the names are needed, so iterate the mapping's keys directly
    # rather than unpacking (name, media_type) pairs.
    for name in container.mime_map:
        repl = IdReplacer(name, container, id_map)
        container.replace_links(name, repl)
        if repl.replaced:
            changed = True
    return changed
def smarten_punctuation(container, report): def smarten_punctuation(container, report):
from calibre.ebooks.conversion.preprocess import smarten_punctuation from calibre.ebooks.conversion.preprocess import smarten_punctuation
smartened = False smartened = False