Edit Book: Check Book: Add a check for "invalid" ids

Some sad-sack ebook services barf on "invalid" ids. See #1467765 (Private bug)
They should just accept all valid HTML 5 ids instead of relying
on the terrible epubcheck.
This commit is contained in:
Kovid Goyal 2015-07-22 18:21:41 +05:30
parent f37b91ad49
commit f46d465475
2 changed files with 78 additions and 1 deletions

View File

@ -328,6 +328,34 @@ class DuplicateId(BaseError):
container.dirty(self.name)
return True
class InvalidId(BaseError):
    '''
    Error reported for an element whose id does not follow the restricted
    syntax described in :attr:`HELP`. Calling the error object with a
    container applies the automatic fix.
    '''

    INDIVIDUAL_FIX = _(
        'Replace this id with a randomly generated valid id')

    def __init__(self, name, line, eid):
        '''
        :param name: Forwarded to BaseError; presumably the name of the file
                     containing the element (it is later read back as self.name)
        :param line: Forwarded to BaseError as the location of the error
        :param eid: The offending id value
        '''
        message = _('Invalid id: %s') % eid
        BaseError.__init__(self, message, name, line)
        # Remembered so that __call__ knows which id to replace
        self.invalid_id = eid
        self.HELP = _(
            'The id {0} is not a valid id. IDs must start with a letter ([A-Za-z]) and may be'
            ' followed by any number of letters, digits ([0-9]), hyphens ("-"), underscores ("_")'
            ', colons (":"), and periods ("."). This is to ensure maximum compatibility'
            ' with a wide range of devices.').format(eid)

    def __call__(self, container):
        '''
        Give every element in this file that carries the invalid id a freshly
        generated id, then update links that pointed to the old id.

        :return: True iff at least one element was changed
        '''
        import uuid
        from calibre.ebooks.oeb.polish.replace import replace_ids
        # The 'g' prefix guarantees the generated id starts with a letter
        replacement = 'g' + uuid.uuid4().hex
        fixed = False
        for node in container.parsed(self.name).xpath('//*[@id]'):
            if node.get('id') != self.invalid_id:
                continue
            node.set('id', replacement)
            container.dirty(self.name)
            fixed = True
        if fixed:
            replace_ids(container, {self.name:{self.invalid_id:replacement}})
        return fixed
class BareTextInBody(BaseError):
INDIVIDUAL_FIX = _('Wrap the bare text in a p tag')
@ -416,6 +444,8 @@ def check_filenames(container):
errors.append(EscapedName(name))
return errors
# Conservative id syntax: an ASCII letter followed by any number of ASCII
# letters, digits, "_", ":", "." or "-". The pattern is anchored with \Z rather
# than $ because $ also matches just before a trailing newline, which would let
# an id such as "abc\n" be treated as valid.
valid_id = re.compile(r'^[a-zA-Z][a-zA-Z0-9_:.-]*\Z')
def check_ids(container):
errors = []
mts = set(OEB_DOCS) | {guess_type('a.opf'), guess_type('a.ncx')}
@ -432,6 +462,8 @@ def check_ids(container):
dups[eid].append(elem.sourceline)
else:
seen_ids[eid] = elem.sourceline
if eid and valid_id.match(eid) is None:
errors.append(InvalidId(name, elem.sourceline, eid))
errors.extend(DuplicateId(name, eid, locs) for eid, locs in dups.iteritems())
return errors

View File

@ -8,7 +8,7 @@ __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import codecs, shutil, os, posixpath
from urlparse import urlparse
from urlparse import urlparse, urlunparse
from collections import Counter, defaultdict
from calibre import sanitize_file_name_unicode
@ -46,6 +46,35 @@ class LinkReplacer(object):
self.replaced = True
return href
class IdReplacer(object):
    '''
    Callable that rewrites the fragment of a URL when the id it points to has
    been renamed.

    :param base: Name of the file the URLs are found in
    :param container: Object providing ``href_to_name(url, base)``
    :param id_map: A mapping of {name:{old_id:new_id}}

    After use, :attr:`replaced` is True iff at least one URL was changed.
    '''

    def __init__(self, base, container, id_map):
        self.base = base
        self.container = container
        self.id_map = id_map
        self.replaced = False

    def __call__(self, url):
        # Pure fragment link: it refers to an id inside the base file itself
        if url and url.startswith('#'):
            old_id = url[1:]
            new_id = self.id_map.get(self.base, {}).get(old_id)
            if new_id is not None and new_id != old_id:
                self.replaced = True
                return '#' + new_id
            return url

        # Link to some other file: resolve it and look up that file's id map
        target = self.container.href_to_name(url, self.base)
        target_ids = self.id_map.get(target) if target else None
        if target_ids is None:
            return url

        parts = urlparse(url)
        new_fragment = target_ids.get(parts.fragment)
        if new_fragment is None:
            return url

        rewritten = urlunparse(parts._replace(fragment=new_fragment))
        if rewritten != url:
            self.replaced = True
        return rewritten
class LinkRebaser(object):
def __init__(self, container, old_name, new_name):
@ -88,6 +117,22 @@ def replace_links(container, link_map, frag_map=lambda name, frag:frag, replace_
repl = LinkReplacer(name, container, link_map, frag_map)
container.replace_links(name, repl)
def replace_ids(container, id_map):
    '''
    Update every link in the container that points to an id that has been
    renamed.

    :param container: The container whose files are processed; every name in
                      its ``mime_map`` is passed to ``replace_links``
    :param id_map: A mapping of {name:id_map} where each id_map is a mapping of {old_id:new_id}
    :return: True iff at least one link was changed
    '''
    any_replaced = False
    for name in container.mime_map:
        replacer = IdReplacer(name, container, id_map)
        container.replace_links(name, replacer)
        # Every file must be processed, so accumulate instead of stopping early
        any_replaced = replacer.replaced or any_replaced
    return any_replaced
def smarten_punctuation(container, report):
from calibre.ebooks.conversion.preprocess import smarten_punctuation
smartened = False