KF8 Input: Handle invalid KF8 files with links pointing to non-existent locations and incorrect values in the div table. Fixes #1082669 (AZW3 fails to open with Error; No file contains pos: 2290264)

This commit is contained in:
Kovid Goyal 2012-11-29 10:25:55 +05:30
parent 21f53431ad
commit 6ceb6af94e
2 changed files with 51 additions and 8 deletions

View File

@ -11,7 +11,7 @@ import re, os
from calibre.ebooks.chardet import strip_encoding_declarations
def update_internal_links(mobi8_reader):
def update_internal_links(mobi8_reader, log):
# need to update all links that are internal which
# are based on positions within the xhtml files **BEFORE**
# cutting and pasting any pieces into the xhtml text files
@ -35,11 +35,16 @@ def update_internal_links(mobi8_reader):
for m in posfid_index_pattern.finditer(tag):
posfid = m.group(1)
offset = m.group(2)
filename, idtag = mr.get_id_tag_by_pos_fid(int(posfid, 32),
int(offset, 32))
suffix = (b'#' + idtag) if idtag else b''
replacement = filename.split('/')[-1].encode(
mr.header.codec) + suffix
try:
filename, idtag = mr.get_id_tag_by_pos_fid(
int(posfid, 32), int(offset, 32))
except ValueError:
log.warn('Invalid link, points to nowhere, ignoring')
replacement = b'#'
else:
suffix = (b'#' + idtag) if idtag else b''
replacement = filename.split('/')[-1].encode(
mr.header.codec) + suffix
tag = posfid_index_pattern.sub(replacement, tag, 1)
srcpieces[j] = tag
raw = b''.join(srcpieces)
@ -298,7 +303,7 @@ def upshift_markup(parts):
def expand_mobi8_markup(mobi8_reader, resource_map, log):
# First update all internal links that are based on offsets
parts = update_internal_links(mobi8_reader)
parts = update_internal_links(mobi8_reader, log)
# Remove pointless markup inserted by kindlegen
remove_kindlegen_markup(parts)

View File

@ -34,6 +34,16 @@ Elem = namedtuple('Elem',
FlowInfo = namedtuple('FlowInfo',
'type format dir fname')
def locate_beg_end_of_tag(ml, aid):
    """Locate the first tag in *ml* whose ``aid`` attribute equals *aid*.

    :param ml: Markup to search, as a byte string.
    :param aid: Value of the ``aid`` attribute to look for, as a byte
        string. It is matched literally, case-insensitively, and may be
        wrapped in either single or double quotes in the markup.
    :return: ``(start, end)`` where ``start`` is the offset of the tag's
        opening ``<`` and ``end`` is the offset of the first ``>`` found
        after it. ``(0, 0)`` is returned when no tag matches.
    """
    # Escape the aid so that regex metacharacters in it are matched
    # literally instead of altering (or breaking) the pattern.
    pattern = br'''<[^>]*\said\s*=\s*['"]%s['"][^>]*>''' % re.escape(aid)
    match = re.compile(pattern, re.IGNORECASE).search(ml)
    if match is None:
        # Equal begin/end offsets signal "not found" to the caller.
        return 0, 0
    plt = match.start()
    pgt = ml.find(b'>', plt + 1)
    return plt, pgt
class Mobi8Reader(object):
def __init__(self, mobi6_reader, log):
@ -148,6 +158,7 @@ class Mobi8Reader(object):
for skelnum, skelname, divcnt, skelpos, skellen in self.files:
baseptr = skelpos + skellen
skeleton = text[skelpos:baseptr]
inspos_warned = False
for i in xrange(divcnt):
insertpos, idtext, filenum, seqnum, startpos, length = \
self.elems[divptr]
@ -156,6 +167,23 @@ class Mobi8Reader(object):
filename = 'part%04d.html' % filenum
part = text[baseptr:baseptr + length]
insertpos = insertpos - skelpos
head = skeleton[:insertpos]
tail = skeleton[insertpos:]
if (tail.find(b'>') < tail.find(b'<') or head.rfind(b'>') <
head.rfind(b'<')):
# There is an incomplete tag in either the head or tail.
# This can happen for some badly formed KF8 files, see for
# example, https://bugs.launchpad.net/bugs/1082669
if not inspos_warned:
self.log.warn(
'The div table for %s has incorrect insert '
'positions. Calculating manually.'%skelname)
inspos_warned = True
bp, ep = locate_beg_end_of_tag(skeleton, aidtext if
isinstance(aidtext, bytes) else aidtext.encode('utf-8'))
if bp != ep:
insertpos = ep + 1 + startpos
skeleton = skeleton[0:insertpos] + part + skeleton[insertpos:]
baseptr = baseptr + length
divptr += 1
@ -320,6 +348,7 @@ class Mobi8Reader(object):
def create_ncx(self):
index_entries = read_ncx(self.kf8_sections, self.header.ncxidx,
self.header.codec)
remove = []
# Add href and anchor info to the index entries
for entry in index_entries:
@ -332,11 +361,20 @@ class Mobi8Reader(object):
idtag = self.get_id_tag(pos).decode(self.header.codec)
href = '%s/%s'%(fi.type, fi.filename)
else:
href, idtag = self.get_id_tag_by_pos_fid(*pos_fid)
try:
href, idtag = self.get_id_tag_by_pos_fid(*pos_fid)
except ValueError:
self.log.warn('Invalid entry in NCX (title: %s), ignoring'
%entry['text'])
remove.append(entry)
continue
entry['href'] = href
entry['idtag'] = idtag
for e in remove:
index_entries.remove(e)
# Build the TOC object
return build_toc(index_entries)