mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
MOBI Indexing: Make error handling more robust
This commit is contained in:
parent
541039fe75
commit
e35ac4441e
@ -325,6 +325,8 @@ class MobiWriter(object):
|
|||||||
self._imagemax = imagemax or OTHER_MAX_IMAGE_SIZE
|
self._imagemax = imagemax or OTHER_MAX_IMAGE_SIZE
|
||||||
self._prefer_author_sort = prefer_author_sort
|
self._prefer_author_sort = prefer_author_sort
|
||||||
self._primary_index_record = None
|
self._primary_index_record = None
|
||||||
|
self._hasValideNCXEntries = False
|
||||||
|
self._ctoc = ""
|
||||||
self._HTMLRecords = []
|
self._HTMLRecords = []
|
||||||
self._tbSequence = ""
|
self._tbSequence = ""
|
||||||
self._initialIndexRecordFound = False
|
self._initialIndexRecordFound = False
|
||||||
@ -364,7 +366,7 @@ class MobiWriter(object):
|
|||||||
self._map_image_names()
|
self._map_image_names()
|
||||||
self._generate_text()
|
self._generate_text()
|
||||||
#if INDEXING and not self.opts.no_mobi_index:
|
#if INDEXING and not self.opts.no_mobi_index:
|
||||||
if INDEXING :
|
if INDEXING and self._hasValidNCXEntries :
|
||||||
try:
|
try:
|
||||||
self._generate_index()
|
self._generate_index()
|
||||||
except:
|
except:
|
||||||
@ -411,6 +413,8 @@ class MobiWriter(object):
|
|||||||
|
|
||||||
def _build_HTMLRecords_Data_List(self):
|
def _build_HTMLRecords_Data_List(self):
|
||||||
# Assemble a HTMLRecordData instance for each HTML record
|
# Assemble a HTMLRecordData instance for each HTML record
|
||||||
|
# Return True if valid, False if invalid
|
||||||
|
self._oeb.logger.info('Indexing navPoints ...')
|
||||||
|
|
||||||
numberOfHTMLRecords = ( self._content_length // RECORD_SIZE ) + 1
|
numberOfHTMLRecords = ( self._content_length // RECORD_SIZE ) + 1
|
||||||
|
|
||||||
@ -423,17 +427,28 @@ class MobiWriter(object):
|
|||||||
toc = self._oeb.toc
|
toc = self._oeb.toc
|
||||||
myIndex = 0
|
myIndex = 0
|
||||||
myEndingRecord = 0
|
myEndingRecord = 0
|
||||||
|
previousOffset = 0
|
||||||
|
previousLength = 0
|
||||||
|
offset = 0
|
||||||
|
length = 0
|
||||||
entries = list(toc.iter())[1:]
|
entries = list(toc.iter())[1:]
|
||||||
# borrowed from _generate_indxt
|
|
||||||
|
|
||||||
|
# Get offset, length per entry
|
||||||
for i, child in enumerate(entries):
|
for i, child in enumerate(entries):
|
||||||
|
|
||||||
|
'''
|
||||||
if not child.title or not child.title.strip():
|
if not child.title or not child.title.strip():
|
||||||
child.title = _('Unnamed')
|
child.title = "(none)"
|
||||||
|
'''
|
||||||
h = child.href
|
h = child.href
|
||||||
if h not in self._id_offsets:
|
if h not in self._id_offsets:
|
||||||
self._oeb.log.warning('Could not find TOC entry:', child.title)
|
self._oeb.log.warning('Could not find TOC entry:', child.title)
|
||||||
continue
|
continue
|
||||||
offset = self._id_offsets[h]
|
offset = self._id_offsets[h]
|
||||||
|
|
||||||
length = None
|
length = None
|
||||||
|
|
||||||
for sibling in entries[i+1:]:
|
for sibling in entries[i+1:]:
|
||||||
h2 = sibling.href
|
h2 = sibling.href
|
||||||
if h2 in self._id_offsets:
|
if h2 in self._id_offsets:
|
||||||
@ -441,9 +456,25 @@ class MobiWriter(object):
|
|||||||
if offset2 > offset:
|
if offset2 > offset:
|
||||||
length = offset2 - offset
|
length = offset2 - offset
|
||||||
break
|
break
|
||||||
|
|
||||||
if length is None:
|
if length is None:
|
||||||
length = self._content_length - offset
|
length = self._content_length - offset
|
||||||
|
|
||||||
|
# Look a gap between nodes
|
||||||
|
if (i) :
|
||||||
|
if offset != previousOffset + previousLength :
|
||||||
|
self._oeb.log.warning("\tnodes %d and %d have a gap:" % (i-1, i))
|
||||||
|
self._oeb.log.warning("\tnode %d offset: 0x%X \t node %d: offset: 0x%X length: 0x%X" % \
|
||||||
|
(i, offset, i-1, previousOffset, previousLength) )
|
||||||
|
self._oeb.log.warning('Failed to generate index')
|
||||||
|
# Zero out self._HTMLRecords, return False
|
||||||
|
self._HTMLRecords = []
|
||||||
|
last_name = None
|
||||||
|
return False
|
||||||
|
|
||||||
|
previousOffset = offset
|
||||||
|
previousLength = length
|
||||||
|
|
||||||
# Calculate the HTML record for this entry
|
# Calculate the HTML record for this entry
|
||||||
myStartingRecord = offset // RECORD_SIZE
|
myStartingRecord = offset // RECORD_SIZE
|
||||||
|
|
||||||
@ -461,19 +492,30 @@ class MobiWriter(object):
|
|||||||
# Calculate the ending HTMLRecord of this entry
|
# Calculate the ending HTMLRecord of this entry
|
||||||
myEndingRecord = (offset + length) // RECORD_SIZE
|
myEndingRecord = (offset + length) // RECORD_SIZE
|
||||||
|
|
||||||
# Tell the future HTML records about us
|
|
||||||
if myEndingRecord > myStartingRecord :
|
if myEndingRecord > myStartingRecord :
|
||||||
interimSpanRecord = myStartingRecord + 1
|
interimSpanRecord = myStartingRecord + 1
|
||||||
while interimSpanRecord <= myEndingRecord :
|
while interimSpanRecord <= myEndingRecord :
|
||||||
self._HTMLRecords[interimSpanRecord].continuingNode = myIndex
|
self._HTMLRecords[interimSpanRecord].continuingNode = myIndex
|
||||||
self._HTMLRecords[interimSpanRecord].currentSectionNodeCount = 1
|
self._HTMLRecords[interimSpanRecord].currentSectionNodeCount = 1
|
||||||
interimSpanRecord += 1
|
interimSpanRecord += 1
|
||||||
|
if self.opts.verbose > 3 :self._oeb.logger.info("\tnode %03d %-15.15s... spans HTML records %03d - %03d \t offset: 0x%06X length: 0x%06X" % \
|
||||||
|
(myIndex, child.title if child.title.strip() > "" else "(missing)", myStartingRecord, interimSpanRecord, offset, length) )
|
||||||
|
else :
|
||||||
|
if self.opts.verbose > 3 : self._oeb.logger.info("\tnode %03d %-15.15s... spans HTML records %03d - %03d \t offset: 0x%06X length: 0x%06X" % \
|
||||||
|
(myIndex, child.title if child.title.strip() > "" else "(missing)", myStartingRecord, myStartingRecord, offset, length) )
|
||||||
|
|
||||||
ctoc_offset = self._ctoc_map[child]
|
ctoc_offset = self._ctoc_map[child]
|
||||||
last_name = "%04d" % myIndex
|
last_name = "%04X" % myIndex
|
||||||
myIndex += 1
|
myIndex += 1
|
||||||
|
|
||||||
|
# Successfully parsed the entries
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
def _build_TBS_Book(self, nrecords, lastrecord):
|
def _build_TBS_Book(self, nrecords, lastrecord):
|
||||||
|
if self.opts.verbose > 3 and False :
|
||||||
|
self._oeb.logger.info("_build_TBS_Book: HTML record %d of %d" % (nrecords, lastrecord) )
|
||||||
|
self._HTMLRecords[nrecords].dumpData(nrecords,self._oeb)
|
||||||
|
|
||||||
# Variables for trailing byte sequence
|
# Variables for trailing byte sequence
|
||||||
tbsType = 0x00
|
tbsType = 0x00
|
||||||
@ -533,7 +575,9 @@ class MobiWriter(object):
|
|||||||
tbSequence += chr(self._HTMLRecords[nrecords].currentSectionNodeCount)
|
tbSequence += chr(self._HTMLRecords[nrecords].currentSectionNodeCount)
|
||||||
tbSequence += decint(len(tbSequence) + 1, DECINT_FORWARD)
|
tbSequence += decint(len(tbSequence) + 1, DECINT_FORWARD)
|
||||||
|
|
||||||
# print "record %d: tbsType %d" % (nrecords, tbsType)
|
if self.opts.verbose > 3 and False:
|
||||||
|
self._oeb.logger.info("record %d: tbsType %d" % (nrecords, tbsType) )
|
||||||
|
|
||||||
self._tbSequence = tbSequence
|
self._tbSequence = tbSequence
|
||||||
|
|
||||||
|
|
||||||
@ -556,16 +600,16 @@ class MobiWriter(object):
|
|||||||
self._oeb.logger.info('Compressing markup content...')
|
self._oeb.logger.info('Compressing markup content...')
|
||||||
data, overlap = self._read_text_record(text)
|
data, overlap = self._read_text_record(text)
|
||||||
|
|
||||||
# GR borrowed this from generate_index
|
# We need entries[] before calling self._build_HTMLRecords_Data_List()
|
||||||
# We seem to need it before calling self._build_HTMLRecords_Data_List()
|
self._ctoc = self._generate_ctoc()
|
||||||
ctoc = self._generate_ctoc()
|
|
||||||
|
|
||||||
# Build the HTMLRecords list so we can assemble the trailing bytes sequences in the following while loop
|
# Build the HTMLRecords list so we can assemble the trailing bytes sequences in the following while loop
|
||||||
toc = self._oeb.toc
|
toc = self._oeb.toc
|
||||||
entries = list(toc.iter())[1:]
|
entries = list(toc.iter())[1:]
|
||||||
hasNCXEntries = True if len(entries) else False
|
if len(entries) :
|
||||||
if hasNCXEntries :
|
self._hasValidNCXEntries = self._build_HTMLRecords_Data_List()
|
||||||
self._build_HTMLRecords_Data_List()
|
else :
|
||||||
|
self._hasValidNCXEntries = False
|
||||||
|
|
||||||
while len(data) > 0:
|
while len(data) > 0:
|
||||||
if self._compression == PALMDOC:
|
if self._compression == PALMDOC:
|
||||||
@ -579,7 +623,7 @@ class MobiWriter(object):
|
|||||||
running = offset
|
running = offset
|
||||||
|
|
||||||
# Write Trailing Byte Sequence
|
# Write Trailing Byte Sequence
|
||||||
if INDEXING and hasNCXEntries :
|
if INDEXING and self._hasValidNCXEntries:
|
||||||
# Dispatch to different TBS generators based upon publication type
|
# Dispatch to different TBS generators based upon publication type
|
||||||
booktype = 0x101 if self.opts.mobi_periodical else 0x002
|
booktype = 0x101 if self.opts.mobi_periodical else 0x002
|
||||||
if booktype == 0x002 :
|
if booktype == 0x002 :
|
||||||
@ -639,6 +683,7 @@ class MobiWriter(object):
|
|||||||
self._text_nrecords = nrecords
|
self._text_nrecords = nrecords
|
||||||
|
|
||||||
def _generate_indxt(self, ctoc):
|
def _generate_indxt(self, ctoc):
|
||||||
|
|
||||||
if self.opts.mobi_periodical:
|
if self.opts.mobi_periodical:
|
||||||
raise NotImplementedError('Indexing for periodicals not implemented')
|
raise NotImplementedError('Indexing for periodicals not implemented')
|
||||||
toc = self._oeb.toc
|
toc = self._oeb.toc
|
||||||
@ -655,7 +700,7 @@ class MobiWriter(object):
|
|||||||
|
|
||||||
pos = 0xc0 + indxt.tell()
|
pos = 0xc0 + indxt.tell()
|
||||||
indices.write(pack('>H', pos))
|
indices.write(pack('>H', pos))
|
||||||
name = "%04d"%count
|
name = "%04X"%count
|
||||||
indxt.write(chr(len(name)) + name)
|
indxt.write(chr(len(name)) + name)
|
||||||
indxt.write(INDXT['chapter'])
|
indxt.write(INDXT['chapter'])
|
||||||
indxt.write(decint(offset, DECINT_FORWARD))
|
indxt.write(decint(offset, DECINT_FORWARD))
|
||||||
@ -666,8 +711,6 @@ class MobiWriter(object):
|
|||||||
|
|
||||||
entries = list(toc.iter())[1:]
|
entries = list(toc.iter())[1:]
|
||||||
for i, child in enumerate(entries):
|
for i, child in enumerate(entries):
|
||||||
if not child.title or not child.title.strip():
|
|
||||||
continue
|
|
||||||
h = child.href
|
h = child.href
|
||||||
if h not in self._id_offsets:
|
if h not in self._id_offsets:
|
||||||
self._oeb.log.warning('Could not find TOC entry:', child.title)
|
self._oeb.log.warning('Could not find TOC entry:', child.title)
|
||||||
@ -686,7 +729,7 @@ class MobiWriter(object):
|
|||||||
|
|
||||||
add_node(child, offset, length, c)
|
add_node(child, offset, length, c)
|
||||||
ctoc_offset = self._ctoc_map[child]
|
ctoc_offset = self._ctoc_map[child]
|
||||||
last_name = "%04d"%c
|
last_name = "%04X"%c
|
||||||
c += 1
|
c += 1
|
||||||
|
|
||||||
return align_block(indxt.getvalue()), c, \
|
return align_block(indxt.getvalue()), c, \
|
||||||
@ -696,9 +739,9 @@ class MobiWriter(object):
|
|||||||
def _generate_index(self):
|
def _generate_index(self):
|
||||||
self._oeb.log('Generating primary index...')
|
self._oeb.log('Generating primary index...')
|
||||||
self._primary_index_record = None
|
self._primary_index_record = None
|
||||||
ctoc = self._generate_ctoc()
|
|
||||||
indxt, indxt_count, indices, last_name = \
|
indxt, indxt_count, indices, last_name = \
|
||||||
self._generate_indxt(ctoc)
|
self._generate_indxt(self._ctoc)
|
||||||
if last_name is None:
|
if last_name is None:
|
||||||
self._oeb.log.warn('Input document has no TOC. No index generated.')
|
self._oeb.log.warn('Input document has no TOC. No index generated.')
|
||||||
return
|
return
|
||||||
@ -801,7 +844,7 @@ class MobiWriter(object):
|
|||||||
indx0 = indx0.getvalue()
|
indx0 = indx0.getvalue()
|
||||||
|
|
||||||
self._primary_index_record = len(self._records)
|
self._primary_index_record = len(self._records)
|
||||||
self._records.extend([indx0, indx1, ctoc])
|
self._records.extend([indx0, indx1, self._ctoc])
|
||||||
|
|
||||||
# Turn this off for now
|
# Turn this off for now
|
||||||
if False:
|
if False:
|
||||||
@ -862,30 +905,36 @@ class MobiWriter(object):
|
|||||||
def _generate_ctoc(self):
|
def _generate_ctoc(self):
|
||||||
if self.opts.mobi_periodical:
|
if self.opts.mobi_periodical:
|
||||||
raise NotImplementedError('Indexing for periodicals not implemented')
|
raise NotImplementedError('Indexing for periodicals not implemented')
|
||||||
|
self._oeb.logger.info('Generating CTOC ...')
|
||||||
|
|
||||||
toc = self._oeb.toc
|
toc = self._oeb.toc
|
||||||
self._ctoc_map = {}
|
self._ctoc_map = {}
|
||||||
self._ctoc_name_map = {}
|
self._ctoc_name_map = {}
|
||||||
self._last_toc_entry = None
|
self._last_toc_entry = None
|
||||||
ctoc = StringIO()
|
ctoc = StringIO()
|
||||||
|
|
||||||
def add_node(node, cls):
|
def add_node(node, cls, title=None):
|
||||||
t = node.title
|
t = node.title if title is None else title
|
||||||
if not t:
|
|
||||||
t = _('Unnamed')
|
if t and t.strip():
|
||||||
t = t.strip()
|
t = t.strip()
|
||||||
if not isinstance(t, unicode):
|
if not isinstance(t, unicode):
|
||||||
t = t.decode('utf-8', 'replace')
|
t = t.decode('utf-8', 'replace')
|
||||||
t = t.encode('utf-8')
|
t = t.encode('utf-8')
|
||||||
self._last_toc_entry = t
|
self._last_toc_entry = t
|
||||||
self._ctoc_map[node] = ctoc.tell()
|
self._ctoc_map[node] = ctoc.tell()
|
||||||
self._ctoc_name_map[node] = t
|
self._ctoc_name_map[node] = t
|
||||||
ctoc.write(decint(len(t), DECINT_FORWARD)+t)
|
ctoc.write(decint(len(t), DECINT_FORWARD)+t)
|
||||||
|
else :
|
||||||
|
t = "(none)".encode('utf-8')
|
||||||
|
self._last_toc_entry = t
|
||||||
|
self._ctoc_map[node] = ctoc.tell()
|
||||||
|
self._ctoc_name_map[node] = t
|
||||||
|
ctoc.write(decint(len(t), DECINT_FORWARD)+t)
|
||||||
|
|
||||||
first = True
|
first = True
|
||||||
for child in toc.iter():
|
for child in toc.iter():
|
||||||
if not child.title:
|
add_node(child, 'chapter')#, title='Title Page' if first else None)
|
||||||
child.title = _('Unnamed')
|
|
||||||
add_node(child, 'chapter')
|
|
||||||
first = False
|
first = False
|
||||||
|
|
||||||
return align_block(ctoc.getvalue())
|
return align_block(ctoc.getvalue())
|
||||||
|
Loading…
x
Reference in New Issue
Block a user