This commit is contained in:
Kovid Goyal 2011-07-18 14:06:09 -06:00
parent 59d9e15580
commit dbefbfbd86

View File

@ -1642,6 +1642,61 @@ class MobiWriter(object):
for record in self._records:
self._write(record)
def _clean_text_value(self, text):
if text is not None and text.strip() :
text = text.strip()
if not isinstance(text, unicode):
text = text.decode('utf-8', 'replace')
text = normalize(text).encode('utf-8')
else :
text = "(none)".encode('utf-8')
return text
def _compute_offset_length(self, i, node, entries) :
h = node.href
if h not in self._id_offsets:
self._oeb.log.warning('Could not find TOC entry:', node.title)
return -1, -1
offset = self._id_offsets[h]
length = None
# Calculate length based on next entry's offset
for sibling in entries[i+1:]:
h2 = sibling.href
if h2 in self._id_offsets:
offset2 = self._id_offsets[h2]
if offset2 > offset:
length = offset2 - offset
break
if length is None:
length = self._content_length - offset
return offset, length
def _establish_document_structure(self) :
documentType = None
try :
klass = self._ctoc_map[0]['klass']
except :
klass = None
if klass == 'chapter' or klass == None :
documentType = 'book'
if self.opts.verbose > 2 :
self._oeb.logger.info("Adding a MobiBook to self._MobiDoc")
self._MobiDoc.documentStructure = MobiBook()
elif klass == 'periodical' :
documentType = klass
if self.opts.verbose > 2 :
self._oeb.logger.info("Adding a MobiPeriodical to self._MobiDoc")
self._MobiDoc.documentStructure = MobiPeriodical(self._MobiDoc.getNextNode())
self._MobiDoc.documentStructure.startAddress = self._anchor_offset_kindle
else :
raise NotImplementedError('_establish_document_structure: unrecognized klass: %s' % klass)
return documentType
# Index {{{
def _generate_index(self):
self._oeb.log('Generating INDX ...')
self._primary_index_record = None
@ -1815,276 +1870,7 @@ class MobiWriter(object):
open(os.path.join(t, n+'.bin'), 'wb').write(self._records[-(i+1)])
self._oeb.log.debug('Index records dumped to', t)
def _clean_text_value(self, text):
if text is not None and text.strip() :
text = text.strip()
if not isinstance(text, unicode):
text = text.decode('utf-8', 'replace')
text = normalize(text).encode('utf-8')
else :
text = "(none)".encode('utf-8')
return text
def _add_to_ctoc(self, ctoc_str, record_offset):
# Write vwilen + string to ctoc
# Return offset
# Is there enough room for this string in the current ctoc record?
if 0xfbf8 - self._ctoc.tell() < 2 + len(ctoc_str):
# flush this ctoc, start a new one
# print "closing ctoc_record at 0x%X" % self._ctoc.tell()
# print "starting new ctoc with '%-50.50s ...'" % ctoc_str
# pad with 00
pad = 0xfbf8 - self._ctoc.tell()
# print "padding %d bytes of 00" % pad
self._ctoc.write('\0' * (pad))
self._ctoc_records.append(self._ctoc.getvalue())
self._ctoc.truncate(0)
self._ctoc_offset += 0x10000
record_offset = self._ctoc_offset
offset = self._ctoc.tell() + record_offset
self._ctoc.write(decint(len(ctoc_str), DECINT_FORWARD) + ctoc_str)
return offset
def _add_flat_ctoc_node(self, node, ctoc, title=None):
# Process 'chapter' or 'article' nodes only, force either to 'chapter'
t = node.title if title is None else title
t = self._clean_text_value(t)
self._last_toc_entry = t
# Create an empty dictionary for this node
ctoc_name_map = {}
# article = chapter
if node.klass == 'article' :
ctoc_name_map['klass'] = 'chapter'
else :
ctoc_name_map['klass'] = node.klass
# Add title offset to name map
ctoc_name_map['titleOffset'] = self._add_to_ctoc(t, self._ctoc_offset)
self._chapterCount += 1
# append this node's name_map to map
self._ctoc_map.append(ctoc_name_map)
return
def _add_structured_ctoc_node(self, node, ctoc, title=None):
# Process 'periodical', 'section' and 'article'
# Fetch the offset referencing the current ctoc_record
if node.klass is None :
return
t = node.title if title is None else title
t = self._clean_text_value(t)
self._last_toc_entry = t
# Create an empty dictionary for this node
ctoc_name_map = {}
# Add the klass of this node
ctoc_name_map['klass'] = node.klass
if node.klass == 'chapter':
# Add title offset to name map
ctoc_name_map['titleOffset'] = self._add_to_ctoc(t, self._ctoc_offset)
self._chapterCount += 1
elif node.klass == 'periodical' :
# Add title offset
ctoc_name_map['titleOffset'] = self._add_to_ctoc(t, self._ctoc_offset)
# Look for existing class entry 'periodical' in _ctoc_map
for entry in self._ctoc_map:
if entry['klass'] == 'periodical':
# Use the pre-existing instance
ctoc_name_map['classOffset'] = entry['classOffset']
break
else :
continue
else:
# class names should always be in CNCX 0 - no offset
ctoc_name_map['classOffset'] = self._add_to_ctoc(node.klass, 0)
self._periodicalCount += 1
elif node.klass == 'section' :
# Add title offset
ctoc_name_map['titleOffset'] = self._add_to_ctoc(t, self._ctoc_offset)
# Look for existing class entry 'section' in _ctoc_map
for entry in self._ctoc_map:
if entry['klass'] == 'section':
# Use the pre-existing instance
ctoc_name_map['classOffset'] = entry['classOffset']
break
else :
continue
else:
# class names should always be in CNCX 0 - no offset
ctoc_name_map['classOffset'] = self._add_to_ctoc(node.klass, 0)
self._sectionCount += 1
elif node.klass == 'article' :
# Add title offset/title
ctoc_name_map['titleOffset'] = self._add_to_ctoc(t, self._ctoc_offset)
# Look for existing class entry 'article' in _ctoc_map
for entry in self._ctoc_map:
if entry['klass'] == 'article':
ctoc_name_map['classOffset'] = entry['classOffset']
break
else :
continue
else:
# class names should always be in CNCX 0 - no offset
ctoc_name_map['classOffset'] = self._add_to_ctoc(node.klass, 0)
# Add description offset/description
if node.description :
d = self._clean_text_value(node.description)
ctoc_name_map['descriptionOffset'] = self._add_to_ctoc(d, self._ctoc_offset)
else :
ctoc_name_map['descriptionOffset'] = None
# Add author offset/attribution
if node.author :
a = self._clean_text_value(node.author)
ctoc_name_map['authorOffset'] = self._add_to_ctoc(a, self._ctoc_offset)
else :
ctoc_name_map['authorOffset'] = None
self._articleCount += 1
else :
raise NotImplementedError( \
'writer._generate_ctoc.add_node: title: %s has unrecognized klass: %s, playOrder: %d' % \
(node.title, node.klass, node.play_order))
# append this node's name_map to map
self._ctoc_map.append(ctoc_name_map)
def _generate_ctoc(self):
# Generate the compiled TOC strings
# Each node has 1-4 CTOC entries:
# Periodical (0xDF)
# title, class
# Section (0xFF)
# title, class
# Article (0x3F)
# title, class, description, author
# Chapter (0x0F)
# title, class
# nb: Chapters don't actually have @class, so we synthesize it
# in reader._toc_from_navpoint
toc = self._oeb.toc
reduced_toc = []
self._ctoc_map = [] # per node dictionary of {class/title/desc/author} offsets
self._last_toc_entry = None
#ctoc = StringIO()
self._ctoc = StringIO()
# Track the individual node types
self._periodicalCount = 0
self._sectionCount = 0
self._articleCount = 0
self._chapterCount = 0
#first = True
if self._conforming_periodical_toc :
self._oeb.logger.info('Generating structured CTOC ...')
for (child) in toc.iter():
if self.opts.verbose > 2 :
self._oeb.logger.info(" %s" % child)
self._add_structured_ctoc_node(child, self._ctoc)
#first = False
else :
self._oeb.logger.info('Generating flat CTOC ...')
previousOffset = -1
currentOffset = 0
for (i, child) in enumerate(toc.iterdescendants()):
# Only add chapters or articles at depth==1
# no class defaults to 'chapter'
if child.klass is None : child.klass = 'chapter'
if (child.klass == 'article' or child.klass == 'chapter') and child.depth() == 1 :
if self.opts.verbose > 2 :
self._oeb.logger.info("adding (klass:%s depth:%d) %s to flat ctoc" % \
(child.klass, child.depth(), child) )
# Test to see if this child's offset is the same as the previous child's
# offset, skip it
h = child.href
if h is None:
self._oeb.logger.warn(' Ignoring TOC entry with no href:',
child.title)
continue
if h not in self._id_offsets:
self._oeb.logger.warn(' Ignoring missing TOC entry:',
unicode(child))
continue
currentOffset = self._id_offsets[h]
# print "_generate_ctoc: child offset: 0x%X" % currentOffset
if currentOffset != previousOffset :
self._add_flat_ctoc_node(child, self._ctoc)
reduced_toc.append(child)
previousOffset = currentOffset
else :
self._oeb.logger.warn(" Ignoring redundant href: %s in '%s'" % (h, child.title))
else :
if self.opts.verbose > 2 :
self._oeb.logger.info("skipping class: %s depth %d at position %d" % \
(child.klass, child.depth(),i))
# Update the TOC with our edited version
self._oeb.toc.nodes = reduced_toc
# Instantiate a MobiDocument(mobitype)
if (not self._periodicalCount and not self._sectionCount and not self._articleCount) or \
not self.opts.mobi_periodical :
mobiType = 0x002
elif self._periodicalCount:
pt = None
if self._oeb.metadata.publication_type:
x = unicode(self._oeb.metadata.publication_type[0]).split(':')
if len(x) > 1:
pt = x[1]
mobiType = {'newspaper':0x101}.get(pt, 0x103)
else :
raise NotImplementedError('_generate_ctoc: Unrecognized document structured')
self._MobiDoc = MobiDocument(mobiType)
if self.opts.verbose > 2 :
structType = 'book'
if mobiType > 0x100 :
structType = 'flat periodical' if mobiType == 0x102 else 'structured periodical'
self._oeb.logger.info("Instantiating a %s MobiDocument of type 0x%X" % (structType, mobiType ) )
if mobiType > 0x100 :
self._oeb.logger.info("periodicalCount: %d sectionCount: %d articleCount: %d"% \
(self._periodicalCount, self._sectionCount, self._articleCount) )
else :
self._oeb.logger.info("chapterCount: %d" % self._chapterCount)
# Apparently the CTOC must end with a null byte
self._ctoc.write('\0')
ctoc = self._ctoc.getvalue()
rec_count = len(self._ctoc_records)
self._oeb.logger.info(" CNCX utilization: %d %s %.0f%% full" % \
(rec_count + 1, 'records, last record' if rec_count else 'record,',
len(ctoc)/655) )
return align_block(ctoc)
# Index nodes {{{
def _write_periodical_node(self, indxt, indices, index, offset, length, count, firstSection, lastSection) :
pos = 0xc0 + indxt.tell()
indices.write(pack('>H', pos)) # Save the offset for IDXTIndices
@ -2176,48 +1962,8 @@ class MobiWriter(object):
indxt.write(decint(self._ctoc_map[index]['titleOffset'], DECINT_FORWARD)) # vwi title offset in CNCX
indxt.write(decint(0, DECINT_FORWARD)) # unknown byte
def _compute_offset_length(self, i, node, entries) :
h = node.href
if h not in self._id_offsets:
self._oeb.log.warning('Could not find TOC entry:', node.title)
return -1, -1
# }}}
offset = self._id_offsets[h]
length = None
# Calculate length based on next entry's offset
for sibling in entries[i+1:]:
h2 = sibling.href
if h2 in self._id_offsets:
offset2 = self._id_offsets[h2]
if offset2 > offset:
length = offset2 - offset
break
if length is None:
length = self._content_length - offset
return offset, length
def _establish_document_structure(self) :
documentType = None
try :
klass = self._ctoc_map[0]['klass']
except :
klass = None
if klass == 'chapter' or klass == None :
documentType = 'book'
if self.opts.verbose > 2 :
self._oeb.logger.info("Adding a MobiBook to self._MobiDoc")
self._MobiDoc.documentStructure = MobiBook()
elif klass == 'periodical' :
documentType = klass
if self.opts.verbose > 2 :
self._oeb.logger.info("Adding a MobiPeriodical to self._MobiDoc")
self._MobiDoc.documentStructure = MobiPeriodical(self._MobiDoc.getNextNode())
self._MobiDoc.documentStructure.startAddress = self._anchor_offset_kindle
else :
raise NotImplementedError('_establish_document_structure: unrecognized klass: %s' % klass)
return documentType
def _generate_section_indices(self, child, currentSection, myPeriodical, myDoc ) :
sectionTitles = list(child.iter())[1:]
@ -2495,6 +2241,270 @@ class MobiWriter(object):
last_name, c = self._add_periodical_structured_articles(myDoc, indxt, indices)
return align_block(indxt.getvalue()), c, align_block(indices.getvalue()), last_name
# }}}
# CTOC {{{
def _add_to_ctoc(self, ctoc_str, record_offset):
# Write vwilen + string to ctoc
# Return offset
# Is there enough room for this string in the current ctoc record?
if 0xfbf8 - self._ctoc.tell() < 2 + len(ctoc_str):
# flush this ctoc, start a new one
# print "closing ctoc_record at 0x%X" % self._ctoc.tell()
# print "starting new ctoc with '%-50.50s ...'" % ctoc_str
# pad with 00
pad = 0xfbf8 - self._ctoc.tell()
# print "padding %d bytes of 00" % pad
self._ctoc.write('\0' * (pad))
self._ctoc_records.append(self._ctoc.getvalue())
self._ctoc.truncate(0)
self._ctoc_offset += 0x10000
record_offset = self._ctoc_offset
offset = self._ctoc.tell() + record_offset
self._ctoc.write(decint(len(ctoc_str), DECINT_FORWARD) + ctoc_str)
return offset
def _add_flat_ctoc_node(self, node, ctoc, title=None):
# Process 'chapter' or 'article' nodes only, force either to 'chapter'
t = node.title if title is None else title
t = self._clean_text_value(t)
self._last_toc_entry = t
# Create an empty dictionary for this node
ctoc_name_map = {}
# article = chapter
if node.klass == 'article' :
ctoc_name_map['klass'] = 'chapter'
else :
ctoc_name_map['klass'] = node.klass
# Add title offset to name map
ctoc_name_map['titleOffset'] = self._add_to_ctoc(t, self._ctoc_offset)
self._chapterCount += 1
# append this node's name_map to map
self._ctoc_map.append(ctoc_name_map)
return
def _add_structured_ctoc_node(self, node, ctoc, title=None):
# Process 'periodical', 'section' and 'article'
# Fetch the offset referencing the current ctoc_record
if node.klass is None :
return
t = node.title if title is None else title
t = self._clean_text_value(t)
self._last_toc_entry = t
# Create an empty dictionary for this node
ctoc_name_map = {}
# Add the klass of this node
ctoc_name_map['klass'] = node.klass
if node.klass == 'chapter':
# Add title offset to name map
ctoc_name_map['titleOffset'] = self._add_to_ctoc(t, self._ctoc_offset)
self._chapterCount += 1
elif node.klass == 'periodical' :
# Add title offset
ctoc_name_map['titleOffset'] = self._add_to_ctoc(t, self._ctoc_offset)
# Look for existing class entry 'periodical' in _ctoc_map
for entry in self._ctoc_map:
if entry['klass'] == 'periodical':
# Use the pre-existing instance
ctoc_name_map['classOffset'] = entry['classOffset']
break
else :
continue
else:
# class names should always be in CNCX 0 - no offset
ctoc_name_map['classOffset'] = self._add_to_ctoc(node.klass, 0)
self._periodicalCount += 1
elif node.klass == 'section' :
# Add title offset
ctoc_name_map['titleOffset'] = self._add_to_ctoc(t, self._ctoc_offset)
# Look for existing class entry 'section' in _ctoc_map
for entry in self._ctoc_map:
if entry['klass'] == 'section':
# Use the pre-existing instance
ctoc_name_map['classOffset'] = entry['classOffset']
break
else :
continue
else:
# class names should always be in CNCX 0 - no offset
ctoc_name_map['classOffset'] = self._add_to_ctoc(node.klass, 0)
self._sectionCount += 1
elif node.klass == 'article' :
# Add title offset/title
ctoc_name_map['titleOffset'] = self._add_to_ctoc(t, self._ctoc_offset)
# Look for existing class entry 'article' in _ctoc_map
for entry in self._ctoc_map:
if entry['klass'] == 'article':
ctoc_name_map['classOffset'] = entry['classOffset']
break
else :
continue
else:
# class names should always be in CNCX 0 - no offset
ctoc_name_map['classOffset'] = self._add_to_ctoc(node.klass, 0)
# Add description offset/description
if node.description :
d = self._clean_text_value(node.description)
ctoc_name_map['descriptionOffset'] = self._add_to_ctoc(d, self._ctoc_offset)
else :
ctoc_name_map['descriptionOffset'] = None
# Add author offset/attribution
if node.author :
a = self._clean_text_value(node.author)
ctoc_name_map['authorOffset'] = self._add_to_ctoc(a, self._ctoc_offset)
else :
ctoc_name_map['authorOffset'] = None
self._articleCount += 1
else :
raise NotImplementedError( \
'writer._generate_ctoc.add_node: title: %s has unrecognized klass: %s, playOrder: %d' % \
(node.title, node.klass, node.play_order))
# append this node's name_map to map
self._ctoc_map.append(ctoc_name_map)
def _generate_ctoc(self):
# Generate the compiled TOC strings
# Each node has 1-4 CTOC entries:
# Periodical (0xDF)
# title, class
# Section (0xFF)
# title, class
# Article (0x3F)
# title, class, description, author
# Chapter (0x0F)
# title, class
# nb: Chapters don't actually have @class, so we synthesize it
# in reader._toc_from_navpoint
toc = self._oeb.toc
reduced_toc = []
self._ctoc_map = [] # per node dictionary of {class/title/desc/author} offsets
self._last_toc_entry = None
#ctoc = StringIO()
self._ctoc = StringIO()
# Track the individual node types
self._periodicalCount = 0
self._sectionCount = 0
self._articleCount = 0
self._chapterCount = 0
#first = True
if self._conforming_periodical_toc :
self._oeb.logger.info('Generating structured CTOC ...')
for (child) in toc.iter():
if self.opts.verbose > 2 :
self._oeb.logger.info(" %s" % child)
self._add_structured_ctoc_node(child, self._ctoc)
#first = False
else :
self._oeb.logger.info('Generating flat CTOC ...')
previousOffset = -1
currentOffset = 0
for (i, child) in enumerate(toc.iterdescendants()):
# Only add chapters or articles at depth==1
# no class defaults to 'chapter'
if child.klass is None : child.klass = 'chapter'
if (child.klass == 'article' or child.klass == 'chapter') and child.depth() == 1 :
if self.opts.verbose > 2 :
self._oeb.logger.info("adding (klass:%s depth:%d) %s to flat ctoc" % \
(child.klass, child.depth(), child) )
# Test to see if this child's offset is the same as the previous child's
# offset, skip it
h = child.href
if h is None:
self._oeb.logger.warn(' Ignoring TOC entry with no href:',
child.title)
continue
if h not in self._id_offsets:
self._oeb.logger.warn(' Ignoring missing TOC entry:',
unicode(child))
continue
currentOffset = self._id_offsets[h]
# print "_generate_ctoc: child offset: 0x%X" % currentOffset
if currentOffset != previousOffset :
self._add_flat_ctoc_node(child, self._ctoc)
reduced_toc.append(child)
previousOffset = currentOffset
else :
self._oeb.logger.warn(" Ignoring redundant href: %s in '%s'" % (h, child.title))
else :
if self.opts.verbose > 2 :
self._oeb.logger.info("skipping class: %s depth %d at position %d" % \
(child.klass, child.depth(),i))
# Update the TOC with our edited version
self._oeb.toc.nodes = reduced_toc
# Instantiate a MobiDocument(mobitype)
if (not self._periodicalCount and not self._sectionCount and not self._articleCount) or \
not self.opts.mobi_periodical :
mobiType = 0x002
elif self._periodicalCount:
pt = None
if self._oeb.metadata.publication_type:
x = unicode(self._oeb.metadata.publication_type[0]).split(':')
if len(x) > 1:
pt = x[1]
mobiType = {'newspaper':0x101}.get(pt, 0x103)
else :
raise NotImplementedError('_generate_ctoc: Unrecognized document structured')
self._MobiDoc = MobiDocument(mobiType)
if self.opts.verbose > 2 :
structType = 'book'
if mobiType > 0x100 :
structType = 'flat periodical' if mobiType == 0x102 else 'structured periodical'
self._oeb.logger.info("Instantiating a %s MobiDocument of type 0x%X" % (structType, mobiType ) )
if mobiType > 0x100 :
self._oeb.logger.info("periodicalCount: %d sectionCount: %d articleCount: %d"% \
(self._periodicalCount, self._sectionCount, self._articleCount) )
else :
self._oeb.logger.info("chapterCount: %d" % self._chapterCount)
# Apparently the CTOC must end with a null byte
self._ctoc.write('\0')
ctoc = self._ctoc.getvalue()
rec_count = len(self._ctoc_records)
self._oeb.logger.info(" CNCX utilization: %d %s %.0f%% full" % \
(rec_count + 1, 'records, last record' if rec_count else 'record,',
len(ctoc)/655) )
return align_block(ctoc)
# }}}
class HTMLRecordData(object):
""" A data structure containing indexing/navigation data for an HTML record """