Kovid Goyal 2011-07-26 11:25:56 -06:00
parent 919011f8e5
commit e61b86cd24
4 changed files with 77 additions and 41 deletions

View File

@@ -957,15 +957,17 @@ class TBSIndexing(object): # {{{
             return str({bin4(k):v for k, v in extra.iteritems()})
 
         tbs_type = 0
+        is_periodical = self.doc_type in (257, 258, 259)
         if len(byts):
-            outermost_index, extra, consumed = decode_tbs(byts)
+            outermost_index, extra, consumed = decode_tbs(byts, flag_size=4 if
+                    is_periodical else 3)
             byts = byts[consumed:]
             for k in extra:
                 tbs_type |= k
             ans.append('\nTBS: %d (%s)'%(tbs_type, bin4(tbs_type)))
             ans.append('Outermost index: %d'%outermost_index)
             ans.append('Unknown extra start bytes: %s'%repr_extra(extra))
-            if self.doc_type in (257, 259): # Hierarchical periodical
+            if is_periodical: # Hierarchical periodical
                 byts, a = self.interpret_periodical(tbs_type, byts,
                         dat['geom'][0])
                 ans += a
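
The hunk above keys the TBS flag width off the MOBI document type rather than hard-coding it. A minimal sketch of that selection, assuming only the periodical doc_type values (257, 258, 259) named in the hunk:

    # Sketch only: doc_type values taken from the hunk above.
    PERIODICAL_DOC_TYPES = (257, 258, 259)

    def tbs_flag_size(doc_type):
        # Periodical TBS entries carry 4 flag bits; plain books carry 3
        # (per the decode_tbs call in the hunk).
        return 4 if doc_type in PERIODICAL_DOC_TYPES else 3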

View File

@@ -66,11 +66,14 @@ def encint(value, forward=True):
     If forward is True the bytes returned are suitable for prepending to the
     output buffer, otherwise they must be append to the output buffer.
     '''
+    if value < 0:
+        raise ValueError('Cannot encode negative numbers as vwi')
     # Encode vwi
     byts = bytearray()
     while True:
         b = value & 0b01111111
         value >>= 7 # shift value to the right by 7 bits
         byts.append(b)
         if value == 0:
             break
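
The loop shown above splits the value into 7-bit groups, least significant first; with the new guard, negative inputs fail fast instead of looping forever. A self-contained sketch of just that loop (the real encint also handles the stop bit and byte order, omitted here):

    def vwi_groups(value):
        # Split value into 7-bit groups, least significant group first.
        if value < 0:
            raise ValueError('Cannot encode negative numbers as vwi')
        byts = bytearray()
        while True:
            byts.append(value & 0b01111111)  # keep the low 7 bits
            value >>= 7
            if value == 0:
                break
        return byts

    assert list(vwi_groups(129)) == [1, 1]  # 129 == (1 << 7) | 1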
@@ -198,24 +201,31 @@ def encode_trailing_data(raw):
         lsize += 1
     return raw + encoded
 
-def encode_fvwi(val, flags):
+def encode_fvwi(val, flags, flag_size=4):
     '''
-    Encode the value val and the 4 bit flags flags as a fvwi. This encoding is
+    Encode the value val and the flag_size bits from flags as a fvwi. This encoding is
     used in the trailing byte sequences for indexing. Returns encoded
     bytestring.
     '''
-    ans = (val << 4) | (flags & 0b1111)
+    ans = val << flag_size
+    for i in xrange(flag_size):
+        ans |= (flags & (1 << i))
     return encint(ans)
 
-def decode_fvwi(byts):
+def decode_fvwi(byts, flag_size=4):
     '''
     Decode encoded fvwi. Returns number, flags, consumed
     '''
     arg, consumed = decint(bytes(byts))
-    return (arg >> 4), (arg & 0b1111), consumed
+    val = arg >> flag_size
+    flags = 0
+    for i in xrange(flag_size):
+        flags |= (arg & (1 << i))
+    return val, flags, consumed
 
-def decode_tbs(byts):
+def decode_tbs(byts, flag_size=4):
     '''
     Trailing byte sequences for indexing consists of series of fvwi numbers.
     This function reads the fvwi number and its associated flags. It them uses
@@ -226,10 +236,10 @@ def decode_tbs(byts):
     data and the number of bytes consumed.
     '''
     byts = bytes(byts)
-    val, flags, consumed = decode_fvwi(byts)
+    val, flags, consumed = decode_fvwi(byts, flag_size=flag_size)
     extra = {}
     byts = byts[consumed:]
-    if flags & 0b1000:
+    if flags & 0b1000 and flag_size > 3:
         extra[0b1000] = True
     if flags & 0b0010:
         x, consumed2 = decint(byts)
@@ -247,7 +257,7 @@ def decode_tbs(byts):
         consumed += consumed2
     return val, extra, consumed
 
-def encode_tbs(val, extra):
+def encode_tbs(val, extra, flag_size=4):
     '''
     Encode the number val and the extra data in the extra dict as an fvwi. See
     decode_tbs above.
@@ -255,7 +265,7 @@ def encode_tbs(val, extra):
     flags = 0
     for flag in extra:
         flags |= flag
-    ans = encode_fvwi(val, flags)
+    ans = encode_fvwi(val, flags, flag_size=flag_size)
     if 0b0010 in extra:
         ans += encint(extra[0b0010])
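
The net effect of the new flag_size parameter is that the flag field packed into the low bits of an fvwi can now be 3 bits wide (books) as well as 4 (periodicals). A minimal sketch of the packing arithmetic, leaving out the vwi encoding step; pack_fvwi/unpack_fvwi are illustrative names, not calibre's API:

    def pack_fvwi(val, flags, flag_size=4):
        # Shift the value left to make room, then OR in the low flag bits.
        return (val << flag_size) | (flags & ((1 << flag_size) - 1))

    def unpack_fvwi(arg, flag_size=4):
        # Inverse: high bits are the value, low flag_size bits are the flags.
        return arg >> flag_size, arg & ((1 << flag_size) - 1)

    # Round trip with a 3-bit flag field, as now used for non-periodical books:
    assert unpack_fvwi(pack_fvwi(5, 0b011, flag_size=3), flag_size=3) == (5, 0b011)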

View File

@@ -28,13 +28,12 @@ class CNCX(object): # {{{
     MAX_STRING_LENGTH = 500
 
-    def __init__(self, toc, opts):
+    def __init__(self, toc, is_periodical):
         self.strings = OrderedDict()
-        for item in toc:
-            if item is self.toc: continue
+        for item in toc.iterdescendants():
             self.strings[item.title] = 0
-            if opts.mobi_periodical:
+            if is_periodical:
                 self.strings[item.klass] = 0
         self.records = []
@@ -91,6 +90,17 @@ class IndexEntry(object): # {{{
         self.first_child_index = None
         self.last_child_index = None
 
+    def __repr__(self):
+        return ('IndexEntry(offset=%r, depth=%r, length=%r, index=%r,'
+                ' parent_index=%r)')%(self.offset, self.depth, self.length,
+                        self.index, self.parent_index)
+
+    @dynamic_property
+    def size(self):
+        def fget(self): return self.length
+        def fset(self, val): self.length = val
+        return property(fget=fget, fset=fset, doc='Alias for length')
+
     @classmethod
     def tagx_block(cls, for_periodical=True):
         buf = bytearray()
@@ -137,7 +147,7 @@ class IndexEntry(object): # {{{
     def entry_type(self):
         ans = 0
         for tag in self.tag_nums:
-            ans |= (1 << self.BITMASKS[tag]) # 1 << x == 2**x
+            ans |= (1 << self.BITMASKS.index(tag)) # 1 << x == 2**x
         return ans
 
     @property
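
The entry_type fix matters because BITMASKS is a sequence of tag numbers: the bit to set is the tag's position in that sequence, not the element stored at index tag. An illustration with an assumed BITMASKS layout (the real values live on IndexEntry):

    BITMASKS = [1, 2, 3, 4, 21, 22, 23]  # assumed layout, for illustration only

    def entry_type(tag_nums):
        ans = 0
        for tag in tag_nums:
            ans |= 1 << BITMASKS.index(tag)  # 1 << x == 2**x
        return ans

    assert entry_type([1, 4]) == 0b1001  # tags 1 and 4 sit at positions 0 and 3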
@@ -152,7 +162,7 @@ class IndexEntry(object): # {{{
             val = getattr(self, attr)
             buf.write(encint(val))
-        ans = buf.get_value()
+        ans = buf.getvalue()
         return ans
 # }}}
@@ -175,13 +185,16 @@ class TBS(object): # {{{
                 # The starting bytes.
                 # The value is zero which I think indicates the periodical
                 # index entry. The values for the various flags seem to be
-                # unused. If the 0b0100 is present, it means that the record
+                # unused. If the 0b100 is present, it means that the record
                 # deals with section 1 (or is the final record with section
                 # transitions).
-                self.type_010 = encode_tbs(0, {0b0010: 0})
-                self.type_011 = encode_tbs(0, {0b0010: 0, 0b0001: 0})
-                self.type_110 = encode_tbs(0, {0b0100: 2, 0b0010: 0})
-                self.type_111 = encode_tbs(0, {0b0100: 2, 0b0010: 0, 0b0001: 0})
+                self.type_010 = encode_tbs(0, {0b010: 0}, flag_size=3)
+                self.type_011 = encode_tbs(0, {0b010: 0, 0b001: 0},
+                        flag_size=3)
+                self.type_110 = encode_tbs(0, {0b100: 2, 0b010: 0},
+                        flag_size=3)
+                self.type_111 = encode_tbs(0, {0b100: 2, 0b010: 0, 0b001:
+                        0}, flag_size=3)
 
         depth_map = defaultdict(list)
         for x in ('starts', 'ends', 'completes'):
@@ -221,12 +234,18 @@ class TBS(object): # {{{
                         self.type_010)
             elif not depth_map[1]:
                 # has only article nodes, i.e. spanned by a section
-                parent_section_index = self.depth_map[2][0].parent_index
+                parent_section_index = depth_map[2][0].parent_index
                 typ = (self.type_111 if parent_section_index == 1 else
                         self.type_010)
             else:
                 # has section transitions
-                parent_section_index = self.depth_map[2][0].parent_index
+                if depth_map[2]:
+                    parent_section_index = depth_map[2][0].parent_index
+                    typ = self.type_011
+                else:
+                    parent_section_index = depth_map[1][0].index
+                    typ = (self.type_110 if parent_section_index == 1 else
+                            self.type_011)
 
             buf.write(typ)
@@ -243,9 +262,10 @@ class TBS(object): # {{{
         if spanner is None:
             articles = depth_map[2]
-            sections = [self.section_map[a.parent_index] for a in articles]
-            sections.sort(key=lambda x:x.offset)
-            section_map = {s:[a for a in articles is a.parent_index ==
+            sections = set([self.section_map[a.parent_index] for a in
+                articles])
+            sections = sorted(sections, key=lambda x:x.offset)
+            section_map = {s:[a for a in articles if a.parent_index ==
                 s.index] for s in sections}
             for i, section in enumerate(sections):
                 # All the articles in this record that belong to section
@@ -257,7 +277,7 @@ class TBS(object): # {{{
                 try:
                     next_sec = sections[i+1]
                 except:
-                    next_sec == None
+                    next_sec = None
 
                 extra = {}
                 if num > 1:
@@ -299,14 +319,14 @@ class Indexer(object): # {{{
         self.log('Generating MOBI index for a %s'%('periodical' if
             self.is_periodical else 'book'))
         self.is_flat_periodical = False
-        if opts.mobi_periodical:
+        if self.is_periodical:
             periodical_node = iter(oeb.toc).next()
             sections = tuple(periodical_node)
             self.is_flat_periodical = len(sections) == 1
 
         self.records = []
-        self.cncx = CNCX(oeb.toc, opts)
+        self.cncx = CNCX(oeb.toc, self.is_periodical)
 
         if self.is_periodical:
             self.indices = self.create_periodical_index()
@@ -405,7 +425,7 @@ class Indexer(object): # {{{
         buf.write(pack(b'>I', 0)) # Filled in later
 
         # Number of index records 24-28
-        buf.write(pack('b>I', len(self.records)))
+        buf.write(pack(b'>I', len(self.records)))
 
         # Index Encoding 28-32
         buf.write(pack(b'>I', 65001)) # utf-8
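
The pack fix above is easy to miss: in 'b>I' the byte-order character is not first, so struct rejects the whole format string (and the stray 'b' code would demand an extra argument anyway); the intended format is the bytes literal b'>I'. A quick check:

    import struct

    assert struct.pack(b'>I', 3) == b'\x00\x00\x00\x03'  # 4-byte big-endian uint
    try:
        struct.pack('b>I', 3)
    except struct.error:
        pass  # 'b>I' is an invalid format: '>' must come first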
@@ -457,7 +477,7 @@ class Indexer(object): # {{{
         idxt_offset = buf.tell()
         buf.write(b'IDXT')
-        buf.write(header_length + len(tagx_block))
+        buf.write(pack(b'>H', header_length + len(tagx_block)))
         buf.write(b'\0')
         buf.seek(20)
         buf.write(pack(b'>I', idxt_offset))
@@ -567,7 +587,7 @@ class Indexer(object): # {{{
         for s, x in enumerate(normalized_sections):
             sec, normalized_articles = x
             try:
-                sec.length = normalized_sections[s+1].offset - sec.offset
+                sec.length = normalized_sections[s+1][0].offset - sec.offset
             except:
                 sec.length = self.serializer.body_end_offset - sec.offset
             for i, art in enumerate(normalized_articles):
@@ -583,17 +603,18 @@ class Indexer(object): # {{{
                 normalized_articles))
             normalized_sections[i] = (sec, normalized_articles)
 
-        normalized_sections = list(filter(lambda x: x[0].size > 0 and x[1],
+        normalized_sections = list(filter(lambda x: x[0].length > 0 and x[1],
             normalized_sections))
 
         # Set indices
         i = 0
-        for sec, normalized_articles in normalized_sections:
+        for sec, articles in normalized_sections:
             i += 1
             sec.index = i
+            sec.parent_index = 0
 
-        for sec, normalized_articles in normalized_sections:
-            for art in normalized_articles:
+        for sec, articles in normalized_sections:
+            for art in articles:
                 i += 1
                 art.index = i
                 art.parent_index = sec.index
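
The index assignment above is two-pass: all sections are numbered first (and now explicitly rooted at parent_index 0), then the articles continue the same counter with their section's index as parent, so section and article indices never interleave. A toy illustration (titles and structures made up, not calibre objects):

    # Toy data: (section, [articles]) pairs.
    sections = [('sec-a', ['art-1', 'art-2']), ('sec-b', ['art-3'])]

    i = 0
    index = {}
    for sec, _ in sections:        # pass 1: sections get 1..S
        i += 1
        index[sec] = (i, 0)        # parent_index 0 == periodical root
    for sec, arts in sections:     # pass 2: articles continue the counter
        for art in arts:
            i += 1
            index[art] = (i, index[sec][0])

    assert index == {'sec-a': (1, 0), 'sec-b': (2, 0),
                     'art-1': (3, 1), 'art-2': (4, 1), 'art-3': (5, 2)}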
@@ -606,7 +627,7 @@ class Indexer(object): # {{{
         for s, x in enumerate(normalized_sections):
             sec, articles = x
             try:
-                next_offset = normalized_sections[s+1].offset
+                next_offset = normalized_sections[s+1][0].offset
             except:
                 next_offset = self.serializer.body_end_offset
             sec.length = next_offset - sec.offset
@@ -622,7 +643,7 @@ class Indexer(object): # {{{
         for s, x in enumerate(normalized_sections):
             sec, articles = x
             try:
-                next_sec = normalized_sections[s+1]
+                next_sec = normalized_sections[s+1][0]
             except:
                 if (sec.length == 0 or sec.next_offset !=
                         self.serializer.body_end_offset):
@@ -659,6 +680,7 @@ class Indexer(object): # {{{
         self.tbs_map = {}
         found_node = False
         sections = [i for i in self.indices if i.depth == 1]
+        deepest = max(i.depth for i in self.indices)
         for i in xrange(self.number_of_text_records):
             offset = i * RECORD_SIZE
             next_offset = offset + RECORD_SIZE
@@ -683,7 +705,7 @@ class Indexer(object): # {{{
                 if index.next_offset <= next_offset:
                     # Node ends in current record
                     data['ends'].append(index)
-                else:
+                elif index.depth == deepest:
                     data['spans'] = index
             if (data['ends'] or data['completes'] or data['starts'] or
                     data['spans'] is not None):

View File

@@ -55,6 +55,7 @@ class MobiWriter(object):
         self.last_text_record_idx = 1
 
     def __call__(self, oeb, path_or_stream):
+        self.log = oeb.log
         if hasattr(path_or_stream, 'write'):
            return self.dump_stream(oeb, path_or_stream)
         with open(path_or_stream, 'w+b') as stream:
@@ -90,6 +91,7 @@ class MobiWriter(object):
         self.primary_index_record_idx = None
         try:
             self.indexer = Indexer(self.serializer, self.last_text_record_idx,
+                    len(self.records[self.last_text_record_idx]),
                     self.opts, self.oeb)
         except:
             self.log.exception('Failed to generate MOBI index:')