mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-06-23 15:30:45 -04:00)
...
commit e61b86cd24 (parent 919011f8e5)
@@ -957,15 +957,17 @@ class TBSIndexing(object): # {{{
             return str({bin4(k):v for k, v in extra.iteritems()})
 
         tbs_type = 0
+        is_periodical = self.doc_type in (257, 258, 259)
         if len(byts):
-            outermost_index, extra, consumed = decode_tbs(byts)
+            outermost_index, extra, consumed = decode_tbs(byts, flag_size=4 if
+                    is_periodical else 3)
             byts = byts[consumed:]
             for k in extra:
                 tbs_type |= k
             ans.append('\nTBS: %d (%s)'%(tbs_type, bin4(tbs_type)))
             ans.append('Outermost index: %d'%outermost_index)
             ans.append('Unknown extra start bytes: %s'%repr_extra(extra))
-            if self.doc_type in (257, 259): # Hierarchical periodical
+            if is_periodical: # Hierarchical periodical
                 byts, a = self.interpret_periodical(tbs_type, byts,
                         dat['geom'][0])
                 ans += a
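The hunk above makes the debug dumper pick the TBS flag width from the document type. A minimal sketch of that dispatch, assuming (as this diff implies) that doc types 257, 258 and 259 are the periodical variants and plain books use 3-bit flags:

    # Assumption: doc_type values taken from this diff; 2 is the ordinary
    # MOBI book type.
    def tbs_flag_size(doc_type):
        return 4 if doc_type in (257, 258, 259) else 3

    assert tbs_flag_size(2) == 3      # ordinary book
    assert tbs_flag_size(257) == 4    # hierarchical periodical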
@@ -66,11 +66,14 @@ def encint(value, forward=True):
     If forward is True the bytes returned are suitable for prepending to the
     output buffer, otherwise they must be append to the output buffer.
     '''
+    if value < 0:
+        raise ValueError('Cannot encode negative numbers as vwi')
+    # Encode vwi
     byts = bytearray()
     while True:
         b = value & 0b01111111
         value >>= 7 # shift value to the right by 7 bits
 
         byts.append(b)
         if value == 0:
             break
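For context, encint implements MOBI's variable-width integers: seven payload bits per byte, most significant group first. A self-contained sketch of the forward-read variant, assuming the usual convention (not visible in this hunk) that the high bit marks the final byte:

    def encode_vwi(value):
        # Accumulate 7-bit groups, least significant group first.
        if value < 0:
            raise ValueError('Cannot encode negative numbers as vwi')
        byts = bytearray()
        while True:
            byts.append(value & 0b01111111)
            value >>= 7
            if value == 0:
                break
        byts[0] |= 0b10000000   # terminator bit on what becomes the last byte
        byts.reverse()          # most significant group first
        return bytes(byts)

    assert encode_vwi(0) == b'\x80'
    assert encode_vwi(0x11111) == b'\x04\x22\x91'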
@@ -198,24 +201,31 @@ def encode_trailing_data(raw):
         lsize += 1
     return raw + encoded
 
-def encode_fvwi(val, flags):
+def encode_fvwi(val, flags, flag_size=4):
     '''
-    Encode the value val and the 4 bit flags flags as a fvwi. This encoding is
+    Encode the value val and the flag_size bits from flags as a fvwi. This encoding is
     used in the trailing byte sequences for indexing. Returns encoded
     bytestring.
     '''
-    ans = (val << 4) | (flags & 0b1111)
+    ans = val << flag_size
+    for i in xrange(flag_size):
+        ans |= (flags & (1 << i))
     return encint(ans)
 
 
-def decode_fvwi(byts):
+def decode_fvwi(byts, flag_size=4):
     '''
     Decode encoded fvwi. Returns number, flags, consumed
     '''
     arg, consumed = decint(bytes(byts))
-    return (arg >> 4), (arg & 0b1111), consumed
+    val = arg >> flag_size
+    flags = 0
+    for i in xrange(flag_size):
+        flags |= (arg & (1 << i))
+    return val, flags, consumed
 
 
-def decode_tbs(byts):
+def decode_tbs(byts, flag_size=4):
     '''
     Trailing byte sequences for indexing consists of series of fvwi numbers.
     This function reads the fvwi number and its associated flags. It them uses
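The heart of the change: fvwi packing is now parametrized on flag width. Stripped of the vwi byte encoding, the packing the new bodies implement reduces to a shift and a mask (the bit-by-bit loop in the patch is equivalent to masking with (1 << flag_size) - 1). A standalone sketch:

    def pack_fvwi(val, flags, flag_size=4):
        # Value in the high bits, flags in the low flag_size bits.
        return (val << flag_size) | (flags & ((1 << flag_size) - 1))

    def unpack_fvwi(packed, flag_size=4):
        return packed >> flag_size, packed & ((1 << flag_size) - 1)

    assert unpack_fvwi(pack_fvwi(3, 0b1010)) == (3, 0b1010)
    assert unpack_fvwi(pack_fvwi(3, 0b010, flag_size=3), flag_size=3) == (3, 0b010)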
@@ -226,10 +236,10 @@ def decode_tbs(byts):
     data and the number of bytes consumed.
     '''
     byts = bytes(byts)
-    val, flags, consumed = decode_fvwi(byts)
+    val, flags, consumed = decode_fvwi(byts, flag_size=flag_size)
     extra = {}
     byts = byts[consumed:]
-    if flags & 0b1000:
+    if flags & 0b1000 and flag_size > 3:
         extra[0b1000] = True
     if flags & 0b0010:
         x, consumed2 = decint(byts)
@@ -247,7 +257,7 @@ def decode_tbs(byts):
         consumed += consumed2
     return val, extra, consumed
 
-def encode_tbs(val, extra):
+def encode_tbs(val, extra, flag_size=4):
     '''
     Encode the number val and the extra data in the extra dict as an fvwi. See
     decode_tbs above.
@@ -255,7 +265,7 @@ def encode_tbs(val, extra):
     flags = 0
     for flag in extra:
         flags |= flag
-    ans = encode_fvwi(val, flags)
+    ans = encode_fvwi(val, flags, flag_size=flag_size)
 
     if 0b0010 in extra:
         ans += encint(extra[0b0010])
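After the leading fvwi, decode_tbs walks the decoded flags to pull any trailing values; note the new guard reserving the 0b1000 flag for 4-bit (periodical) sequences. A runnable sketch of that flag walk, with the extra vwi read replaced by a literal:

    extra = {}
    flags, flag_size = 0b1010, 4
    if flags & 0b1000 and flag_size > 3:
        extra[0b1000] = True    # periodical-only marker
    if flags & 0b0010:
        extra[0b0010] = 42      # the real code reads one more vwi here
    assert extra == {0b1000: True, 0b0010: 42}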
@@ -28,13 +28,12 @@ class CNCX(object): # {{{
 
     MAX_STRING_LENGTH = 500
 
-    def __init__(self, toc, opts):
+    def __init__(self, toc, is_periodical):
         self.strings = OrderedDict()
 
-        for item in toc:
-            if item is self.toc: continue
+        for item in toc.iterdescendants():
             self.strings[item.title] = 0
-            if opts.mobi_periodical:
+            if is_periodical:
                 self.strings[item.klass] = 0
 
         self.records = []
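CNCX now walks every descendant of the TOC rather than only its immediate children. A hypothetical stand-in for the TOC node (iterdescendants, title and klass are the only pieces of the real interface used here):

    from collections import OrderedDict

    class Node(object):
        def __init__(self, title, klass='', children=()):
            self.title, self.klass, self.children = title, klass, list(children)

        def iterdescendants(self):
            for child in self.children:
                yield child
                for desc in child.iterdescendants():
                    yield desc

    toc = Node('root', children=[
        Node('Section 1', 'section', [Node('Article 1', 'article')])])
    is_periodical = True
    strings = OrderedDict()
    for item in toc.iterdescendants():
        strings[item.title] = 0
        if is_periodical:
            strings[item.klass] = 0
    assert list(strings) == ['Section 1', 'section', 'Article 1', 'article']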
@@ -91,6 +90,17 @@ class IndexEntry(object): # {{{
         self.first_child_index = None
         self.last_child_index = None
 
+    def __repr__(self):
+        return ('IndexEntry(offset=%r, depth=%r, length=%r, index=%r,'
+                ' parent_index=%r)')%(self.offset, self.depth, self.length,
+                        self.index, self.parent_index)
+
+    @dynamic_property
+    def size(self):
+        def fget(self): return self.length
+        def fset(self, val): self.length = val
+        return property(fget=fget, fset=fset, doc='Alias for length')
+
     @classmethod
     def tagx_block(cls, for_periodical=True):
         buf = bytearray()
@@ -137,7 +147,7 @@ class IndexEntry(object): # {{{
     def entry_type(self):
         ans = 0
         for tag in self.tag_nums:
-            ans |= (1 << self.BITMASKS[tag]) # 1 << x == 2**x
+            ans |= (1 << self.BITMASKS.index(tag)) # 1 << x == 2**x
         return ans
 
     @property
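This one-token fix changes the meaning entirely: BITMASKS is a sequence of tag numbers, so the bit to set is the tag's position in the sequence, not the element at position tag. Illustrative values (assumed for this example; the real list lives in the indexer module):

    BITMASKS = [1, 2, 3, 4, 5, 21, 22, 23]   # assumed values for illustration
    tag = 21
    # Old code: 1 << BITMASKS[21] -> IndexError, or the wrong bit entirely.
    # New code: 1 << BITMASKS.index(21) == 1 << 5
    assert 1 << BITMASKS.index(tag) == 0b100000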
@@ -152,7 +162,7 @@ class IndexEntry(object): # {{{
             val = getattr(self, attr)
             buf.write(encint(val))
 
-        ans = buf.get_value()
+        ans = buf.getvalue()
         return ans
 
 # }}}
@@ -175,13 +185,16 @@ class TBS(object): # {{{
             # The starting bytes.
             # The value is zero which I think indicates the periodical
             # index entry. The values for the various flags seem to be
-            # unused. If the 0b0100 is present, it means that the record
+            # unused. If the 0b100 is present, it means that the record
             # deals with section 1 (or is the final record with section
             # transitions).
-            self.type_010 = encode_tbs(0, {0b0010: 0})
-            self.type_011 = encode_tbs(0, {0b0010: 0, 0b0001: 0})
-            self.type_110 = encode_tbs(0, {0b0100: 2, 0b0010: 0})
-            self.type_111 = encode_tbs(0, {0b0100: 2, 0b0010: 0, 0b0001: 0})
+            self.type_010 = encode_tbs(0, {0b010: 0}, flag_size=3)
+            self.type_011 = encode_tbs(0, {0b010: 0, 0b001: 0},
+                    flag_size=3)
+            self.type_110 = encode_tbs(0, {0b100: 2, 0b010: 0},
+                    flag_size=3)
+            self.type_111 = encode_tbs(0, {0b100: 2, 0b010: 0, 0b001:
+                0}, flag_size=3)
 
         depth_map = defaultdict(list)
         for x in ('starts', 'ends', 'completes'):
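For reference, the first of these constants can be worked out by hand from the functions above (derived here, so worth double-checking): the fvwi packs (0 << 3) | 0b010 = 2, which encint renders as the single byte 0x82 (terminator bit set), and the 0b010 flag then appends encint(0) = 0x80, so type_010 should come out as b'\x82\x80'.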
@@ -221,12 +234,18 @@ class TBS(object): # {{{
                             self.type_010)
                 elif not depth_map[1]:
                     # has only article nodes, i.e. spanned by a section
-                    parent_section_index = self.depth_map[2][0].parent_index
+                    parent_section_index = depth_map[2][0].parent_index
                     typ = (self.type_111 if parent_section_index == 1 else
                             self.type_010)
                 else:
                     # has section transitions
-                    parent_section_index = self.depth_map[2][0].parent_index
-                    typ = (self.type_110 if parent_section_index == 1 else
-                            self.type_011)
+                    if depth_map[2]:
+                        parent_section_index = depth_map[2][0].parent_index
+                        typ = self.type_011
+                    else:
+                        parent_section_index = depth_map[1][0].index
+                        typ = (self.type_110 if parent_section_index == 1 else
+                                self.type_011)
 
                 buf.write(typ)
@@ -243,9 +262,10 @@ class TBS(object): # {{{
 
         if spanner is None:
             articles = depth_map[2]
-            sections = [self.section_map[a.parent_index] for a in articles]
-            sections.sort(key=lambda x:x.offset)
-            section_map = {s:[a for a in articles is a.parent_index ==
+            sections = set([self.section_map[a.parent_index] for a in
+                articles])
+            sections = sorted(sections, key=lambda x:x.offset)
+            section_map = {s:[a for a in articles if a.parent_index ==
                 s.index] for s in sections}
             for i, section in enumerate(sections):
                 # All the articles in this record that belong to section
@@ -257,7 +277,7 @@ class TBS(object): # {{{
                 try:
                     next_sec = sections[i+1]
                 except:
-                    next_sec == None
+                    next_sec = None
 
                 extra = {}
                 if num > 1:
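The except branch above fixes a classic slip: a bare comparison is an expression statement, so the old line silently discarded its result and next_sec kept whatever value it already had. A two-line demonstration:

    next_sec = 'stale'
    next_sec == None        # no-op: evaluates to False and is thrown away
    assert next_sec == 'stale'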
@@ -299,14 +319,14 @@ class Indexer(object): # {{{
         self.log('Generating MOBI index for a %s'%('periodical' if
             self.is_periodical else 'book'))
         self.is_flat_periodical = False
-        if opts.mobi_periodical:
+        if self.is_periodical:
             periodical_node = iter(oeb.toc).next()
             sections = tuple(periodical_node)
             self.is_flat_periodical = len(sections) == 1
 
         self.records = []
 
-        self.cncx = CNCX(oeb.toc, opts)
+        self.cncx = CNCX(oeb.toc, self.is_periodical)
 
         if self.is_periodical:
             self.indices = self.create_periodical_index()
@@ -405,7 +425,7 @@ class Indexer(object): # {{{
         buf.write(pack(b'>I', 0)) # Filled in later
 
         # Number of index records 24-28
-        buf.write(pack('b>I', len(self.records)))
+        buf.write(pack(b'>I', len(self.records)))
 
         # Index Encoding 28-32
         buf.write(pack(b'>I', 65001)) # utf-8
@@ -457,7 +477,7 @@ class Indexer(object): # {{{
         idxt_offset = buf.tell()
 
         buf.write(b'IDXT')
-        buf.write(header_length + len(tagx_block))
+        buf.write(pack(b'>H', header_length + len(tagx_block)))
         buf.write(b'\0')
         buf.seek(20)
         buf.write(pack(b'>I', idxt_offset))
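This hunk and the previous one fix the same class of bug in the struct calls: the b belongs on the bytes literal, not inside the format string, and raw integers cannot be written to the buffer unpacked. A quick check of both:

    from struct import error, pack

    try:
        pack('b>I', 10)     # 'b' parses as a format code, then '>' is invalid
    except error:
        pass
    assert pack(b'>I', 10) == b'\x00\x00\x00\x0a'
    assert pack(b'>H', 276) == b'\x01\x14'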
@@ -567,7 +587,7 @@ class Indexer(object): # {{{
         for s, x in enumerate(normalized_sections):
             sec, normalized_articles = x
             try:
-                sec.length = normalized_sections[s+1].offset - sec.offset
+                sec.length = normalized_sections[s+1][0].offset - sec.offset
             except:
                 sec.length = self.serializer.body_end_offset - sec.offset
             for i, art in enumerate(normalized_articles):
@@ -583,17 +603,18 @@ class Indexer(object): # {{{
                 normalized_articles))
             normalized_sections[i] = (sec, normalized_articles)
 
-        normalized_sections = list(filter(lambda x: x[0].size > 0 and x[1],
+        normalized_sections = list(filter(lambda x: x[0].length > 0 and x[1],
             normalized_sections))
 
         # Set indices
         i = 0
-        for sec, normalized_articles in normalized_sections:
+        for sec, articles in normalized_sections:
             i += 1
             sec.index = i
             sec.parent_index = 0
 
-        for sec, normalized_articles in normalized_sections:
-            for art in normalized_articles:
+        for sec, articles in normalized_sections:
+            for art in articles:
                 i += 1
                 art.index = i
                 art.parent_index = sec.index
@@ -606,7 +627,7 @@ class Indexer(object): # {{{
         for s, x in enumerate(normalized_sections):
             sec, articles = x
             try:
-                next_offset = normalized_sections[s+1].offset
+                next_offset = normalized_sections[s+1][0].offset
             except:
                 next_offset = self.serializer.body_end_offset
             sec.length = next_offset - sec.offset
@@ -622,7 +643,7 @@ class Indexer(object): # {{{
         for s, x in enumerate(normalized_sections):
             sec, articles = x
             try:
-                next_sec = normalized_sections[s+1]
+                next_sec = normalized_sections[s+1][0]
             except:
                 if (sec.length == 0 or sec.next_offset !=
                         self.serializer.body_end_offset):
@@ -659,6 +680,7 @@ class Indexer(object): # {{{
         self.tbs_map = {}
         found_node = False
         sections = [i for i in self.indices if i.depth == 1]
+        deepest = max(i.depth for i in self.indices)
         for i in xrange(self.number_of_text_records):
             offset = i * RECORD_SIZE
             next_offset = offset + RECORD_SIZE
@@ -683,7 +705,7 @@ class Indexer(object): # {{{
                 if index.next_offset <= next_offset:
                     # Node ends in current record
                     data['ends'].append(index)
-                else:
+                elif index.depth == deepest:
                     data['spans'] = index
             if (data['ends'] or data['completes'] or data['starts'] or
                     data['spans'] is not None):
@@ -55,6 +55,7 @@ class MobiWriter(object):
         self.last_text_record_idx = 1
 
     def __call__(self, oeb, path_or_stream):
+        self.log = oeb.log
         if hasattr(path_or_stream, 'write'):
             return self.dump_stream(oeb, path_or_stream)
         with open(path_or_stream, 'w+b') as stream:
@@ -90,6 +91,7 @@ class MobiWriter(object):
         self.primary_index_record_idx = None
         try:
             self.indexer = Indexer(self.serializer, self.last_text_record_idx,
+                    len(self.records[self.last_text_record_idx]),
                     self.opts, self.oeb)
         except:
             self.log.exception('Failed to generate MOBI index:')