...

2025-07-09 03:04:10 -04:00 · 2011-07-23 01:04:49 -06:00 · 2011-07-23 01:04:49 -06:00 · 9800c93daa
commit 9800c93daa
parent 34d3ce25aa
2 changed files with 233 additions and 15 deletions
--- a/src/calibre/ebooks/mobi/debug.py
+++ b/src/calibre/ebooks/mobi/debug.py
@ -406,7 +406,7 @@ class IndexHeader(object): # {{{
        self.unknown1 = raw[8:16]
        self.index_type, = struct.unpack('>I', raw[16:20])
        self.index_type_desc = {0: 'normal', 2:
-                'inflection'}.get(self.index_type, 'unknown')
+                'inflection', 6: 'calibre'}.get(self.index_type, 'unknown')
        self.idxt_start, = struct.unpack('>I', raw[20:24])
        self.index_count, = struct.unpack('>I', raw[24:28])
        self.index_encoding_num, = struct.unpack('>I', raw[28:32])
@ -596,10 +596,11 @@ class IndexEntry(object): # {{{
            0x3f : 'article',
    }

-    def __init__(self, ident, entry_type, raw, cncx, tagx_entries):
+    def __init__(self, ident, entry_type, raw, cncx, tagx_entries, flags=0):
        self.index = ident
        self.raw = raw
        self.tags = []
+        self.entry_type_raw = entry_type

        try:
            self.entry_type = self.TYPES[entry_type]
@ -619,6 +620,24 @@ class IndexEntry(object): # {{{
                vals.append(val)
            self.tags.append(Tag(tag, vals, self.entry_type, cncx))

+        if flags & 0b10:
+            # Look for optional description and author
+            desc_tag = [t for t in tagx_entries if t.tag == 22]
+            if desc_tag and raw:
+                val, consumed = decint(raw)
+                raw = raw[consumed:]
+                if val:
+                    self.tags.append(Tag(desc_tag[0], [val], self.entry_type,
+                        cncx))
+        if flags & 0b100:
+            aut_tag = [t for t in tagx_entries if t.tag == 23]
+            if aut_tag and raw:
+                val, consumed = decint(raw)
+                raw = raw[consumed:]
+                if val:
+                    self.tags.append(Tag(aut_tag[0], [val], self.entry_type,
+                        cncx))
+
    @property
    def label(self):
        for tag in self.tags:
@ -669,8 +688,8 @@ class IndexEntry(object): # {{{
        return -1

    def __str__(self):
-        ans = ['Index Entry(index=%s, entry_type=%s, length=%d)'%(
-            self.index, self.entry_type, len(self.tags))]
+        ans = ['Index Entry(index=%s, entry_type=%s (%s), length=%d)'%(
+            self.index, self.entry_type, bin(self.entry_type_raw)[2:], len(self.tags))]
        for tag in self.tags:
            ans.append('\t'+str(tag))
        if self.first_child_index != -1:
@ -723,8 +742,13 @@ class IndexRecord(object): # {{{
                next_off = len(indxt)
            index, consumed = decode_hex_number(indxt[off:])
            entry_type = ord(indxt[off+consumed])
+            d = 1
+            if index_header.index_type == 6:
+                flags = ord(indxt[off+consumed+d])
+                d += 1
            self.indices.append(IndexEntry(index, entry_type,
-                indxt[off+consumed+1:next_off], cncx, index_header.tagx_entries))
+                indxt[off+consumed+d:next_off], cncx,
+                index_header.tagx_entries, flags=flags))
            index = self.indices[-1]

    def get_parent(self, index):
--- a/src/calibre/ebooks/mobi/writer2/indexer.py
+++ b/src/calibre/ebooks/mobi/writer2/indexer.py
@ -2,6 +2,7 @@
 # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
 from __future__ import (unicode_literals, division, absolute_import,
                        print_function)
+from future_builtins import filter

 __license__   = 'GPL v3'
 __copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
@ -13,7 +14,7 @@ from collections import OrderedDict

 from calibre.ebooks import normalize
 from calibre.ebook.mobi.writer2 import RECORD_SIZE
-from calibre.ebooks.mobi.utils import encint
+from calibre.ebooks.mobi.utils import (encint, encode_number_as_hex)

 def utf8_text(text):
    '''
@ -56,10 +57,6 @@ class CNCX(object): # {{{
            self.strings[item.title] = 0
            if opts.mobi_periodical:
                self.strings[item.klass] = 0
-                if item.description:
-                    self.strings[item.description] = 0
-                if item.author:
-                    self.string[item.author] = 0

        self.records = []
        offset = 0
@ -88,6 +85,69 @@ class CNCX(object): # {{{
        return self.strings[string]
 # }}}

+class IndexEntry(object):
+
+    TAG_VALUES = {
+            'offset': 1,
+            'size': 2,
+            'label_offset': 3,
+            'depth': 4,
+            'class_offset': 5,
+            'parent_index': 21,
+            'first_child_index': 22,
+            'last_child_index': 23,
+    }
+    RTAG_MAP = dict(TAG_VALUES.itervalues(), TAG_VALUES.iterkeys())
+
+    BITMASKS = [1, 2, 3, 4, 5, 21, 22, 23,]
+
+    def __init__(self, offset, label_offset, depth=0, class_offset=None):
+        self.offset, self.label_offset = offset, label_offset
+        self.depth, self.class_offset = depth, class_offset
+
+        self.length = 0
+        self.index = 0
+
+        self.parent_index = None
+        self.first_child_index = None
+        self.last_child_index = None
+
+    @property
+    def next_offset(self):
+        return self.offset + self.length
+
+    @property
+    def tag_nums(self):
+        for i in range(1, 5):
+            yield i
+        for attr in ('class_offset', 'parent_index', 'first_child_index',
+                'last_child_index'):
+            if getattr(self, attr) is not None:
+                yield self.TAG_VALUES[attr]
+
+    @property
+    def entry_type(self):
+        ans = 0
+        for tag in self.tag_nums:
+            ans |= (1 << self.BITMASKS[tag]) # 1 << x == 2**x
+        return ans
+
+    @property
+    def bytestring(self):
+        buf = StringIO()
+        buf.write(encode_number_as_hex(self.index))
+        et = self.entry_type
+        buf.write(bytes(bytearray([et])))
+
+        for tag in self.tag_nums:
+            attr = self.RTAG_MAP[tag]
+            val = getattr(self, attr)
+            buf.write(encint(val))
+
+        ans = buf.get_value()
+        return ans
+
+
 class Indexer(object):

    def __init__(self, serializer, number_of_text_records,
@ -112,18 +172,152 @@ class Indexer(object):
        self.cncx = CNCX(oeb.toc, opts)

        if self.is_periodical:
-            self.create_periodical_index()
+            indices = self.create_periodical_index()
+            indices
        else:
            raise NotImplementedError()

-    def create_periodical_index(self):
+    def create_periodical_index(self): # {{{
        periodical_node = iter(self.oeb.toc).next()
-        sections = tuple(periodical_node)
        periodical_node_offset = self.serializer.body_start_offset
        periodical_node_size = (self.serializer.body_end_offset -
                periodical_node_offset)
-        periodical_node_size
-        sections
+
+        normalized_sections = []
+
+        id_offsets = self.serializer.id_offsets
+
+        periodical = IndexEntry(periodical_node_offset,
+                self.cncx[periodical_node.title],
+                class_offset=self.cncx[periodical_node.klass])
+        periodical.length = periodical_node_size
+        periodical.first_child_index = 1
+
+        seen_sec_offsets = set()
+        seen_art_offsets = set()
+
+        for sec in periodical_node:
+            normalized_articles = []
+            try:
+                offset = id_offsets[sec.href]
+                label = self.cncx[sec.title]
+                klass = self.cncx[sec.klass]
+            except:
+                continue
+            if offset in seen_sec_offsets:
+                continue
+            seen_sec_offsets.add(offset)
+            section = IndexEntry(offset, label, class_offset=klass, depth=1)
+            section.parent_index = 0
+            for art in sec:
+                try:
+                    offset = id_offsets[art.href]
+                    label = self.cncx[art.title]
+                    klass = self.cncx[art.klass]
+                except:
+                    continue
+                if offset in seen_art_offsets:
+                    continue
+                seen_art_offsets.add(offset)
+                article = IndexEntry(offset, label, class_offset=klass,
+                        depth=2)
+                normalized_articles.append(article)
+            if normalized_articles:
+                normalized_articles.sort(key=lambda x:x.offset)
+                normalized_sections.append((section, normalized_articles))
+
+        normalized_sections.sort(key=lambda x:x[0].offset)
+
+        # Set lengths
+        for s, x in enumerate(normalized_sections):
+            sec, normalized_articles = x
+            try:
+                sec.length = normalized_sections[s+1].offset - sec.offset
+            except:
+                sec.length = self.serializer.body_end_offset - sec.offset
+            for i, art in enumerate(normalized_articles):
+                try:
+                    art.length = normalized_articles[i+1].offset - art.offset
+                except:
+                    art.length = sec.offset + sec.length - art.offset
+
+        # Filter
+        for i, x in list(enumerate(normalized_sections)):
+            sec, normalized_articles = x
+            normalized_articles = list(filter(lambda x: x.length > 0,
+                normalized_articles))
+            normalized_sections[i] = (sec, normalized_articles)
+
+        normalized_sections = list(filter(lambda x: x[0].size > 0 and x[1],
+            normalized_sections))
+
+        # Set indices
+        i = 0
+        for sec, normalized_articles in normalized_sections:
+            i += 1
+            sec.index = i
+
+        for sec, normalized_articles in normalized_sections:
+            for art in normalized_articles:
+                i += 1
+                art.index = i
+                art.parent_index = sec.index
+
+        for sec, normalized_articles in normalized_sections:
+            sec.first_child_index = normalized_articles[0].index
+            sec.last_child_index = normalized_articles[-1].index
+
+        # Set lengths again to close up any gaps left by filtering
+        for s, x in enumerate(normalized_sections):
+            sec, articles = x
+            try:
+                next_offset = normalized_sections[s+1].offset
+            except:
+                next_offset = self.serializer.body_end_offset
+            sec.length = next_offset - sec.offset
+
+            for a, art in enumerate(articles):
+                try:
+                    next_offset = articles[a+1].offset
+                except:
+                    next_offset = sec.next_offset
+                art.length = next_offset - art.offset
+
+        # Sanity check
+        for s, x in enumerate(normalized_sections):
+            sec, articles = x
+            try:
+                next_sec = normalized_sections[s+1]
+            except:
+                if (sec.length == 0 or sec.next_offset !=
+                        self.serializer.body_end_offset):
+                    raise ValueError('Invalid section layout')
+            else:
+                if next_sec.offset != sec.next_offset or sec.length == 0:
+                    raise ValueError('Invalid section layout')
+            for a, art in enumerate(articles):
+                try:
+                    next_art = articles[a+1]
+                except:
+                    if (art.length == 0 or art.next_offset !=
+                            sec.next_offset):
+                        raise ValueError('Invalid article layout')
+                else:
+                    if art.length == 0 or art.next_offset != next_art.offset:
+                        raise ValueError('Invalid article layout')
+
+        # Flatten
+        indices = [periodical]
+        for sec, articles in normalized_sections:
+            indices.append(sec)
+            periodical.last_child_index = sec.index
+
+        for sec, articles in normalized_sections:
+            for a in articles:
+                indices.append(a)
+
+        return indices
+    # }}}

    def create_header(self):
        buf = StringIO()