diff --git a/src/calibre/ebooks/mobi/debug.py b/src/calibre/ebooks/mobi/debug.py
index d861e69cbf..5b0ecf6f8e 100644
--- a/src/calibre/ebooks/mobi/debug.py
+++ b/src/calibre/ebooks/mobi/debug.py
@@ -406,7 +406,7 @@ class IndexHeader(object): # {{{
         self.unknown1 = raw[8:16]
         self.index_type, = struct.unpack('>I', raw[16:20])
         self.index_type_desc = {0: 'normal', 2:
-                'inflection'}.get(self.index_type, 'unknown')
+                'inflection', 6: 'calibre'}.get(self.index_type, 'unknown')
         self.idxt_start, = struct.unpack('>I', raw[20:24])
         self.index_count, = struct.unpack('>I', raw[24:28])
         self.index_encoding_num, = struct.unpack('>I', raw[28:32])
@@ -596,10 +596,11 @@ class IndexEntry(object): # {{{
            0x3f : 'article',
    }

-    def __init__(self, ident, entry_type, raw, cncx, tagx_entries):
+    def __init__(self, ident, entry_type, raw, cncx, tagx_entries, flags=0):
        self.index = ident
        self.raw = raw
        self.tags = []
+       self.entry_type_raw = entry_type

        try:
            self.entry_type = self.TYPES[entry_type]
@@ -619,6 +620,24 @@
                vals.append(val)
            self.tags.append(Tag(tag, vals, self.entry_type, cncx))

+       if flags & 0b10:
+           # Look for optional description and author
+           desc_tag = [t for t in tagx_entries if t.tag == 22]
+           if desc_tag and raw:
+               val, consumed = decint(raw)
+               raw = raw[consumed:]
+               if val:
+                   self.tags.append(Tag(desc_tag[0], [val], self.entry_type,
+                       cncx))
+       if flags & 0b100:
+           aut_tag = [t for t in tagx_entries if t.tag == 23]
+           if aut_tag and raw:
+               val, consumed = decint(raw)
+               raw = raw[consumed:]
+               if val:
+                   self.tags.append(Tag(aut_tag[0], [val], self.entry_type,
+                       cncx))
+
    @property
    def label(self):
        for tag in self.tags:
@@ -669,8 +688,8 @@
        return -1

    def __str__(self):
-       ans = ['Index Entry(index=%s, entry_type=%s, length=%d)'%(
-           self.index, self.entry_type, len(self.tags))]
+       ans = ['Index Entry(index=%s, entry_type=%s (%s), length=%d)'%(
+           self.index, self.entry_type, bin(self.entry_type_raw)[2:], len(self.tags))]
        for tag in self.tags:
            ans.append('\t'+str(tag))
        if self.first_child_index != -1:
@@ -723,8 +742,13 @@ class IndexRecord(object): # {{{
                next_off = len(indxt)
            index, consumed = decode_hex_number(indxt[off:])
            entry_type = ord(indxt[off+consumed])
+           d = 1
+           if index_header.index_type == 6:
+               flags = ord(indxt[off+consumed+d])
+               d += 1
            self.indices.append(IndexEntry(index, entry_type,
-               indxt[off+consumed+1:next_off], cncx, index_header.tagx_entries))
+               indxt[off+consumed+d:next_off], cncx,
+               index_header.tagx_entries, flags=flags))
            index = self.indices[-1]

    def get_parent(self, index):
diff --git a/src/calibre/ebooks/mobi/writer2/indexer.py b/src/calibre/ebooks/mobi/writer2/indexer.py
index e586c446b4..2296a40d14 100644
--- a/src/calibre/ebooks/mobi/writer2/indexer.py
+++ b/src/calibre/ebooks/mobi/writer2/indexer.py
@@ -2,6 +2,7 @@
 # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
 from __future__ import (unicode_literals, division, absolute_import,
                         print_function)
+from future_builtins import filter

 __license__ = 'GPL v3'
 __copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
@@ -13,7 +14,7 @@ from collections import OrderedDict

 from calibre.ebooks import normalize
 from calibre.ebooks.mobi.writer2 import RECORD_SIZE
-from calibre.ebooks.mobi.utils import encint
+from calibre.ebooks.mobi.utils import (encint, encode_number_as_hex)

 def utf8_text(text):
     '''
@@ -56,10 +57,6 @@ class CNCX(object): # {{{
            self.strings[item.title] = 0
            if opts.mobi_periodical:
                self.strings[item.klass] = 0
-               if item.description:
-                   self.strings[item.description] = 0
-               if item.author:
-                   self.strings[item.author] = 0

        self.records = []
        offset = 0
@@ -88,6 +85,69 @@ class CNCX(object): # {{{
        return self.strings[string]
    # }}}

+class IndexEntry(object):
+
+    TAG_VALUES = {
+            'offset': 1,
+            'size': 2,
+            'label_offset': 3,
+            'depth': 4,
+            'class_offset': 5,
+            'parent_index': 21,
+            'first_child_index': 22,
+            'last_child_index': 23,
+    }
+    RTAG_MAP = dict(zip(TAG_VALUES.itervalues(), TAG_VALUES.iterkeys()))
+
+    BITMASKS = [1, 2, 3, 4, 5, 21, 22, 23,]
+
+    def __init__(self, offset, label_offset, depth=0, class_offset=None):
+        self.offset, self.label_offset = offset, label_offset
+        self.depth, self.class_offset = depth, class_offset
+
+        self.length = 0
+        self.index = 0
+
+        self.parent_index = None
+        self.first_child_index = None
+        self.last_child_index = None
+
+    @property
+    def next_offset(self):
+        return self.offset + self.length
+
+    @property
+    def tag_nums(self):
+        for i in range(1, 5):
+            yield i
+        for attr in ('class_offset', 'parent_index', 'first_child_index',
+                'last_child_index'):
+            if getattr(self, attr) is not None:
+                yield self.TAG_VALUES[attr]
+
+    @property
+    def entry_type(self):
+        ans = 0
+        for tag in self.tag_nums:
+            ans |= (1 << self.BITMASKS.index(tag)) # 1 << x == 2**x
+        return ans
+
+    @property
+    def bytestring(self):
+        buf = StringIO()
+        buf.write(encode_number_as_hex(self.index))
+        et = self.entry_type
+        buf.write(bytes(bytearray([et])))
+
+        for tag in self.tag_nums:
+            attr = self.RTAG_MAP[tag]
+            val = getattr(self, attr)
+            buf.write(encint(val))
+
+        ans = buf.getvalue()
+        return ans
+
+
 class Indexer(object):

     def __init__(self, serializer, number_of_text_records,
@@ -112,18 +172,152 @@ class Indexer(object):
        self.cncx = CNCX(oeb.toc, opts)

        if self.is_periodical:
-           self.create_periodical_index()
+           indices = self.create_periodical_index()
+           indices
        else:
            raise NotImplementedError()

-   def create_periodical_index(self):
+   def create_periodical_index(self): # {{{
        periodical_node = iter(self.oeb.toc).next()
-       sections = tuple(periodical_node)
        periodical_node_offset = self.serializer.body_start_offset
        periodical_node_size = (self.serializer.body_end_offset -
                periodical_node_offset)
-       periodical_node_size
-       sections
+
+       normalized_sections = []
+
+       id_offsets = self.serializer.id_offsets
+
+       periodical = IndexEntry(periodical_node_offset,
+               self.cncx[periodical_node.title],
+               class_offset=self.cncx[periodical_node.klass])
+       periodical.length = periodical_node_size
+       periodical.first_child_index = 1
+
+       seen_sec_offsets = set()
+       seen_art_offsets = set()
+
+       for sec in periodical_node:
+           normalized_articles = []
+           try:
+               offset = id_offsets[sec.href]
+               label = self.cncx[sec.title]
+               klass = self.cncx[sec.klass]
+           except:
+               continue
+           if offset in seen_sec_offsets:
+               continue
+           seen_sec_offsets.add(offset)
+           section = IndexEntry(offset, label, class_offset=klass, depth=1)
+           section.parent_index = 0
+           for art in sec:
+               try:
+                   offset = id_offsets[art.href]
+                   label = self.cncx[art.title]
+                   klass = self.cncx[art.klass]
+               except:
+                   continue
+               if offset in seen_art_offsets:
+                   continue
+               seen_art_offsets.add(offset)
+               article = IndexEntry(offset, label, class_offset=klass,
+                       depth=2)
+               normalized_articles.append(article)
+           if normalized_articles:
+               normalized_articles.sort(key=lambda x:x.offset)
+               normalized_sections.append((section, normalized_articles))
+
+       normalized_sections.sort(key=lambda x:x[0].offset)
+
+       # Set lengths
+       for s, x in enumerate(normalized_sections):
+           sec, normalized_articles = x
+           try:
+               sec.length = normalized_sections[s+1][0].offset - sec.offset
+           except:
+               sec.length = self.serializer.body_end_offset - sec.offset
+           for i, art in enumerate(normalized_articles):
+               try:
+                   art.length = normalized_articles[i+1].offset - art.offset
+               except:
+                   art.length = sec.offset + sec.length - art.offset
+
+       # Filter
+       for i, x in list(enumerate(normalized_sections)):
+           sec, normalized_articles = x
+           normalized_articles = list(filter(lambda x: x.length > 0,
+               normalized_articles))
+           normalized_sections[i] = (sec, normalized_articles)
+
+       normalized_sections = list(filter(lambda x: x[0].length > 0 and x[1],
+           normalized_sections))
+
+       # Set indices
+       i = 0
+       for sec, normalized_articles in normalized_sections:
+           i += 1
+           sec.index = i
+
+       for sec, normalized_articles in normalized_sections:
+           for art in normalized_articles:
+               i += 1
+               art.index = i
+               art.parent_index = sec.index
+
+       for sec, normalized_articles in normalized_sections:
+           sec.first_child_index = normalized_articles[0].index
+           sec.last_child_index = normalized_articles[-1].index
+
+       # Set lengths again to close up any gaps left by filtering
+       for s, x in enumerate(normalized_sections):
+           sec, articles = x
+           try:
+               next_offset = normalized_sections[s+1][0].offset
+           except:
+               next_offset = self.serializer.body_end_offset
+           sec.length = next_offset - sec.offset
+
+           for a, art in enumerate(articles):
+               try:
+                   next_offset = articles[a+1].offset
+               except:
+                   next_offset = sec.next_offset
+               art.length = next_offset - art.offset
+
+       # Sanity check
+       for s, x in enumerate(normalized_sections):
+           sec, articles = x
+           try:
+               next_sec = normalized_sections[s+1][0]
+           except:
+               if (sec.length == 0 or sec.next_offset !=
+                       self.serializer.body_end_offset):
+                   raise ValueError('Invalid section layout')
+           else:
+               if next_sec.offset != sec.next_offset or sec.length == 0:
+                   raise ValueError('Invalid section layout')
+           for a, art in enumerate(articles):
+               try:
+                   next_art = articles[a+1]
+               except:
+                   if (art.length == 0 or art.next_offset !=
+                           sec.next_offset):
+                       raise ValueError('Invalid article layout')
+               else:
+                   if art.length == 0 or art.next_offset != next_art.offset:
+                       raise ValueError('Invalid article layout')
+
+       # Flatten
+       indices = [periodical]
+       for sec, articles in normalized_sections:
+           indices.append(sec)
+           periodical.last_child_index = sec.index
+
+       for sec, articles in normalized_sections:
+           for a in articles:
+               indices.append(a)
+
+       return indices
+   # }}}

    def create_header(self):
        buf = StringIO()
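
Note on the entry_type byte written by the new IndexEntry.bytestring property in
indexer.py: each TAGX tag present on an entry sets the bit whose position is
that tag's slot in the TAGX table (the BITMASKS list), which is why an entry
carrying offset, size, label_offset, depth, class_offset and parent_index comes
out as 0x3f, the value the TYPES table in debug.py labels 'article'. A minimal
standalone sketch of that packing, assuming the same slot order used in the
diff:

    # Assumed TAGX slot order, mirroring BITMASKS in the diff above.
    TAGX_SLOTS = [1, 2, 3, 4, 5, 21, 22, 23]

    def entry_type(tag_nums):
        # Set bit i for every tag that occupies slot i of the TAGX table.
        ans = 0
        for tag in tag_nums:
            ans |= 1 << TAGX_SLOTS.index(tag)
        return ans

    # offset, size, label_offset, depth (tags 1-4) -> 0x0f; adding
    # class_offset (5) and parent_index (21) -> 0x3f ('article' in debug.py).
    assert entry_type([1, 2, 3, 4]) == 0x0f
    assert entry_type([1, 2, 3, 4, 5, 21]) == 0x3f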
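
On the reading side (debug.py), when index_type is 6 ('calibre') each INDX
entry carries an extra flags byte after the entry type; bit 0b10 signals that
an optional description value follows and bit 0b100 an optional author value,
each read with decint(). The sketch below is a hypothetical stand-in for decint
(the real helper lives in calibre.ebooks.mobi.utils and its exact signature is
not shown in this diff), assuming the forward-read MOBI variable-width integer
encoding: seven data bits per byte, most significant group first, with the high
bit set on the final byte.

    def decode_varint(raw):
        # Hypothetical stand-in for decint(): decode a forward-read MOBI
        # variable-width integer and return (value, bytes_consumed).
        val = 0
        for i, byte in enumerate(bytearray(raw)):
            val = (val << 7) | (byte & 0x7f)
            if byte & 0x80:  # stop bit marks the final byte
                return val, i + 1
        raise ValueError('Buffer ended before the stop bit was seen')

    # b'\x8b' is 0x0b plus the stop bit -> value 11, one byte consumed
    assert decode_varint(b'\x8b') == (11, 1)
    # b'\x01\x94' -> (1 << 7) | 0x14 = 148, two bytes consumed
    assert decode_varint(b'\x01\x94\xff') == (148, 2)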