From 9ab4ff1840a7b3735a6e94e4c1465295285bfc4f Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 21 Apr 2012 11:15:31 +0530 Subject: [PATCH] A nice framework for generating MOBI header records --- .../ebooks/conversion/plugins/mobi_output.py | 2 +- src/calibre/ebooks/mobi/debug/index.py | 4 +- src/calibre/ebooks/mobi/debug/mobi8.py | 2 +- src/calibre/ebooks/mobi/utils.py | 7 +- src/calibre/ebooks/mobi/writer8/header.py | 77 +++++++++++ src/calibre/ebooks/mobi/writer8/index.py | 125 +++++++++++++++++- 6 files changed, 206 insertions(+), 11 deletions(-) create mode 100644 src/calibre/ebooks/mobi/writer8/header.py diff --git a/src/calibre/ebooks/conversion/plugins/mobi_output.py b/src/calibre/ebooks/conversion/plugins/mobi_output.py index 89ab91f8eb..971d11df3b 100644 --- a/src/calibre/ebooks/conversion/plugins/mobi_output.py +++ b/src/calibre/ebooks/conversion/plugins/mobi_output.py @@ -169,6 +169,7 @@ class MOBIOutput(OutputFormatPlugin): self.remove_html_cover() resources = Resources(oeb, opts, self.is_periodical, add_fonts=create_kf8) + self.check_for_periodical() kf8 = self.create_kf8(resources) if create_kf8 else None @@ -203,7 +204,6 @@ class MOBIOutput(OutputFormatPlugin): resources.add_extra_images() mobimlizer = MobiMLizer(ignore_tables=opts.linearize_tables) mobimlizer(oeb, opts) - self.check_for_periodical() write_page_breaks_after_item = input_plugin is not plugin_for_input_format('cbz') from calibre.ebooks.mobi.writer2.main import MobiWriter writer = MobiWriter(opts, resources, kf8, diff --git a/src/calibre/ebooks/mobi/debug/index.py b/src/calibre/ebooks/mobi/debug/index.py index 1af1611918..94f252e231 100644 --- a/src/calibre/ebooks/mobi/debug/index.py +++ b/src/calibre/ebooks/mobi/debug/index.py @@ -17,7 +17,7 @@ from calibre.ebooks.mobi.reader.ncx import (tag_fieldname_map, default_entry) File = namedtuple('File', 'file_number name divtbl_count start_position length') -Elem = namedtuple('Elem', +Elem = namedtuple('Chunk', 'insert_pos toc_text file_number sequence_number start_pos ' 'length') @@ -110,7 +110,7 @@ class SECTIndex(Index): for i, text in enumerate(self.table.iterkeys()): tag_map = self.table[text] if set(tag_map.iterkeys()) != {2, 3, 4, 6}: - raise ValueError('SECT Index has unknown tags: %s'% + raise ValueError('Chunk Index has unknown tags: %s'% (set(tag_map.iterkeys())-{2, 3, 4, 6})) toc_text = self.cncx[tag_map[2][0]] diff --git a/src/calibre/ebooks/mobi/debug/mobi8.py b/src/calibre/ebooks/mobi/debug/mobi8.py index 1c61690d42..e3e26af0b1 100644 --- a/src/calibre/ebooks/mobi/debug/mobi8.py +++ b/src/calibre/ebooks/mobi/debug/mobi8.py @@ -198,7 +198,7 @@ def inspect_mobi(mobi_file, ddir): with open(os.path.join(ddir, 'skel.record'), 'wb') as fo: fo.write(str(f.skel_index).encode('utf-8')) - with open(os.path.join(ddir, 'sect.record'), 'wb') as fo: + with open(os.path.join(ddir, 'chunks.record'), 'wb') as fo: fo.write(str(f.sect_index).encode('utf-8')) with open(os.path.join(ddir, 'ncx.record'), 'wb') as fo: diff --git a/src/calibre/ebooks/mobi/utils.py b/src/calibre/ebooks/mobi/utils.py index 319af30f86..aa59ee2217 100644 --- a/src/calibre/ebooks/mobi/utils.py +++ b/src/calibre/ebooks/mobi/utils.py @@ -583,7 +583,9 @@ class CNCX(object): # {{{ self.strings[key] = offset offset += len(raw) - self.records.append(align_block(buf.getvalue())) + val = buf.getvalue() + if val: + self.records.append(align_block(val)) def __getitem__(self, string): return self.strings[string] @@ -592,6 +594,9 @@ class CNCX(object): # {{{ return bool(self.records) __nonzero__ = __bool__ + def __len__(self): + return len(self.records) + # }}} diff --git a/src/calibre/ebooks/mobi/writer8/header.py b/src/calibre/ebooks/mobi/writer8/header.py new file mode 100644 index 0000000000..31571d0f5f --- /dev/null +++ b/src/calibre/ebooks/mobi/writer8/header.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2012, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +from io import BytesIO +from collections import OrderedDict +from struct import pack + +from calibre.ebooks.mobi.utils import align_block + +NULL = 0xffffffff +zeroes = lambda x: b'\0'*x +nulls = lambda x: b'\xff'*x + +class Header(OrderedDict): + + HEADER_NAME = b'' + + DEFINITION = ''' + ''' + + ALIGN_BLOCK = False + POSITIONS = {} + + def __init__(self): + OrderedDict.__init__(self) + + for line in self.DEFINITION.splitlines(): + line = line.strip() + if not line or line.startswith('#'): continue + name, val = [x.strip() for x in line.partition('=')[0::2]] + if val: + val = eval(val, {'zeroes':zeroes, 'NULL':NULL, 'DYN':None, + 'nulls':nulls}) + else: + val = 0 + if name in self: + raise ValueError('Duplicate field in definition: %r'%name) + self[name] = val + + def __call__(self, **kwargs): + positions = {} + for name, val in kwargs.iteritems(): + if name not in self: + raise KeyError('Not a valid header field: %r'%name) + self[name] = val + + buf = BytesIO() + buf.write(bytes(self.HEADER_NAME)) + for name, val in self.iteritems(): + val = self.format_value(name, val) + positions[name] = buf.tell() + if val is None: + raise ValueError('Dynamic field %r not set'%name) + if isinstance(val, (int, long)): + val = pack(b'>I', val) + buf.write(val) + + for pos_field, field in self.POSITIONS.iteritems(): + buf.seek(positions[pos_field]) + buf.write(pack(b'>I', positions[field])) + + ans = buf.getvalue() + if self.ALIGN_BLOCK: + ans = align_block(ans) + return ans + + + def format_value(self, name, val): + return val + + diff --git a/src/calibre/ebooks/mobi/writer8/index.py b/src/calibre/ebooks/mobi/writer8/index.py index 1ee20857fb..153e140b06 100644 --- a/src/calibre/ebooks/mobi/writer8/index.py +++ b/src/calibre/ebooks/mobi/writer8/index.py @@ -12,7 +12,8 @@ from collections import namedtuple from struct import pack from io import BytesIO -from calibre.ebooks.mobi.utils import CNCX, encint +from calibre.ebooks.mobi.utils import CNCX, encint, align_block +from calibre.ebooks.mobi.writer8.header import Header TagMeta = namedtuple('TagMeta', 'name number values_per_entry bitmask end_flag') @@ -23,13 +24,79 @@ EndTagTable = TagMeta('eof', 0, 0, 0, 1) mask_to_bit_shifts = { 1:0, 2:1, 3:0, 4:2, 8:3, 12:2, 16:4, 32:5, 48:4, 64:6, 128:7, 192: 6 } +class IndexHeader(Header): # {{{ -class Index(object): + HEADER_NAME = b'INDX' + ALIGN_BLOCK = True + HEADER_LENGTH = 192 + + DEFINITION = ''' + # 4 - 8: Header Length + header_length = {header_length} + + # 8 - 16: Unknown + unknown1 = zeroes(8) + + # 16 - 20: Index type: 0 - normal 2 - inflection + type = 2 + + # 20 - 24: IDXT offset (filled in later) + idxt_offset + + # 24 - 28: Number of index records + num_of_records = 1 + + # 28 - 32: Index encoding (65001 = utf-8) + encoding = 65001 + + # 32 - 36: Unknown + unknown2 = NULL + + # 36 - 40: Number of Index entries + num_of_entries = DYN + + # 40 - 44: ORDT offset + ordt_offset + + # 44 - 48: LIGT offset + ligt_offset + + # 48 - 52: Number of ORDT/LIGT? entries + num_of_ordt_entries + + # 52 - 56: Number of CNCX records + num_of_cncx = DYN + + # 56 - 180: Unknown + unknown3 = zeroes(124) + + # 180 - 184: TAGX offset + tagx_offset = {header_length} + + # 184 - 192: Unknown + unknown4 = zeroes(8) + + # TAGX + tagx = DYN + + # Last Index entry + last_index = DYN + + # IDXT + idxt = DYN + '''.format(header_length=HEADER_LENGTH) + + POSITIONS = {'idxt_offset':'idxt'} +# }}} + +class Index(object): # {{{ control_byte_count = 1 cncx = CNCX() tag_types = (EndTagTable,) + HEADER_LENGTH = IndexHeader.HEADER_LENGTH + @classmethod def generate_tagx(cls): header = b'TAGX' @@ -60,17 +127,18 @@ class Index(object): control_bytes.append(cbs) return control_bytes - def build_records(self): + def __call__(self): self.control_bytes = self.calculate_control_bytes_for_each_entry( self.entries) - self.rendered_entries = [] + rendered_entries = [] offset = 0 + index, idxt, buf = BytesIO(), BytesIO(), BytesIO() IndexEntry = namedtuple('IndexEntry', 'offset length raw') for i, x in enumerate(self.entries): control_bytes = self.control_bytes[i] leading_text, tags = x - buf = BytesIO() + buf.truncate(0) raw = bytearray(leading_text) raw.insert(0, len(leading_text)) buf.write(bytes(raw)) @@ -81,8 +149,53 @@ class Index(object): for val in values: buf.write(encint(val)) raw = buf.getvalue() - self.rendered_entries.append(IndexEntry(offset, len(raw), raw)) + rendered_entries.append(IndexEntry(offset, len(raw), raw)) + idxt.write(pack(b'>H', self.HEADER_LENGTH+offset)) offset += len(raw) + index.write(raw) + + index_block = align_block(index.getvalue()) + idxt_block = align_block(b'IDXT' + idxt.getvalue()) + body = index_block + idxt_block + if len(body) + self.HEADER_LENGTH >= 0x10000: + raise ValueError('Index has too many entries, calibre does not' + ' support generating multiple index records at this' + ' time.') + + header = b'INDX' + buf.truncate(0) + buf.write(pack(b'>I', self.HEADER_LENGTH)) + buf.write(b'\0'*4) # Unknown + buf.write(pack(b'>I', 1)) # Header type? Or index record number? + buf.write(b'\0'*4) # Unknown + + # IDXT block offset + buf.write(pack(b'>I', self.HEADER_LENGTH + len(index_block))) + + # Number of index entries + buf.write(pack(b'>I', len(rendered_entries))) + + buf.write(b'\xff'*8) # Unknown + + buf.write(b'\0'*156) # Unknown + + header += buf.getvalue() + index_record = header + body + + tagx = self.generate_tagx() + idxt = (b'IDXT' + pack(b'>H', IndexHeader.HEADER_LENGTH + len(tagx)) + + b'\0') + header = { + 'num_of_entries': len(rendered_entries), + 'num_of_cncx': len(self.cncx), + 'tagx':tagx, + 'idxt':idxt + } + header = IndexHeader()(**header) + self.records = [header, index_record] + self.records.extend(self.cncx.records) + return self.records +# }}} class SkelIndex(Index):