From 60f1f24e6695a8bd3f30da388038d03a724a15c1 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 22 Jul 2011 18:48:48 -0600 Subject: [PATCH] Start work on new MOBI indexing implementation --- src/calibre/ebooks/mobi/output.py | 26 --- src/calibre/ebooks/mobi/utils.py | 20 +++ src/calibre/ebooks/mobi/writer2/__init__.py | 1 + src/calibre/ebooks/mobi/writer2/indexer.py | 116 ++++++++++++ src/calibre/ebooks/mobi/writer2/main.py | 166 ++++++++++-------- src/calibre/ebooks/mobi/writer2/serializer.py | 2 +- 6 files changed, 231 insertions(+), 100 deletions(-) create mode 100644 src/calibre/ebooks/mobi/writer2/indexer.py diff --git a/src/calibre/ebooks/mobi/output.py b/src/calibre/ebooks/mobi/output.py index 982ee74a2b..669d41fa8f 100644 --- a/src/calibre/ebooks/mobi/output.py +++ b/src/calibre/ebooks/mobi/output.py @@ -82,26 +82,6 @@ class MOBIOutput(OutputFormatPlugin): else: self.oeb.log.debug('Using mastheadImage supplied in manifest...') - - def dump_toc(self, toc) : - self.log( "\n >>> TOC contents <<<") - self.log( " toc.title: %s" % toc.title) - self.log( " toc.href: %s" % toc.href) - for periodical in toc.nodes : - self.log( "\tperiodical title: %s" % periodical.title) - self.log( "\t href: %s" % periodical.href) - for section in periodical : - self.log( "\t\tsection title: %s" % section.title) - self.log( "\t\tfirst article: %s" % section.href) - for article in section : - self.log( "\t\t\tarticle title: %s" % repr(article.title)) - self.log( "\t\t\t href: %s" % article.href) - - def dump_manifest(self) : - self.log( "\n >>> Manifest entries <<<") - for href in self.oeb.manifest.hrefs : - self.log ("\t%s" % href) - def periodicalize_toc(self): from calibre.ebooks.oeb.base import TOC toc = self.oeb.toc @@ -156,12 +136,6 @@ class MOBIOutput(OutputFormatPlugin): # Fix up the periodical href to point to first section href toc.nodes[0].href = toc.nodes[0].nodes[0].href - # diagnostics - if self.opts.verbose > 3: - self.dump_toc(toc) - self.dump_manifest() - - def convert(self, oeb, output_path, input_plugin, opts, log): self.log, self.opts, self.oeb = log, opts, oeb from calibre.ebooks.mobi.mobiml import MobiMLizer diff --git a/src/calibre/ebooks/mobi/utils.py b/src/calibre/ebooks/mobi/utils.py index 5192eee43c..cf03c613f4 100644 --- a/src/calibre/ebooks/mobi/utils.py +++ b/src/calibre/ebooks/mobi/utils.py @@ -177,3 +177,23 @@ def get_trailing_data(record, extra_data_flags): record = record[:-sz] return data, record +def encode_trailing_data(raw): + ''' + Given some data in the bytestring raw, return a bytestring of the form + + + + where size is a backwards encoded vwi whose value is the length of the + entire return bytestring. + + This is the encoding used for trailing data entries at the end of text + records. See get_trailing_data() for details. + ''' + lsize = 1 + while True: + encoded = encint(len(raw) + lsize, forward=False) + if len(encoded) == lsize: + break + lsize += 1 + return raw + encoded + diff --git a/src/calibre/ebooks/mobi/writer2/__init__.py b/src/calibre/ebooks/mobi/writer2/__init__.py index df3dcefb94..bc8dbbf7de 100644 --- a/src/calibre/ebooks/mobi/writer2/__init__.py +++ b/src/calibre/ebooks/mobi/writer2/__init__.py @@ -12,4 +12,5 @@ UNCOMPRESSED = 1 PALMDOC = 2 HUFFDIC = 17480 PALM_MAX_IMAGE_SIZE = 63 * 1024 +RECORD_SIZE = 0x1000 # 4096 (Text record size (uncompressed)) diff --git a/src/calibre/ebooks/mobi/writer2/indexer.py b/src/calibre/ebooks/mobi/writer2/indexer.py new file mode 100644 index 0000000000..c28b91e63a --- /dev/null +++ b/src/calibre/ebooks/mobi/writer2/indexer.py @@ -0,0 +1,116 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2011, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +from struct import pack +from cStringIO import StringIO +from collections import OrderedDict + +from calibre.ebooks import normalize +from calibre.ebooks.mobi.utils import encint + +def utf8_text(text): + ''' + Convert a possibly null string to utf-8 bytes, guaranteeing to return a non + empty, normalized bytestring. + ''' + if text and text.strip(): + text = text.strip() + if not isinstance(text, unicode): + text = text.decode('utf-8', 'replace') + text = normalize(text).encode('utf-8') + else: + text = _('Unknown').encode('utf-8') + return text + +def align_block(raw, multiple=4, pad=b'\0'): + ''' + Return raw with enough pad bytes append to ensure its length is a multiple + of 4. + ''' + extra = len(raw) % multiple + if extra == 0: return raw + return raw + pad*(multiple - extra) + + +class CNCX(object): # {{{ + + ''' + Create the CNCX records. These are records containing all the strings from + the NCX. Each record is of the form: + ''' + + MAX_STRING_LENGTH = 500 + + def __init__(self, toc, opts): + self.strings = OrderedDict() + + for item in toc: + if item is self.toc: continue + label = item.title + klass = item.klass + if opts.mobi_periodical: + if item.description: + self.strings[item.description] = 0 + if item.author: + self.string[item.author] = 0 + self.strings[label] = self.strings[klass] = 0 + + self.records = [] + + offset = 0 + buf = StringIO() + for key in tuple(self.strings.iterkeys()): + utf8 = utf8_text(key[:self.MAX_STRING_LENGTH]) + l = len(utf8) + sz_bytes = encint(l) + raw = sz_bytes + utf8 + if 0xfbf8 - buf.tell() < 6 + len(raw): + # Records in PDB files cannot be larger than 0x10000, so we + # stop well before that. + pad = 0xfbf8 - self._ctoc.tell() + buf.write(b'\0' * pad) + self.records.append(buf.getvalue()) + buf.truncate(0) + offset = len(self.records) * 0x10000 + + self.strings[key] = offset + offset += len(raw) + + buf.write(b'\0') # CNCX must end with zero byte + self.records.append(align_block(buf.getvalue())) + + def __getitem__(self, string): + return self.strings[string] +# }}} + +class Indexer(object): + + def __init__(self, serializer, number_of_text_records, opts, oeb): + self.serializer = serializer + self.number_of_text_records = number_of_text_records + self.oeb = oeb + self.log = oeb.log + self.opts = opts + + self.cncx = CNCX(oeb.toc, opts) + + self.records = [] + + def create_header(self): + buf = StringIO() + + # Ident + buf.write(b'INDX') + + # Header length + buf.write(pack(b'>I', 192)) + + # Index type: 0 - normal, 2 - inflection + buf.write(pack(b'>I', 2)) diff --git a/src/calibre/ebooks/mobi/writer2/main.py b/src/calibre/ebooks/mobi/writer2/main.py index 2e9d31458a..088326a876 100644 --- a/src/calibre/ebooks/mobi/writer2/main.py +++ b/src/calibre/ebooks/mobi/writer2/main.py @@ -17,8 +17,9 @@ from calibre.ebooks.mobi.writer2.serializer import Serializer from calibre.ebooks.compression.palmdoc import compress_doc from calibre.ebooks.mobi.langcodes import iana2mobi from calibre.utils.filenames import ascii_filename -from calibre.ebooks.mobi.writer2 import PALMDOC, UNCOMPRESSED -from calibre.ebooks.mobi.utils import (rescale_image, encint) +from calibre.ebooks.mobi.writer2 import (PALMDOC, UNCOMPRESSED, RECORD_SIZE) +from calibre.ebooks.mobi.utils import (rescale_image, encint, + encode_trailing_data) EXTH_CODES = { 'creator': 100, @@ -39,9 +40,6 @@ EXTH_CODES = { # Disabled as I dont care about uncrossable breaks WRITE_UNCROSSABLE_BREAKS = False -RECORD_SIZE = 0x1000 # 4096 - - MAX_THUMB_SIZE = 16 * 1024 MAX_THUMB_DIMEN = (180, 240) @@ -53,6 +51,7 @@ class MobiWriter(object): self.write_page_breaks_after_item = write_page_breaks_after_item self.compression = UNCOMPRESSED if opts.dont_compress else PALMDOC self.prefer_author_sort = opts.prefer_author_sort + self.last_text_record_idx = 1 def __call__(self, oeb, path_or_stream): if hasattr(path_or_stream, 'write'): @@ -79,9 +78,44 @@ class MobiWriter(object): def generate_content(self): self.map_image_names() self.generate_text() - # Image records come after text records + # Index records come after text records + self.generate_index() + self.write_uncrossable_breaks() + # Image records come after index records self.generate_images() + # Indexing {{{ + def generate_index(self): + self.primary_index_record_idx = None + # }}} + + def write_uncrossable_breaks(self): # {{{ + ''' + Write information about uncrossable breaks (non linear items in + the spine. + ''' + if not WRITE_UNCROSSABLE_BREAKS: + return + + breaks = self.serializer.breaks + + for i in xrange(1, self.last_text_record_idx+1): + offset = i * RECORD_SIZE + pbreak = 0 + running = offset + + buf = StringIO() + + while breaks and (breaks[0] - offset) < RECORD_SIZE: + pbreak = (breaks.pop(0) - running) >> 3 + encoded = encint(pbreak) + buf.write(encoded) + running += pbreak << 3 + encoded = encode_trailing_data(buf.getvalue()) + self.records[i] += encoded + # }}} + + # Images {{{ def map_image_names(self): ''' Map image names to record indices, ensuring that the masthead image if @@ -120,23 +154,38 @@ class MobiWriter(object): if self.first_image_record is None: self.first_image_record = len(self.records) - 1 + def add_thumbnail(self, item): + try: + data = rescale_image(item.data, dimen=MAX_THUMB_DIMEN, + maxsizeb=MAX_THUMB_SIZE) + except IOError: + self.oeb.logger.warn('Bad image file %r' % item.href) + return None + manifest = self.oeb.manifest + id, href = manifest.generate('thumbnail', 'thumbnail.jpeg') + manifest.add(id, href, 'image/jpeg', data=data) + index = len(self.images) + 1 + self.images[href] = index + self.records.append(data) + return index + + # }}} + + # Text {{{ + def generate_text(self): self.oeb.logger.info('Serializing markup content...') - serializer = Serializer(self.oeb, self.images, + self.serializer = Serializer(self.oeb, self.images, write_page_breaks_after_item=self.write_page_breaks_after_item) - text = serializer() - breaks = serializer.breaks - self.anchor_offset_kindle = serializer.anchor_offset_kindle - self.id_offsets = serializer.id_offsets + text = self.serializer() self.content_length = len(text) self.text_length = len(text) text = StringIO(text) - buf = [] nrecords = 0 - offset = 0 if self.compression != UNCOMPRESSED: self.oeb.logger.info(' Compressing markup content...') + data, overlap = self.read_text_record(text) while len(data) > 0: @@ -146,39 +195,15 @@ class MobiWriter(object): record.write(data) self.records.append(record.getvalue()) - buf.append(self.records[-1]) nrecords += 1 - offset += RECORD_SIZE data, overlap = self.read_text_record(text) # Write information about the mutibyte character overlap, if any record.write(overlap) record.write(pack(b'>B', len(overlap))) - # Write information about uncrossable breaks (non linear items in - # the spine) - if WRITE_UNCROSSABLE_BREAKS: - nextra = 0 - pbreak = 0 - running = offset - # Write information about every uncrossable break that occurs in - # the next record. - while breaks and (breaks[0] - offset) < RECORD_SIZE: - pbreak = (breaks.pop(0) - running) >> 3 - encoded = encint(pbreak) - record.write(encoded) - running += pbreak << 3 - nextra += len(encoded) - lsize = 1 - while True: - size = encint(nextra + lsize, forward=False) - if len(size) == lsize: - break - lsize += 1 - record.write(size) - - self.text_nrecords = nrecords + 1 + self.last_text_record_idx = nrecords def read_text_record(self, text): ''' @@ -230,25 +255,31 @@ class MobiWriter(object): return data, overlap - def generate_end_records(self): - self.flis_number = len(self.records) - self.records.append('\xE9\x8E\x0D\x0A') + # }}} - def generate_record0(self): # {{{ + def generate_record0(self): # MOBI header {{{ metadata = self.oeb.metadata exth = self.build_exth() last_content_record = len(self.records) - 1 + # EOF record + self.records.append('\xE9\x8E\x0D\x0A') + self.generate_end_records() record0 = StringIO() - # The PalmDOC Header - record0.write(pack(b'>HHIHHHH', self.compression, 0, - self.text_length, - self.text_nrecords-1, RECORD_SIZE, 0, 0)) # 0 - 15 (0x0 - 0xf) + # The MOBI Header + record0.write(pack(b'>HHIHHHH', + self.compression, # compression type # compression type + 0, # Unused + self.text_length, # Text length + self.last_text_record_idx, # Number of text records or last tr idx + RECORD_SIZE, # Text record size + 0, # Unused + 0 # Unused + )) # 0 - 15 (0x0 - 0xf) uid = random.randint(0, 0xffffffff) title = normalize(unicode(metadata.title[0])).encode('utf-8') - # The MOBI Header # 0x0 - 0x3 record0.write(b'MOBI') @@ -270,7 +301,6 @@ class MobiWriter(object): # 0x18 - 0x1f : Unknown record0.write(b'\xff' * 8) - # 0x20 - 0x23 : Secondary index record record0.write(pack(b'>I', 0xffffffff)) @@ -279,7 +309,7 @@ class MobiWriter(object): # 0x40 - 0x43 : Offset of first non-text record record0.write(pack(b'>I', - self.text_nrecords + 1)) + self.last_text_record_idx + 1)) # 0x44 - 0x4b : title offset, title length record0.write(pack(b'>II', @@ -289,7 +319,7 @@ class MobiWriter(object): record0.write(iana2mobi( str(metadata.language[0]))) - # 0x50 - 0x57 : Unknown + # 0x50 - 0x57 : Input language and Output language record0.write(b'\0' * 8) # 0x58 - 0x5b : Format version @@ -348,19 +378,20 @@ class MobiWriter(object): # 0xe0 - 0xe3 : Extra record data # Extra record data flags: - # - 0x1: (?) - # - 0x2: GR - # - 0x4: - # GR: Use 7 for indexed files, 5 for unindexed + # - 0b1 : + # - 0b10 : + # - 0b100: # Setting bit 2 (0x2) disables functionality - extra_data_flags = 0b1 # Has multibyte overlap bytes + if self.primary_index_record_idx is not None: + extra_data_flags |= 0b10 if WRITE_UNCROSSABLE_BREAKS: extra_data_flags |= 0b100 record0.write(pack(b'>I', extra_data_flags)) # 0xe4 - 0xe7 : Primary index record - record0.write(pack(b'>I', 0xffffffff)) + record0.write(pack(b'>I', 0xffffffff if self.primary_index_record_idx + is None else self.primary_index_record_idx)) record0.write(exth) record0.write(title) @@ -371,7 +402,7 @@ class MobiWriter(object): self.records[0] = record0 # }}} - def build_exth(self): # {{{ + def build_exth(self): # EXTH Header {{{ oeb = self.oeb exth = StringIO() nrecs = 0 @@ -467,22 +498,10 @@ class MobiWriter(object): return b''.join(exth) # }}} - def add_thumbnail(self, item): - try: - data = rescale_image(item.data, dimen=MAX_THUMB_DIMEN, - maxsizeb=MAX_THUMB_SIZE) - except IOError: - self.oeb.logger.warn('Bad image file %r' % item.href) - return None - manifest = self.oeb.manifest - id, href = manifest.generate('thumbnail', 'thumbnail.jpeg') - manifest.add(id, href, 'image/jpeg', data=data) - index = len(self.images) + 1 - self.images[href] = index - self.records.append(data) - return index - - def write_header(self): + def write_header(self): # PalmDB header {{{ + ''' + Write the PalmDB header + ''' title = ascii_filename(unicode(self.oeb.metadata.title[0])) title = title + (b'\0' * (32 - len(title))) now = int(time.time()) @@ -494,6 +513,7 @@ class MobiWriter(object): self.write(pack(b'>I', offset), b'\0', pack(b'>I', 2*i)[1:]) offset += len(record) self.write(b'\0\0') + # }}} def write_content(self): for record in self.records: diff --git a/src/calibre/ebooks/mobi/writer2/serializer.py b/src/calibre/ebooks/mobi/writer2/serializer.py index 7f1ca3931e..d6878bee4a 100644 --- a/src/calibre/ebooks/mobi/writer2/serializer.py +++ b/src/calibre/ebooks/mobi/writer2/serializer.py @@ -138,7 +138,7 @@ class Serializer(object): buf = self.buf self.anchor_offset = buf.tell() buf.write(b'') - self.anchor_offset_kindle = buf.tell() + self.body_start_offset = buf.tell() spine = [item for item in self.oeb.spine if item.linear] spine.extend([item for item in self.oeb.spine if not item.linear]) for item in spine: