Start work on new MOBI indexing implementation

This commit is contained in:
Kovid Goyal 2011-07-22 18:48:48 -06:00
parent eab57e4f82
commit 60f1f24e66
6 changed files with 231 additions and 100 deletions

View File

@ -82,26 +82,6 @@ class MOBIOutput(OutputFormatPlugin):
else:
self.oeb.log.debug('Using mastheadImage supplied in manifest...')
def dump_toc(self, toc) :
    '''
    Diagnostic helper: log the title and href of the TOC root, followed by
    the title and href of every periodical, section and article found
    below it (three levels of nesting).
    '''
    emit = self.log
    emit( "\n >>> TOC contents <<<")
    emit( " toc.title: %s" % toc.title)
    emit( " toc.href: %s" % toc.href)
    for periodical in toc.nodes :
        emit( "\tperiodical title: %s" % periodical.title)
        emit( "\t href: %s" % periodical.href)
        for section in periodical :
            emit( "\t\tsection title: %s" % section.title)
            emit( "\t\tfirst article: %s" % section.href)
            for article in section :
                # repr() keeps non-ASCII titles loggable
                emit( "\t\t\tarticle title: %s" % repr(article.title))
                emit( "\t\t\t href: %s" % article.href)
def dump_manifest(self) :
    '''
    Diagnostic helper: log a banner followed by every href in the OEB
    manifest, one per line.
    '''
    emit = self.log
    emit( "\n >>> Manifest entries <<<")
    for entry in self.oeb.manifest.hrefs :
        emit("\t%s" % entry)
def periodicalize_toc(self):
from calibre.ebooks.oeb.base import TOC
toc = self.oeb.toc
@ -156,12 +136,6 @@ class MOBIOutput(OutputFormatPlugin):
# Fix up the periodical href to point to first section href
toc.nodes[0].href = toc.nodes[0].nodes[0].href
# diagnostics
if self.opts.verbose > 3:
self.dump_toc(toc)
self.dump_manifest()
def convert(self, oeb, output_path, input_plugin, opts, log):
self.log, self.opts, self.oeb = log, opts, oeb
from calibre.ebooks.mobi.mobiml import MobiMLizer

View File

@ -177,3 +177,23 @@ def get_trailing_data(record, extra_data_flags):
record = record[:-sz]
return data, record
def encode_trailing_data(raw):
    '''
    Given some data in the bytestring raw, return a bytestring of the form

        <data><size>

    where size is a backwards encoded vwi whose value is the length of the
    entire returned bytestring (data plus the size field itself).

    This is the encoding used for trailing data entries at the end of text
    records. See get_trailing_data() for details.
    '''
    # The size field counts its own bytes, so its width has to be guessed
    # and then widened until the encoded size is exactly that many bytes.
    width = 1
    size = encint(len(raw) + width, forward=False)
    while len(size) != width:
        width += 1
        size = encint(len(raw) + width, forward=False)
    return raw + size

View File

@ -12,4 +12,5 @@ UNCOMPRESSED = 1
PALMDOC = 2
HUFFDIC = 17480
PALM_MAX_IMAGE_SIZE = 63 * 1024
RECORD_SIZE = 0x1000 # 4096 (Text record size (uncompressed))

View File

@ -0,0 +1,116 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from struct import pack
from cStringIO import StringIO
from collections import OrderedDict
from calibre.ebooks import normalize
from calibre.ebooks.mobi.utils import encint
def utf8_text(text):
    '''
    Convert a possibly null string to utf-8 bytes, guaranteeing to return a
    non empty, normalized bytestring. Null, empty and whitespace-only input
    is replaced by the translated word 'Unknown'.
    '''
    stripped = text.strip() if text else text
    if not stripped:
        return _('Unknown').encode('utf-8')
    if not isinstance(stripped, unicode):
        # Bytestrings are assumed to be utf-8; undecodable bytes are replaced
        stripped = stripped.decode('utf-8', 'replace')
    return normalize(stripped).encode('utf-8')
def align_block(raw, multiple=4, pad=b'\0'):
    '''
    Return raw with enough pad bytes appended to ensure its length is a
    multiple of `multiple` (4 by default). If the length is already
    aligned, nothing is appended.
    '''
    # -len % multiple is the distance to the next multiple (0 when aligned)
    missing = -len(raw) % multiple
    return raw + pad * missing
class CNCX(object): # {{{

    '''
    Create the CNCX records. These are records containing all the strings from
    the NCX. Each record is of the form: <vwi string size><utf-8 encoded
    string>

    After construction, ``self.records`` is the list of CNCX record
    bytestrings and ``self.strings`` maps every collected string to its
    offset. The offset of a string stored in record number ``n`` (counting
    from zero) is ``n * 0x10000`` plus the number of string bytes written to
    that record before it. Offsets can be looked up with ``cncx[string]``.
    '''

    MAX_STRING_LENGTH = 500

    # Records in PDB files cannot be larger than 0x10000, so a record is
    # padded and closed once it would grow past this limit, which is well
    # before that.
    RECORD_LIMIT = 0xfbf8

    def __init__(self, toc, opts):
        '''
        :param toc: The table of contents. Iterating over it must yield the
                    TOC items, each with title, klass, description and
                    author attributes.
        :param opts: Conversion options. Only ``opts.mobi_periodical`` is
                     read; when true, item descriptions and authors are
                     stored as well.
        '''
        self.strings = OrderedDict()

        for item in toc:
            # Skip the TOC object itself, should iteration yield it
            if item is toc:
                continue
            if opts.mobi_periodical:
                if item.description:
                    self.strings[item.description] = 0
                if item.author:
                    self.strings[item.author] = 0
            self.strings[item.title] = self.strings[item.klass] = 0

        self.records = []

        offset = 0
        buf = StringIO()
        # Iterate over a snapshot of the keys as the values are updated
        # inside the loop
        for key in tuple(self.strings):
            utf8 = utf8_text(key[:self.MAX_STRING_LENGTH])
            raw = encint(len(utf8)) + utf8
            # NOTE(review): the 6 byte safety margin is kept from the
            # original code, its origin is not documented here
            if self.RECORD_LIMIT - buf.tell() < 6 + len(raw):
                # The string does not fit: pad the current record up to the
                # limit, store it and start a new, empty record
                pad = self.RECORD_LIMIT - buf.tell()
                buf.write(b'\0' * pad)
                self.records.append(buf.getvalue())
                buf.seek(0)
                buf.truncate()
                offset = len(self.records) * 0x10000
            buf.write(raw)
            self.strings[key] = offset
            offset += len(raw)

        buf.write(b'\0') # CNCX must end with zero byte
        self.records.append(align_block(buf.getvalue()))

    def __getitem__(self, string):
        '''
        Return the CNCX offset of string. Raises KeyError if string was not
        collected from the TOC.
        '''
        return self.strings[string]
# }}}
class Indexer(object):

    '''
    Holds the state needed to build the index records: the serializer, the
    number of text records, the conversion options, the OEB book and the
    CNCX built from the book's TOC. Generated records are collected in
    self.records.
    '''

    def __init__(self, serializer, number_of_text_records, opts, oeb):
        self.oeb, self.opts = oeb, opts
        self.log = oeb.log
        self.serializer = serializer
        self.number_of_text_records = number_of_text_records

        self.cncx = CNCX(oeb.toc, opts)

        self.records = []

    def create_header(self):
        '''
        Write the first INDX header fields into a scratch buffer.

        NOTE(review): the buffer is neither stored nor returned, this method
        looks like unfinished work.
        '''
        buf = StringIO()
        fields = (
            # Ident
            b'INDX',
            # Header length
            pack(b'>I', 192),
            # Index type: 0 - normal, 2 - inflection
            pack(b'>I', 2),
        )
        for field in fields:
            buf.write(field)

View File

@ -17,8 +17,9 @@ from calibre.ebooks.mobi.writer2.serializer import Serializer
from calibre.ebooks.compression.palmdoc import compress_doc
from calibre.ebooks.mobi.langcodes import iana2mobi
from calibre.utils.filenames import ascii_filename
from calibre.ebooks.mobi.writer2 import PALMDOC, UNCOMPRESSED
from calibre.ebooks.mobi.utils import (rescale_image, encint)
from calibre.ebooks.mobi.writer2 import (PALMDOC, UNCOMPRESSED, RECORD_SIZE)
from calibre.ebooks.mobi.utils import (rescale_image, encint,
encode_trailing_data)
EXTH_CODES = {
'creator': 100,
@ -39,9 +40,6 @@ EXTH_CODES = {
# Disabled as I don't care about uncrossable breaks
WRITE_UNCROSSABLE_BREAKS = False
RECORD_SIZE = 0x1000 # 4096
MAX_THUMB_SIZE = 16 * 1024
MAX_THUMB_DIMEN = (180, 240)
@ -53,6 +51,7 @@ class MobiWriter(object):
self.write_page_breaks_after_item = write_page_breaks_after_item
self.compression = UNCOMPRESSED if opts.dont_compress else PALMDOC
self.prefer_author_sort = opts.prefer_author_sort
self.last_text_record_idx = 1
def __call__(self, oeb, path_or_stream):
if hasattr(path_or_stream, 'write'):
@ -79,9 +78,44 @@ class MobiWriter(object):
# Build the content records by calling the generators in a fixed order:
# image name mapping, text, index, uncrossable break trailers, images.
def generate_content(self):
self.map_image_names()
self.generate_text()
# Image records come after text records
# Index records come after text records
self.generate_index()
# Appends trailing data to the text records, so it runs after
# generate_text()
self.write_uncrossable_breaks()
# Image records come after index records
self.generate_images()
# Indexing {{{
# Placeholder for index generation. For now it only records that there is
# no primary index record; generate_record0() checks this value for None.
def generate_index(self):
self.primary_index_record_idx = None
# }}}
def write_uncrossable_breaks(self): # {{{
    '''
    Write information about uncrossable breaks (non linear items in
    the spine).

    For every text record, the breaks that fall before the end of the
    following RECORD_SIZE window are removed from self.serializer.breaks,
    encoded as successive deltas (in units of 8) and appended to the record
    as a trailing data entry. Does nothing unless WRITE_UNCROSSABLE_BREAKS
    is set.
    '''
    if WRITE_UNCROSSABLE_BREAKS:
        pending = self.serializer.breaks

        for idx in xrange(1, self.last_text_record_idx+1):
            window_start = idx * RECORD_SIZE
            cursor = window_start
            deltas = []
            # Consume every break that belongs to this window
            while pending and (pending[0] - window_start) < RECORD_SIZE:
                delta = (pending.pop(0) - cursor) >> 3
                deltas.append(encint(delta))
                # Advance by the rounded value that was actually encoded
                cursor += delta << 3
            self.records[idx] += encode_trailing_data(b''.join(deltas))
# }}}
# Images {{{
def map_image_names(self):
'''
Map image names to record indices, ensuring that the masthead image if
@ -120,23 +154,38 @@ class MobiWriter(object):
if self.first_image_record is None:
self.first_image_record = len(self.records) - 1
def add_thumbnail(self, item):
    '''
    Create a thumbnail from the image item, add it to the manifest and
    append it to the records.

    Returns the image index of the thumbnail, or None if the image data
    could not be rescaled.
    '''
    try:
        thumb = rescale_image(item.data, dimen=MAX_THUMB_DIMEN,
                maxsizeb=MAX_THUMB_SIZE)
    except IOError:
        self.oeb.logger.warn('Bad image file %r' % item.href)
        return None

    manifest = self.oeb.manifest
    thumb_id, thumb_href = manifest.generate('thumbnail', 'thumbnail.jpeg')
    manifest.add(thumb_id, thumb_href, 'image/jpeg', data=thumb)

    # Image indices are 1-based
    image_index = len(self.images) + 1
    self.images[thumb_href] = image_index
    self.records.append(thumb)
    return image_index
# }}}
# Text {{{
def generate_text(self):
self.oeb.logger.info('Serializing markup content...')
serializer = Serializer(self.oeb, self.images,
self.serializer = Serializer(self.oeb, self.images,
write_page_breaks_after_item=self.write_page_breaks_after_item)
text = serializer()
breaks = serializer.breaks
self.anchor_offset_kindle = serializer.anchor_offset_kindle
self.id_offsets = serializer.id_offsets
text = self.serializer()
self.content_length = len(text)
self.text_length = len(text)
text = StringIO(text)
buf = []
nrecords = 0
offset = 0
if self.compression != UNCOMPRESSED:
self.oeb.logger.info(' Compressing markup content...')
data, overlap = self.read_text_record(text)
while len(data) > 0:
@ -146,39 +195,15 @@ class MobiWriter(object):
record.write(data)
self.records.append(record.getvalue())
buf.append(self.records[-1])
nrecords += 1
offset += RECORD_SIZE
data, overlap = self.read_text_record(text)
# Write information about the multibyte character overlap, if any
record.write(overlap)
record.write(pack(b'>B', len(overlap)))
# Write information about uncrossable breaks (non linear items in
# the spine)
if WRITE_UNCROSSABLE_BREAKS:
nextra = 0
pbreak = 0
running = offset
# Write information about every uncrossable break that occurs in
# the next record.
while breaks and (breaks[0] - offset) < RECORD_SIZE:
pbreak = (breaks.pop(0) - running) >> 3
encoded = encint(pbreak)
record.write(encoded)
running += pbreak << 3
nextra += len(encoded)
lsize = 1
while True:
size = encint(nextra + lsize, forward=False)
if len(size) == lsize:
break
lsize += 1
record.write(size)
self.text_nrecords = nrecords + 1
self.last_text_record_idx = nrecords
def read_text_record(self, text):
'''
@ -230,25 +255,31 @@ class MobiWriter(object):
return data, overlap
def generate_end_records(self):
    '''
    Append the EOF record to self.records and remember its record index in
    self.flis_number.
    '''
    self.flis_number = len(self.records)
    # Must be a bytes literal: all records are bytestrings and a plain
    # literal becomes unicode when unicode_literals is in effect
    self.records.append(b'\xE9\x8E\x0D\x0A')
# }}}
def generate_record0(self): # {{{
def generate_record0(self): # MOBI header {{{
metadata = self.oeb.metadata
exth = self.build_exth()
last_content_record = len(self.records) - 1
# EOF record
self.records.append('\xE9\x8E\x0D\x0A')
self.generate_end_records()
record0 = StringIO()
# The PalmDOC Header
record0.write(pack(b'>HHIHHHH', self.compression, 0,
self.text_length,
self.text_nrecords-1, RECORD_SIZE, 0, 0)) # 0 - 15 (0x0 - 0xf)
# The MOBI Header
record0.write(pack(b'>HHIHHHH',
self.compression, # compression type # compression type
0, # Unused
self.text_length, # Text length
self.last_text_record_idx, # Number of text records or last tr idx
RECORD_SIZE, # Text record size
0, # Unused
0 # Unused
)) # 0 - 15 (0x0 - 0xf)
uid = random.randint(0, 0xffffffff)
title = normalize(unicode(metadata.title[0])).encode('utf-8')
# The MOBI Header
# 0x0 - 0x3
record0.write(b'MOBI')
@ -270,7 +301,6 @@ class MobiWriter(object):
# 0x18 - 0x1f : Unknown
record0.write(b'\xff' * 8)
# 0x20 - 0x23 : Secondary index record
record0.write(pack(b'>I', 0xffffffff))
@ -279,7 +309,7 @@ class MobiWriter(object):
# 0x40 - 0x43 : Offset of first non-text record
record0.write(pack(b'>I',
self.text_nrecords + 1))
self.last_text_record_idx + 1))
# 0x44 - 0x4b : title offset, title length
record0.write(pack(b'>II',
@ -289,7 +319,7 @@ class MobiWriter(object):
record0.write(iana2mobi(
str(metadata.language[0])))
# 0x50 - 0x57 : Unknown
# 0x50 - 0x57 : Input language and Output language
record0.write(b'\0' * 8)
# 0x58 - 0x5b : Format version
@ -348,19 +378,20 @@ class MobiWriter(object):
# 0xe0 - 0xe3 : Extra record data
# Extra record data flags:
# - 0x1: <extra multibyte bytes><size> (?)
# - 0x2: <TBS indexing description of this HTML record><size> GR
# - 0x4: <uncrossable breaks><size>
# GR: Use 7 for indexed files, 5 for unindexed
# - 0b1 : <extra multibyte bytes><size>
# - 0b10 : <TBS indexing description of this HTML record><size>
# - 0b100: <uncrossable breaks><size>
# Setting bit 2 (0x2) disables <guide><reference type="start"> functionality
extra_data_flags = 0b1 # Has multibyte overlap bytes
if self.primary_index_record_idx is not None:
extra_data_flags |= 0b10
if WRITE_UNCROSSABLE_BREAKS:
extra_data_flags |= 0b100
record0.write(pack(b'>I', extra_data_flags))
# 0xe4 - 0xe7 : Primary index record
record0.write(pack(b'>I', 0xffffffff))
record0.write(pack(b'>I', 0xffffffff if self.primary_index_record_idx
is None else self.primary_index_record_idx))
record0.write(exth)
record0.write(title)
@ -371,7 +402,7 @@ class MobiWriter(object):
self.records[0] = record0
# }}}
def build_exth(self): # {{{
def build_exth(self): # EXTH Header {{{
oeb = self.oeb
exth = StringIO()
nrecs = 0
@ -467,22 +498,10 @@ class MobiWriter(object):
return b''.join(exth)
# }}}
def add_thumbnail(self, item):
    '''
    Create a thumbnail from the image item, add it to the manifest and
    append it to the records.

    Returns the image index of the thumbnail, or None if the image data
    could not be rescaled.
    '''
    try:
        thumb = rescale_image(item.data, dimen=MAX_THUMB_DIMEN,
                maxsizeb=MAX_THUMB_SIZE)
    except IOError:
        self.oeb.logger.warn('Bad image file %r' % item.href)
        return None

    manifest = self.oeb.manifest
    thumb_id, thumb_href = manifest.generate('thumbnail', 'thumbnail.jpeg')
    manifest.add(thumb_id, thumb_href, 'image/jpeg', data=thumb)

    # Image indices are 1-based
    image_index = len(self.images) + 1
    self.images[thumb_href] = image_index
    self.records.append(thumb)
    return image_index
def write_header(self):
def write_header(self): # PalmDB header {{{
'''
Write the PalmDB header
'''
title = ascii_filename(unicode(self.oeb.metadata.title[0]))
title = title + (b'\0' * (32 - len(title)))
now = int(time.time())
@ -494,6 +513,7 @@ class MobiWriter(object):
self.write(pack(b'>I', offset), b'\0', pack(b'>I', 2*i)[1:])
offset += len(record)
self.write(b'\0\0')
# }}}
def write_content(self):
for record in self.records:

View File

@ -138,7 +138,7 @@ class Serializer(object):
buf = self.buf
self.anchor_offset = buf.tell()
buf.write(b'<body>')
self.anchor_offset_kindle = buf.tell()
self.body_start_offset = buf.tell()
spine = [item for item in self.oeb.spine if item.linear]
spine.extend([item for item in self.oeb.spine if not item.linear])
for item in spine: