mirror of
				https://github.com/kovidgoyal/calibre.git
				synced 2025-11-04 03:27:00 -05:00 
			
		
		
		
	Start work on new MOBI indexing implementation
This commit is contained in:
		
							parent
							
								
									eab57e4f82
								
							
						
					
					
						commit
						60f1f24e66
					
				@ -82,26 +82,6 @@ class MOBIOutput(OutputFormatPlugin):
 | 
				
			|||||||
        else:
 | 
					        else:
 | 
				
			||||||
            self.oeb.log.debug('Using mastheadImage supplied in manifest...')
 | 
					            self.oeb.log.debug('Using mastheadImage supplied in manifest...')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					 | 
				
			||||||
    def dump_toc(self, toc):
        '''
        Diagnostic helper: write the TOC tree to the log.

        Logs the title and href of ``toc`` itself, then of every entry in
        ``toc.nodes`` (the periodicals), every section found by iterating a
        periodical and every article found by iterating a section. Nesting
        depth is shown with leading tabs.
        '''
        emit = self.log
        emit("\n         >>> TOC contents <<<")
        emit("     toc.title: %s" % toc.title)
        emit("      toc.href: %s" % toc.href)
        for issue in toc.nodes:
            emit("\tperiodical title: %s" % issue.title)
            emit("\t            href: %s" % issue.href)
            for sect in issue:
                emit("\t\tsection title: %s" % sect.title)
                emit("\t\tfirst article: %s" % sect.href)
                for entry in sect:
                    # Article titles are logged via repr() so odd characters stay visible
                    emit("\t\t\tarticle title: %s" % repr(entry.title))
                    emit("\t\t\t         href: %s" % entry.href)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def dump_manifest(self):
        '''
        Diagnostic helper: log a header line followed by every href in
        ``self.oeb.manifest.hrefs``, one tab-indented href per log call.
        '''
        emit = self.log
        emit("\n         >>> Manifest entries <<<")
        for entry in self.oeb.manifest.hrefs:
            emit("\t%s" % entry)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def periodicalize_toc(self):
 | 
					    def periodicalize_toc(self):
 | 
				
			||||||
        from calibre.ebooks.oeb.base import TOC
 | 
					        from calibre.ebooks.oeb.base import TOC
 | 
				
			||||||
        toc = self.oeb.toc
 | 
					        toc = self.oeb.toc
 | 
				
			||||||
@ -156,12 +136,6 @@ class MOBIOutput(OutputFormatPlugin):
 | 
				
			|||||||
            # Fix up the periodical href to point to first section href
 | 
					            # Fix up the periodical href to point to first section href
 | 
				
			||||||
            toc.nodes[0].href = toc.nodes[0].nodes[0].href
 | 
					            toc.nodes[0].href = toc.nodes[0].nodes[0].href
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            # diagnostics
 | 
					 | 
				
			||||||
            if self.opts.verbose > 3:
 | 
					 | 
				
			||||||
                self.dump_toc(toc)
 | 
					 | 
				
			||||||
                self.dump_manifest()
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def convert(self, oeb, output_path, input_plugin, opts, log):
 | 
					    def convert(self, oeb, output_path, input_plugin, opts, log):
 | 
				
			||||||
        self.log, self.opts, self.oeb = log, opts, oeb
 | 
					        self.log, self.opts, self.oeb = log, opts, oeb
 | 
				
			||||||
        from calibre.ebooks.mobi.mobiml import MobiMLizer
 | 
					        from calibre.ebooks.mobi.mobiml import MobiMLizer
 | 
				
			||||||
 | 
				
			|||||||
@ -177,3 +177,23 @@ def get_trailing_data(record, extra_data_flags):
 | 
				
			|||||||
            record = record[:-sz]
 | 
					            record = record[:-sz]
 | 
				
			||||||
    return data, record
 | 
					    return data, record
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def encode_trailing_data(raw):
					    '''
					    Wrap the bytestring raw as a trailing data entry of the form

					        <data><size>

					    where size is a backwards encoded vwi holding the length of the whole
					    returned bytestring, i.e. the data plus the size field itself.

					    This is the layout used for trailing data entries at the end of text
					    records. See get_trailing_data() for details.
					    '''
					    # The size field counts its own bytes, so grow the assumed width until
					    # the encoded value really occupies that many bytes.
					    width = 1
					    suffix = encint(len(raw) + width, forward=False)
					    while len(suffix) != width:
					        width += 1
					        suffix = encint(len(raw) + width, forward=False)
					    return raw + suffix
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
				
			|||||||
@ -12,4 +12,5 @@ UNCOMPRESSED = 1
 | 
				
			|||||||
PALMDOC = 2
 | 
					PALMDOC = 2
 | 
				
			||||||
HUFFDIC = 17480
 | 
					HUFFDIC = 17480
 | 
				
			||||||
PALM_MAX_IMAGE_SIZE = 63 * 1024
 | 
					PALM_MAX_IMAGE_SIZE = 63 * 1024
 | 
				
			||||||
 | 
					RECORD_SIZE = 0x1000 # 4096 (Text record size (uncompressed))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
				
			|||||||
							
								
								
									
										116
									
								
								src/calibre/ebooks/mobi/writer2/indexer.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										116
									
								
								src/calibre/ebooks/mobi/writer2/indexer.py
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,116 @@
 | 
				
			|||||||
 | 
					#!/usr/bin/env python
 | 
				
			||||||
 | 
					# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
 | 
				
			||||||
 | 
					from __future__ import (unicode_literals, division, absolute_import,
 | 
				
			||||||
 | 
					                        print_function)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					__license__   = 'GPL v3'
 | 
				
			||||||
 | 
					__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
 | 
				
			||||||
 | 
					__docformat__ = 'restructuredtext en'
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from struct import pack
 | 
				
			||||||
 | 
					from cStringIO import StringIO
 | 
				
			||||||
 | 
					from collections import OrderedDict
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from calibre.ebooks import normalize
 | 
				
			||||||
 | 
					from calibre.ebooks.mobi.utils import encint
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def utf8_text(text):
					    '''
					    Turn a possibly null or blank string into a non empty utf-8 bytestring.

					    Surrounding whitespace is stripped, non unicode input is decoded as
					    utf-8 (undecodable bytes are replaced) and the result is passed through
					    normalize() before encoding. Null, empty or whitespace only input
					    yields the translated word 'Unknown'.
					    '''
					    stripped = text.strip() if text else text
					    if not stripped:
					        return _('Unknown').encode('utf-8')
					    if not isinstance(stripped, unicode):
					        stripped = stripped.decode('utf-8', 'replace')
					    return normalize(stripped).encode('utf-8')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def align_block(raw, multiple=4, pad=b'\0'):
					    '''
					    Return raw with enough copies of pad appended to make its length a
					    multiple of `multiple` (4 by default). Input that is already aligned
					    comes back without any padding.
					    '''
					    # -len % multiple is the distance up to the next multiple (0 if aligned)
					    missing = -len(raw) % multiple
					    return raw + pad * missing
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class CNCX(object): # {{{

					    '''
					    Create the CNCX records. These are records containing all the strings from
					    the NCX. Each record is of the form: <vwi string size><utf-8 encoded
					    string>

					    After construction, self.records holds the record bytestrings and
					    self.strings maps every collected string to its offset. An offset is
					    0x10000 times the index of the record holding the string plus the
					    position of the string inside that record. cncx[string] looks up the
					    offset of a string.
					    '''

					    MAX_STRING_LENGTH = 500

					    # Records in PDB files cannot be larger than 0x10000, so each CNCX
					    # record is closed (zero padded) well before that, at this size.
					    RECORD_LIMIT = 0xfbf8

					    def __init__(self, toc, opts):
					        '''
					        :param toc: The table of contents; iterating it yields the items
					                    whose title and klass (and, for periodicals, description
					                    and author) are stored.
					        :param opts: Conversion options; only opts.mobi_periodical is read.
					        '''
					        self.strings = OrderedDict()

					        for item in toc:
					            # Skip the root itself in case iteration yields it
					            if item is toc: continue
					            if opts.mobi_periodical:
					                if item.description:
					                    self.strings[item.description] = 0
					                if item.author:
					                    self.strings[item.author] = 0
					            self.strings[item.title] = self.strings[item.klass] = 0

					        self.records = []

					        offset = 0
					        buf = StringIO()
					        # Iterate over a snapshot of the keys, as the values are updated
					        # inside the loop
					        for key in tuple(self.strings):
					            # A null title/klass cannot be sliced, let utf8_text() substitute
					            # its placeholder for it instead
					            text = key[:self.MAX_STRING_LENGTH] if key else key
					            utf8 = utf8_text(text)
					            raw = encint(len(utf8)) + utf8

					            if self.RECORD_LIMIT - buf.tell() < 6 + len(raw):
					                # Not enough room left: zero pad the current record up to
					                # the limit, store it and start a new one
					                pad = self.RECORD_LIMIT - buf.tell()
					                buf.write(b'\0' * pad)
					                self.records.append(buf.getvalue())
					                # Rewind explicitly so the next write starts at position 0
					                # however truncate() treats the cursor
					                buf.seek(0)
					                buf.truncate(0)
					                offset = len(self.records) * 0x10000

					            buf.write(raw)
					            self.strings[key] = offset
					            offset += len(raw)

					        buf.write(b'\0') # CNCX must end with zero byte
					        self.records.append(align_block(buf.getvalue()))

					    def __getitem__(self, string):
					        '''Return the offset recorded for string (KeyError if unknown).'''
					        return self.strings[string]
					# }}}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class Indexer(object):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def __init__(self, serializer, number_of_text_records, opts, oeb):
 | 
				
			||||||
 | 
					        self.serializer = serializer
 | 
				
			||||||
 | 
					        self.number_of_text_records = number_of_text_records
 | 
				
			||||||
 | 
					        self.oeb = oeb
 | 
				
			||||||
 | 
					        self.log = oeb.log
 | 
				
			||||||
 | 
					        self.opts = opts
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        self.cncx = CNCX(oeb.toc, opts)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        self.records = []
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def create_header(self):
 | 
				
			||||||
 | 
					        buf = StringIO()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        # Ident
 | 
				
			||||||
 | 
					        buf.write(b'INDX')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        # Header length
 | 
				
			||||||
 | 
					        buf.write(pack(b'>I', 192))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        # Index type: 0 - normal, 2 - inflection
 | 
				
			||||||
 | 
					        buf.write(pack(b'>I', 2))
 | 
				
			||||||
@ -17,8 +17,9 @@ from calibre.ebooks.mobi.writer2.serializer import Serializer
 | 
				
			|||||||
from calibre.ebooks.compression.palmdoc import compress_doc
 | 
					from calibre.ebooks.compression.palmdoc import compress_doc
 | 
				
			||||||
from calibre.ebooks.mobi.langcodes import iana2mobi
 | 
					from calibre.ebooks.mobi.langcodes import iana2mobi
 | 
				
			||||||
from calibre.utils.filenames import ascii_filename
 | 
					from calibre.utils.filenames import ascii_filename
 | 
				
			||||||
from calibre.ebooks.mobi.writer2 import PALMDOC, UNCOMPRESSED
 | 
					from calibre.ebooks.mobi.writer2 import (PALMDOC, UNCOMPRESSED, RECORD_SIZE)
 | 
				
			||||||
from calibre.ebooks.mobi.utils import (rescale_image, encint)
 | 
					from calibre.ebooks.mobi.utils import (rescale_image, encint,
 | 
				
			||||||
 | 
					        encode_trailing_data)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
EXTH_CODES = {
 | 
					EXTH_CODES = {
 | 
				
			||||||
    'creator': 100,
 | 
					    'creator': 100,
 | 
				
			||||||
@ -39,9 +40,6 @@ EXTH_CODES = {
 | 
				
			|||||||
# Disabled as I dont care about uncrossable breaks
 | 
					# Disabled as I dont care about uncrossable breaks
 | 
				
			||||||
WRITE_UNCROSSABLE_BREAKS = False
 | 
					WRITE_UNCROSSABLE_BREAKS = False
 | 
				
			||||||
 | 
					
 | 
				
			||||||
RECORD_SIZE = 0x1000 # 4096
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
MAX_THUMB_SIZE = 16 * 1024
 | 
					MAX_THUMB_SIZE = 16 * 1024
 | 
				
			||||||
MAX_THUMB_DIMEN = (180, 240)
 | 
					MAX_THUMB_DIMEN = (180, 240)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -53,6 +51,7 @@ class MobiWriter(object):
 | 
				
			|||||||
        self.write_page_breaks_after_item = write_page_breaks_after_item
 | 
					        self.write_page_breaks_after_item = write_page_breaks_after_item
 | 
				
			||||||
        self.compression = UNCOMPRESSED if opts.dont_compress else PALMDOC
 | 
					        self.compression = UNCOMPRESSED if opts.dont_compress else PALMDOC
 | 
				
			||||||
        self.prefer_author_sort = opts.prefer_author_sort
 | 
					        self.prefer_author_sort = opts.prefer_author_sort
 | 
				
			||||||
 | 
					        self.last_text_record_idx = 1
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def __call__(self, oeb, path_or_stream):
 | 
					    def __call__(self, oeb, path_or_stream):
 | 
				
			||||||
        if hasattr(path_or_stream, 'write'):
 | 
					        if hasattr(path_or_stream, 'write'):
 | 
				
			||||||
@ -79,9 +78,44 @@ class MobiWriter(object):
 | 
				
			|||||||
    def generate_content(self):
 | 
					    def generate_content(self):
 | 
				
			||||||
        self.map_image_names()
 | 
					        self.map_image_names()
 | 
				
			||||||
        self.generate_text()
 | 
					        self.generate_text()
 | 
				
			||||||
        # Image records come after text records
 | 
					        # Index records come after text records
 | 
				
			||||||
 | 
					        self.generate_index()
 | 
				
			||||||
 | 
					        self.write_uncrossable_breaks()
 | 
				
			||||||
 | 
					        # Image records come after index records
 | 
				
			||||||
        self.generate_images()
 | 
					        self.generate_images()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # Indexing {{{
 | 
				
			||||||
 | 
					    def generate_index(self):
					        '''
					        Placeholder for index generation: for now it only records that no
					        primary index record exists.
					        '''
					        self.primary_index_record_idx = None
 | 
				
			||||||
 | 
					    # }}}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def write_uncrossable_breaks(self): # {{{
					        '''
					        Append a trailing data entry describing the uncrossable breaks (non
					        linear items in the spine) to every text record.

					        Does nothing unless WRITE_UNCROSSABLE_BREAKS is set. Break offsets are
					        popped off the front of self.serializer.breaks as they are written,
					        so that list is modified in place.
					        '''
					        if not WRITE_UNCROSSABLE_BREAKS:
					            return

					        pending = self.serializer.breaks

					        for rec_idx in xrange(1, self.last_text_record_idx+1):
					            rec_start = rec_idx * RECORD_SIZE
					            cursor = rec_start
					            entry = StringIO()

					            # Consume every break lying less than one record size past
					            # rec_start
					            while pending and (pending[0] - rec_start) < RECORD_SIZE:
					                # Stored as the distance from the previous position, divided
					                # by 8
					                delta = (pending.pop(0) - cursor) >> 3
					                entry.write(encint(delta))
					                cursor += delta << 3

					            self.records[rec_idx] += encode_trailing_data(entry.getvalue())
 | 
				
			||||||
 | 
					    # }}}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # Images {{{
 | 
				
			||||||
    def map_image_names(self):
 | 
					    def map_image_names(self):
 | 
				
			||||||
        '''
 | 
					        '''
 | 
				
			||||||
        Map image names to record indices, ensuring that the masthead image if
 | 
					        Map image names to record indices, ensuring that the masthead image if
 | 
				
			||||||
@ -120,23 +154,38 @@ class MobiWriter(object):
 | 
				
			|||||||
            if self.first_image_record is None:
 | 
					            if self.first_image_record is None:
 | 
				
			||||||
                self.first_image_record = len(self.records) - 1
 | 
					                self.first_image_record = len(self.records) - 1
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def add_thumbnail(self, item):
					        '''
					        Create a thumbnail record from the image in item.

					        The image data is rescaled to MAX_THUMB_DIMEN / MAX_THUMB_SIZE, added
					        to the OEB manifest as a new image/jpeg entry, registered in
					        self.images and appended to self.records.

					        :return: The image index assigned to the thumbnail, or None if the
					                 image could not be processed (a warning is logged).
					        '''
					        try:
					            thumb = rescale_image(item.data, dimen=MAX_THUMB_DIMEN,
					                    maxsizeb=MAX_THUMB_SIZE)
					        except IOError:
					            self.oeb.logger.warn('Bad image file %r' % item.href)
					            return None

					        manifest = self.oeb.manifest
					        thumb_id, thumb_href = manifest.generate('thumbnail', 'thumbnail.jpeg')
					        manifest.add(thumb_id, thumb_href, 'image/jpeg', data=thumb)

					        # Image indices are 1-based
					        image_index = len(self.images) + 1
					        self.images[thumb_href] = image_index
					        self.records.append(thumb)
					        return image_index
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # }}}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # Text {{{
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def generate_text(self):
 | 
					    def generate_text(self):
 | 
				
			||||||
        self.oeb.logger.info('Serializing markup content...')
 | 
					        self.oeb.logger.info('Serializing markup content...')
 | 
				
			||||||
        serializer = Serializer(self.oeb, self.images,
 | 
					        self.serializer = Serializer(self.oeb, self.images,
 | 
				
			||||||
                write_page_breaks_after_item=self.write_page_breaks_after_item)
 | 
					                write_page_breaks_after_item=self.write_page_breaks_after_item)
 | 
				
			||||||
        text = serializer()
 | 
					        text = self.serializer()
 | 
				
			||||||
        breaks = serializer.breaks
 | 
					 | 
				
			||||||
        self.anchor_offset_kindle = serializer.anchor_offset_kindle
 | 
					 | 
				
			||||||
        self.id_offsets = serializer.id_offsets
 | 
					 | 
				
			||||||
        self.content_length = len(text)
 | 
					        self.content_length = len(text)
 | 
				
			||||||
        self.text_length = len(text)
 | 
					        self.text_length = len(text)
 | 
				
			||||||
        text = StringIO(text)
 | 
					        text = StringIO(text)
 | 
				
			||||||
        buf = []
 | 
					 | 
				
			||||||
        nrecords = 0
 | 
					        nrecords = 0
 | 
				
			||||||
        offset = 0
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
        if self.compression != UNCOMPRESSED:
 | 
					        if self.compression != UNCOMPRESSED:
 | 
				
			||||||
            self.oeb.logger.info('  Compressing markup content...')
 | 
					            self.oeb.logger.info('  Compressing markup content...')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        data, overlap = self.read_text_record(text)
 | 
					        data, overlap = self.read_text_record(text)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        while len(data) > 0:
 | 
					        while len(data) > 0:
 | 
				
			||||||
@ -146,39 +195,15 @@ class MobiWriter(object):
 | 
				
			|||||||
            record.write(data)
 | 
					            record.write(data)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            self.records.append(record.getvalue())
 | 
					            self.records.append(record.getvalue())
 | 
				
			||||||
            buf.append(self.records[-1])
 | 
					 | 
				
			||||||
            nrecords += 1
 | 
					            nrecords += 1
 | 
				
			||||||
            offset += RECORD_SIZE
 | 
					 | 
				
			||||||
            data, overlap = self.read_text_record(text)
 | 
					            data, overlap = self.read_text_record(text)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            # Write information about the mutibyte character overlap, if any
 | 
					            # Write information about the mutibyte character overlap, if any
 | 
				
			||||||
            record.write(overlap)
 | 
					            record.write(overlap)
 | 
				
			||||||
            record.write(pack(b'>B', len(overlap)))
 | 
					            record.write(pack(b'>B', len(overlap)))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            # Write information about uncrossable breaks (non linear items in
 | 
					 | 
				
			||||||
            # the spine)
 | 
					 | 
				
			||||||
            if WRITE_UNCROSSABLE_BREAKS:
 | 
					 | 
				
			||||||
                nextra = 0
 | 
					 | 
				
			||||||
                pbreak = 0
 | 
					 | 
				
			||||||
                running = offset
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
                # Write information about every uncrossable break that occurs in
 | 
					        self.last_text_record_idx = nrecords
 | 
				
			||||||
                # the next record.
 | 
					 | 
				
			||||||
                while breaks and (breaks[0] - offset) < RECORD_SIZE:
 | 
					 | 
				
			||||||
                    pbreak = (breaks.pop(0) - running) >> 3
 | 
					 | 
				
			||||||
                    encoded = encint(pbreak)
 | 
					 | 
				
			||||||
                    record.write(encoded)
 | 
					 | 
				
			||||||
                    running += pbreak << 3
 | 
					 | 
				
			||||||
                    nextra += len(encoded)
 | 
					 | 
				
			||||||
                lsize = 1
 | 
					 | 
				
			||||||
                while True:
 | 
					 | 
				
			||||||
                    size = encint(nextra + lsize, forward=False)
 | 
					 | 
				
			||||||
                    if len(size) == lsize:
 | 
					 | 
				
			||||||
                        break
 | 
					 | 
				
			||||||
                    lsize += 1
 | 
					 | 
				
			||||||
                record.write(size)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        self.text_nrecords = nrecords + 1
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def read_text_record(self, text):
 | 
					    def read_text_record(self, text):
 | 
				
			||||||
        '''
 | 
					        '''
 | 
				
			||||||
@ -230,25 +255,31 @@ class MobiWriter(object):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
        return data, overlap
 | 
					        return data, overlap
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def generate_end_records(self):
 | 
					    # }}}
 | 
				
			||||||
        self.flis_number = len(self.records)
 | 
					 | 
				
			||||||
        self.records.append('\xE9\x8E\x0D\x0A')
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def generate_record0(self): # {{{
 | 
					    def generate_record0(self): #  MOBI header {{{
 | 
				
			||||||
        metadata = self.oeb.metadata
 | 
					        metadata = self.oeb.metadata
 | 
				
			||||||
        exth = self.build_exth()
 | 
					        exth = self.build_exth()
 | 
				
			||||||
        last_content_record = len(self.records) - 1
 | 
					        last_content_record = len(self.records) - 1
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        # EOF record
 | 
				
			||||||
 | 
					        self.records.append('\xE9\x8E\x0D\x0A')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        self.generate_end_records()
 | 
					        self.generate_end_records()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        record0 = StringIO()
 | 
					        record0 = StringIO()
 | 
				
			||||||
        # The PalmDOC Header
 | 
					        # The MOBI Header
 | 
				
			||||||
        record0.write(pack(b'>HHIHHHH', self.compression, 0,
 | 
					        record0.write(pack(b'>HHIHHHH',
 | 
				
			||||||
            self.text_length,
 | 
					            self.compression, # compression type # compression type
 | 
				
			||||||
            self.text_nrecords-1, RECORD_SIZE, 0, 0)) # 0 - 15 (0x0 - 0xf)
 | 
					            0, # Unused
 | 
				
			||||||
 | 
					            self.text_length, # Text length
 | 
				
			||||||
 | 
					            self.last_text_record_idx, # Number of text records or last tr idx
 | 
				
			||||||
 | 
					            RECORD_SIZE, # Text record size
 | 
				
			||||||
 | 
					            0, # Unused
 | 
				
			||||||
 | 
					            0  # Unused
 | 
				
			||||||
 | 
					        )) # 0 - 15 (0x0 - 0xf)
 | 
				
			||||||
        uid = random.randint(0, 0xffffffff)
 | 
					        uid = random.randint(0, 0xffffffff)
 | 
				
			||||||
        title = normalize(unicode(metadata.title[0])).encode('utf-8')
 | 
					        title = normalize(unicode(metadata.title[0])).encode('utf-8')
 | 
				
			||||||
        # The MOBI Header
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # 0x0 - 0x3
 | 
					        # 0x0 - 0x3
 | 
				
			||||||
        record0.write(b'MOBI')
 | 
					        record0.write(b'MOBI')
 | 
				
			||||||
@ -270,7 +301,6 @@ class MobiWriter(object):
 | 
				
			|||||||
        # 0x18 - 0x1f : Unknown
 | 
					        # 0x18 - 0x1f : Unknown
 | 
				
			||||||
        record0.write(b'\xff' * 8)
 | 
					        record0.write(b'\xff' * 8)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					 | 
				
			||||||
        # 0x20 - 0x23 : Secondary index record
 | 
					        # 0x20 - 0x23 : Secondary index record
 | 
				
			||||||
        record0.write(pack(b'>I', 0xffffffff))
 | 
					        record0.write(pack(b'>I', 0xffffffff))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -279,7 +309,7 @@ class MobiWriter(object):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
        # 0x40 - 0x43 : Offset of first non-text record
 | 
					        # 0x40 - 0x43 : Offset of first non-text record
 | 
				
			||||||
        record0.write(pack(b'>I',
 | 
					        record0.write(pack(b'>I',
 | 
				
			||||||
            self.text_nrecords + 1))
 | 
					            self.last_text_record_idx + 1))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # 0x44 - 0x4b : title offset, title length
 | 
					        # 0x44 - 0x4b : title offset, title length
 | 
				
			||||||
        record0.write(pack(b'>II',
 | 
					        record0.write(pack(b'>II',
 | 
				
			||||||
@ -289,7 +319,7 @@ class MobiWriter(object):
 | 
				
			|||||||
        record0.write(iana2mobi(
 | 
					        record0.write(iana2mobi(
 | 
				
			||||||
            str(metadata.language[0])))
 | 
					            str(metadata.language[0])))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # 0x50 - 0x57 : Unknown
 | 
					        # 0x50 - 0x57 : Input language and Output language
 | 
				
			||||||
        record0.write(b'\0' * 8)
 | 
					        record0.write(b'\0' * 8)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # 0x58 - 0x5b : Format version
 | 
					        # 0x58 - 0x5b : Format version
 | 
				
			||||||
@ -348,19 +378,20 @@ class MobiWriter(object):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
        # 0xe0 - 0xe3 : Extra record data
 | 
					        # 0xe0 - 0xe3 : Extra record data
 | 
				
			||||||
        # Extra record data flags:
 | 
					        # Extra record data flags:
 | 
				
			||||||
        #   - 0x1: <extra multibyte bytes><size> (?)
 | 
					        #   - 0b1  : <extra multibyte bytes><size>
 | 
				
			||||||
        #   - 0x2: <TBS indexing description of this HTML record><size> GR
 | 
					        #   - 0b10 : <TBS indexing description of this HTML record><size>
 | 
				
			||||||
        #   - 0x4: <uncrossable breaks><size>
 | 
					        #   - 0b100: <uncrossable breaks><size>
 | 
				
			||||||
        # GR: Use 7 for indexed files, 5 for unindexed
 | 
					 | 
				
			||||||
        # Setting bit 2 (0x2) disables <guide><reference type="start"> functionality
 | 
					        # Setting bit 2 (0x2) disables <guide><reference type="start"> functionality
 | 
				
			||||||
 | 
					 | 
				
			||||||
        extra_data_flags = 0b1 # Has multibyte overlap bytes
 | 
					        extra_data_flags = 0b1 # Has multibyte overlap bytes
 | 
				
			||||||
 | 
					        if self.primary_index_record_idx is not None:
 | 
				
			||||||
 | 
					            extra_data_flags |= 0b10
 | 
				
			||||||
        if WRITE_UNCROSSABLE_BREAKS:
 | 
					        if WRITE_UNCROSSABLE_BREAKS:
 | 
				
			||||||
            extra_data_flags |= 0b100
 | 
					            extra_data_flags |= 0b100
 | 
				
			||||||
        record0.write(pack(b'>I', extra_data_flags))
 | 
					        record0.write(pack(b'>I', extra_data_flags))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # 0xe4 - 0xe7 : Primary index record
 | 
					        # 0xe4 - 0xe7 : Primary index record
 | 
				
			||||||
        record0.write(pack(b'>I', 0xffffffff))
 | 
					        record0.write(pack(b'>I', 0xffffffff if self.primary_index_record_idx
 | 
				
			||||||
 | 
					            is None else self.primary_index_record_idx))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        record0.write(exth)
 | 
					        record0.write(exth)
 | 
				
			||||||
        record0.write(title)
 | 
					        record0.write(title)
 | 
				
			||||||
@ -371,7 +402,7 @@ class MobiWriter(object):
 | 
				
			|||||||
        self.records[0] = record0
 | 
					        self.records[0] = record0
 | 
				
			||||||
    # }}}
 | 
					    # }}}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def build_exth(self): # {{{
 | 
					    def build_exth(self): # EXTH Header {{{
 | 
				
			||||||
        oeb = self.oeb
 | 
					        oeb = self.oeb
 | 
				
			||||||
        exth = StringIO()
 | 
					        exth = StringIO()
 | 
				
			||||||
        nrecs = 0
 | 
					        nrecs = 0
 | 
				
			||||||
@ -467,22 +498,10 @@ class MobiWriter(object):
 | 
				
			|||||||
        return b''.join(exth)
 | 
					        return b''.join(exth)
 | 
				
			||||||
    # }}}
 | 
					    # }}}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def add_thumbnail(self, item):
 | 
					    def write_header(self): # PalmDB header {{{
 | 
				
			||||||
        try:
 | 
					        '''
 | 
				
			||||||
            data = rescale_image(item.data, dimen=MAX_THUMB_DIMEN,
 | 
					        Write the PalmDB header
 | 
				
			||||||
                    maxsizeb=MAX_THUMB_SIZE)
 | 
					        '''
 | 
				
			||||||
        except IOError:
 | 
					 | 
				
			||||||
            self.oeb.logger.warn('Bad image file %r' % item.href)
 | 
					 | 
				
			||||||
            return None
 | 
					 | 
				
			||||||
        manifest = self.oeb.manifest
 | 
					 | 
				
			||||||
        id, href = manifest.generate('thumbnail', 'thumbnail.jpeg')
 | 
					 | 
				
			||||||
        manifest.add(id, href, 'image/jpeg', data=data)
 | 
					 | 
				
			||||||
        index = len(self.images) + 1
 | 
					 | 
				
			||||||
        self.images[href] = index
 | 
					 | 
				
			||||||
        self.records.append(data)
 | 
					 | 
				
			||||||
        return index
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def write_header(self):
 | 
					 | 
				
			||||||
        title = ascii_filename(unicode(self.oeb.metadata.title[0]))
 | 
					        title = ascii_filename(unicode(self.oeb.metadata.title[0]))
 | 
				
			||||||
        title = title + (b'\0' * (32 - len(title)))
 | 
					        title = title + (b'\0' * (32 - len(title)))
 | 
				
			||||||
        now = int(time.time())
 | 
					        now = int(time.time())
 | 
				
			||||||
@ -494,6 +513,7 @@ class MobiWriter(object):
 | 
				
			|||||||
            self.write(pack(b'>I', offset), b'\0', pack(b'>I', 2*i)[1:])
 | 
					            self.write(pack(b'>I', offset), b'\0', pack(b'>I', 2*i)[1:])
 | 
				
			||||||
            offset += len(record)
 | 
					            offset += len(record)
 | 
				
			||||||
        self.write(b'\0\0')
 | 
					        self.write(b'\0\0')
 | 
				
			||||||
 | 
					    # }}}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def write_content(self):
 | 
					    def write_content(self):
 | 
				
			||||||
        for record in self.records:
 | 
					        for record in self.records:
 | 
				
			||||||
 | 
				
			|||||||
@ -138,7 +138,7 @@ class Serializer(object):
 | 
				
			|||||||
        buf = self.buf
 | 
					        buf = self.buf
 | 
				
			||||||
        self.anchor_offset = buf.tell()
 | 
					        self.anchor_offset = buf.tell()
 | 
				
			||||||
        buf.write(b'<body>')
 | 
					        buf.write(b'<body>')
 | 
				
			||||||
        self.anchor_offset_kindle = buf.tell()
 | 
					        self.body_start_offset = buf.tell()
 | 
				
			||||||
        spine = [item for item in self.oeb.spine if item.linear]
 | 
					        spine = [item for item in self.oeb.spine if item.linear]
 | 
				
			||||||
        spine.extend([item for item in self.oeb.spine if not item.linear])
 | 
					        spine.extend([item for item in self.oeb.spine if not item.linear])
 | 
				
			||||||
        for item in spine:
 | 
					        for item in spine:
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user