KF8 Output: Create NCX and Guide records

This commit is contained in:
Kovid Goyal 2012-04-22 10:17:06 +05:30
parent 0db1fcb103
commit e4a55aae56
5 changed files with 208 additions and 26 deletions

View File

@ -599,4 +599,8 @@ class CNCX(object): # {{{
# }}} # }}}
def is_guide_ref_start(ref):
    '''
    Return a truthy value if the guide reference *ref* (an object with
    ``title`` and ``type`` attributes) marks the "start reading" location
    of the book.
    '''
    if ref.title.lower() == 'start':
        return True
    return ref.type and ref.type.lower() in {'start', 'other.start', 'text'}

View File

@ -12,6 +12,7 @@ import re
from calibre.ebooks.oeb.base import (OEB_DOCS, XHTML, XHTML_NS, XML_NS, from calibre.ebooks.oeb.base import (OEB_DOCS, XHTML, XHTML_NS, XML_NS,
namespace, prefixname, urlnormalize) namespace, prefixname, urlnormalize)
from calibre.ebooks.mobi.mobiml import MBP_NS from calibre.ebooks.mobi.mobiml import MBP_NS
from calibre.ebooks.mobi.utils import is_guide_ref_start
from collections import defaultdict from collections import defaultdict
from urlparse import urldefrag from urlparse import urldefrag
@ -161,9 +162,7 @@ class Serializer(object):
buf.write(b'title="') buf.write(b'title="')
self.serialize_text(ref.title, quot=True) self.serialize_text(ref.title, quot=True)
buf.write(b'" ') buf.write(b'" ')
if (ref.title.lower() == 'start' or if is_guide_ref_start(ref):
(ref.type and ref.type.lower() in {'start',
'other.start', 'text'})):
self._start_href = ref.href self._start_href = ref.href
self.serialize_href(ref.href) self.serialize_href(ref.href)
# Space required or won't work, I kid you not # Space required or won't work, I kid you not

View File

@ -15,9 +15,10 @@ from io import BytesIO
from calibre.ebooks.mobi.utils import CNCX, encint, align_block from calibre.ebooks.mobi.utils import CNCX, encint, align_block
from calibre.ebooks.mobi.writer8.header import Header from calibre.ebooks.mobi.writer8.header import Header
TagMeta = namedtuple('TagMeta', TagMeta_ = namedtuple('TagMeta',
'name number values_per_entry bitmask end_flag') 'name number values_per_entry bitmask end_flag')
EndTagTable = TagMeta('eof', 0, 0, 0, 1) TagMeta = lambda x:TagMeta_(*x)
EndTagTable = TagMeta(('eof', 0, 0, 0, 1))
# map of mask to number of shifts needed, works with 1 bit and two-bit wide masks # map of mask to number of shifts needed, works with 1 bit and two-bit wide masks
# could also be extended to 4 bit wide ones as well # could also be extended to 4 bit wide ones as well
@ -118,7 +119,10 @@ class Index(object): # {{{
cbs.append(ans) cbs.append(ans)
ans = 0 ans = 0
continue continue
try:
nvals = len(tags.get(name, ())) nvals = len(tags.get(name, ()))
except TypeError:
nvals = 1
nentries = nvals // vpe nentries = nvals // vpe
shifts = mask_to_bit_shifts[mask] shifts = mask_to_bit_shifts[mask]
ans |= mask & (nentries << shifts) ans |= mask & (nentries << shifts)
@ -132,36 +136,51 @@ class Index(object): # {{{
self.entries) self.entries)
rendered_entries = [] rendered_entries = []
offset = 0
index, idxt, buf = BytesIO(), BytesIO(), BytesIO() index, idxt, buf = BytesIO(), BytesIO(), BytesIO()
IndexEntry = namedtuple('IndexEntry', 'offset length raw') IndexEntry = namedtuple('IndexEntry', 'offset length raw')
last_lead_text = b''
too_large = ValueError('Index has too many entries, calibre does not'
' support generating multiple index records at this'
' time.')
for i, x in enumerate(self.entries): for i, x in enumerate(self.entries):
control_bytes = self.control_bytes[i] control_bytes = self.control_bytes[i]
leading_text, tags = x leading_text, tags = x
buf.truncate(0) buf.seek(0), buf.truncate(0)
leading_text = (leading_text.encode('utf-8') if
isinstance(leading_text, unicode) else leading_text)
raw = bytearray(leading_text) raw = bytearray(leading_text)
raw.insert(0, len(leading_text)) raw.insert(0, len(leading_text))
buf.write(bytes(raw)) buf.write(bytes(raw))
buf.write(control_bytes) buf.write(bytes(bytearray(control_bytes)))
for tag in self.tag_types: for tag in self.tag_types:
values = tags.get(tag.name, None) values = tags.get(tag.name, None)
if values is None: continue
try:
len(values)
except TypeError:
values = [values]
if values: if values:
for val in values: for val in values:
try:
buf.write(encint(val)) buf.write(encint(val))
except ValueError:
raise ValueError('Invalid values for %r: %r'%(
tag, values))
raw = buf.getvalue() raw = buf.getvalue()
offset = index.tell()
if offset + self.HEADER_LENGTH >= 0x10000:
raise too_large
rendered_entries.append(IndexEntry(offset, len(raw), raw)) rendered_entries.append(IndexEntry(offset, len(raw), raw))
idxt.write(pack(b'>H', self.HEADER_LENGTH+offset)) idxt.write(pack(b'>H', self.HEADER_LENGTH+offset))
offset += len(raw)
index.write(raw) index.write(raw)
last_lead_text = leading_text
index_block = align_block(index.getvalue()) index_block = align_block(index.getvalue())
idxt_block = align_block(b'IDXT' + idxt.getvalue()) idxt_block = align_block(b'IDXT' + idxt.getvalue())
body = index_block + idxt_block body = index_block + idxt_block
if len(body) + self.HEADER_LENGTH >= 0x10000: if len(body) + self.HEADER_LENGTH >= 0x10000:
raise ValueError('Index has too many entries, calibre does not' raise too_large
' support generating multiple index records at this'
' time.')
header = b'INDX' header = b'INDX'
buf.truncate(0) buf.truncate(0)
buf.write(pack(b'>I', self.HEADER_LENGTH)) buf.write(pack(b'>I', self.HEADER_LENGTH))
@ -185,10 +204,15 @@ class Index(object): # {{{
tagx = self.generate_tagx() tagx = self.generate_tagx()
idxt = (b'IDXT' + pack(b'>H', IndexHeader.HEADER_LENGTH + len(tagx)) + idxt = (b'IDXT' + pack(b'>H', IndexHeader.HEADER_LENGTH + len(tagx)) +
b'\0') b'\0')
# Last index
idx = bytes(bytearray([len(last_lead_text)])) + last_lead_text
idx += pack(b'>H', len(rendered_entries))
header = { header = {
'num_of_entries': len(rendered_entries), 'num_of_entries': len(rendered_entries),
'num_of_cncx': len(self.cncx), 'num_of_cncx': len(self.cncx),
'tagx':tagx, 'tagx':tagx,
'last_index':align_block(idx),
'idxt':idxt 'idxt':idxt
} }
header = IndexHeader()(**header) header = IndexHeader()(**header)
@ -235,6 +259,74 @@ class ChunkIndex(Index):
'file_number':c.file_number, 'file_number':c.file_number,
'sequence_number':c.sequence_number, 'sequence_number':c.sequence_number,
'geometry':(c.start_pos, c.length), 'geometry':(c.start_pos, c.length),
}) for s in chunk_table }) for c in chunk_table
] ]
class GuideIndex(Index):

    # Each guide entry carries its title (stored in the CNCX) and a
    # (chunk, offset) pos_fid pair pointing into the text
    tag_types = tuple(map(TagMeta, (
        ('title', 1, 1, 1, 0),
        ('pos_fid', 6, 2, 2, 0),
        EndTagTable
    )))

    def __init__(self, guide_table):
        # Titles live in the CNCX; entries map the guide reference type to
        # its tag values
        self.cncx = CNCX(ref.title for ref in guide_table)
        entries = []
        for ref in guide_table:
            tags = {
                'title': self.cncx[ref.title],
                'pos_fid': ref.pos_fid,
            }
            entries.append((ref.type, tags))
        self.entries = entries
class NCXIndex(Index):

    control_byte_count = 2
    # Tag layout for NCX entries; the second table (after the first
    # EndTagTable) holds the periodical-only tags
    tag_types = tuple(map(TagMeta, (
        ('offset', 1, 1, 1, 0),
        ('length', 2, 1, 2, 0),
        ('label', 3, 1, 4, 0),
        ('depth', 4, 1, 8, 0),
        ('parent', 21, 1, 16, 0),
        ('first_child', 22, 1, 32, 0),
        ('last_child', 23, 1, 64, 0),
        ('pos_fid', 6, 2, 128, 0),
        EndTagTable,
        ('image', 69, 1, 1, 0),
        ('description', 70, 1, 2, 0),
        ('author', 71, 1, 4, 0),
        ('caption', 72, 1, 8, 0),
        ('attribution', 73, 1, 16, 0),
        EndTagTable
    )))

    def __init__(self, toc_table):
        # Gather every string used by the ToC into the CNCX, preserving the
        # per-entry order: label, then author, then description
        strings = []
        for entry in toc_table:
            strings.append(entry['label'])
            for key in ('author', 'description'):
                val = entry.get(key, None)
                if val:
                    strings.append(val)
        self.cncx = CNCX(strings)

        def to_entry(x):
            # Numeric tags are copied through; string tags are replaced by
            # their offsets into the CNCX
            tags = {}
            for name in ('offset', 'length', 'depth', 'pos_fid', 'parent',
                    'first_child', 'last_child'):
                if name in x:
                    tags[name] = x[name]
            for name in ('label', 'description', 'author'):
                if name in x:
                    tags[name] = self.cncx[x[name]]
            return ('%02x' % x['index'], tags)

        self.entries = [to_entry(x) for x in toc_table]

View File

@ -17,12 +17,15 @@ import cssutils
from lxml import etree from lxml import etree
from calibre import isbytestring, force_unicode from calibre import isbytestring, force_unicode
from calibre.ebooks.mobi.utils import create_text_record, to_base from calibre.ebooks.mobi.utils import (create_text_record, to_base,
is_guide_ref_start)
from calibre.ebooks.compression.palmdoc import compress_doc from calibre.ebooks.compression.palmdoc import compress_doc
from calibre.ebooks.oeb.base import (OEB_DOCS, OEB_STYLES, SVG_MIME, XPath, from calibre.ebooks.oeb.base import (OEB_DOCS, OEB_STYLES, SVG_MIME, XPath,
extract, XHTML, urlnormalize) extract, XHTML, urlnormalize)
from calibre.ebooks.oeb.parse_utils import barename from calibre.ebooks.oeb.parse_utils import barename
from calibre.ebooks.mobi.writer8.skeleton import Chunker, aid_able_tags, to_href from calibre.ebooks.mobi.writer8.skeleton import Chunker, aid_able_tags, to_href
from calibre.ebooks.mobi.writer8.index import (NCXIndex, SkelIndex,
ChunkIndex, GuideIndex)
XML_DOCS = OEB_DOCS | {SVG_MIME} XML_DOCS = OEB_DOCS | {SVG_MIME}
@ -38,11 +41,11 @@ class KF8Writer(object):
self.log.info('Creating KF8 output') self.log.info('Creating KF8 output')
self.used_images = set() self.used_images = set()
self.resources = resources self.resources = resources
self.dup_data()
self.flows = [None] # First flow item is reserved for the text self.flows = [None] # First flow item is reserved for the text
self.records = [] self.records = []
self.fdst_table = []
self.log('\tGenerating KF8 markup...')
self.dup_data()
self.replace_resource_links() self.replace_resource_links()
self.extract_css_into_flows() self.extract_css_into_flows()
self.extract_svg_into_flows() self.extract_svg_into_flows()
@ -52,7 +55,10 @@ class KF8Writer(object):
# Dump the cloned data as it is no longer needed # Dump the cloned data as it is no longer needed
del self._data_cache del self._data_cache
self.create_text_records() self.create_text_records()
self.create_fdst_table() self.log('\tCreating indices...')
self.create_fdst_records()
self.create_indices()
self.create_guide()
def dup_data(self): def dup_data(self):
''' Duplicate data so that any changes we make to markup/CSS only ''' Duplicate data so that any changes we make to markup/CSS only
@ -231,7 +237,7 @@ class KF8Writer(object):
records_size = 0 records_size = 0
if self.compress: if self.compress:
self.oeb.logger.info(' Compressing markup content...') self.oeb.logger.info('\tCompressing markup...')
while text.tell() < self.text_length: while text.tell() < self.text_length:
data, overlap = create_text_record(text) data, overlap = create_text_record(text)
@ -252,9 +258,90 @@ class KF8Writer(object):
self.records.append(b'\x00'*(records_size % 4)) self.records.append(b'\x00'*(records_size % 4))
self.first_non_text_record_idx += 1 self.first_non_text_record_idx += 1
def create_fdst_table(self): def create_fdst_records(self):
FDST = namedtuple('Flow', 'start end') FDST = namedtuple('Flow', 'start end')
entries = []
self.fdst_table = []
for i, flow in enumerate(self.flows): for i, flow in enumerate(self.flows):
start = 0 if i == 0 else self.fdst_table[-1].end start = 0 if i == 0 else self.fdst_table[-1].end
self.fdst_table.append(FDST(start, start + len(flow))) self.fdst_table.append(FDST(start, start + len(flow)))
entries.extend(self.fdst_table[-1])
rec = (b'FDST' + pack(b'>LL', len(self.fdst_table), 12) +
pack(b'>%dL'%len(entries), *entries))
self.fdst_records = [rec]
def create_indices(self):
    # Build the SKEL, chunk and NCX index records for the KF8 output.
    # Sets self.skel_records, self.chunk_records and self.ncx_records.
    self.skel_records = SkelIndex(self.skel_table)()
    self.chunk_records = ChunkIndex(self.chunk_table)()
    self.ncx_records = []
    toc = self.oeb.toc
    max_depth = toc.depth()
    entries = []
    is_periodical = self.opts.mobi_periodical
    if toc.count() < 2:
        # Nothing useful to index; leave ncx_records empty
        self.log.warn('Document has no ToC, MOBI will have no NCX index')
        return
    # Flatten the ToC into a depth first list
    # (periodicals include the root node, books skip it, hence the depth
    # adjustment below)
    fl = toc.iter() if is_periodical else toc.iterdescendants()
    for i, item in enumerate(fl):
        entry = {'index':i, 'depth': max_depth - item.depth() - (0 if
            is_periodical else 1), 'href':item.href, 'label':(item.title or
            _('Unknown'))}
        entries.append(entry)
        # Record this entry as the parent of its direct children so the
        # parent index can be looked up when the child is visited later
        for child in item:
            child.ncx_parent = entry
        p = getattr(item, 'ncx_parent', None)
        if p is not None:
            entry['parent'] = p['index']
        if is_periodical:
            # Author/description tags are only emitted for periodicals
            if item.author:
                entry['author'] = item.author
            if item.description:
                entry['description'] = item.description
    for entry in entries:
        # Fill in first_child/last_child now that all indices are known
        children = [e for e in entries if e.get('parent', -1) == entry['index']]
        if children:
            entry['first_child'] = children[0]['index']
            entry['last_child'] = children[-1]['index']
        # Resolve the destination anchor to a (chunk, offset) pair; fall
        # back to the start of the file part, then to chunk 0
        href = entry.pop('href')
        href, frag = href.partition('#')[0::2]
        aid = self.id_map.get((href, frag), None)
        if aid is None:
            aid = self.id_map.get((href, ''), None)
        if aid is None:
            pos, fid = 0, 0
        else:
            pos, fid = self.aid_offset_map[aid]
        chunk = self.chunk_table[pos]
        offset = chunk.insert_pos + fid
        length = chunk.length
        entry['pos_fid'] = (pos, fid)
        entry['offset'] = offset
        entry['length'] = length

    self.ncx_records = NCXIndex(entries)()
def create_guide(self):
    # Build the guide index records from the OEB guide, resolving each
    # reference's href to a (chunk, offset) pair. Also records the start
    # reading position (self.start_offset) if a start reference is found.
    self.start_offset = None
    self.guide_table = []
    self.guide_records = []
    GuideRef = namedtuple('GuideRef', 'title type pos_fid')
    for key in self.oeb.guide:
        ref = self.oeb.guide[key]
        path, frag = ref.href.partition('#')[0::2]
        aid = self.id_map.get((path, frag), None)
        if aid is None:
            aid = self.id_map.get((path, ''))
        if aid is None:
            # Reference points nowhere we can resolve; drop it
            continue
        pos, fid = self.aid_offset_map[aid]
        if is_guide_ref_start(ref):
            self.start_offset = pos
        title = ref.title or _('Unknown')
        self.guide_table.append(GuideRef(title, ref.type, (pos, fid)))
    if self.guide_table:
        self.guide_records = GuideIndex(self.guide_table)()

View File

@ -359,14 +359,14 @@ class Chunker(object):
if pos_fid is None: if pos_fid is None:
raise ValueError('Could not find chunk for aid: %r'% raise ValueError('Could not find chunk for aid: %r'%
match.group(1)) match.group(1))
aid_map[match.group(1)] = (to_base(chunk.sequence_number, aid_map[match.group(1)] = pos_fid
base=32, min_num_digits=4),
to_href(offset-chunk.insert_pos))
self.aid_offset_map = aid_map self.aid_offset_map = aid_map
def to_placeholder(aid): def to_placeholder(aid):
return bytes(':'.join(aid_map[aid])) pos, fid = aid_map[aid]
pos, fid = to_base(pos, min_num_digits=4), to_href(fid)
return bytes(':'.join((pos, fid)))
placeholder_map = {bytes(k):to_placeholder(v) for k, v in placeholder_map = {bytes(k):to_placeholder(v) for k, v in
self.placeholder_map.iteritems()} self.placeholder_map.iteritems()}