KF8 Output: Create NCX and Guide records

2025-08-11 09:13:57 -04:00 · 2012-04-22 10:17:06 +05:30 · 2012-04-22 10:17:06 +05:30 · e4a55aae56
commit e4a55aae56
parent 0db1fcb103
5 changed files with 208 additions and 26 deletions
--- a/src/calibre/ebooks/mobi/utils.py
+++ b/src/calibre/ebooks/mobi/utils.py
@@ -599,4 +599,8 @@ class CNCX(object): # {{{

 # }}}

+def is_guide_ref_start(ref):
+    return (ref.title.lower() == 'start' or
+            (ref.type and ref.type.lower() in {'start',
+                    'other.start', 'text'}))

--- a/src/calibre/ebooks/mobi/writer2/serializer.py
+++ b/src/calibre/ebooks/mobi/writer2/serializer.py
@@ -12,6 +12,7 @@ import re
 from calibre.ebooks.oeb.base import (OEB_DOCS, XHTML, XHTML_NS, XML_NS,
        namespace, prefixname, urlnormalize)
 from calibre.ebooks.mobi.mobiml import MBP_NS
+from calibre.ebooks.mobi.utils import is_guide_ref_start

 from collections import defaultdict
 from urlparse import urldefrag
@@ -161,9 +162,7 @@ class Serializer(object):
                buf.write(b'title="')
                self.serialize_text(ref.title, quot=True)
                buf.write(b'" ')
-                if (ref.title.lower() == 'start' or
-                    (ref.type and ref.type.lower() in {'start',
-                        'other.start', 'text'})):
+                if is_guide_ref_start(ref):
                    self._start_href = ref.href
            self.serialize_href(ref.href)
            # Space required or won't work, I kid you not
--- a/src/calibre/ebooks/mobi/writer8/index.py
+++ b/src/calibre/ebooks/mobi/writer8/index.py
@@ -15,9 +15,10 @@ from io import BytesIO
 from calibre.ebooks.mobi.utils import CNCX, encint, align_block
 from calibre.ebooks.mobi.writer8.header import Header

-TagMeta = namedtuple('TagMeta',
+TagMeta_ = namedtuple('TagMeta',
        'name number values_per_entry bitmask end_flag')
-EndTagTable = TagMeta('eof', 0, 0, 0, 1)
+TagMeta = lambda x:TagMeta_(*x)
+EndTagTable = TagMeta(('eof', 0, 0, 0, 1))

 # map of mask to number of shifts needed, works with 1 bit and two-bit wide masks
 # could also be extended to 4 bit wide ones as well
@@ -118,7 +119,10 @@ class Index(object): # {{{
                    cbs.append(ans)
                    ans = 0
                    continue
-                nvals = len(tags.get(name, ()))
+                try:
+                    nvals = len(tags.get(name, ()))
+                except TypeError:
+                    nvals = 1
                nentries = nvals // vpe
                shifts = mask_to_bit_shifts[mask]
                ans |= mask & (nentries << shifts)
@@ -132,36 +136,51 @@ class Index(object): # {{{
                self.entries)

        rendered_entries = []
-        offset = 0
        index, idxt, buf = BytesIO(), BytesIO(), BytesIO()
        IndexEntry = namedtuple('IndexEntry', 'offset length raw')
+        last_lead_text = b''
+        too_large = ValueError('Index has too many entries, calibre does not'
+                    ' support generating multiple index records at this'
+                    ' time.')
+
        for i, x in enumerate(self.entries):
            control_bytes = self.control_bytes[i]
            leading_text, tags = x
-            buf.truncate(0)
+            buf.seek(0), buf.truncate(0)
+            leading_text = (leading_text.encode('utf-8') if
+                    isinstance(leading_text, unicode) else leading_text)
            raw = bytearray(leading_text)
            raw.insert(0, len(leading_text))
            buf.write(bytes(raw))
-            buf.write(control_bytes)
+            buf.write(bytes(bytearray(control_bytes)))
            for tag in self.tag_types:
                values = tags.get(tag.name, None)
+                if values is None: continue
+                try:
+                    len(values)
+                except TypeError:
+                    values = [values]
                if values:
                    for val in values:
-                        buf.write(encint(val))
+                        try:
+                            buf.write(encint(val))
+                        except ValueError:
+                            raise ValueError('Invalid values for %r: %r'%(
+                                tag, values))
            raw = buf.getvalue()
+            offset = index.tell()
+            if offset + self.HEADER_LENGTH >= 0x10000:
+                raise too_large
            rendered_entries.append(IndexEntry(offset, len(raw), raw))
            idxt.write(pack(b'>H', self.HEADER_LENGTH+offset))
-            offset += len(raw)
            index.write(raw)
+            last_lead_text = leading_text

        index_block = align_block(index.getvalue())
        idxt_block = align_block(b'IDXT' + idxt.getvalue())
        body = index_block + idxt_block
        if len(body) + self.HEADER_LENGTH >= 0x10000:
-            raise ValueError('Index has too many entries, calibre does not'
-                    ' support generating multiple index records at this'
-                    ' time.')
-
+            raise too_large
        header = b'INDX'
        buf.truncate(0)
        buf.write(pack(b'>I', self.HEADER_LENGTH))
@@ -185,10 +204,15 @@ class Index(object): # {{{
        tagx = self.generate_tagx()
        idxt = (b'IDXT' + pack(b'>H', IndexHeader.HEADER_LENGTH + len(tagx)) +
                b'\0')
+        # Last index
+        idx = bytes(bytearray([len(last_lead_text)])) + last_lead_text
+        idx += pack(b'>H', len(rendered_entries))
+
        header = {
                'num_of_entries': len(rendered_entries),
                'num_of_cncx': len(self.cncx),
                'tagx':tagx,
+                'last_index':align_block(idx),
                'idxt':idxt
        }
        header = IndexHeader()(**header)
@@ -235,6 +259,74 @@ class ChunkIndex(Index):
                    'file_number':c.file_number,
                    'sequence_number':c.sequence_number,
                    'geometry':(c.start_pos, c.length),
-                    }) for s in chunk_table
+                    }) for c in chunk_table
        ]

+class GuideIndex(Index):
+
+    tag_types = tuple(map(TagMeta, (
+        ('title',           1, 1, 1, 0),
+        ('pos_fid',         6, 2, 2, 0),
+        EndTagTable
+    )))
+
+    def __init__(self, guide_table):
+        self.cncx = CNCX(c.title for c in guide_table)
+
+        self.entries = [
+                (r.type, {
+
+                    'title':self.cncx[r.title],
+                    'pos_fid':r.pos_fid,
+                    }) for r in guide_table
+        ]
+
+
+class NCXIndex(Index):
+
+    control_byte_count = 2
+    tag_types = tuple(map(TagMeta, (
+        ('offset',             1, 1, 1, 0),
+        ('length',             2, 1, 2, 0),
+        ('label',              3, 1, 4, 0),
+        ('depth',              4, 1, 8, 0),
+        ('parent',             21, 1, 16, 0),
+        ('first_child',        22, 1, 32, 0),
+        ('last_child',         23, 1, 64, 0),
+        ('pos_fid',            6, 2, 128, 0),
+        EndTagTable,
+        ('image',              69, 1, 1, 0),
+        ('description',        70, 1, 2, 0),
+        ('author',             71, 1, 4, 0),
+        ('caption',            72, 1, 8, 0),
+        ('attribution',        73, 1, 16, 0),
+        EndTagTable
+    )))
+
+    def __init__(self, toc_table):
+        strings = []
+        for entry in toc_table:
+            strings.append(entry['label'])
+            aut = entry.get('author', None)
+            if aut:
+                strings.append(aut)
+            desc = entry.get('description', None)
+            if desc:
+                strings.append(desc)
+        self.cncx = CNCX(strings)
+
+        def to_entry(x):
+            ans = {}
+            for f in ('offset', 'length', 'depth', 'pos_fid', 'parent',
+                    'first_child', 'last_child'):
+                if f in x:
+                    ans[f] = x[f]
+            for f in ('label', 'description', 'author'):
+                if f in x:
+                    ans[f] = self.cncx[x[f]]
+            return ('%02x'%x['index'], ans)
+
+        self.entries = list(map(to_entry, toc_table))
+
+
+
--- a/src/calibre/ebooks/mobi/writer8/main.py
+++ b/src/calibre/ebooks/mobi/writer8/main.py
@@ -17,12 +17,15 @@ import cssutils
 from lxml import etree

 from calibre import isbytestring, force_unicode
-from calibre.ebooks.mobi.utils import create_text_record, to_base
+from calibre.ebooks.mobi.utils import (create_text_record, to_base,
+        is_guide_ref_start)
 from calibre.ebooks.compression.palmdoc import compress_doc
 from calibre.ebooks.oeb.base import (OEB_DOCS, OEB_STYLES, SVG_MIME, XPath,
        extract, XHTML, urlnormalize)
 from calibre.ebooks.oeb.parse_utils import barename
 from calibre.ebooks.mobi.writer8.skeleton import Chunker, aid_able_tags, to_href
+from calibre.ebooks.mobi.writer8.index import (NCXIndex, SkelIndex,
+        ChunkIndex, GuideIndex)

 XML_DOCS = OEB_DOCS | {SVG_MIME}

@@ -38,11 +41,11 @@ class KF8Writer(object):
        self.log.info('Creating KF8 output')
        self.used_images = set()
        self.resources = resources
-        self.dup_data()
        self.flows = [None] # First flow item is reserved for the text
        self.records = []
-        self.fdst_table = []

+        self.log('\tGenerating KF8 markup...')
+        self.dup_data()
        self.replace_resource_links()
        self.extract_css_into_flows()
        self.extract_svg_into_flows()
@@ -52,7 +55,10 @@ class KF8Writer(object):
        # Dump the cloned data as it is no longer needed
        del self._data_cache
        self.create_text_records()
-        self.create_fdst_table()
+        self.log('\tCreating indices...')
+        self.create_fdst_records()
+        self.create_indices()
+        self.create_guide()

    def dup_data(self):
        ''' Duplicate data so that any changes we make to markup/CSS only
@@ -231,7 +237,7 @@ class KF8Writer(object):
        records_size = 0

        if self.compress:
-            self.oeb.logger.info('  Compressing markup content...')
+            self.oeb.logger.info('\tCompressing markup...')

        while text.tell() < self.text_length:
            data, overlap = create_text_record(text)
@@ -252,9 +258,90 @@ class KF8Writer(object):
            self.records.append(b'\x00'*(records_size % 4))
            self.first_non_text_record_idx += 1

-    def create_fdst_table(self):
+    def create_fdst_records(self):
        FDST = namedtuple('Flow', 'start end')
+        entries = []
+        self.fdst_table = []
        for i, flow in enumerate(self.flows):
            start = 0 if i == 0 else self.fdst_table[-1].end
            self.fdst_table.append(FDST(start, start + len(flow)))
+            entries.extend(self.fdst_table[-1])
+        rec = (b'FDST' + pack(b'>LL', len(self.fdst_table), 12) +
+                pack(b'>%dL'%len(entries), *entries))
+        self.fdst_records = [rec]
+
+    def create_indices(self):
+        self.skel_records = SkelIndex(self.skel_table)()
+        self.chunk_records = ChunkIndex(self.chunk_table)()
+        self.ncx_records = []
+        toc = self.oeb.toc
+        max_depth = toc.depth()
+        entries = []
+        is_periodical = self.opts.mobi_periodical
+        if toc.count() < 2:
+            self.log.warn('Document has no ToC, MOBI will have no NCX index')
+            return
+
+        # Flatten the ToC into a depth first list
+        fl = toc.iter() if is_periodical else toc.iterdescendants()
+        for i, item in enumerate(fl):
+            entry = {'index':i, 'depth': max_depth - item.depth() - (0 if
+                is_periodical else 1), 'href':item.href, 'label':(item.title or
+                    _('Unknown'))}
+            entries.append(entry)
+            for child in item:
+                child.ncx_parent = entry
+            p = getattr(item, 'ncx_parent', None)
+            if p is not None:
+                entry['parent'] = p['index']
+            if is_periodical:
+                if item.author:
+                    entry['author'] = item.author
+                if item.description:
+                    entry['description'] = item.description
+
+        for entry in entries:
+            children = [e for e in entries if e.get('parent', -1) == entry['index']]
+            if children:
+                entry['first_child'] = children[0]['index']
+                entry['last_child'] = children[-1]['index']
+            href = entry.pop('href')
+            href, frag = href.partition('#')[0::2]
+            aid = self.id_map.get((href, frag), None)
+            if aid is None:
+                aid = self.id_map.get((href, ''), None)
+            if aid is None:
+                pos, fid = 0, 0
+            else:
+                pos, fid = self.aid_offset_map[aid]
+            chunk = self.chunk_table[pos]
+            offset = chunk.insert_pos + fid
+            length = chunk.length
+            entry['pos_fid'] = (pos, fid)
+            entry['offset'] = offset
+            entry['length'] = length
+
+        self.ncx_records = NCXIndex(entries)()
+
+    def create_guide(self):
+        self.start_offset = None
+        self.guide_table = []
+        self.guide_records = []
+        GuideRef = namedtuple('GuideRef', 'title type pos_fid')
+        for ref in self.oeb.guide:
+            ref = self.oeb.guide[ref]
+            href, frag = ref.href.partition('#')[0::2]
+            aid = self.id_map.get((href, frag), None)
+            if aid is None:
+                aid = self.id_map.get((href, ''))
+            if aid is None:
+                continue
+            pos, fid = self.aid_offset_map[aid]
+            if is_guide_ref_start(ref):
+                self.start_offset = pos
+            self.guide_table.append(GuideRef(ref.title or
+                _('Unknown'), ref.type, (pos, fid)))
+
+        if self.guide_table:
+            self.guide_records = GuideIndex(self.guide_table)()

--- a/src/calibre/ebooks/mobi/writer8/skeleton.py
+++ b/src/calibre/ebooks/mobi/writer8/skeleton.py
@@ -359,14 +359,14 @@ class Chunker(object):
            if pos_fid is None:
                raise ValueError('Could not find chunk for aid: %r'%
                        match.group(1))
-            aid_map[match.group(1)] = (to_base(chunk.sequence_number,
-                                            base=32, min_num_digits=4),
-                                    to_href(offset-chunk.insert_pos))
+            aid_map[match.group(1)] = pos_fid

        self.aid_offset_map = aid_map

        def to_placeholder(aid):
-            return bytes(':'.join(aid_map[aid]))
+            pos, fid = aid_map[aid]
+            pos, fid = to_base(pos, min_num_digits=4), to_href(fid)
+            return bytes(':'.join((pos, fid)))

        placeholder_map = {bytes(k):to_placeholder(v) for k, v in
                self.placeholder_map.iteritems()}