From e4a55aae564adfa92bcef668f020982b82a38aab Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sun, 22 Apr 2012 10:17:06 +0530
Subject: [PATCH] KF8 Output: Create NCX and Guide records

---
 src/calibre/ebooks/mobi/utils.py              |   4 +
 src/calibre/ebooks/mobi/writer2/serializer.py |   5 +-
 src/calibre/ebooks/mobi/writer8/index.py      | 118 ++++++++++++++++--
 src/calibre/ebooks/mobi/writer8/main.py       |  99 ++++++++++++++-
 src/calibre/ebooks/mobi/writer8/skeleton.py   |   8 +-
 5 files changed, 208 insertions(+), 26 deletions(-)

diff --git a/src/calibre/ebooks/mobi/utils.py b/src/calibre/ebooks/mobi/utils.py
index aa59ee2217..3b8ce61ba8 100644
--- a/src/calibre/ebooks/mobi/utils.py
+++ b/src/calibre/ebooks/mobi/utils.py
@@ -599,4 +599,8 @@ class CNCX(object): # {{{
 
 # }}}
 
 
+def is_guide_ref_start(ref):
+    return (ref.title.lower() == 'start' or
+            (ref.type and ref.type.lower() in {'start',
+                'other.start', 'text'}))
diff --git a/src/calibre/ebooks/mobi/writer2/serializer.py b/src/calibre/ebooks/mobi/writer2/serializer.py
index d8d63bcff4..2dda657a93 100644
--- a/src/calibre/ebooks/mobi/writer2/serializer.py
+++ b/src/calibre/ebooks/mobi/writer2/serializer.py
@@ -12,6 +12,7 @@ import re
 from calibre.ebooks.oeb.base import (OEB_DOCS, XHTML, XHTML_NS, XML_NS,
         namespace, prefixname, urlnormalize)
 from calibre.ebooks.mobi.mobiml import MBP_NS
+from calibre.ebooks.mobi.utils import is_guide_ref_start
 
 from collections import defaultdict
 from urlparse import urldefrag
@@ -161,9 +162,7 @@ class Serializer(object):
                 buf.write(b'title="')
                 self.serialize_text(ref.title, quot=True)
                 buf.write(b'" ')
-                if (ref.title.lower() == 'start' or
-                        (ref.type and ref.type.lower() in {'start',
-                            'other.start', 'text'})):
+                if is_guide_ref_start(ref):
                     self._start_href = ref.href
             self.serialize_href(ref.href)
             # Space required or won't work, I kid you not
diff --git a/src/calibre/ebooks/mobi/writer8/index.py b/src/calibre/ebooks/mobi/writer8/index.py
index 153e140b06..1cf9f02d4b 100644
--- a/src/calibre/ebooks/mobi/writer8/index.py
+++ b/src/calibre/ebooks/mobi/writer8/index.py
@@ -15,9 +15,10 @@ from io import BytesIO
 from calibre.ebooks.mobi.utils import CNCX, encint, align_block
 from calibre.ebooks.mobi.writer8.header import Header
 
-TagMeta = namedtuple('TagMeta',
+TagMeta_ = namedtuple('TagMeta',
         'name number values_per_entry bitmask end_flag')
-EndTagTable = TagMeta('eof', 0, 0, 0, 1)
+TagMeta = lambda x:TagMeta_(*x)
+EndTagTable = TagMeta(('eof', 0, 0, 0, 1))
 
 # map of mask to number of shifts needed, works with 1 bit and two-bit wide masks
 # could also be extended to 4 bit wide ones as well
@@ -118,7 +119,10 @@ class Index(object): # {{{
                     cbs.append(ans)
                     ans = 0
                     continue
-                nvals = len(tags.get(name, ()))
+                try:
+                    nvals = len(tags.get(name, ()))
+                except TypeError:
+                    nvals = 1
                 nentries = nvals // vpe
                 shifts = mask_to_bit_shifts[mask]
                 ans |= mask & (nentries << shifts)
@@ -132,36 +136,51 @@ class Index(object): # {{{
                 self.entries)
 
         rendered_entries = []
-        offset = 0
         index, idxt, buf = BytesIO(), BytesIO(), BytesIO()
         IndexEntry = namedtuple('IndexEntry', 'offset length raw')
+        last_lead_text = b''
+        too_large = ValueError('Index has too many entries, calibre does not'
+                ' support generating multiple index records at this'
+                ' time.')
+
         for i, x in enumerate(self.entries):
             control_bytes = self.control_bytes[i]
             leading_text, tags = x
-            buf.truncate(0)
+            buf.seek(0), buf.truncate(0)
+            leading_text = (leading_text.encode('utf-8') if
+                    isinstance(leading_text, unicode) else leading_text)
             raw = bytearray(leading_text)
             raw.insert(0, len(leading_text))
             buf.write(bytes(raw))
-            buf.write(control_bytes)
+            buf.write(bytes(bytearray(control_bytes)))
             for tag in self.tag_types:
                 values = tags.get(tag.name, None)
+                if values is None: continue
+                try:
+                    len(values)
+                except TypeError:
+                    values = [values]
                 if values:
                     for val in values:
-                        buf.write(encint(val))
+                        try:
+                            buf.write(encint(val))
+                        except ValueError:
+                            raise ValueError('Invalid values for %r: %r'%(
+                                tag, values))
             raw = buf.getvalue()
+            offset = index.tell()
+            if offset + self.HEADER_LENGTH >= 0x10000:
+                raise too_large
             rendered_entries.append(IndexEntry(offset, len(raw), raw))
             idxt.write(pack(b'>H', self.HEADER_LENGTH+offset))
-            offset += len(raw)
             index.write(raw)
+            last_lead_text = leading_text
 
         index_block = align_block(index.getvalue())
         idxt_block = align_block(b'IDXT' + idxt.getvalue())
         body = index_block + idxt_block
         if len(body) + self.HEADER_LENGTH >= 0x10000:
-            raise ValueError('Index has too many entries, calibre does not'
-                ' support generating multiple index records at this'
-                ' time.')
-
+            raise too_large
         header = b'INDX'
         buf.truncate(0)
         buf.write(pack(b'>I', self.HEADER_LENGTH))
@@ -185,10 +204,15 @@ class Index(object): # {{{
         tagx = self.generate_tagx()
         idxt = (b'IDXT' + pack(b'>H', IndexHeader.HEADER_LENGTH +
             len(tagx)) + b'\0')
+        # Last index
+        idx = bytes(bytearray([len(last_lead_text)])) + last_lead_text
+        idx += pack(b'>H', len(rendered_entries))
+
         header = {
                 'num_of_entries': len(rendered_entries),
                 'num_of_cncx': len(self.cncx),
                 'tagx':tagx,
+                'last_index':align_block(idx),
                 'idxt':idxt
         }
         header = IndexHeader()(**header)
@@ -235,6 +259,74 @@ class ChunkIndex(Index):
                     'file_number':c.file_number,
                     'sequence_number':c.sequence_number,
                     'geometry':(c.start_pos, c.length),
-                }) for s in chunk_table
+                }) for c in chunk_table
         ]
 
+class GuideIndex(Index):
+
+    tag_types = tuple(map(TagMeta, (
+        ('title', 1, 1, 1, 0),
+        ('pos_fid', 6, 2, 2, 0),
+        EndTagTable
+    )))
+
+    def __init__(self, guide_table):
+        self.cncx = CNCX(c.title for c in guide_table)
+
+        self.entries = [
+                (r.type, {
+
+                    'title':self.cncx[r.title],
+                    'pos_fid':r.pos_fid,
+                }) for r in guide_table
+        ]
+
+
+class NCXIndex(Index):
+
+    control_byte_count = 2
+    tag_types = tuple(map(TagMeta, (
+        ('offset', 1, 1, 1, 0),
+        ('length', 2, 1, 2, 0),
+        ('label', 3, 1, 4, 0),
+        ('depth', 4, 1, 8, 0),
+        ('parent', 21, 1, 16, 0),
+        ('first_child', 22, 1, 32, 0),
+        ('last_child', 23, 1, 64, 0),
+        ('pos_fid', 6, 2, 128, 0),
+        EndTagTable,
+        ('image', 69, 1, 1, 0),
+        ('description', 70, 1, 2, 0),
+        ('author', 71, 1, 4, 0),
+        ('caption', 72, 1, 8, 0),
+        ('attribution', 73, 1, 16, 0),
+        EndTagTable
+    )))
+
+    def __init__(self, toc_table):
+        strings = []
+        for entry in toc_table:
+            strings.append(entry['label'])
+            aut = entry.get('author', None)
+            if aut:
+                strings.append(aut)
+            desc = entry.get('description', None)
+            if desc:
+                strings.append(desc)
+        self.cncx = CNCX(strings)
+
+        def to_entry(x):
+            ans = {}
+            for f in ('offset', 'length', 'depth', 'pos_fid', 'parent',
+                    'first_child', 'last_child'):
+                if f in x:
+                    ans[f] = x[f]
+            for f in ('label', 'description', 'author'):
+                if f in x:
+                    ans[f] = self.cncx[x[f]]
+            return ('%02x'%x['index'], ans)
+
+        self.entries = list(map(to_entry, toc_table))
+
+
+
diff --git a/src/calibre/ebooks/mobi/writer8/main.py b/src/calibre/ebooks/mobi/writer8/main.py
index 955fbab460..76492cb9a9 100644
--- a/src/calibre/ebooks/mobi/writer8/main.py
+++ b/src/calibre/ebooks/mobi/writer8/main.py
@@ -17,12 +17,15 @@ import cssutils
 from lxml import etree
 
 from calibre import isbytestring, force_unicode
-from calibre.ebooks.mobi.utils import create_text_record, to_base
+from calibre.ebooks.mobi.utils import (create_text_record, to_base,
+        is_guide_ref_start)
 from calibre.ebooks.compression.palmdoc import compress_doc
 from calibre.ebooks.oeb.base import (OEB_DOCS, OEB_STYLES, SVG_MIME, XPath,
         extract, XHTML, urlnormalize)
 from calibre.ebooks.oeb.parse_utils import barename
 from calibre.ebooks.mobi.writer8.skeleton import Chunker, aid_able_tags, to_href
+from calibre.ebooks.mobi.writer8.index import (NCXIndex, SkelIndex,
+        ChunkIndex, GuideIndex)
 
 XML_DOCS = OEB_DOCS | {SVG_MIME}
 
@@ -38,11 +41,11 @@ class KF8Writer(object):
         self.log.info('Creating KF8 output')
         self.used_images = set()
         self.resources = resources
-        self.dup_data()
         self.flows = [None] # First flow item is reserved for the text
         self.records = []
-        self.fdst_table = []
 
+        self.log('\tGenerating KF8 markup...')
+        self.dup_data()
         self.replace_resource_links()
         self.extract_css_into_flows()
         self.extract_svg_into_flows()
@@ -52,7 +55,10 @@ class KF8Writer(object):
         # Dump the cloned data as it is no longer needed
         del self._data_cache
         self.create_text_records()
-        self.create_fdst_table()
+        self.log('\tCreating indices...')
+        self.create_fdst_records()
+        self.create_indices()
+        self.create_guide()
 
     def dup_data(self):
         ''' Duplicate data so that any changes we make to markup/CSS only
@@ -231,7 +237,7 @@ class KF8Writer(object):
         records_size = 0
 
         if self.compress:
-            self.oeb.logger.info(' Compressing markup content...')
+            self.oeb.logger.info('\tCompressing markup...')
 
         while text.tell() < self.text_length:
             data, overlap = create_text_record(text)
@@ -252,9 +258,90 @@ class KF8Writer(object):
             self.records.append(b'\x00'*(records_size % 4))
             self.first_non_text_record_idx += 1
 
-    def create_fdst_table(self):
+    def create_fdst_records(self):
         FDST = namedtuple('Flow', 'start end')
+        entries = []
+        self.fdst_table = []
         for i, flow in enumerate(self.flows):
             start = 0 if i == 0 else self.fdst_table[-1].end
             self.fdst_table.append(FDST(start, start + len(flow)))
+            entries.extend(self.fdst_table[-1])
+        rec = (b'FDST' + pack(b'>LL', len(self.fdst_table), 12) +
+                pack(b'>%dL'%len(entries), *entries))
+        self.fdst_records = [rec]
+
+    def create_indices(self):
+        self.skel_records = SkelIndex(self.skel_table)()
+        self.chunk_records = ChunkIndex(self.chunk_table)()
+        self.ncx_records = []
+        toc = self.oeb.toc
+        max_depth = toc.depth()
+        entries = []
+        is_periodical = self.opts.mobi_periodical
+        if toc.count() < 2:
+            self.log.warn('Document has no ToC, MOBI will have no NCX index')
+            return
+
+        # Flatten the ToC into a depth first list
+        fl = toc.iter() if is_periodical else toc.iterdescendants()
+        for i, item in enumerate(fl):
+            entry = {'index':i, 'depth': max_depth - item.depth() - (0 if
+                is_periodical else 1), 'href':item.href, 'label':(item.title or
+                    _('Unknown'))}
+            entries.append(entry)
+            for child in item:
+                child.ncx_parent = entry
+            p = getattr(item, 'ncx_parent', None)
+            if p is not None:
+                entry['parent'] = p['index']
+            if is_periodical:
+                if item.author:
+                    entry['author'] = item.author
+                if item.description:
+                    entry['description'] = item.description
+
+        for entry in entries:
+            children = [e for e in entries if e.get('parent', -1) == entry['index']]
+            if children:
+                entry['first_child'] = children[0]['index']
+                entry['last_child'] = children[-1]['index']
+            href = entry.pop('href')
+            href, frag = href.partition('#')[0::2]
+            aid = self.id_map.get((href, frag), None)
+            if aid is None:
+                aid = self.id_map.get((href, ''), None)
+            if aid is None:
+                pos, fid = 0, 0
+            else:
+                pos, fid = self.aid_offset_map[aid]
+            chunk = self.chunk_table[pos]
+            offset = chunk.insert_pos + fid
+            length = chunk.length
+            entry['pos_fid'] = (pos, fid)
+            entry['offset'] = offset
+            entry['length'] = length
+
+        self.ncx_records = NCXIndex(entries)()
+
+    def create_guide(self):
+        self.start_offset = None
+        self.guide_table = []
+        self.guide_records = []
+        GuideRef = namedtuple('GuideRef', 'title type pos_fid')
+        for ref in self.oeb.guide:
+            ref = self.oeb.guide[ref]
+            href, frag = ref.href.partition('#')[0::2]
+            aid = self.id_map.get((href, frag), None)
+            if aid is None:
+                aid = self.id_map.get((href, ''))
+            if aid is None:
+                continue
+            pos, fid = self.aid_offset_map[aid]
+            if is_guide_ref_start(ref):
+                self.start_offset = pos
+            self.guide_table.append(GuideRef(ref.title or
+                _('Unknown'), ref.type, (pos, fid)))
+
+        if self.guide_table:
+            self.guide_records = GuideIndex(self.guide_table)()
 
diff --git a/src/calibre/ebooks/mobi/writer8/skeleton.py b/src/calibre/ebooks/mobi/writer8/skeleton.py
index 8f0a3795db..398c684e43 100644
--- a/src/calibre/ebooks/mobi/writer8/skeleton.py
+++ b/src/calibre/ebooks/mobi/writer8/skeleton.py
@@ -359,14 +359,14 @@ class Chunker(object):
             if pos_fid is None:
                 raise ValueError('Could not find chunk for aid: %r'%
                         match.group(1))
-            aid_map[match.group(1)] = (to_base(chunk.sequence_number,
-                    base=32, min_num_digits=4),
-                    to_href(offset-chunk.insert_pos))
+            aid_map[match.group(1)] = pos_fid
 
         self.aid_offset_map = aid_map
 
         def to_placeholder(aid):
-            return bytes(':'.join(aid_map[aid]))
+            pos, fid = aid_map[aid]
+            pos, fid = to_base(pos, min_num_digits=4), to_href(fid)
+            return bytes(':'.join((pos, fid)))
 
         placeholder_map = {bytes(k):to_placeholder(v) for k, v in
                 self.placeholder_map.iteritems()}
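
Note for reviewers (not part of the patch): a minimal sketch of the data shapes the new index classes consume, inferred from create_indices() and create_guide() above. The literal values ('Chapter One', 'toc', the offsets) are made up purely for illustration.

# Illustrative only -- shapes inferred from create_indices()/create_guide().
from collections import namedtuple

# NCXIndex takes a flat list of dicts, one per ToC node, in depth-first order.
# 'index' is the node's position in that flattened list and 'pos_fid' is
# (chunk sequence number, offset of the anchor within that chunk).
toc_entry = {
    'index': 0, 'depth': 0, 'label': 'Chapter One',
    'offset': 1024, 'length': 5000, 'pos_fid': (0, 0),
    # optional keys: 'parent', 'first_child', 'last_child',
    # plus 'author' and 'description' for periodicals
}

# GuideIndex takes GuideRef tuples (title, type, pos_fid), as built in
# create_guide() from the OEB <guide> entries.
GuideRef = namedtuple('GuideRef', 'title type pos_fid')
guide_ref = GuideRef('Table of Contents', 'toc', (0, 0))

# Each Index subclass is instantiated with its table and then called to
# render the INDX record bytes, as in KF8Writer.create_indices/create_guide:
#     ncx_records = NCXIndex([toc_entry])()
#     guide_records = GuideIndex([guide_ref])()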