KF8 Output: Create NCX and Guide records

This commit is contained in:
Kovid Goyal 2012-04-22 10:17:06 +05:30
parent 0db1fcb103
commit e4a55aae56
5 changed files with 208 additions and 26 deletions

View File

@ -599,4 +599,8 @@ class CNCX(object): # {{{
# }}} # }}}
def is_guide_ref_start(ref):
    '''
    Return a truthy value if the guide reference *ref* (an object with
    ``title`` and ``type`` attributes) marks the "start reading" location
    of the book.
    '''
    if ref.title.lower() == 'start':
        return True
    return ref.type and ref.type.lower() in {'start', 'other.start', 'text'}

View File

@ -12,6 +12,7 @@ import re
from calibre.ebooks.oeb.base import (OEB_DOCS, XHTML, XHTML_NS, XML_NS, from calibre.ebooks.oeb.base import (OEB_DOCS, XHTML, XHTML_NS, XML_NS,
namespace, prefixname, urlnormalize) namespace, prefixname, urlnormalize)
from calibre.ebooks.mobi.mobiml import MBP_NS from calibre.ebooks.mobi.mobiml import MBP_NS
from calibre.ebooks.mobi.utils import is_guide_ref_start
from collections import defaultdict from collections import defaultdict
from urlparse import urldefrag from urlparse import urldefrag
@ -161,9 +162,7 @@ class Serializer(object):
buf.write(b'title="') buf.write(b'title="')
self.serialize_text(ref.title, quot=True) self.serialize_text(ref.title, quot=True)
buf.write(b'" ') buf.write(b'" ')
if (ref.title.lower() == 'start' or if is_guide_ref_start(ref):
(ref.type and ref.type.lower() in {'start',
'other.start', 'text'})):
self._start_href = ref.href self._start_href = ref.href
self.serialize_href(ref.href) self.serialize_href(ref.href)
# Space required or won't work, I kid you not # Space required or won't work, I kid you not

View File

@ -15,9 +15,10 @@ from io import BytesIO
from calibre.ebooks.mobi.utils import CNCX, encint, align_block from calibre.ebooks.mobi.utils import CNCX, encint, align_block
from calibre.ebooks.mobi.writer8.header import Header from calibre.ebooks.mobi.writer8.header import Header
TagMeta = namedtuple('TagMeta', TagMeta_ = namedtuple('TagMeta',
'name number values_per_entry bitmask end_flag') 'name number values_per_entry bitmask end_flag')
EndTagTable = TagMeta('eof', 0, 0, 0, 1) TagMeta = lambda x:TagMeta_(*x)
EndTagTable = TagMeta(('eof', 0, 0, 0, 1))
# map of mask to number of shifts needed, works with 1 bit and two-bit wide masks # map of mask to number of shifts needed, works with 1 bit and two-bit wide masks
# could also be extended to 4 bit wide ones as well # could also be extended to 4 bit wide ones as well
@ -118,7 +119,10 @@ class Index(object): # {{{
cbs.append(ans) cbs.append(ans)
ans = 0 ans = 0
continue continue
try:
nvals = len(tags.get(name, ())) nvals = len(tags.get(name, ()))
except TypeError:
nvals = 1
nentries = nvals // vpe nentries = nvals // vpe
shifts = mask_to_bit_shifts[mask] shifts = mask_to_bit_shifts[mask]
ans |= mask & (nentries << shifts) ans |= mask & (nentries << shifts)
@ -132,36 +136,51 @@ class Index(object): # {{{
self.entries) self.entries)
rendered_entries = [] rendered_entries = []
offset = 0
index, idxt, buf = BytesIO(), BytesIO(), BytesIO() index, idxt, buf = BytesIO(), BytesIO(), BytesIO()
IndexEntry = namedtuple('IndexEntry', 'offset length raw') IndexEntry = namedtuple('IndexEntry', 'offset length raw')
last_lead_text = b''
too_large = ValueError('Index has too many entries, calibre does not'
' support generating multiple index records at this'
' time.')
for i, x in enumerate(self.entries): for i, x in enumerate(self.entries):
control_bytes = self.control_bytes[i] control_bytes = self.control_bytes[i]
leading_text, tags = x leading_text, tags = x
buf.truncate(0) buf.seek(0), buf.truncate(0)
leading_text = (leading_text.encode('utf-8') if
isinstance(leading_text, unicode) else leading_text)
raw = bytearray(leading_text) raw = bytearray(leading_text)
raw.insert(0, len(leading_text)) raw.insert(0, len(leading_text))
buf.write(bytes(raw)) buf.write(bytes(raw))
buf.write(control_bytes) buf.write(bytes(bytearray(control_bytes)))
for tag in self.tag_types: for tag in self.tag_types:
values = tags.get(tag.name, None) values = tags.get(tag.name, None)
if values is None: continue
try:
len(values)
except TypeError:
values = [values]
if values: if values:
for val in values: for val in values:
try:
buf.write(encint(val)) buf.write(encint(val))
except ValueError:
raise ValueError('Invalid values for %r: %r'%(
tag, values))
raw = buf.getvalue() raw = buf.getvalue()
offset = index.tell()
if offset + self.HEADER_LENGTH >= 0x10000:
raise too_large
rendered_entries.append(IndexEntry(offset, len(raw), raw)) rendered_entries.append(IndexEntry(offset, len(raw), raw))
idxt.write(pack(b'>H', self.HEADER_LENGTH+offset)) idxt.write(pack(b'>H', self.HEADER_LENGTH+offset))
offset += len(raw)
index.write(raw) index.write(raw)
last_lead_text = leading_text
index_block = align_block(index.getvalue()) index_block = align_block(index.getvalue())
idxt_block = align_block(b'IDXT' + idxt.getvalue()) idxt_block = align_block(b'IDXT' + idxt.getvalue())
body = index_block + idxt_block body = index_block + idxt_block
if len(body) + self.HEADER_LENGTH >= 0x10000: if len(body) + self.HEADER_LENGTH >= 0x10000:
raise ValueError('Index has too many entries, calibre does not' raise too_large
' support generating multiple index records at this'
' time.')
header = b'INDX' header = b'INDX'
buf.truncate(0) buf.truncate(0)
buf.write(pack(b'>I', self.HEADER_LENGTH)) buf.write(pack(b'>I', self.HEADER_LENGTH))
@ -185,10 +204,15 @@ class Index(object): # {{{
tagx = self.generate_tagx() tagx = self.generate_tagx()
idxt = (b'IDXT' + pack(b'>H', IndexHeader.HEADER_LENGTH + len(tagx)) + idxt = (b'IDXT' + pack(b'>H', IndexHeader.HEADER_LENGTH + len(tagx)) +
b'\0') b'\0')
# Last index
idx = bytes(bytearray([len(last_lead_text)])) + last_lead_text
idx += pack(b'>H', len(rendered_entries))
header = { header = {
'num_of_entries': len(rendered_entries), 'num_of_entries': len(rendered_entries),
'num_of_cncx': len(self.cncx), 'num_of_cncx': len(self.cncx),
'tagx':tagx, 'tagx':tagx,
'last_index':align_block(idx),
'idxt':idxt 'idxt':idxt
} }
header = IndexHeader()(**header) header = IndexHeader()(**header)
@ -235,6 +259,74 @@ class ChunkIndex(Index):
'file_number':c.file_number, 'file_number':c.file_number,
'sequence_number':c.sequence_number, 'sequence_number':c.sequence_number,
'geometry':(c.start_pos, c.length), 'geometry':(c.start_pos, c.length),
}) for s in chunk_table }) for c in chunk_table
] ]
class GuideIndex(Index):

    # Each guide entry carries its title (stored in the CNCX) and a
    # (chunk, offset) pos_fid pair pointing into the text
    tag_types = tuple(map(TagMeta, (
        ('title', 1, 1, 1, 0),
        ('pos_fid', 6, 2, 2, 0),
        EndTagTable
    )))

    def __init__(self, guide_table):
        # Titles live in the CNCX; entries map the guide reference type to
        # its tag values
        self.cncx = CNCX(ref.title for ref in guide_table)
        entries = []
        for ref in guide_table:
            tags = {
                'title': self.cncx[ref.title],
                'pos_fid': ref.pos_fid,
            }
            entries.append((ref.type, tags))
        self.entries = entries
class NCXIndex(Index):

    control_byte_count = 2
    # Tag layout for NCX entries; the second table (after the first
    # EndTagTable) holds the periodical-only tags
    tag_types = tuple(map(TagMeta, (
        ('offset', 1, 1, 1, 0),
        ('length', 2, 1, 2, 0),
        ('label', 3, 1, 4, 0),
        ('depth', 4, 1, 8, 0),
        ('parent', 21, 1, 16, 0),
        ('first_child', 22, 1, 32, 0),
        ('last_child', 23, 1, 64, 0),
        ('pos_fid', 6, 2, 128, 0),
        EndTagTable,
        ('image', 69, 1, 1, 0),
        ('description', 70, 1, 2, 0),
        ('author', 71, 1, 4, 0),
        ('caption', 72, 1, 8, 0),
        ('attribution', 73, 1, 16, 0),
        EndTagTable
    )))

    def __init__(self, toc_table):
        # Gather every string used by the ToC into the CNCX, preserving the
        # per-entry order: label, then author, then description
        strings = []
        for entry in toc_table:
            strings.append(entry['label'])
            for key in ('author', 'description'):
                val = entry.get(key, None)
                if val:
                    strings.append(val)
        self.cncx = CNCX(strings)

        def to_entry(x):
            # Numeric tags are copied through; string tags are replaced by
            # their offsets into the CNCX
            tags = {}
            for name in ('offset', 'length', 'depth', 'pos_fid', 'parent',
                    'first_child', 'last_child'):
                if name in x:
                    tags[name] = x[name]
            for name in ('label', 'description', 'author'):
                if name in x:
                    tags[name] = self.cncx[x[name]]
            return ('%02x' % x['index'], tags)

        self.entries = [to_entry(x) for x in toc_table]

View File

@ -17,12 +17,15 @@ import cssutils
from lxml import etree from lxml import etree
from calibre import isbytestring, force_unicode from calibre import isbytestring, force_unicode
from calibre.ebooks.mobi.utils import create_text_record, to_base from calibre.ebooks.mobi.utils import (create_text_record, to_base,
is_guide_ref_start)
from calibre.ebooks.compression.palmdoc import compress_doc from calibre.ebooks.compression.palmdoc import compress_doc
from calibre.ebooks.oeb.base import (OEB_DOCS, OEB_STYLES, SVG_MIME, XPath, from calibre.ebooks.oeb.base import (OEB_DOCS, OEB_STYLES, SVG_MIME, XPath,
extract, XHTML, urlnormalize) extract, XHTML, urlnormalize)
from calibre.ebooks.oeb.parse_utils import barename from calibre.ebooks.oeb.parse_utils import barename
from calibre.ebooks.mobi.writer8.skeleton import Chunker, aid_able_tags, to_href from calibre.ebooks.mobi.writer8.skeleton import Chunker, aid_able_tags, to_href
from calibre.ebooks.mobi.writer8.index import (NCXIndex, SkelIndex,
ChunkIndex, GuideIndex)
XML_DOCS = OEB_DOCS | {SVG_MIME} XML_DOCS = OEB_DOCS | {SVG_MIME}
@ -38,11 +41,11 @@ class KF8Writer(object):
self.log.info('Creating KF8 output') self.log.info('Creating KF8 output')
self.used_images = set() self.used_images = set()
self.resources = resources self.resources = resources
self.dup_data()
self.flows = [None] # First flow item is reserved for the text self.flows = [None] # First flow item is reserved for the text
self.records = [] self.records = []
self.fdst_table = []
self.log('\tGenerating KF8 markup...')
self.dup_data()
self.replace_resource_links() self.replace_resource_links()
self.extract_css_into_flows() self.extract_css_into_flows()
self.extract_svg_into_flows() self.extract_svg_into_flows()
@ -52,7 +55,10 @@ class KF8Writer(object):
# Dump the cloned data as it is no longer needed # Dump the cloned data as it is no longer needed
del self._data_cache del self._data_cache
self.create_text_records() self.create_text_records()
self.create_fdst_table() self.log('\tCreating indices...')
self.create_fdst_records()
self.create_indices()
self.create_guide()
def dup_data(self): def dup_data(self):
''' Duplicate data so that any changes we make to markup/CSS only ''' Duplicate data so that any changes we make to markup/CSS only
@ -231,7 +237,7 @@ class KF8Writer(object):
records_size = 0 records_size = 0
if self.compress: if self.compress:
self.oeb.logger.info(' Compressing markup content...') self.oeb.logger.info('\tCompressing markup...')
while text.tell() < self.text_length: while text.tell() < self.text_length:
data, overlap = create_text_record(text) data, overlap = create_text_record(text)
@ -252,9 +258,90 @@ class KF8Writer(object):
self.records.append(b'\x00'*(records_size % 4)) self.records.append(b'\x00'*(records_size % 4))
self.first_non_text_record_idx += 1 self.first_non_text_record_idx += 1
def create_fdst_table(self): def create_fdst_records(self):
FDST = namedtuple('Flow', 'start end') FDST = namedtuple('Flow', 'start end')
entries = []
self.fdst_table = []
for i, flow in enumerate(self.flows): for i, flow in enumerate(self.flows):
start = 0 if i == 0 else self.fdst_table[-1].end start = 0 if i == 0 else self.fdst_table[-1].end
self.fdst_table.append(FDST(start, start + len(flow))) self.fdst_table.append(FDST(start, start + len(flow)))
entries.extend(self.fdst_table[-1])
rec = (b'FDST' + pack(b'>LL', len(self.fdst_table), 12) +
pack(b'>%dL'%len(entries), *entries))
self.fdst_records = [rec]
def create_indices(self):
    # Build the SKEL, chunk and NCX index records for the KF8 output.
    # Sets self.skel_records, self.chunk_records and self.ncx_records.
    self.skel_records = SkelIndex(self.skel_table)()
    self.chunk_records = ChunkIndex(self.chunk_table)()
    self.ncx_records = []
    toc = self.oeb.toc
    max_depth = toc.depth()
    entries = []
    is_periodical = self.opts.mobi_periodical
    if toc.count() < 2:
        # Nothing useful to index; leave ncx_records empty
        self.log.warn('Document has no ToC, MOBI will have no NCX index')
        return
    # Flatten the ToC into a depth first list
    # (periodicals include the root node, books skip it, hence the depth
    # adjustment below)
    fl = toc.iter() if is_periodical else toc.iterdescendants()
    for i, item in enumerate(fl):
        entry = {'index':i, 'depth': max_depth - item.depth() - (0 if
            is_periodical else 1), 'href':item.href, 'label':(item.title or
            _('Unknown'))}
        entries.append(entry)
        # Record this entry as the parent of its direct children so the
        # parent index can be looked up when the child is visited later
        for child in item:
            child.ncx_parent = entry
        p = getattr(item, 'ncx_parent', None)
        if p is not None:
            entry['parent'] = p['index']
        if is_periodical:
            # Author/description tags are only emitted for periodicals
            if item.author:
                entry['author'] = item.author
            if item.description:
                entry['description'] = item.description
    for entry in entries:
        # Fill in first_child/last_child now that all indices are known
        children = [e for e in entries if e.get('parent', -1) == entry['index']]
        if children:
            entry['first_child'] = children[0]['index']
            entry['last_child'] = children[-1]['index']
        # Resolve the destination anchor to a (chunk, offset) pair; fall
        # back to the start of the file part, then to chunk 0
        href = entry.pop('href')
        href, frag = href.partition('#')[0::2]
        aid = self.id_map.get((href, frag), None)
        if aid is None:
            aid = self.id_map.get((href, ''), None)
        if aid is None:
            pos, fid = 0, 0
        else:
            pos, fid = self.aid_offset_map[aid]
        chunk = self.chunk_table[pos]
        offset = chunk.insert_pos + fid
        length = chunk.length
        entry['pos_fid'] = (pos, fid)
        entry['offset'] = offset
        entry['length'] = length

    self.ncx_records = NCXIndex(entries)()
def create_guide(self):
    # Build the guide index records from the OEB guide, resolving each
    # reference's href to a (chunk, offset) pair. Also records the start
    # reading position (self.start_offset) if a start reference is found.
    self.start_offset = None
    self.guide_table = []
    self.guide_records = []
    GuideRef = namedtuple('GuideRef', 'title type pos_fid')
    for key in self.oeb.guide:
        ref = self.oeb.guide[key]
        path, frag = ref.href.partition('#')[0::2]
        aid = self.id_map.get((path, frag), None)
        if aid is None:
            aid = self.id_map.get((path, ''))
        if aid is None:
            # Reference points nowhere we can resolve; drop it
            continue
        pos, fid = self.aid_offset_map[aid]
        if is_guide_ref_start(ref):
            self.start_offset = pos
        title = ref.title or _('Unknown')
        self.guide_table.append(GuideRef(title, ref.type, (pos, fid)))
    if self.guide_table:
        self.guide_records = GuideIndex(self.guide_table)()

View File

@ -359,14 +359,14 @@ class Chunker(object):
if pos_fid is None: if pos_fid is None:
raise ValueError('Could not find chunk for aid: %r'% raise ValueError('Could not find chunk for aid: %r'%
match.group(1)) match.group(1))
aid_map[match.group(1)] = (to_base(chunk.sequence_number, aid_map[match.group(1)] = pos_fid
base=32, min_num_digits=4),
to_href(offset-chunk.insert_pos))
self.aid_offset_map = aid_map self.aid_offset_map = aid_map
def to_placeholder(aid): def to_placeholder(aid):
return bytes(':'.join(aid_map[aid])) pos, fid = aid_map[aid]
pos, fid = to_base(pos, min_num_digits=4), to_href(fid)
return bytes(':'.join((pos, fid)))
placeholder_map = {bytes(k):to_placeholder(v) for k, v in placeholder_map = {bytes(k):to_placeholder(v) for k, v in
self.placeholder_map.iteritems()} self.placeholder_map.iteritems()}