KF8 Output: Start work on the index layer

This commit is contained in:
Kovid Goyal 2012-04-20 22:39:32 +05:30
parent 6c631e0e64
commit 081897ae57
3 changed files with 132 additions and 41 deletions

View File

@@ -9,6 +9,7 @@ __docformat__ = 'restructuredtext en'
import struct, string, imghdr, zlib, os import struct, string, imghdr, zlib, os
from collections import OrderedDict from collections import OrderedDict
from io import BytesIO
from calibre.utils.magick.draw import Image, save_cover_data_to, thumbnail from calibre.utils.magick.draw import Image, save_cover_data_to, thumbnail
from calibre.ebooks import normalize from calibre.ebooks import normalize
@@ -549,3 +550,48 @@ def create_text_record(text):
return data, overlap return data, overlap
class CNCX(object): # {{{

    '''
    Create the CNCX records. These are records containing all the strings from
    an index. Each record is of the form: <vwi string size><utf-8 encoded
    string>
    '''

    #: Strings longer than this are truncated before being stored
    MAX_STRING_LENGTH = 500

    def __init__(self, strings=()):
        '''
        :param strings: An iterable of the strings to store. Duplicates are
            collapsed (first occurrence wins). After construction,
            self.strings maps each string to its CNCX offset
            (record_number * 0x10000 + offset within record) and
            self.records holds the raw record bytes.
        '''
        self.strings = OrderedDict((s, 0) for s in strings)

        self.records = []
        offset = 0
        buf = BytesIO()
        # Iterate over a snapshot of the keys since we mutate the values
        # (works on both Python 2 and 3, unlike iterkeys())
        for key in tuple(self.strings):
            utf8 = utf8_text(key[:self.MAX_STRING_LENGTH])
            l = len(utf8)
            sz_bytes = encint(l)
            raw = sz_bytes + utf8
            if 0xfbf8 - buf.tell() < 6 + len(raw):
                # Records in PDB files cannot be larger than 0x10000, so we
                # stop well before that.
                pad = 0xfbf8 - buf.tell()
                buf.write(b'\0' * pad)
                self.records.append(buf.getvalue())
                # io.BytesIO.truncate() does not move the stream position
                # (unlike cStringIO), so seek back explicitly; otherwise the
                # next write would zero-fill up to the old position.
                buf.seek(0)
                buf.truncate(0)
                offset = len(self.records) * 0x10000
            buf.write(raw)
            self.strings[key] = offset
            offset += len(raw)
        self.records.append(align_block(buf.getvalue()))

    def __getitem__(self, string):
        # Return the CNCX offset previously computed for string
        return self.strings[string]

    def __bool__(self):
        # A CNCX constructed with no strings still has one (empty) record;
        # truthiness reflects whether any records exist at all.
        return bool(self.records)
    __nonzero__ = __bool__
    # }}}

View File

@@ -13,54 +13,21 @@ from cStringIO import StringIO
from collections import OrderedDict, defaultdict from collections import OrderedDict, defaultdict
from calibre.ebooks.mobi.utils import (encint, encode_number_as_hex, from calibre.ebooks.mobi.utils import (encint, encode_number_as_hex,
encode_tbs, align_block, utf8_text, RECORD_SIZE) encode_tbs, align_block, RECORD_SIZE, CNCX as CNCX_)
class CNCX(object): # {{{ class CNCX(CNCX_): # {{{
'''
Create the CNCX records. These are records containing all the strings from
the NCX. Each record is of the form: <vwi string size><utf-8 encoded
string>
'''
MAX_STRING_LENGTH = 500
def __init__(self, toc, is_periodical): def __init__(self, toc, is_periodical):
self.strings = OrderedDict() strings = []
for item in toc.iterdescendants(breadth_first=True): for item in toc.iterdescendants(breadth_first=True):
self.strings[item.title] = 0 strings.append(item.title)
if is_periodical: if is_periodical:
self.strings[item.klass] = 0 strings.append(item.klass)
if item.author: if item.author:
self.strings[item.author] = 0 strings.append(item.author)
if item.description: if item.description:
self.strings[item.description] = 0 strings.append(item.description)
CNCX_.__init__(self, strings)
self.records = []
offset = 0
buf = StringIO()
for key in tuple(self.strings.iterkeys()):
utf8 = utf8_text(key[:self.MAX_STRING_LENGTH])
l = len(utf8)
sz_bytes = encint(l)
raw = sz_bytes + utf8
if 0xfbf8 - buf.tell() < 6 + len(raw):
# Records in PDB files cannot be larger than 0x10000, so we
# stop well before that.
pad = 0xfbf8 - buf.tell()
buf.write(b'\0' * pad)
self.records.append(buf.getvalue())
buf.truncate(0)
offset = len(self.records) * 0x10000
buf.write(raw)
self.strings[key] = offset
offset += len(raw)
self.records.append(align_block(buf.getvalue()))
def __getitem__(self, string):
return self.strings[string]
# }}} # }}}
class TAGX(object): # {{{ class TAGX(object): # {{{

View File

@@ -0,0 +1,78 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
from future_builtins import map
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from collections import namedtuple
from struct import pack
from calibre.ebooks.mobi.utils import CNCX
# Description of one TAGX tag: its name, numeric tag id, how many values an
# index entry stores per occurrence, the bitmask used in the control byte(s),
# and whether it terminates the tag table.
TagMeta = namedtuple('TagMeta',
        ['name', 'number', 'values_per_entry', 'bitmask', 'end_flag'])

# Sentinel that marks the end of every TAGX tag table
EndTagTable = TagMeta(name='eof', number=0, values_per_entry=0, bitmask=0,
        end_flag=1)
class Index(object):

    '''
    Base class for the index layer. Subclasses override tag_types (a tuple
    of TagMeta entries terminated by EndTagTable) and supply entries/cncx
    as needed.
    '''

    control_byte_count = 1
    cncx = CNCX()
    tag_types = (EndTagTable,)

    @classmethod
    def generate_tagx(cls):
        # Each tag contributes its four single-byte fields after the name:
        # number, values_per_entry, bitmask, end_flag.
        table = bytearray()
        for t in cls.tag_types:
            table.extend(t[1:])
        # table length (12 byte header + entries), control byte count
        prefix = b'TAGX' + pack(b'>II', len(table) + 12,
                cls.control_byte_count)
        return prefix + bytes(table)
class SkelIndex(Index):

    '''
    The skeleton index: one entry per skeleton, recording how many chunks
    belong to it and its position/length in the text.
    '''

    # The entries are written out as explicit TagMeta(...) calls; passing
    # the bare tuples through map(TagMeta, ...) would invoke the namedtuple
    # constructor with a single tuple argument and raise TypeError at
    # import time.
    tag_types = (
        TagMeta('chunk_count', 1, 1, 3, 0),
        TagMeta('geometry', 6, 2, 12, 0),
        EndTagTable,
    )

    def __init__(self, skel_table):
        '''
        :param skel_table: An iterable of objects with name, chunk_count,
            start_pos and length attributes.
        '''
        self.entries = [
            (s.name, {
                # Dont ask me why these entries have to be repeated twice
                'chunk_count':(s.chunk_count, s.chunk_count),
                'geometry':(s.start_pos, s.length, s.start_pos, s.length),
            }) for s in skel_table
        ]
class ChunkIndex(Index):

    '''
    The chunk index: one entry per chunk, keyed by the chunk's insert
    position, with the CNCX offset of the chunk's selector string.
    '''

    # Explicit TagMeta(...) calls; map(TagMeta, ...) over bare tuples would
    # call the namedtuple constructor with one tuple argument and raise
    # TypeError at import time.
    tag_types = (
        TagMeta('cncx_offset', 2, 1, 1, 0),
        TagMeta('file_number', 3, 1, 2, 0),
        TagMeta('sequence_number', 4, 1, 4, 0),
        TagMeta('geometry', 6, 2, 8, 0),
        EndTagTable,
    )

    def __init__(self, chunk_table):
        '''
        :param chunk_table: An iterable of objects with selector,
            insert_pos, file_number, sequence_number, start_pos and length
            attributes.
        '''
        self.cncx = CNCX(c.selector for c in chunk_table)

        # Fixed loop variable: the original comprehension iterated
        # ``for s in chunk_table`` while every expression referenced c.
        self.entries = [
            ('%010d'%c.insert_pos, {
                'cncx_offset':self.cncx[c.selector],
                'file_number':c.file_number,
                'sequence_number':c.sequence_number,
                'geometry':(c.start_pos, c.length),
            }) for c in chunk_table
        ]