mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
New MOBI output: Write the TBS sequences for periodicals. Also fully decoded all TBS sequences; the only unknown bits left are in the opening sequence, which seems to depend on the type of record being indexed. The rules are simple, so I just use them instead of spending more time looking for a deeper meaning.
This commit is contained in:
parent
0c5a37fbc0
commit
4270605335
@ -12,7 +12,7 @@ from collections import OrderedDict, defaultdict
|
||||
from calibre.utils.date import utc_tz
|
||||
from calibre.ebooks.mobi.langcodes import main_language, sub_language
|
||||
from calibre.ebooks.mobi.utils import (decode_hex_number, decint,
|
||||
get_trailing_data, decode_fvwi)
|
||||
get_trailing_data, decode_tbs)
|
||||
from calibre.utils.magick.draw import identify_data
|
||||
|
||||
# PalmDB {{{
|
||||
@ -949,20 +949,22 @@ class TBSIndexing(object): # {{{
|
||||
ans.append(('\t\tIndex Entry: %d (Parent index: %d, '
|
||||
'Depth: %d, Offset: %d, Size: %d) [%s]')%(
|
||||
x.index, x.parent_index, x.depth, x.offset, x.size, x.label))
|
||||
def bin3(num):
|
||||
def bin4(num):
|
||||
ans = bin(num)[2:]
|
||||
return '0'*(3-len(ans)) + ans
|
||||
return bytes('0'*(4-len(ans)) + ans)
|
||||
|
||||
def repr_extra(x):
|
||||
return str({bin4(k):v for k, v in extra.iteritems()})
|
||||
|
||||
tbs_type = 0
|
||||
if len(byts):
|
||||
outer, consumed = decint(byts)
|
||||
outermost_index, extra, consumed = decode_tbs(byts)
|
||||
byts = byts[consumed:]
|
||||
tbs_type = outer & 0b111
|
||||
ans.append('TBS Type: %s (%d)'%(bin3(tbs_type), tbs_type))
|
||||
ans.append('Outer Index entry: %d'%(outer >> 3))
|
||||
arg1, consumed = decint(byts)
|
||||
byts = byts[consumed:]
|
||||
ans.append('Unknown (vwi: always 0?): %d'%arg1)
|
||||
for k in extra:
|
||||
tbs_type |= k
|
||||
ans.append('\nTBS: %d (%s)'%(tbs_type, bin4(tbs_type)))
|
||||
ans.append('Outermost index: %d'%outermost_index)
|
||||
ans.append('Unknown extra start bytes: %s'%repr_extra(extra))
|
||||
if self.doc_type in (257, 259): # Hierarchical periodical
|
||||
byts, a = self.interpret_periodical(tbs_type, byts,
|
||||
dat['geom'][0])
|
||||
@ -977,53 +979,21 @@ class TBSIndexing(object): # {{{
|
||||
def interpret_periodical(self, tbs_type, byts, record_offset):
|
||||
ans = []
|
||||
|
||||
def tbs_type_6(byts, psi=None, msg=None, fmsg='Unknown'): # {{{
|
||||
if psi is None:
|
||||
# Assume parent section is 1
|
||||
psi = self.get_index(1)
|
||||
if msg is None:
|
||||
msg = ('Article index at start of record or first article'
|
||||
' index, relative to parent section')
|
||||
if byts:
|
||||
# byts could be empty
|
||||
arg, consumed = decint(byts)
|
||||
byts = byts[consumed:]
|
||||
flags = (arg & 0b1111)
|
||||
ai = (arg >> 4)
|
||||
ans.append('%s (fvwi): %d [%d absolute]'%(msg, ai,
|
||||
ai+psi.index))
|
||||
if flags == 1:
|
||||
arg, consumed = decint(byts)
|
||||
if arg == 0:
|
||||
# EOF of record, otherwise ignore and hope someone else
|
||||
# will deal with these bytes
|
||||
byts = byts[consumed:]
|
||||
ans.append('EOF (vwi: should be 0): %d'%arg)
|
||||
elif flags in (4, 5):
|
||||
num = byts[0]
|
||||
byts = byts[1:]
|
||||
ans.append('Number of article nodes in the record (byte): %d'%num)
|
||||
if flags == 5:
|
||||
arg, consumed = decint(byts)
|
||||
byts = byts[consumed:]
|
||||
ans.append('%s (vwi)): %d'%(fmsg, arg))
|
||||
elif flags == 0:
|
||||
pass
|
||||
else:
|
||||
raise ValueError('Unknown flags: %d'%flags)
|
||||
return byts
|
||||
|
||||
# }}}
|
||||
|
||||
def read_section_transitions(byts, psi=None): # {{{
|
||||
if psi is None:
|
||||
# Assume parent section is 1
|
||||
# Assume previous section is 1
|
||||
psi = self.get_index(1)
|
||||
|
||||
while byts:
|
||||
ai, flags, consumed = decode_fvwi(byts)
|
||||
ai, extra, consumed = decode_tbs(byts)
|
||||
byts = byts[consumed:]
|
||||
if flags & 0b1000:
|
||||
if extra.get(0b0010, None) is not None:
|
||||
raise ValueError('Dont know how to interpret flag 0b0010'
|
||||
' while reading section transitions')
|
||||
if extra.get(0b1000, None) is not None:
|
||||
if len(extra) > 1:
|
||||
raise ValueError('Dont know how to interpret flags'
|
||||
' %r while reading section transitions'%extra)
|
||||
nsi = self.get_index(psi.index+1)
|
||||
ans.append('Last article in this record of section %d'
|
||||
' (relative to next section index [%d]): '
|
||||
@ -1036,113 +1006,57 @@ class TBSIndexing(object): # {{{
|
||||
' (relative to its parent section): '
|
||||
'%d [%d absolute index]'%(psi.index, ai, ai+psi.index))
|
||||
|
||||
if flags == 0:
|
||||
ans.append('The section %d has only one article'
|
||||
' in this record'%psi.index)
|
||||
continue
|
||||
num = extra.get(0b0100, None)
|
||||
if num is None:
|
||||
msg = ('The section %d has at most one article'
|
||||
' in this record')%psi.index
|
||||
else:
|
||||
msg = ('Number of articles in this record of '
|
||||
'section %d: %d')%(psi.index, num)
|
||||
ans.append(msg)
|
||||
|
||||
if flags & 0b0100:
|
||||
num = byts[0]
|
||||
byts = byts[1:]
|
||||
ans.append('Number of articles in this record of '
|
||||
'section %d: %d'%(psi.index, num))
|
||||
|
||||
if flags & 0b0010:
|
||||
raise ValueError(
|
||||
'Dont know how to interpret the 0b0010 flag')
|
||||
|
||||
if flags & 0b0001:
|
||||
arg, consumed = decint(byts)
|
||||
byts = byts[consumed:]
|
||||
offset = extra.get(0b0001, None)
|
||||
if offset is not None:
|
||||
if offset == 0:
|
||||
ans.append('This record is spanned by the article:'
|
||||
'%d'%(ai+psi.index))
|
||||
else:
|
||||
ans.append('->Offset to start of next section (%d) from start'
|
||||
' of record: %d [%d absolute offset]'%(psi.index+1,
|
||||
arg, arg+record_offset))
|
||||
offset, offset+record_offset))
|
||||
return byts
|
||||
# }}}
|
||||
|
||||
if tbs_type == 3: # {{{
|
||||
arg2, consumed = decint(byts)
|
||||
def read_starting_section(byts): # {{{
|
||||
si, extra, consumed = decode_tbs(byts)
|
||||
byts = byts[consumed:]
|
||||
ans.append('Unknown (vwi: always 0?): %d'%arg2)
|
||||
if len(extra) > 1 or 0b0010 in extra or 0b1000 in extra:
|
||||
raise ValueError('Dont know how to interpret flags %r'
|
||||
' when reading starting section'%extra)
|
||||
si = self.get_index(si)
|
||||
ans.append('The section at the start of this record is:'
|
||||
' %d'%si.index)
|
||||
if 0b0100 in extra:
|
||||
num = extra[0b0100]
|
||||
ans.append('The number of articles from the section %d'
|
||||
' in this record: %d'%(si.index, num))
|
||||
elif 0b0001 in extra:
|
||||
eof = extra[0b0001]
|
||||
if eof != 0:
|
||||
raise ValueError('Unknown eof value %s when reading'
|
||||
' starting section'%eof)
|
||||
ans.append('This record is spanned by an article from'
|
||||
' the section: %d'%si.index)
|
||||
return si, byts
|
||||
# }}}
|
||||
|
||||
arg3, consumed = decint(byts)
|
||||
byts = byts[consumed:]
|
||||
fsi = arg3 >> 4
|
||||
flags = arg3 & 0b1111
|
||||
ans.append('First section index (fvwi): %d'%fsi)
|
||||
psi = self.get_index(fsi)
|
||||
ans.append('Flags: %d'%flags)
|
||||
if flags == 4:
|
||||
ans.append('Number of articles in this section: %d'%byts[0])
|
||||
byts = byts[1:]
|
||||
elif flags == 0:
|
||||
pass
|
||||
if tbs_type & 0b0100:
|
||||
# Starting section is the first section
|
||||
ssi = self.get_index(1)
|
||||
else:
|
||||
raise ValueError('Unknown flags value: %d'%flags)
|
||||
byts = read_section_transitions(byts, psi)
|
||||
ssi, byts = read_starting_section(byts)
|
||||
|
||||
# }}}
|
||||
|
||||
elif tbs_type == 7: # {{{
|
||||
# This occurs for records that have no section nodes and
|
||||
# whose parent section's index == 1
|
||||
ans.append('Unknown (maybe vwi?): %r'%bytes(byts[:2]))
|
||||
byts = byts[2:]
|
||||
arg, consumed = decint(byts)
|
||||
byts = byts[consumed:]
|
||||
ai = arg >> 4
|
||||
flags = arg & 0b1111
|
||||
ans.append('Article at start of record (fvwi): %d'%ai)
|
||||
if flags == 4:
|
||||
num = byts[0]
|
||||
byts = byts[1:]
|
||||
ans.append('Number of articles in record (byte): %d'%num)
|
||||
elif flags == 0:
|
||||
pass
|
||||
elif flags == 1:
|
||||
arg, consumed = decint(byts)
|
||||
byts = byts[consumed:]
|
||||
ans.append('EOF (vwi: should be 0): %d'%arg)
|
||||
else:
|
||||
raise ValueError('Unknown flags value: %d'%flags)
|
||||
# }}}
|
||||
|
||||
elif tbs_type == 6: # {{{
|
||||
# This is used for records spanned by an article whose parent
|
||||
# section's index == 1 or for the opening record if it contains the
|
||||
# periodical start, section 1 start and at least one article. The
|
||||
# two cases are distinguished by the flags on the article index
|
||||
# vwi.
|
||||
unk = byts[0]
|
||||
byts = byts[1:]
|
||||
ans.append('Unknown (byte: always 2?): %d'%unk)
|
||||
byts = tbs_type_6(byts)
|
||||
# }}}
|
||||
|
||||
elif tbs_type == 2: # {{{
|
||||
# This occurs for records with no section nodes and whose parent
|
||||
# section's index != 1 (undefined (records before the first
|
||||
# section) or > 1)
|
||||
# This is also used for records that are spanned by an article
|
||||
# whose parent section index > 1. In this case the flags of the
|
||||
# vwi referring to the article at the start
|
||||
# of the record are set to 1 instead of 4.
|
||||
arg, consumed = decint(byts)
|
||||
byts = byts[consumed:]
|
||||
flags = (arg & 0b1111)
|
||||
psi = (arg >> 4)
|
||||
ans.append('Parent section index (fvwi): %d'%psi)
|
||||
psi = self.get_index(psi)
|
||||
ans.append('Flags: %d'%flags)
|
||||
if flags == 1:
|
||||
arg, consumed = decint(byts)
|
||||
byts = byts[consumed:]
|
||||
ans.append('Unknown (vwi?: always 0?): %d'%arg)
|
||||
byts = tbs_type_6(byts, psi=psi)
|
||||
elif flags == 0:
|
||||
byts = tbs_type_6(byts, psi=psi)
|
||||
else:
|
||||
raise ValueError('Unknown flags: %d'%flags)
|
||||
# }}}
|
||||
byts = read_section_transitions(byts, ssi)
|
||||
|
||||
return byts, ans
|
||||
|
||||
|
@ -3,6 +3,20 @@ Reverse engineering the trailing byte sequences for hierarchical periodicals
|
||||
|
||||
In the following, *vwi* means variable width integer and *fvwi* means a vwi whose lowest four bits are used as a flag. All the following information/inferences are from examining the output of kindlegen on a sample periodical. Given the general level of Amazon's incompetence, there are no guarantees that this information is the *best/most complete* way to do TBS indexing.
|
||||
|
||||
Sequence encoding:
|
||||
|
||||
0b1000 : Continuation bit
|
||||
|
||||
First sequences:
|
||||
0b0010 : 80
|
||||
0b0011 : 80 80
|
||||
0b0110 : 80 2
|
||||
0b0111 : 80 2 80
|
||||
|
||||
Other sequences:
|
||||
0b0101 : 4 1a
|
||||
0b0001 : c b1
|
||||
|
||||
Opening record
|
||||
----------------
|
||||
|
||||
@ -52,10 +66,60 @@ The text record that contains the opening node for the periodical (depth=0 node
|
||||
|
||||
If there was only a single article, instead of 2, then the last two bytes would be: c0, i.e. there would be no byte giving the number of articles in the record.
|
||||
|
||||
Starting record with two section transitions::
|
||||
|
||||
Record #1: Starts at: 0 Ends at: 4095
|
||||
Contains: 7 index entries (0 ends, 4 complete, 3 starts)
|
||||
TBS bytes: 86 80 2 c0 b8 c4 3
|
||||
Complete:
|
||||
Index Entry: 1 (Parent index: 0, Depth: 1, Offset: 564, Size: 375) [Ars Technica]
|
||||
Index Entry: 5 (Parent index: 1, Depth: 2, Offset: 572, Size: 367) [Week in gaming: 3DS review, Crysis 2, George Hotz]
|
||||
Index Entry: 6 (Parent index: 2, Depth: 2, Offset: 947, Size: 1014) [Max and the Magic Marker for iPad: Review]
|
||||
Index Entry: 7 (Parent index: 2, Depth: 2, Offset: 1961, Size: 1077) [iPad 2 steers itself into home console gaming territory with Real Racing 2 HD]
|
||||
Starts:
|
||||
Index Entry: 0 (Parent index: -1, Depth: 0, Offset: 215, Size: 35372) [j_x's Google reader]
|
||||
Index Entry: 2 (Parent index: 0, Depth: 1, Offset: 939, Size: 10368) [Neowin.net]
|
||||
Index Entry: 8 (Parent index: 2, Depth: 2, Offset: 3038, Size: 1082) [Microsoft's Joe Belfiore still working on upcoming Zune hardware]
|
||||
TBS Type: 110 (6)
|
||||
Outer Index entry: 0
|
||||
Unknown (vwi: always 0?): 0
|
||||
Unknown (byte: always 2?): 2
|
||||
Article index at start of record or first article index, relative to parent section (fvwi): 4 [5 absolute]
|
||||
Remaining bytes: b8 c4 3
|
||||
|
||||
Starting record with three section transitions::
|
||||
|
||||
Record #1: Starts at: 0 Ends at: 4095
|
||||
Contains: 10 index entries (0 ends, 7 complete, 3 starts)
|
||||
TBS bytes: 86 80 2 c0 b8 c0 b8 c4 4
|
||||
Complete:
|
||||
Index Entry: 1 (Parent index: 0, Depth: 1, Offset: 564, Size: 375) [Ars Technica]
|
||||
Index Entry: 2 (Parent index: 0, Depth: 1, Offset: 939, Size: 316) [Neowin.net]
|
||||
Index Entry: 5 (Parent index: 1, Depth: 2, Offset: 572, Size: 367) [Week in gaming: 3DS review, Crysis 2, George Hotz]
|
||||
Index Entry: 6 (Parent index: 2, Depth: 2, Offset: 947, Size: 308) [Max and the Magic Marker for iPad: Review]
|
||||
Index Entry: 7 (Parent index: 3, Depth: 2, Offset: 1263, Size: 760) [OSnews Asks on Interrupts: The Results]
|
||||
Index Entry: 8 (Parent index: 3, Depth: 2, Offset: 2023, Size: 693) [Apple Ditches SAMBA in Favour of Homegrown Replacement]
|
||||
Index Entry: 9 (Parent index: 3, Depth: 2, Offset: 2716, Size: 747) [ITC: Apple's Mobile Products Do Not Violate Nokia Patents]
|
||||
Starts:
|
||||
Index Entry: 0 (Parent index: -1, Depth: 0, Offset: 215, Size: 25320) [j_x's Google reader]
|
||||
Index Entry: 3 (Parent index: 0, Depth: 1, Offset: 1255, Size: 6829) [OSNews]
|
||||
Index Entry: 10 (Parent index: 3, Depth: 2, Offset: 3463, Size: 666) [Transparent Monitor Embedded in Window Glass]
|
||||
TBS Type: 110 (6)
|
||||
Outer Index entry: 0
|
||||
Unknown (vwi: always 0?): 0
|
||||
Unknown (byte: always 2?): 2
|
||||
Article index at start of record or first article index, relative to parent section (fvwi): 4 [5 absolute]
|
||||
Remaining bytes: b8 c0 b8 c4 4
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Records with no nodes
|
||||
------------------------
|
||||
|
||||
subtype = 010
|
||||
|
||||
These records are spanned by a single article. They are of two types:
|
||||
|
||||
1. If the parent section index is 1, TBS type of 6, like this::
|
||||
@ -247,7 +311,7 @@ In such a record there is a transition from one section to the next. As such the
|
||||
Last article of ending section w.r.t. starting section offset (fvwi): 12 [15 absolute]
|
||||
Flags (always 8?): 8
|
||||
Article index at start of record or first article index, relative to parent section (fvwi): 13 [16 absolute]
|
||||
Number of article nodes in the record (byte): 4
|
||||
Number of article nodes in the record belonging to the last section (byte): 4
|
||||
|
||||
|
||||
Ending record
|
||||
@ -274,3 +338,26 @@ Logically, ending records must have at least one article ending, one section end
|
||||
|
||||
If the record had only a single article end, the last two bytes would be replaced with: f0
|
||||
|
||||
If the last record has multiple section transitions, it is of type 6 and looks like::
|
||||
|
||||
Record #9: Starts at: 32768 Ends at: 34953
|
||||
Contains: 9 index entries (3 ends, 6 complete, 0 starts)
|
||||
TBS bytes: 86 80 2 1 d0 1 c8 1 d0 1 c8 1 d0 1 c8 1 d0
|
||||
Ends:
|
||||
Index Entry: 0 (Parent index: -1, Depth: 0, Offset: 215, Size: 34739) [j_x's Google reader]
|
||||
Index Entry: 1 (Parent index: 0, Depth: 1, Offset: 7758, Size: 26279) [Ars Technica]
|
||||
Index Entry: 14 (Parent index: 1, Depth: 2, Offset: 31929, Size: 2108) [Trademarked keyword sales may soon be restricted in Europe]
|
||||
Complete:
|
||||
Index Entry: 2 (Parent index: 0, Depth: 1, Offset: 34037, Size: 316) [Neowin.net]
|
||||
Index Entry: 3 (Parent index: 0, Depth: 1, Offset: 34353, Size: 282) [OSNews]
|
||||
Index Entry: 4 (Parent index: 0, Depth: 1, Offset: 34635, Size: 319) [Slashdot]
|
||||
Index Entry: 15 (Parent index: 2, Depth: 2, Offset: 34045, Size: 308) [Max and the Magic Marker for iPad: Review]
|
||||
Index Entry: 16 (Parent index: 3, Depth: 2, Offset: 34361, Size: 274) [OSnews Asks on Interrupts: The Results]
|
||||
Index Entry: 17 (Parent index: 4, Depth: 2, Offset: 34643, Size: 311) [Leonard Nimoy Turns 80]
|
||||
TBS Type: 110 (6)
|
||||
Outer Index entry: 0
|
||||
Unknown (vwi: always 0?): 0
|
||||
Unknown (byte: always 2?): 2
|
||||
Article index at start of record or first article index, relative to parent section (fvwi): 13 [14 absolute]
|
||||
Remaining bytes: 1 c8 1 d0 1 c8 1 d0 1 c8 1 d0
|
||||
|
||||
|
@ -11,6 +11,7 @@ import struct
|
||||
from collections import OrderedDict
|
||||
|
||||
from calibre.utils.magick.draw import Image, save_cover_data_to, thumbnail
|
||||
from calibre.ebooks import normalize
|
||||
|
||||
IMAGE_MAX_SIZE = 10 * 1024 * 1024
|
||||
|
||||
@ -197,3 +198,96 @@ def encode_trailing_data(raw):
|
||||
lsize += 1
|
||||
return raw + encoded
|
||||
|
||||
def encode_fvwi(val, flags):
    '''
    Pack the number val and a 4 bit flags field into a single fvwi and return
    the encoded bytestring. The flags occupy the lowest four bits of the
    encoded integer and val the bits above them; any bits of flags above the
    lowest four are discarded. This encoding is used in the trailing byte
    sequences for indexing.
    '''
    flag_bits = flags & 0b1111
    shifted_value = val << 4
    return encint(shifted_value | flag_bits)
|
||||
|
||||
|
||||
def decode_fvwi(byts):
    '''
    Read a single fvwi from the start of byts. Returns the 3-tuple
    (number, flags, consumed) where flags are the lowest four bits of the
    decoded integer, number is the remaining higher bits and consumed is the
    count reported by decint for the integer that was read.
    '''
    raw, consumed = decint(bytes(byts))
    flags = raw & 0b1111
    number = raw >> 4
    return number, flags, consumed
|
||||
|
||||
def decode_tbs(byts):
    '''
    Trailing byte sequences for indexing consist of a series of fvwi numbers.
    This function reads one fvwi number and its flags (the lowest 4 bits of
    the vwi, see encode_fvwi), then uses the flags to read any further values
    that belong to the same entry.

    Returns the fvwi number, a dictionary mapping each flag bit that was set
    to its associated data, and the total number of bytes consumed.
    '''
    remaining = bytes(byts)
    val, flags, consumed = decode_fvwi(remaining)
    remaining = remaining[consumed:]
    extra = {}

    if flags & 0b1000:
        # This bit carries no payload, it is only recorded as being set
        extra[0b1000] = True

    if flags & 0b0010:
        number, used = decint(remaining)
        extra[0b0010] = number
        remaining = remaining[used:]
        consumed += used

    if flags & 0b0100:
        # The payload for this bit is a single raw byte, not a vwi
        extra[0b0100] = ord(remaining[0])
        remaining = remaining[1:]
        consumed += 1

    if flags & 0b0001:
        number, used = decint(remaining)
        extra[0b0001] = number
        consumed += used

    return val, extra, consumed
|
||||
|
||||
def encode_tbs(val, extra):
    '''
    Encode the number val, together with the data in the extra dict, as an
    fvwi followed by its payloads. The keys of extra are the flag bits; they
    are OR-ed together to form the flags of the fvwi. See decode_tbs above.
    '''
    flags = 0
    for bit in extra:
        flags |= bit
    encoded = encode_fvwi(val, flags)

    # Payloads are appended in a fixed order. 0b0100 is written as one raw
    # byte, the other two as variable width integers. 0b1000 has no payload.
    payload_encoders = (
        (0b0010, encint),
        (0b0100, lambda byte: bytes(bytearray([byte]))),
        (0b0001, encint),
    )
    for bit, encoder in payload_encoders:
        if bit in extra:
            encoded += encoder(extra[bit])
    return encoded
|
||||
|
||||
def utf8_text(text):
    '''
    Turn a possibly null/blank string into utf-8 encoded bytes. The result is
    always a non empty, normalized bytestring: when text is empty or contains
    only whitespace the (translated) word Unknown is returned instead.
    '''
    stripped = text.strip() if text else None
    if not stripped:
        return _('Unknown').encode('utf-8')
    if not isinstance(stripped, unicode):
        # Bytestrings are assumed to be utf-8, bad bytes are replaced
        stripped = stripped.decode('utf-8', 'replace')
    return normalize(stripped).encode('utf-8')
|
||||
|
||||
def align_block(raw, multiple=4, pad=b'\0'):
    '''
    Return raw with enough copies of pad appended to make its length a
    multiple of `multiple` (4 by default). raw is returned unchanged when its
    length is already aligned.

    The number of pad copies appended is computed in units of one byte, so
    pad is expected to be a single byte.
    '''
    extra = len(raw) % multiple
    if extra == 0:
        return raw
    return raw + pad * (multiple - extra)
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -10,35 +10,13 @@ __docformat__ = 'restructuredtext en'
|
||||
|
||||
from struct import pack
|
||||
from cStringIO import StringIO
|
||||
from collections import OrderedDict
|
||||
from collections import OrderedDict, defaultdict
|
||||
|
||||
from calibre.ebooks import normalize
|
||||
from calibre.ebook.mobi.writer2 import RECORD_SIZE
|
||||
from calibre.ebooks.mobi.utils import (encint, encode_number_as_hex)
|
||||
from calibre.ebooks.mobi.writer2 import RECORD_SIZE
|
||||
from calibre.ebooks.mobi.utils import (encint, encode_number_as_hex,
|
||||
encode_trailing_data, encode_tbs, align_block, utf8_text)
|
||||
from calibre.ebooks.mobi.langcodes import iana2mobi
|
||||
|
||||
def utf8_text(text):
    '''
    Turn a possibly null/blank string into utf-8 encoded bytes. The result is
    always a non empty, normalized bytestring: when text is empty or contains
    only whitespace the (translated) word Unknown is returned instead.
    '''
    stripped = text.strip() if text else None
    if not stripped:
        return _('Unknown').encode('utf-8')
    if not isinstance(stripped, unicode):
        # Bytestrings are assumed to be utf-8, bad bytes are replaced
        stripped = stripped.decode('utf-8', 'replace')
    return normalize(stripped).encode('utf-8')
|
||||
|
||||
def align_block(raw, multiple=4, pad=b'\0'):
    '''
    Return raw with enough copies of pad appended to make its length a
    multiple of `multiple` (4 by default). raw is returned unchanged when its
    length is already aligned.

    The number of pad copies appended is computed in units of one byte, so
    pad is expected to be a single byte.
    '''
    extra = len(raw) % multiple
    if extra == 0:
        return raw
    return raw + pad * (multiple - extra)
|
||||
|
||||
class CNCX(object): # {{{
|
||||
|
||||
@ -98,7 +76,7 @@ class IndexEntry(object): # {{{
|
||||
'first_child_index': 22,
|
||||
'last_child_index': 23,
|
||||
}
|
||||
RTAG_MAP = dict(TAG_VALUES.itervalues(), TAG_VALUES.iterkeys())
|
||||
RTAG_MAP = {v:k for k, v in TAG_VALUES.iteritems()}
|
||||
|
||||
BITMASKS = [1, 2, 3, 4, 5, 21, 22, 23,]
|
||||
|
||||
@ -186,17 +164,123 @@ class TBS(object): # {{{
|
||||
trailing byte sequence for the record.
|
||||
'''
|
||||
|
||||
def __init__(self, data, is_periodical, first=False, all_sections=()):
    '''
    Compute the trailing byte sequence for one text record and store it in
    self.bytestring.

    :param data: Mapping describing the index entries that touch the record
        (the keys 'starts', 'ends' and 'completes' are read here). A falsy
        value means there is nothing to index: the encoding of an empty
        payload is stored and nothing else is set up.
    :param is_periodical: Selects periodical_tbs() or book_tbs().
    :param first: Passed through unchanged to periodical_tbs()/book_tbs().
    :param all_sections: Iterable of section index entries; they are stored
        in self.section_map keyed by their index, ordered by offset.
    '''
    # NOTE(review): the default for all_sections is an immutable tuple
    # rather than a shared mutable list; it is only ever iterated.
    if not data:
        self.bytestring = encode_trailing_data(b'')
    else:
        self.section_map = OrderedDict((i.index, i) for i in
                sorted(all_sections, key=lambda x:x.offset))

        if is_periodical:
            # The starting bytes.
            # The value is zero which I think indicates the periodical
            # index entry. The values for the various flags seem to be
            # unused. If the 0b0100 is present, it means that the record
            # deals with section 1 (or is the final record with section
            # transitions).
            self.type_010 = encode_tbs(0, {0b0010: 0})
            self.type_011 = encode_tbs(0, {0b0010: 0, 0b0001: 0})
            self.type_110 = encode_tbs(0, {0b0100: 2, 0b0010: 0})
            self.type_111 = encode_tbs(0, {0b0100: 2, 0b0010: 0, 0b0001: 0})

            # Group every entry that starts, ends or completes in this
            # record by its depth, each group ordered by offset
            depth_map = defaultdict(list)
            for x in ('starts', 'ends', 'completes'):
                for idx in data[x]:
                    depth_map[idx.depth].append(idx)
            for l in depth_map.itervalues():
                l.sort(key=lambda x:x.offset)
            self.periodical_tbs(data, first, depth_map)
        else:
            self.book_tbs(data, first)
|
||||
|
||||
def periodical_tbs(self, data, first, depth_map):
    '''
    Build the trailing byte sequence for one text record of a periodical
    and store it, wrapped by encode_trailing_data, in self.bytestring.

    :param data: Mapping describing the record. The keys read here are
        'starts', 'spans' (an index entry or None) and 'offset'.
    :param first: Not used by this method.
    :param depth_map: Mapping from depth (0, 1, 2) to the list of index
        entries of that depth touching this record, each list ordered by
        offset. Missing depths are looked up with [] so this is expected
        to be a defaultdict(list).
    '''
    buf = StringIO()

    has_section_start = (depth_map[1] and depth_map[1][0] in
            data['starts'])
    spanner = data['spans']

    # The node that comes first in the record; ties on offset are broken
    # in favour of the shallower node.
    first_node = None
    for nodes in depth_map.values():
        for node in nodes:
            if (first_node is None or (node.offset, node.depth) <
                    (first_node.offset, first_node.depth)):
                first_node = node

    parent_section_index = -1
    if depth_map[0]:
        # We have a terminal record
        typ = (self.type_110 if has_section_start else self.type_010)
        if first_node.depth > 0:
            parent_section_index = (first_node.index if first_node.depth
                    == 1 else first_node.parent_index)
    else:
        if spanner is not None:
            # record is spanned by a single article
            parent_section_index = spanner.parent_index
            typ = (self.type_110 if parent_section_index == 1 else
                    self.type_010)
        elif not depth_map[1]:
            # has only article nodes, i.e. spanned by a section
            parent_section_index = depth_map[2][0].parent_index
            typ = (self.type_111 if parent_section_index == 1 else
                    self.type_010)
        else:
            # has section transitions
            # NOTE(review): this still raises IndexError when the record
            # has section nodes but no article nodes - confirm whether
            # that can happen.
            parent_section_index = depth_map[2][0].parent_index
            # NOTE(review): no type was assigned on this path, so the
            # buf.write(typ) below failed with an unbound local.
            # type_011 is the only starting sequence defined in __init__
            # that is otherwise unused, so it is presumably the one
            # intended here - confirm against kindlegen output.
            typ = self.type_011

    buf.write(typ)

    if parent_section_index > 1:
        # Write starting section information
        if spanner is None:
            # NOTE(review): this counts the depth 1 (section) nodes even
            # though the value is written as a number of articles -
            # verify.
            num_articles = len(depth_map[1])
            extra = {}
            if num_articles > 1:
                extra = {0b0100: num_articles}
        else:
            extra = {0b0001: 0}
        buf.write(encode_tbs(parent_section_index, extra))

    if spanner is None:
        articles = depth_map[2]

        # Each section must be visited exactly once, however many of its
        # articles are in this record.
        sections = []
        for article in articles:
            section = self.section_map[article.parent_index]
            if section not in sections:
                sections.append(section)
        sections.sort(key=lambda x:x.offset)

        section_map = {s:[a for a in articles if a.parent_index ==
            s.index] for s in sections}
        for i, section in enumerate(sections):
            # All the articles in this record that belong to section
            section_articles = section_map[section]
            first_article = section_articles[0]
            last_article = section_articles[-1]
            num = len(section_articles)

            try:
                next_sec = sections[i+1]
            except IndexError:
                next_sec = None

            extra = {}
            if num > 1:
                extra[0b0100] = num
            if i == 0 and next_sec is not None:
                # Write offset to next section from start of record
                # For some reason kindlegen only writes this offset
                # for the first section transition. Imitate it.
                extra[0b0001] = next_sec.offset - data['offset']

            buf.write(encode_tbs(first_article.index-section.index, extra))

            if next_sec is not None:
                buf.write(encode_tbs(last_article.index-next_sec.index,
                    {0b1000: 0}))
    else:
        buf.write(encode_tbs(spanner.index - parent_section_index,
            {0b0001: 0}))

    self.bytestring = encode_trailing_data(buf.getvalue())
|
||||
|
||||
# Trailing byte sequence for a record of a non-periodical book: stores the
# encoding of an empty payload in self.bytestring. Neither data nor first is
# used.
def book_tbs(self, data, first):
|
||||
self.bytestring = encode_trailing_data(b'')
|
||||
# }}}
|
||||
|
||||
class Indexer(object): # {{{
|
||||
@ -548,11 +632,13 @@ class Indexer(object): # {{{
|
||||
|
||||
def calculate_trailing_byte_sequences(self):
|
||||
self.tbs_map = {}
|
||||
found_node = False
|
||||
sections = [i for i in self.indices if i.depth == 1]
|
||||
for i in xrange(self.number_of_text_records):
|
||||
offset = i * RECORD_SIZE
|
||||
next_offset = offset + RECORD_SIZE
|
||||
data = OrderedDict([('ends',[]), ('completes',[]), ('starts',[]),
|
||||
('spans', None)])
|
||||
('spans', None), ('offset', offset)])
|
||||
for index in self.indices:
|
||||
if index.offset >= next_offset:
|
||||
# Node starts after current record
|
||||
@ -574,7 +660,13 @@ class Indexer(object): # {{{
|
||||
data['ends'].append(index)
|
||||
else:
|
||||
data['spans'] = index
|
||||
self.tbs_map[i+1] = TBS(data, self.is_periodical)
|
||||
if (data['ends'] or data['completes'] or data['starts'] or
|
||||
data['spans'] is not None):
|
||||
self.tbs_map[i+1] = TBS(data, self.is_periodical, first=not
|
||||
found_node, all_sections=sections)
|
||||
found_node = True
|
||||
else:
|
||||
self.tbs_map[i+1] = TBS({}, self.is_periodical, first=False)
|
||||
|
||||
# Return the bytestring of the TBS object stored in self.tbs_map under the
# key num. Raises KeyError if there is no entry for num.
def get_trailing_byte_sequence(self, num):
|
||||
return self.tbs_map[num].bytestring
|
||||
|
Loading…
x
Reference in New Issue
Block a user