mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
...
This commit is contained in:
parent
919011f8e5
commit
e61b86cd24
@ -957,15 +957,17 @@ class TBSIndexing(object): # {{{
|
|||||||
return str({bin4(k):v for k, v in extra.iteritems()})
|
return str({bin4(k):v for k, v in extra.iteritems()})
|
||||||
|
|
||||||
tbs_type = 0
|
tbs_type = 0
|
||||||
|
is_periodical = self.doc_type in (257, 258, 259)
|
||||||
if len(byts):
|
if len(byts):
|
||||||
outermost_index, extra, consumed = decode_tbs(byts)
|
outermost_index, extra, consumed = decode_tbs(byts, flag_size=4 if
|
||||||
|
is_periodical else 3)
|
||||||
byts = byts[consumed:]
|
byts = byts[consumed:]
|
||||||
for k in extra:
|
for k in extra:
|
||||||
tbs_type |= k
|
tbs_type |= k
|
||||||
ans.append('\nTBS: %d (%s)'%(tbs_type, bin4(tbs_type)))
|
ans.append('\nTBS: %d (%s)'%(tbs_type, bin4(tbs_type)))
|
||||||
ans.append('Outermost index: %d'%outermost_index)
|
ans.append('Outermost index: %d'%outermost_index)
|
||||||
ans.append('Unknown extra start bytes: %s'%repr_extra(extra))
|
ans.append('Unknown extra start bytes: %s'%repr_extra(extra))
|
||||||
if self.doc_type in (257, 259): # Hierarchical periodical
|
if is_periodical: # Hierarchical periodical
|
||||||
byts, a = self.interpret_periodical(tbs_type, byts,
|
byts, a = self.interpret_periodical(tbs_type, byts,
|
||||||
dat['geom'][0])
|
dat['geom'][0])
|
||||||
ans += a
|
ans += a
|
||||||
|
@ -66,11 +66,14 @@ def encint(value, forward=True):
|
|||||||
If forward is True the bytes returned are suitable for prepending to the
|
If forward is True the bytes returned are suitable for prepending to the
|
||||||
output buffer, otherwise they must be appended to the output buffer.
|
output buffer, otherwise they must be appended to the output buffer.
|
||||||
'''
|
'''
|
||||||
|
if value < 0:
|
||||||
|
raise ValueError('Cannot encode negative numbers as vwi')
|
||||||
# Encode vwi
|
# Encode vwi
|
||||||
byts = bytearray()
|
byts = bytearray()
|
||||||
while True:
|
while True:
|
||||||
b = value & 0b01111111
|
b = value & 0b01111111
|
||||||
value >>= 7 # shift value to the right by 7 bits
|
value >>= 7 # shift value to the right by 7 bits
|
||||||
|
|
||||||
byts.append(b)
|
byts.append(b)
|
||||||
if value == 0:
|
if value == 0:
|
||||||
break
|
break
|
||||||
@ -198,24 +201,31 @@ def encode_trailing_data(raw):
|
|||||||
lsize += 1
|
lsize += 1
|
||||||
return raw + encoded
|
return raw + encoded
|
||||||
|
|
||||||
def encode_fvwi(val, flags):
|
def encode_fvwi(val, flags, flag_size=4):
|
||||||
'''
|
'''
|
||||||
Encode the value val and the 4 bit flags flags as an fvwi. This encoding is
|
Encode the value val and the flag_size bits from flags as an fvwi. This encoding is
|
||||||
used in the trailing byte sequences for indexing. Returns encoded
|
used in the trailing byte sequences for indexing. Returns encoded
|
||||||
bytestring.
|
bytestring.
|
||||||
'''
|
'''
|
||||||
ans = (val << 4) | (flags & 0b1111)
|
ans = val << flag_size
|
||||||
|
for i in xrange(flag_size):
|
||||||
|
ans |= (flags & (1 << i))
|
||||||
return encint(ans)
|
return encint(ans)
|
||||||
|
|
||||||
|
|
||||||
def decode_fvwi(byts):
|
def decode_fvwi(byts, flag_size=4):
|
||||||
'''
|
'''
|
||||||
Decode encoded fvwi. Returns number, flags, consumed
|
Decode encoded fvwi. Returns number, flags, consumed
|
||||||
'''
|
'''
|
||||||
arg, consumed = decint(bytes(byts))
|
arg, consumed = decint(bytes(byts))
|
||||||
return (arg >> 4), (arg & 0b1111), consumed
|
val = arg >> flag_size
|
||||||
|
flags = 0
|
||||||
|
for i in xrange(flag_size):
|
||||||
|
flags |= (arg & (1 << i))
|
||||||
|
return val, flags, consumed
|
||||||
|
|
||||||
def decode_tbs(byts):
|
|
||||||
|
def decode_tbs(byts, flag_size=4):
|
||||||
'''
|
'''
|
||||||
Trailing byte sequences for indexing consist of a series of fvwi numbers.
|
Trailing byte sequences for indexing consist of a series of fvwi numbers.
|
||||||
This function reads the fvwi number and its associated flags. It then uses
|
This function reads the fvwi number and its associated flags. It then uses
|
||||||
@ -226,10 +236,10 @@ def decode_tbs(byts):
|
|||||||
data and the number of bytes consumed.
|
data and the number of bytes consumed.
|
||||||
'''
|
'''
|
||||||
byts = bytes(byts)
|
byts = bytes(byts)
|
||||||
val, flags, consumed = decode_fvwi(byts)
|
val, flags, consumed = decode_fvwi(byts, flag_size=flag_size)
|
||||||
extra = {}
|
extra = {}
|
||||||
byts = byts[consumed:]
|
byts = byts[consumed:]
|
||||||
if flags & 0b1000:
|
if flags & 0b1000 and flag_size > 3:
|
||||||
extra[0b1000] = True
|
extra[0b1000] = True
|
||||||
if flags & 0b0010:
|
if flags & 0b0010:
|
||||||
x, consumed2 = decint(byts)
|
x, consumed2 = decint(byts)
|
||||||
@ -247,7 +257,7 @@ def decode_tbs(byts):
|
|||||||
consumed += consumed2
|
consumed += consumed2
|
||||||
return val, extra, consumed
|
return val, extra, consumed
|
||||||
|
|
||||||
def encode_tbs(val, extra):
|
def encode_tbs(val, extra, flag_size=4):
|
||||||
'''
|
'''
|
||||||
Encode the number val and the extra data in the extra dict as an fvwi. See
|
Encode the number val and the extra data in the extra dict as an fvwi. See
|
||||||
decode_tbs above.
|
decode_tbs above.
|
||||||
@ -255,7 +265,7 @@ def encode_tbs(val, extra):
|
|||||||
flags = 0
|
flags = 0
|
||||||
for flag in extra:
|
for flag in extra:
|
||||||
flags |= flag
|
flags |= flag
|
||||||
ans = encode_fvwi(val, flags)
|
ans = encode_fvwi(val, flags, flag_size=flag_size)
|
||||||
|
|
||||||
if 0b0010 in extra:
|
if 0b0010 in extra:
|
||||||
ans += encint(extra[0b0010])
|
ans += encint(extra[0b0010])
|
||||||
|
@ -28,13 +28,12 @@ class CNCX(object): # {{{
|
|||||||
|
|
||||||
MAX_STRING_LENGTH = 500
|
MAX_STRING_LENGTH = 500
|
||||||
|
|
||||||
def __init__(self, toc, opts):
|
def __init__(self, toc, is_periodical):
|
||||||
self.strings = OrderedDict()
|
self.strings = OrderedDict()
|
||||||
|
|
||||||
for item in toc:
|
for item in toc.iterdescendants():
|
||||||
if item is self.toc: continue
|
|
||||||
self.strings[item.title] = 0
|
self.strings[item.title] = 0
|
||||||
if opts.mobi_periodical:
|
if is_periodical:
|
||||||
self.strings[item.klass] = 0
|
self.strings[item.klass] = 0
|
||||||
|
|
||||||
self.records = []
|
self.records = []
|
||||||
@ -91,6 +90,17 @@ class IndexEntry(object): # {{{
|
|||||||
self.first_child_index = None
|
self.first_child_index = None
|
||||||
self.last_child_index = None
|
self.last_child_index = None
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return ('IndexEntry(offset=%r, depth=%r, length=%r, index=%r,'
|
||||||
|
' parent_index=%r)')%(self.offset, self.depth, self.length,
|
||||||
|
self.index, self.parent_index)
|
||||||
|
|
||||||
|
@dynamic_property
|
||||||
|
def size(self):
|
||||||
|
def fget(self): return self.length
|
||||||
|
def fset(self, val): self.length = val
|
||||||
|
return property(fget=fget, fset=fset, doc='Alias for length')
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def tagx_block(cls, for_periodical=True):
|
def tagx_block(cls, for_periodical=True):
|
||||||
buf = bytearray()
|
buf = bytearray()
|
||||||
@ -137,7 +147,7 @@ class IndexEntry(object): # {{{
|
|||||||
def entry_type(self):
|
def entry_type(self):
|
||||||
ans = 0
|
ans = 0
|
||||||
for tag in self.tag_nums:
|
for tag in self.tag_nums:
|
||||||
ans |= (1 << self.BITMASKS[tag]) # 1 << x == 2**x
|
ans |= (1 << self.BITMASKS.index(tag)) # 1 << x == 2**x
|
||||||
return ans
|
return ans
|
||||||
|
|
||||||
@property
|
@property
|
||||||
@ -152,7 +162,7 @@ class IndexEntry(object): # {{{
|
|||||||
val = getattr(self, attr)
|
val = getattr(self, attr)
|
||||||
buf.write(encint(val))
|
buf.write(encint(val))
|
||||||
|
|
||||||
ans = buf.get_value()
|
ans = buf.getvalue()
|
||||||
return ans
|
return ans
|
||||||
|
|
||||||
# }}}
|
# }}}
|
||||||
@ -175,13 +185,16 @@ class TBS(object): # {{{
|
|||||||
# The starting bytes.
|
# The starting bytes.
|
||||||
# The value is zero which I think indicates the periodical
|
# The value is zero which I think indicates the periodical
|
||||||
# index entry. The values for the various flags seem to be
|
# index entry. The values for the various flags seem to be
|
||||||
# unused. If the 0b0100 is present, it means that the record
|
# unused. If the 0b100 is present, it means that the record
|
||||||
# deals with section 1 (or is the final record with section
|
# deals with section 1 (or is the final record with section
|
||||||
# transitions).
|
# transitions).
|
||||||
self.type_010 = encode_tbs(0, {0b0010: 0})
|
self.type_010 = encode_tbs(0, {0b010: 0}, flag_size=3)
|
||||||
self.type_011 = encode_tbs(0, {0b0010: 0, 0b0001: 0})
|
self.type_011 = encode_tbs(0, {0b010: 0, 0b001: 0},
|
||||||
self.type_110 = encode_tbs(0, {0b0100: 2, 0b0010: 0})
|
flag_size=3)
|
||||||
self.type_111 = encode_tbs(0, {0b0100: 2, 0b0010: 0, 0b0001: 0})
|
self.type_110 = encode_tbs(0, {0b100: 2, 0b010: 0},
|
||||||
|
flag_size=3)
|
||||||
|
self.type_111 = encode_tbs(0, {0b100: 2, 0b010: 0, 0b001:
|
||||||
|
0}, flag_size=3)
|
||||||
|
|
||||||
depth_map = defaultdict(list)
|
depth_map = defaultdict(list)
|
||||||
for x in ('starts', 'ends', 'completes'):
|
for x in ('starts', 'ends', 'completes'):
|
||||||
@ -221,12 +234,18 @@ class TBS(object): # {{{
|
|||||||
self.type_010)
|
self.type_010)
|
||||||
elif not depth_map[1]:
|
elif not depth_map[1]:
|
||||||
# has only article nodes, i.e. spanned by a section
|
# has only article nodes, i.e. spanned by a section
|
||||||
parent_section_index = self.depth_map[2][0].parent_index
|
parent_section_index = depth_map[2][0].parent_index
|
||||||
typ = (self.type_111 if parent_section_index == 1 else
|
typ = (self.type_111 if parent_section_index == 1 else
|
||||||
self.type_010)
|
self.type_010)
|
||||||
else:
|
else:
|
||||||
# has section transitions
|
# has section transitions
|
||||||
parent_section_index = self.depth_map[2][0].parent_index
|
if depth_map[2]:
|
||||||
|
parent_section_index = depth_map[2][0].parent_index
|
||||||
|
typ = self.type_011
|
||||||
|
else:
|
||||||
|
parent_section_index = depth_map[1][0].index
|
||||||
|
typ = (self.type_110 if parent_section_index == 1 else
|
||||||
|
self.type_011)
|
||||||
|
|
||||||
buf.write(typ)
|
buf.write(typ)
|
||||||
|
|
||||||
@ -243,9 +262,10 @@ class TBS(object): # {{{
|
|||||||
|
|
||||||
if spanner is None:
|
if spanner is None:
|
||||||
articles = depth_map[2]
|
articles = depth_map[2]
|
||||||
sections = [self.section_map[a.parent_index] for a in articles]
|
sections = set([self.section_map[a.parent_index] for a in
|
||||||
sections.sort(key=lambda x:x.offset)
|
articles])
|
||||||
section_map = {s:[a for a in articles is a.parent_index ==
|
sections = sorted(sections, key=lambda x:x.offset)
|
||||||
|
section_map = {s:[a for a in articles if a.parent_index ==
|
||||||
s.index] for s in sections}
|
s.index] for s in sections}
|
||||||
for i, section in enumerate(sections):
|
for i, section in enumerate(sections):
|
||||||
# All the articles in this record that belong to section
|
# All the articles in this record that belong to section
|
||||||
@ -257,7 +277,7 @@ class TBS(object): # {{{
|
|||||||
try:
|
try:
|
||||||
next_sec = sections[i+1]
|
next_sec = sections[i+1]
|
||||||
except:
|
except:
|
||||||
next_sec == None
|
next_sec = None
|
||||||
|
|
||||||
extra = {}
|
extra = {}
|
||||||
if num > 1:
|
if num > 1:
|
||||||
@ -299,14 +319,14 @@ class Indexer(object): # {{{
|
|||||||
self.log('Generating MOBI index for a %s'%('periodical' if
|
self.log('Generating MOBI index for a %s'%('periodical' if
|
||||||
self.is_periodical else 'book'))
|
self.is_periodical else 'book'))
|
||||||
self.is_flat_periodical = False
|
self.is_flat_periodical = False
|
||||||
if opts.mobi_periodical:
|
if self.is_periodical:
|
||||||
periodical_node = iter(oeb.toc).next()
|
periodical_node = iter(oeb.toc).next()
|
||||||
sections = tuple(periodical_node)
|
sections = tuple(periodical_node)
|
||||||
self.is_flat_periodical = len(sections) == 1
|
self.is_flat_periodical = len(sections) == 1
|
||||||
|
|
||||||
self.records = []
|
self.records = []
|
||||||
|
|
||||||
self.cncx = CNCX(oeb.toc, opts)
|
self.cncx = CNCX(oeb.toc, self.is_periodical)
|
||||||
|
|
||||||
if self.is_periodical:
|
if self.is_periodical:
|
||||||
self.indices = self.create_periodical_index()
|
self.indices = self.create_periodical_index()
|
||||||
@ -405,7 +425,7 @@ class Indexer(object): # {{{
|
|||||||
buf.write(pack(b'>I', 0)) # Filled in later
|
buf.write(pack(b'>I', 0)) # Filled in later
|
||||||
|
|
||||||
# Number of index records 24-28
|
# Number of index records 24-28
|
||||||
buf.write(pack('b>I', len(self.records)))
|
buf.write(pack(b'>I', len(self.records)))
|
||||||
|
|
||||||
# Index Encoding 28-32
|
# Index Encoding 28-32
|
||||||
buf.write(pack(b'>I', 65001)) # utf-8
|
buf.write(pack(b'>I', 65001)) # utf-8
|
||||||
@ -457,7 +477,7 @@ class Indexer(object): # {{{
|
|||||||
idxt_offset = buf.tell()
|
idxt_offset = buf.tell()
|
||||||
|
|
||||||
buf.write(b'IDXT')
|
buf.write(b'IDXT')
|
||||||
buf.write(header_length + len(tagx_block))
|
buf.write(pack(b'>H', header_length + len(tagx_block)))
|
||||||
buf.write(b'\0')
|
buf.write(b'\0')
|
||||||
buf.seek(20)
|
buf.seek(20)
|
||||||
buf.write(pack(b'>I', idxt_offset))
|
buf.write(pack(b'>I', idxt_offset))
|
||||||
@ -567,7 +587,7 @@ class Indexer(object): # {{{
|
|||||||
for s, x in enumerate(normalized_sections):
|
for s, x in enumerate(normalized_sections):
|
||||||
sec, normalized_articles = x
|
sec, normalized_articles = x
|
||||||
try:
|
try:
|
||||||
sec.length = normalized_sections[s+1].offset - sec.offset
|
sec.length = normalized_sections[s+1][0].offset - sec.offset
|
||||||
except:
|
except:
|
||||||
sec.length = self.serializer.body_end_offset - sec.offset
|
sec.length = self.serializer.body_end_offset - sec.offset
|
||||||
for i, art in enumerate(normalized_articles):
|
for i, art in enumerate(normalized_articles):
|
||||||
@ -583,17 +603,18 @@ class Indexer(object): # {{{
|
|||||||
normalized_articles))
|
normalized_articles))
|
||||||
normalized_sections[i] = (sec, normalized_articles)
|
normalized_sections[i] = (sec, normalized_articles)
|
||||||
|
|
||||||
normalized_sections = list(filter(lambda x: x[0].size > 0 and x[1],
|
normalized_sections = list(filter(lambda x: x[0].length > 0 and x[1],
|
||||||
normalized_sections))
|
normalized_sections))
|
||||||
|
|
||||||
# Set indices
|
# Set indices
|
||||||
i = 0
|
i = 0
|
||||||
for sec, normalized_articles in normalized_sections:
|
for sec, articles in normalized_sections:
|
||||||
i += 1
|
i += 1
|
||||||
sec.index = i
|
sec.index = i
|
||||||
|
sec.parent_index = 0
|
||||||
|
|
||||||
for sec, normalized_articles in normalized_sections:
|
for sec, articles in normalized_sections:
|
||||||
for art in normalized_articles:
|
for art in articles:
|
||||||
i += 1
|
i += 1
|
||||||
art.index = i
|
art.index = i
|
||||||
art.parent_index = sec.index
|
art.parent_index = sec.index
|
||||||
@ -606,7 +627,7 @@ class Indexer(object): # {{{
|
|||||||
for s, x in enumerate(normalized_sections):
|
for s, x in enumerate(normalized_sections):
|
||||||
sec, articles = x
|
sec, articles = x
|
||||||
try:
|
try:
|
||||||
next_offset = normalized_sections[s+1].offset
|
next_offset = normalized_sections[s+1][0].offset
|
||||||
except:
|
except:
|
||||||
next_offset = self.serializer.body_end_offset
|
next_offset = self.serializer.body_end_offset
|
||||||
sec.length = next_offset - sec.offset
|
sec.length = next_offset - sec.offset
|
||||||
@ -622,7 +643,7 @@ class Indexer(object): # {{{
|
|||||||
for s, x in enumerate(normalized_sections):
|
for s, x in enumerate(normalized_sections):
|
||||||
sec, articles = x
|
sec, articles = x
|
||||||
try:
|
try:
|
||||||
next_sec = normalized_sections[s+1]
|
next_sec = normalized_sections[s+1][0]
|
||||||
except:
|
except:
|
||||||
if (sec.length == 0 or sec.next_offset !=
|
if (sec.length == 0 or sec.next_offset !=
|
||||||
self.serializer.body_end_offset):
|
self.serializer.body_end_offset):
|
||||||
@ -659,6 +680,7 @@ class Indexer(object): # {{{
|
|||||||
self.tbs_map = {}
|
self.tbs_map = {}
|
||||||
found_node = False
|
found_node = False
|
||||||
sections = [i for i in self.indices if i.depth == 1]
|
sections = [i for i in self.indices if i.depth == 1]
|
||||||
|
deepest = max(i.depth for i in self.indices)
|
||||||
for i in xrange(self.number_of_text_records):
|
for i in xrange(self.number_of_text_records):
|
||||||
offset = i * RECORD_SIZE
|
offset = i * RECORD_SIZE
|
||||||
next_offset = offset + RECORD_SIZE
|
next_offset = offset + RECORD_SIZE
|
||||||
@ -683,7 +705,7 @@ class Indexer(object): # {{{
|
|||||||
if index.next_offset <= next_offset:
|
if index.next_offset <= next_offset:
|
||||||
# Node ends in current record
|
# Node ends in current record
|
||||||
data['ends'].append(index)
|
data['ends'].append(index)
|
||||||
else:
|
elif index.depth == deepest:
|
||||||
data['spans'] = index
|
data['spans'] = index
|
||||||
if (data['ends'] or data['completes'] or data['starts'] or
|
if (data['ends'] or data['completes'] or data['starts'] or
|
||||||
data['spans'] is not None):
|
data['spans'] is not None):
|
||||||
|
@ -55,6 +55,7 @@ class MobiWriter(object):
|
|||||||
self.last_text_record_idx = 1
|
self.last_text_record_idx = 1
|
||||||
|
|
||||||
def __call__(self, oeb, path_or_stream):
|
def __call__(self, oeb, path_or_stream):
|
||||||
|
self.log = oeb.log
|
||||||
if hasattr(path_or_stream, 'write'):
|
if hasattr(path_or_stream, 'write'):
|
||||||
return self.dump_stream(oeb, path_or_stream)
|
return self.dump_stream(oeb, path_or_stream)
|
||||||
with open(path_or_stream, 'w+b') as stream:
|
with open(path_or_stream, 'w+b') as stream:
|
||||||
@ -90,6 +91,7 @@ class MobiWriter(object):
|
|||||||
self.primary_index_record_idx = None
|
self.primary_index_record_idx = None
|
||||||
try:
|
try:
|
||||||
self.indexer = Indexer(self.serializer, self.last_text_record_idx,
|
self.indexer = Indexer(self.serializer, self.last_text_record_idx,
|
||||||
|
len(self.records[self.last_text_record_idx]),
|
||||||
self.opts, self.oeb)
|
self.opts, self.oeb)
|
||||||
except:
|
except:
|
||||||
self.log.exception('Failed to generate MOBI index:')
|
self.log.exception('Failed to generate MOBI index:')
|
||||||
|
Loading…
x
Reference in New Issue
Block a user