Kovid Goyal 2011-07-26 20:05:32 -06:00
parent 0ab0246048
commit ae6f049792
2 changed files with 48 additions and 41 deletions

View File

@@ -424,12 +424,7 @@ class IndexHeader(object): # {{{
if self.index_encoding == 'unknown':
raise ValueError(
'Unknown index encoding: %d'%self.index_encoding_num)
self.locale_raw, = struct.unpack(b'>I', raw[32:36])
langcode = self.locale_raw
langid = langcode & 0xFF
sublangid = (langcode >> 10) & 0xFF
self.language = main_language.get(langid, 'ENGLISH')
self.sublanguage = sub_language.get(sublangid, 'NEUTRAL')
self.possibly_language = raw[32:36]
self.num_index_entries, = struct.unpack('>I', raw[36:40])
self.ordt_start, = struct.unpack('>I', raw[40:44])
self.ligt_start, = struct.unpack('>I', raw[44:48])
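The lines removed in this hunk are the old locale decoding: the 32-bit big-endian value at bytes 32-36 of the index header was split into a main-language id (the low byte) and a sub-language id (bits 10-17) and looked up in calibre's MOBI language tables, with English/Neutral as fallbacks. After this commit the reader stores the raw bytes as possibly_language instead of interpreting them. A minimal sketch of the removed interpretation; the table entries below are illustrative placeholders, not the real main_language/sub_language maps:

    import struct

    # Illustrative stand-ins for calibre's MOBI language tables; the ids and
    # names here are placeholders for demonstration only.
    MAIN_LANGUAGE = {9: 'ENGLISH'}
    SUB_LANGUAGE = {0: 'NEUTRAL'}

    def decode_index_locale(raw):
        # raw is the full index header record; the locale field is bytes 32-36.
        locale_raw, = struct.unpack(b'>I', raw[32:36])
        langid = locale_raw & 0xFF             # low byte: main language id
        sublangid = (locale_raw >> 10) & 0xFF  # bits 10-17: sub language id
        return (MAIN_LANGUAGE.get(langid, 'ENGLISH'),
                SUB_LANGUAGE.get(sublangid, 'NEUTRAL'))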
@@ -489,8 +484,7 @@ class IndexHeader(object): # {{{
a('Number of index records: %d'%self.index_count)
a('Index encoding: %s (%d)'%(self.index_encoding,
self.index_encoding_num))
a('Index language: %s - %s (%s)'%(self.language, self.sublanguage,
hex(self.locale_raw)))
a('Unknown (possibly language?): %r'%(self.possibly_language))
a('Number of index entries: %d'% self.num_index_entries)
a('ORDT start: %d'%self.ordt_start)
a('LIGT start: %d'%self.ligt_start)
@@ -1038,6 +1032,7 @@ class TBSIndexing(object): # {{{
# }}}
def read_starting_section(byts): # {{{
orig = byts
si, extra, consumed = decode_tbs(byts)
byts = byts[consumed:]
if len(extra) > 1 or 0b0010 in extra or 0b1000 in extra:
@@ -1054,7 +1049,7 @@ class TBSIndexing(object): # {{{
eof = extra[0b0001]
if eof != 0:
raise ValueError('Unknown eof value %s when reading'
' starting section'%eof)
' starting section. All bytes: %r'%(eof, orig))
ans.append('This record is spanned by an article from'
' the section: %d'%si.index)
return si, byts
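The change to read_starting_section is the richer error report: the original byte string is kept in orig so it can be shown when an unexpected eof value is seen. Condensed as a standalone check (a sketch only; extra is the flag-to-value map produced by decode_tbs above, and the use of a default value for the missing flag is an assumption):

    def check_starting_section_eof(extra, orig):
        # extra maps TBS flag bits to values; flag 0b0001 carries an
        # end-of-file indicator that must be zero for a starting section.
        # On failure the message now includes all of the original bytes.
        eof = extra.get(0b0001, 0)
        if eof != 0:
            raise ValueError('Unknown eof value %s when reading'
                    ' starting section. All bytes: %r' % (eof, orig))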

View File

@@ -15,7 +15,6 @@ from collections import OrderedDict, defaultdict
from calibre.ebooks.mobi.writer2 import RECORD_SIZE
from calibre.ebooks.mobi.utils import (encint, encode_number_as_hex,
encode_trailing_data, encode_tbs, align_block, utf8_text)
from calibre.ebooks.mobi.langcodes import iana2mobi
class CNCX(object): # {{{
@@ -173,10 +172,8 @@ class TBS(object): # {{{
trailing byte sequence for the record.
'''
def __init__(self, data, is_periodical, first=False, all_sections=[]):
if not data:
self.bytestring = encode_trailing_data(b'')
else:
def __init__(self, data, is_periodical, first=False, all_sections=[],
after_first=False):
self.section_map = OrderedDict((i.index, i) for i in
sorted(all_sections, key=lambda x:x.offset))
@@ -195,6 +192,14 @@ class TBS(object): # {{{
self.type_111 = encode_tbs(0, {0b100: 2, 0b010: 0, 0b001:
0}, flag_size=3)
if not data:
byts = b''
if after_first:
# This can happen if a record contains only text between
# the periodical start and the first section
byts = self.type_011
self.bytestring = encode_trailing_data(byts)
else:
depth_map = defaultdict(list)
for x in ('starts', 'ends', 'completes'):
for idx in data[x]:
@@ -202,6 +207,9 @@ class TBS(object): # {{{
for l in depth_map.itervalues():
l.sort(key=lambda x:x.offset)
self.periodical_tbs(data, first, depth_map)
else:
if not data:
self.bytestring = encode_trailing_data(b'')
else:
self.book_tbs(data, first)
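The restructured __init__ changes how a periodical text record with no index entries is handled: previously it always received an empty trailing byte sequence; now, when the record comes after the first indexed node (after_first=True), for example a record containing only text between the periodical start and the first section, it gets the pre-computed type_011 sequence instead. The non-periodical path keeps the empty sequence. A condensed sketch of just that branch, treating type_011 as an opaque byte string:

    from calibre.ebooks.mobi.utils import encode_trailing_data

    def empty_record_tbs(type_011, after_first):
        # A periodical record with no node starts/ends/completes/spans still
        # gets a marker once the first indexed node has been seen; before this
        # commit it always received an empty trailing byte sequence.
        byts = type_011 if after_first else b''
        return encode_trailing_data(byts)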
@@ -240,15 +248,13 @@ class TBS(object): # {{{
# has section transitions
if depth_map[2]:
parent_section_index = depth_map[2][0].parent_index
typ = self.type_011
else:
parent_section_index = depth_map[1][0].index
typ = (self.type_110 if parent_section_index == 1 else
self.type_011)
typ = self.type_011
buf.write(typ)
if parent_section_index > 1:
if typ not in (self.type_110, self.type_111) and parent_section_index > 0:
# Write starting section information
if spanner is None:
num_articles = len(depth_map[1])
@@ -429,9 +435,8 @@ class Indexer(object): # {{{
# Index Encoding 28-32
buf.write(pack(b'>I', 65001)) # utf-8
# Index language 32-36
buf.write(iana2mobi(
str(self.oeb.metadata.language[0])))
# Unknown 32-36
buf.write(b'\xff'*4)
# Number of index entries 36-40
buf.write(pack(b'>I', len(self.indices)))
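On the writer side, the matching change is in the index header the Indexer builds: bytes 32-36, previously filled with the MOBI language code derived via iana2mobi from the book's language, are now written as four 0xff bytes, mirroring the reader treating the field as unknown. A minimal sketch of the three fields touched here, using struct.pack directly (offsets follow the comments above; the other header fields are omitted and the entry count is an example value):

    import struct
    from io import BytesIO

    buf = BytesIO()
    buf.write(struct.pack(b'>I', 65001))  # 28-32: index encoding, utf-8 code page
    buf.write(b'\xff' * 4)                # 32-36: unknown (formerly the language code)
    buf.write(struct.pack(b'>I', 42))     # 36-40: number of index entries (example)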
@@ -680,15 +685,20 @@ class Indexer(object): # {{{
found_node = False
sections = [i for i in self.indices if i.depth == 1]
deepest = max(i.depth for i in self.indices)
for i in xrange(self.number_of_text_records):
offset = i * RECORD_SIZE
next_offset = offset + RECORD_SIZE
data = OrderedDict([('ends',[]), ('completes',[]), ('starts',[]),
('spans', None), ('offset', offset)])
data = {'ends':[], 'completes':[], 'starts':[],
'spans':None, 'offset':offset, 'record_number':i+1}
for index in self.indices:
if index.offset >= next_offset:
# Node starts after current record
if index.depth == deepest:
break
else:
continue
if index.next_offset <= offset:
# Node ends before current record
continue
@@ -706,13 +716,15 @@ class Indexer(object): # {{{
data['ends'].append(index)
elif index.depth == deepest:
data['spans'] = index
if (data['ends'] or data['completes'] or data['starts'] or
data['spans'] is not None):
self.tbs_map[i+1] = TBS(data, self.is_periodical, first=not
found_node, all_sections=sections)
found_node = True
else:
self.tbs_map[i+1] = TBS({}, self.is_periodical, first=False)
self.tbs_map[i+1] = TBS({}, self.is_periodical, first=False,
after_first=found_node)
def get_trailing_byte_sequence(self, num):
return self.tbs_map[num].bytestring
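The record-mapping loop above walks every text record, collects the index nodes that start, end, complete or span it, and stores a TBS for the record keyed by record number (i+1); with this commit, records that contain no nodes are also passed after_first so they can be marked when they fall after the first indexed node. The classification compares each node's offset/next_offset against the record's byte range. A simplified restatement of those overlap tests as a hypothetical helper; the exact conditions for 'starts' and 'completes' fall outside the lines shown in this diff and are assumed from the usual interval logic:

    def classify_node(index, offset, next_offset, deepest):
        # index.offset / index.next_offset delimit the node's text span and
        # [offset, next_offset) is the current record's byte range.
        if index.offset >= next_offset or index.next_offset <= offset:
            return None         # node lies entirely outside this record
        if index.offset >= offset and index.next_offset <= next_offset:
            return 'completes'  # node begins and ends inside the record
        if index.offset >= offset:
            return 'starts'     # node begins in this record, ends later
        if index.next_offset <= next_offset:
            return 'ends'       # node began earlier, ends in this record
        # Node begins before and ends after the record: only nodes at the
        # deepest index level are recorded as spanning it.
        return 'spans' if index.depth == deepest else None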