This commit is contained in:
Kovid Goyal 2011-07-26 20:05:32 -06:00
parent 0ab0246048
commit ae6f049792
2 changed files with 48 additions and 41 deletions

View File

@ -424,12 +424,7 @@ class IndexHeader(object): # {{{
if self.index_encoding == 'unknown': if self.index_encoding == 'unknown':
raise ValueError( raise ValueError(
'Unknown index encoding: %d'%self.index_encoding_num) 'Unknown index encoding: %d'%self.index_encoding_num)
self.locale_raw, = struct.unpack(b'>I', raw[32:36]) self.possibly_language = raw[32:36]
langcode = self.locale_raw
langid = langcode & 0xFF
sublangid = (langcode >> 10) & 0xFF
self.language = main_language.get(langid, 'ENGLISH')
self.sublanguage = sub_language.get(sublangid, 'NEUTRAL')
self.num_index_entries, = struct.unpack('>I', raw[36:40]) self.num_index_entries, = struct.unpack('>I', raw[36:40])
self.ordt_start, = struct.unpack('>I', raw[40:44]) self.ordt_start, = struct.unpack('>I', raw[40:44])
self.ligt_start, = struct.unpack('>I', raw[44:48]) self.ligt_start, = struct.unpack('>I', raw[44:48])
@ -489,8 +484,7 @@ class IndexHeader(object): # {{{
a('Number of index records: %d'%self.index_count) a('Number of index records: %d'%self.index_count)
a('Index encoding: %s (%d)'%(self.index_encoding, a('Index encoding: %s (%d)'%(self.index_encoding,
self.index_encoding_num)) self.index_encoding_num))
a('Index language: %s - %s (%s)'%(self.language, self.sublanguage, a('Unknown (possibly language?): %r'%(self.possibly_language))
hex(self.locale_raw)))
a('Number of index entries: %d'% self.num_index_entries) a('Number of index entries: %d'% self.num_index_entries)
a('ORDT start: %d'%self.ordt_start) a('ORDT start: %d'%self.ordt_start)
a('LIGT start: %d'%self.ligt_start) a('LIGT start: %d'%self.ligt_start)
@ -1038,6 +1032,7 @@ class TBSIndexing(object): # {{{
# }}} # }}}
def read_starting_section(byts): # {{{ def read_starting_section(byts): # {{{
orig = byts
si, extra, consumed = decode_tbs(byts) si, extra, consumed = decode_tbs(byts)
byts = byts[consumed:] byts = byts[consumed:]
if len(extra) > 1 or 0b0010 in extra or 0b1000 in extra: if len(extra) > 1 or 0b0010 in extra or 0b1000 in extra:
@ -1054,7 +1049,7 @@ class TBSIndexing(object): # {{{
eof = extra[0b0001] eof = extra[0b0001]
if eof != 0: if eof != 0:
raise ValueError('Unknown eof value %s when reading' raise ValueError('Unknown eof value %s when reading'
' starting section'%eof) ' starting section. All bytes: %r'%(eof, orig))
ans.append('This record is spanned by an article from' ans.append('This record is spanned by an article from'
' the section: %d'%si.index) ' the section: %d'%si.index)
return si, byts return si, byts

View File

@ -15,7 +15,6 @@ from collections import OrderedDict, defaultdict
from calibre.ebooks.mobi.writer2 import RECORD_SIZE from calibre.ebooks.mobi.writer2 import RECORD_SIZE
from calibre.ebooks.mobi.utils import (encint, encode_number_as_hex, from calibre.ebooks.mobi.utils import (encint, encode_number_as_hex,
encode_trailing_data, encode_tbs, align_block, utf8_text) encode_trailing_data, encode_tbs, align_block, utf8_text)
from calibre.ebooks.mobi.langcodes import iana2mobi
class CNCX(object): # {{{ class CNCX(object): # {{{
@ -173,28 +172,34 @@ class TBS(object): # {{{
trailing byte sequence for the record. trailing byte sequence for the record.
''' '''
def __init__(self, data, is_periodical, first=False, all_sections=[]): def __init__(self, data, is_periodical, first=False, all_sections=[],
if not data: after_first=False):
self.bytestring = encode_trailing_data(b'') self.section_map = OrderedDict((i.index, i) for i in
else: sorted(all_sections, key=lambda x:x.offset))
self.section_map = OrderedDict((i.index, i) for i in
sorted(all_sections, key=lambda x:x.offset))
if is_periodical: if is_periodical:
# The starting bytes. # The starting bytes.
# The value is zero which I think indicates the periodical # The value is zero which I think indicates the periodical
# index entry. The values for the various flags seem to be # index entry. The values for the various flags seem to be
# unused. If the 0b100 is present, it means that the record # unused. If the 0b100 is present, it means that the record
# deals with section 1 (or is the final record with section # deals with section 1 (or is the final record with section
# transitions). # transitions).
self.type_010 = encode_tbs(0, {0b010: 0}, flag_size=3) self.type_010 = encode_tbs(0, {0b010: 0}, flag_size=3)
self.type_011 = encode_tbs(0, {0b010: 0, 0b001: 0}, self.type_011 = encode_tbs(0, {0b010: 0, 0b001: 0},
flag_size=3) flag_size=3)
self.type_110 = encode_tbs(0, {0b100: 2, 0b010: 0}, self.type_110 = encode_tbs(0, {0b100: 2, 0b010: 0},
flag_size=3) flag_size=3)
self.type_111 = encode_tbs(0, {0b100: 2, 0b010: 0, 0b001: self.type_111 = encode_tbs(0, {0b100: 2, 0b010: 0, 0b001:
0}, flag_size=3) 0}, flag_size=3)
if not data:
byts = b''
if after_first:
# This can happen if a record contains only text between
# the periodical start and the first section
byts = self.type_011
self.bytestring = encode_trailing_data(byts)
else:
depth_map = defaultdict(list) depth_map = defaultdict(list)
for x in ('starts', 'ends', 'completes'): for x in ('starts', 'ends', 'completes'):
for idx in data[x]: for idx in data[x]:
@ -202,6 +207,9 @@ class TBS(object): # {{{
for l in depth_map.itervalues(): for l in depth_map.itervalues():
l.sort(key=lambda x:x.offset) l.sort(key=lambda x:x.offset)
self.periodical_tbs(data, first, depth_map) self.periodical_tbs(data, first, depth_map)
else:
if not data:
self.bytestring = encode_trailing_data(b'')
else: else:
self.book_tbs(data, first) self.book_tbs(data, first)
@ -240,15 +248,13 @@ class TBS(object): # {{{
# has section transitions # has section transitions
if depth_map[2]: if depth_map[2]:
parent_section_index = depth_map[2][0].parent_index parent_section_index = depth_map[2][0].parent_index
typ = self.type_011
else: else:
parent_section_index = depth_map[1][0].index parent_section_index = depth_map[1][0].index
typ = (self.type_110 if parent_section_index == 1 else typ = self.type_011
self.type_011)
buf.write(typ) buf.write(typ)
if parent_section_index > 1: if typ not in (self.type_110, self.type_111) and parent_section_index > 0:
# Write starting section information # Write starting section information
if spanner is None: if spanner is None:
num_articles = len(depth_map[1]) num_articles = len(depth_map[1])
@ -429,9 +435,8 @@ class Indexer(object): # {{{
# Index Encoding 28-32 # Index Encoding 28-32
buf.write(pack(b'>I', 65001)) # utf-8 buf.write(pack(b'>I', 65001)) # utf-8
# Index language 32-36 # Unknown 32-36
buf.write(iana2mobi( buf.write(b'\xff'*4)
str(self.oeb.metadata.language[0])))
# Number of index entries 36-40 # Number of index entries 36-40
buf.write(pack(b'>I', len(self.indices))) buf.write(pack(b'>I', len(self.indices)))
@ -680,15 +685,20 @@ class Indexer(object): # {{{
found_node = False found_node = False
sections = [i for i in self.indices if i.depth == 1] sections = [i for i in self.indices if i.depth == 1]
deepest = max(i.depth for i in self.indices) deepest = max(i.depth for i in self.indices)
for i in xrange(self.number_of_text_records): for i in xrange(self.number_of_text_records):
offset = i * RECORD_SIZE offset = i * RECORD_SIZE
next_offset = offset + RECORD_SIZE next_offset = offset + RECORD_SIZE
data = OrderedDict([('ends',[]), ('completes',[]), ('starts',[]), data = {'ends':[], 'completes':[], 'starts':[],
('spans', None), ('offset', offset)]) 'spans':None, 'offset':offset, 'record_number':i+1}
for index in self.indices: for index in self.indices:
if index.offset >= next_offset: if index.offset >= next_offset:
# Node starts after current record # Node starts after current record
break if index.depth == deepest:
break
else:
continue
if index.next_offset <= offset: if index.next_offset <= offset:
# Node ends before current record # Node ends before current record
continue continue
@ -706,13 +716,15 @@ class Indexer(object): # {{{
data['ends'].append(index) data['ends'].append(index)
elif index.depth == deepest: elif index.depth == deepest:
data['spans'] = index data['spans'] = index
if (data['ends'] or data['completes'] or data['starts'] or if (data['ends'] or data['completes'] or data['starts'] or
data['spans'] is not None): data['spans'] is not None):
self.tbs_map[i+1] = TBS(data, self.is_periodical, first=not self.tbs_map[i+1] = TBS(data, self.is_periodical, first=not
found_node, all_sections=sections) found_node, all_sections=sections)
found_node = True found_node = True
else: else:
self.tbs_map[i+1] = TBS({}, self.is_periodical, first=False) self.tbs_map[i+1] = TBS({}, self.is_periodical, first=False,
after_first=found_node)
def get_trailing_byte_sequence(self, num): def get_trailing_byte_sequence(self, num):
return self.tbs_map[num].bytestring return self.tbs_map[num].bytestring