Mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-09 03:04:10 -04:00)

Commit ae6f049792, parent 0ab0246048
@@ -424,12 +424,7 @@ class IndexHeader(object): # {{{
         if self.index_encoding == 'unknown':
             raise ValueError(
                 'Unknown index encoding: %d'%self.index_encoding_num)
-        self.locale_raw, = struct.unpack(b'>I', raw[32:36])
-        langcode = self.locale_raw
-        langid = langcode & 0xFF
-        sublangid = (langcode >> 10) & 0xFF
-        self.language = main_language.get(langid, 'ENGLISH')
-        self.sublanguage = sub_language.get(sublangid, 'NEUTRAL')
+        self.possibly_language = raw[32:36]
         self.num_index_entries, = struct.unpack('>I', raw[36:40])
         self.ordt_start, = struct.unpack('>I', raw[40:44])
         self.ligt_start, = struct.unpack('>I', raw[44:48])
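For reference, a minimal standalone sketch of the locale decoding that the removed lines performed on header bytes 32-36. The `main_language`/`sub_language` dicts below are placeholder tables for illustration only, not calibre's real mappings.

```python
import struct

# Placeholder lookup tables for illustration only; calibre's real tables
# are not reproduced here.
main_language = {9: 'ENGLISH'}
sub_language = {1: 'NEUTRAL'}

def decode_locale(raw):
    # Bytes 32-36 of the INDX header as a big-endian unsigned int.
    locale_raw, = struct.unpack(b'>I', raw[32:36])
    langid = locale_raw & 0xFF             # low 8 bits: primary language id
    sublangid = (locale_raw >> 10) & 0xFF  # bits 10-17: sub-language id
    return (main_language.get(langid, 'ENGLISH'),
            sub_language.get(sublangid, 'NEUTRAL'))

# Example: 0x00000409 (the Windows LCID for en-US) yields langid 9, sublangid 1.
```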
@@ -489,8 +484,7 @@ class IndexHeader(object): # {{{
         a('Number of index records: %d'%self.index_count)
         a('Index encoding: %s (%d)'%(self.index_encoding,
             self.index_encoding_num))
-        a('Index language: %s - %s (%s)'%(self.language, self.sublanguage,
-            hex(self.locale_raw)))
+        a('Unknown (possibly language?): %r'%(self.possibly_language))
         a('Number of index entries: %d'% self.num_index_entries)
         a('ORDT start: %d'%self.ordt_start)
         a('LIGT start: %d'%self.ligt_start)
@@ -1038,6 +1032,7 @@ class TBSIndexing(object): # {{{
 # }}}
 
 def read_starting_section(byts): # {{{
+    orig = byts
     si, extra, consumed = decode_tbs(byts)
     byts = byts[consumed:]
     if len(extra) > 1 or 0b0010 in extra or 0b1000 in extra:
@@ -1054,7 +1049,7 @@ class TBSIndexing(object): # {{{
     eof = extra[0b0001]
     if eof != 0:
         raise ValueError('Unknown eof value %s when reading'
-                ' starting section'%eof)
+                ' starting section. All bytes: %r'%(eof, orig))
     ans.append('This record is spanned by an article from'
             ' the section: %d'%si.index)
     return si, byts
@@ -15,7 +15,6 @@ from collections import OrderedDict, defaultdict
 from calibre.ebooks.mobi.writer2 import RECORD_SIZE
 from calibre.ebooks.mobi.utils import (encint, encode_number_as_hex,
         encode_trailing_data, encode_tbs, align_block, utf8_text)
-from calibre.ebooks.mobi.langcodes import iana2mobi
 
 
 class CNCX(object): # {{{
@@ -173,28 +172,34 @@ class TBS(object): # {{{
     trailing byte sequence for the record.
     '''
 
-    def __init__(self, data, is_periodical, first=False, all_sections=[]):
-        if not data:
-            self.bytestring = encode_trailing_data(b'')
-        else:
-            self.section_map = OrderedDict((i.index, i) for i in
-                    sorted(all_sections, key=lambda x:x.offset))
+    def __init__(self, data, is_periodical, first=False, all_sections=[],
+            after_first=False):
+        self.section_map = OrderedDict((i.index, i) for i in
+                sorted(all_sections, key=lambda x:x.offset))
 
         if is_periodical:
             # The starting bytes.
             # The value is zero which I think indicates the periodical
             # index entry. The values for the various flags seem to be
             # unused. If the 0b100 is present, it means that the record
             # deals with section 1 (or is the final record with section
             # transitions).
             self.type_010 = encode_tbs(0, {0b010: 0}, flag_size=3)
             self.type_011 = encode_tbs(0, {0b010: 0, 0b001: 0},
                     flag_size=3)
             self.type_110 = encode_tbs(0, {0b100: 2, 0b010: 0},
                     flag_size=3)
             self.type_111 = encode_tbs(0, {0b100: 2, 0b010: 0, 0b001:
                 0}, flag_size=3)
 
+            if not data:
+                byts = b''
+                if after_first:
+                    # This can happen if a record contains only text between
+                    # the periodical start and the first section
+                    byts = self.type_011
+                self.bytestring = encode_trailing_data(byts)
+            else:
                 depth_map = defaultdict(list)
                 for x in ('starts', 'ends', 'completes'):
                     for idx in data[x]:
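The constructor now always builds `section_map` (previously this was skipped for records with no index data). A self-contained illustration of that expression, using a hypothetical `Section` namedtuple in place of calibre's index entry objects:

```python
from collections import OrderedDict, namedtuple

# Hypothetical stand-in for calibre's index entry objects; only the two
# attributes the expression uses are modelled.
Section = namedtuple('Section', 'index offset')

all_sections = [Section(index=2, offset=9000), Section(index=1, offset=300)]
section_map = OrderedDict((i.index, i) for i in
        sorted(all_sections, key=lambda x: x.offset))

# Sections end up keyed by index, ordered by their position in the text:
# OrderedDict([(1, Section(index=1, offset=300)),
#              (2, Section(index=2, offset=9000))])
```

The other behavioural change in this hunk is the new `after_first` flag: an index-less record that falls between the periodical start and the first section now gets the `type_011` sequence as its trailing bytes instead of an empty entry.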
@@ -202,6 +207,9 @@ class TBS(object): # {{{
                 for l in depth_map.itervalues():
                     l.sort(key=lambda x:x.offset)
                 self.periodical_tbs(data, first, depth_map)
+        else:
+            if not data:
+                self.bytestring = encode_trailing_data(b'')
             else:
                 self.book_tbs(data, first)
 
@@ -240,15 +248,13 @@ class TBS(object): # {{{
                 # has section transitions
                 if depth_map[2]:
                     parent_section_index = depth_map[2][0].parent_index
-                    typ = self.type_011
                 else:
                     parent_section_index = depth_map[1][0].index
-                    typ = (self.type_110 if parent_section_index == 1 else
-                            self.type_011)
+                typ = self.type_011
 
         buf.write(typ)
 
-        if parent_section_index > 1:
+        if typ not in (self.type_110, self.type_111) and parent_section_index > 0:
             # Write starting section information
             if spanner is None:
                 num_articles = len(depth_map[1])
@@ -429,9 +435,8 @@ class Indexer(object): # {{{
         # Index Encoding 28-32
         buf.write(pack(b'>I', 65001)) # utf-8
 
-        # Index language 32-36
-        buf.write(iana2mobi(
-            str(self.oeb.metadata.language[0])))
+        # Unknown 32-36
+        buf.write(b'\xff'*4)
 
         # Number of index entries 36-40
         buf.write(pack(b'>I', len(self.indices)))
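A minimal sketch of header bytes 28-40 as they are written after this change. Values come from the hunk above; the entry count is illustrative, and this is not the writer's actual buffer handling.

```python
from io import BytesIO
from struct import pack

buf = BytesIO()
buf.write(pack(b'>I', 65001))   # 28-32: index encoding, UTF-8 code page
buf.write(b'\xff' * 4)          # 32-36: formerly the MOBI language code,
                                #        now written as unknown/0xFF
buf.write(pack(b'>I', 42))      # 36-40: number of index entries (illustrative)

assert buf.getvalue()[4:8] == b'\xff\xff\xff\xff'
```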
@@ -680,15 +685,20 @@ class Indexer(object): # {{{
         found_node = False
         sections = [i for i in self.indices if i.depth == 1]
         deepest = max(i.depth for i in self.indices)
+
         for i in xrange(self.number_of_text_records):
             offset = i * RECORD_SIZE
             next_offset = offset + RECORD_SIZE
-            data = OrderedDict([('ends',[]), ('completes',[]), ('starts',[]),
-                ('spans', None), ('offset', offset)])
+            data = {'ends':[], 'completes':[], 'starts':[],
+                    'spans':None, 'offset':offset, 'record_number':i+1}
+
             for index in self.indices:
                 if index.offset >= next_offset:
                     # Node starts after current record
-                    break
+                    if index.depth == deepest:
+                        break
+                    else:
+                        continue
                 if index.next_offset <= offset:
                     # Node ends before current record
                     continue
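For orientation, the per-record classification dict that now gets handed to TBS (keys as in the hunk above; RECORD_SIZE is assumed here to be the usual 4096-byte MOBI text record size):

```python
RECORD_SIZE = 4096          # assumed MOBI text record size
i = 3                       # illustrative: the fourth text record
offset = i * RECORD_SIZE

data = {'ends': [], 'completes': [], 'starts': [],
        'spans': None, 'offset': offset, 'record_number': i + 1}

# Each index node is then filed under 'starts', 'ends' or 'completes'
# according to how it overlaps [offset, offset + RECORD_SIZE), and a
# deepest-level node that spans the record becomes data['spans'].
```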
@@ -706,13 +716,15 @@ class Indexer(object): # {{{
                     data['ends'].append(index)
                 elif index.depth == deepest:
                     data['spans'] = index
+
             if (data['ends'] or data['completes'] or data['starts'] or
                     data['spans'] is not None):
                 self.tbs_map[i+1] = TBS(data, self.is_periodical, first=not
                     found_node, all_sections=sections)
                 found_node = True
             else:
-                self.tbs_map[i+1] = TBS({}, self.is_periodical, first=False)
+                self.tbs_map[i+1] = TBS({}, self.is_periodical, first=False,
+                        after_first=found_node)
 
     def get_trailing_byte_sequence(self, num):
         return self.tbs_map[num].bytestring