Mirror of https://github.com/kovidgoyal/calibre.git
MOBI inspect: Handle indexing data from amazon produced periodicals as well
commit 5b5421d21e
parent fbc770710b
@@ -18,6 +18,11 @@ from calibre.ebooks.mobi.utils import (decode_hex_number, decint,
         get_trailing_data, decode_tbs)
 from calibre.utils.magick.draw import identify_data
 
+def format_bytes(byts):
+    byts = bytearray(byts)
+    byts = [hex(b)[2:] for b in byts]
+    return ' '.join(byts)
+
 # PalmDB {{{
 class PalmDOCAttributes(object):
 
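
Note: the format_bytes() helper added above renders a byte string as space-separated hex digits; it is used further down to report undecodable CNCX entries and leftover index bytes. A minimal sketch of its behaviour (the input value is illustrative, not from the diff):

    >>> format_bytes(b'\x00\x1fA')
    '0 1f 41'
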
@@ -76,7 +81,7 @@ class PalmDB(object):
         self.ident = self.type + self.creator
         if self.ident not in (b'BOOKMOBI', b'TEXTREAD'):
             raise ValueError('Unknown book ident: %r'%self.ident)
-        self.uid_seed, = struct.unpack(b'>I', self.raw[68:72])
+        self.last_record_uid, = struct.unpack(b'>I', self.raw[68:72])
         self.next_rec_list_id = self.raw[72:76]
 
         self.number_of_records, = struct.unpack(b'>H', self.raw[76:78])
@@ -97,7 +102,7 @@ class PalmDB(object):
         ans.append('Sort Info ID: %r'%self.sort_info_id)
         ans.append('Type: %r'%self.type)
         ans.append('Creator: %r'%self.creator)
-        ans.append('UID seed: %r'%self.uid_seed)
+        ans.append('Last record UID +1: %r'%self.last_record_uid)
         ans.append('Next record list id: %r'%self.next_rec_list_id)
         ans.append('Number of records: %s'%self.number_of_records)
 
@@ -535,6 +540,15 @@ class Tag(object): # {{{
                 21 : ('Parent section index', 'parent_index'),
                 22 : ('Description offset in cncx', 'desc_offset'),
                 23 : ('Author offset in cncx', 'author_offset'),
+                69 : ('Offset from first image record num to the'
+                    ' image record associated with this article',
+                    'image_index'),
+                70 : ('Description offset in cncx', 'desc_offset'),
+                71 : ('Image attribution offset in cncx',
+                    'image_attr_offset'),
+                72 : ('Image caption offset in cncx',
+                    'image_caption_offset'),
+                73 : ('Author offset in cncx', 'author_offset'),
             },
 
             'chapter_with_subchapters' : {
@@ -546,6 +560,8 @@ class Tag(object): # {{{
                 5 : ('Class offset in cncx', 'class_offset'),
                 22 : ('First section index', 'first_child_index'),
                 23 : ('Last section index', 'last_child_index'),
+                69 : ('Offset from first image record num to masthead'
+                    ' record', 'image_index'),
             },
 
             'section' : {
@@ -558,21 +574,23 @@ class Tag(object): # {{{
 
 
     def __init__(self, tagx, vals, entry_type, cncx):
-        self.value = vals if len(vals) > 1 else vals[0]
+        self.value = vals if len(vals) > 1 else vals[0] if vals else None
         self.entry_type = entry_type
+        tag_type = tagx.tag
+
         self.cncx_value = None
-        if tagx.tag in self.TAG_MAP:
-            self.attr, self.desc = self.TAG_MAP[tagx.tag]
+        if tag_type in self.TAG_MAP:
+            self.attr, self.desc = self.TAG_MAP[tag_type]
         else:
             try:
                 td = self.INTERPRET_MAP[entry_type]
             except:
                 raise ValueError('Unknown entry type: %s'%entry_type)
             try:
-                self.desc, self.attr = td[tagx.tag]
+                self.desc, self.attr = td[tag_type]
             except:
-                raise ValueError('Unknown tag: %d for entry type: %s'%(
-                    tagx.tag, entry_type))
+                self.desc = '??Unknown (tag value: %d)'%tag_type
+                self.attr = 'unknown'
         if '_offset' in self.attr:
             self.cncx_value = cncx[self.value]
 
@@ -603,7 +621,7 @@ class IndexEntry(object): # {{{
             0x3f : 'article',
     }
 
-    def __init__(self, ident, entry_type, raw, cncx, tagx_entries, flags=0):
+    def __init__(self, ident, entry_type, raw, cncx, tagx_entries):
         self.index = ident
         self.raw = raw
         self.tags = []
@@ -617,11 +635,25 @@ class IndexEntry(object): # {{{
         except KeyError:
             raise ValueError('Unknown Index Entry type: %s'%hex(entry_type))
 
+        self.flags = 0
+
+        if self.entry_type in ('periodical', 'article'):
+            large_tags = [t for t in tagx_entries if t.tag > 64]
+            if large_tags:
+                self.flags = ord(raw[0])
+                raw = raw[1:]
+
+
         expected_tags = [tag for tag in tagx_entries if tag.bitmask &
                 entry_type]
 
+        flags = self.flags
         for tag in expected_tags:
             vals = []
+            if tag.tag > 64:
+                has_tag = flags & 0b1
+                flags = flags >> 1
+                if not has_tag: continue
             for i in range(tag.num_values):
                 if not raw:
                     raise ValueError('Index entry does not match TAGX header')
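
Note: a minimal standalone sketch (not from the diff, with hypothetical inputs) of the flag-byte convention the code above relies on. For periodical and article entries whose TAGX table contains tag numbers above 64, the first byte of the index entry is a bitfield; each large tag consumes one bit, lowest bit first, saying whether that tag's values are present in the entry.

    # Sketch only; mirrors the bit-consuming loop in IndexEntry.__init__ above.
    def large_tags_present(flags, large_tag_numbers):
        present = []
        for tag in large_tag_numbers:      # iterate in TAGX order
            if flags & 0b1:                # lowest remaining bit gates this tag
                present.append(tag)
            flags >>= 1
        return present

    # e.g. flags == 0b101 with large tags [69, 70, 71] -> [69, 71]
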
@@ -630,26 +662,11 @@ class IndexEntry(object): # {{{
                 vals.append(val)
             self.tags.append(Tag(tag, vals, self.entry_type, cncx))
 
-        if flags & 0b10:
-            # Look for optional description and author
-            desc_tag = [t for t in tagx_entries if t.tag == 22]
-            if desc_tag and raw:
-                val, consumed = decint(raw)
-                raw = raw[consumed:]
-                if val:
-                    self.tags.append(Tag(desc_tag[0], [val], self.entry_type,
-                        cncx))
-        if flags & 0b100:
-            aut_tag = [t for t in tagx_entries if t.tag == 23]
-            if aut_tag and raw:
-                val, consumed = decint(raw)
-                raw = raw[consumed:]
-                if val:
-                    self.tags.append(Tag(aut_tag[0], [val], self.entry_type,
-                        cncx))
-
         self.consumed = len(orig_raw) - len(raw)
         self.trailing_bytes = raw
+        if self.trailing_bytes.replace(b'\0', b''):
+            raise ValueError('IndexEntry has leftover bytes: %s'%format_bytes(
+                self.trailing_bytes))
 
     @property
     def label(self):
@@ -701,10 +718,12 @@ class IndexEntry(object): # {{{
         return -1
 
     def __str__(self):
-        ans = ['Index Entry(index=%s, entry_type=%s (%s), length=%d, byte_size=%d)'%(
-            self.index, self.entry_type, bin(self.entry_type_raw)[2:],
+        ans = ['Index Entry(index=%s, entry_type=%s, flags=%s, '
+            'length=%d, byte_size=%d)'%(
+            self.index, self.entry_type, bin(self.flags)[2:],
             len(self.tags), self.byte_size)]
         for tag in self.tags:
-            ans.append('\t'+str(tag))
+            if tag.value is not None:
+                ans.append('\t'+str(tag))
         if self.first_child_index != -1:
             ans.append('\tNumber of children: %d'%(self.last_child_index -
@@ -724,6 +743,7 @@ class IndexRecord(object): # {{{
 
     def __init__(self, record, index_header, cncx):
         self.record = record
+        self.alltext = None
         raw = self.record.raw
 
         if raw[:4] != b'INDX':
@@ -763,17 +783,14 @@ class IndexRecord(object): # {{{
                 next_off = len(indxt)
             index, consumed = decode_hex_number(indxt[off:])
             entry_type = ord(indxt[off+consumed])
-            d, flags = 1, 0
-            if index_header.index_type == 6:
-                flags = ord(indxt[off+consumed+d])
-                d += 1
-            pos = off+consumed+d
-            self.indices.append(IndexEntry(index, entry_type,
+            pos = off+consumed+1
+            idxe = IndexEntry(index, entry_type,
                     indxt[pos:next_off], cncx,
-                    index_header.tagx_entries, flags=flags))
+                    index_header.tagx_entries)
+            self.indices.append(idxe)
 
             rest = indxt[pos+self.indices[-1].consumed:]
-            if rest.replace(b'\0', ''): # There can be padding null bytes
+            if rest.replace(b'\0', b''): # There can be padding null bytes
                 raise ValueError('Extra bytes after IDXT table: %r'%rest)
 
     def get_parent(self, index):
@@ -803,7 +820,11 @@ class IndexRecord(object): # {{{
         a('Index offsets: %r'%self.index_offsets)
         a('\nIndex Entries (%d entries):'%len(self.indices))
         for entry in self.indices:
-            a(str(entry)+'\n')
+            offset = entry.offset
+            a(str(entry))
+            if offset is not None and self.alltext is not None:
+                a('\tHTML at offset: %r'%self.alltext[offset:offset+100])
+            a('')
 
         return '\n'.join(ans)
 
@@ -826,8 +847,15 @@ class CNCX(object) : # {{{
         while pos < len(raw):
             length, consumed = decint(raw[pos:])
             if length > 0:
-                self.records[pos+record_offset] = raw[
-                    pos+consumed:pos+consumed+length].decode(codec)
+                try:
+                    self.records[pos+record_offset] = raw[
+                        pos+consumed:pos+consumed+length].decode(codec)
+                except:
+                    byts = raw[pos+consumed:pos+consumed+length]
+                    r = format_bytes(byts)
+                    print ('CNCX entry at offset %d has unknown format %s'%(
+                        pos+record_offset, r))
+                    self.records[pos+record_offset] = r
             pos += consumed+length
             record_offset += 0x10000
 
@@ -849,6 +877,7 @@ class TextRecord(object): # {{{
         self.trailing_data, self.raw = get_trailing_data(record.raw, extra_data_flags)
         raw_trailing_bytes = record.raw[len(self.raw):]
         self.raw = decompress(self.raw)
+
         if 0 in self.trailing_data:
             self.trailing_data['multibyte_overlap'] = self.trailing_data.pop(0)
         if 1 in self.trailing_data:
@@ -1114,13 +1143,13 @@ class MOBIFile(object): # {{{
         self.mobi_header = MOBIHeader(self.records[0])
 
         if 'huff' in self.mobi_header.compression.lower():
-            huffrecs = [r.raw for r in
+            huffrecs = [self.records[r].raw for r in
                     xrange(self.mobi_header.huffman_record_offset,
                         self.mobi_header.huffman_record_offset +
                         self.mobi_header.huffman_record_count)]
             from calibre.ebooks.mobi.huffcdic import HuffReader
             huffs = HuffReader(huffrecs)
-            decompress = huffs.decompress
+            decompress = lambda x: huffs.decompress([x])
         elif 'palmdoc' in self.mobi_header.compression.lower():
             from calibre.ebooks.compression.palmdoc import decompress_doc
             decompress = decompress_doc
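
Note: the lambda above assumes HuffReader.decompress() operates on a sequence of raw records and returns the concatenated decompressed bytes, so a single TextRecord's payload has to be wrapped in a one-element list to keep the per-record decompress(record_bytes) calling convention used by TextRecord:

    # Sketch of the calling convention assumed here; x is one record's raw bytes.
    decompress = lambda x: huffs.decompress([x])
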
@@ -1181,7 +1210,7 @@ class MOBIFile(object): # {{{
         print (str(self.mobi_header).encode('utf-8'), file=f)
 # }}}
 
-def inspect_mobi(path_or_stream, prefix='decompiled'):
+def inspect_mobi(path_or_stream, prefix='decompiled'): # {{{
     stream = (path_or_stream if hasattr(path_or_stream, 'read') else
             open(path_or_stream, 'rb'))
     f = MOBIFile(stream)
@@ -1193,7 +1222,22 @@ def inspect_mobi(path_or_stream, prefix='decompiled'):
         os.mkdir(ddir)
     with open(os.path.join(ddir, 'header.txt'), 'wb') as out:
         f.print_header(f=out)
+
+    alltext = os.path.join(ddir, 'text.html')
+    with open(alltext, 'wb') as of:
+        alltext = b''
+        for rec in f.text_records:
+            of.write(rec.raw)
+            alltext += rec.raw
+        of.seek(0)
+    root = html.fromstring(alltext.decode('utf-8'))
+    with open(os.path.join(ddir, 'pretty.html'), 'wb') as of:
+        of.write(html.tostring(root, pretty_print=True, encoding='utf-8',
+            include_meta_content_type=True))
+
+
     if f.index_header is not None:
+        f.index_record.alltext = alltext
         with open(os.path.join(ddir, 'index.txt'), 'wb') as out:
             print(str(f.index_header), file=out)
             print('\n\n', file=out)
@@ -1211,21 +1255,11 @@ def inspect_mobi(path_or_stream, prefix='decompiled'):
         for rec in getattr(f, attr):
             rec.dump(tdir)
 
-    alltext = os.path.join(ddir, 'text.html')
-    with open(alltext, 'wb') as of:
-        alltext = b''
-        for rec in f.text_records:
-            of.write(rec.raw)
-            alltext += rec.raw
-        of.seek(0)
-    root = html.fromstring(alltext.decode('utf-8'))
-    with open(os.path.join(ddir, 'pretty.html'), 'wb') as of:
-        of.write(html.tostring(root, pretty_print=True, encoding='utf-8',
-            include_meta_content_type=True))
-
 
     print ('Debug data saved to:', ddir)
 
+# }}}
+
 def main():
     inspect_mobi(sys.argv[1])
 
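
Note: a usage sketch for the inspector driven by main() above; the module import path and the 'book.mobi' file name are assumptions, not from the diff.

    # Assumes this file is importable as calibre.ebooks.mobi.debug and that
    # 'book.mobi' exists. inspect_mobi() writes header.txt, index.txt,
    # text.html, pretty.html and per-record dumps into a new directory whose
    # name is derived from the prefix argument.
    from calibre.ebooks.mobi.debug import inspect_mobi
    inspect_mobi('book.mobi', prefix='decompiled')
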