py3: Various MOBI fixes found while reviewing the previous py3 merge

This commit is contained in:
Kovid Goyal 2019-07-07 18:14:13 +05:30
parent 134692af38
commit b86e9f0f27
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
8 changed files with 56 additions and 28 deletions

View File

@ -28,6 +28,7 @@ class PalmDOCAttributes(object):
def __str__(self):
    """Return ``'<name>: <flag>'`` where the flag is the truthiness of ``self.val``."""
    is_set = bool(self.val)
    return '%s: %s' % (self.name, is_set)
# The unicode conversion hook shares the same implementation.
__unicode__ = __str__
def __init__(self, raw):
self.val = struct.unpack(b'<H', raw)[0]
@ -44,6 +45,7 @@ class PalmDOCAttributes(object):
def __str__(self):
    """Return a summary: the binary form of ``self.val`` followed by one
    tab-indented line per entry of ``self.attributes``."""
    rendered = []
    for attribute in self.attributes:
        rendered.append(unicode_type(attribute))
    joined = '\n\t'.join(rendered)
    return 'PalmDOC Attributes: %s\n\t%s' % (bin(self.val), joined)
# The unicode conversion hook shares the same implementation.
__unicode__ = __str__
class PalmDB(object):
@ -102,6 +104,7 @@ class PalmDB(object):
ans.append('Number of records: %s'%self.number_of_records)
return '\n'.join(ans)
__unicode__ = __str__
# }}}
@ -257,6 +260,8 @@ class EXTHHeader(object):
for r in self.records:
ans.append(unicode_type(r))
return '\n'.join(ans)
__unicode__ = __str__
# }}}

View File

@ -19,7 +19,7 @@ from calibre.ebooks.mobi.utils import (decode_hex_number, decint,
from calibre.utils.imghdr import what
from calibre.ebooks.mobi.debug import format_bytes
from calibre.ebooks.mobi.debug.headers import TextRecord
from polyglot.builtins import unicode_type, range, iteritems, as_bytes
from polyglot.builtins import unicode_type, range, iteritems, as_bytes, print_to_binary_file
class TagX(object): # {{{
@ -583,7 +583,7 @@ class TBSIndexing(object): # {{{
types[tbs_type] += strings
for typ, strings in iteritems(types):
with open(os.path.join(bdir, 'tbs_type_%d.txt'%typ), 'wb') as f:
f.write('\n'.join(strings))
f.write(as_bytes('\n'.join(strings)))
def dump_record(self, r, dat):
ans = []
@ -788,14 +788,15 @@ class MOBIFile(object): # {{{
self.index_record.indices, self.mobi_header.type_raw)
def print_header(self, f=sys.stdout):
print(unicode_type(self.palmdb).encode('utf-8'), file=f)
print(file=f)
print('Record headers:', file=f)
p = print_to_binary_file(f)
p(unicode_type(self.palmdb))
p()
p('Record headers:')
for i, r in enumerate(self.records):
print('%6d. %s'%(i, r.header), file=f)
p('%6d. %s'%(i, r.header))
print(file=f)
print(unicode_type(self.mobi_header).encode('utf-8'), file=f)
p()
p(unicode_type(self.mobi_header))
# }}}
@ -820,18 +821,20 @@ def inspect_mobi(mobi_file, ddir):
if f.index_header is not None:
f.index_record.alltext = alltext
with open(os.path.join(ddir, 'index.txt'), 'wb') as out:
print = print_to_binary_file(out)
print(unicode_type(f.index_header), file=out)
print('\n\n', file=out)
if f.secondary_index_header is not None:
print(unicode_type(f.secondary_index_header).encode('utf-8'), file=out)
print(unicode_type(f.secondary_index_header), file=out)
print('\n\n', file=out)
if f.secondary_index_record is not None:
print(unicode_type(f.secondary_index_record).encode('utf-8'), file=out)
print(unicode_type(f.secondary_index_record), file=out)
print('\n\n', file=out)
print(unicode_type(f.cncx).encode('utf-8'), file=out)
print(unicode_type(f.cncx), file=out)
print('\n\n', file=out)
print(unicode_type(f.index_record), file=out)
with open(os.path.join(ddir, 'tbs_indexing.txt'), 'wb') as out:
print = print_to_binary_file(out)
print(unicode_type(f.tbs_indexing), file=out)
f.tbs_indexing.dump(ddir)

View File

@ -17,7 +17,7 @@ from calibre.ebooks.mobi.utils import read_font_record, decode_tbs, RECORD_SIZE
from calibre.ebooks.mobi.debug import format_bytes
from calibre.ebooks.mobi.reader.headers import NULL_INDEX
from calibre.utils.imghdr import what
from polyglot.builtins import iteritems, itervalues, map, unicode_type, zip
from polyglot.builtins import iteritems, itervalues, map, unicode_type, zip, print_to_binary_file
class FDST(object):
@ -94,14 +94,15 @@ class MOBIFile(object):
self.read_tbs()
def print_header(self, f=sys.stdout):
print(unicode_type(self.mf.palmdb).encode('utf-8'), file=f)
print(file=f)
print('Record headers:', file=f)
p = print_to_binary_file(f)
p(unicode_type(self.mf.palmdb))
p()
p('Record headers:')
for i, r in enumerate(self.mf.records):
print('%6d. %s'%(i, r.header), file=f)
p('%6d. %s'%(i, r.header))
print(file=f)
print(unicode_type(self.mf.mobi8_header).encode('utf-8'), file=f)
p()
p(unicode_type(self.mf.mobi8_header))
def read_fdst(self):
self.fdst = None

View File

@ -75,7 +75,7 @@ class EXTHHeader(object): # {{{
# they are messed up in the PDB header
try:
title = self.decode(content)
except:
except Exception:
pass
elif idx == 524: # Lang code
try:
@ -83,7 +83,7 @@ class EXTHHeader(object): # {{{
lang = canonicalize_lang(lang)
if lang:
self.mi.language = lang
except:
except Exception:
pass
elif idx == 525:
try:
@ -138,8 +138,8 @@ class EXTHHeader(object): # {{{
self.mi.tags = list(set(self.mi.tags))
elif idx == 106:
try:
self.mi.pubdate = parse_date(content, as_utc=False)
except:
self.mi.pubdate = parse_date(self.decode(content), as_utc=False)
except Exception:
pass
elif idx == 108:
self.mi.book_producer = clean_xml_chars(self.decode(content).strip())
@ -165,7 +165,7 @@ class EXTHHeader(object): # {{{
try:
self.uuid = content.decode('ascii')
self.mi.set_identifier('mobi-asin', self.uuid)
except:
except Exception:
self.uuid = None
elif idx == 116:
self.start_offset, = struct.unpack(b'>L', content)
@ -302,14 +302,14 @@ class MetadataHeader(BookHeader):
try:
if self.section_data(kf8_header_index-1) == b'BOUNDARY':
return 'joint'
except:
except Exception:
pass
return None
def identity(self):
self.stream.seek(60)
ident = self.stream.read(8).upper()
if ident not in [b'BOOKMOBI', b'TEXTREAD']:
if ident not in (b'BOOKMOBI', b'TEXTREAD'):
raise MobiError('Unknown book type: %s' % ident)
return ident

View File

@ -123,6 +123,9 @@ class CNCX(object): # {{{
def iteritems(self):
    """Return the result of the module-level ``iteritems`` helper applied to
    ``self.records``."""
    records = self.records
    return iteritems(records)
def items(self):
    """Same result as ``iteritems()``: the module-level ``iteritems`` helper
    applied to ``self.records``."""
    records = self.records
    return iteritems(records)
# }}}

View File

@ -89,7 +89,7 @@ class MobiReader(object):
self.num_sections, = struct.unpack('>H', raw[76:78])
self.ident = self.header[0x3C:0x3C + 8].upper()
if self.ident not in [b'BOOKMOBI', b'TEXTREAD']:
if self.ident not in (b'BOOKMOBI', b'TEXTREAD'):
raise MobiError('Unknown book type: %s' % repr(self.ident))
self.sections = []

View File

@ -223,7 +223,7 @@ def get_trailing_data(record, extra_data_flags):
if extra_data_flags & 0b1:
# Only the first two bits are used for the size since there can
# never be more than 3 trailing multibyte chars
sz = (ord(record[-1]) & 0b11) + 1
sz = (ord(record[-1:]) & 0b11) + 1
consumed = 1
if sz > consumed:
data[0] = record[-sz:-consumed]
@ -298,7 +298,7 @@ def decode_tbs(byts, flag_size=4):
extra[0b0010] = x
consumed += consumed2
if flags & 0b0100:
extra[0b0100] = ord(byts[0])
extra[0b0100] = ord(byts[0:1])
byts = byts[1:]
consumed += 1
if flags & 0b0001:

View File

@ -182,3 +182,19 @@ else:
def reload(module):
    """Reload ``module`` by delegating to ``builtins.reload`` and return
    whatever that call returns."""
    reloaded = builtins.reload(module)
    return reloaded
def print_to_binary_file(fileobj, encoding='utf-8'):
    """Build a print()-like function that writes bytes to a binary file.

    :param fileobj: Default destination; any object whose ``write()`` accepts
        bytes.
    :param encoding: Encoding handed to ``as_bytes`` for every argument, for
        the separator and for the line ending.
    :return: A function taking ``*a`` plus the optional keyword arguments
        ``file`` (overrides ``fileobj`` for that call), ``sep`` (default
        ``' '``) and ``end`` (default ``'\\n'``).
    """

    def print(*a, **kw):
        # Options are read from **kw so that ``file``/``sep``/``end`` can only
        # be given by keyword, mirroring the builtin print().
        f = kw.get('file', fileobj)
        if a:
            sep = as_bytes(kw.get('sep', ' '), encoding)
            for position, x in enumerate(a):
                # Decide by position, not identity: the separator goes
                # strictly *between* arguments. An ``is`` check against a[-1]
                # misfires once x has been encoded (trailing separator) and
                # when the same object is passed more than once (missing
                # separator).
                if position:
                    f.write(sep)
                f.write(as_bytes(x, encoding))
        # The line ending honours the requested encoding like everything else.
        f.write(as_bytes(kw.get('end', '\n'), encoding))

    return print