py3: Various MOBI fixes found while reviewing the previous py3 merge

This commit is contained in:
Kovid Goyal 2019-07-07 18:14:13 +05:30
parent 134692af38
commit b86e9f0f27
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
8 changed files with 56 additions and 28 deletions

View File

@ -28,6 +28,7 @@ class PalmDOCAttributes(object):
def __str__(self): def __str__(self):
return '%s: %s'%(self.name, bool(self.val)) return '%s: %s'%(self.name, bool(self.val))
__unicode__ = __str__
def __init__(self, raw): def __init__(self, raw):
self.val = struct.unpack(b'<H', raw)[0] self.val = struct.unpack(b'<H', raw)[0]
@ -44,6 +45,7 @@ class PalmDOCAttributes(object):
def __str__(self): def __str__(self):
attrs = '\n\t'.join([unicode_type(x) for x in self.attributes]) attrs = '\n\t'.join([unicode_type(x) for x in self.attributes])
return 'PalmDOC Attributes: %s\n\t%s'%(bin(self.val), attrs) return 'PalmDOC Attributes: %s\n\t%s'%(bin(self.val), attrs)
__unicode__ = __str__
class PalmDB(object): class PalmDB(object):
@ -102,6 +104,7 @@ class PalmDB(object):
ans.append('Number of records: %s'%self.number_of_records) ans.append('Number of records: %s'%self.number_of_records)
return '\n'.join(ans) return '\n'.join(ans)
__unicode__ = __str__
# }}} # }}}
@ -257,6 +260,8 @@ class EXTHHeader(object):
for r in self.records: for r in self.records:
ans.append(unicode_type(r)) ans.append(unicode_type(r))
return '\n'.join(ans) return '\n'.join(ans)
__unicode__ = __str__
# }}} # }}}

View File

@ -19,7 +19,7 @@ from calibre.ebooks.mobi.utils import (decode_hex_number, decint,
from calibre.utils.imghdr import what from calibre.utils.imghdr import what
from calibre.ebooks.mobi.debug import format_bytes from calibre.ebooks.mobi.debug import format_bytes
from calibre.ebooks.mobi.debug.headers import TextRecord from calibre.ebooks.mobi.debug.headers import TextRecord
from polyglot.builtins import unicode_type, range, iteritems, as_bytes from polyglot.builtins import unicode_type, range, iteritems, as_bytes, print_to_binary_file
class TagX(object): # {{{ class TagX(object): # {{{
@ -583,7 +583,7 @@ class TBSIndexing(object): # {{{
types[tbs_type] += strings types[tbs_type] += strings
for typ, strings in iteritems(types): for typ, strings in iteritems(types):
with open(os.path.join(bdir, 'tbs_type_%d.txt'%typ), 'wb') as f: with open(os.path.join(bdir, 'tbs_type_%d.txt'%typ), 'wb') as f:
f.write('\n'.join(strings)) f.write(as_bytes('\n'.join(strings)))
def dump_record(self, r, dat): def dump_record(self, r, dat):
ans = [] ans = []
@ -788,14 +788,15 @@ class MOBIFile(object): # {{{
self.index_record.indices, self.mobi_header.type_raw) self.index_record.indices, self.mobi_header.type_raw)
def print_header(self, f=sys.stdout): def print_header(self, f=sys.stdout):
print(unicode_type(self.palmdb).encode('utf-8'), file=f) p = print_to_binary_file(f)
print(file=f) p(unicode_type(self.palmdb))
print('Record headers:', file=f) p()
p('Record headers:')
for i, r in enumerate(self.records): for i, r in enumerate(self.records):
print('%6d. %s'%(i, r.header), file=f) p('%6d. %s'%(i, r.header))
print(file=f) p()
print(unicode_type(self.mobi_header).encode('utf-8'), file=f) p(unicode_type(self.mobi_header))
# }}} # }}}
@ -820,18 +821,20 @@ def inspect_mobi(mobi_file, ddir):
if f.index_header is not None: if f.index_header is not None:
f.index_record.alltext = alltext f.index_record.alltext = alltext
with open(os.path.join(ddir, 'index.txt'), 'wb') as out: with open(os.path.join(ddir, 'index.txt'), 'wb') as out:
print = print_to_binary_file(out)
print(unicode_type(f.index_header), file=out) print(unicode_type(f.index_header), file=out)
print('\n\n', file=out) print('\n\n', file=out)
if f.secondary_index_header is not None: if f.secondary_index_header is not None:
print(unicode_type(f.secondary_index_header).encode('utf-8'), file=out) print(unicode_type(f.secondary_index_header), file=out)
print('\n\n', file=out) print('\n\n', file=out)
if f.secondary_index_record is not None: if f.secondary_index_record is not None:
print(unicode_type(f.secondary_index_record).encode('utf-8'), file=out) print(unicode_type(f.secondary_index_record), file=out)
print('\n\n', file=out) print('\n\n', file=out)
print(unicode_type(f.cncx).encode('utf-8'), file=out) print(unicode_type(f.cncx), file=out)
print('\n\n', file=out) print('\n\n', file=out)
print(unicode_type(f.index_record), file=out) print(unicode_type(f.index_record), file=out)
with open(os.path.join(ddir, 'tbs_indexing.txt'), 'wb') as out: with open(os.path.join(ddir, 'tbs_indexing.txt'), 'wb') as out:
print = print_to_binary_file(out)
print(unicode_type(f.tbs_indexing), file=out) print(unicode_type(f.tbs_indexing), file=out)
f.tbs_indexing.dump(ddir) f.tbs_indexing.dump(ddir)

View File

@ -17,7 +17,7 @@ from calibre.ebooks.mobi.utils import read_font_record, decode_tbs, RECORD_SIZE
from calibre.ebooks.mobi.debug import format_bytes from calibre.ebooks.mobi.debug import format_bytes
from calibre.ebooks.mobi.reader.headers import NULL_INDEX from calibre.ebooks.mobi.reader.headers import NULL_INDEX
from calibre.utils.imghdr import what from calibre.utils.imghdr import what
from polyglot.builtins import iteritems, itervalues, map, unicode_type, zip from polyglot.builtins import iteritems, itervalues, map, unicode_type, zip, print_to_binary_file
class FDST(object): class FDST(object):
@ -94,14 +94,15 @@ class MOBIFile(object):
self.read_tbs() self.read_tbs()
def print_header(self, f=sys.stdout): def print_header(self, f=sys.stdout):
print(unicode_type(self.mf.palmdb).encode('utf-8'), file=f) p = print_to_binary_file(f)
print(file=f) p(unicode_type(self.mf.palmdb))
print('Record headers:', file=f) p()
p('Record headers:')
for i, r in enumerate(self.mf.records): for i, r in enumerate(self.mf.records):
print('%6d. %s'%(i, r.header), file=f) p('%6d. %s'%(i, r.header))
print(file=f) p()
print(unicode_type(self.mf.mobi8_header).encode('utf-8'), file=f) p(unicode_type(self.mf.mobi8_header))
def read_fdst(self): def read_fdst(self):
self.fdst = None self.fdst = None

View File

@ -75,7 +75,7 @@ class EXTHHeader(object): # {{{
# they are messed up in the PDB header # they are messed up in the PDB header
try: try:
title = self.decode(content) title = self.decode(content)
except: except Exception:
pass pass
elif idx == 524: # Lang code elif idx == 524: # Lang code
try: try:
@ -83,7 +83,7 @@ class EXTHHeader(object): # {{{
lang = canonicalize_lang(lang) lang = canonicalize_lang(lang)
if lang: if lang:
self.mi.language = lang self.mi.language = lang
except: except Exception:
pass pass
elif idx == 525: elif idx == 525:
try: try:
@ -138,8 +138,8 @@ class EXTHHeader(object): # {{{
self.mi.tags = list(set(self.mi.tags)) self.mi.tags = list(set(self.mi.tags))
elif idx == 106: elif idx == 106:
try: try:
self.mi.pubdate = parse_date(content, as_utc=False) self.mi.pubdate = parse_date(self.decode(content), as_utc=False)
except: except Exception:
pass pass
elif idx == 108: elif idx == 108:
self.mi.book_producer = clean_xml_chars(self.decode(content).strip()) self.mi.book_producer = clean_xml_chars(self.decode(content).strip())
@ -165,7 +165,7 @@ class EXTHHeader(object): # {{{
try: try:
self.uuid = content.decode('ascii') self.uuid = content.decode('ascii')
self.mi.set_identifier('mobi-asin', self.uuid) self.mi.set_identifier('mobi-asin', self.uuid)
except: except Exception:
self.uuid = None self.uuid = None
elif idx == 116: elif idx == 116:
self.start_offset, = struct.unpack(b'>L', content) self.start_offset, = struct.unpack(b'>L', content)
@ -302,14 +302,14 @@ class MetadataHeader(BookHeader):
try: try:
if self.section_data(kf8_header_index-1) == b'BOUNDARY': if self.section_data(kf8_header_index-1) == b'BOUNDARY':
return 'joint' return 'joint'
except: except Exception:
pass pass
return None return None
def identity(self): def identity(self):
self.stream.seek(60) self.stream.seek(60)
ident = self.stream.read(8).upper() ident = self.stream.read(8).upper()
if ident not in [b'BOOKMOBI', b'TEXTREAD']: if ident not in (b'BOOKMOBI', b'TEXTREAD'):
raise MobiError('Unknown book type: %s' % ident) raise MobiError('Unknown book type: %s' % ident)
return ident return ident

View File

@ -123,6 +123,9 @@ class CNCX(object): # {{{
def iteritems(self): def iteritems(self):
return iteritems(self.records) return iteritems(self.records)
def items(self):
    '''Alias of the iteritems() method: hands self.records to iteritems().'''
    records = self.records
    return iteritems(records)
# }}} # }}}

View File

@ -89,7 +89,7 @@ class MobiReader(object):
self.num_sections, = struct.unpack('>H', raw[76:78]) self.num_sections, = struct.unpack('>H', raw[76:78])
self.ident = self.header[0x3C:0x3C + 8].upper() self.ident = self.header[0x3C:0x3C + 8].upper()
if self.ident not in [b'BOOKMOBI', b'TEXTREAD']: if self.ident not in (b'BOOKMOBI', b'TEXTREAD'):
raise MobiError('Unknown book type: %s' % repr(self.ident)) raise MobiError('Unknown book type: %s' % repr(self.ident))
self.sections = [] self.sections = []

View File

@ -223,7 +223,7 @@ def get_trailing_data(record, extra_data_flags):
if extra_data_flags & 0b1: if extra_data_flags & 0b1:
# Only the first two bits are used for the size since there can # Only the first two bits are used for the size since there can
# never be more than 3 trailing multibyte chars # never be more than 3 trailing multibyte chars
sz = (ord(record[-1]) & 0b11) + 1 sz = (ord(record[-1:]) & 0b11) + 1
consumed = 1 consumed = 1
if sz > consumed: if sz > consumed:
data[0] = record[-sz:-consumed] data[0] = record[-sz:-consumed]
@ -298,7 +298,7 @@ def decode_tbs(byts, flag_size=4):
extra[0b0010] = x extra[0b0010] = x
consumed += consumed2 consumed += consumed2
if flags & 0b0100: if flags & 0b0100:
extra[0b0100] = ord(byts[0]) extra[0b0100] = ord(byts[0:1])
byts = byts[1:] byts = byts[1:]
consumed += 1 consumed += 1
if flags & 0b0001: if flags & 0b0001:

View File

@ -182,3 +182,19 @@ else:
def reload(module): def reload(module):
return builtins.reload(module) return builtins.reload(module)
def print_to_binary_file(fileobj, encoding='utf-8'):
    '''
    Build a print()-like callable that writes encoded bytes to a binary file.

    :param fileobj: Default destination; must accept bytes in ``write()``.
    :param encoding: Encoding handed to ``as_bytes()`` for the arguments,
        the separator and the line ending.
    :return: A function accepting positional values plus the optional
        ``file``, ``sep`` and ``end`` keyword arguments, mirroring the
        builtin ``print()``.
    '''

    def print(*a, **kw):
        # An explicit file= keyword overrides the default destination.
        f = kw.get('file', fileobj)
        if a:
            sep = as_bytes(kw.get('sep', ' '), encoding)
            # Join so the separator appears only *between* arguments. The
            # previous identity test against a[-1] ran after the value had
            # been converted, so it emitted a trailing separator for text
            # arguments and could drop one for repeated bytes objects.
            # NOTE(review): relies on as_bytes() returning bytes -- confirm.
            f.write(sep.join(as_bytes(x, encoding) for x in a))
        # Encode the line ending with the same encoding as everything else.
        f.write(as_bytes(kw.get('end', '\n'), encoding))

    return print