Fix #4230 (Can't view/convert certain Mobipocket files)

This commit is contained in:
Kovid Goyal 2009-12-16 15:32:21 -07:00
parent 3737a93493
commit 40cb977203
2 changed files with 15 additions and 7 deletions

View File

@@ -16,10 +16,16 @@ class MOBIInput(InputFormatPlugin):
accelerators): accelerators):
from calibre.ebooks.mobi.reader import MobiReader from calibre.ebooks.mobi.reader import MobiReader
from lxml import html from lxml import html
mr = MobiReader(stream, log, options.input_encoding,
options.debug_pipeline)
parse_cache = {} parse_cache = {}
mr.extract_content('.', parse_cache) try:
mr = MobiReader(stream, log, options.input_encoding,
options.debug_pipeline)
mr.extract_content('.', parse_cache)
except:
mr = MobiReader(stream, log, options.input_encoding,
options.debug_pipeline, try_extra_data_fix=True)
mr.extract_content('.', parse_cache)
raw = parse_cache.pop('calibre_raw_mobi_markup', False) raw = parse_cache.pop('calibre_raw_mobi_markup', False)
if raw: if raw:
if isinstance(raw, unicode): if isinstance(raw, unicode):

View File

@@ -108,7 +108,7 @@ class EXTHHeader(object):
class BookHeader(object): class BookHeader(object):
def __init__(self, raw, ident, user_encoding, log): def __init__(self, raw, ident, user_encoding, log, try_extra_data_fix=False):
self.log = log self.log = log
self.compression_type = raw[:2] self.compression_type = raw[:2]
self.records, self.records_size = struct.unpack('>HH', raw[8:12]) self.records, self.records_size = struct.unpack('>HH', raw[8:12])
@@ -141,7 +141,8 @@ class BookHeader(object):
self.codec = 'cp1252' if user_encoding is None else user_encoding self.codec = 'cp1252' if user_encoding is None else user_encoding
log.warn('Unknown codepage %d. Assuming %s' % (self.codepage, log.warn('Unknown codepage %d. Assuming %s' % (self.codepage,
self.codec)) self.codec))
if ident == 'TEXTREAD' or self.length < 0xE4 or 0xE8 < self.length: if ident == 'TEXTREAD' or self.length < 0xE4 or 0xE8 < self.length \
or (try_extra_data_fix and self.length == 0xE4):
self.extra_flags = 0 self.extra_flags = 0
else: else:
self.extra_flags, = struct.unpack('>H', raw[0xF2:0xF4]) self.extra_flags, = struct.unpack('>H', raw[0xF2:0xF4])
@@ -229,7 +230,8 @@ class MobiReader(object):
PAGE_BREAK_PAT = re.compile(r'(<[/]{0,1}mbp:pagebreak\s*[/]{0,1}>)+', re.IGNORECASE) PAGE_BREAK_PAT = re.compile(r'(<[/]{0,1}mbp:pagebreak\s*[/]{0,1}>)+', re.IGNORECASE)
IMAGE_ATTRS = ('lowrecindex', 'recindex', 'hirecindex') IMAGE_ATTRS = ('lowrecindex', 'recindex', 'hirecindex')
def __init__(self, filename_or_stream, log, user_encoding=None, debug=None): def __init__(self, filename_or_stream, log, user_encoding=None, debug=None,
try_extra_data_fix=False):
self.log = log self.log = log
self.debug = debug self.debug = debug
self.embedded_mi = None self.embedded_mi = None
@@ -284,7 +286,7 @@ class MobiReader(object):
self.book_header = BookHeader(self.sections[0][0], self.ident, self.book_header = BookHeader(self.sections[0][0], self.ident,
user_encoding, self.log) user_encoding, self.log, try_extra_data_fix=try_extra_data_fix)
self.name = self.name.decode(self.book_header.codec, 'replace') self.name = self.name.decode(self.book_header.codec, 'replace')
def extract_content(self, output_dir, parse_cache): def extract_content(self, output_dir, parse_cache):