From 2b48f393598dad13787a297a2a58b72792ff396b Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 17 Mar 2012 11:10:56 -0400 Subject: [PATCH 01/13] Fixes for smartypants from http://www.mobileread.com/forums/showthread.php?t=171920 submitted by Leigh Parry. --- src/calibre/utils/smartypants.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/calibre/utils/smartypants.py b/src/calibre/utils/smartypants.py index 8763a313fc..fa3119bf53 100644 --- a/src/calibre/utils/smartypants.py +++ b/src/calibre/utils/smartypants.py @@ -591,6 +591,21 @@ def educateQuotes(str): str = re.sub(r'''""''', """””""", str) str = re.sub(r"""''""", """’’""", str) + # Special case for Quotes at inside of other entities, e.g.: + #

<p>A double quote--"within dashes"--would be nice.</p>

+ str = re.sub(r"""(?<=\W)"(?=\w)""", r"""“""", str) + str = re.sub(r"""(?<=\W)'(?=\w)""", r"""‘""", str) + str = re.sub(r"""(?<=\w)"(?=\W)""", r"""”""", str) + str = re.sub(r"""(?<=\w)'(?=\W)""", r"""’""", str) + + # Special case for Quotes at end of line with a preceeding space (may change just to end of line) + str = re.sub(r"""(?<=\s)"$""", r"""”""", str) + str = re.sub(r"""(?<=\s)'$""", r"""’""", str) + + # Special case for Quotes at beginning of line with a space - multiparagraph quoted text: + str = re.sub(r"""^"(?=\s)""", r"""“""", str) + str = re.sub(r"""^'(?=\s)""", r"""‘""", str) + # Special case for decade abbreviations (the '80s): str = re.sub(r"""\b'(?=\d{2}s)""", r"""’""", str) From 8d23a63a46783283d3cb67c93ee7e9611ecbcff2 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 17 Mar 2012 21:35:59 +0530 Subject: [PATCH 02/13] ... --- src/calibre/ebooks/mobi/reader/markup.py | 3 ++- src/calibre/ebooks/mobi/reader/mobi8.py | 10 ++++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/calibre/ebooks/mobi/reader/markup.py b/src/calibre/ebooks/mobi/reader/markup.py index 26583cf30c..721de28ff4 100644 --- a/src/calibre/ebooks/mobi/reader/markup.py +++ b/src/calibre/ebooks/mobi/reader/markup.py @@ -33,7 +33,8 @@ def update_internal_links(mobi8_reader): for m in posfid_index_pattern.finditer(tag): posfid = m.group(1) offset = m.group(2) - filename, idtag = mr.get_id_tag_by_pos_fid(posfid, offset) + filename, idtag = mr.get_id_tag_by_pos_fid(int(posfid, 32), + int(offset, 32)) suffix = (b'#' + idtag) if idtag else b'' replacement = filename.encode(mr.header.codec) + suffix tag = posfid_index_pattern.sub(replacement, tag, 1) diff --git a/src/calibre/ebooks/mobi/reader/mobi8.py b/src/calibre/ebooks/mobi/reader/mobi8.py index 1e4d63d72e..5105e20f0b 100644 --- a/src/calibre/ebooks/mobi/reader/mobi8.py +++ b/src/calibre/ebooks/mobi/reader/mobi8.py @@ -230,11 +230,10 @@ class Mobi8Reader(object): def get_id_tag_by_pos_fid(self, posfid, offset): # 
first convert kindle:pos:fid and offset info to position in file - row = int(posfid, 32) - off = int(offset, 32) - [insertpos, idtext, filenum, seqnm, startpos, length] = self.elems[row] - pos = insertpos + off - fname = self.get_file_info(pos).filename + insertpos, idtext, filenum, seqnm, startpos, length = self.elems[posfid] + pos = insertpos + offset + fi = self.get_file_info(pos) + fname = fi.filename # an existing "id=" must exist in original xhtml otherwise it would not # have worked for linking. Amazon seems to have added its own # additional "aid=" inside tags whose contents seem to represent some @@ -318,7 +317,6 @@ class Mobi8Reader(object): for entry in index_entries: pos = entry['pos'] fi = self.get_file_info(pos) - #print (11111111, fi, entry['pos_fid']) if fi.filename is None: raise ValueError('Index entry has invalid pos: %d'%pos) idtag = self.get_id_tag(pos).decode(self.header.codec) From a83654a4990900f56bb5725adf911e1074a5733c Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 17 Mar 2012 22:57:12 +0530 Subject: [PATCH 03/13] Refactor mobi inspect code in preparation for KF8 support --- src/calibre/debug.py | 2 +- .../ebooks/conversion/plugins/mobi_output.py | 2 +- src/calibre/ebooks/mobi/debug/__init__.py | 16 + src/calibre/ebooks/mobi/debug/headers.py | 474 ++++++++++++++++++ src/calibre/ebooks/mobi/debug/main.py | 39 ++ .../ebooks/mobi/{debug.py => debug/mobi6.py} | 452 +---------------- 6 files changed, 540 insertions(+), 445 deletions(-) create mode 100644 src/calibre/ebooks/mobi/debug/__init__.py create mode 100644 src/calibre/ebooks/mobi/debug/headers.py create mode 100644 src/calibre/ebooks/mobi/debug/main.py rename src/calibre/ebooks/mobi/{debug.py => debug/mobi6.py} (63%) diff --git a/src/calibre/debug.py b/src/calibre/debug.py index 13cccd3e01..f5f803ec84 100644 --- a/src/calibre/debug.py +++ b/src/calibre/debug.py @@ -234,7 +234,7 @@ def main(args=sys.argv): sql_dump = args[-1] reinit_db(opts.reinitialize_db, sql_dump=sql_dump) elif 
opts.inspect_mobi: - from calibre.ebooks.mobi.debug import inspect_mobi + from calibre.ebooks.mobi.debug.main import inspect_mobi for path in args[1:]: prints('Inspecting:', path) inspect_mobi(path) diff --git a/src/calibre/ebooks/conversion/plugins/mobi_output.py b/src/calibre/ebooks/conversion/plugins/mobi_output.py index 7288f095d7..06580be1ba 100644 --- a/src/calibre/ebooks/conversion/plugins/mobi_output.py +++ b/src/calibre/ebooks/conversion/plugins/mobi_output.py @@ -179,7 +179,7 @@ class MOBIOutput(OutputFormatPlugin): writer(oeb, output_path) if opts.extract_to is not None: - from calibre.ebooks.mobi.debug import inspect_mobi + from calibre.ebooks.mobi.debug.main import inspect_mobi ddir = opts.extract_to inspect_mobi(output_path, ddir=ddir) diff --git a/src/calibre/ebooks/mobi/debug/__init__.py b/src/calibre/ebooks/mobi/debug/__init__.py new file mode 100644 index 0000000000..b472bf3148 --- /dev/null +++ b/src/calibre/ebooks/mobi/debug/__init__.py @@ -0,0 +1,16 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2012, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + + +def format_bytes(byts): + byts = bytearray(byts) + byts = [hex(b)[2:] for b in byts] + return ' '.join(byts) + + diff --git a/src/calibre/ebooks/mobi/debug/headers.py b/src/calibre/ebooks/mobi/debug/headers.py new file mode 100644 index 0000000000..7965253be6 --- /dev/null +++ b/src/calibre/ebooks/mobi/debug/headers.py @@ -0,0 +1,474 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2012, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import struct, datetime + +from calibre.utils.date import utc_tz +from calibre.ebooks.mobi.reader.headers import NULL_INDEX +from 
calibre.ebooks.mobi.langcodes import main_language, sub_language +from calibre.ebooks.mobi.debug import format_bytes + +# PalmDB {{{ +class PalmDOCAttributes(object): + + class Attr(object): + + def __init__(self, name, field, val): + self.name = name + self.val = val & field + + def __str__(self): + return '%s: %s'%(self.name, bool(self.val)) + + def __init__(self, raw): + self.val = struct.unpack(b'H', self.raw[34:36])[0] + + palm_epoch = datetime.datetime(1904, 1, 1, tzinfo=utc_tz) + self.creation_date_raw = struct.unpack(b'>I', self.raw[36:40])[0] + self.creation_date = (palm_epoch + + datetime.timedelta(seconds=self.creation_date_raw)) + self.modification_date_raw = struct.unpack(b'>I', self.raw[40:44])[0] + self.modification_date = (palm_epoch + + datetime.timedelta(seconds=self.modification_date_raw)) + self.last_backup_date_raw = struct.unpack(b'>I', self.raw[44:48])[0] + self.last_backup_date = (palm_epoch + + datetime.timedelta(seconds=self.last_backup_date_raw)) + self.modification_number = struct.unpack(b'>I', self.raw[48:52])[0] + self.app_info_id = self.raw[52:56] + self.sort_info_id = self.raw[56:60] + self.type = self.raw[60:64] + self.creator = self.raw[64:68] + self.ident = self.type + self.creator + if self.ident not in (b'BOOKMOBI', b'TEXTREAD'): + raise ValueError('Unknown book ident: %r'%self.ident) + self.last_record_uid, = struct.unpack(b'>I', self.raw[68:72]) + self.next_rec_list_id = self.raw[72:76] + + self.number_of_records, = struct.unpack(b'>H', self.raw[76:78]) + + def __str__(self): + ans = ['*'*20 + ' PalmDB Header '+ '*'*20] + ans.append('Name: %r'%self.name) + ans.append(str(self.attributes)) + ans.append('Version: %s'%self.version) + ans.append('Creation date: %s (%s)'%(self.creation_date.isoformat(), + self.creation_date_raw)) + ans.append('Modification date: %s (%s)'%(self.modification_date.isoformat(), + self.modification_date_raw)) + ans.append('Backup date: %s (%s)'%(self.last_backup_date.isoformat(), + 
self.last_backup_date_raw)) + ans.append('Modification number: %s'%self.modification_number) + ans.append('App Info ID: %r'%self.app_info_id) + ans.append('Sort Info ID: %r'%self.sort_info_id) + ans.append('Type: %r'%self.type) + ans.append('Creator: %r'%self.creator) + ans.append('Last record UID +1: %r'%self.last_record_uid) + ans.append('Next record list id: %r'%self.next_rec_list_id) + ans.append('Number of records: %s'%self.number_of_records) + + return '\n'.join(ans) +# }}} + +class Record(object): # {{{ + + def __init__(self, raw, header): + self.offset, self.flags, self.uid = header + self.raw = raw + + @property + def header(self): + return 'Offset: %d Flags: %d UID: %d First 4 bytes: %r Size: %d'%(self.offset, self.flags, + self.uid, self.raw[:4], len(self.raw)) +# }}} + +# EXTH {{{ +class EXTHRecord(object): + + def __init__(self, type_, data): + self.type = type_ + self.data = data + self.name = { + 1 : 'DRM Server id', + 2 : 'DRM Commerce id', + 3 : 'DRM ebookbase book id', + 100 : 'author', + 101 : 'publisher', + 102 : 'imprint', + 103 : 'description', + 104 : 'isbn', + 105 : 'subject', + 106 : 'publishingdate', + 107 : 'review', + 108 : 'contributor', + 109 : 'rights', + 110 : 'subjectcode', + 111 : 'type', + 112 : 'source', + 113 : 'asin', + 114 : 'versionnumber', + 115 : 'sample', + 116 : 'startreading', + 117 : 'adult', + 118 : 'retailprice', + 119 : 'retailpricecurrency', + 121 : 'KF8 header section index', + 125 : 'KF8 resources (images/fonts) count', + 129 : 'KF8 cover URI', + 131 : 'KF8 unknown count', + 201 : 'coveroffset', + 202 : 'thumboffset', + 203 : 'hasfakecover', + 204 : 'Creator Software', + 205 : 'Creator Major Version', # '>I' + 206 : 'Creator Minor Version', # '>I' + 207 : 'Creator Build Number', # '>I' + 208 : 'watermark', + 209 : 'tamper_proof_keys', + 300 : 'fontsignature', + 301 : 'clippinglimit', # percentage '>B' + 402 : 'publisherlimit', + 404 : 'TTS flag', # '>B' 1 - TTS disabled 0 - TTS enabled + 501 : 'cdetype', # 4 chars 
(PDOC or EBOK) + 502 : 'lastupdatetime', + 503 : 'updatedtitle', + }.get(self.type, repr(self.type)) + + if (self.name in {'coveroffset', 'thumboffset', 'hasfakecover', + 'Creator Major Version', 'Creator Minor Version', + 'Creator Build Number', 'Creator Software', 'startreading'} or + self.type in {121, 125, 131}): + self.data, = struct.unpack(b'>I', self.data) + + def __str__(self): + return '%s (%d): %r'%(self.name, self.type, self.data) + +class EXTHHeader(object): + + def __init__(self, raw): + self.raw = raw + if not self.raw.startswith(b'EXTH'): + raise ValueError('EXTH header does not start with EXTH') + self.length, = struct.unpack(b'>I', self.raw[4:8]) + self.count, = struct.unpack(b'>I', self.raw[8:12]) + + pos = 12 + self.records = [] + for i in xrange(self.count): + pos = self.read_record(pos) + self.records.sort(key=lambda x:x.type) + self.rmap = {x.type:x for x in self.records} + self.get = self.rmap.get + + def __getitem__(self, type_): + return self.rmap.__getitem__(type_) + + def read_record(self, pos): + type_, length = struct.unpack(b'>II', self.raw[pos:pos+8]) + data = self.raw[(pos+8):(pos+length)] + self.records.append(EXTHRecord(type_, data)) + return pos + length + + @property + def kf8_header_index(self): + return self.rmap.get(121, None) + + def __str__(self): + ans = ['*'*20 + ' EXTH Header '+ '*'*20] + ans.append('EXTH header length: %d'%self.length) + ans.append('Number of EXTH records: %d'%self.count) + ans.append('EXTH records...') + for r in self.records: + ans.append(str(r)) + return '\n'.join(ans) +# }}} + +class MOBIHeader(object): # {{{ + + def __init__(self, record0): + self.raw = record0.raw + + self.compression_raw = self.raw[:2] + self.compression = {1: 'No compression', 2: 'PalmDoc compression', + 17480: 'HUFF/CDIC compression'}.get(struct.unpack(b'>H', + self.compression_raw)[0], + repr(self.compression_raw)) + self.unused = self.raw[2:4] + self.text_length, = struct.unpack(b'>I', self.raw[4:8]) + 
self.number_of_text_records, self.text_record_size = \ + struct.unpack(b'>HH', self.raw[8:12]) + self.encryption_type_raw, = struct.unpack(b'>H', self.raw[12:14]) + self.encryption_type = { + 0: 'No encryption', + 1: 'Old mobipocket encryption', + 2: 'Mobipocket encryption' + }.get(self.encryption_type_raw, repr(self.encryption_type_raw)) + self.unknown = self.raw[14:16] + + self.identifier = self.raw[16:20] + if self.identifier != b'MOBI': + raise ValueError('Identifier %r unknown'%self.identifier) + + self.length, = struct.unpack(b'>I', self.raw[20:24]) + self.type_raw, = struct.unpack(b'>I', self.raw[24:28]) + self.type = { + 2 : 'Mobipocket book', + 3 : 'PalmDOC book', + 4 : 'Audio', + 257 : 'News', + 258 : 'News Feed', + 259 : 'News magazine', + 513 : 'PICS', + 514 : 'Word', + 515 : 'XLS', + 516 : 'PPT', + 517 : 'TEXT', + 518 : 'HTML', + }.get(self.type_raw, repr(self.type_raw)) + + self.encoding_raw, = struct.unpack(b'>I', self.raw[28:32]) + self.encoding = { + 1252 : 'cp1252', + 65001: 'utf-8', + }.get(self.encoding_raw, repr(self.encoding_raw)) + self.uid = self.raw[32:36] + self.file_version, = struct.unpack(b'>I', self.raw[36:40]) + self.reserved = self.raw[40:48] + self.secondary_index_record, = struct.unpack(b'>I', self.raw[48:52]) + self.reserved2 = self.raw[52:80] + self.first_non_book_record, = struct.unpack(b'>I', self.raw[80:84]) + self.fullname_offset, = struct.unpack(b'>I', self.raw[84:88]) + self.fullname_length, = struct.unpack(b'>I', self.raw[88:92]) + self.locale_raw, = struct.unpack(b'>I', self.raw[92:96]) + langcode = self.locale_raw + langid = langcode & 0xFF + sublangid = (langcode >> 10) & 0xFF + self.language = main_language.get(langid, 'ENGLISH') + self.sublanguage = sub_language.get(sublangid, 'NEUTRAL') + + self.input_language = self.raw[96:100] + self.output_langauage = self.raw[100:104] + self.min_version, = struct.unpack(b'>I', self.raw[104:108]) + self.first_image_index, = struct.unpack(b'>I', self.raw[108:112]) + 
self.huffman_record_offset, = struct.unpack(b'>I', self.raw[112:116]) + self.huffman_record_count, = struct.unpack(b'>I', self.raw[116:120]) + self.datp_record_offset, = struct.unpack(b'>I', self.raw[120:124]) + self.datp_record_count, = struct.unpack(b'>I', self.raw[124:128]) + self.exth_flags, = struct.unpack(b'>I', self.raw[128:132]) + self.has_exth = bool(self.exth_flags & 0x40) + self.has_drm_data = self.length >= 174 and len(self.raw) >= 180 + if self.has_drm_data: + self.unknown3 = self.raw[132:164] + self.drm_offset, = struct.unpack(b'>I', self.raw[164:168]) + self.drm_count, = struct.unpack(b'>I', self.raw[168:172]) + self.drm_size, = struct.unpack(b'>I', self.raw[172:176]) + self.drm_flags = bin(struct.unpack(b'>I', self.raw[176:180])[0]) + self.has_extra_data_flags = self.length >= 232 and len(self.raw) >= 232+16 + self.has_fcis_flis = False + self.has_multibytes = self.has_indexing_bytes = self.has_uncrossable_breaks = False + self.extra_data_flags = 0 + if self.has_extra_data_flags: + self.unknown4 = self.raw[180:192] + self.first_content_record, self.last_content_record = \ + struct.unpack(b'>HH', self.raw[192:196]) + self.unknown5, = struct.unpack(b'>I', self.raw[196:200]) + (self.fcis_number, self.fcis_count, self.flis_number, + self.flis_count) = struct.unpack(b'>IIII', + self.raw[200:216]) + self.unknown6 = self.raw[216:224] + self.srcs_record_index = struct.unpack(b'>I', + self.raw[224:228])[0] + self.num_srcs_records = struct.unpack(b'>I', + self.raw[228:232])[0] + self.unknown7 = self.raw[232:240] + self.extra_data_flags = struct.unpack(b'>I', + self.raw[240:244])[0] + self.has_multibytes = bool(self.extra_data_flags & 0b1) + self.has_indexing_bytes = bool(self.extra_data_flags & 0b10) + self.has_uncrossable_breaks = bool(self.extra_data_flags & 0b100) + self.primary_index_record, = struct.unpack(b'>I', + self.raw[244:248]) + + if self.file_version >= 8: + (self.unknown8, self.skel_idx, self.sect_idx, self.oth_idx, + self.fdst_idx, 
self.fdst_count) = struct.unpack_from( + b'>LLLLLL', self.raw, 248) + self.unknown9 = self.raw[272:self.length] + + if self.has_exth: + self.exth_offset = 16 + self.length + + self.exth = EXTHHeader(self.raw[self.exth_offset:]) + + self.end_of_exth = self.exth_offset + self.exth.length + self.bytes_after_exth = self.raw[self.end_of_exth:self.fullname_offset] + + def __str__(self): + ans = ['*'*20 + ' MOBI Header '+ '*'*20] + a = ans.append + i = lambda d, x : a('%s (null value: %d): %d'%(d, NULL_INDEX, x)) + ans.append('Compression: %s'%self.compression) + ans.append('Unused: %r'%self.unused) + ans.append('Number of text records: %d'%self.number_of_text_records) + ans.append('Text record size: %d'%self.text_record_size) + ans.append('Encryption: %s'%self.encryption_type) + ans.append('Unknown: %r'%self.unknown) + ans.append('Identifier: %r'%self.identifier) + ans.append('Header length: %d'% self.length) + ans.append('Type: %s'%self.type) + ans.append('Encoding: %s'%self.encoding) + ans.append('UID: %r'%self.uid) + ans.append('File version: %d'%self.file_version) + ans.append('Reserved: %r'%self.reserved) + ans.append('Secondary index record: %d (null val: %d)'%( + self.secondary_index_record, NULL_INDEX)) + ans.append('Reserved2: %r'%self.reserved2) + ans.append('First non-book record (null value: %d): %d'%(NULL_INDEX, + self.first_non_book_record)) + ans.append('Full name offset: %d'%self.fullname_offset) + ans.append('Full name length: %d bytes'%self.fullname_length) + ans.append('Langcode: %r'%self.locale_raw) + ans.append('Language: %s'%self.language) + ans.append('Sub language: %s'%self.sublanguage) + ans.append('Input language: %r'%self.input_language) + ans.append('Output language: %r'%self.output_langauage) + ans.append('Min version: %d'%self.min_version) + ans.append('First Image index: %d'%self.first_image_index) + ans.append('Huffman record offset: %d'%self.huffman_record_offset) + ans.append('Huffman record count: %d'%self.huffman_record_count) + 
ans.append('DATP record offset: %r'%self.datp_record_offset) + ans.append('DATP record count: %r'%self.datp_record_count) + ans.append('EXTH flags: %s (%s)'%(bin(self.exth_flags)[2:], self.has_exth)) + if self.has_drm_data: + ans.append('Unknown3: %r'%self.unknown3) + ans.append('DRM Offset: %s'%self.drm_offset) + ans.append('DRM Count: %s'%self.drm_count) + ans.append('DRM Size: %s'%self.drm_size) + ans.append('DRM Flags: %r'%self.drm_flags) + if self.has_extra_data_flags: + ans.append('Unknown4: %r'%self.unknown4) + ans.append('First content record: %d'% self.first_content_record) + ans.append('Last content record: %d'% self.last_content_record) + ans.append('Unknown5: %d'% self.unknown5) + ans.append('FCIS number: %d'% self.fcis_number) + ans.append('FCIS count: %d'% self.fcis_count) + ans.append('FLIS number: %d'% self.flis_number) + ans.append('FLIS count: %d'% self.flis_count) + ans.append('Unknown6: %r'% self.unknown6) + ans.append('SRCS record index: %d'%self.srcs_record_index) + ans.append('Number of SRCS records?: %d'%self.num_srcs_records) + ans.append('Unknown7: %r'%self.unknown7) + ans.append(('Extra data flags: %s (has multibyte: %s) ' + '(has indexing: %s) (has uncrossable breaks: %s)')%( + bin(self.extra_data_flags), self.has_multibytes, + self.has_indexing_bytes, self.has_uncrossable_breaks )) + ans.append('Primary index record (null value: %d): %d'%(NULL_INDEX, + self.primary_index_record)) + if self.file_version >= 8: + ans.append('Unknown8: %r'%self.unknown8) + i('SKEL Index', self.skel_idx) + i('Sections Index', self.sect_idx) + i('Other Index', self.oth_idx) + i('FDST record', self.fdst_idx) + a('FDST Count: %d'%self.fdst_count) + if self.unknown9: + a('Unknown9: %r'%self.unknown9) + + ans = '\n'.join(ans) + + if self.has_exth: + ans += '\n\n' + str(self.exth) + ans += '\n\nBytes after EXTH (%d bytes): %s'%( + len(self.bytes_after_exth), + format_bytes(self.bytes_after_exth)) + + ans += '\nNumber of bytes after full name: %d' % (len(self.raw) 
- (self.fullname_offset + + self.fullname_length)) + + ans += '\nRecord 0 length: %d'%len(self.raw) + return ans +# }}} + +class MOBIFile(object): + + def __init__(self, stream): + self.raw = stream.read() + self.palmdb = PalmDB(self.raw[:78]) + + self.record_headers = [] + self.records = [] + for i in xrange(self.palmdb.number_of_records): + pos = 78 + i * 8 + offset, a1, a2, a3, a4 = struct.unpack(b'>LBBBB', self.raw[pos:pos+8]) + flags, val = a1, a2 << 16 | a3 << 8 | a4 + self.record_headers.append((offset, flags, val)) + + def section(section_number): + if section_number == self.palmdb.number_of_records - 1: + end_off = len(self.raw) + else: + end_off = self.record_headers[section_number + 1][0] + off = self.record_headers[section_number][0] + return self.raw[off:end_off] + + for i in range(self.palmdb.number_of_records): + self.records.append(Record(section(i), self.record_headers[i])) + + self.mobi_header = MOBIHeader(self.records[0]) + self.huffman_record_nums = [] + + if 'huff' in self.mobi_header.compression.lower(): + self.huffman_record_nums = list(xrange(self.mobi_header.huffman_record_offset, + self.mobi_header.huffman_record_offset + + self.mobi_header.huffman_record_count)) + huffrecs = [self.records[r].raw for r in self.huffman_record_nums] + from calibre.ebooks.mobi.huffcdic import HuffReader + huffs = HuffReader(huffrecs) + decompress = huffs.unpack + elif 'palmdoc' in self.mobi_header.compression.lower(): + from calibre.ebooks.compression.palmdoc import decompress_doc + decompress = decompress_doc + else: + decompress = lambda x: x + + self.decompress = decompress + + self.kf8_type = None + mh = self.mobi_header + if mh.file_version >= 8: + self.kf8_type = 'standalone' + elif mh.has_exth and mh.exth.kf8_header_index is not None: + self.kf8_type = 'joint' + + + diff --git a/src/calibre/ebooks/mobi/debug/main.py b/src/calibre/ebooks/mobi/debug/main.py new file mode 100644 index 0000000000..71844150f1 --- /dev/null +++ 
b/src/calibre/ebooks/mobi/debug/main.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2012, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import sys, os, shutil + +from calibre.ebooks.mobi.debug.headers import MOBIFile +from calibre.ebooks.mobi.debug.mobi6 import inspect_mobi as inspect_mobi6 + +def inspect_mobi(path_or_stream, ddir=None): # {{{ + stream = (path_or_stream if hasattr(path_or_stream, 'read') else + open(path_or_stream, 'rb')) + f = MOBIFile(stream) + if ddir is None: + ddir = 'decompiled_' + os.path.splitext(os.path.basename(stream.name))[0] + try: + shutil.rmtree(ddir) + except: + pass + os.makedirs(ddir) + if f.kf8_type is None: + inspect_mobi6(f, ddir) + elif f.kf8_type == 'joint': + p6 = os.path.join(ddir, 'mobi6') + inspect_mobi6(f, p6) + +# }}} + +def main(): + inspect_mobi(sys.argv[1]) + +if __name__ == '__main__': + main() + diff --git a/src/calibre/ebooks/mobi/debug.py b/src/calibre/ebooks/mobi/debug/mobi6.py similarity index 63% rename from src/calibre/ebooks/mobi/debug.py rename to src/calibre/ebooks/mobi/debug/mobi6.py index 35484d0b39..5f0eda4345 100644 --- a/src/calibre/ebooks/mobi/debug.py +++ b/src/calibre/ebooks/mobi/debug/mobi6.py @@ -7,403 +7,19 @@ __license__ = 'GPL v3' __copyright__ = '2011, Kovid Goyal ' __docformat__ = 'restructuredtext en' -import struct, datetime, sys, os, shutil +import struct, sys, os from collections import OrderedDict, defaultdict from lxml import html -from calibre.utils.date import utc_tz -from calibre.ebooks.mobi.langcodes import main_language, sub_language from calibre.ebooks.mobi.reader.headers import NULL_INDEX from calibre.ebooks.mobi.reader.index import (parse_index_record, parse_tagx_section) from calibre.ebooks.mobi.utils import (decode_hex_number, decint, get_trailing_data, decode_tbs, read_font_record) from 
calibre.utils.magick.draw import identify_data +from calibre.ebooks.mobi.debug import format_bytes -def format_bytes(byts): - byts = bytearray(byts) - byts = [hex(b)[2:] for b in byts] - return ' '.join(byts) - -# PalmDB {{{ -class PalmDOCAttributes(object): - - class Attr(object): - - def __init__(self, name, field, val): - self.name = name - self.val = val & field - - def __str__(self): - return '%s: %s'%(self.name, bool(self.val)) - - def __init__(self, raw): - self.val = struct.unpack(b'H', self.raw[34:36])[0] - - palm_epoch = datetime.datetime(1904, 1, 1, tzinfo=utc_tz) - self.creation_date_raw = struct.unpack(b'>I', self.raw[36:40])[0] - self.creation_date = (palm_epoch + - datetime.timedelta(seconds=self.creation_date_raw)) - self.modification_date_raw = struct.unpack(b'>I', self.raw[40:44])[0] - self.modification_date = (palm_epoch + - datetime.timedelta(seconds=self.modification_date_raw)) - self.last_backup_date_raw = struct.unpack(b'>I', self.raw[44:48])[0] - self.last_backup_date = (palm_epoch + - datetime.timedelta(seconds=self.last_backup_date_raw)) - self.modification_number = struct.unpack(b'>I', self.raw[48:52])[0] - self.app_info_id = self.raw[52:56] - self.sort_info_id = self.raw[56:60] - self.type = self.raw[60:64] - self.creator = self.raw[64:68] - self.ident = self.type + self.creator - if self.ident not in (b'BOOKMOBI', b'TEXTREAD'): - raise ValueError('Unknown book ident: %r'%self.ident) - self.last_record_uid, = struct.unpack(b'>I', self.raw[68:72]) - self.next_rec_list_id = self.raw[72:76] - - self.number_of_records, = struct.unpack(b'>H', self.raw[76:78]) - - def __str__(self): - ans = ['*'*20 + ' PalmDB Header '+ '*'*20] - ans.append('Name: %r'%self.name) - ans.append(str(self.attributes)) - ans.append('Version: %s'%self.version) - ans.append('Creation date: %s (%s)'%(self.creation_date.isoformat(), - self.creation_date_raw)) - ans.append('Modification date: %s (%s)'%(self.modification_date.isoformat(), - self.modification_date_raw)) - 
ans.append('Backup date: %s (%s)'%(self.last_backup_date.isoformat(), - self.last_backup_date_raw)) - ans.append('Modification number: %s'%self.modification_number) - ans.append('App Info ID: %r'%self.app_info_id) - ans.append('Sort Info ID: %r'%self.sort_info_id) - ans.append('Type: %r'%self.type) - ans.append('Creator: %r'%self.creator) - ans.append('Last record UID +1: %r'%self.last_record_uid) - ans.append('Next record list id: %r'%self.next_rec_list_id) - ans.append('Number of records: %s'%self.number_of_records) - - return '\n'.join(ans) -# }}} - -class Record(object): # {{{ - - def __init__(self, raw, header): - self.offset, self.flags, self.uid = header - self.raw = raw - - @property - def header(self): - return 'Offset: %d Flags: %d UID: %d First 4 bytes: %r Size: %d'%(self.offset, self.flags, - self.uid, self.raw[:4], len(self.raw)) -# }}} - -# EXTH {{{ -class EXTHRecord(object): - - def __init__(self, type_, data): - self.type = type_ - self.data = data - self.name = { - 1 : 'DRM Server id', - 2 : 'DRM Commerce id', - 3 : 'DRM ebookbase book id', - 100 : 'author', - 101 : 'publisher', - 102 : 'imprint', - 103 : 'description', - 104 : 'isbn', - 105 : 'subject', - 106 : 'publishingdate', - 107 : 'review', - 108 : 'contributor', - 109 : 'rights', - 110 : 'subjectcode', - 111 : 'type', - 112 : 'source', - 113 : 'asin', - 114 : 'versionnumber', - 115 : 'sample', - 116 : 'startreading', - 117 : 'adult', - 118 : 'retailprice', - 119 : 'retailpricecurrency', - 121 : 'KF8 header section index', - 125 : 'KF8 resources (images/fonts) count', - 129 : 'KF8 cover URI', - 131 : 'KF8 unknown count', - 201 : 'coveroffset', - 202 : 'thumboffset', - 203 : 'hasfakecover', - 204 : 'Creator Software', - 205 : 'Creator Major Version', # '>I' - 206 : 'Creator Minor Version', # '>I' - 207 : 'Creator Build Number', # '>I' - 208 : 'watermark', - 209 : 'tamper_proof_keys', - 300 : 'fontsignature', - 301 : 'clippinglimit', # percentage '>B' - 402 : 'publisherlimit', - 404 : 'TTS 
flag', # '>B' 1 - TTS disabled 0 - TTS enabled - 501 : 'cdetype', # 4 chars (PDOC or EBOK) - 502 : 'lastupdatetime', - 503 : 'updatedtitle', - }.get(self.type, repr(self.type)) - - if (self.name in {'coveroffset', 'thumboffset', 'hasfakecover', - 'Creator Major Version', 'Creator Minor Version', - 'Creator Build Number', 'Creator Software', 'startreading'} or - self.type in {121, 125, 131}): - self.data, = struct.unpack(b'>I', self.data) - - def __str__(self): - return '%s (%d): %r'%(self.name, self.type, self.data) - -class EXTHHeader(object): - - def __init__(self, raw): - self.raw = raw - if not self.raw.startswith(b'EXTH'): - raise ValueError('EXTH header does not start with EXTH') - self.length, = struct.unpack(b'>I', self.raw[4:8]) - self.count, = struct.unpack(b'>I', self.raw[8:12]) - - pos = 12 - self.records = [] - for i in xrange(self.count): - pos = self.read_record(pos) - self.records.sort(key=lambda x:x.type) - - def read_record(self, pos): - type_, length = struct.unpack(b'>II', self.raw[pos:pos+8]) - data = self.raw[(pos+8):(pos+length)] - self.records.append(EXTHRecord(type_, data)) - return pos + length - - def __str__(self): - ans = ['*'*20 + ' EXTH Header '+ '*'*20] - ans.append('EXTH header length: %d'%self.length) - ans.append('Number of EXTH records: %d'%self.count) - ans.append('EXTH records...') - for r in self.records: - ans.append(str(r)) - return '\n'.join(ans) -# }}} - -class MOBIHeader(object): # {{{ - - def __init__(self, record0): - self.raw = record0.raw - - self.compression_raw = self.raw[:2] - self.compression = {1: 'No compression', 2: 'PalmDoc compression', - 17480: 'HUFF/CDIC compression'}.get(struct.unpack(b'>H', - self.compression_raw)[0], - repr(self.compression_raw)) - self.unused = self.raw[2:4] - self.text_length, = struct.unpack(b'>I', self.raw[4:8]) - self.number_of_text_records, self.text_record_size = \ - struct.unpack(b'>HH', self.raw[8:12]) - self.encryption_type_raw, = struct.unpack(b'>H', self.raw[12:14]) - 
self.encryption_type = { - 0: 'No encryption', - 1: 'Old mobipocket encryption', - 2: 'Mobipocket encryption' - }.get(self.encryption_type_raw, repr(self.encryption_type_raw)) - self.unknown = self.raw[14:16] - - self.identifier = self.raw[16:20] - if self.identifier != b'MOBI': - raise ValueError('Identifier %r unknown'%self.identifier) - - self.length, = struct.unpack(b'>I', self.raw[20:24]) - self.type_raw, = struct.unpack(b'>I', self.raw[24:28]) - self.type = { - 2 : 'Mobipocket book', - 3 : 'PalmDOC book', - 4 : 'Audio', - 257 : 'News', - 258 : 'News Feed', - 259 : 'News magazine', - 513 : 'PICS', - 514 : 'Word', - 515 : 'XLS', - 516 : 'PPT', - 517 : 'TEXT', - 518 : 'HTML', - }.get(self.type_raw, repr(self.type_raw)) - - self.encoding_raw, = struct.unpack(b'>I', self.raw[28:32]) - self.encoding = { - 1252 : 'cp1252', - 65001: 'utf-8', - }.get(self.encoding_raw, repr(self.encoding_raw)) - self.uid = self.raw[32:36] - self.file_version = struct.unpack(b'>I', self.raw[36:40]) - self.reserved = self.raw[40:48] - self.secondary_index_record, = struct.unpack(b'>I', self.raw[48:52]) - self.reserved2 = self.raw[52:80] - self.first_non_book_record, = struct.unpack(b'>I', self.raw[80:84]) - self.fullname_offset, = struct.unpack(b'>I', self.raw[84:88]) - self.fullname_length, = struct.unpack(b'>I', self.raw[88:92]) - self.locale_raw, = struct.unpack(b'>I', self.raw[92:96]) - langcode = self.locale_raw - langid = langcode & 0xFF - sublangid = (langcode >> 10) & 0xFF - self.language = main_language.get(langid, 'ENGLISH') - self.sublanguage = sub_language.get(sublangid, 'NEUTRAL') - - self.input_language = self.raw[96:100] - self.output_langauage = self.raw[100:104] - self.min_version, = struct.unpack(b'>I', self.raw[104:108]) - self.first_image_index, = struct.unpack(b'>I', self.raw[108:112]) - self.huffman_record_offset, = struct.unpack(b'>I', self.raw[112:116]) - self.huffman_record_count, = struct.unpack(b'>I', self.raw[116:120]) - self.datp_record_offset, = 
struct.unpack(b'>I', self.raw[120:124]) - self.datp_record_count, = struct.unpack(b'>I', self.raw[124:128]) - self.exth_flags, = struct.unpack(b'>I', self.raw[128:132]) - self.has_exth = bool(self.exth_flags & 0x40) - self.has_drm_data = self.length >= 174 and len(self.raw) >= 180 - if self.has_drm_data: - self.unknown3 = self.raw[132:164] - self.drm_offset, = struct.unpack(b'>I', self.raw[164:168]) - self.drm_count, = struct.unpack(b'>I', self.raw[168:172]) - self.drm_size, = struct.unpack(b'>I', self.raw[172:176]) - self.drm_flags = bin(struct.unpack(b'>I', self.raw[176:180])[0]) - self.has_extra_data_flags = self.length >= 232 and len(self.raw) >= 232+16 - self.has_fcis_flis = False - self.has_multibytes = self.has_indexing_bytes = self.has_uncrossable_breaks = False - self.extra_data_flags = 0 - if self.has_extra_data_flags: - self.unknown4 = self.raw[180:192] - self.first_content_record, self.last_content_record = \ - struct.unpack(b'>HH', self.raw[192:196]) - self.unknown5, = struct.unpack(b'>I', self.raw[196:200]) - (self.fcis_number, self.fcis_count, self.flis_number, - self.flis_count) = struct.unpack(b'>IIII', - self.raw[200:216]) - self.unknown6 = self.raw[216:224] - self.srcs_record_index = struct.unpack(b'>I', - self.raw[224:228])[0] - self.num_srcs_records = struct.unpack(b'>I', - self.raw[228:232])[0] - self.unknown7 = self.raw[232:240] - self.extra_data_flags = struct.unpack(b'>I', - self.raw[240:244])[0] - self.has_multibytes = bool(self.extra_data_flags & 0b1) - self.has_indexing_bytes = bool(self.extra_data_flags & 0b10) - self.has_uncrossable_breaks = bool(self.extra_data_flags & 0b100) - self.primary_index_record, = struct.unpack(b'>I', - self.raw[244:248]) - - if self.has_exth: - self.exth_offset = 16 + self.length - - self.exth = EXTHHeader(self.raw[self.exth_offset:]) - - self.end_of_exth = self.exth_offset + self.exth.length - self.bytes_after_exth = self.raw[self.end_of_exth:self.fullname_offset] - - def __str__(self): - ans = ['*'*20 + ' 
MOBI Header '+ '*'*20] - ans.append('Compression: %s'%self.compression) - ans.append('Unused: %r'%self.unused) - ans.append('Number of text records: %d'%self.number_of_text_records) - ans.append('Text record size: %d'%self.text_record_size) - ans.append('Encryption: %s'%self.encryption_type) - ans.append('Unknown: %r'%self.unknown) - ans.append('Identifier: %r'%self.identifier) - ans.append('Header length: %d'% self.length) - ans.append('Type: %s'%self.type) - ans.append('Encoding: %s'%self.encoding) - ans.append('UID: %r'%self.uid) - ans.append('File version: %d'%self.file_version) - ans.append('Reserved: %r'%self.reserved) - ans.append('Secondary index record: %d (null val: %d)'%( - self.secondary_index_record, NULL_INDEX)) - ans.append('Reserved2: %r'%self.reserved2) - ans.append('First non-book record (null value: %d): %d'%(NULL_INDEX, - self.first_non_book_record)) - ans.append('Full name offset: %d'%self.fullname_offset) - ans.append('Full name length: %d bytes'%self.fullname_length) - ans.append('Langcode: %r'%self.locale_raw) - ans.append('Language: %s'%self.language) - ans.append('Sub language: %s'%self.sublanguage) - ans.append('Input language: %r'%self.input_language) - ans.append('Output language: %r'%self.output_langauage) - ans.append('Min version: %d'%self.min_version) - ans.append('First Image index: %d'%self.first_image_index) - ans.append('Huffman record offset: %d'%self.huffman_record_offset) - ans.append('Huffman record count: %d'%self.huffman_record_count) - ans.append('DATP record offset: %r'%self.datp_record_offset) - ans.append('DATP record count: %r'%self.datp_record_count) - ans.append('EXTH flags: %s (%s)'%(bin(self.exth_flags)[2:], self.has_exth)) - if self.has_drm_data: - ans.append('Unknown3: %r'%self.unknown3) - ans.append('DRM Offset: %s'%self.drm_offset) - ans.append('DRM Count: %s'%self.drm_count) - ans.append('DRM Size: %s'%self.drm_size) - ans.append('DRM Flags: %r'%self.drm_flags) - if self.has_extra_data_flags: - 
ans.append('Unknown4: %r'%self.unknown4) - ans.append('First content record: %d'% self.first_content_record) - ans.append('Last content record: %d'% self.last_content_record) - ans.append('Unknown5: %d'% self.unknown5) - ans.append('FCIS number: %d'% self.fcis_number) - ans.append('FCIS count: %d'% self.fcis_count) - ans.append('FLIS number: %d'% self.flis_number) - ans.append('FLIS count: %d'% self.flis_count) - ans.append('Unknown6: %r'% self.unknown6) - ans.append('SRCS record index: %d'%self.srcs_record_index) - ans.append('Number of SRCS records?: %d'%self.num_srcs_records) - ans.append('Unknown7: %r'%self.unknown7) - ans.append(('Extra data flags: %s (has multibyte: %s) ' - '(has indexing: %s) (has uncrossable breaks: %s)')%( - bin(self.extra_data_flags), self.has_multibytes, - self.has_indexing_bytes, self.has_uncrossable_breaks )) - ans.append('Primary index record (null value: %d): %d'%(NULL_INDEX, - self.primary_index_record)) - - ans = '\n'.join(ans) - - if self.has_exth: - ans += '\n\n' + str(self.exth) - ans += '\n\nBytes after EXTH (%d bytes): %s'%( - len(self.bytes_after_exth), - format_bytes(self.bytes_after_exth)) - - ans += '\nNumber of bytes after full name: %d' % (len(self.raw) - (self.fullname_offset + - self.fullname_length)) - - ans += '\nRecord 0 length: %d'%len(self.raw) - return ans -# }}} class TagX(object): # {{{ @@ -1130,46 +746,10 @@ class TBSIndexing(object): # {{{ class MOBIFile(object): # {{{ - def __init__(self, stream): - self.raw = stream.read() - - self.palmdb = PalmDB(self.raw[:78]) - - self.record_headers = [] - self.records = [] - for i in xrange(self.palmdb.number_of_records): - pos = 78 + i * 8 - offset, a1, a2, a3, a4 = struct.unpack(b'>LBBBB', self.raw[pos:pos+8]) - flags, val = a1, a2 << 16 | a3 << 8 | a4 - self.record_headers.append((offset, flags, val)) - - def section(section_number): - if section_number == self.palmdb.number_of_records - 1: - end_off = len(self.raw) - else: - end_off = 
self.record_headers[section_number + 1][0] - off = self.record_headers[section_number][0] - return self.raw[off:end_off] - - for i in range(self.palmdb.number_of_records): - self.records.append(Record(section(i), self.record_headers[i])) - - self.mobi_header = MOBIHeader(self.records[0]) - self.huffman_record_nums = [] - - if 'huff' in self.mobi_header.compression.lower(): - self.huffman_record_nums = list(xrange(self.mobi_header.huffman_record_offset, - self.mobi_header.huffman_record_offset + - self.mobi_header.huffman_record_count)) - huffrecs = [self.records[r].raw for r in self.huffman_record_nums] - from calibre.ebooks.mobi.huffcdic import HuffReader - huffs = HuffReader(huffrecs) - decompress = huffs.unpack - elif 'palmdoc' in self.mobi_header.compression.lower(): - from calibre.ebooks.compression.palmdoc import decompress_doc - decompress = decompress_doc - else: - decompress = lambda x: x + def __init__(self, mf): + for x in ('raw', 'palmdb', 'record_headers', 'records', 'mobi_header', + 'huffman_record_nums',): + setattr(self, x, getattr(mf, x)) self.index_header = self.index_record = None self.indexing_record_nums = set() @@ -1201,7 +781,7 @@ class MOBIFile(object): # {{{ if fntbr == NULL_INDEX: fntbr = len(self.records) self.text_records = [TextRecord(r, self.records[r], - self.mobi_header.extra_data_flags, decompress) for r in xrange(1, + self.mobi_header.extra_data_flags, mf.decompress) for r in xrange(1, min(len(self.records), ntr+1))] self.image_records, self.binary_records = [], [] self.font_records = [] @@ -1241,17 +821,8 @@ class MOBIFile(object): # {{{ print (str(self.mobi_header).encode('utf-8'), file=f) # }}} -def inspect_mobi(path_or_stream, ddir=None): # {{{ - stream = (path_or_stream if hasattr(path_or_stream, 'read') else - open(path_or_stream, 'rb')) - f = MOBIFile(stream) - if ddir is None: - ddir = 'decompiled_' + os.path.splitext(os.path.basename(stream.name))[0] - try: - shutil.rmtree(ddir) - except: - pass - os.makedirs(ddir) +def 
inspect_mobi(mobi_file, ddir): + f = MOBIFile(mobi_file) with open(os.path.join(ddir, 'header.txt'), 'wb') as out: f.print_header(f=out) @@ -1299,9 +870,4 @@ def inspect_mobi(path_or_stream, ddir=None): # {{{ # }}} -def main(): - inspect_mobi(sys.argv[1]) - -if __name__ == '__main__': - main() From 0479f31a5f072a8c02661c373391eaf498dc1209 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 17 Mar 2012 23:14:55 +0530 Subject: [PATCH 04/13] KF8 Input: Fix some links pointing a little above or below their intended target when viewing or converting KF8 files --- src/calibre/ebooks/mobi/reader/markup.py | 3 ++- src/calibre/ebooks/mobi/reader/mobi8.py | 28 ++++++++++++++---------- 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/src/calibre/ebooks/mobi/reader/markup.py b/src/calibre/ebooks/mobi/reader/markup.py index 721de28ff4..8bb7f211f3 100644 --- a/src/calibre/ebooks/mobi/reader/markup.py +++ b/src/calibre/ebooks/mobi/reader/markup.py @@ -36,7 +36,8 @@ def update_internal_links(mobi8_reader): filename, idtag = mr.get_id_tag_by_pos_fid(int(posfid, 32), int(offset, 32)) suffix = (b'#' + idtag) if idtag else b'' - replacement = filename.encode(mr.header.codec) + suffix + replacement = filename.split('/')[-1].encode( + mr.header.codec) + suffix tag = posfid_index_pattern.sub(replacement, tag, 1) srcpieces[j] = tag part = ''.join([x.decode(mr.header.codec) for x in srcpieces]) diff --git a/src/calibre/ebooks/mobi/reader/mobi8.py b/src/calibre/ebooks/mobi/reader/mobi8.py index 5105e20f0b..ec7166ebb0 100644 --- a/src/calibre/ebooks/mobi/reader/mobi8.py +++ b/src/calibre/ebooks/mobi/reader/mobi8.py @@ -233,7 +233,6 @@ class Mobi8Reader(object): insertpos, idtext, filenum, seqnm, startpos, length = self.elems[posfid] pos = insertpos + offset fi = self.get_file_info(pos) - fname = fi.filename # an existing "id=" must exist in original xhtml otherwise it would not # have worked for linking. 
Amazon seems to have added its own # additional "aid=" inside tags whose contents seem to represent some @@ -242,7 +241,7 @@ class Mobi8Reader(object): # so find the closest "id=" before position the file by actually # searching in that file idtext = self.get_id_tag(pos) - return fname, idtext + return '%s/%s'%(fi.type, fi.filename), idtext def get_id_tag(self, pos): # find the correct tag by actually searching in the destination @@ -253,12 +252,13 @@ class Mobi8Reader(object): textblock = self.parts[fi.num] id_map = [] npos = pos - fi.start - # if npos inside a tag then search all text before the its end of tag - # marker pgt = textblock.find(b'>', npos) plt = textblock.find(b'<', npos) - if pgt < plt: + # if npos inside a tag then search all text before the its end of tag marker + # else not in a tag need to search the preceding tag + if plt == npos or pgt < plt: npos = pgt + 1 + textblock = textblock[0:npos] # find id links only inside of tags # inside any < > pair find all "id=' and return whatever is inside # the quotes @@ -315,12 +315,18 @@ class Mobi8Reader(object): # Add href and anchor info to the index entries for entry in index_entries: - pos = entry['pos'] - fi = self.get_file_info(pos) - if fi.filename is None: - raise ValueError('Index entry has invalid pos: %d'%pos) - idtag = self.get_id_tag(pos).decode(self.header.codec) - entry['href'] = '%s/%s'%(fi.type, fi.filename) + pos_fid = entry['pos_fid'] + if pos_fid is None: + pos = entry['pos'] + fi = self.get_file_info(pos) + if fi.filename is None: + raise ValueError('Index entry has invalid pos: %d'%pos) + idtag = self.get_id_tag(pos).decode(self.header.codec) + href = '%s/%s'%(fi.type, fi.filename) + else: + href, idtag = self.get_id_tag_by_pos_fid(*pos_fid) + + entry['href'] = href entry['idtag'] = idtag # Build the TOC object From b6d02adfe3a52ebb21e602c73a51c82f976ba37f Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 17 Mar 2012 23:38:09 +0530 Subject: [PATCH 05/13] Update FHM UK --- 
recipes/fhm_uk.recipe | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/recipes/fhm_uk.recipe b/recipes/fhm_uk.recipe index ab271ad753..0e2d5c1ebe 100644 --- a/recipes/fhm_uk.recipe +++ b/recipes/fhm_uk.recipe @@ -3,10 +3,11 @@ from calibre.web.feeds.news import BasicNewsRecipe class AdvancedUserRecipe1325006965(BasicNewsRecipe): title = u'FHM UK' description = 'Good News for Men' - cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/373529_38324934806_64930243_n.jpg' + cover_url = 'http://www.greatmagazines.co.uk/covers/large/w197/current/fhm.jpg' + # cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/373529_38324934806_64930243_n.jpg' masthead_url = 'http://www.fhm.com/App_Resources/Images/Site/re-design/logo.gif' __author__ = 'Dave Asbury' - # last updated 27/1/12 + # last updated 17/3/12 language = 'en_GB' oldest_article = 28 max_articles_per_feed = 12 @@ -29,6 +30,8 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe): feeds = [ (u'From the Homepage',u'http://feed43.com/8053226782885416.xml'), (u'Funny - The Very Best Of The Internet',u'http://feed43.com/4538510106331565.xml'), - (u'The Final Countdown', u'http://feed43.com/3576106158530118.xml'), - (u'Gaming',u'http://feed43.com/0755006465351035.xml'), - ] + (u'Upgrade',u'http://feed43.com/0877305847443234.xml'), + #(u'The Final Countdown', u'http://feed43.com/3576106158530118.xml'), + #(u'Gaming',u'http://feed43.com/0755006465351035.xml'), + (u'Gaming',u'http://feed43.com/6537162612465672.xml'), + ] From 06f3a1868463710d019f3878441fc3445fc2458b Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 18 Mar 2012 12:15:37 +0530 Subject: [PATCH 06/13] Some progress on KF8 support in inspect MOBI --- src/calibre/ebooks/mobi/debug/headers.py | 124 +++++++++++++++------- src/calibre/ebooks/mobi/debug/main.py | 9 ++ src/calibre/ebooks/mobi/debug/mobi6.py | 48 ++------- src/calibre/ebooks/mobi/reader/headers.py | 12 +-- 4 files changed, 106 insertions(+), 87 deletions(-) 
diff --git a/src/calibre/ebooks/mobi/debug/headers.py b/src/calibre/ebooks/mobi/debug/headers.py index 7965253be6..06318c4527 100644 --- a/src/calibre/ebooks/mobi/debug/headers.py +++ b/src/calibre/ebooks/mobi/debug/headers.py @@ -7,12 +7,13 @@ __license__ = 'GPL v3' __copyright__ = '2012, Kovid Goyal ' __docformat__ = 'restructuredtext en' -import struct, datetime +import struct, datetime, os from calibre.utils.date import utc_tz from calibre.ebooks.mobi.reader.headers import NULL_INDEX from calibre.ebooks.mobi.langcodes import main_language, sub_language from calibre.ebooks.mobi.debug import format_bytes +from calibre.ebooks.mobi.utils import get_trailing_data # PalmDB {{{ class PalmDOCAttributes(object): @@ -188,10 +189,13 @@ class EXTHHeader(object): pos = self.read_record(pos) self.records.sort(key=lambda x:x.type) self.rmap = {x.type:x for x in self.records} - self.get = self.rmap.get def __getitem__(self, type_): - return self.rmap.__getitem__(type_) + return self.rmap.__getitem__(type_).data + + def get(self, type_, default=None): + ans = self.rmap.get(type_, default) + return getattr(ans, 'data', default) def read_record(self, pos): type_, length = struct.unpack(b'>II', self.raw[pos:pos+8]) @@ -201,7 +205,7 @@ class EXTHHeader(object): @property def kf8_header_index(self): - return self.rmap.get(121, None) + return self.get(121, None) def __str__(self): ans = ['*'*20 + ' EXTH Header '+ '*'*20] @@ -263,9 +267,10 @@ class MOBIHeader(object): # {{{ }.get(self.encoding_raw, repr(self.encoding_raw)) self.uid = self.raw[32:36] self.file_version, = struct.unpack(b'>I', self.raw[36:40]) - self.reserved = self.raw[40:48] + self.meta_orth_indx, self.meta_infl_indx = struct.unpack( + b'>II', self.raw[40:48]) self.secondary_index_record, = struct.unpack(b'>I', self.raw[48:52]) - self.reserved2 = self.raw[52:80] + self.reserved = self.raw[52:80] self.first_non_book_record, = struct.unpack(b'>I', self.raw[80:84]) self.fullname_offset, = struct.unpack(b'>I', 
self.raw[84:88]) self.fullname_length, = struct.unpack(b'>I', self.raw[88:92]) @@ -299,9 +304,8 @@ class MOBIHeader(object): # {{{ self.extra_data_flags = 0 if self.has_extra_data_flags: self.unknown4 = self.raw[180:192] - self.first_content_record, self.last_content_record = \ - struct.unpack(b'>HH', self.raw[192:196]) - self.unknown5, = struct.unpack(b'>I', self.raw[196:200]) + self.fdst_idx, self.fdst_count = struct.unpack_from(b'>II', + self.raw, 192) (self.fcis_number, self.fcis_count, self.flis_number, self.flis_count) = struct.unpack(b'>IIII', self.raw[200:216]) @@ -320,10 +324,9 @@ class MOBIHeader(object): # {{{ self.raw[244:248]) if self.file_version >= 8: - (self.unknown8, self.skel_idx, self.sect_idx, self.oth_idx, - self.fdst_idx, self.fdst_count) = struct.unpack_from( - b'>LLLLLL', self.raw, 248) - self.unknown9 = self.raw[272:self.length] + (self.sect_idx, self.skel_idx, self.datp_idx, self.oth_idx + ) = struct.unpack_from(b'>4L', self.raw, 248) + self.unknown9 = self.raw[264:self.length] if self.has_exth: self.exth_offset = 16 + self.length @@ -334,7 +337,7 @@ class MOBIHeader(object): # {{{ self.bytes_after_exth = self.raw[self.end_of_exth:self.fullname_offset] def __str__(self): - ans = ['*'*20 + ' MOBI Header '+ '*'*20] + ans = ['*'*20 + ' MOBI %d Header '%self.file_version+ '*'*20] a = ans.append i = lambda d, x : a('%s (null value: %d): %d'%(d, NULL_INDEX, x)) ans.append('Compression: %s'%self.compression) @@ -349,10 +352,11 @@ class MOBIHeader(object): # {{{ ans.append('Encoding: %s'%self.encoding) ans.append('UID: %r'%self.uid) ans.append('File version: %d'%self.file_version) - ans.append('Reserved: %r'%self.reserved) + ans.append('Meta Orth Index: %d'%self.meta_orth_indx) + ans.append('Meta Infl Index: %d'%self.meta_infl_indx) ans.append('Secondary index record: %d (null val: %d)'%( self.secondary_index_record, NULL_INDEX)) - ans.append('Reserved2: %r'%self.reserved2) + ans.append('Reserved: %r'%self.reserved) ans.append('First non-book 
record (null value: %d): %d'%(NULL_INDEX, self.first_non_book_record)) ans.append('Full name offset: %d'%self.fullname_offset) @@ -377,9 +381,8 @@ class MOBIHeader(object): # {{{ ans.append('DRM Flags: %r'%self.drm_flags) if self.has_extra_data_flags: ans.append('Unknown4: %r'%self.unknown4) - ans.append('First content record: %d'% self.first_content_record) - ans.append('Last content record: %d'% self.last_content_record) - ans.append('Unknown5: %d'% self.unknown5) + ans.append('FDST Index: %d'% self.fdst_idx) + ans.append('FDST Count: %d'% self.fdst_count) ans.append('FCIS number: %d'% self.fcis_number) ans.append('FCIS count: %d'% self.fcis_count) ans.append('FLIS number: %d'% self.flis_number) @@ -398,6 +401,7 @@ class MOBIHeader(object): # {{{ ans.append('Unknown8: %r'%self.unknown8) i('SKEL Index', self.skel_idx) i('Sections Index', self.sect_idx) + i('Unknown8', self.unknown8) i('Other Index', self.oth_idx) i('FDST record', self.fdst_idx) a('FDST Count: %d'%self.fdst_count) @@ -447,28 +451,74 @@ class MOBIFile(object): self.mobi_header = MOBIHeader(self.records[0]) self.huffman_record_nums = [] - if 'huff' in self.mobi_header.compression.lower(): - self.huffman_record_nums = list(xrange(self.mobi_header.huffman_record_offset, - self.mobi_header.huffman_record_offset + - self.mobi_header.huffman_record_count)) - huffrecs = [self.records[r].raw for r in self.huffman_record_nums] - from calibre.ebooks.mobi.huffcdic import HuffReader - huffs = HuffReader(huffrecs) - decompress = huffs.unpack - elif 'palmdoc' in self.mobi_header.compression.lower(): - from calibre.ebooks.compression.palmdoc import decompress_doc - decompress = decompress_doc - else: - decompress = lambda x: x - - self.decompress = decompress - self.kf8_type = None - mh = self.mobi_header + mh = mh8 = self.mobi_header if mh.file_version >= 8: self.kf8_type = 'standalone' elif mh.has_exth and mh.exth.kf8_header_index is not None: self.kf8_type = 'joint' - + kf8i = mh.exth.kf8_header_index + mh8 = 
MOBIHeader(self.records[kf8i]) + self.mobi8_header = mh8 + + if 'huff' in self.mobi_header.compression.lower(): + from calibre.ebooks.mobi.huffcdic import HuffReader + + def huffit(off, cnt): + huffman_record_nums = list(xrange(off, off+cnt)) + huffrecs = [self.records[r].raw for r in huffman_record_nums] + huffs = HuffReader(huffrecs) + return huffman_record_nums, huffs.unpack + + if self.kf8_type == 'joint': + recs6, d6 = huffit(mh.huffman_record_offset, + mh.huffman_record_count) + recs8, d8 = huffit(mh8.huffman_record_offset + kf8i, + mh8.huffman_record_count) + self.huffman_record_nums = recs6 + recs8 + else: + self.huffman_record_nums, d6 = huffit(mh.huffman_record_offset, + mh.huffman_record_count) + d8 = d6 + elif 'palmdoc' in self.mobi_header.compression.lower(): + from calibre.ebooks.compression.palmdoc import decompress_doc + d8 = d6 = decompress_doc + else: + d8 = d6 = lambda x: x + + self.decompress6, self.decompress8 = d6, d8 + +class TextRecord(object): # {{{ + + def __init__(self, idx, record, extra_data_flags, decompress): + self.trailing_data, self.raw = get_trailing_data(record.raw, extra_data_flags) + raw_trailing_bytes = record.raw[len(self.raw):] + self.raw = decompress(self.raw) + + if 0 in self.trailing_data: + self.trailing_data['multibyte_overlap'] = self.trailing_data.pop(0) + if 1 in self.trailing_data: + self.trailing_data['indexing'] = self.trailing_data.pop(1) + if 2 in self.trailing_data: + self.trailing_data['uncrossable_breaks'] = self.trailing_data.pop(2) + self.trailing_data['raw_bytes'] = raw_trailing_bytes + + for typ, val in self.trailing_data.iteritems(): + if isinstance(typ, int): + print ('Record %d has unknown trailing data of type: %d : %r'% + (idx, typ, val)) + + self.idx = idx + + def dump(self, folder): + name = '%06d'%self.idx + with open(os.path.join(folder, name+'.txt'), 'wb') as f: + f.write(self.raw) + with open(os.path.join(folder, name+'.trailing_data'), 'wb') as f: + for k, v in self.trailing_data.iteritems(): 
+ raw = '%s : %r\n\n'%(k, v) + f.write(raw.encode('utf-8')) + +# }}} diff --git a/src/calibre/ebooks/mobi/debug/main.py b/src/calibre/ebooks/mobi/debug/main.py index 71844150f1..624da65846 100644 --- a/src/calibre/ebooks/mobi/debug/main.py +++ b/src/calibre/ebooks/mobi/debug/main.py @@ -11,6 +11,7 @@ import sys, os, shutil from calibre.ebooks.mobi.debug.headers import MOBIFile from calibre.ebooks.mobi.debug.mobi6 import inspect_mobi as inspect_mobi6 +from calibre.ebooks.mobi.debug.mobi8 import inspect_mobi as inspect_mobi8 def inspect_mobi(path_or_stream, ddir=None): # {{{ stream = (path_or_stream if hasattr(path_or_stream, 'read') else @@ -27,7 +28,15 @@ def inspect_mobi(path_or_stream, ddir=None): # {{{ inspect_mobi6(f, ddir) elif f.kf8_type == 'joint': p6 = os.path.join(ddir, 'mobi6') + os.mkdir(p6) inspect_mobi6(f, p6) + p8 = os.path.join(ddir, 'mobi8') + os.mkdir(p8) + inspect_mobi8(f, p8) + else: + inspect_mobi8(f, ddir) + + print ('Debug data saved to:', ddir) # }}} diff --git a/src/calibre/ebooks/mobi/debug/mobi6.py b/src/calibre/ebooks/mobi/debug/mobi6.py index 5f0eda4345..640f58c661 100644 --- a/src/calibre/ebooks/mobi/debug/mobi6.py +++ b/src/calibre/ebooks/mobi/debug/mobi6.py @@ -16,9 +16,10 @@ from calibre.ebooks.mobi.reader.headers import NULL_INDEX from calibre.ebooks.mobi.reader.index import (parse_index_record, parse_tagx_section) from calibre.ebooks.mobi.utils import (decode_hex_number, decint, - get_trailing_data, decode_tbs, read_font_record) + decode_tbs, read_font_record) from calibre.utils.magick.draw import identify_data from calibre.ebooks.mobi.debug import format_bytes +from calibre.ebooks.mobi.debug.headers import TextRecord class TagX(object): # {{{ @@ -472,39 +473,6 @@ class CNCX(object): # {{{ # }}} -class TextRecord(object): # {{{ - - def __init__(self, idx, record, extra_data_flags, decompress): - self.trailing_data, self.raw = get_trailing_data(record.raw, extra_data_flags) - raw_trailing_bytes = record.raw[len(self.raw):] - 
self.raw = decompress(self.raw) - - if 0 in self.trailing_data: - self.trailing_data['multibyte_overlap'] = self.trailing_data.pop(0) - if 1 in self.trailing_data: - self.trailing_data['indexing'] = self.trailing_data.pop(1) - if 2 in self.trailing_data: - self.trailing_data['uncrossable_breaks'] = self.trailing_data.pop(2) - self.trailing_data['raw_bytes'] = raw_trailing_bytes - - for typ, val in self.trailing_data.iteritems(): - if isinstance(typ, int): - print ('Record %d has unknown trailing data of type: %d : %r'% - (idx, typ, val)) - - self.idx = idx - - def dump(self, folder): - name = '%06d'%self.idx - with open(os.path.join(folder, name+'.txt'), 'wb') as f: - f.write(self.raw) - with open(os.path.join(folder, name+'.trailing_data'), 'wb') as f: - for k, v in self.trailing_data.iteritems(): - raw = '%s : %r\n\n'%(k, v) - f.write(raw.encode('utf-8')) - -# }}} - class ImageRecord(object): # {{{ def __init__(self, idx, record, fmt): @@ -781,7 +749,7 @@ class MOBIFile(object): # {{{ if fntbr == NULL_INDEX: fntbr = len(self.records) self.text_records = [TextRecord(r, self.records[r], - self.mobi_header.extra_data_flags, mf.decompress) for r in xrange(1, + self.mobi_header.extra_data_flags, mf.decompress6) for r in xrange(1, min(len(self.records), ntr+1))] self.image_records, self.binary_records = [], [] self.font_records = [] @@ -833,12 +801,11 @@ def inspect_mobi(mobi_file, ddir): of.write(rec.raw) alltext += rec.raw of.seek(0) - if f.mobi_header.file_version < 8: - root = html.fromstring(alltext.decode('utf-8')) - with open(os.path.join(ddir, 'pretty.html'), 'wb') as of: - of.write(html.tostring(root, pretty_print=True, encoding='utf-8', - include_meta_content_type=True)) + root = html.fromstring(alltext.decode('utf-8')) + with open(os.path.join(ddir, 'pretty.html'), 'wb') as of: + of.write(html.tostring(root, pretty_print=True, encoding='utf-8', + include_meta_content_type=True)) if f.index_header is not None: f.index_record.alltext = alltext @@ -866,7 +833,6 
@@ def inspect_mobi(mobi_file, ddir): rec.dump(tdir) - print ('Debug data saved to:', ddir) # }}} diff --git a/src/calibre/ebooks/mobi/reader/headers.py b/src/calibre/ebooks/mobi/reader/headers.py index eaad81730d..db2b07e53a 100644 --- a/src/calibre/ebooks/mobi/reader/headers.py +++ b/src/calibre/ebooks/mobi/reader/headers.py @@ -187,19 +187,13 @@ class BookHeader(object): self.ncxidx, = struct.unpack_from(b'>L', raw, 0xF4) if self.mobi_version >= 8: - self.skelidx, = struct.unpack_from('>L', raw, 0xFC) - - # Index into
sections in raw_ml - self.dividx, = struct.unpack_from('>L', raw, 0xF8) - - # Index into Other files - self.othidx, = struct.unpack_from('>L', raw, 0x104) + self.dividx, self.skelidx, self.datpidx, self.othidx = \ + struct.unpack_from(b'>4L', raw, 0xF8) # need to use the FDST record to find out how to properly # unpack the raw_ml into pieces it is simply a table of start # and end locations for each flow piece - self.fdstidx, = struct.unpack_from('>L', raw, 0xC0) - self.fdstcnt, = struct.unpack_from('>L', raw, 0xC4) + self.fdstidx, self.fdstcnt = struct.unpack_from(b'>2L', raw, 0xC0) # if cnt is 1 or less, fdst section number can be garbage if self.fdstcnt <= 1: self.fdstidx = NULL_INDEX From 43cf8faebc59b94b5965c14b829b9694f8c15c0b Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 18 Mar 2012 12:48:01 +0530 Subject: [PATCH 07/13] ... --- src/calibre/ebooks/mobi/debug/headers.py | 33 ++++++++++++++++-------- src/calibre/ebooks/mobi/reader/mobi6.py | 9 +++++-- 2 files changed, 29 insertions(+), 13 deletions(-) diff --git a/src/calibre/ebooks/mobi/debug/headers.py b/src/calibre/ebooks/mobi/debug/headers.py index 06318c4527..2cc7954559 100644 --- a/src/calibre/ebooks/mobi/debug/headers.py +++ b/src/calibre/ebooks/mobi/debug/headers.py @@ -219,8 +219,9 @@ class EXTHHeader(object): class MOBIHeader(object): # {{{ - def __init__(self, record0): + def __init__(self, record0, offset): self.raw = record0.raw + self.header_offset = offset self.compression_raw = self.raw[:2] self.compression = {1: 'No compression', 2: 'PalmDoc compression', @@ -327,6 +328,19 @@ class MOBIHeader(object): # {{{ (self.sect_idx, self.skel_idx, self.datp_idx, self.oth_idx ) = struct.unpack_from(b'>4L', self.raw, 248) self.unknown9 = self.raw[264:self.length] + if self.meta_orth_indx != self.sect_idx: + raise ValueError('KF8 header has different Meta orth and ' + 'section indices') + + # The following are all relative to the position of the header record + # make them absolute for ease of 
debugging + for x in ('sect_idx', 'skel_idx', 'datp_idx', 'oth_idx', + 'meta_orth_indx', 'huffman_record_offset', + 'first_non_book_record', 'datp_record_offset', 'fcis_number', + 'flis_number', 'primary_index_record', 'fdst_idx', + 'first_image_index'): + if hasattr(self, x): + setattr(self, x, self.header_offset+getattr(self, x)) if self.has_exth: self.exth_offset = 16 + self.length @@ -352,8 +366,8 @@ class MOBIHeader(object): # {{{ ans.append('Encoding: %s'%self.encoding) ans.append('UID: %r'%self.uid) ans.append('File version: %d'%self.file_version) - ans.append('Meta Orth Index: %d'%self.meta_orth_indx) - ans.append('Meta Infl Index: %d'%self.meta_infl_indx) + i('Meta Orth Index (Sections index in KF8)', self.meta_orth_indx) + i('Meta Infl Index', self.meta_infl_indx) ans.append('Secondary index record: %d (null val: %d)'%( self.secondary_index_record, NULL_INDEX)) ans.append('Reserved: %r'%self.reserved) @@ -398,13 +412,10 @@ class MOBIHeader(object): # {{{ ans.append('Primary index record (null value: %d): %d'%(NULL_INDEX, self.primary_index_record)) if self.file_version >= 8: - ans.append('Unknown8: %r'%self.unknown8) - i('SKEL Index', self.skel_idx) i('Sections Index', self.sect_idx) - i('Unknown8', self.unknown8) + i('SKEL Index', self.skel_idx) + i('DATP Index', self.datp_idx) i('Other Index', self.oth_idx) - i('FDST record', self.fdst_idx) - a('FDST Count: %d'%self.fdst_count) if self.unknown9: a('Unknown9: %r'%self.unknown9) @@ -448,7 +459,7 @@ class MOBIFile(object): for i in range(self.palmdb.number_of_records): self.records.append(Record(section(i), self.record_headers[i])) - self.mobi_header = MOBIHeader(self.records[0]) + self.mobi_header = MOBIHeader(self.records[0], 0) self.huffman_record_nums = [] self.kf8_type = None @@ -458,7 +469,7 @@ class MOBIFile(object): elif mh.has_exth and mh.exth.kf8_header_index is not None: self.kf8_type = 'joint' kf8i = mh.exth.kf8_header_index - mh8 = MOBIHeader(self.records[kf8i]) + mh8 = 
MOBIHeader(self.records[kf8i], kf8i) self.mobi8_header = mh8 if 'huff' in self.mobi_header.compression.lower(): @@ -473,7 +484,7 @@ class MOBIFile(object): if self.kf8_type == 'joint': recs6, d6 = huffit(mh.huffman_record_offset, mh.huffman_record_count) - recs8, d8 = huffit(mh8.huffman_record_offset + kf8i, + recs8, d8 = huffit(mh8.huffman_record_offset, mh8.huffman_record_count) self.huffman_record_nums = recs6 + recs8 else: diff --git a/src/calibre/ebooks/mobi/reader/mobi6.py b/src/calibre/ebooks/mobi/reader/mobi6.py index c8dec607c1..92bdd1d3bf 100644 --- a/src/calibre/ebooks/mobi/reader/mobi6.py +++ b/src/calibre/ebooks/mobi/reader/mobi6.py @@ -118,12 +118,17 @@ class MobiReader(object): try: self.book_header = BookHeader(self.sections[k8i][0], self.ident, user_encoding, self.log) - # The following are only correct in the Mobi 6 - # header not the Mobi 8 header + + # Only the first_image_index from the MOBI 6 header is + # useful for x in ('first_image_index',): setattr(self.book_header, x, getattr(bh, x)) + + # We need to do this because the MOBI 6 text extract code + # does not know anything about the kf8 offset if hasattr(self.book_header, 'huff_offset'): self.book_header.huff_offset += k8i + self.kf8_type = 'joint' self.kf8_boundary = k8i-1 except: From 9712175d74aa11a73b83dbb2f759ebda5472acfe Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 18 Mar 2012 13:23:16 +0530 Subject: [PATCH 08/13] MOBI Input: Fix regression caused by KF8 support that broke reading on ancient PRC files from Baen --- src/calibre/ebooks/mobi/reader/headers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/mobi/reader/headers.py b/src/calibre/ebooks/mobi/reader/headers.py index db2b07e53a..20a31cde50 100644 --- a/src/calibre/ebooks/mobi/reader/headers.py +++ b/src/calibre/ebooks/mobi/reader/headers.py @@ -186,7 +186,7 @@ class BookHeader(object): if len(raw) >= 0xF8: self.ncxidx, = struct.unpack_from(b'>L', raw, 0xF4) - if self.mobi_version 
>= 8: + if self.mobi_version == 8 and len(raw) >= (0xF8 + 16): self.dividx, self.skelidx, self.datpidx, self.othidx = \ struct.unpack_from(b'>4L', raw, 0xF8) From dd20e427b5353c7e51fa9aec31bafb75ba1df80c Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 18 Mar 2012 13:27:00 +0530 Subject: [PATCH 09/13] ... --- src/calibre/ebooks/mobi/reader/headers.py | 2 ++ src/calibre/ebooks/mobi/reader/mobi6.py | 5 ++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/mobi/reader/headers.py b/src/calibre/ebooks/mobi/reader/headers.py index 20a31cde50..06d349d5de 100644 --- a/src/calibre/ebooks/mobi/reader/headers.py +++ b/src/calibre/ebooks/mobi/reader/headers.py @@ -186,6 +186,8 @@ class BookHeader(object): if len(raw) >= 0xF8: self.ncxidx, = struct.unpack_from(b'>L', raw, 0xF4) + # Ancient PRC files from Baen can have random values for + # mobi_version, so be conservative if self.mobi_version == 8 and len(raw) >= (0xF8 + 16): self.dividx, self.skelidx, self.datpidx, self.othidx = \ struct.unpack_from(b'>4L', raw, 0xF8) diff --git a/src/calibre/ebooks/mobi/reader/mobi6.py b/src/calibre/ebooks/mobi/reader/mobi6.py index 92bdd1d3bf..6dd789755d 100644 --- a/src/calibre/ebooks/mobi/reader/mobi6.py +++ b/src/calibre/ebooks/mobi/reader/mobi6.py @@ -107,7 +107,10 @@ class MobiReader(object): self.kf8_type = None k8i = getattr(self.book_header.exth, 'kf8_header', None) - if self.book_header.mobi_version == 8: + # Ancient PRC files from Baen can have random values for + # mobi_version, so be conservative + if (self.book_header.mobi_version == 8 and hasattr(self.book_header, + 'skelidx')): self.kf8_type = 'standalone' elif k8i is not None: # Check for joint mobi 6 and kf 8 file try: From be1e281012c3140b76bd2bc4396afd02b4f1bc94 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 18 Mar 2012 19:11:05 +0530 Subject: [PATCH 10/13] ... 
--- src/calibre/ebooks/mobi/debug/mobi8.py | 62 ++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 src/calibre/ebooks/mobi/debug/mobi8.py diff --git a/src/calibre/ebooks/mobi/debug/mobi8.py b/src/calibre/ebooks/mobi/debug/mobi8.py new file mode 100644 index 0000000000..e4a92ee95c --- /dev/null +++ b/src/calibre/ebooks/mobi/debug/mobi8.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>' +__docformat__ = 'restructuredtext en' + +import sys, os + +from calibre.ebooks.mobi.debug.headers import TextRecord + +class MOBIFile(object): + + def __init__(self, mf): + self.mf = mf + h, h8 = mf.mobi_header, mf.mobi8_header + first_text_record = 1 + offset = 0 + res_end = len(mf.records) + if mf.kf8_type == 'joint': + offset = h.exth.kf8_header_index + res_end = offset - 1 + + self.resource_records = mf.records[h.first_non_book_record:res_end] + self.text_records = [TextRecord(i, r, h8.extra_data_flags, + mf.decompress8) for i, r in + enumerate(mf.records[first_text_record+offset: + first_text_record+offset+h8.number_of_text_records])] + + self.raw_text = b''.join(r.raw for r in self.text_records) + + def print_header(self, f=sys.stdout): + print (str(self.mf.palmdb).encode('utf-8'), file=f) + print (file=f) + print ('Record headers:', file=f) + for i, r in enumerate(self.mf.records): + print ('%6d. 
%s'%(i, r.header), file=f) + + print (file=f) + print (str(self.mf.mobi8_header).encode('utf-8'), file=f) + + +def inspect_mobi(mobi_file, ddir): + f = MOBIFile(mobi_file) + with open(os.path.join(ddir, 'header.txt'), 'wb') as out: + f.print_header(f=out) + + alltext = os.path.join(ddir, 'raw_text.html') + with open(alltext, 'wb') as of: + of.write(f.raw_text) + + for tdir, attr in [('text_records', 'text_records'), ('images', + 'image_records'), ('binary', 'binary_records'), ('font', + 'font_records')]: + tdir = os.path.join(ddir, tdir) + os.mkdir(tdir) + for rec in getattr(f, attr, []): + rec.dump(tdir) + + From 86771a52863167d10caf7f4ffd3d8368f8a9bf3f Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 18 Mar 2012 19:14:54 +0530 Subject: [PATCH 11/13] Fix #958320 (Sony PRS not set to accept XMDF (*.zbf) by default) --- src/calibre/devices/prs505/driver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/devices/prs505/driver.py b/src/calibre/devices/prs505/driver.py index bfce4fa1be..3ba3fcf50f 100644 --- a/src/calibre/devices/prs505/driver.py +++ b/src/calibre/devices/prs505/driver.py @@ -27,7 +27,7 @@ class PRS505(USBMS): booklist_class = CollectionsBookList - FORMATS = ['epub', 'lrf', 'lrx', 'rtf', 'pdf', 'txt'] + FORMATS = ['epub', 'lrf', 'lrx', 'rtf', 'pdf', 'txt', 'zbf'] CAN_SET_METADATA = ['title', 'authors', 'collections'] CAN_DO_DEVICE_DB_PLUGBOARD = True From 34da8d4060e5ae63719493f91b59dbd36d1d78e1 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 18 Mar 2012 19:29:26 +0530 Subject: [PATCH 12/13] Fix #958145 ([Enhancement] add a link to 'adding books' preferences in the 'add books' function) --- src/calibre/gui2/actions/add.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/calibre/gui2/actions/add.py b/src/calibre/gui2/actions/add.py index bb695db841..bbdef5b1b5 100644 --- a/src/calibre/gui2/actions/add.py +++ b/src/calibre/gui2/actions/add.py @@ -70,6 +70,9 @@ class AddAction(InterfaceAction): 
self.add_menu.addSeparator() ma('add-formats', _('Add files to selected book records'), triggered=self.add_formats, shortcut=_('Shift+A')) + self.add_menu.addSeparator() + ma('add-config', _('Configure the adding of books'), + triggered=self.add_config) self.qaction.triggered.connect(self.add_books) @@ -78,6 +81,11 @@ class AddAction(InterfaceAction): for action in list(self.add_menu.actions())[1:]: action.setEnabled(enabled) + def add_config(self): + self.gui.iactions['Preferences'].do_config( + initial_plugin=('Import/Export', 'Adding'), + close_after_initial=True) + def add_formats(self, *args): if self.gui.stack.currentIndex() != 0: return From e66f422d9fb37f2ff8551eef8952ea82b515bd9b Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 18 Mar 2012 21:20:02 +0530 Subject: [PATCH 13/13] Fix #958442 (Device Info for Ectaco Jetbook Color) --- src/calibre/customize/builtins.py | 7 +++---- src/calibre/devices/jetbook/driver.py | 25 +++++++++++++++++++++++++ 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 2908444665..55742b3ee3 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -625,7 +625,8 @@ from calibre.devices.eb600.driver import (EB600, COOL_ER, SHINEBOOK, POCKETBOOK701, POCKETBOOK360P, PI2) from calibre.devices.iliad.driver import ILIAD from calibre.devices.irexdr.driver import IREXDR1000, IREXDR800 -from calibre.devices.jetbook.driver import JETBOOK, MIBUK, JETBOOK_MINI +from calibre.devices.jetbook.driver import (JETBOOK, MIBUK, JETBOOK_MINI, + JETBOOK_COLOR) from calibre.devices.kindle.driver import (KINDLE, KINDLE2, KINDLE_DX, KINDLE_FIRE) from calibre.devices.nook.driver import NOOK, NOOK_COLOR @@ -664,9 +665,7 @@ plugins += [ ILIAD, IREXDR1000, IREXDR800, - JETBOOK, - JETBOOK_MINI, - MIBUK, + JETBOOK, JETBOOK_MINI, MIBUK, JETBOOK_COLOR, SHINEBOOK, POCKETBOOK360, POCKETBOOK301, POCKETBOOK602, POCKETBOOK701, POCKETBOOK360P, 
PI2, diff --git a/src/calibre/devices/jetbook/driver.py b/src/calibre/devices/jetbook/driver.py index 0d328ba637..7f2f48a0b4 100644 --- a/src/calibre/devices/jetbook/driver.py +++ b/src/calibre/devices/jetbook/driver.py @@ -125,4 +125,29 @@ class JETBOOK_MINI(USBMS): SUPPORTS_SUB_DIRS = True +class JETBOOK_COLOR(USBMS): + + ''' +set([(u'0x951', + u'0x160b', + u'0x0', + u'Freescale', + u'Mass Storage Device', + u'0802270905553')]) + ''' + + FORMATS = ['epub', 'mobi', 'prc', 'fb2', 'rtf', 'txt', 'pdf', 'djvu'] + + gui_name = 'JetBook Color' + name = 'JetBook Color Device Interface' + description = _('Communicate with the JetBook Color reader.') + author = 'Kovid Goyal' + + VENDOR_ID = [0x951] + PRODUCT_ID = [0x160b] + BCD = [0x0] + EBOOK_DIR_MAIN = 'My Books' + + SUPPORTS_SUB_DIRS = True +