From 427060533522e005f82e6866046abb8b3ec81dee Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 25 Jul 2011 19:49:57 -0600 Subject: [PATCH 01/26] New MOBI output: Write the TBS sequences for periodicals. Also fully decoded all TBS sequences, only unknown bits left are in the opening sequence that seems to depend on the type of record being indexed. The rules are simple, so I just use them instead of spending more time looking for deeper meaning. --- src/calibre/ebooks/mobi/debug.py | 214 ++++++-------------- src/calibre/ebooks/mobi/tbs_periodicals.rst | 89 +++++++- src/calibre/ebooks/mobi/utils.py | 94 +++++++++ src/calibre/ebooks/mobi/writer2/indexer.py | 166 +++++++++++---- 4 files changed, 375 insertions(+), 188 deletions(-) diff --git a/src/calibre/ebooks/mobi/debug.py b/src/calibre/ebooks/mobi/debug.py index b85d73f55c..67f20e691f 100644 --- a/src/calibre/ebooks/mobi/debug.py +++ b/src/calibre/ebooks/mobi/debug.py @@ -12,7 +12,7 @@ from collections import OrderedDict, defaultdict from calibre.utils.date import utc_tz from calibre.ebooks.mobi.langcodes import main_language, sub_language from calibre.ebooks.mobi.utils import (decode_hex_number, decint, - get_trailing_data, decode_fvwi) + get_trailing_data, decode_tbs) from calibre.utils.magick.draw import identify_data # PalmDB {{{ @@ -949,20 +949,22 @@ class TBSIndexing(object): # {{{ ans.append(('\t\tIndex Entry: %d (Parent index: %d, ' 'Depth: %d, Offset: %d, Size: %d) [%s]')%( x.index, x.parent_index, x.depth, x.offset, x.size, x.label)) - def bin3(num): + def bin4(num): ans = bin(num)[2:] - return '0'*(3-len(ans)) + ans + return bytes('0'*(4-len(ans)) + ans) + + def repr_extra(x): + return str({bin4(k):v for k, v in extra.iteritems()}) tbs_type = 0 if len(byts): - outer, consumed = decint(byts) + outermost_index, extra, consumed = decode_tbs(byts) byts = byts[consumed:] - tbs_type = outer & 0b111 - ans.append('TBS Type: %s (%d)'%(bin3(tbs_type), tbs_type)) - ans.append('Outer Index entry: %d'%(outer >> 
3)) - arg1, consumed = decint(byts) - byts = byts[consumed:] - ans.append('Unknown (vwi: always 0?): %d'%arg1) + for k in extra: + tbs_type |= k + ans.append('\nTBS: %d (%s)'%(tbs_type, bin4(tbs_type))) + ans.append('Outermost index: %d'%outermost_index) + ans.append('Unknown extra start bytes: %s'%repr_extra(extra)) if self.doc_type in (257, 259): # Hierarchical periodical byts, a = self.interpret_periodical(tbs_type, byts, dat['geom'][0]) @@ -977,53 +979,21 @@ class TBSIndexing(object): # {{{ def interpret_periodical(self, tbs_type, byts, record_offset): ans = [] - def tbs_type_6(byts, psi=None, msg=None, fmsg='Unknown'): # {{{ - if psi is None: - # Assume parent section is 1 - psi = self.get_index(1) - if msg is None: - msg = ('Article index at start of record or first article' - ' index, relative to parent section') - if byts: - # byts could be empty - arg, consumed = decint(byts) - byts = byts[consumed:] - flags = (arg & 0b1111) - ai = (arg >> 4) - ans.append('%s (fvwi): %d [%d absolute]'%(msg, ai, - ai+psi.index)) - if flags == 1: - arg, consumed = decint(byts) - if arg == 0: - # EOF of record, otherwise ignore and hope someone else - # will deal with these bytes - byts = byts[consumed:] - ans.append('EOF (vwi: should be 0): %d'%arg) - elif flags in (4, 5): - num = byts[0] - byts = byts[1:] - ans.append('Number of article nodes in the record (byte): %d'%num) - if flags == 5: - arg, consumed = decint(byts) - byts = byts[consumed:] - ans.append('%s (vwi)): %d'%(fmsg, arg)) - elif flags == 0: - pass - else: - raise ValueError('Unknown flags: %d'%flags) - return byts - - # }}} - def read_section_transitions(byts, psi=None): # {{{ if psi is None: - # Assume parent section is 1 + # Assume previous section is 1 psi = self.get_index(1) while byts: - ai, flags, consumed = decode_fvwi(byts) + ai, extra, consumed = decode_tbs(byts) byts = byts[consumed:] - if flags & 0b1000: + if extra.get(0b0010, None) is not None: + raise ValueError('Dont know how to interpret flag 
0b0010' + ' while reading section transitions') + if extra.get(0b1000, None) is not None: + if len(extra) > 1: + raise ValueError('Dont know how to interpret flags' + ' %r while reading section transitions'%extra) nsi = self.get_index(psi.index+1) ans.append('Last article in this record of section %d' ' (relative to next section index [%d]): ' @@ -1036,113 +1006,57 @@ class TBSIndexing(object): # {{{ ' (relative to its parent section): ' '%d [%d absolute index]'%(psi.index, ai, ai+psi.index)) - if flags == 0: - ans.append('The section %d has only one article' - ' in this record'%psi.index) - continue + num = extra.get(0b0100, None) + if num is None: + msg = ('The section %d has at most one article' + ' in this record')%psi.index + else: + msg = ('Number of articles in this record of ' + 'section %d: %d')%(psi.index, num) + ans.append(msg) - if flags & 0b0100: - num = byts[0] - byts = byts[1:] - ans.append('Number of articles in this record of ' - 'section %d: %d'%(psi.index, num)) - - if flags & 0b0010: - raise ValueError( - 'Dont know how to interpret the 0b0010 flag') - - if flags & 0b0001: - arg, consumed = decint(byts) - byts = byts[consumed:] - ans.append('->Offset to start of next section (%d) from start' + offset = extra.get(0b0001, None) + if offset is not None: + if offset == 0: + ans.append('This record is spanned by the article:' + '%d'%(ai+psi.index)) + else: + ans.append('->Offset to start of next section (%d) from start' ' of record: %d [%d absolute offset]'%(psi.index+1, - arg, arg+record_offset)) + offset, offset+record_offset)) + return byts # }}} - if tbs_type == 3: # {{{ - arg2, consumed = decint(byts) + def read_starting_section(byts): # {{{ + si, extra, consumed = decode_tbs(byts) byts = byts[consumed:] - ans.append('Unknown (vwi: always 0?): %d'%arg2) - - arg3, consumed = decint(byts) - byts = byts[consumed:] - fsi = arg3 >> 4 - flags = arg3 & 0b1111 - ans.append('First section index (fvwi): %d'%fsi) - psi = self.get_index(fsi) - 
ans.append('Flags: %d'%flags) - if flags == 4: - ans.append('Number of articles in this section: %d'%byts[0]) - byts = byts[1:] - elif flags == 0: - pass - else: - raise ValueError('Unknown flags value: %d'%flags) - byts = read_section_transitions(byts, psi) - - # }}} - - elif tbs_type == 7: # {{{ - # This occurs for records that have no section nodes and - # whose parent section's index == 1 - ans.append('Unknown (maybe vwi?): %r'%bytes(byts[:2])) - byts = byts[2:] - arg, consumed = decint(byts) - byts = byts[consumed:] - ai = arg >> 4 - flags = arg & 0b1111 - ans.append('Article at start of record (fvwi): %d'%ai) - if flags == 4: - num = byts[0] - byts = byts[1:] - ans.append('Number of articles in record (byte): %d'%num) - elif flags == 0: - pass - elif flags == 1: - arg, consumed = decint(byts) - byts = byts[consumed:] - ans.append('EOF (vwi: should be 0): %d'%arg) - else: - raise ValueError('Unknown flags value: %d'%flags) + if len(extra) > 1 or 0b0010 in extra or 0b1000 in extra: + raise ValueError('Dont know how to interpret flags %r' + ' when reading starting section'%extra) + si = self.get_index(si) + ans.append('The section at the start of this record is:' + ' %d'%si.index) + if 0b0100 in extra: + num = extra[0b0100] + ans.append('The number of articles from the section %d' + ' in this record: %d'%(si.index, num)) + elif 0b0001 in extra: + eof = extra[0b0001] + if eof != 0: + raise ValueError('Unknown eof value %s when reading' + ' starting section'%eof) + ans.append('This record is spanned by an article from' + ' the section: %d'%si.index) + return si, byts # }}} - elif tbs_type == 6: # {{{ - # This is used for records spanned by an article whose parent - # section's index == 1 or for the opening record if it contains the - # periodical start, section 1 start and at least one article. The - # two cases are distinguished by the flags on the article index - # vwi. 
- unk = byts[0] - byts = byts[1:] - ans.append('Unknown (byte: always 2?): %d'%unk) - byts = tbs_type_6(byts) - # }}} + if tbs_type & 0b0100: + # Starting section is the first section + ssi = self.get_index(1) + else: + ssi, byts = read_starting_section(byts) - elif tbs_type == 2: # {{{ - # This occurs for records with no section nodes and whose parent - # section's index != 1 (undefined (records before the first - # section) or > 1) - # This is also used for records that are spanned by an article - # whose parent section index > 1. In this case the flags of the - # vwi referring to the article at the start - # of the record are set to 1 instead of 4. - arg, consumed = decint(byts) - byts = byts[consumed:] - flags = (arg & 0b1111) - psi = (arg >> 4) - ans.append('Parent section index (fvwi): %d'%psi) - psi = self.get_index(psi) - ans.append('Flags: %d'%flags) - if flags == 1: - arg, consumed = decint(byts) - byts = byts[consumed:] - ans.append('Unknown (vwi?: always 0?): %d'%arg) - byts = tbs_type_6(byts, psi=psi) - elif flags == 0: - byts = tbs_type_6(byts, psi=psi) - else: - raise ValueError('Unknown flags: %d'%flags) - # }}} + byts = read_section_transitions(byts, ssi) return byts, ans diff --git a/src/calibre/ebooks/mobi/tbs_periodicals.rst b/src/calibre/ebooks/mobi/tbs_periodicals.rst index d770133625..2fa6ec90f3 100644 --- a/src/calibre/ebooks/mobi/tbs_periodicals.rst +++ b/src/calibre/ebooks/mobi/tbs_periodicals.rst @@ -3,6 +3,20 @@ Reverse engineering the trailing byte sequences for hierarchical periodicals In the following, *vwi* means variable width integer and *fvwi* means a vwi whose lowest four bits are used as a flag. All the following information/inferences are from examining the output of kindlegen on a sample periodical. Given the general level of Amazon's incompetence, there are no guarantees that this information is the *best/most complete* way to do TBS indexing. 
+Sequence encoding: + +0b1000 : Continuation bit + +First sequences: +0b0010 : 80 +0b0011 : 80 80 +0b0110 : 80 2 +0b0111 : 80 2 80 + +Other sequences: +0b0101 : 4 1a +0b0001 : c b1 + Opening record ---------------- @@ -52,10 +66,60 @@ The text record that contains the opening node for the periodical (depth=0 node If there was only a single article, instead of 2, then the last two bytes would be: c0, i.e. there would be no byte giving the number of articles in the record. + Starting record with two section transitions:: + + Record #1: Starts at: 0 Ends at: 4095 + Contains: 7 index entries (0 ends, 4 complete, 3 starts) + TBS bytes: 86 80 2 c0 b8 c4 3 + Complete: + Index Entry: 1 (Parent index: 0, Depth: 1, Offset: 564, Size: 375) [Ars Technica] + Index Entry: 5 (Parent index: 1, Depth: 2, Offset: 572, Size: 367) [Week in gaming: 3DS review, Crysis 2, George Hotz] + Index Entry: 6 (Parent index: 2, Depth: 2, Offset: 947, Size: 1014) [Max and the Magic Marker for iPad: Review] + Index Entry: 7 (Parent index: 2, Depth: 2, Offset: 1961, Size: 1077) [iPad 2 steers itself into home console gaming territory with Real Racing 2 HD] + Starts: + Index Entry: 0 (Parent index: -1, Depth: 0, Offset: 215, Size: 35372) [j_x's Google reader] + Index Entry: 2 (Parent index: 0, Depth: 1, Offset: 939, Size: 10368) [Neowin.net] + Index Entry: 8 (Parent index: 2, Depth: 2, Offset: 3038, Size: 1082) [Microsoft's Joe Belfiore still working on upcoming Zune hardware] + TBS Type: 110 (6) + Outer Index entry: 0 + Unknown (vwi: always 0?): 0 + Unknown (byte: always 2?): 2 + Article index at start of record or first article index, relative to parent section (fvwi): 4 [5 absolute] + Remaining bytes: b8 c4 3 + + Starting record with three section transitions:: + + Record #1: Starts at: 0 Ends at: 4095 + Contains: 10 index entries (0 ends, 7 complete, 3 starts) + TBS bytes: 86 80 2 c0 b8 c0 b8 c4 4 + Complete: + Index Entry: 1 (Parent index: 0, Depth: 1, Offset: 564, Size: 375) [Ars Technica] + 
Index Entry: 2 (Parent index: 0, Depth: 1, Offset: 939, Size: 316) [Neowin.net] + Index Entry: 5 (Parent index: 1, Depth: 2, Offset: 572, Size: 367) [Week in gaming: 3DS review, Crysis 2, George Hotz] + Index Entry: 6 (Parent index: 2, Depth: 2, Offset: 947, Size: 308) [Max and the Magic Marker for iPad: Review] + Index Entry: 7 (Parent index: 3, Depth: 2, Offset: 1263, Size: 760) [OSnews Asks on Interrupts: The Results] + Index Entry: 8 (Parent index: 3, Depth: 2, Offset: 2023, Size: 693) [Apple Ditches SAMBA in Favour of Homegrown Replacement] + Index Entry: 9 (Parent index: 3, Depth: 2, Offset: 2716, Size: 747) [ITC: Apple's Mobile Products Do Not Violate Nokia Patents] + Starts: + Index Entry: 0 (Parent index: -1, Depth: 0, Offset: 215, Size: 25320) [j_x's Google reader] + Index Entry: 3 (Parent index: 0, Depth: 1, Offset: 1255, Size: 6829) [OSNews] + Index Entry: 10 (Parent index: 3, Depth: 2, Offset: 3463, Size: 666) [Transparent Monitor Embedded in Window Glass] + TBS Type: 110 (6) + Outer Index entry: 0 + Unknown (vwi: always 0?): 0 + Unknown (byte: always 2?): 2 + Article index at start of record or first article index, relative to parent section (fvwi): 4 [5 absolute] + Remaining bytes: b8 c0 b8 c4 4 + + + + Records with no nodes ------------------------ +subtype = 010 + These records are spanned by a single article. They are of two types: 1. If the parent section index is 1, TBS type of 6, like this:: @@ -247,7 +311,7 @@ In such a record there is a transition from one section to the next. As such the Last article of ending section w.r.t. 
starting section offset (fvwi): 12 [15 absolute] Flags (always 8?): 8 Article index at start of record or first article index, relative to parent section (fvwi): 13 [16 absolute] - Number of article nodes in the record (byte): 4 + Number of article nodes in the record belonging ot the last section (byte): 4 Ending record @@ -274,3 +338,26 @@ Logically, ending records must have at least one article ending, one section end If the record had only a single article end, the last two bytes would be replaced with: f0 +If the last record has multiple section transitions, it is of type 6 and looks like:: + + Record #9: Starts at: 32768 Ends at: 34953 + Contains: 9 index entries (3 ends, 6 complete, 0 starts) + TBS bytes: 86 80 2 1 d0 1 c8 1 d0 1 c8 1 d0 1 c8 1 d0 + Ends: + Index Entry: 0 (Parent index: -1, Depth: 0, Offset: 215, Size: 34739) [j_x's Google reader] + Index Entry: 1 (Parent index: 0, Depth: 1, Offset: 7758, Size: 26279) [Ars Technica] + Index Entry: 14 (Parent index: 1, Depth: 2, Offset: 31929, Size: 2108) [Trademarked keyword sales may soon be restricted in Europe] + Complete: + Index Entry: 2 (Parent index: 0, Depth: 1, Offset: 34037, Size: 316) [Neowin.net] + Index Entry: 3 (Parent index: 0, Depth: 1, Offset: 34353, Size: 282) [OSNews] + Index Entry: 4 (Parent index: 0, Depth: 1, Offset: 34635, Size: 319) [Slashdot] + Index Entry: 15 (Parent index: 2, Depth: 2, Offset: 34045, Size: 308) [Max and the Magic Marker for iPad: Review] + Index Entry: 16 (Parent index: 3, Depth: 2, Offset: 34361, Size: 274) [OSnews Asks on Interrupts: The Results] + Index Entry: 17 (Parent index: 4, Depth: 2, Offset: 34643, Size: 311) [Leonard Nimoy Turns 80] + TBS Type: 110 (6) + Outer Index entry: 0 + Unknown (vwi: always 0?): 0 + Unknown (byte: always 2?): 2 + Article index at start of record or first article index, relative to parent section (fvwi): 13 [14 absolute] + Remaining bytes: 1 c8 1 d0 1 c8 1 d0 1 c8 1 d0 + diff --git a/src/calibre/ebooks/mobi/utils.py 
b/src/calibre/ebooks/mobi/utils.py index ae1241e2f1..37d2093066 100644 --- a/src/calibre/ebooks/mobi/utils.py +++ b/src/calibre/ebooks/mobi/utils.py @@ -11,6 +11,7 @@ import struct from collections import OrderedDict from calibre.utils.magick.draw import Image, save_cover_data_to, thumbnail +from calibre.ebooks import normalize IMAGE_MAX_SIZE = 10 * 1024 * 1024 @@ -197,3 +198,96 @@ def encode_trailing_data(raw): lsize += 1 return raw + encoded +def encode_fvwi(val, flags): + ''' + Encode the value val and the 4 bit flags flags as a fvwi. This encoding is + used in the trailing byte sequences for indexing. Returns encoded + bytestring. + ''' + ans = (val << 4) | (flags & 0b1111) + return encint(ans) + + +def decode_fvwi(byts): + ''' + Decode encoded fvwi. Returns number, flags, consumed + ''' + arg, consumed = decint(bytes(byts)) + return (arg >> 4), (arg & 0b1111), consumed + +def decode_tbs(byts): + ''' + Trailing byte sequences for indexing consists of series of fvwi numbers. + This function reads the fvwi number and its associated flags. It them uses + the flags to read any more numbers that belong to the series. The flags are + the lowest 4 bits of the vwi (see the encode_fvwi function above). + + Returns the fvwi number, a dictionary mapping flags bits to the associated + data and the number of bytes consumed. + ''' + byts = bytes(byts) + val, flags, consumed = decode_fvwi(byts) + extra = {} + byts = byts[consumed:] + if flags & 0b1000: + extra[0b1000] = True + if flags & 0b0010: + x, consumed2 = decint(byts) + byts = byts[consumed2:] + extra[0b0010] = x + consumed += consumed2 + if flags & 0b0100: + extra[0b0100] = ord(byts[0]) + byts = byts[1:] + consumed += 1 + if flags & 0b0001: + x, consumed2 = decint(byts) + byts = byts[consumed2:] + extra[0b0001] = x + consumed += consumed2 + return val, extra, consumed + +def encode_tbs(val, extra): + ''' + Encode the number val and the extra data in the extra dict as an fvwi. See + decode_tbs above. 
+ ''' + flags = 0 + for flag in extra: + flags |= flag + ans = encode_fvwi(val, flags) + + if 0b0010 in extra: + ans += encint(extra[0b0010]) + if 0b0100 in extra: + ans += bytes(bytearray([extra[0b0100]])) + if 0b0001 in extra: + ans += encint(extra[0b0001]) + return ans + +def utf8_text(text): + ''' + Convert a possibly null string to utf-8 bytes, guaranteeing to return a non + empty, normalized bytestring. + ''' + if text and text.strip(): + text = text.strip() + if not isinstance(text, unicode): + text = text.decode('utf-8', 'replace') + text = normalize(text).encode('utf-8') + else: + text = _('Unknown').encode('utf-8') + return text + +def align_block(raw, multiple=4, pad=b'\0'): + ''' + Return raw with enough pad bytes append to ensure its length is a multiple + of 4. + ''' + extra = len(raw) % multiple + if extra == 0: return raw + return raw + pad*(multiple - extra) + + + + diff --git a/src/calibre/ebooks/mobi/writer2/indexer.py b/src/calibre/ebooks/mobi/writer2/indexer.py index 41c5d2ec91..04387f47f7 100644 --- a/src/calibre/ebooks/mobi/writer2/indexer.py +++ b/src/calibre/ebooks/mobi/writer2/indexer.py @@ -10,35 +10,13 @@ __docformat__ = 'restructuredtext en' from struct import pack from cStringIO import StringIO -from collections import OrderedDict +from collections import OrderedDict, defaultdict -from calibre.ebooks import normalize -from calibre.ebook.mobi.writer2 import RECORD_SIZE -from calibre.ebooks.mobi.utils import (encint, encode_number_as_hex) +from calibre.ebooks.mobi.writer2 import RECORD_SIZE +from calibre.ebooks.mobi.utils import (encint, encode_number_as_hex, + encode_trailing_data, encode_tbs, align_block, utf8_text) from calibre.ebooks.mobi.langcodes import iana2mobi -def utf8_text(text): - ''' - Convert a possibly null string to utf-8 bytes, guaranteeing to return a non - empty, normalized bytestring. 
- ''' - if text and text.strip(): - text = text.strip() - if not isinstance(text, unicode): - text = text.decode('utf-8', 'replace') - text = normalize(text).encode('utf-8') - else: - text = _('Unknown').encode('utf-8') - return text - -def align_block(raw, multiple=4, pad=b'\0'): - ''' - Return raw with enough pad bytes append to ensure its length is a multiple - of 4. - ''' - extra = len(raw) % multiple - if extra == 0: return raw - return raw + pad*(multiple - extra) class CNCX(object): # {{{ @@ -98,7 +76,7 @@ class IndexEntry(object): # {{{ 'first_child_index': 22, 'last_child_index': 23, } - RTAG_MAP = dict(TAG_VALUES.itervalues(), TAG_VALUES.iterkeys()) + RTAG_MAP = {v:k for k, v in TAG_VALUES.iteritems()} BITMASKS = [1, 2, 3, 4, 5, 21, 22, 23,] @@ -186,17 +164,123 @@ class TBS(object): # {{{ trailing byte sequence for the record. ''' - def __init__(self, data, is_periodical): - if is_periodical: - self.periodical_tbs(data) + def __init__(self, data, is_periodical, first=False, all_sections=[]): + if not data: + self.bytestring = encode_trailing_data(b'') else: - self.book_tbs(data) + self.section_map = OrderedDict((i.index, i) for i in + sorted(all_sections, key=lambda x:x.offset)) - def periodical_tbs(self, data): - self.bytestring = b'' + if is_periodical: + # The starting bytes. + # The value is zero which I think indicates the periodical + # index entry. The values for the various flags seem to be + # unused. If the 0b0100 is present, it means that the record + # deals with section 1 (or is the final record with section + # transitions). 
+ self.type_010 = encode_tbs(0, {0b0010: 0}) + self.type_011 = encode_tbs(0, {0b0010: 0, 0b0001: 0}) + self.type_110 = encode_tbs(0, {0b0100: 2, 0b0010: 0}) + self.type_111 = encode_tbs(0, {0b0100: 2, 0b0010: 0, 0b0001: 0}) - def book_tbs(self, data): - self.bytestring = b'' + depth_map = defaultdict(list) + for x in ('starts', 'ends', 'completes'): + for idx in data[x]: + depth_map[idx.depth].append(idx) + for l in depth_map.itervalues(): + l.sort(key=lambda x:x.offset) + self.periodical_tbs(data, first, depth_map) + else: + self.book_tbs(data, first) + + def periodical_tbs(self, data, first, depth_map): + buf = StringIO() + + has_section_start = (depth_map[1] and depth_map[1][0] in + data['starts']) + spanner = data['spans'] + first_node = None + for nodes in depth_map.values(): + for node in nodes: + if (first_node is None or (node.offset, node.depth) < + (first_node.offset, first_node.depth)): + first_node = node + + parent_section_index = -1 + if depth_map[0]: + # We have a terminal record + typ = (self.type_110 if has_section_start else self.type_010) + if first_node.depth > 0: + parent_section_index = (first_node.index if first_node.depth + == 1 else first_node.parent_index) + else: + if spanner is not None: + # record is spanned by a single article + parent_section_index = spanner.parent_index + typ = (self.type_110 if parent_section_index == 1 else + self.type_010) + elif not depth_map[1]: + # has only article nodes, i.e. 
spanned by a section + parent_section_index = self.depth_map[2][0].parent_index + typ = (self.type_111 if parent_section_index == 1 else + self.type_010) + else: + # has section transitions + parent_section_index = self.depth_map[2][0].parent_index + + buf.write(typ) + + if parent_section_index > 1: + # Write starting section information + if spanner is None: + num_articles = len(depth_map[1]) + extra = {} + if num_articles > 1: + extra = {0b0100: num_articles} + else: + extra = {0b0001: 0} + buf.write(encode_tbs(parent_section_index, extra)) + + if spanner is None: + articles = depth_map[2] + sections = [self.section_map[a.parent_index] for a in articles] + sections.sort(key=lambda x:x.offset) + section_map = {s:[a for a in articles is a.parent_index == + s.index] for s in sections} + for i, section in enumerate(sections): + # All the articles in this record that belong to section + articles = section_map[section] + first_article = articles[0] + last_article = articles[-1] + num = len(articles) + + try: + next_sec = sections[i+1] + except: + next_sec == None + + extra = {} + if num > 1: + extra[0b0100] = num + if i == 0 and next_sec is not None: + # Write offset to next section from start of record + # For some reason kindlegen only writes this offset + # for the first section transition. Imitate it. 
+ extra[0b0001] = next_sec.offset - data['offset'] + + buf.write(encode_tbs(first_article.index-section.index, extra)) + + if next_sec is not None: + buf.write(encode_tbs(last_article.index-next_sec.index, + {0b1000: 0})) + else: + buf.write(encode_tbs(spanner.index - parent_section_index, + {0b0001: 0})) + + self.bytestring = encode_trailing_data(buf.getvalue()) + + def book_tbs(self, data, first): + self.bytestring = encode_trailing_data(b'') # }}} class Indexer(object): # {{{ @@ -548,11 +632,13 @@ class Indexer(object): # {{{ def calculate_trailing_byte_sequences(self): self.tbs_map = {} + found_node = False + sections = [i for i in self.indices if i.depth == 1] for i in xrange(self.number_of_text_records): offset = i * RECORD_SIZE next_offset = offset + RECORD_SIZE data = OrderedDict([('ends',[]), ('completes',[]), ('starts',[]), - ('spans', None)]) + ('spans', None), ('offset', offset)]) for index in self.indices: if index.offset >= next_offset: # Node starts after current record @@ -574,7 +660,13 @@ class Indexer(object): # {{{ data['ends'].append(index) else: data['spans'] = index - self.tbs_map[i+1] = TBS(data, self.is_periodical) + if (data['ends'] or data['completes'] or data['starts'] or + data['spans'] is not None): + self.tbs_map[i+1] = TBS(data, self.is_periodical, first=not + found_node, all_sections=sections) + found_node = True + else: + self.tbs_map[i+1] = TBS({}, self.is_periodical, first=False) def get_trailing_byte_sequence(self, num): return self.tbs_map[num].bytestring From 1297576ee20028ce7302ac180dc6e7c2520ae760 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 25 Jul 2011 20:23:52 -0600 Subject: [PATCH 02/26] New MOBI output: Allow calibre to convert OEB documents with a toc.ncx conforming to the kindlegen periodical specification into periodicals --- src/calibre/ebooks/mobi/writer2/indexer.py | 28 +++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/mobi/writer2/indexer.py 
b/src/calibre/ebooks/mobi/writer2/indexer.py index 04387f47f7..0f7a670cff 100644 --- a/src/calibre/ebooks/mobi/writer2/indexer.py +++ b/src/calibre/ebooks/mobi/writer2/indexer.py @@ -295,7 +295,9 @@ class Indexer(object): # {{{ self.log = oeb.log self.opts = opts - self.is_periodical = opts.mobi_periodical + self.is_periodical = self.detect_periodical() + self.log('Generating MOBI index for a %s'%('periodical' if + self.is_periodical else 'book')) self.is_flat_periodical = False if opts.mobi_periodical: periodical_node = iter(oeb.toc).next() @@ -317,6 +319,28 @@ class Indexer(object): # {{{ self.calculate_trailing_byte_sequences() + def detect_periodical(self): # {{{ + for node in self.oeb.toc.iterdescendants(): + if node.depth() == 1 and node.klass != 'article': + self.log.debug( + 'Not a periodical: Deepest node does not have ' + 'class="article"') + return False + if node.depth() == 2 and node.klass != 'section': + self.log.debug( + 'Not a periodical: Second deepest node does not have' + ' class="section"') + return False + if node.depth() == 3 and node.klass != 'periodical': + self.log.debug('Not a periodical: Third deepest node' + ' does not have class="periodical"') + return False + if node.depth() > 3: + self.log.debug('Not a periodical: Has nodes of depth > 3') + return False + return True + # }}} + def create_index_record(self): # {{{ header_length = 192 buf = StringIO() @@ -630,6 +654,7 @@ class Indexer(object): # {{{ return indices # }}} + # TBS {{{ def calculate_trailing_byte_sequences(self): self.tbs_map = {} found_node = False @@ -670,6 +695,7 @@ class Indexer(object): # {{{ def get_trailing_byte_sequence(self, num): return self.tbs_map[num].bytestring + # }}} # }}} From 586aa592459ea7c3279166393218ceb24463a649 Mon Sep 17 00:00:00 2001 From: Charles Haley <> Date: Tue, 26 Jul 2011 13:53:45 +0100 Subject: [PATCH 03/26] Fix 815573: Series number Tweak will not accept constant value --- resources/default_tweaks.py | 14 +++++++++++++- 
src/calibre/ebooks/metadata/opf2.py | 8 +++++++- src/calibre/library/database2.py | 13 ++++++++----- 3 files changed, 28 insertions(+), 7 deletions(-) diff --git a/resources/default_tweaks.py b/resources/default_tweaks.py index 3e2cc4da57..65cb030f96 100644 --- a/resources/default_tweaks.py +++ b/resources/default_tweaks.py @@ -11,7 +11,7 @@ defaults. ''' #: Auto increment series index -# The algorithm used to assign a new book in an existing series a series number. +# The algorithm used to assign a book added to an existing series a series number. # New series numbers assigned using this tweak are always integer values, except # if a constant non-integer is specified. # Possible values are: @@ -27,7 +27,19 @@ defaults. # series_index_auto_increment = 'next' # series_index_auto_increment = 'next_free' # series_index_auto_increment = 16.5 +# +# Set the use_series_auto_increment_tweak_when_importing tweak to True to +# use the above values when importing/adding books. If this tweak is set to +# False (the default) then the series number will be set to 1 if it is not +# explicitly set to something else during the import. If set to True, then the +# series index will be set according to the series_index_auto_increment setting. +# Note that the use_series_auto_increment_tweak_when_importing tweak is used +# only when a value is not provided during import. If the importing regular +# expression produces a value for series_index, or if you are reading metadata +# from books and the import plugin produces a value, than that value will +# be used irrespective of the setting of the tweak. 
series_index_auto_increment = 'next' +use_series_auto_increment_tweak_when_importing = False #: Add separator after completing an author name # Should the completion separator be append diff --git a/src/calibre/ebooks/metadata/opf2.py b/src/calibre/ebooks/metadata/opf2.py index 186821b0c3..7ad741848e 100644 --- a/src/calibre/ebooks/metadata/opf2.py +++ b/src/calibre/ebooks/metadata/opf2.py @@ -22,6 +22,7 @@ from calibre.utils.date import parse_date, isoformat from calibre.utils.localization import get_lang from calibre import prints, guess_type from calibre.utils.cleantext import clean_ascii_chars +from calibre.utils.config import tweaks class Resource(object): # {{{ ''' @@ -527,7 +528,12 @@ class OPF(object): # {{{ category = MetadataField('type') rights = MetadataField('rights') series = MetadataField('series', is_dc=False) - series_index = MetadataField('series_index', is_dc=False, formatter=float, none_is=1) + if tweaks['use_series_auto_increment_tweak_when_importing']: + series_index = MetadataField('series_index', is_dc=False, + formatter=float, none_is=None) + else: + series_index = MetadataField('series_index', is_dc=False, + formatter=float, none_is=1) title_sort = TitleSortField('title_sort', is_dc=False) rating = MetadataField('rating', is_dc=False, formatter=int) pubdate = MetadataField('date', formatter=parse_date, diff --git a/src/calibre/library/database2.py b/src/calibre/library/database2.py index 8d16ffbc52..9ae8f0569b 100644 --- a/src/calibre/library/database2.py +++ b/src/calibre/library/database2.py @@ -3023,8 +3023,8 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns): stream.seek(0) mi = get_metadata(stream, format, use_libprs_metadata=False) stream.seek(0) - if not mi.series_index: - mi.series_index = 1.0 + if mi.series_index is None: + mi.series_index = self.get_next_series_num_for(mi.series) mi.tags = [_('News')] if arg['add_title_tag']: mi.tags += [arg['title']] @@ -3076,7 +3076,8 @@ class 
LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns): self._add_newbook_tag(mi) if not add_duplicates and self.has_book(mi): return None - series_index = 1.0 if mi.series_index is None else mi.series_index + series_index = self.get_next_series_num_for(mi.series) \ + if mi.series_index is None else mi.series_index aus = mi.author_sort if mi.author_sort else self.author_sort_from_authors(mi.authors) title = mi.title if isbytestring(aus): @@ -3123,7 +3124,8 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns): if not add_duplicates and self.has_book(mi): duplicates.append((path, format, mi)) continue - series_index = 1.0 if mi.series_index is None else mi.series_index + series_index = self.get_next_series_num_for(mi.series) \ + if mi.series_index is None else mi.series_index aus = mi.author_sort if mi.author_sort else self.author_sort_from_authors(mi.authors) title = mi.title if isinstance(aus, str): @@ -3157,7 +3159,8 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns): def import_book(self, mi, formats, notify=True, import_hooks=True, apply_import_tags=True, preserve_uuid=False): - series_index = 1.0 if mi.series_index is None else mi.series_index + series_index = self.get_next_series_num_for(mi.series) \ + if mi.series_index is None else mi.series_index if apply_import_tags: self._add_newbook_tag(mi) if not mi.title: From e61b86cd243da4ff394fb82d94ab31fbe0bafbc8 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 26 Jul 2011 11:25:56 -0600 Subject: [PATCH 04/26] ... 
--- src/calibre/ebooks/mobi/debug.py | 6 +- src/calibre/ebooks/mobi/utils.py | 30 +++++--- src/calibre/ebooks/mobi/writer2/indexer.py | 80 ++++++++++++++-------- src/calibre/ebooks/mobi/writer2/main.py | 2 + 4 files changed, 77 insertions(+), 41 deletions(-) diff --git a/src/calibre/ebooks/mobi/debug.py b/src/calibre/ebooks/mobi/debug.py index 67f20e691f..f35d8ac075 100644 --- a/src/calibre/ebooks/mobi/debug.py +++ b/src/calibre/ebooks/mobi/debug.py @@ -957,15 +957,17 @@ class TBSIndexing(object): # {{{ return str({bin4(k):v for k, v in extra.iteritems()}) tbs_type = 0 + is_periodical = self.doc_type in (257, 258, 259) if len(byts): - outermost_index, extra, consumed = decode_tbs(byts) + outermost_index, extra, consumed = decode_tbs(byts, flag_size=4 if + is_periodical else 3) byts = byts[consumed:] for k in extra: tbs_type |= k ans.append('\nTBS: %d (%s)'%(tbs_type, bin4(tbs_type))) ans.append('Outermost index: %d'%outermost_index) ans.append('Unknown extra start bytes: %s'%repr_extra(extra)) - if self.doc_type in (257, 259): # Hierarchical periodical + if is_periodical: # Hierarchical periodical byts, a = self.interpret_periodical(tbs_type, byts, dat['geom'][0]) ans += a diff --git a/src/calibre/ebooks/mobi/utils.py b/src/calibre/ebooks/mobi/utils.py index 37d2093066..16aa2a3b64 100644 --- a/src/calibre/ebooks/mobi/utils.py +++ b/src/calibre/ebooks/mobi/utils.py @@ -66,11 +66,14 @@ def encint(value, forward=True): If forward is True the bytes returned are suitable for prepending to the output buffer, otherwise they must be append to the output buffer. 
''' + if value < 0: + raise ValueError('Cannot encode negative numbers as vwi') # Encode vwi byts = bytearray() while True: b = value & 0b01111111 value >>= 7 # shift value to the right by 7 bits + byts.append(b) if value == 0: break @@ -198,24 +201,31 @@ def encode_trailing_data(raw): lsize += 1 return raw + encoded -def encode_fvwi(val, flags): +def encode_fvwi(val, flags, flag_size=4): ''' - Encode the value val and the 4 bit flags flags as a fvwi. This encoding is + Encode the value val and the flag_size bits from flags as a fvwi. This encoding is used in the trailing byte sequences for indexing. Returns encoded bytestring. ''' - ans = (val << 4) | (flags & 0b1111) + ans = val << flag_size + for i in xrange(flag_size): + ans |= (flags & (1 << i)) return encint(ans) -def decode_fvwi(byts): +def decode_fvwi(byts, flag_size=4): ''' Decode encoded fvwi. Returns number, flags, consumed ''' arg, consumed = decint(bytes(byts)) - return (arg >> 4), (arg & 0b1111), consumed + val = arg >> flag_size + flags = 0 + for i in xrange(flag_size): + flags |= (arg & (1 << i)) + return val, flags, consumed -def decode_tbs(byts): + +def decode_tbs(byts, flag_size=4): ''' Trailing byte sequences for indexing consists of series of fvwi numbers. This function reads the fvwi number and its associated flags. It them uses @@ -226,10 +236,10 @@ def decode_tbs(byts): data and the number of bytes consumed. ''' byts = bytes(byts) - val, flags, consumed = decode_fvwi(byts) + val, flags, consumed = decode_fvwi(byts, flag_size=flag_size) extra = {} byts = byts[consumed:] - if flags & 0b1000: + if flags & 0b1000 and flag_size > 3: extra[0b1000] = True if flags & 0b0010: x, consumed2 = decint(byts) @@ -247,7 +257,7 @@ def decode_tbs(byts): consumed += consumed2 return val, extra, consumed -def encode_tbs(val, extra): +def encode_tbs(val, extra, flag_size=4): ''' Encode the number val and the extra data in the extra dict as an fvwi. See decode_tbs above. 
@@ -255,7 +265,7 @@ def encode_tbs(val, extra): flags = 0 for flag in extra: flags |= flag - ans = encode_fvwi(val, flags) + ans = encode_fvwi(val, flags, flag_size=flag_size) if 0b0010 in extra: ans += encint(extra[0b0010]) diff --git a/src/calibre/ebooks/mobi/writer2/indexer.py b/src/calibre/ebooks/mobi/writer2/indexer.py index 0f7a670cff..ece96e3a7c 100644 --- a/src/calibre/ebooks/mobi/writer2/indexer.py +++ b/src/calibre/ebooks/mobi/writer2/indexer.py @@ -28,13 +28,12 @@ class CNCX(object): # {{{ MAX_STRING_LENGTH = 500 - def __init__(self, toc, opts): + def __init__(self, toc, is_periodical): self.strings = OrderedDict() - for item in toc: - if item is self.toc: continue + for item in toc.iterdescendants(): self.strings[item.title] = 0 - if opts.mobi_periodical: + if is_periodical: self.strings[item.klass] = 0 self.records = [] @@ -91,6 +90,17 @@ class IndexEntry(object): # {{{ self.first_child_index = None self.last_child_index = None + def __repr__(self): + return ('IndexEntry(offset=%r, depth=%r, length=%r, index=%r,' + ' parent_index=%r)')%(self.offset, self.depth, self.length, + self.index, self.parent_index) + + @dynamic_property + def size(self): + def fget(self): return self.length + def fset(self, val): self.length = val + return property(fget=fget, fset=fset, doc='Alias for length') + @classmethod def tagx_block(cls, for_periodical=True): buf = bytearray() @@ -137,7 +147,7 @@ class IndexEntry(object): # {{{ def entry_type(self): ans = 0 for tag in self.tag_nums: - ans |= (1 << self.BITMASKS[tag]) # 1 << x == 2**x + ans |= (1 << self.BITMASKS.index(tag)) # 1 << x == 2**x return ans @property @@ -152,7 +162,7 @@ class IndexEntry(object): # {{{ val = getattr(self, attr) buf.write(encint(val)) - ans = buf.get_value() + ans = buf.getvalue() return ans # }}} @@ -175,13 +185,16 @@ class TBS(object): # {{{ # The starting bytes. # The value is zero which I think indicates the periodical # index entry. The values for the various flags seem to be - # unused. 
If the 0b0100 is present, it means that the record + # unused. If the 0b100 is present, it means that the record # deals with section 1 (or is the final record with section # transitions). - self.type_010 = encode_tbs(0, {0b0010: 0}) - self.type_011 = encode_tbs(0, {0b0010: 0, 0b0001: 0}) - self.type_110 = encode_tbs(0, {0b0100: 2, 0b0010: 0}) - self.type_111 = encode_tbs(0, {0b0100: 2, 0b0010: 0, 0b0001: 0}) + self.type_010 = encode_tbs(0, {0b010: 0}, flag_size=3) + self.type_011 = encode_tbs(0, {0b010: 0, 0b001: 0}, + flag_size=3) + self.type_110 = encode_tbs(0, {0b100: 2, 0b010: 0}, + flag_size=3) + self.type_111 = encode_tbs(0, {0b100: 2, 0b010: 0, 0b001: + 0}, flag_size=3) depth_map = defaultdict(list) for x in ('starts', 'ends', 'completes'): @@ -221,12 +234,18 @@ class TBS(object): # {{{ self.type_010) elif not depth_map[1]: # has only article nodes, i.e. spanned by a section - parent_section_index = self.depth_map[2][0].parent_index + parent_section_index = depth_map[2][0].parent_index typ = (self.type_111 if parent_section_index == 1 else self.type_010) else: # has section transitions - parent_section_index = self.depth_map[2][0].parent_index + if depth_map[2]: + parent_section_index = depth_map[2][0].parent_index + typ = self.type_011 + else: + parent_section_index = depth_map[1][0].index + typ = (self.type_110 if parent_section_index == 1 else + self.type_011) buf.write(typ) @@ -243,9 +262,10 @@ class TBS(object): # {{{ if spanner is None: articles = depth_map[2] - sections = [self.section_map[a.parent_index] for a in articles] - sections.sort(key=lambda x:x.offset) - section_map = {s:[a for a in articles is a.parent_index == + sections = set([self.section_map[a.parent_index] for a in + articles]) + sections = sorted(sections, key=lambda x:x.offset) + section_map = {s:[a for a in articles if a.parent_index == s.index] for s in sections} for i, section in enumerate(sections): # All the articles in this record that belong to section @@ -257,7 +277,7 @@ 
class TBS(object): # {{{ try: next_sec = sections[i+1] except: - next_sec == None + next_sec = None extra = {} if num > 1: @@ -299,14 +319,14 @@ class Indexer(object): # {{{ self.log('Generating MOBI index for a %s'%('periodical' if self.is_periodical else 'book')) self.is_flat_periodical = False - if opts.mobi_periodical: + if self.is_periodical: periodical_node = iter(oeb.toc).next() sections = tuple(periodical_node) self.is_flat_periodical = len(sections) == 1 self.records = [] - self.cncx = CNCX(oeb.toc, opts) + self.cncx = CNCX(oeb.toc, self.is_periodical) if self.is_periodical: self.indices = self.create_periodical_index() @@ -405,7 +425,7 @@ class Indexer(object): # {{{ buf.write(pack(b'>I', 0)) # Filled in later # Number of index records 24-28 - buf.write(pack('b>I', len(self.records))) + buf.write(pack(b'>I', len(self.records))) # Index Encoding 28-32 buf.write(pack(b'>I', 65001)) # utf-8 @@ -457,7 +477,7 @@ class Indexer(object): # {{{ idxt_offset = buf.tell() buf.write(b'IDXT') - buf.write(header_length + len(tagx_block)) + buf.write(pack(b'>H', header_length + len(tagx_block))) buf.write(b'\0') buf.seek(20) buf.write(pack(b'>I', idxt_offset)) @@ -567,7 +587,7 @@ class Indexer(object): # {{{ for s, x in enumerate(normalized_sections): sec, normalized_articles = x try: - sec.length = normalized_sections[s+1].offset - sec.offset + sec.length = normalized_sections[s+1][0].offset - sec.offset except: sec.length = self.serializer.body_end_offset - sec.offset for i, art in enumerate(normalized_articles): @@ -583,17 +603,18 @@ class Indexer(object): # {{{ normalized_articles)) normalized_sections[i] = (sec, normalized_articles) - normalized_sections = list(filter(lambda x: x[0].size > 0 and x[1], + normalized_sections = list(filter(lambda x: x[0].length > 0 and x[1], normalized_sections)) # Set indices i = 0 - for sec, normalized_articles in normalized_sections: + for sec, articles in normalized_sections: i += 1 sec.index = i + sec.parent_index = 0 - for sec, 
normalized_articles in normalized_sections: - for art in normalized_articles: + for sec, articles in normalized_sections: + for art in articles: i += 1 art.index = i art.parent_index = sec.index @@ -606,7 +627,7 @@ class Indexer(object): # {{{ for s, x in enumerate(normalized_sections): sec, articles = x try: - next_offset = normalized_sections[s+1].offset + next_offset = normalized_sections[s+1][0].offset except: next_offset = self.serializer.body_end_offset sec.length = next_offset - sec.offset @@ -622,7 +643,7 @@ class Indexer(object): # {{{ for s, x in enumerate(normalized_sections): sec, articles = x try: - next_sec = normalized_sections[s+1] + next_sec = normalized_sections[s+1][0] except: if (sec.length == 0 or sec.next_offset != self.serializer.body_end_offset): @@ -659,6 +680,7 @@ class Indexer(object): # {{{ self.tbs_map = {} found_node = False sections = [i for i in self.indices if i.depth == 1] + deepest = max(i.depth for i in self.indices) for i in xrange(self.number_of_text_records): offset = i * RECORD_SIZE next_offset = offset + RECORD_SIZE @@ -683,7 +705,7 @@ class Indexer(object): # {{{ if index.next_offset <= next_offset: # Node ends in current record data['ends'].append(index) - else: + elif index.depth == deepest: data['spans'] = index if (data['ends'] or data['completes'] or data['starts'] or data['spans'] is not None): diff --git a/src/calibre/ebooks/mobi/writer2/main.py b/src/calibre/ebooks/mobi/writer2/main.py index 06572f48c4..a5e80cc3cd 100644 --- a/src/calibre/ebooks/mobi/writer2/main.py +++ b/src/calibre/ebooks/mobi/writer2/main.py @@ -55,6 +55,7 @@ class MobiWriter(object): self.last_text_record_idx = 1 def __call__(self, oeb, path_or_stream): + self.log = oeb.log if hasattr(path_or_stream, 'write'): return self.dump_stream(oeb, path_or_stream) with open(path_or_stream, 'w+b') as stream: @@ -90,6 +91,7 @@ class MobiWriter(object): self.primary_index_record_idx = None try: self.indexer = Indexer(self.serializer, 
self.last_text_record_idx, + len(self.records[self.last_text_record_idx]), self.opts, self.oeb) except: self.log.exception('Failed to generate MOBI index:') From abe30422a6986ade37995802b921bb1cf083282e Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 26 Jul 2011 11:42:57 -0600 Subject: [PATCH 05/26] ... --- src/calibre/ebooks/mobi/writer2/indexer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/mobi/writer2/indexer.py b/src/calibre/ebooks/mobi/writer2/indexer.py index ece96e3a7c..311b4220d9 100644 --- a/src/calibre/ebooks/mobi/writer2/indexer.py +++ b/src/calibre/ebooks/mobi/writer2/indexer.py @@ -125,7 +125,7 @@ class IndexEntry(object): # {{{ buf.append(1) header = b'TAGX' - header += pack(b'>I', len(buf)) # table length + header += pack(b'>I', 12+len(buf)) # table length header += pack(b'>I', 1) # control byte count return header + bytes(buf) From e8cc278b186a653f11e4db07efa4918c1bf8fa32 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 26 Jul 2011 12:06:09 -0600 Subject: [PATCH 06/26] ... 
--- src/calibre/ebooks/mobi/writer2/indexer.py | 3 +-- src/calibre/ebooks/mobi/writer2/main.py | 14 +++++++++++--- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/src/calibre/ebooks/mobi/writer2/indexer.py b/src/calibre/ebooks/mobi/writer2/indexer.py index 311b4220d9..f6add97a53 100644 --- a/src/calibre/ebooks/mobi/writer2/indexer.py +++ b/src/calibre/ebooks/mobi/writer2/indexer.py @@ -52,11 +52,10 @@ class CNCX(object): # {{{ self.records.append(buf.getvalue()) buf.truncate(0) offset = len(self.records) * 0x10000 - + buf.write(raw) self.strings[key] = offset offset += len(raw) - buf.write(b'\0') # CNCX must end with zero byte self.records.append(align_block(buf.getvalue())) def __getitem__(self, string): diff --git a/src/calibre/ebooks/mobi/writer2/main.py b/src/calibre/ebooks/mobi/writer2/main.py index a5e80cc3cd..e614567508 100644 --- a/src/calibre/ebooks/mobi/writer2/main.py +++ b/src/calibre/ebooks/mobi/writer2/main.py @@ -279,7 +279,7 @@ class MobiWriter(object): last_content_record = len(self.records) - 1 # EOF record - self.records.append('\xE9\x8E\x0D\x0A') + self.records.append(b'\xE9\x8E\x0D\x0A') record0 = StringIO() # The MOBI Header @@ -309,8 +309,15 @@ class MobiWriter(object): # 0x10 - 0x13 : UID # 0x14 - 0x17 : Generator version + bt = 0x002 + if self.primary_index_record_idx is not None: + if self.indexer.is_flat_periodical: + bt = 0x102 + elif self.indexer.is_periodical: + bt = 0x103 + record0.write(pack(b'>IIIII', - 0xe8, 0x002, 65001, uid, 6)) + 0xe8, bt, 65001, uid, 6)) # 0x18 - 0x1f : Unknown record0.write(b'\xff' * 8) @@ -339,7 +346,8 @@ class MobiWriter(object): # 0x58 - 0x5b : Format version # 0x5c - 0x5f : First image record number record0.write(pack(b'>II', - 6, self.first_image_record if self.first_image_record else 0)) + 6, self.first_image_record if self.first_image_record else + len(self.records)-1)) # 0x60 - 0x63 : First HUFF/CDIC record number # 0x64 - 0x67 : Number of HUFF/CDIC records From 
91ac0a879c285b2a57a4cdd1ee298383b0c3753d Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 26 Jul 2011 12:13:20 -0600 Subject: [PATCH 07/26] ... --- src/calibre/ebooks/mobi/debug.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/mobi/debug.py b/src/calibre/ebooks/mobi/debug.py index f35d8ac075..6c9a2136b7 100644 --- a/src/calibre/ebooks/mobi/debug.py +++ b/src/calibre/ebooks/mobi/debug.py @@ -73,7 +73,7 @@ class PalmDB(object): self.ident = self.type + self.creator if self.ident not in (b'BOOKMOBI', b'TEXTREAD'): raise ValueError('Unknown book ident: %r'%self.ident) - self.uid_seed = self.raw[68:72] + self.uid_seed, = struct.unpack(b'>I', self.raw[68:72]) self.next_rec_list_id = self.raw[72:76] self.number_of_records, = struct.unpack(b'>H', self.raw[76:78]) @@ -290,7 +290,12 @@ class MOBIHeader(object): # {{{ (self.fcis_number, self.fcis_count, self.flis_number, self.flis_count) = struct.unpack(b'>IIII', self.raw[200:216]) - self.unknown6 = self.raw[216:240] + self.unknown6 = self.raw[216:224] + self.srcs_record_index = struct.unpack(b'>I', + self.raw[224:228])[0] + self.num_srcs_records = struct.unpack(b'>I', + self.raw[228:232])[0] + self.unknown7 = self.raw[232:240] self.extra_data_flags = struct.unpack(b'>I', self.raw[240:244])[0] self.has_multibytes = bool(self.extra_data_flags & 0b1) @@ -356,6 +361,9 @@ class MOBIHeader(object): # {{{ ans.append('FLIS number: %d'% self.flis_number) ans.append('FLIS count: %d'% self.flis_count) ans.append('Unknown6: %r'% self.unknown6) + ans.append('SRCS record index: %d'%self.srcs_record_index) + ans.append('Number of SRCS records?: %d'%self.num_srcs_records) + ans.append('Unknown7: %r'%self.unknown7) ans.append(('Extra data flags: %s (has multibyte: %s) ' '(has indexing: %s) (has uncrossable breaks: %s)')%( bin(self.extra_data_flags), self.has_multibytes, From f47f4afe9f2bfd157eb973e69a2a59449aa2cc40 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 26 Jul 2011 
13:41:23 -0600 Subject: [PATCH 08/26] ... --- src/calibre/ebooks/pdf/writer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/calibre/ebooks/pdf/writer.py b/src/calibre/ebooks/pdf/writer.py index 516509fdd7..dc7f2edba9 100644 --- a/src/calibre/ebooks/pdf/writer.py +++ b/src/calibre/ebooks/pdf/writer.py @@ -165,6 +165,7 @@ class PDFWriter(QObject): # {{{ printer = get_pdf_printer(self.opts) printer.setOutputFileName(item_path) self.view.print_(printer) + printer.abort() self._render_book() def _delete_tmpdir(self): @@ -186,6 +187,7 @@ class PDFWriter(QObject): # {{{ draw_image_page(printer, painter, p, preserve_aspect_ratio=self.opts.preserve_cover_aspect_ratio) painter.end() + printer.abort() def _write(self): From dbbde2c494743a78e077e0b42953039efbf25431 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 26 Jul 2011 14:20:37 -0600 Subject: [PATCH 09/26] ... --- src/calibre/ebooks/mobi/writer2/indexer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/calibre/ebooks/mobi/writer2/indexer.py b/src/calibre/ebooks/mobi/writer2/indexer.py index f6add97a53..f99f7824d0 100644 --- a/src/calibre/ebooks/mobi/writer2/indexer.py +++ b/src/calibre/ebooks/mobi/writer2/indexer.py @@ -500,12 +500,12 @@ class Indexer(object): # {{{ continue seen.add(offset) index = IndexEntry(offset, label) - self.indices.append(index) + indices.append(index) indices.sort(key=lambda x:x.offset) # Set lengths - for i, index in indices: + for i, index in enumerate(indices): try: next_offset = indices[i+1].offset except: @@ -516,11 +516,11 @@ class Indexer(object): # {{{ indices = [i for i in indices if i.length > 0] # Set index values - for i, index in indices: + for i, index in enumerate(indices): index.index = i # Set lengths again to close up any gaps left by filtering - for i, index in indices: + for i, index in enumerate(indices): try: next_offset = indices[i+1].offset except: From 3cd3ca6acd13d1cdc25512f56137e6b1c18dbe7e Mon Sep 17 00:00:00 2001 From: 
Kovid Goyal Date: Tue, 26 Jul 2011 15:18:33 -0600 Subject: [PATCH 10/26] Improve Irish Times --- recipes/irish_times.recipe | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/recipes/irish_times.recipe b/recipes/irish_times.recipe index 3efcfc6d29..31ccd306e4 100644 --- a/recipes/irish_times.recipe +++ b/recipes/irish_times.recipe @@ -1,4 +1,4 @@ -__license__ = 'GPL v3' +__license__ = 'GPL v3' __copyright__ = "2008, Derry FitzGerald. 2009 Modified by Ray Kinsella and David O'Callaghan, 2011 Modified by Phil Burns" ''' irishtimes.com @@ -10,7 +10,7 @@ from calibre.web.feeds.news import BasicNewsRecipe class IrishTimes(BasicNewsRecipe): title = u'The Irish Times' encoding = 'ISO-8859-15' - __author__ = "Derry FitzGerald, Ray Kinsella, David O'Callaghan and Phil Burns" + __author__ = "Derry FitzGerald, Ray Kinsella, David O'Callaghan and Phil Burns" language = 'en_IE' timefmt = ' (%A, %B %d, %Y)' @@ -18,6 +18,7 @@ class IrishTimes(BasicNewsRecipe): oldest_article = 1.0 max_articles_per_feed = 100 no_stylesheets = True + simultaneous_downloads= 5 r = re.compile('.*(?Phttp:\/\/(www.irishtimes.com)|(rss.feedsportal.com\/c)\/.*\.html?).*') remove_tags = [dict(name='div', attrs={'class':'footer'})] @@ -25,17 +26,17 @@ class IrishTimes(BasicNewsRecipe): feeds = [ ('Frontpage', 'http://www.irishtimes.com/feeds/rss/newspaper/index.rss'), - ('Ireland', 'http://rss.feedsportal.com/c/851/f/10845/index.rss'), - ('World', 'http://rss.feedsportal.com/c/851/f/10846/index.rss'), - ('Finance', 'http://rss.feedsportal.com/c/851/f/10847/index.rss'), - ('Features', 'http://rss.feedsportal.com/c/851/f/10848/index.rss'), - ('Sport', 'http://rss.feedsportal.com/c/851/f/10849/index.rss'), - ('Opinion', 'http://rss.feedsportal.com/c/851/f/10850/index.rss'), - ('Letters', 'http://rss.feedsportal.com/c/851/f/10851/index.rss'), + ('Ireland', 'http://www.irishtimes.com/feeds/rss/newspaper/ireland.rss'), + ('World', 
'http://www.irishtimes.com/feeds/rss/newspaper/world.rss'), + ('Finance', 'http://www.irishtimes.com/feeds/rss/newspaper/finance.rss'), + ('Features', 'http://www.irishtimes.com/feeds/rss/newspaper/features.rss'), + ('Sport', 'http://www.irishtimes.com/feeds/rss/newspaper/sport.rss'), + ('Opinion', 'http://www.irishtimes.com/feeds/rss/newspaper/opinion.rss'), + ('Letters', 'http://www.irishtimes.com/feeds/rss/newspaper/letters.rss'), ('Magazine', 'http://www.irishtimes.com/feeds/rss/newspaper/magazine.rss'), - ('Health', 'http://rss.feedsportal.com/c/851/f/10852/index.rss'), - ('Education & Parenting', 'http://rss.feedsportal.com/c/851/f/10853/index.rss'), - ('Motors', 'http://rss.feedsportal.com/c/851/f/10854/index.rss'), + ('Health', 'http://www.irishtimes.com/feeds/rss/newspaper/health.rss'), + ('Education & Parenting', 'http://www.irishtimes.com/feeds/rss/newspaper/education.rss'), + ('Motors', 'http://www.irishtimes.com/feeds/rss/newspaper/motors.rss'), ('An Teanga Bheo', 'http://www.irishtimes.com/feeds/rss/newspaper/anteangabheo.rss'), ('Commercial Property', 'http://www.irishtimes.com/feeds/rss/newspaper/commercialproperty.rss'), ('Science Today', 'http://www.irishtimes.com/feeds/rss/newspaper/sciencetoday.rss'), @@ -49,10 +50,16 @@ class IrishTimes(BasicNewsRecipe): def print_version(self, url): if url.count('rss.feedsportal.com'): - u = url.replace('0Bhtml/story01.htm','_pf0Bhtml/story01.htm') + #u = url.replace('0Bhtml/story01.htm','_pf0Bhtml/story01.htm') + u = url.find('irishtimes') + u = 'http://www.irishtimes.com' + url[u + 12:] + u = u.replace('0C', '/') + u = u.replace('A', '') + u = u.replace('0Bhtml/story01.htm', '_pf.html') else: u = url.replace('.html','_pf.html') return u def get_article_url(self, article): return article.link + From 0ab02460480af5f48be761913ba86b509e33f54b Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 26 Jul 2011 18:29:12 -0600 Subject: [PATCH 11/26] ... 
--- src/calibre/ebooks/mobi/writer2/indexer.py | 2 +- src/calibre/ebooks/oeb/base.py | 15 +++++++++++---- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/src/calibre/ebooks/mobi/writer2/indexer.py b/src/calibre/ebooks/mobi/writer2/indexer.py index f99f7824d0..4c428dd38d 100644 --- a/src/calibre/ebooks/mobi/writer2/indexer.py +++ b/src/calibre/ebooks/mobi/writer2/indexer.py @@ -31,7 +31,7 @@ class CNCX(object): # {{{ def __init__(self, toc, is_periodical): self.strings = OrderedDict() - for item in toc.iterdescendants(): + for item in toc.iterdescendants(breadth_first=True): self.strings[item.title] = 0 if is_periodical: self.strings[item.klass] = 0 diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index fb1910d717..56f4a3ee96 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -1680,11 +1680,18 @@ class TOC(object): return True return False - def iterdescendants(self): + def iterdescendants(self, breadth_first=False): """Iterate over all descendant nodes in depth-first order.""" - for child in self.nodes: - for node in child.iter(): - yield node + if breadth_first: + for child in self.nodes: + yield child + for child in self.nodes: + for node in child.iterdescendants(breadth_first=True): + yield node + else: + for child in self.nodes: + for node in child.iter(): + yield node def __iter__(self): """Iterate over all immediate child nodes.""" From ae6f049792bc62eb43688d0b266a3dbbff450750 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 26 Jul 2011 20:05:32 -0600 Subject: [PATCH 12/26] ... 
--- src/calibre/ebooks/mobi/debug.py | 13 ++-- src/calibre/ebooks/mobi/writer2/indexer.py | 76 +++++++++++++--------- 2 files changed, 48 insertions(+), 41 deletions(-) diff --git a/src/calibre/ebooks/mobi/debug.py b/src/calibre/ebooks/mobi/debug.py index 6c9a2136b7..4bf8d356cd 100644 --- a/src/calibre/ebooks/mobi/debug.py +++ b/src/calibre/ebooks/mobi/debug.py @@ -424,12 +424,7 @@ class IndexHeader(object): # {{{ if self.index_encoding == 'unknown': raise ValueError( 'Unknown index encoding: %d'%self.index_encoding_num) - self.locale_raw, = struct.unpack(b'>I', raw[32:36]) - langcode = self.locale_raw - langid = langcode & 0xFF - sublangid = (langcode >> 10) & 0xFF - self.language = main_language.get(langid, 'ENGLISH') - self.sublanguage = sub_language.get(sublangid, 'NEUTRAL') + self.possibly_language = raw[32:36] self.num_index_entries, = struct.unpack('>I', raw[36:40]) self.ordt_start, = struct.unpack('>I', raw[40:44]) self.ligt_start, = struct.unpack('>I', raw[44:48]) @@ -489,8 +484,7 @@ class IndexHeader(object): # {{{ a('Number of index records: %d'%self.index_count) a('Index encoding: %s (%d)'%(self.index_encoding, self.index_encoding_num)) - a('Index language: %s - %s (%s)'%(self.language, self.sublanguage, - hex(self.locale_raw))) + a('Unknown (possibly language?): %r'%(self.possibly_language)) a('Number of index entries: %d'% self.num_index_entries) a('ORDT start: %d'%self.ordt_start) a('LIGT start: %d'%self.ligt_start) @@ -1038,6 +1032,7 @@ class TBSIndexing(object): # {{{ # }}} def read_starting_section(byts): # {{{ + orig = byts si, extra, consumed = decode_tbs(byts) byts = byts[consumed:] if len(extra) > 1 or 0b0010 in extra or 0b1000 in extra: @@ -1054,7 +1049,7 @@ class TBSIndexing(object): # {{{ eof = extra[0b0001] if eof != 0: raise ValueError('Unknown eof value %s when reading' - ' starting section'%eof) + ' starting section. 
All bytes: %r'%(eof, orig)) ans.append('This record is spanned by an article from' ' the section: %d'%si.index) return si, byts diff --git a/src/calibre/ebooks/mobi/writer2/indexer.py b/src/calibre/ebooks/mobi/writer2/indexer.py index 4c428dd38d..14c5328622 100644 --- a/src/calibre/ebooks/mobi/writer2/indexer.py +++ b/src/calibre/ebooks/mobi/writer2/indexer.py @@ -15,7 +15,6 @@ from collections import OrderedDict, defaultdict from calibre.ebooks.mobi.writer2 import RECORD_SIZE from calibre.ebooks.mobi.utils import (encint, encode_number_as_hex, encode_trailing_data, encode_tbs, align_block, utf8_text) -from calibre.ebooks.mobi.langcodes import iana2mobi class CNCX(object): # {{{ @@ -173,28 +172,34 @@ class TBS(object): # {{{ trailing byte sequence for the record. ''' - def __init__(self, data, is_periodical, first=False, all_sections=[]): - if not data: - self.bytestring = encode_trailing_data(b'') - else: - self.section_map = OrderedDict((i.index, i) for i in - sorted(all_sections, key=lambda x:x.offset)) + def __init__(self, data, is_periodical, first=False, all_sections=[], + after_first=False): + self.section_map = OrderedDict((i.index, i) for i in + sorted(all_sections, key=lambda x:x.offset)) - if is_periodical: - # The starting bytes. - # The value is zero which I think indicates the periodical - # index entry. The values for the various flags seem to be - # unused. If the 0b100 is present, it means that the record - # deals with section 1 (or is the final record with section - # transitions). - self.type_010 = encode_tbs(0, {0b010: 0}, flag_size=3) - self.type_011 = encode_tbs(0, {0b010: 0, 0b001: 0}, - flag_size=3) - self.type_110 = encode_tbs(0, {0b100: 2, 0b010: 0}, - flag_size=3) - self.type_111 = encode_tbs(0, {0b100: 2, 0b010: 0, 0b001: - 0}, flag_size=3) + if is_periodical: + # The starting bytes. + # The value is zero which I think indicates the periodical + # index entry. The values for the various flags seem to be + # unused. 
If the 0b100 is present, it means that the record + # deals with section 1 (or is the final record with section + # transitions). + self.type_010 = encode_tbs(0, {0b010: 0}, flag_size=3) + self.type_011 = encode_tbs(0, {0b010: 0, 0b001: 0}, + flag_size=3) + self.type_110 = encode_tbs(0, {0b100: 2, 0b010: 0}, + flag_size=3) + self.type_111 = encode_tbs(0, {0b100: 2, 0b010: 0, 0b001: + 0}, flag_size=3) + if not data: + byts = b'' + if after_first: + # This can happen if a record contains only text between + # the periodical start and the first section + byts = self.type_011 + self.bytestring = encode_trailing_data(byts) + else: depth_map = defaultdict(list) for x in ('starts', 'ends', 'completes'): for idx in data[x]: @@ -202,6 +207,9 @@ class TBS(object): # {{{ for l in depth_map.itervalues(): l.sort(key=lambda x:x.offset) self.periodical_tbs(data, first, depth_map) + else: + if not data: + self.bytestring = encode_trailing_data(b'') else: self.book_tbs(data, first) @@ -240,15 +248,13 @@ class TBS(object): # {{{ # has section transitions if depth_map[2]: parent_section_index = depth_map[2][0].parent_index - typ = self.type_011 else: parent_section_index = depth_map[1][0].index - typ = (self.type_110 if parent_section_index == 1 else - self.type_011) + typ = self.type_011 buf.write(typ) - if parent_section_index > 1: + if typ not in (self.type_110, self.type_111) and parent_section_index > 0: # Write starting section information if spanner is None: num_articles = len(depth_map[1]) @@ -429,9 +435,8 @@ class Indexer(object): # {{{ # Index Encoding 28-32 buf.write(pack(b'>I', 65001)) # utf-8 - # Index language 32-36 - buf.write(iana2mobi( - str(self.oeb.metadata.language[0]))) + # Unknown 32-36 + buf.write(b'\xff'*4) # Number of index entries 36-40 buf.write(pack(b'>I', len(self.indices))) @@ -680,15 +685,20 @@ class Indexer(object): # {{{ found_node = False sections = [i for i in self.indices if i.depth == 1] deepest = max(i.depth for i in self.indices) + for i in 
xrange(self.number_of_text_records): offset = i * RECORD_SIZE next_offset = offset + RECORD_SIZE - data = OrderedDict([('ends',[]), ('completes',[]), ('starts',[]), - ('spans', None), ('offset', offset)]) + data = {'ends':[], 'completes':[], 'starts':[], + 'spans':None, 'offset':offset, 'record_number':i+1} + for index in self.indices: if index.offset >= next_offset: # Node starts after current record - break + if index.depth == deepest: + break + else: + continue if index.next_offset <= offset: # Node ends before current record continue @@ -706,13 +716,15 @@ class Indexer(object): # {{{ data['ends'].append(index) elif index.depth == deepest: data['spans'] = index + if (data['ends'] or data['completes'] or data['starts'] or data['spans'] is not None): self.tbs_map[i+1] = TBS(data, self.is_periodical, first=not found_node, all_sections=sections) found_node = True else: - self.tbs_map[i+1] = TBS({}, self.is_periodical, first=False) + self.tbs_map[i+1] = TBS({}, self.is_periodical, first=False, + after_first=found_node) def get_trailing_byte_sequence(self, num): return self.tbs_map[num].bytestring From 4bbc23d70657bdbd6cfa143d396421c35565fe89 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 26 Jul 2011 20:34:42 -0600 Subject: [PATCH 13/26] ... 
--- src/calibre/ebooks/mobi/debug.py | 28 ++++++++++++++++++++-------- src/calibre/ebooks/mobi/utils.py | 3 +++ 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/src/calibre/ebooks/mobi/debug.py b/src/calibre/ebooks/mobi/debug.py index 4bf8d356cd..cb028b9055 100644 --- a/src/calibre/ebooks/mobi/debug.py +++ b/src/calibre/ebooks/mobi/debug.py @@ -604,6 +604,9 @@ class IndexEntry(object): # {{{ self.raw = raw self.tags = [] self.entry_type_raw = entry_type + self.byte_size = len(raw) + + orig_raw = raw try: self.entry_type = self.TYPES[entry_type] @@ -641,8 +644,8 @@ class IndexEntry(object): # {{{ self.tags.append(Tag(aut_tag[0], [val], self.entry_type, cncx)) - if raw.replace(b'\x00', b''): # There can be padding null bytes - raise ValueError('Extra bytes in INDX table entry %d: %r'%(self.index, raw)) + self.consumed = len(orig_raw) - len(raw) + self.trailing_bytes = raw @property def label(self): @@ -694,13 +697,16 @@ class IndexEntry(object): # {{{ return -1 def __str__(self): - ans = ['Index Entry(index=%s, entry_type=%s (%s), length=%d)'%( - self.index, self.entry_type, bin(self.entry_type_raw)[2:], len(self.tags))] + ans = ['Index Entry(index=%s, entry_type=%s (%s), length=%d, byte_size=%d)'%( + self.index, self.entry_type, bin(self.entry_type_raw)[2:], + len(self.tags), self.byte_size)] for tag in self.tags: ans.append('\t'+str(tag)) if self.first_child_index != -1: ans.append('\tNumber of children: %d'%(self.last_child_index - self.first_child_index + 1)) + if self.trailing_bytes: + ans.append('\tTrailing bytes: %r'%self.trailing_bytes) return '\n'.join(ans) # }}} @@ -744,6 +750,7 @@ class IndexRecord(object): # {{{ raise ValueError('Extra bytes after IDXT table: %r'%rest) indxt = raw[192:self.idxt_offset] + self.size_of_indxt_block = len(indxt) self.indices = [] for i, off in enumerate(self.index_offsets): try: @@ -756,10 +763,14 @@ class IndexRecord(object): # {{{ if index_header.index_type == 6: flags = ord(indxt[off+consumed+d]) d += 1 + pos 
= off+consumed+d self.indices.append(IndexEntry(index, entry_type, - indxt[off+consumed+d:next_off], cncx, + indxt[pos:next_off], cncx, index_header.tagx_entries, flags=flags)) - index = self.indices[-1] + + rest = indxt[pos+self.indices[-1].consumed:] + if rest.replace(b'\0', ''): # There can be padding null bytes + raise ValueError('Extra bytes after IDXT table: %r'%rest) def get_parent(self, index): if index.depth < 1: @@ -780,12 +791,13 @@ class IndexRecord(object): # {{{ u(self.unknown1) a('Unknown (header type? index record number? always 1?): %d'%self.header_type) u(self.unknown2) - a('IDXT Offset: %d'%self.idxt_offset) + a('IDXT Offset (%d block size): %d'%(self.size_of_indxt_block, + self.idxt_offset)) a('IDXT Count: %d'%self.idxt_count) u(self.unknown3) u(self.unknown4) a('Index offsets: %r'%self.index_offsets) - a('\nIndex Entries:') + a('\nIndex Entries (%d entries):'%len(self.indices)) for entry in self.indices: a(str(entry)+'\n') diff --git a/src/calibre/ebooks/mobi/utils.py b/src/calibre/ebooks/mobi/utils.py index 16aa2a3b64..839374af70 100644 --- a/src/calibre/ebooks/mobi/utils.py +++ b/src/calibre/ebooks/mobi/utils.py @@ -41,6 +41,9 @@ def encode_number_as_hex(num): number. ''' num = bytes(hex(num)[2:].upper()) + nlen = len(num) + if nlen % 2 != 0: + num = b'0'+num ans = bytearray(num) ans.insert(0, len(num)) return bytes(ans) From 8a0a978c526803acf6beb961009a1e3d56aeacee Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 26 Jul 2011 21:02:25 -0600 Subject: [PATCH 14/26] ... 
--- src/calibre/ebooks/mobi/debug.py | 1 + src/calibre/ebooks/mobi/writer2/main.py | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/calibre/ebooks/mobi/debug.py b/src/calibre/ebooks/mobi/debug.py index cb028b9055..fe1e928dea 100644 --- a/src/calibre/ebooks/mobi/debug.py +++ b/src/calibre/ebooks/mobi/debug.py @@ -182,6 +182,7 @@ class EXTHHeader(object): self.records = [] for i in xrange(self.count): pos = self.read_record(pos) + self.records.sort(key=lambda x:x.type) def read_record(self, pos): type_, length = struct.unpack(b'>II', self.raw[pos:pos+8]) diff --git a/src/calibre/ebooks/mobi/writer2/main.py b/src/calibre/ebooks/mobi/writer2/main.py index e614567508..8925d7f281 100644 --- a/src/calibre/ebooks/mobi/writer2/main.py +++ b/src/calibre/ebooks/mobi/writer2/main.py @@ -29,7 +29,6 @@ EXTH_CODES = { 'identifier': 104, 'subject': 105, 'pubdate': 106, - 'date': 106, 'review': 107, 'contributor': 108, 'rights': 109, @@ -479,16 +478,17 @@ class MobiWriter(object): nrecs += 1 # Write cdetype - if not self.opts.mobi_periodical: + if (self.primary_index_record_idx is None or not + self.indexer.is_periodical): data = b'EBOK' exth.write(pack(b'>II', 501, len(data)+8)) exth.write(data) nrecs += 1 # Add a publication date entry - if oeb.metadata['date'] != [] : + if oeb.metadata['date']: datestr = str(oeb.metadata['date'][0]) - elif oeb.metadata['timestamp'] != [] : + elif oeb.metadata['timestamp']: datestr = str(oeb.metadata['timestamp'][0]) if datestr is not None: From 807b7069d7e14d61bf72c5cc4d99c8664a90b4b0 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 26 Jul 2011 21:23:56 -0600 Subject: [PATCH 15/26] New mobi output: Make the MOBI header/extra records as similar to the output of kindlegen as possible --- src/calibre/ebooks/mobi/debug.py | 2 +- src/calibre/ebooks/mobi/writer2/main.py | 31 ++++++++++++++++++++----- 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/src/calibre/ebooks/mobi/debug.py 
b/src/calibre/ebooks/mobi/debug.py index fe1e928dea..12bdb41f4b 100644 --- a/src/calibre/ebooks/mobi/debug.py +++ b/src/calibre/ebooks/mobi/debug.py @@ -345,7 +345,7 @@ class MOBIHeader(object): # {{{ ans.append('Huffman record offset: %d'%self.huffman_record_offset) ans.append('Huffman record count: %d'%self.huffman_record_count) ans.append('Unknown2: %r'%self.unknown2) - ans.append('EXTH flags: %r (%s)'%(self.exth_flags, self.has_exth)) + ans.append('EXTH flags: %s (%s)'%(bin(self.exth_flags)[2:], self.has_exth)) if self.has_drm_data: ans.append('Unknown3: %r'%self.unknown3) ans.append('DRM Offset: %s'%self.drm_offset) diff --git a/src/calibre/ebooks/mobi/writer2/main.py b/src/calibre/ebooks/mobi/writer2/main.py index 8925d7f281..476b53cd46 100644 --- a/src/calibre/ebooks/mobi/writer2/main.py +++ b/src/calibre/ebooks/mobi/writer2/main.py @@ -102,6 +102,10 @@ class MobiWriter(object): self.records[i] += tbs self.records.extend(self.indexer.records) + @property + def is_periodical(self): + return (self.primary_index_record_idx is None or not + self.indexer.is_periodical) # }}} @@ -277,6 +281,17 @@ class MobiWriter(object): exth = self.build_exth() last_content_record = len(self.records) - 1 + # FCIS/FLIS (Seem to serve no purpose) + flis_number = len(self.records) + self.records.append( + b'FLIS\0\0\0\x08\0\x41\0\0\0\0\0\0\xff\xff\xff\xff\0\x01\0\x03\0\0\0\x03\0\0\0\x01'+ + b'\xff'*4) + fcis = b'FCIS\x00\x00\x00\x14\x00\x00\x00\x10\x00\x00\x00\x01\x00\x00\x00\x00' + fcis += pack(b'>I', self.text_length) + fcis += b'\x00\x00\x00\x00\x00\x00\x00\x20\x00\x00\x00\x08\x00\x01\x00\x01\x00\x00\x00\x00' + fcis_number = len(self.records) + self.records.append(fcis) + # EOF record self.records.append(b'\xE9\x8E\x0D\x0A') @@ -355,7 +370,12 @@ class MobiWriter(object): record0.write(b'\0' * 16) # 0x70 - 0x73 : EXTH flags - record0.write(pack(b'>I', 0x50)) + # Bit 6 (0b1000000) being set indicates the presence of an EXTH header + # The purpose of the other bits is unknown +
exth_flags = 0b1011000 + if self.is_periodical: + exth_flags |= 0b1000 + record0.write(pack(b'>I', exth_flags)) # 0x74 - 0x93 : Unknown record0.write(b'\0' * 32) @@ -380,13 +400,13 @@ class MobiWriter(object): record0.write(b'\0\0\0\x01') # 0xb8 - 0xbb : FCIS record number - record0.write(pack(b'>I', 0xffffffff)) + record0.write(pack(b'>I', fcis_number)) # 0xbc - 0xbf : Unknown (FCIS record count?) - record0.write(pack(b'>I', 0xffffffff)) + record0.write(pack(b'>I', 1)) # 0xc0 - 0xc3 : FLIS record number - record0.write(pack(b'>I', 0xffffffff)) + record0.write(pack(b'>I', flis_number)) # 0xc4 - 0xc7 : Unknown (FLIS record count?) record0.write(pack(b'>I', 1)) @@ -478,8 +498,7 @@ class MobiWriter(object): nrecs += 1 # Write cdetype - if (self.primary_index_record_idx is None or not - self.indexer.is_periodical): + if self.is_periodical: data = b'EBOK' exth.write(pack(b'>II', 501, len(data)+8)) exth.write(data) From 766545324283daa20ceae668c7b9e1ad590df3b9 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 26 Jul 2011 21:41:37 -0600 Subject: [PATCH 16/26] ... 
--- src/calibre/ebooks/mobi/writer2/main.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/calibre/ebooks/mobi/writer2/main.py b/src/calibre/ebooks/mobi/writer2/main.py index 476b53cd46..e13afa2ba7 100644 --- a/src/calibre/ebooks/mobi/writer2/main.py +++ b/src/calibre/ebooks/mobi/writer2/main.py @@ -511,12 +511,20 @@ class MobiWriter(object): datestr = str(oeb.metadata['timestamp'][0]) if datestr is not None: + datestr = bytes(datestr) + datestr = datestr.replace(b'+00:00', b'Z') exth.write(pack(b'>II', EXTH_CODES['pubdate'], len(datestr) + 8)) exth.write(datestr) nrecs += 1 else: raise NotImplementedError("missing date or timestamp needed for mobi_periodical") + # Write the same creator info as kindlegen 1.2 + for code, val in [(204, 202), (205, 1), (206, 2), (207, 33307)]: + exth.write(pack(b'>II', code, 12)) + exth.write(pack(b'>I', val)) + nrecs += 1 + if (oeb.metadata.cover and unicode(oeb.metadata.cover[0]) in oeb.manifest.ids): id = unicode(oeb.metadata.cover[0]) From 3453746d906b5a36fbca9435a065a25217a599e0 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 26 Jul 2011 22:17:50 -0600 Subject: [PATCH 17/26] oops --- src/calibre/ebooks/mobi/debug.py | 2 ++ src/calibre/ebooks/mobi/utils.py | 2 +- src/calibre/ebooks/mobi/writer2/indexer.py | 10 +++++----- src/calibre/ebooks/mobi/writer2/main.py | 8 ++++---- 4 files changed, 12 insertions(+), 10 deletions(-) diff --git a/src/calibre/ebooks/mobi/debug.py b/src/calibre/ebooks/mobi/debug.py index 12bdb41f4b..1279ba7793 100644 --- a/src/calibre/ebooks/mobi/debug.py +++ b/src/calibre/ebooks/mobi/debug.py @@ -844,6 +844,7 @@ class TextRecord(object): # {{{ def __init__(self, idx, record, extra_data_flags, decompress): self.trailing_data, self.raw = get_trailing_data(record.raw, extra_data_flags) + raw_trailing_bytes = record.raw[len(self.raw):] self.raw = decompress(self.raw) if 0 in self.trailing_data: self.trailing_data['multibyte_overlap'] = self.trailing_data.pop(0) @@ -851,6 +852,7 @@ class 
TextRecord(object): # {{{ self.trailing_data['indexing'] = self.trailing_data.pop(1) if 2 in self.trailing_data: self.trailing_data['uncrossable_breaks'] = self.trailing_data.pop(2) + self.trailing_data['raw_bytes'] = raw_trailing_bytes self.idx = idx diff --git a/src/calibre/ebooks/mobi/utils.py b/src/calibre/ebooks/mobi/utils.py index 839374af70..6df9db3b3b 100644 --- a/src/calibre/ebooks/mobi/utils.py +++ b/src/calibre/ebooks/mobi/utils.py @@ -191,7 +191,7 @@ def encode_trailing_data(raw): where size is a backwards encoded vwi whose value is the length of the - entire return bytestring. + entire returned bytestring. data is the bytestring passed in as raw. This is the encoding used for trailing data entries at the end of text records. See get_trailing_data() for details. diff --git a/src/calibre/ebooks/mobi/writer2/indexer.py b/src/calibre/ebooks/mobi/writer2/indexer.py index 14c5328622..f121e29835 100644 --- a/src/calibre/ebooks/mobi/writer2/indexer.py +++ b/src/calibre/ebooks/mobi/writer2/indexer.py @@ -14,7 +14,7 @@ from collections import OrderedDict, defaultdict from calibre.ebooks.mobi.writer2 import RECORD_SIZE from calibre.ebooks.mobi.utils import (encint, encode_number_as_hex, - encode_trailing_data, encode_tbs, align_block, utf8_text) + encode_tbs, align_block, utf8_text) class CNCX(object): # {{{ @@ -198,7 +198,7 @@ class TBS(object): # {{{ # This can happen if a record contains only text between # the periodical start and the first section byts = self.type_011 - self.bytestring = encode_trailing_data(byts) + self.bytestring = byts else: depth_map = defaultdict(list) for x in ('starts', 'ends', 'completes'): @@ -209,7 +209,7 @@ class TBS(object): # {{{ self.periodical_tbs(data, first, depth_map) else: if not data: - self.bytestring = encode_trailing_data(b'') + self.bytestring = b'' else: self.book_tbs(data, first) @@ -302,10 +302,10 @@ class TBS(object): # {{{ buf.write(encode_tbs(spanner.index - parent_section_index, {0b0001: 0})) - self.bytestring 
= encode_trailing_data(buf.getvalue()) + self.bytestring = buf.getvalue() def book_tbs(self, data, first): - self.bytestring = encode_trailing_data(b'') + self.bytestring = b'' # }}} class Indexer(object): # {{{ diff --git a/src/calibre/ebooks/mobi/writer2/main.py b/src/calibre/ebooks/mobi/writer2/main.py index e13afa2ba7..44c471d3d4 100644 --- a/src/calibre/ebooks/mobi/writer2/main.py +++ b/src/calibre/ebooks/mobi/writer2/main.py @@ -99,7 +99,7 @@ class MobiWriter(object): for i in xrange(len(self.records)): if i == 0: continue tbs = self.indexer.get_trailing_byte_sequence(i) - self.records[i] += tbs + self.records[i] += encode_trailing_data(tbs) self.records.extend(self.indexer.records) @property @@ -212,15 +212,15 @@ class MobiWriter(object): if self.compression == PALMDOC: data = compress_doc(data) record = StringIO() - record.write(data) - self.records.append(record.getvalue()) nrecords += 1 data, overlap = self.read_text_record(text) + record.write(data) - # Write information about the mutibyte character overlap, if any + # Write information about the multibyte character overlap, if any record.write(overlap) record.write(pack(b'>B', len(overlap))) + self.records.append(record.getvalue()) self.last_text_record_idx = nrecords From 543cabb4184f0f909056e406af6f217b3f604479 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 26 Jul 2011 23:05:11 -0600 Subject: [PATCH 18/26] ... 
--- src/calibre/ebooks/mobi/utils.py | 24 ++++++++++++++++++++ src/calibre/ebooks/mobi/writer2/indexer.py | 26 ++-------------------- src/calibre/ebooks/mobi/writer2/main.py | 18 +++++---------- 3 files changed, 32 insertions(+), 36 deletions(-) diff --git a/src/calibre/ebooks/mobi/utils.py b/src/calibre/ebooks/mobi/utils.py index 6df9db3b3b..80214b04d3 100644 --- a/src/calibre/ebooks/mobi/utils.py +++ b/src/calibre/ebooks/mobi/utils.py @@ -302,5 +302,29 @@ def align_block(raw, multiple=4, pad=b'\0'): return raw + pad*(multiple - extra) +def detect_periodical(toc, log): + ''' + Detect if the TOC object toc contains a periodical that conforms to the + structure required by kindlegen to generate a periodical. + ''' + for node in toc.iterdescendants(): + if node.depth() == 1 and node.klass != 'article': + log.debug( + 'Not a periodical: Deepest node does not have ' + 'class="article"') + return False + if node.depth() == 2 and node.klass != 'section': + log.debug( + 'Not a periodical: Second deepest node does not have' + ' class="section"') + return False + if node.depth() == 3 and node.klass != 'periodical': + log.debug('Not a periodical: Third deepest node' + ' does not have class="periodical"') + return False + if node.depth() > 3: + log.debug('Not a periodical: Has nodes of depth > 3') + return False + return True diff --git a/src/calibre/ebooks/mobi/writer2/indexer.py b/src/calibre/ebooks/mobi/writer2/indexer.py index f121e29835..54bef57ae3 100644 --- a/src/calibre/ebooks/mobi/writer2/indexer.py +++ b/src/calibre/ebooks/mobi/writer2/indexer.py @@ -14,7 +14,7 @@ from collections import OrderedDict, defaultdict from calibre.ebooks.mobi.writer2 import RECORD_SIZE from calibre.ebooks.mobi.utils import (encint, encode_number_as_hex, - encode_tbs, align_block, utf8_text) + encode_tbs, align_block, utf8_text, detect_periodical) class CNCX(object): # {{{ @@ -320,7 +320,7 @@ class Indexer(object): # {{{ self.log = oeb.log self.opts = opts - self.is_periodical = 
self.detect_periodical() + self.is_periodical = detect_periodical(self.oeb.toc, self.log) self.log('Generating MOBI index for a %s'%('periodical' if self.is_periodical else 'book')) self.is_flat_periodical = False @@ -344,28 +344,6 @@ class Indexer(object): # {{{ self.calculate_trailing_byte_sequences() - def detect_periodical(self): # {{{ - for node in self.oeb.toc.iterdescendants(): - if node.depth() == 1 and node.klass != 'article': - self.log.debug( - 'Not a periodical: Deepest node does not have ' - 'class="article"') - return False - if node.depth() == 2 and node.klass != 'section': - self.log.debug( - 'Not a periodical: Second deepest node does not have' - ' class="section"') - return False - if node.depth() == 3 and node.klass != 'periodical': - self.log.debug('Not a periodical: Third deepest node' - ' does not have class="periodical"') - return False - if node.depth() > 3: - self.log.debug('Not a periodical: Has nodes of depth > 3') - return False - return True - # }}} - def create_index_record(self): # {{{ header_length = 192 buf = StringIO() diff --git a/src/calibre/ebooks/mobi/writer2/main.py b/src/calibre/ebooks/mobi/writer2/main.py index 44c471d3d4..e3f4081670 100644 --- a/src/calibre/ebooks/mobi/writer2/main.py +++ b/src/calibre/ebooks/mobi/writer2/main.py @@ -198,7 +198,6 @@ class MobiWriter(object): self.serializer = Serializer(self.oeb, self.images, write_page_breaks_after_item=self.write_page_breaks_after_item) text = self.serializer() - self.content_length = len(text) self.text_length = len(text) text = StringIO(text) nrecords = 0 @@ -206,21 +205,16 @@ class MobiWriter(object): if self.compression != UNCOMPRESSED: self.oeb.logger.info(' Compressing markup content...') - data, overlap = self.read_text_record(text) - - while len(data) > 0: + while text.tell() < self.text_length: + data, overlap = self.read_text_record(text) if self.compression == PALMDOC: data = compress_doc(data) - record = StringIO() + data += overlap + data += pack(b'>B', 
len(overlap)) + + self.records.append(data) nrecords += 1 - data, overlap = self.read_text_record(text) - record.write(data) - - # Write information about the multibyte character overlap, if any - record.write(overlap) - record.write(pack(b'>B', len(overlap))) - self.records.append(record.getvalue()) self.last_text_record_idx = nrecords From 25ef6ef13ade559661423d47f4bf6b6b00a8de21 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 26 Jul 2011 23:28:31 -0600 Subject: [PATCH 19/26] ... --- src/calibre/ebooks/mobi/writer2/indexer.py | 40 +++++++++++++--------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/src/calibre/ebooks/mobi/writer2/indexer.py b/src/calibre/ebooks/mobi/writer2/indexer.py index 54bef57ae3..917c7f1e4c 100644 --- a/src/calibre/ebooks/mobi/writer2/indexer.py +++ b/src/calibre/ebooks/mobi/writer2/indexer.py @@ -172,11 +172,12 @@ class TBS(object): # {{{ trailing byte sequence for the record. ''' - def __init__(self, data, is_periodical, first=False, all_sections=[], + def __init__(self, data, is_periodical, first=False, section_map={}, after_first=False): - self.section_map = OrderedDict((i.index, i) for i in - sorted(all_sections, key=lambda x:x.offset)) - + self.section_map = section_map + import pprint + pprint.pprint(data) + print() if is_periodical: # The starting bytes. 
# The value is zero which I think indicates the periodical @@ -216,21 +217,22 @@ class TBS(object): # {{{ def periodical_tbs(self, data, first, depth_map): buf = StringIO() - has_section_start = (depth_map[1] and depth_map[1][0] in - data['starts']) + has_section_start = (depth_map[1] and + set(depth_map[1]).intersection(set(data['starts']))) spanner = data['spans'] - first_node = None - for nodes in depth_map.values(): - for node in nodes: - if (first_node is None or (node.offset, node.depth) < - (first_node.offset, first_node.depth)): - first_node = node - parent_section_index = -1 + if depth_map[0]: # We have a terminal record + first_node = None + for nodes in (depth_map[1], depth_map[2]): + for node in nodes: + if (first_node is None or (node.offset, node.depth) < + (first_node.offset, first_node.depth)): + first_node = node + typ = (self.type_110 if has_section_start else self.type_010) - if first_node.depth > 0: + if first_node is not None and first_node.depth > 0: parent_section_index = (first_node.index if first_node.depth == 1 else first_node.parent_index) else: @@ -257,7 +259,8 @@ class TBS(object): # {{{ if typ not in (self.type_110, self.type_111) and parent_section_index > 0: # Write starting section information if spanner is None: - num_articles = len(depth_map[1]) + num_articles = len([a for a in depth_map[1] if a.parent_index + == parent_section_index]) extra = {} if num_articles > 1: extra = {0b0100: num_articles} @@ -662,6 +665,9 @@ class Indexer(object): # {{{ self.tbs_map = {} found_node = False sections = [i for i in self.indices if i.depth == 1] + section_map = OrderedDict((i.index, i) for i in + sorted(sections, key=lambda x:x.offset)) + deepest = max(i.depth for i in self.indices) for i in xrange(self.number_of_text_records): @@ -698,11 +704,11 @@ class Indexer(object): # {{{ if (data['ends'] or data['completes'] or data['starts'] or data['spans'] is not None): self.tbs_map[i+1] = TBS(data, self.is_periodical, first=not - found_node, 
all_sections=sections) + found_node, section_map=section_map) found_node = True else: self.tbs_map[i+1] = TBS({}, self.is_periodical, first=False, - after_first=found_node) + after_first=found_node, section_map=section_map) def get_trailing_byte_sequence(self, num): return self.tbs_map[num].bytestring From 4b7d3035600d92119829e5f06d70f5052418b6cd Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 26 Jul 2011 23:39:45 -0600 Subject: [PATCH 20/26] ... --- src/calibre/ebooks/mobi/writer2/indexer.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/calibre/ebooks/mobi/writer2/indexer.py b/src/calibre/ebooks/mobi/writer2/indexer.py index 917c7f1e4c..f454412187 100644 --- a/src/calibre/ebooks/mobi/writer2/indexer.py +++ b/src/calibre/ebooks/mobi/writer2/indexer.py @@ -224,6 +224,8 @@ class TBS(object): # {{{ if depth_map[0]: # We have a terminal record + + # Find the first non periodical node first_node = None for nodes in (depth_map[1], depth_map[2]): for node in nodes: @@ -232,10 +234,17 @@ class TBS(object): # {{{ first_node = node typ = (self.type_110 if has_section_start else self.type_010) + + # parent_section_index is needed for the last record if first_node is not None and first_node.depth > 0: parent_section_index = (first_node.index if first_node.depth == 1 else first_node.parent_index) + else: + parent_section_index = max(self.section_map.iterkeys()) + else: + # Non terminal record + if spanner is not None: # record is spanned by a single article parent_section_index = spanner.parent_index From a4721656b0f93d69a485668fb7d141e854959750 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 27 Jul 2011 00:01:18 -0600 Subject: [PATCH 21/26] ... 
--- src/calibre/ebooks/mobi/utils.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/calibre/ebooks/mobi/utils.py b/src/calibre/ebooks/mobi/utils.py index 80214b04d3..4298276bc1 100644 --- a/src/calibre/ebooks/mobi/utils.py +++ b/src/calibre/ebooks/mobi/utils.py @@ -302,28 +302,32 @@ def align_block(raw, multiple=4, pad=b'\0'): return raw + pad*(multiple - extra) -def detect_periodical(toc, log): +def detect_periodical(toc, log=None): ''' Detect if the TOC object toc contains a periodical that conforms to the structure required by kindlegen to generate a periodical. ''' for node in toc.iterdescendants(): if node.depth() == 1 and node.klass != 'article': - log.debug( + if log is not None: + log.debug( 'Not a periodical: Deepest node does not have ' 'class="article"') return False if node.depth() == 2 and node.klass != 'section': - log.debug( + if log is not None: + log.debug( 'Not a periodical: Second deepest node does not have' ' class="section"') return False if node.depth() == 3 and node.klass != 'periodical': - log.debug('Not a periodical: Third deepest node' + if log is not None: + log.debug('Not a periodical: Third deepest node' ' does not have class="periodical"') return False if node.depth() > 3: - log.debug('Not a periodical: Has nodes of depth > 3') + if log is not None: + log.debug('Not a periodical: Has nodes of depth > 3') return False return True From 4683b3b30f71b8f5cb5570a22cd561cb40061c5e Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 27 Jul 2011 00:04:21 -0600 Subject: [PATCH 22/26] ... 
--- src/calibre/ebooks/mobi/writer2/indexer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/calibre/ebooks/mobi/writer2/indexer.py b/src/calibre/ebooks/mobi/writer2/indexer.py index f454412187..d5226f68bd 100644 --- a/src/calibre/ebooks/mobi/writer2/indexer.py +++ b/src/calibre/ebooks/mobi/writer2/indexer.py @@ -175,9 +175,9 @@ class TBS(object): # {{{ def __init__(self, data, is_periodical, first=False, section_map={}, after_first=False): self.section_map = section_map - import pprint - pprint.pprint(data) - print() + #import pprint + #pprint.pprint(data) + #print() if is_periodical: # The starting bytes. # The value is zero which I think indicates the periodical From b461b58e8cfe5aa18a22cf14b247c3b689a9274f Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 27 Jul 2011 00:23:31 -0600 Subject: [PATCH 23/26] Fix #816094 ([Enhancement] Add COBY MP977 Support) --- src/calibre/customize/builtins.py | 4 ++-- src/calibre/devices/misc.py | 26 ++++++++++++++++++++++++++ 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 9a01633cfe..620254b1f5 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -570,7 +570,7 @@ from calibre.devices.teclast.driver import (TECLAST_K3, NEWSMY, IPAPYRUS, from calibre.devices.sne.driver import SNE from calibre.devices.misc import (PALMPRE, AVANT, SWEEX, PDNOVEL, GEMEI, VELOCITYMICRO, PDNOVEL_KOBO, LUMIREAD, ALURATEK_COLOR, - TREKSTOR, EEEREADER, NEXTBOOK, ADAM, MOOVYBOOK) + TREKSTOR, EEEREADER, NEXTBOOK, ADAM, MOOVYBOOK, COBY) from calibre.devices.folder_device.driver import FOLDER_DEVICE_FOR_CONFIG from calibre.devices.kobo.driver import KOBO from calibre.devices.bambook.driver import BAMBOOK @@ -705,7 +705,7 @@ plugins += [ EEEREADER, NEXTBOOK, ADAM, - MOOVYBOOK, + MOOVYBOOK, COBY, ITUNES, BOEYE_BEX, BOEYE_BDX, diff --git a/src/calibre/devices/misc.py b/src/calibre/devices/misc.py 
index 6c5706f039..92fce68f11 100644 --- a/src/calibre/devices/misc.py +++ b/src/calibre/devices/misc.py @@ -351,3 +351,29 @@ class MOOVYBOOK(USBMS): def get_main_ebook_dir(self, for_upload=False): return 'Books' if for_upload else self.EBOOK_DIR_MAIN +class COBY(USBMS): + + name = 'COBY MP977 device interface' + gui_name = 'COBY' + description = _('Communicate with the COBY') + author = 'Kovid Goyal' + supported_platforms = ['windows', 'osx', 'linux'] + + # Ordered list of supported formats + FORMATS = ['epub', 'pdf'] + + VENDOR_ID = [0x1e74] + PRODUCT_ID = [0x7121] + BCD = [0x02] + VENDOR_NAME = 'USB_2.0' + WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = 'MP977_DRIVER' + + EBOOK_DIR_MAIN = '' + + SUPPORTS_SUB_DIRS = False + + def get_carda_ebook_dir(self, for_upload=False): + if for_upload: + return 'eBooks' + return self.EBOOK_DIR_CARD_A + From d66fd24888834c5d6bd33c63a7aa8492797150e5 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 27 Jul 2011 00:37:20 -0600 Subject: [PATCH 24/26] Prevent metadata download from returning published dates earlier than 101 A.D. 
--- src/calibre/ebooks/metadata/sources/identify.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/calibre/ebooks/metadata/sources/identify.py b/src/calibre/ebooks/metadata/sources/identify.py index 97b6d15bc8..a7bcbc5a89 100644 --- a/src/calibre/ebooks/metadata/sources/identify.py +++ b/src/calibre/ebooks/metadata/sources/identify.py @@ -22,6 +22,7 @@ from calibre.ebooks.metadata.book.base import Metadata from calibre.utils.date import utc_tz, as_utc from calibre.utils.html2text import html2text from calibre.utils.icu import lower +from calibre.utils.date import UNDEFINED_DATE # Download worker {{{ class Worker(Thread): @@ -490,6 +491,8 @@ def identify(log, abort, # {{{ max_tags = msprefs['max_tags'] for r in results: r.tags = r.tags[:max_tags] + if getattr(r.pubdate, 'year', 2000) <= UNDEFINED_DATE.year: + r.pubdate = None if msprefs['swap_author_names']: for r in results: From 9618a0ac4d523df1d7fbec9003c2208eb08997be Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 27 Jul 2011 01:00:50 -0600 Subject: [PATCH 25/26] Fix #814722 (Option to save .opf metadata as in epub.) 
--- src/calibre/ebooks/metadata/opf2.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/metadata/opf2.py b/src/calibre/ebooks/metadata/opf2.py index 7ad741848e..35fd724ddd 100644 --- a/src/calibre/ebooks/metadata/opf2.py +++ b/src/calibre/ebooks/metadata/opf2.py @@ -1030,8 +1030,10 @@ class OPF(object): # {{{ attrib = attrib or {} attrib['name'] = 'calibre:' + name name = '{%s}%s' % (self.NAMESPACES['opf'], 'meta') + nsmap = dict(self.NAMESPACES) + del nsmap['opf'] elem = etree.SubElement(self.metadata, name, attrib=attrib, - nsmap=self.NAMESPACES) + nsmap=nsmap) elem.tail = '\n' return elem From 94212ff1cd91b14d9c4d5665b7940dce48112478 Mon Sep 17 00:00:00 2001 From: Charles Haley <> Date: Wed, 27 Jul 2011 08:26:01 +0100 Subject: [PATCH 26/26] Small change to libri_de_plugin.py details URL --- src/calibre/gui2/store/stores/libri_de_plugin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/gui2/store/stores/libri_de_plugin.py b/src/calibre/gui2/store/stores/libri_de_plugin.py index ed93eeff0e..912ae668e8 100644 --- a/src/calibre/gui2/store/stores/libri_de_plugin.py +++ b/src/calibre/gui2/store/stores/libri_de_plugin.py @@ -24,7 +24,7 @@ class LibreDEStore(BasicStoreConfig, StorePlugin): def open(self, parent=None, detail_item=None, external=False): url = 'http://ad.zanox.com/ppc/?18817073C15644254T' - url_details = ('http://ad.zanox.com/ppc/?18845780C1371495675T&ULP=[[' + url_details = ('http://ad.zanox.com/ppc/?18848208C1197627693T&ULP=[[' 'http://www.libri.de/shop/action/productDetails?artiId={0}]]') if external or self.config.get('open_external', False):