From e313a72ec1776314f6402c4dcdcddb89ca07c4f6 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 17 Jul 2011 15:53:29 -0600 Subject: [PATCH 01/30] When converting in the GUI, set all identifiers in the metadata in the output file, if the output file supports them. --- src/calibre/ebooks/oeb/transforms/metadata.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/calibre/ebooks/oeb/transforms/metadata.py b/src/calibre/ebooks/oeb/transforms/metadata.py index f719ee3eb5..0db24dd2ad 100644 --- a/src/calibre/ebooks/oeb/transforms/metadata.py +++ b/src/calibre/ebooks/oeb/transforms/metadata.py @@ -47,15 +47,19 @@ def meta_info_to_oeb_metadata(mi, m, log, override_input_metadata=False): m.add('series', mi.series) elif override_input_metadata: m.clear('series') - if not mi.is_null('isbn'): + identifiers = mi.get_identifiers() + set_isbn = False + for typ, val in identifiers.iteritems(): has = False + if typ.lower() == 'isbn': + set_isbn = True for x in m.identifier: - if x.scheme.lower() == 'isbn': - x.content = mi.isbn + if x.scheme.lower() == typ.lower(): + x.content = val has = True if not has: - m.add('identifier', mi.isbn, scheme='ISBN') - elif override_input_metadata: + m.add('identifier', val, scheme=typ.upper()) + if override_input_metadata and not set_isbn: m.filter('identifier', lambda x: x.scheme.lower() == 'isbn') if not mi.is_null('language'): m.clear('language') From 823cacf8113108d122aebb36cd5448fa4ee04909 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 17 Jul 2011 16:20:56 -0600 Subject: [PATCH 02/30] ... --- src/calibre/ebooks/mobi/debug.py | 100 ++++++++++++++++++++---- src/calibre/ebooks/mobi/writer2/main.py | 82 +------------------ 2 files changed, 88 insertions(+), 94 deletions(-) diff --git a/src/calibre/ebooks/mobi/debug.py b/src/calibre/ebooks/mobi/debug.py index cf3dee886a..1f5bf8ae23 100644 --- a/src/calibre/ebooks/mobi/debug.py +++ b/src/calibre/ebooks/mobi/debug.py @@ -10,6 +10,7 @@ __docformat__ = 'restructuredtext en' import struct, datetime, sys, os from calibre.utils.date import utc_tz from calibre.ebooks.mobi.langcodes import main_language, sub_language +from calibre.ebooks.mobi.writer2.utils import decode_hex_number # PalmDB {{{ class PalmDOCAttributes(object): @@ -382,7 +383,7 @@ class TagX(object): # {{{ self.num_values, self.bitmask, self.bmask, self.eof) # }}} -class PrimaryIndexRecord(object): # {{{ +class IndexHeader(object): # {{{ def __init__(self, record): self.record = record @@ -437,9 +438,8 @@ class PrimaryIndexRecord(object): # {{{ raise ValueError('TAGX last entry is not EOF') idxt0_pos = self.header_length+self.tagx_header_length - last_name_len, = struct.unpack(b'>B', raw[idxt0_pos]) - count_pos = idxt0_pos+1+last_name_len - last_num = int(raw[idxt0_pos+1:count_pos], 16) + last_num, consumed = decode_hex_number(raw[idxt0_pos:]) + count_pos = idxt0_pos + consumed self.ncx_count, = struct.unpack(b'>H', raw[count_pos:count_pos+2]) if last_num != self.ncx_count - 1: @@ -457,9 +457,12 @@ class PrimaryIndexRecord(object): # {{{ def __str__(self): ans = ['*'*20 + ' Index Header '+ '*'*20] a = ans.append + def u(w): + a('Unknown: %r (%d bytes) (All zeros: %r)'%(w, + len(w), not bool(w.replace(b'\0', b'')) )) + a('Header length: %d'%self.header_length) - a('Unknown1: %r (%d bytes) (All zeros: %r)'%(self.unknown1, - len(self.unknown1), not bool(self.unknown1.replace(b'\0', '')) )) + u(self.unknown1) a('Index Type: %s (%d)'%(self.index_type_desc, self.index_type)) a('Offset to IDXT start: %d'%self.idxt_start) a('Number of index records: %d'%self.index_count) @@ -472,11 +475,9 @@ class PrimaryIndexRecord(object): # {{{ a('LIGT start: %d'%self.ligt_start) a('Number of LIGT entries: %d'%self.num_of_ligt_entries) a('Number of CTOC blocks: %d'%self.num_of_ctoc_blocks) - a('Unknown2: %r (%d bytes) (All zeros: %r)'%(self.unknown2, - len(self.unknown2), not bool(self.unknown2.replace(b'\0', '')) )) + u(self.unknown2) a('TAGX offset: %d'%self.tagx_offset) - a('Unknown3: %r (%d bytes) (All zeros: %r)'%(self.unknown3, - len(self.unknown3), not bool(self.unknown3.replace(b'\0', '')) )) + u(self.unknown3) a('\n\n') a('*'*20 + ' TAGX Header (%d bytes)'%self.tagx_header_length+ '*'*20) a('Header length: %d'%self.tagx_header_length) @@ -488,6 +489,71 @@ class PrimaryIndexRecord(object): # {{{ return '\n'.join(ans) # }}} +class IndexEntry(object): + + def __init__(self, ident, entry_type, raw): + self.id = ident + self.entry_type = entry_type + +class IndexRecord(object): # {{{ + + def __init__(self, record): + self.record = record + raw = self.record.raw + if raw[:4] != b'INDX': + raise ValueError('Invalid Primary Index Record') + + u = struct.unpack + + self.header_length, = u('>I', raw[4:8]) + self.unknown1 = raw[8:12] + self.header_type, = u('>I', raw[12:16]) + self.unknown2 = raw[16:20] + self.idxt_offset, self.idxt_count = u(b'>II', raw[20:28]) + if self.idxt_offset < 192: + raise ValueError('Unknown Index record structure') + self.unknown3 = raw[28:36] + self.unknown4 = raw[36:192] # Should be 156 bytes + + self.index_offsets = [] + indices = raw[self.idxt_offset:] + if indices[:4] != b'IDXT': + raise ValueError("Invalid IDXT index table") + indices = indices[4:] + for i in range(self.idxt_count): + off, = u(b'>H', indices[i*2:(i+1)*2]) + self.index_offsets.append(off-192) + + indxt = raw[192:self.idxt_offset] + self.indices = [] + for off in self.index_offsets: + index = indxt[off:] + ident, consumed = decode_hex_number(index) + index = index[consumed:] + entry_type = u(b'>B', index[0]) + self.indices.append(IndexEntry(ident, entry_type, index[1:])) + + + def __str__(self): + ans = ['*'*20 + ' Index Record (%d bytes)'%len(self.record.raw)+ '*'*20] + a = ans.append + def u(w): + a('Unknown: %r (%d bytes) (All zeros: %r)'%(w, + len(w), not bool(w.replace(b'\0', b'')) )) + a('Header length: %d'%self.header_length) + u(self.unknown1) + a('Header Type: %d'%self.header_type) + u(self.unknown2) + a('IDXT Offset: %d'%self.idxt_offset) + a('IDXT Count: %d'%self.idxt_count) + u(self.unknown3) + u(self.unknown4) + a('Index offsets: %r'%self.index_offsets) + + return '\n'.join(ans) + +# }}} + class MOBIFile(object): # {{{ def __init__(self, stream): @@ -516,10 +582,11 @@ class MOBIFile(object): # {{{ self.mobi_header = MOBIHeader(self.records[0]) - self.primary_index_record = None + self.index_header = None pir = self.mobi_header.primary_index_record if pir != 0xffffffff: - self.primary_index_record = PrimaryIndexRecord(self.records[pir]) + self.index_header = IndexHeader(self.records[pir]) + self.index_record = IndexRecord(self.records[pir+1]) def print_header(self, f=sys.stdout): @@ -542,9 +609,12 @@ def inspect_mobi(path_or_stream): os.mkdir(ddir) with open(os.path.join(ddir, 'header.txt'), 'wb') as out: f.print_header(f=out) - if f.primary_index_record is not None: - with open(os.path.join(ddir, 'primary_index_record.txt'), 'wb') as out: - print(str(f.primary_index_record), file=out) + if f.index_header is not None: + with open(os.path.join(ddir, 'index.txt'), 'wb') as out: + print(str(f.index_header), file=out) + print('\n\n', file=out) + print(str(f.index_record), file=out) + print ('Debug data saved to:', ddir) def main(): diff --git a/src/calibre/ebooks/mobi/writer2/main.py b/src/calibre/ebooks/mobi/writer2/main.py index 9daf17498e..9cc0ed9cb3 100644 --- a/src/calibre/ebooks/mobi/writer2/main.py +++ b/src/calibre/ebooks/mobi/writer2/main.py @@ -15,10 +15,11 @@ from calibre.ebooks import normalize from calibre.ebooks.oeb.base import OEB_RASTER_IMAGES from calibre.ebooks.mobi.writer2.serializer import Serializer from calibre.ebooks.compression.palmdoc import compress_doc -from calibre.utils.magick.draw import Image, save_cover_data_to, thumbnail from calibre.ebooks.mobi.langcodes import iana2mobi from calibre.utils.filenames import ascii_filename from calibre.ebooks.mobi.writer2 import PALMDOC, UNCOMPRESSED +from calibre.ebooks.mobi.writer2.utils import (rescale_image, decint, + DECINT_FORWARD, DECINT_BACKWARD) EXTH_CODES = { 'creator': 100, @@ -41,87 +42,10 @@ WRITE_UNCROSSABLE_BREAKS = False RECORD_SIZE = 0x1000 # 4096 -IMAGE_MAX_SIZE = 10 * 1024 * 1024 + MAX_THUMB_SIZE = 16 * 1024 MAX_THUMB_DIMEN = (180, 240) -# Almost like the one for MS LIT, but not quite. -DECINT_FORWARD = 0 -DECINT_BACKWARD = 1 - -def decint(value, direction): - ''' - Some parts of the Mobipocket format encode data as variable-width integers. - These integers are represented big-endian with 7 bits per byte in bits 1-7. - They may be either forward-encoded, in which case only the LSB has bit 8 set, - or backward-encoded, in which case only the MSB has bit 8 set. - For example, the number 0x11111 would be represented forward-encoded as: - - 0x04 0x22 0x91 - - And backward-encoded as: - - 0x84 0x22 0x11 - - This function encodes the integer ``value`` as a variable width integer and - returns the bytestring corresponding to it. - ''' - # Encode vwi - byts = bytearray() - while True: - b = value & 0x7f - value >>= 7 - byts.append(b) - if value == 0: - break - if direction == DECINT_FORWARD: - byts[0] |= 0x80 - elif direction == DECINT_BACKWARD: - byts[-1] |= 0x80 - return bytes(byts) - -def rescale_image(data, maxsizeb=IMAGE_MAX_SIZE, dimen=None): - ''' - Convert image setting all transparent pixels to white and changing format - to JPEG. Ensure the resultant image has a byte size less than - maxsizeb. - - If dimen is not None, generate a thumbnail of width=dimen, height=dimen - - Returns the image as a bytestring - ''' - if dimen is not None: - data = thumbnail(data, width=dimen, height=dimen, - compression_quality=90)[-1] - else: - # Replace transparent pixels with white pixels and convert to JPEG - data = save_cover_data_to(data, 'img.jpg', return_data=True) - if len(data) <= maxsizeb: - return data - orig_data = data - img = Image() - quality = 95 - - img.load(data) - while len(data) >= maxsizeb and quality >= 10: - quality -= 5 - img.set_compression_quality(quality) - data = img.export('jpg') - if len(data) <= maxsizeb: - return data - orig_data = data - - scale = 0.9 - while len(data) >= maxsizeb and scale >= 0.05: - img = Image() - img.load(orig_data) - w, h = img.size - img.size = (int(scale*w), int(scale*h)) - img.set_compression_quality(quality) - data = img.export('jpg') - scale -= 0.05 - return data - class MobiWriter(object): COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+') From dd2718abcd4339d06f087c3ca49ccede4d44a6a9 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 17 Jul 2011 16:29:39 -0600 Subject: [PATCH 03/30] ... --- src/calibre/manual/develop.rst | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/calibre/manual/develop.rst b/src/calibre/manual/develop.rst index dac6f02c78..32a023d339 100755 --- a/src/calibre/manual/develop.rst +++ b/src/calibre/manual/develop.rst @@ -211,9 +211,9 @@ calibre-dev.bat:: Debugging tips ---------------- -Running |app| code in a python debugger is not easy unless you install from source on Linux. However, Python is a +Python is a dynamically typed language with excellent facilities for introspection. Kovid wrote the core |app| code without once -using a debugger. There are two main strategies to debug |app| code: +using a debugger. There are many strategies to debug |app| code: Using an interactive python interpreter ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -240,6 +240,12 @@ Similarly, you can start the ebook-viewer as:: calibre-debug -w /path/to/file/to/be/viewed +Using the debugger in PyDev +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +It is possible to get the debugger in PyDev working with the |app| development environment, +see the `forum thread `_. + Executing arbitrary scripts in the |app| python environment ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ From 2b379f7ab89dbc1b156a0347406c2740b8ed4086 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 17 Jul 2011 16:58:12 -0600 Subject: [PATCH 04/30] ... --- src/calibre/ebooks/mobi/debug.py | 16 ++++++++++++---- src/calibre/ebooks/mobi/writer2/main.py | 7 +++---- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/src/calibre/ebooks/mobi/debug.py b/src/calibre/ebooks/mobi/debug.py index 1f5bf8ae23..ce7d78303e 100644 --- a/src/calibre/ebooks/mobi/debug.py +++ b/src/calibre/ebooks/mobi/debug.py @@ -276,6 +276,7 @@ class MOBIHeader(object): # {{{ self.drm_flags = bin(struct.unpack(b'>I', self.raw[176:180])[0]) self.has_extra_data_flags = self.length >= 232 and len(self.raw) >= 232+16 self.has_fcis_flis = False + self.has_multibytes = self.has_indexing_bytes = self.has_uncrossable_breaks = False if self.has_extra_data_flags: self.unknown4 = self.raw[180:192] self.first_content_record, self.last_content_record = \ @@ -285,8 +286,11 @@ class MOBIHeader(object): # {{{ self.flis_count) = struct.unpack(b'>IIII', self.raw[200:216]) self.unknown6 = self.raw[216:240] - self.extra_data_flags = bin(struct.unpack(b'>I', - self.raw[240:244])[0]) + self.extra_data_flags = struct.unpack(b'>I', + self.raw[240:244])[0] + self.has_multibytes = bool(self.extra_data_flags & 0b1) + self.has_indexing_bytes = bool(self.extra_data_flags & 0b10) + self.has_uncrossable_breaks = bool(self.extra_data_flags & 0b100) self.primary_index_record, = struct.unpack(b'>I', self.raw[244:248]) @@ -347,7 +351,10 @@ class MOBIHeader(object): # {{{ ans.append('FLIS number: %d'% self.flis_number) ans.append('FLIS count: %d'% self.flis_count) ans.append('Unknown6: %r'% self.unknown6) - ans.append('Extra data flags: %r'%self.extra_data_flags) + ans.append(('Extra data flags: %s (has multibyte: %s) ' + '(has indexing: %s) (has uncrossable breaks: %s)')%( + bin(self.extra_data_flags), self.has_multibytes, + self.has_indexing_bytes, self.has_uncrossable_breaks )) ans.append('Primary index record (null value: %d): %d'%(0xffffffff, self.primary_index_record)) @@ -489,11 +496,12 @@ class IndexHeader(object): # {{{ return '\n'.join(ans) # }}} -class IndexEntry(object): +class IndexEntry(object): # {{{ def __init__(self, ident, entry_type, raw): self.id = ident self.entry_type = entry_type +# }}} class IndexRecord(object): # {{{ diff --git a/src/calibre/ebooks/mobi/writer2/main.py b/src/calibre/ebooks/mobi/writer2/main.py index 9cc0ed9cb3..76976ce81e 100644 --- a/src/calibre/ebooks/mobi/writer2/main.py +++ b/src/calibre/ebooks/mobi/writer2/main.py @@ -18,8 +18,7 @@ from calibre.ebooks.compression.palmdoc import compress_doc from calibre.ebooks.mobi.langcodes import iana2mobi from calibre.utils.filenames import ascii_filename from calibre.ebooks.mobi.writer2 import PALMDOC, UNCOMPRESSED -from calibre.ebooks.mobi.writer2.utils import (rescale_image, decint, - DECINT_FORWARD, DECINT_BACKWARD) +from calibre.ebooks.mobi.writer2.utils import (rescale_image, encint) EXTH_CODES = { 'creator': 100, @@ -167,13 +166,13 @@ class MobiWriter(object): # the next record. while breaks and (breaks[0] - offset) < RECORD_SIZE: pbreak = (breaks.pop(0) - running) >> 3 - encoded = decint(pbreak, DECINT_FORWARD) + encoded = encint(pbreak) record.write(encoded) running += pbreak << 3 nextra += len(encoded) lsize = 1 while True: - size = decint(nextra + lsize, DECINT_BACKWARD) + size = encint(nextra + lsize, forward=False) if len(size) == lsize: break lsize += 1 From 830b0b5a10c8580a56cef148527f10efcb8759f0 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 17 Jul 2011 17:11:40 -0600 Subject: [PATCH 05/30] Regex builder: Show a nicer error message when the user has the file open in another program on winblows. Fixes #811641 (Convert books fails) --- src/calibre/ebooks/mobi/writer2/utils.py | 118 ++++++++++++++++++++++ src/calibre/gui2/convert/regex_builder.py | 17 +++- 2 files changed, 131 insertions(+), 4 deletions(-) create mode 100644 src/calibre/ebooks/mobi/writer2/utils.py diff --git a/src/calibre/ebooks/mobi/writer2/utils.py b/src/calibre/ebooks/mobi/writer2/utils.py new file mode 100644 index 0000000000..d3f7ff8c32 --- /dev/null +++ b/src/calibre/ebooks/mobi/writer2/utils.py @@ -0,0 +1,118 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2011, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import struct + +from calibre.utils.magick.draw import Image, save_cover_data_to, thumbnail + +DECINT_FORWARD = 0 +DECINT_BACKWARD = 1 +IMAGE_MAX_SIZE = 10 * 1024 * 1024 + +def decode_hex_number(raw): + ''' + Return a variable length number encoded using hexadecimal encoding. These + numbers have the first byte which tells the number of bytes that follow. + The bytes that follow are simply the hexadecimal representation of the + number. + + :param raw: Raw binary data as a bytestring + + :return: The number and the number of bytes from raw that the number + occupies + ''' + length, = struct.unpack(b'>B', raw[0]) + raw = raw[1:1+length] + consumed = length+1 + return int(raw, 16), consumed + +def encode_number_as_hex(num): + ''' + Encode num as a variable length encoded hexadecimal number. Returns the + bytestring containing the encoded number. These + numbers have the first byte which tells the number of bytes that follow. + The bytes that follow are simply the hexadecimal representation of the + number. + ''' + num = bytes(hex(num)[2:]) + ans = bytearray(num) + ans.insert(0, len(num)) + return bytes(ans) + +def encint(value, forward=True): + ''' + Some parts of the Mobipocket format encode data as variable-width integers. + These integers are represented big-endian with 7 bits per byte in bits 1-7. + They may be either forward-encoded, in which case only the first byte has bit 8 set, + or backward-encoded, in which case only the last byte has bit 8 set. + For example, the number 0x11111 would be represented forward-encoded as: + + 0x04 0x22 0x91 + + And backward-encoded as: + + 0x84 0x22 0x11 + + This function encodes the integer ``value`` as a variable width integer and + returns the bytestring corresponding to it. + ''' + # Encode vwi + byts = bytearray() + while True: + b = value & 0b1111111 + value >>= 7 + byts.append(b) + if value == 0: + break + byts[0 if forward else -1] |= 0b10000000 + return bytes(byts) + +def rescale_image(data, maxsizeb=IMAGE_MAX_SIZE, dimen=None): + ''' + Convert image setting all transparent pixels to white and changing format + to JPEG. Ensure the resultant image has a byte size less than + maxsizeb. + + If dimen is not None, generate a thumbnail of width=dimen, height=dimen + + Returns the image as a bytestring + ''' + if dimen is not None: + data = thumbnail(data, width=dimen, height=dimen, + compression_quality=90)[-1] + else: + # Replace transparent pixels with white pixels and convert to JPEG + data = save_cover_data_to(data, 'img.jpg', return_data=True) + if len(data) <= maxsizeb: + return data + orig_data = data + img = Image() + quality = 95 + + img.load(data) + while len(data) >= maxsizeb and quality >= 10: + quality -= 5 + img.set_compression_quality(quality) + data = img.export('jpg') + if len(data) <= maxsizeb: + return data + orig_data = data + + scale = 0.9 + while len(data) >= maxsizeb and scale >= 0.05: + img = Image() + img.load(orig_data) + w, h = img.size + img.size = (int(scale*w), int(scale*h)) + img.set_compression_quality(quality) + data = img.export('jpg') + scale -= 0.05 + return data + + diff --git a/src/calibre/gui2/convert/regex_builder.py b/src/calibre/gui2/convert/regex_builder.py index c7b03c9bb4..c41b8786d9 100644 --- a/src/calibre/gui2/convert/regex_builder.py +++ b/src/calibre/gui2/convert/regex_builder.py @@ -7,8 +7,8 @@ __docformat__ = 'restructuredtext en' import re, os from PyQt4.QtCore import SIGNAL, Qt, pyqtSignal -from PyQt4.QtGui import QDialog, QWidget, QDialogButtonBox, \ - QBrush, QTextCursor, QTextEdit +from PyQt4.QtGui import (QDialog, QWidget, QDialogButtonBox, + QBrush, QTextCursor, QTextEdit) from calibre.gui2.convert.regex_builder_ui import Ui_RegexBuilder from calibre.gui2.convert.xexp_edit_ui import Ui_Form as Ui_Edit @@ -16,6 +16,7 @@ from calibre.gui2 import error_dialog, choose_files from calibre.ebooks.oeb.iterator import EbookIterator from calibre.ebooks.conversion.preprocess import HTMLPreProcessor from calibre.gui2.dialogs.choose_format import ChooseFormatDialog +from calibre.constants import iswindows class RegexBuilder(QDialog, Ui_RegexBuilder): @@ -134,8 +135,16 @@ class RegexBuilder(QDialog, Ui_RegexBuilder): _('Cannot build regex using the GUI builder without a book.'), show=True) return False - fpath = db.format(book_id, format, index_is_id=True, - as_path=True) + try: + fpath = db.format(book_id, format, index_is_id=True, + as_path=True) + except OSError: + if iswindows: + error_dialog(self, _('Could not open file'), + _('Could not open file, do you have it open in' + ' another program?'), show=True) + return False + raise try: self.open_book(fpath) finally: From 77ed3d106dd7e82459f76f9d3edc973c9abf6e03 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 17 Jul 2011 17:13:50 -0600 Subject: [PATCH 06/30] ... --- src/calibre/gui2/convert/regex_builder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/gui2/convert/regex_builder.py b/src/calibre/gui2/convert/regex_builder.py index c41b8786d9..d0573375da 100644 --- a/src/calibre/gui2/convert/regex_builder.py +++ b/src/calibre/gui2/convert/regex_builder.py @@ -141,7 +141,7 @@ class RegexBuilder(QDialog, Ui_RegexBuilder): except OSError: if iswindows: error_dialog(self, _('Could not open file'), - _('Could not open file, do you have it open in' + _('Could not open the file, do you have it open in' ' another program?'), show=True) return False raise From 0b2dcf358ff247d3df66c215b7c643da1d333ef7 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 17 Jul 2011 18:01:52 -0600 Subject: [PATCH 07/30] ... --- src/calibre/gui2/convert/regex_builder.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/calibre/gui2/convert/regex_builder.py b/src/calibre/gui2/convert/regex_builder.py index d0573375da..bbbef7e741 100644 --- a/src/calibre/gui2/convert/regex_builder.py +++ b/src/calibre/gui2/convert/regex_builder.py @@ -140,9 +140,11 @@ class RegexBuilder(QDialog, Ui_RegexBuilder): as_path=True) except OSError: if iswindows: + import traceback error_dialog(self, _('Could not open file'), _('Could not open the file, do you have it open in' - ' another program?'), show=True) + ' another program?'), show=True, + det_msg=traceback.format_exc()) return False raise try: From 0546c29187c475f35e08aadf0acfe2e434f30572 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 17 Jul 2011 18:04:25 -0600 Subject: [PATCH 08/30] ... --- src/calibre/ebooks/mobi/writer2/utils.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/calibre/ebooks/mobi/writer2/utils.py b/src/calibre/ebooks/mobi/writer2/utils.py index d3f7ff8c32..9f0af7508c 100644 --- a/src/calibre/ebooks/mobi/writer2/utils.py +++ b/src/calibre/ebooks/mobi/writer2/utils.py @@ -11,8 +11,6 @@ import struct from calibre.utils.magick.draw import Image, save_cover_data_to, thumbnail -DECINT_FORWARD = 0 -DECINT_BACKWARD = 1 IMAGE_MAX_SIZE = 10 * 1024 * 1024 def decode_hex_number(raw): From 928f5e020d705b6288586eea1a13d6cab5f089e7 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 17 Jul 2011 22:43:44 -0600 Subject: [PATCH 09/30] ... --- src/calibre/ebooks/mobi/writer2/utils.py | 47 +++++++++++++++++++++--- 1 file changed, 42 insertions(+), 5 deletions(-) diff --git a/src/calibre/ebooks/mobi/writer2/utils.py b/src/calibre/ebooks/mobi/writer2/utils.py index 9f0af7508c..8166bdf328 100644 --- a/src/calibre/ebooks/mobi/writer2/utils.py +++ b/src/calibre/ebooks/mobi/writer2/utils.py @@ -49,28 +49,65 @@ def encint(value, forward=True): These integers are represented big-endian with 7 bits per byte in bits 1-7. They may be either forward-encoded, in which case only the first byte has bit 8 set, or backward-encoded, in which case only the last byte has bit 8 set. - For example, the number 0x11111 would be represented forward-encoded as: + For example, the number 0x11111 = 0b10001000100010001 would be represented + forward-encoded as: - 0x04 0x22 0x91 + 0x04 0x22 0x91 = 0b100 0b100010 0b10010001 And backward-encoded as: - 0x84 0x22 0x11 + 0x84 0x22 0x11 = 0b10000100 0b100010 0b10001 This function encodes the integer ``value`` as a variable width integer and returns the bytestring corresponding to it. + + If forward is True the bytes returned are suitable for prepending to the + output buffer, otherwise they must be append to the output buffer. ''' # Encode vwi byts = bytearray() while True: - b = value & 0b1111111 - value >>= 7 + b = value & 0b01111111 + value >>= 7 # shift value to the right by 7 bits byts.append(b) if value == 0: break byts[0 if forward else -1] |= 0b10000000 + byts.reverse() return bytes(byts) +def decint(raw, forward=True): + ''' + Read a variable width integer from the bytestring raw and return the + integer and the number of bytes read. If forward is True bytes are read + from the start of raw, otherwise from the end of raw. + + This function is the inverse of encint above, see its docs for more + details. + ''' + val = 0 + byts = bytearray() + for byte in raw if forward else reversed(raw): + bnum = ord(byte) + byts.append(bnum & 0b01111111) + if bnum & 0b10000000: + break + if not forward: + byts.reverse() + for byte in byts: + val <<= 7 # Shift value to the left by 7 bits + val |= byte + + return val, len(byts) + +def test_decint(num): + for d in (True, False): + raw = encint(num, forward=d) + sz = len(raw) + if (num, sz) != decint(raw, forward=d): + raise ValueError('Failed for num %d, forward=%r: %r != %r' % ( + num, d, (num, sz), decint(raw, forward=d))) + def rescale_image(data, maxsizeb=IMAGE_MAX_SIZE, dimen=None): ''' Convert image setting all transparent pixels to white and changing format From d244201457c46815712a62df640378366110420f Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 17 Jul 2011 22:53:18 -0600 Subject: [PATCH 10/30] ... --- src/calibre/ebooks/mobi/writer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/calibre/ebooks/mobi/writer.py b/src/calibre/ebooks/mobi/writer.py index de27481541..bd61ea559d 100644 --- a/src/calibre/ebooks/mobi/writer.py +++ b/src/calibre/ebooks/mobi/writer.py @@ -1260,11 +1260,11 @@ class MobiWriter(object): data = compress_doc(data) record = StringIO() record.write(data) + # Write trailing muti-byte sequence if any + record.write(overlap) + record.write(pack('>B', len(overlap))) - # Marshall's utf-8 break code. if WRITE_PBREAKS : - record.write(overlap) - record.write(pack('>B', len(overlap))) nextra = 0 pbreak = 0 running = offset From 08f5775f6596f94d02c470447963e3ed320daa15 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 18 Jul 2011 10:36:35 -0600 Subject: [PATCH 11/30] ebook-convert: Abort if a keyboard interrupt is raised during parsing --- src/calibre/ebooks/oeb/reader.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/calibre/ebooks/oeb/reader.py b/src/calibre/ebooks/oeb/reader.py index 422252f73e..5bb6b193f7 100644 --- a/src/calibre/ebooks/oeb/reader.py +++ b/src/calibre/ebooks/oeb/reader.py @@ -163,6 +163,8 @@ class OEBReader(object): if item.media_type in check: try: item.data + except KeyboardInterrupt: + raise except: self.logger.exception('Failed to parse content in %s'% item.href) From 59d9e1558004c53be8fc31b2f1838b1389587d91 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 18 Jul 2011 10:53:31 -0600 Subject: [PATCH 12/30] Conversion pipeline: Strip out large blocks of contiguous space (more than 10000 contiguous blanks) as these slow down the conversion process and are almost always indicative of an error in the input document. --- src/calibre/ebooks/conversion/preprocess.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 885d0621e0..751d4f8cd6 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -303,6 +303,9 @@ class CSSPreProcessor(object): class HTMLPreProcessor(object): PREPROCESS = [ + # Remove huge block of contiguous spaces as they slow down + # the following regexes pretty badly + (re.compile(r'\s{10000,}'), lambda m: ''), # Some idiotic HTML generators (Frontpage I'm looking at you) # Put all sorts of crap into . This messes up lxml (re.compile(r']*>\n*(.*?)\n*', re.IGNORECASE|re.DOTALL), From dbefbfbd862b9bcb5d233f428c8451a5c5048a54 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 18 Jul 2011 14:06:09 -0600 Subject: [PATCH 13/30] ... --- src/calibre/ebooks/mobi/writer.py | 632 +++++++++++++++--------------- 1 file changed, 321 insertions(+), 311 deletions(-) diff --git a/src/calibre/ebooks/mobi/writer.py b/src/calibre/ebooks/mobi/writer.py index bd61ea559d..bf71ad55c2 100644 --- a/src/calibre/ebooks/mobi/writer.py +++ b/src/calibre/ebooks/mobi/writer.py @@ -1642,6 +1642,61 @@ class MobiWriter(object): for record in self._records: self._write(record) + def _clean_text_value(self, text): + if text is not None and text.strip() : + text = text.strip() + if not isinstance(text, unicode): + text = text.decode('utf-8', 'replace') + text = normalize(text).encode('utf-8') + else : + text = "(none)".encode('utf-8') + return text + + def _compute_offset_length(self, i, node, entries) : + h = node.href + if h not in self._id_offsets: + self._oeb.log.warning('Could not find TOC entry:', node.title) + return -1, -1 + + offset = self._id_offsets[h] + length = None + # Calculate length based on next entry's offset + for sibling in entries[i+1:]: + h2 = sibling.href + if h2 in self._id_offsets: + offset2 = self._id_offsets[h2] + if offset2 > offset: + length = offset2 - offset + break + if length is None: + length = self._content_length - offset + return offset, length + + def _establish_document_structure(self) : + documentType = None + try : + klass = self._ctoc_map[0]['klass'] + except : + klass = None + + if klass == 'chapter' or klass == None : + documentType = 'book' + if self.opts.verbose > 2 : + self._oeb.logger.info("Adding a MobiBook to self._MobiDoc") + self._MobiDoc.documentStructure = MobiBook() + + elif klass == 'periodical' : + documentType = klass + if self.opts.verbose > 2 : + self._oeb.logger.info("Adding a MobiPeriodical to self._MobiDoc") + self._MobiDoc.documentStructure = MobiPeriodical(self._MobiDoc.getNextNode()) + self._MobiDoc.documentStructure.startAddress = self._anchor_offset_kindle + else : + raise NotImplementedError('_establish_document_structure: unrecognized klass: %s' % klass) + return documentType + + # Index {{{ + def _generate_index(self): self._oeb.log('Generating INDX ...') self._primary_index_record = None @@ -1815,276 +1870,7 @@ class MobiWriter(object): open(os.path.join(t, n+'.bin'), 'wb').write(self._records[-(i+1)]) self._oeb.log.debug('Index records dumped to', t) - def _clean_text_value(self, text): - if text is not None and text.strip() : - text = text.strip() - if not isinstance(text, unicode): - text = text.decode('utf-8', 'replace') - text = normalize(text).encode('utf-8') - else : - text = "(none)".encode('utf-8') - return text - - def _add_to_ctoc(self, ctoc_str, record_offset): - # Write vwilen + string to ctoc - # Return offset - # Is there enough room for this string in the current ctoc record? - if 0xfbf8 - self._ctoc.tell() < 2 + len(ctoc_str): - # flush this ctoc, start a new one - # print "closing ctoc_record at 0x%X" % self._ctoc.tell() - # print "starting new ctoc with '%-50.50s ...'" % ctoc_str - # pad with 00 - pad = 0xfbf8 - self._ctoc.tell() - # print "padding %d bytes of 00" % pad - self._ctoc.write('\0' * (pad)) - self._ctoc_records.append(self._ctoc.getvalue()) - self._ctoc.truncate(0) - self._ctoc_offset += 0x10000 - record_offset = self._ctoc_offset - - offset = self._ctoc.tell() + record_offset - self._ctoc.write(decint(len(ctoc_str), DECINT_FORWARD) + ctoc_str) - return offset - - def _add_flat_ctoc_node(self, node, ctoc, title=None): - # Process 'chapter' or 'article' nodes only, force either to 'chapter' - t = node.title if title is None else title - t = self._clean_text_value(t) - self._last_toc_entry = t - - # Create an empty dictionary for this node - ctoc_name_map = {} - - # article = chapter - if node.klass == 'article' : - ctoc_name_map['klass'] = 'chapter' - else : - ctoc_name_map['klass'] = node.klass - - # Add title offset to name map - ctoc_name_map['titleOffset'] = self._add_to_ctoc(t, self._ctoc_offset) - self._chapterCount += 1 - - # append this node's name_map to map - self._ctoc_map.append(ctoc_name_map) - - return - - def _add_structured_ctoc_node(self, node, ctoc, title=None): - # Process 'periodical', 'section' and 'article' - - # Fetch the offset referencing the current ctoc_record - if node.klass is None : - return - t = node.title if title is None else title - t = self._clean_text_value(t) - self._last_toc_entry = t - - # Create an empty dictionary for this node - ctoc_name_map = {} - - # Add the klass of this node - ctoc_name_map['klass'] = node.klass - - if node.klass == 'chapter': - # Add title offset to name map - ctoc_name_map['titleOffset'] = self._add_to_ctoc(t, self._ctoc_offset) - self._chapterCount += 1 - - elif node.klass == 'periodical' : - # Add title offset - ctoc_name_map['titleOffset'] = self._add_to_ctoc(t, self._ctoc_offset) - - # Look for existing class entry 'periodical' in _ctoc_map - for entry in self._ctoc_map: - if entry['klass'] == 'periodical': - # Use the pre-existing instance - ctoc_name_map['classOffset'] = entry['classOffset'] - break - else : - continue - else: - # class names should always be in CNCX 0 - no offset - ctoc_name_map['classOffset'] = self._add_to_ctoc(node.klass, 0) - - self._periodicalCount += 1 - - elif node.klass == 'section' : - # Add title offset - ctoc_name_map['titleOffset'] = self._add_to_ctoc(t, self._ctoc_offset) - - # Look for existing class entry 'section' in _ctoc_map - for entry in self._ctoc_map: - if entry['klass'] == 'section': - # Use the pre-existing instance - ctoc_name_map['classOffset'] = entry['classOffset'] - break - else : - continue - else: - # class names should always be in CNCX 0 - no offset - ctoc_name_map['classOffset'] = self._add_to_ctoc(node.klass, 0) - - self._sectionCount += 1 - - elif node.klass == 'article' : - # Add title offset/title - ctoc_name_map['titleOffset'] = self._add_to_ctoc(t, self._ctoc_offset) - - # Look for existing class entry 'article' in _ctoc_map - for entry in self._ctoc_map: - if entry['klass'] == 'article': - ctoc_name_map['classOffset'] = entry['classOffset'] - break - else : - continue - else: - # class names should always be in CNCX 0 - no offset - ctoc_name_map['classOffset'] = self._add_to_ctoc(node.klass, 0) - - # Add description offset/description - if node.description : - d = self._clean_text_value(node.description) - ctoc_name_map['descriptionOffset'] = self._add_to_ctoc(d, self._ctoc_offset) - else : - ctoc_name_map['descriptionOffset'] = None - - # Add author offset/attribution - if node.author : - a = self._clean_text_value(node.author) - ctoc_name_map['authorOffset'] = self._add_to_ctoc(a, self._ctoc_offset) - else : - ctoc_name_map['authorOffset'] = None - - self._articleCount += 1 - - else : - raise NotImplementedError( \ - 'writer._generate_ctoc.add_node: title: %s has unrecognized klass: %s, playOrder: %d' % \ - (node.title, node.klass, node.play_order)) - - # append this node's name_map to map - self._ctoc_map.append(ctoc_name_map) - - def _generate_ctoc(self): - # Generate the compiled TOC strings - # Each node has 1-4 CTOC entries: - # Periodical (0xDF) - # title, class - # Section (0xFF) - # title, class - # Article (0x3F) - # title, class, description, author - # Chapter (0x0F) - # title, class - # nb: Chapters don't actually have @class, so we synthesize it - # in reader._toc_from_navpoint - - toc = self._oeb.toc - reduced_toc = [] - self._ctoc_map = [] # per node dictionary of {class/title/desc/author} offsets - self._last_toc_entry = None - #ctoc = StringIO() - self._ctoc = StringIO() - - # Track the individual node types - self._periodicalCount = 0 - self._sectionCount = 0 - self._articleCount = 0 - self._chapterCount = 0 - - #first = True - - if self._conforming_periodical_toc : - self._oeb.logger.info('Generating structured CTOC ...') - for (child) in toc.iter(): - if self.opts.verbose > 2 : - self._oeb.logger.info(" %s" % child) - self._add_structured_ctoc_node(child, self._ctoc) - #first = False - - else : - self._oeb.logger.info('Generating flat CTOC ...') - previousOffset = -1 - currentOffset = 0 - for (i, child) in enumerate(toc.iterdescendants()): - # Only add chapters or articles at depth==1 - # no class defaults to 'chapter' - if child.klass is None : child.klass = 'chapter' - if (child.klass == 'article' or child.klass == 'chapter') and child.depth() == 1 : - if self.opts.verbose > 2 : - self._oeb.logger.info("adding (klass:%s depth:%d) %s to flat ctoc" % \ - (child.klass, child.depth(), child) ) - - # Test to see if this child's offset is the same as the previous child's - # offset, skip it - h = child.href - - if h is None: - self._oeb.logger.warn(' Ignoring TOC entry with no href:', - child.title) - continue - if h not in self._id_offsets: - self._oeb.logger.warn(' Ignoring missing TOC entry:', - unicode(child)) - continue - - currentOffset = self._id_offsets[h] - # print "_generate_ctoc: child offset: 0x%X" % currentOffset - - if currentOffset != previousOffset : - self._add_flat_ctoc_node(child, self._ctoc) - reduced_toc.append(child) - previousOffset = currentOffset - else : - self._oeb.logger.warn(" Ignoring redundant href: %s in '%s'" % (h, child.title)) - - else : - if self.opts.verbose > 2 : - self._oeb.logger.info("skipping class: %s depth %d at position %d" % \ - (child.klass, child.depth(),i)) - - # Update the TOC with our edited version - self._oeb.toc.nodes = reduced_toc - - # Instantiate a MobiDocument(mobitype) - if (not self._periodicalCount and not self._sectionCount and not self._articleCount) or \ - not self.opts.mobi_periodical : - mobiType = 0x002 - elif self._periodicalCount: - pt = None - if self._oeb.metadata.publication_type: - x = unicode(self._oeb.metadata.publication_type[0]).split(':') - if len(x) > 1: - pt = x[1] - mobiType = {'newspaper':0x101}.get(pt, 0x103) - else : - raise NotImplementedError('_generate_ctoc: Unrecognized document structured') - - self._MobiDoc = MobiDocument(mobiType) - - if self.opts.verbose > 2 : - structType = 'book' - if mobiType > 0x100 : - structType = 'flat periodical' if mobiType == 0x102 else 'structured periodical' - self._oeb.logger.info("Instantiating a %s MobiDocument of type 0x%X" % (structType, mobiType ) ) - if mobiType > 0x100 : - self._oeb.logger.info("periodicalCount: %d sectionCount: %d articleCount: %d"% \ - (self._periodicalCount, self._sectionCount, self._articleCount) ) - else : - self._oeb.logger.info("chapterCount: %d" % self._chapterCount) - - # Apparently the CTOC must end with a null byte - self._ctoc.write('\0') - - ctoc = self._ctoc.getvalue() - rec_count = len(self._ctoc_records) - self._oeb.logger.info(" CNCX utilization: %d %s %.0f%% full" % \ - (rec_count + 1, 'records, last record' if rec_count else 'record,', - len(ctoc)/655) ) - - return align_block(ctoc) - + # Index nodes {{{ def _write_periodical_node(self, indxt, indices, index, offset, length, count, firstSection, lastSection) : pos = 0xc0 + indxt.tell() indices.write(pack('>H', pos)) # Save the offset for IDXTIndices @@ -2176,48 +1962,8 @@ class MobiWriter(object): indxt.write(decint(self._ctoc_map[index]['titleOffset'], DECINT_FORWARD)) # vwi title offset in CNCX indxt.write(decint(0, DECINT_FORWARD)) # unknown byte - def _compute_offset_length(self, i, node, entries) : - h = node.href - if h not in self._id_offsets: - self._oeb.log.warning('Could not find TOC entry:', node.title) - return -1, -1 + # }}} - offset = self._id_offsets[h] - length = None - # Calculate length based on next entry's offset - for sibling in entries[i+1:]: - h2 = sibling.href - if h2 in self._id_offsets: - offset2 = self._id_offsets[h2] - if offset2 > offset: - length = offset2 - offset - break - if length is None: - length = self._content_length - offset - return offset, length - - def _establish_document_structure(self) : - documentType = None - try : - klass = self._ctoc_map[0]['klass'] - except : - klass = None - - if klass == 'chapter' or klass == None : - documentType = 'book' - if self.opts.verbose > 2 : - self._oeb.logger.info("Adding a MobiBook to self._MobiDoc") - self._MobiDoc.documentStructure = MobiBook() - - elif klass == 'periodical' : - documentType = klass - if self.opts.verbose > 2 : - self._oeb.logger.info("Adding a MobiPeriodical to self._MobiDoc") - self._MobiDoc.documentStructure = MobiPeriodical(self._MobiDoc.getNextNode()) - self._MobiDoc.documentStructure.startAddress = self._anchor_offset_kindle - else : - raise NotImplementedError('_establish_document_structure: unrecognized klass: %s' % klass) - return documentType def _generate_section_indices(self, child, currentSection, myPeriodical, myDoc ) : sectionTitles = list(child.iter())[1:] @@ -2495,6 +2241,270 @@ class MobiWriter(object): last_name, c = self._add_periodical_structured_articles(myDoc, indxt, indices) return align_block(indxt.getvalue()), c, align_block(indices.getvalue()), last_name + # }}} + + # CTOC {{{ + def _add_to_ctoc(self, ctoc_str, record_offset): + # Write vwilen + string to ctoc + # Return offset + # Is there enough room for this string in the current ctoc record? + if 0xfbf8 - self._ctoc.tell() < 2 + len(ctoc_str): + # flush this ctoc, start a new one + # print "closing ctoc_record at 0x%X" % self._ctoc.tell() + # print "starting new ctoc with '%-50.50s ...'" % ctoc_str + # pad with 00 + pad = 0xfbf8 - self._ctoc.tell() + # print "padding %d bytes of 00" % pad + self._ctoc.write('\0' * (pad)) + self._ctoc_records.append(self._ctoc.getvalue()) + self._ctoc.truncate(0) + self._ctoc_offset += 0x10000 + record_offset = self._ctoc_offset + + offset = self._ctoc.tell() + record_offset + self._ctoc.write(decint(len(ctoc_str), DECINT_FORWARD) + ctoc_str) + return offset + + def _add_flat_ctoc_node(self, node, ctoc, title=None): + # Process 'chapter' or 'article' nodes only, force either to 'chapter' + t = node.title if title is None else title + t = self._clean_text_value(t) + self._last_toc_entry = t + + # Create an empty dictionary for this node + ctoc_name_map = {} + + # article = chapter + if node.klass == 'article' : + ctoc_name_map['klass'] = 'chapter' + else : + ctoc_name_map['klass'] = node.klass + + # Add title offset to name map + ctoc_name_map['titleOffset'] = self._add_to_ctoc(t, self._ctoc_offset) + self._chapterCount += 1 + + # append this node's name_map to map + self._ctoc_map.append(ctoc_name_map) + + return + + def _add_structured_ctoc_node(self, node, ctoc, title=None): + # Process 'periodical', 'section' and 'article' + + # Fetch the offset referencing the current ctoc_record + if node.klass is None : + return + t = node.title if title is None else title + t = self._clean_text_value(t) + self._last_toc_entry = t + + # Create an empty dictionary for this node + ctoc_name_map = {} + + # Add the klass of this node + ctoc_name_map['klass'] = node.klass + + if node.klass == 'chapter': + # Add title offset to name map + ctoc_name_map['titleOffset'] = self._add_to_ctoc(t, self._ctoc_offset) + self._chapterCount += 1 + + elif node.klass == 'periodical' : + # Add title offset + ctoc_name_map['titleOffset'] = self._add_to_ctoc(t, self._ctoc_offset) + + # Look for existing class entry 'periodical' in _ctoc_map + for entry in self._ctoc_map: + if entry['klass'] == 'periodical': + # Use the pre-existing instance + ctoc_name_map['classOffset'] = entry['classOffset'] + break + else : + continue + else: + # class names should always be in CNCX 0 - no offset + ctoc_name_map['classOffset'] = self._add_to_ctoc(node.klass, 0) + + self._periodicalCount += 1 + + elif node.klass == 'section' : + # Add title offset + ctoc_name_map['titleOffset'] = self._add_to_ctoc(t, self._ctoc_offset) + + # Look for existing class entry 'section' in _ctoc_map + for entry in self._ctoc_map: + if entry['klass'] == 'section': + # Use the pre-existing instance + ctoc_name_map['classOffset'] = entry['classOffset'] + break + else : + continue + else: + # class names should always be in CNCX 0 - no offset + ctoc_name_map['classOffset'] = self._add_to_ctoc(node.klass, 0) + + self._sectionCount += 1 + + elif node.klass == 'article' : + # Add title offset/title + ctoc_name_map['titleOffset'] = self._add_to_ctoc(t, self._ctoc_offset) + + # Look for existing class entry 'article' in _ctoc_map + for entry in self._ctoc_map: + if entry['klass'] == 'article': + ctoc_name_map['classOffset'] = entry['classOffset'] + break + else : + continue + else: + # class names should always be in CNCX 0 - no offset + ctoc_name_map['classOffset'] = self._add_to_ctoc(node.klass, 0) + + # Add description offset/description + if node.description : + d = self._clean_text_value(node.description) + ctoc_name_map['descriptionOffset'] = self._add_to_ctoc(d, self._ctoc_offset) + else : + ctoc_name_map['descriptionOffset'] = None + + # Add author offset/attribution + if node.author : + a = self._clean_text_value(node.author) + ctoc_name_map['authorOffset'] = self._add_to_ctoc(a, self._ctoc_offset) + else : + ctoc_name_map['authorOffset'] = None + + self._articleCount += 1 + + else : + raise NotImplementedError( \ + 'writer._generate_ctoc.add_node: title: %s has unrecognized klass: %s, playOrder: %d' % \ + (node.title, node.klass, node.play_order)) + + # append this node's name_map to map + self._ctoc_map.append(ctoc_name_map) + + def _generate_ctoc(self): + # Generate the compiled TOC strings + # Each node has 1-4 CTOC entries: + # Periodical (0xDF) + # title, class + # Section (0xFF) + # title, class + # Article (0x3F) + # title, class, description, author + # Chapter (0x0F) + # title, class + # nb: Chapters don't actually have @class, so we synthesize it + # in reader._toc_from_navpoint + + toc = self._oeb.toc + reduced_toc = [] + self._ctoc_map = [] # per node dictionary of {class/title/desc/author} offsets + self._last_toc_entry = None + #ctoc = StringIO() + self._ctoc = StringIO() + + # Track the individual node types + self._periodicalCount = 0 + self._sectionCount = 0 + self._articleCount = 0 + self._chapterCount = 0 + + #first = True + + if self._conforming_periodical_toc : + self._oeb.logger.info('Generating structured CTOC ...') + for (child) in toc.iter(): + if self.opts.verbose > 2 : + self._oeb.logger.info(" %s" % child) + self._add_structured_ctoc_node(child, self._ctoc) + #first = False + + else : + self._oeb.logger.info('Generating flat CTOC ...') + previousOffset = -1 + currentOffset = 0 + for (i, child) in enumerate(toc.iterdescendants()): + # Only add chapters or articles at depth==1 + # no class defaults to 'chapter' + if child.klass is None : child.klass = 'chapter' + if (child.klass == 'article' or child.klass == 'chapter') and child.depth() == 1 : + if self.opts.verbose > 2 : + self._oeb.logger.info("adding (klass:%s depth:%d) %s to flat ctoc" % \ + (child.klass, child.depth(), child) ) + + # Test to see if this child's offset is the same as the previous child's + # offset, skip it + h = child.href + + if h is None: + self._oeb.logger.warn(' Ignoring TOC entry with no href:', + child.title) + continue + if h not in self._id_offsets: + self._oeb.logger.warn(' Ignoring missing TOC entry:', + unicode(child)) + continue + + currentOffset = self._id_offsets[h] + # print "_generate_ctoc: child offset: 0x%X" % currentOffset + + if currentOffset != previousOffset : + self._add_flat_ctoc_node(child, self._ctoc) + reduced_toc.append(child) + previousOffset = currentOffset + else : + self._oeb.logger.warn(" Ignoring redundant href: %s in '%s'" % (h, child.title)) + + else : + if self.opts.verbose > 2 : + self._oeb.logger.info("skipping class: %s depth %d at position %d" % \ + (child.klass, child.depth(),i)) + + # Update the TOC with our edited version + self._oeb.toc.nodes = reduced_toc + + # Instantiate a MobiDocument(mobitype) + if (not self._periodicalCount and not self._sectionCount and not self._articleCount) or \ + not self.opts.mobi_periodical : + mobiType = 0x002 + elif self._periodicalCount: + pt = None + if self._oeb.metadata.publication_type: + x = unicode(self._oeb.metadata.publication_type[0]).split(':') + if len(x) > 1: + pt = x[1] + mobiType = {'newspaper':0x101}.get(pt, 0x103) + else : + raise NotImplementedError('_generate_ctoc: Unrecognized document structured') + + self._MobiDoc = MobiDocument(mobiType) + + if self.opts.verbose > 2 : + structType = 'book' + if mobiType > 0x100 : + structType = 'flat periodical' if mobiType == 0x102 else 'structured periodical' + self._oeb.logger.info("Instantiating a %s MobiDocument of type 0x%X" % (structType, mobiType ) ) + if mobiType > 0x100 : + self._oeb.logger.info("periodicalCount: %d sectionCount: %d articleCount: %d"% \ + (self._periodicalCount, self._sectionCount, self._articleCount) ) + else : + self._oeb.logger.info("chapterCount: %d" % self._chapterCount) + + # Apparently the CTOC must end with a null byte + self._ctoc.write('\0') + + ctoc = self._ctoc.getvalue() + rec_count = len(self._ctoc_records) + self._oeb.logger.info(" CNCX utilization: %d %s %.0f%% full" % \ + (rec_count + 1, 'records, last record' if rec_count else 'record,', + len(ctoc)/655) ) + + return align_block(ctoc) + + # }}} class HTMLRecordData(object): """ A data structure containing indexing/navigation data for an HTML record """ From 2b45d99b02e300c4bdfc06566eb979f45d93a403 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 18 Jul 2011 16:46:31 -0600 Subject: [PATCH 14/30] Improved Instapaper recipe --- recipes/instapaper.recipe | 43 ++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/recipes/instapaper.recipe b/recipes/instapaper.recipe index 0eb5cf0f09..c6175a783f 100644 --- a/recipes/instapaper.recipe +++ b/recipes/instapaper.recipe @@ -1,22 +1,31 @@ -from calibre import strftime from calibre.web.feeds.news import BasicNewsRecipe class AdvancedUserRecipe1299694372(BasicNewsRecipe): - title = u'Instapaper' - __author__ = 'Darko Miletic' - publisher = 'Instapaper.com' - category = 'info, custom, Instapaper' - oldest_article = 365 + title = u'Instapaper' + __author__ = 'Darko Miletic' + publisher = 'Instapaper.com' + category = 'info, custom, Instapaper' + oldest_article = 365 max_articles_per_feed = 100 no_stylesheets = True + remove_javascript = True + remove_tags = [ + dict(name='div', attrs={'id':'text_controls_toggle'}) + ,dict(name='script') + ,dict(name='div', attrs={'id':'text_controls'}) + ,dict(name='div', attrs={'id':'editing_controls'}) + ,dict(name='div', attrs={'class':'bar bottom'}) + ] use_embedded_content = False needs_subscription = True INDEX = u'http://www.instapaper.com' LOGIN = INDEX + u'/user/login' - - feeds = [(u'Instapaper Unread', u'http://www.instapaper.com/u'), (u'Instapaper Starred', u'http://www.instapaper.com/starred')] + feeds = [ + (u'Instapaper Unread', u'http://www.instapaper.com/u'), + (u'Instapaper Starred', u'http://www.instapaper.com/starred') + ] def get_browser(self): br = BasicNewsRecipe.get_browser() @@ -37,18 +46,20 @@ class AdvancedUserRecipe1299694372(BasicNewsRecipe): self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl)) articles = [] soup = self.index_to_soup(feedurl) - for item in soup.findAll('div', attrs={'class':'titleRow'}): - description = self.tag_to_string(item.div) + for item in soup.findAll('div', attrs={'class':'cornerControls'}): + #description = self.tag_to_string(item.div) atag = item.a if atag and atag.has_key('href'): url = atag['href'] - title = self.tag_to_string(atag) - date = strftime(self.timefmt) articles.append({ - 'title' :title - ,'date' :date - ,'url' :url - ,'description':description + 'url' :url }) totalfeeds.append((feedtitle, articles)) return totalfeeds + + def print_version(self, url): + return 'http://www.instapaper.com' + url + + def populate_article_metadata(self, article, soup, first): + article.title = soup.find('title').contents[0].strip() + From ca2c41516af57e1c036e87e1caf8bd1f0ccb0ef0 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 18 Jul 2011 17:53:26 -0600 Subject: [PATCH 15/30] Content server: Add a link at the bottom of the mobile interface to switch tot he full interface. Fixes #812525 ([Enhancement] Web app) --- resources/content_server/browse/browse.html | 1 + src/calibre/ebooks/mobi/debug.py | 45 +++++++++++++++++++-- src/calibre/library/server/mobile.py | 12 +++++- src/calibre/manual/faq.rst | 4 +- 4 files changed, 56 insertions(+), 6 deletions(-) diff --git a/resources/content_server/browse/browse.html b/resources/content_server/browse/browse.html index 6a9697dc06..cf17742c87 100644 --- a/resources/content_server/browse/browse.html +++ b/resources/content_server/browse/browse.html @@ -11,6 +11,7 @@ + diff --git a/src/calibre/ebooks/mobi/debug.py b/src/calibre/ebooks/mobi/debug.py index ce7d78303e..884311617d 100644 --- a/src/calibre/ebooks/mobi/debug.py +++ b/src/calibre/ebooks/mobi/debug.py @@ -10,7 +10,7 @@ __docformat__ = 'restructuredtext en' import struct, datetime, sys, os from calibre.utils.date import utc_tz from calibre.ebooks.mobi.langcodes import main_language, sub_language -from calibre.ebooks.mobi.writer2.utils import decode_hex_number +from calibre.ebooks.mobi.writer2.utils import decode_hex_number, decint # PalmDB {{{ class PalmDOCAttributes(object): @@ -498,9 +498,45 @@ class IndexHeader(object): # {{{ class IndexEntry(object): # {{{ + TYPES = { + # Present in book type files + 0x0f : 'chapter', + 0x6f : 'chapter_with_subchapters', + 0x1f : 'subchapter', + # Present in periodicals + 0xdf : 'periodical', + 0xff : 'section', + 0x3f : 'article', + } + def __init__(self, ident, entry_type, raw): self.id = ident - self.entry_type = entry_type + self.fields = [] + self.sub_type = None + + try: + self.entry_type = self.TYPES[entry_type] + except KeyError: + raise ValueError('Unknown IndexEntry type: %s'%hex(entry_type)) + + if self.entry_type in (0xdf, 0xff): + self.subtype = ord(raw[0]) + raw = raw[1:] + while True: + val, consumed = decint(raw) + raw = raw[consumed:] + if val == 0: + break + else: + self.fields.append(val) + + + def __str__(self): + ans = ['Index Entry(id=%s, entry_type=%s, sub_type=%s)'%( + self.id, self.entry_type, self.sub_type)] + ans.append('\tFields: %r'%self.fields) + return '\n'.join(ans) + # }}} class IndexRecord(object): # {{{ @@ -538,7 +574,7 @@ class IndexRecord(object): # {{{ index = indxt[off:] ident, consumed = decode_hex_number(index) index = index[consumed:] - entry_type = u(b'>B', index[0]) + entry_type, = u(b'>B', index[0]) self.indices.append(IndexEntry(ident, entry_type, index[1:])) @@ -557,6 +593,9 @@ class IndexRecord(object): # {{{ u(self.unknown3) u(self.unknown4) a('Index offsets: %r'%self.index_offsets) + a('\nIndex Entries:') + for entry in self.indices: + a(str(entry)+'\n') return '\n'.join(ans) diff --git a/src/calibre/library/server/mobile.py b/src/calibre/library/server/mobile.py index ad5ee4af96..3ce96a2b49 100644 --- a/src/calibre/library/server/mobile.py +++ b/src/calibre/library/server/mobile.py @@ -153,12 +153,22 @@ def build_index(books, num, search, sort, order, start, total, url_base, CKEYS, bookt.append(TR(thumbnail, data)) # }}} + body.append(HR()) + body.append(DIV( + A(_('Switch to the full interface (non-mobile interface)'), + href="/browse", + style="text-decoration: none; color: blue", + title=_('The full interface gives you many more features, ' + 'but it may not work well on a small screen')), + style="text-align:center")) return HTML( HEAD( TITLE(__appname__ + ' Library'), LINK(rel='icon', href='http://calibre-ebook.com/favicon.ico', type='image/x-icon'), - LINK(rel='stylesheet', type='text/css', href=prefix+'/mobile/style.css') + LINK(rel='stylesheet', type='text/css', + href=prefix+'/mobile/style.css'), + LINK(rel='apple-touch-icon', href="/static/calibre.png") ), # End head body ) # End html diff --git a/src/calibre/manual/faq.rst b/src/calibre/manual/faq.rst index 5601407282..556f508880 100644 --- a/src/calibre/manual/faq.rst +++ b/src/calibre/manual/faq.rst @@ -405,9 +405,9 @@ To those of you that claim that you need access to the filesystem to so that you If you are worried that someday |app| will cease to be developed, leaving all your books marooned in its folder structure, explore the powerful "Save to Disk" feature in |app| that lets you export all your files into a folder structure of arbitrary complexity based on their metadata. -Since I keep getting asked why there are numbers at the end of the title folder name, the reason is for *robustness*. That number is the id number of the book record in the |app| database. The presence of the number allows you to have multiple records with the same title and author names. More importantly, it is part of what allows |app| to magically regenerate the database with all metadata if the database file gets corrupted. Given that |app|'s mission is to get you to stop storing metadata in filenames and stop using the filesystem to find things, the increased robustness afforded by the id numbers is well worth the uglier folder names. +Finally, the reason there are numbers at the end of every title folder, is for *robustness*. That number is the id number of the book record in the |app| database. The presence of the number allows you to have multiple records with the same title and author names. It is also part of what allows |app| to magically regenerate the database with all metadata if the database file gets corrupted. Given that |app|'s mission is to get you to stop storing metadata in filenames and stop using the filesystem to find things, the increased robustness afforded by the id numbers is well worth the uglier folder names. -Finally, if you are irrevocably wedded to using the filesystem to store your metadata, feel free to patch your local copy of |app| to use whatever storage scheme you like. But, do not bother me with requests to change the directory structure, **they will be ignored**. +If you are still not convinced, then I'm afraid |app| is not for you. Look elsewhere for your book cataloguing needs. Just so we're clear, **this is not going to change**. Kindly do not contact us in an attempt to get us to change this. Why doesn't |app| have a column for foo? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From 79ca569caae628806160e3372cdcd5cd2e6912bc Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 18 Jul 2011 19:32:46 -0600 Subject: [PATCH 16/30] Mobi debug: Decompile CTOC and fix interpretation of index entries --- src/calibre/ebooks/mobi/debug.py | 69 +++++++++++++++++++++++--------- 1 file changed, 51 insertions(+), 18 deletions(-) diff --git a/src/calibre/ebooks/mobi/debug.py b/src/calibre/ebooks/mobi/debug.py index 884311617d..9eccd508a0 100644 --- a/src/calibre/ebooks/mobi/debug.py +++ b/src/calibre/ebooks/mobi/debug.py @@ -8,6 +8,7 @@ __copyright__ = '2011, Kovid Goyal ' __docformat__ = 'restructuredtext en' import struct, datetime, sys, os +from collections import OrderedDict from calibre.utils.date import utc_tz from calibre.ebooks.mobi.langcodes import main_language, sub_language from calibre.ebooks.mobi.writer2.utils import decode_hex_number, decint @@ -509,32 +510,31 @@ class IndexEntry(object): # {{{ 0x3f : 'article', } - def __init__(self, ident, entry_type, raw): + def __init__(self, ident, entry_type, raw, is_last): self.id = ident self.fields = [] self.sub_type = None + self.raw = raw try: self.entry_type = self.TYPES[entry_type] except KeyError: - raise ValueError('Unknown IndexEntry type: %s'%hex(entry_type)) + raise ValueError('Unknown Index Entry type: %s'%hex(entry_type)) if self.entry_type in (0xdf, 0xff): self.subtype = ord(raw[0]) raw = raw[1:] - while True: + while raw: val, consumed = decint(raw) raw = raw[consumed:] - if val == 0: - break - else: - self.fields.append(val) - + self.fields.append(val) + if is_last and self.fields[-1] == 0: + self.fields = self.fields[:-1] def __str__(self): - ans = ['Index Entry(id=%s, entry_type=%s, sub_type=%s)'%( - self.id, self.entry_type, self.sub_type)] - ans.append('\tFields: %r'%self.fields) + ans = ['Index Entry(id=%s, entry_type=%s, sub_type=%s, length=%d)'%( + self.id, self.entry_type, self.sub_type, len(self.raw))] + ans.append('\tFields (%d): %r'%(len(self.fields), self.fields)) return '\n'.join(ans) # }}} @@ -570,16 +570,21 @@ class IndexRecord(object): # {{{ indxt = raw[192:self.idxt_offset] self.indices = [] - for off in self.index_offsets: - index = indxt[off:] - ident, consumed = decode_hex_number(index) - index = index[consumed:] - entry_type, = u(b'>B', index[0]) - self.indices.append(IndexEntry(ident, entry_type, index[1:])) + for i, off in enumerate(self.index_offsets): + try: + next_off = self.index_offsets[i+1] + is_last = False + except: + next_off = len(indxt) + is_last = True + ident, consumed = decode_hex_number(indxt[off:]) + entry_type, = u(b'>B', indxt[off+consumed]) + self.indices.append(IndexEntry(ident, entry_type, + indxt[off+consumed+1:next_off], is_last)) def __str__(self): - ans = ['*'*20 + ' Index Record (%d bytes)'%len(self.record.raw)+ '*'*20] + ans = ['*'*20 + ' Index Record (%d bytes) '%len(self.record.raw)+ '*'*20] a = ans.append def u(w): a('Unknown: %r (%d bytes) (All zeros: %r)'%(w, @@ -601,6 +606,29 @@ class IndexRecord(object): # {{{ # }}} +class CTOC(object) : # {{{ + + def __init__(self, records, codec): + self.records = OrderedDict() + pos = 0 + for record in records: + raw = record.raw + while pos < len(raw): + length, consumed = decint(raw[pos:]) + if length > 0: + self.records[pos] = raw[pos+consumed:pos+consumed+length].decode( + codec) + pos += consumed+length + + def __str__(self): + ans = ['*'*20 + ' CTOC (%d strings) '%len(self.records)+ '*'*20] + for k, v in self.records.iteritems(): + ans.append('%10d : %s'%(k, v)) + return '\n'.join(ans) + + +# }}} + class MOBIFile(object): # {{{ def __init__(self, stream): @@ -633,6 +661,9 @@ class MOBIFile(object): # {{{ pir = self.mobi_header.primary_index_record if pir != 0xffffffff: self.index_header = IndexHeader(self.records[pir]) + self.ctoc = CTOC(self.records[ + pir+2:pir+2+self.index_header.num_of_ctoc_blocks], + self.index_header.index_encoding) self.index_record = IndexRecord(self.records[pir+1]) @@ -660,6 +691,8 @@ def inspect_mobi(path_or_stream): with open(os.path.join(ddir, 'index.txt'), 'wb') as out: print(str(f.index_header), file=out) print('\n\n', file=out) + print(str(f.ctoc).encode('utf-8'), file=out) + print('\n\n', file=out) print(str(f.index_record), file=out) print ('Debug data saved to:', ddir) From 08dff7d7221ecd070f8ac2d155088be85759a4ab Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 18 Jul 2011 20:04:03 -0600 Subject: [PATCH 17/30] ... --- src/calibre/ebooks/mobi/debug.py | 71 +++++++++++++++++++++++++++++--- 1 file changed, 65 insertions(+), 6 deletions(-) diff --git a/src/calibre/ebooks/mobi/debug.py b/src/calibre/ebooks/mobi/debug.py index 9eccd508a0..dd7707e2f8 100644 --- a/src/calibre/ebooks/mobi/debug.py +++ b/src/calibre/ebooks/mobi/debug.py @@ -511,7 +511,7 @@ class IndexEntry(object): # {{{ } def __init__(self, ident, entry_type, raw, is_last): - self.id = ident + self.index = ident self.fields = [] self.sub_type = None self.raw = raw @@ -531,10 +531,69 @@ class IndexEntry(object): # {{{ if is_last and self.fields[-1] == 0: self.fields = self.fields[:-1] + self.interpret() + + def interpret(self): + self.offset = self.fields[0] + self.object_size = self.fields[1] + self.label_offset = self.fields[2] + self.depth = self.fields[3] + self.extra = OrderedDict() + self.extra_fields = [] + if self.entry_type == 'subchapter': + self.parent_index = self.fields[4] + self.extra['Parent chapter index'] = 'parent_index' + self.extra_fields = self.fields[5:] + elif self.entry_type == 'article': + self.class_offset = self.fields[4] + self.extra['Class offset in CTOC'] = 'class_offset' + self.parent_index = self.fields[5] + self.extra['Parent section index'] = 'parent_index' + if len(self.fields) > 6: + self.desc_offset = self.fields[6] + self.extra['Decription offset in CTOC'] = 'desc_offset' + if len(self.fields) > 7: + self.author_offset = self.fields[7] + self.extra['Author offset in CTOC'] = 'author_offset' + self.extra_fields = self.fields[8:] + elif self.entry_type == 'chapter_with_subchapters': + self.first_subchapter_index = self.fields[4] + self.last_subchapter_index = self.fields[5] + self.extra['First subchapter index'] = 'first_subchapter_index' + self.extra['Last subchapter index'] = 'last_subchapter_index' + self.extra_fields = self.fields[6:] + elif self.entry_type == 'periodical': + self.class_offset = self.fields[4] + self.extra['Class offset in CTOC'] = 'class_offset' + self.first_section_index = self.fields[5] + self.last_section_index = self.fields[6] + self.extra['First section index'] = 'first_section_index' + self.extra['Last section index'] = 'last_section_index' + self.extra_fields = self.fields[7:] + elif self.entry_type == 'section': + self.class_offset = self.fields[4] + self.extra['Class offset in CTOC'] = 'class_offset' + self.periodical_index = self.fields[5] + self.extra['Periodical index'] = 'periodical_index' + self.first_article_index = self.fields[6] + self.last_article_index = self.fields[7] + self.extra['First article index'] = 'first_article_index' + self.extra['Last article index'] = 'last_article_index' + self.extra_fields = self.fields[8:] + def __str__(self): - ans = ['Index Entry(id=%s, entry_type=%s, sub_type=%s, length=%d)'%( - self.id, self.entry_type, self.sub_type, len(self.raw))] - ans.append('\tFields (%d): %r'%(len(self.fields), self.fields)) + ans = ['Index Entry(index=%s, entry_type=%s, sub_type=%s, length=%d)'%( + self.index, self.entry_type, self.sub_type, len(self.raw))] + ans.append('\tOffset in HTML: %d'%self.offset) + ans.append('\tObject size in HTML: %d'%self.object_size) + ans.append('\tLabel offset in CTOC: %d'%self.label_offset) + ans.append('\tDepth: %d'%self.depth) + for text, attr in self.extra.iteritems(): + ans.append('\t%s: %d'%(text, getattr(self, attr))) + if self.extra_fields: + ans.append('\tExtra Fields (%d): %r'%(len(self.extra_fields), + self.extra_fields)) + return '\n'.join(ans) # }}} @@ -577,9 +636,9 @@ class IndexRecord(object): # {{{ except: next_off = len(indxt) is_last = True - ident, consumed = decode_hex_number(indxt[off:]) + index, consumed = decode_hex_number(indxt[off:]) entry_type, = u(b'>B', indxt[off+consumed]) - self.indices.append(IndexEntry(ident, entry_type, + self.indices.append(IndexEntry(index, entry_type, indxt[off+consumed+1:next_off], is_last)) From 55987fa6cb801e196cc84f8e7418e7be40db63fe Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 18 Jul 2011 22:38:04 -0600 Subject: [PATCH 18/30] Mobi debug: Figured out the TAGX table, use it to properly decode the index entries --- src/calibre/ebooks/mobi/debug.py | 189 +++++++++++++++++-------------- 1 file changed, 105 insertions(+), 84 deletions(-) diff --git a/src/calibre/ebooks/mobi/debug.py b/src/calibre/ebooks/mobi/debug.py index dd7707e2f8..2dd26e9f83 100644 --- a/src/calibre/ebooks/mobi/debug.py +++ b/src/calibre/ebooks/mobi/debug.py @@ -377,18 +377,17 @@ class TagX(object): # {{{ def __init__(self, raw, control_byte_count): self.tag = ord(raw[0]) self.num_values = ord(raw[1]) - self.bmask = ord(raw[2]) - self.bitmask = bin(self.bmask) + self.bitmask = ord(raw[2]) # End of file = 1 iff last entry # When it is 1 all others are 0 self.eof = ord(raw[3]) self.is_eof = (self.eof == 1 and self.tag == 0 and self.num_values == 0 - and self.bmask == 0) + and self.bitmask == 0) def __repr__(self): - return 'TAGX(tag=%02d, num_values=%d, bitmask=%r (%d), eof=%d)' % (self.tag, - self.num_values, self.bitmask, self.bmask, self.eof) + return 'TAGX(tag=%02d, num_values=%d, bitmask=%r, eof=%d)' % (self.tag, + self.num_values, bin(self.bitmask), self.eof) # }}} class IndexHeader(object): # {{{ @@ -444,6 +443,7 @@ class IndexHeader(object): # {{{ self.tagx_control_byte_count)) if self.tagx_entries and not self.tagx_entries[-1].is_eof: raise ValueError('TAGX last entry is not EOF') + self.tagx_entries = self.tagx_entries[:-1] idxt0_pos = self.header_length+self.tagx_header_length last_num, consumed = decode_hex_number(raw[idxt0_pos:]) @@ -497,6 +497,81 @@ class IndexHeader(object): # {{{ return '\n'.join(ans) # }}} +class Tag(object): # {{{ + + ''' + Index entries are a collection of tags. Each tag is represented by this + class. + ''' + + TAG_MAP = { + 1: ('offset', 'Offset in HTML'), + 2: ('size', 'Size in HTML'), + 3: ('label_offset', 'Offset to label in CNCX'), + 4: ('depth', 'Depth of this entry in TOC'), + + # The remaining tag types have to be interpreted subject to the type + # of index entry they are present in + } + + INTERPRET_MAP = { + 'subchapter': { + 5 : ('Parent chapter index', 'parent_index') + }, + + 'article' : { + 5 : ('Class offset in CTOC', 'class_offset'), + 21 : ('Parent section index', 'parent_index'), + 22 : ('Description offset in CTOC', 'desc_offset'), + 23 : ('Author offset in CTOC', 'author_offset'), + }, + + 'chapter_with_subchapters' : { + 22 : ('First subchapter index', 'first_subchapter_index'), + 23 : ('Last subchapter index', 'last_subchapter_index'), + }, + + 'periodical' : { + 5 : ('Class offset in CTOC', 'class_offset'), + 22 : ('First section index', 'first_section_index'), + 23 : ('Last section index', 'last_section_index'), + }, + + 'section' : { + 5 : ('Class offset in CTOC', 'class_offset'), + 21 : ('Periodical index', 'periodical_index'), + 22 : ('First article index', 'first_article_index'), + 23 : ('Last article index', 'last_article_index'), + }, + } + + + def __init__(self, tagx, vals, entry_type, ctoc): + self.value = vals if len(vals) > 1 else vals[0] + self.entry_type = entry_type + self.ctoc_value = None + if tagx.tag in self.TAG_MAP: + self.attr, self.desc = self.TAG_MAP[tagx.tag] + else: + try: + td = self.INTERPRET_MAP[entry_type] + except: + raise ValueError('Unknown entry type: %s'%entry_type) + try: + self.desc, self.attr = td[tagx.tag] + except: + raise ValueError('Unknown tag: %d for entry type: %s'%( + tagx.tag, entry_type)) + if '_offset' in self.attr: + self.ctoc_value = ctoc[self.value] + + def __str__(self): + if self.ctoc_value is not None: + return '%s : %r [%r]'%(self.desc, self.value, self.ctoc_value) + return '%s : %r'%(self.desc, self.value) + +# }}} + class IndexEntry(object): # {{{ TYPES = { @@ -510,97 +585,41 @@ class IndexEntry(object): # {{{ 0x3f : 'article', } - def __init__(self, ident, entry_type, raw, is_last): + def __init__(self, ident, entry_type, raw, ctoc, tagx_entries): self.index = ident - self.fields = [] - self.sub_type = None self.raw = raw + self.tags = [] try: self.entry_type = self.TYPES[entry_type] except KeyError: raise ValueError('Unknown Index Entry type: %s'%hex(entry_type)) - if self.entry_type in (0xdf, 0xff): - self.subtype = ord(raw[0]) - raw = raw[1:] - while raw: - val, consumed = decint(raw) - raw = raw[consumed:] - self.fields.append(val) - if is_last and self.fields[-1] == 0: - self.fields = self.fields[:-1] + expected_tags = [tag for tag in tagx_entries if tag.bitmask & + entry_type] - self.interpret() - - def interpret(self): - self.offset = self.fields[0] - self.object_size = self.fields[1] - self.label_offset = self.fields[2] - self.depth = self.fields[3] - self.extra = OrderedDict() - self.extra_fields = [] - if self.entry_type == 'subchapter': - self.parent_index = self.fields[4] - self.extra['Parent chapter index'] = 'parent_index' - self.extra_fields = self.fields[5:] - elif self.entry_type == 'article': - self.class_offset = self.fields[4] - self.extra['Class offset in CTOC'] = 'class_offset' - self.parent_index = self.fields[5] - self.extra['Parent section index'] = 'parent_index' - if len(self.fields) > 6: - self.desc_offset = self.fields[6] - self.extra['Decription offset in CTOC'] = 'desc_offset' - if len(self.fields) > 7: - self.author_offset = self.fields[7] - self.extra['Author offset in CTOC'] = 'author_offset' - self.extra_fields = self.fields[8:] - elif self.entry_type == 'chapter_with_subchapters': - self.first_subchapter_index = self.fields[4] - self.last_subchapter_index = self.fields[5] - self.extra['First subchapter index'] = 'first_subchapter_index' - self.extra['Last subchapter index'] = 'last_subchapter_index' - self.extra_fields = self.fields[6:] - elif self.entry_type == 'periodical': - self.class_offset = self.fields[4] - self.extra['Class offset in CTOC'] = 'class_offset' - self.first_section_index = self.fields[5] - self.last_section_index = self.fields[6] - self.extra['First section index'] = 'first_section_index' - self.extra['Last section index'] = 'last_section_index' - self.extra_fields = self.fields[7:] - elif self.entry_type == 'section': - self.class_offset = self.fields[4] - self.extra['Class offset in CTOC'] = 'class_offset' - self.periodical_index = self.fields[5] - self.extra['Periodical index'] = 'periodical_index' - self.first_article_index = self.fields[6] - self.last_article_index = self.fields[7] - self.extra['First article index'] = 'first_article_index' - self.extra['Last article index'] = 'last_article_index' - self.extra_fields = self.fields[8:] + for tag in expected_tags: + vals = [] + for i in range(tag.num_values): + if not raw: + raise ValueError('Index entry does not match TAGX header') + val, consumed = decint(raw) + raw = raw[consumed:] + vals.append(val) + self.tags.append(Tag(tag, vals, self.entry_type, ctoc)) def __str__(self): - ans = ['Index Entry(index=%s, entry_type=%s, sub_type=%s, length=%d)'%( - self.index, self.entry_type, self.sub_type, len(self.raw))] - ans.append('\tOffset in HTML: %d'%self.offset) - ans.append('\tObject size in HTML: %d'%self.object_size) - ans.append('\tLabel offset in CTOC: %d'%self.label_offset) - ans.append('\tDepth: %d'%self.depth) - for text, attr in self.extra.iteritems(): - ans.append('\t%s: %d'%(text, getattr(self, attr))) - if self.extra_fields: - ans.append('\tExtra Fields (%d): %r'%(len(self.extra_fields), - self.extra_fields)) - + ans = ['Index Entry(index=%s, entry_type=%s, length=%d)'%( + self.index, self.entry_type, len(self.tags))] + for tag in self.tags: + ans.append('\t'+str(tag)) return '\n'.join(ans) # }}} class IndexRecord(object): # {{{ - def __init__(self, record): + def __init__(self, record, index_header, ctoc): self.record = record raw = self.record.raw if raw[:4] != b'INDX': @@ -632,14 +651,12 @@ class IndexRecord(object): # {{{ for i, off in enumerate(self.index_offsets): try: next_off = self.index_offsets[i+1] - is_last = False except: next_off = len(indxt) - is_last = True index, consumed = decode_hex_number(indxt[off:]) - entry_type, = u(b'>B', indxt[off+consumed]) + entry_type = ord(indxt[off+consumed]) self.indices.append(IndexEntry(index, entry_type, - indxt[off+consumed+1:next_off], is_last)) + indxt[off+consumed+1:next_off], ctoc, index_header.tagx_entries)) def __str__(self): @@ -679,6 +696,9 @@ class CTOC(object) : # {{{ codec) pos += consumed+length + def __getitem__(self, offset): + return self.records.get(offset) + def __str__(self): ans = ['*'*20 + ' CTOC (%d strings) '%len(self.records)+ '*'*20] for k, v in self.records.iteritems(): @@ -723,7 +743,8 @@ class MOBIFile(object): # {{{ self.ctoc = CTOC(self.records[ pir+2:pir+2+self.index_header.num_of_ctoc_blocks], self.index_header.index_encoding) - self.index_record = IndexRecord(self.records[pir+1]) + self.index_record = IndexRecord(self.records[pir+1], + self.index_header, self.ctoc) def print_header(self, f=sys.stdout): From a389b310c63a5f6bfa63c48de1ab5dad3c33d9e4 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 18 Jul 2011 22:40:25 -0600 Subject: [PATCH 19/30] ... --- src/calibre/ebooks/mobi/debug.py | 44 ++++++++++++++++---------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/src/calibre/ebooks/mobi/debug.py b/src/calibre/ebooks/mobi/debug.py index 2dd26e9f83..32578781b8 100644 --- a/src/calibre/ebooks/mobi/debug.py +++ b/src/calibre/ebooks/mobi/debug.py @@ -421,7 +421,7 @@ class IndexHeader(object): # {{{ self.ordt_start, = struct.unpack('>I', raw[40:44]) self.ligt_start, = struct.unpack('>I', raw[44:48]) self.num_of_ligt_entries, = struct.unpack('>I', raw[48:52]) - self.num_of_ctoc_blocks, = struct.unpack('>I', raw[52:56]) + self.num_of_cncx_blocks, = struct.unpack('>I', raw[52:56]) self.unknown2 = raw[56:180] self.tagx_offset, = struct.unpack(b'>I', raw[180:184]) if self.tagx_offset != self.header_length: @@ -482,7 +482,7 @@ class IndexHeader(object): # {{{ a('ORDT start: %d'%self.ordt_start) a('LIGT start: %d'%self.ligt_start) a('Number of LIGT entries: %d'%self.num_of_ligt_entries) - a('Number of CTOC blocks: %d'%self.num_of_ctoc_blocks) + a('Number of cncx blocks: %d'%self.num_of_cncx_blocks) u(self.unknown2) a('TAGX offset: %d'%self.tagx_offset) u(self.unknown3) @@ -520,10 +520,10 @@ class Tag(object): # {{{ }, 'article' : { - 5 : ('Class offset in CTOC', 'class_offset'), + 5 : ('Class offset in cncx', 'class_offset'), 21 : ('Parent section index', 'parent_index'), - 22 : ('Description offset in CTOC', 'desc_offset'), - 23 : ('Author offset in CTOC', 'author_offset'), + 22 : ('Description offset in cncx', 'desc_offset'), + 23 : ('Author offset in cncx', 'author_offset'), }, 'chapter_with_subchapters' : { @@ -532,13 +532,13 @@ class Tag(object): # {{{ }, 'periodical' : { - 5 : ('Class offset in CTOC', 'class_offset'), + 5 : ('Class offset in cncx', 'class_offset'), 22 : ('First section index', 'first_section_index'), 23 : ('Last section index', 'last_section_index'), }, 'section' : { - 5 : ('Class offset in CTOC', 'class_offset'), + 5 : ('Class offset in cncx', 'class_offset'), 21 : ('Periodical index', 'periodical_index'), 22 : ('First article index', 'first_article_index'), 23 : ('Last article index', 'last_article_index'), @@ -546,10 +546,10 @@ class Tag(object): # {{{ } - def __init__(self, tagx, vals, entry_type, ctoc): + def __init__(self, tagx, vals, entry_type, cncx): self.value = vals if len(vals) > 1 else vals[0] self.entry_type = entry_type - self.ctoc_value = None + self.cncx_value = None if tagx.tag in self.TAG_MAP: self.attr, self.desc = self.TAG_MAP[tagx.tag] else: @@ -563,11 +563,11 @@ class Tag(object): # {{{ raise ValueError('Unknown tag: %d for entry type: %s'%( tagx.tag, entry_type)) if '_offset' in self.attr: - self.ctoc_value = ctoc[self.value] + self.cncx_value = cncx[self.value] def __str__(self): - if self.ctoc_value is not None: - return '%s : %r [%r]'%(self.desc, self.value, self.ctoc_value) + if self.cncx_value is not None: + return '%s : %r [%r]'%(self.desc, self.value, self.cncx_value) return '%s : %r'%(self.desc, self.value) # }}} @@ -585,7 +585,7 @@ class IndexEntry(object): # {{{ 0x3f : 'article', } - def __init__(self, ident, entry_type, raw, ctoc, tagx_entries): + def __init__(self, ident, entry_type, raw, cncx, tagx_entries): self.index = ident self.raw = raw self.tags = [] @@ -606,7 +606,7 @@ class IndexEntry(object): # {{{ val, consumed = decint(raw) raw = raw[consumed:] vals.append(val) - self.tags.append(Tag(tag, vals, self.entry_type, ctoc)) + self.tags.append(Tag(tag, vals, self.entry_type, cncx)) def __str__(self): ans = ['Index Entry(index=%s, entry_type=%s, length=%d)'%( @@ -619,7 +619,7 @@ class IndexEntry(object): # {{{ class IndexRecord(object): # {{{ - def __init__(self, record, index_header, ctoc): + def __init__(self, record, index_header, cncx): self.record = record raw = self.record.raw if raw[:4] != b'INDX': @@ -656,7 +656,7 @@ class IndexRecord(object): # {{{ index, consumed = decode_hex_number(indxt[off:]) entry_type = ord(indxt[off+consumed]) self.indices.append(IndexEntry(index, entry_type, - indxt[off+consumed+1:next_off], ctoc, index_header.tagx_entries)) + indxt[off+consumed+1:next_off], cncx, index_header.tagx_entries)) def __str__(self): @@ -682,7 +682,7 @@ class IndexRecord(object): # {{{ # }}} -class CTOC(object) : # {{{ +class CNCX(object) : # {{{ def __init__(self, records, codec): self.records = OrderedDict() @@ -700,7 +700,7 @@ class CTOC(object) : # {{{ return self.records.get(offset) def __str__(self): - ans = ['*'*20 + ' CTOC (%d strings) '%len(self.records)+ '*'*20] + ans = ['*'*20 + ' cncx (%d strings) '%len(self.records)+ '*'*20] for k, v in self.records.iteritems(): ans.append('%10d : %s'%(k, v)) return '\n'.join(ans) @@ -740,11 +740,11 @@ class MOBIFile(object): # {{{ pir = self.mobi_header.primary_index_record if pir != 0xffffffff: self.index_header = IndexHeader(self.records[pir]) - self.ctoc = CTOC(self.records[ - pir+2:pir+2+self.index_header.num_of_ctoc_blocks], + self.cncx = CNCX(self.records[ + pir+2:pir+2+self.index_header.num_of_cncx_blocks], self.index_header.index_encoding) self.index_record = IndexRecord(self.records[pir+1], - self.index_header, self.ctoc) + self.index_header, self.cncx) def print_header(self, f=sys.stdout): @@ -771,7 +771,7 @@ def inspect_mobi(path_or_stream): with open(os.path.join(ddir, 'index.txt'), 'wb') as out: print(str(f.index_header), file=out) print('\n\n', file=out) - print(str(f.ctoc).encode('utf-8'), file=out) + print(str(f.cncx).encode('utf-8'), file=out) print('\n\n', file=out) print(str(f.index_record), file=out) From ab23416d347113215718ecb39c1e15eff4e08cdb Mon Sep 17 00:00:00 2001 From: Charles Haley <> Date: Tue, 19 Jul 2011 12:55:35 +0100 Subject: [PATCH 20/30] Remove EPubBuy.DE at the request of the store --- src/calibre/customize/builtins.py | 22 +++++++++++----------- src/calibre/gui2/store/declined.txt | 1 + 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 7b43f2844a..17cc3c1028 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -1208,16 +1208,16 @@ class StoreEbookscomStore(StoreBase): formats = ['EPUB', 'LIT', 'MOBI', 'PDF'] affiliate = True -class StoreEPubBuyDEStore(StoreBase): - name = 'EPUBBuy DE' - author = 'Charles Haley' - description = u'Bei EPUBBuy.com finden Sie ausschliesslich eBooks im weitverbreiteten EPUB-Format und ohne DRM. So haben Sie die freie Wahl, wo Sie Ihr eBook lesen: Tablet, eBook-Reader, Smartphone oder einfach auf Ihrem PC. So macht eBook-Lesen Spaß!' - actual_plugin = 'calibre.gui2.store.stores.epubbuy_de_plugin:EPubBuyDEStore' - - drm_free_only = True - headquarters = 'DE' - formats = ['EPUB'] - affiliate = True +#class StoreEPubBuyDEStore(StoreBase): +# name = 'EPUBBuy DE' +# author = 'Charles Haley' +# description = u'Bei EPUBBuy.com finden Sie ausschliesslich eBooks im weitverbreiteten EPUB-Format und ohne DRM. So haben Sie die freie Wahl, wo Sie Ihr eBook lesen: Tablet, eBook-Reader, Smartphone oder einfach auf Ihrem PC. So macht eBook-Lesen Spaß!' +# actual_plugin = 'calibre.gui2.store.stores.epubbuy_de_plugin:EPubBuyDEStore' +# +# drm_free_only = True +# headquarters = 'DE' +# formats = ['EPUB'] +# affiliate = True class StoreEBookShoppeUKStore(StoreBase): name = 'ebookShoppe UK' @@ -1459,7 +1459,7 @@ plugins += [ StoreEbookNLStore, StoreEbookscomStore, StoreEBookShoppeUKStore, - StoreEPubBuyDEStore, +# StoreEPubBuyDEStore, StoreEHarlequinStore, StoreEpubBudStore, StoreFeedbooksStore, diff --git a/src/calibre/gui2/store/declined.txt b/src/calibre/gui2/store/declined.txt index 3e553f2dc8..b109d30d50 100644 --- a/src/calibre/gui2/store/declined.txt +++ b/src/calibre/gui2/store/declined.txt @@ -4,3 +4,4 @@ or asked not to be included in the store integration. * Borders (http://www.borders.com/). * Indigo (http://www.chapters.indigo.ca/). * Libraria Rizzoli (http://libreriarizzoli.corriere.it/). +* EPubBuy DE: reason: too much traffic for too little sales From 35e3b759da104b8c7152fd13b29ccd953c38a676 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomasz=20D=C5=82ugosz?= Date: Tue, 19 Jul 2011 14:28:10 +0200 Subject: [PATCH 21/30] bookoteka plugin --- src/calibre/customize/builtins.py | 11 +++ .../gui2/store/stores/bookoteka_plugin.py | 78 +++++++++++++++++++ 2 files changed, 89 insertions(+) create mode 100644 src/calibre/gui2/store/stores/bookoteka_plugin.py diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 7b43f2844a..1524ca6184 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -1181,6 +1181,16 @@ class StoreBeWriteStore(StoreBase): headquarters = 'US' formats = ['EPUB', 'MOBI', 'PDF'] +class StoreBookotekaStore(StoreBase): + name = 'Bookoteka' + author = u'Tomasz Długosz' + description = u'E-booki w Bookotece dostępne są w formacie EPUB oraz PDF. Publikacje sprzedawane w Bookotece są objęte prawami autorskimi. Zobowiązaliśmy się chronić te prawa, ale bez ograniczania dostępu do książki użytkownikowi, który nabył ją w legalny sposób. Dlatego też Bookoteka stosuje tak zwany „watermarking transakcyjny” czyli swego rodzaju znaki wodne.' + actual_plugin = 'calibre.gui2.store.stores.bookoteka_plugin:BookotekaStore' + + drm_free_only = True + headquarters = 'PL' + formats = ['EPUB', 'PDF'] + class StoreDieselEbooksStore(StoreBase): name = 'Diesel eBooks' description = u'Instant access to over 2.4 million titles from hundreds of publishers including Harlequin, HarperCollins, John Wiley & Sons, McGraw-Hill, Simon & Schuster and Random House.' @@ -1455,6 +1465,7 @@ plugins += [ StoreBNStore, StoreBeamEBooksDEStore, StoreBeWriteStore, + StoreBookotekaStore, StoreDieselEbooksStore, StoreEbookNLStore, StoreEbookscomStore, diff --git a/src/calibre/gui2/store/stores/bookoteka_plugin.py b/src/calibre/gui2/store/stores/bookoteka_plugin.py new file mode 100644 index 0000000000..a75b8cdb46 --- /dev/null +++ b/src/calibre/gui2/store/stores/bookoteka_plugin.py @@ -0,0 +1,78 @@ +# -*- coding: utf-8 -*- + +from __future__ import (unicode_literals, division, absolute_import, print_function) + +__license__ = 'GPL 3' +__copyright__ = '2011, Tomasz Długosz ' +__docformat__ = 'restructuredtext en' + +import re +import urllib +from contextlib import closing + +from lxml import html + +from PyQt4.Qt import QUrl + +from calibre import browser, url_slash_cleaner +from calibre.gui2 import open_url +from calibre.gui2.store import StorePlugin +from calibre.gui2.store.basic_config import BasicStoreConfig +from calibre.gui2.store.search_result import SearchResult +from calibre.gui2.store.web_store_dialog import WebStoreDialog + +class BookotekaStore(BasicStoreConfig, StorePlugin): + + def open(self, parent=None, detail_item=None, external=False): + + url = 'http://bookoteka.pl/ebooki' + detail_url = None + + if detail_item: + detail_url = detail_item + + if external or self.config.get('open_external', False): + open_url(QUrl(url_slash_cleaner(detail_url if detail_url else url))) + else: + d = WebStoreDialog(self.gui, url, parent, detail_url) + d.setWindowTitle(self.name) + d.set_tags(self.config.get('tags', '')) + d.exec_() + + def search(self, query, max_results=10, timeout=60): + url = 'http://bookoteka.pl/list?search=' + urllib.quote_plus(query) + '&cat=1&hp=1&type=1' + + br = browser() + + counter = max_results + with closing(br.open(url, timeout=timeout)) as f: + doc = html.fromstring(f.read()) + for data in doc.xpath('//li[@class="EBOOK"]'): + if counter <= 0: + break + + id = ''.join(data.xpath('.//a[@class="item_link"]/@href')) + if not id: + continue + + cover_url = ''.join(data.xpath('.//a[@class="item_link"]/@style')) + cover_url = re.sub(r'.*\(', '', cover_url) + cover_url = re.sub(r'\).*', '', cover_url) + title = ''.join(data.xpath('.//div[@class="shelf_title"]/a/text()')) + author = ''.join(data.xpath('.//div[@class="shelf_authors"]/text()')) + price = ''.join(data.xpath('.//span[@class="EBOOK"]/text()')) + price = price.replace('.', ',') + formats = ', '.join(data.xpath('.//a[@class="fancybox protected"]/text()')) + + counter -= 1 + + s = SearchResult() + s.cover_url = 'http://bookoteka.pl' + cover_url + s.title = title.strip() + s.author = author.strip() + s.price = price + s.detail_item = 'http://bookoteka.pl' + id.strip() + s.drm = SearchResult.DRM_UNLOCKED + s.formats = formats.strip() + + yield s From 1b136b6fec5ec4c6eba6f14decce2f2ec4b11e67 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 19 Jul 2011 07:05:05 -0600 Subject: [PATCH 22/30] Fix #812750 (Literati (aka Azbooka) does not detect SD Card) --- src/calibre/devices/hanvon/driver.py | 2 +- src/calibre/ebooks/mobi/debug.py | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/src/calibre/devices/hanvon/driver.py b/src/calibre/devices/hanvon/driver.py index 3ce0fedac0..11b064b783 100644 --- a/src/calibre/devices/hanvon/driver.py +++ b/src/calibre/devices/hanvon/driver.py @@ -131,7 +131,7 @@ class AZBOOKA(ALEX): description = _('Communicate with the Azbooka') VENDOR_NAME = 'LINUX' - WINDOWS_MAIN_MEM = 'FILE-STOR_GADGET' + WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = 'FILE-STOR_GADGET' MAIN_MEMORY_VOLUME_LABEL = 'Azbooka Internal Memory' diff --git a/src/calibre/ebooks/mobi/debug.py b/src/calibre/ebooks/mobi/debug.py index 32578781b8..8ffa3aa15b 100644 --- a/src/calibre/ebooks/mobi/debug.py +++ b/src/calibre/ebooks/mobi/debug.py @@ -574,6 +574,13 @@ class Tag(object): # {{{ class IndexEntry(object): # {{{ + ''' + The index is made up of entries, each of which is represented by an + instance of this class. Index entries typically point to offsets int eh + HTML, specify HTML sizes and point to text strings in the CNCX that are + used in the navigation UI. + ''' + TYPES = { # Present in book type files 0x0f : 'chapter', @@ -619,6 +626,11 @@ class IndexEntry(object): # {{{ class IndexRecord(object): # {{{ + ''' + Represents all indexing information in the MOBI, apart from indexing info + in the trailing data of the text records. + ''' + def __init__(self, record, index_header, cncx): self.record = record raw = self.record.raw @@ -684,6 +696,12 @@ class IndexRecord(object): # {{{ class CNCX(object) : # {{{ + ''' + Parses the records that contain the compiled NCX (all strings from the + NCX). Presents a simple offset : string mapping interface to access the + data. + ''' + def __init__(self, records, codec): self.records = OrderedDict() pos = 0 From 7612ec7ad840353a9679921284177171561c9ddb Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 19 Jul 2011 07:10:24 -0600 Subject: [PATCH 23/30] Updated NBObline and JBPress --- recipes/jbpress.recipe | 11 ++++++++++- recipes/nbonline.recipe | 7 +++---- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/recipes/jbpress.recipe b/recipes/jbpress.recipe index acfb1c78d6..1048f1fc9a 100644 --- a/recipes/jbpress.recipe +++ b/recipes/jbpress.recipe @@ -1,4 +1,4 @@ -import urllib2 +import urllib2, re from calibre.web.feeds.news import BasicNewsRecipe class JBPress(BasicNewsRecipe): @@ -40,3 +40,12 @@ class JBPress(BasicNewsRecipe): def print_version(self, url): url = urllib2.urlopen(url).geturl() # resolve redirect. return url.replace('/-/', '/print/') + + def preprocess_html(self, soup): + # remove breadcrumb + h3s = soup.findAll('h3') + for h3 in h3s: + if re.compile('^JBpress>').match(h3.string): + h3.extract() + return soup + diff --git a/recipes/nbonline.recipe b/recipes/nbonline.recipe index c5a06edec7..82b7667a5c 100644 --- a/recipes/nbonline.recipe +++ b/recipes/nbonline.recipe @@ -1,11 +1,10 @@ -EMAILADDRESS = 'hoge@foobar.co.jp' from calibre.web.feeds.news import BasicNewsRecipe class NBOnline(BasicNewsRecipe): title = u'Nikkei Business Online' language = 'ja' - description = u'Nikkei Business Online New articles. PLEASE NOTE: You need to edit EMAILADDRESS line of this "nbonline.recipe" file to set your e-mail address which is needed when login. (file is in "Calibre2/resources/recipes" directory.)' + description = u'Nikkei Business Online.\u6CE8\uFF1A\u30E6\u30FC\u30B6\u30FC\u540D\u306Bemail\u30A2\u30C9\u30EC\u30B9\u3068\u30E6\u30FC\u30B6\u30FC\u540D\u3092\u30BB\u30DF\u30B3\u30ED\u30F3\u3067\u533A\u5207\u3063\u3066\u5165\u308C\u3066\u304F\u3060\u3055\u3044\u3002\u4F8B\uFF1Aemail@address.jp;username . PLEASE NOTE: You need to put your email address and username into username filed separeted by ; (semi-colon).' __author__ = 'Ado Nishimura' needs_subscription = True oldest_article = 7 @@ -23,8 +22,8 @@ class NBOnline(BasicNewsRecipe): if self.username is not None and self.password is not None: br.open('https://signon.nikkeibp.co.jp/front/login/?ct=p&ts=nbo') br.select_form(name='loginActionForm') - br['email'] = EMAILADDRESS - br['userId'] = self.username + br['email'] = self.username.split(';')[0] + br['userId'] = self.username.split(';')[1] br['password'] = self.password br.submit() return br From 93fef1787ed9df4f0d5a02a849603b44d3c9d18f Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 19 Jul 2011 07:17:20 -0600 Subject: [PATCH 24/30] =?UTF-8?q?De=20Luns=20a=20Venres=20by=20Susana=20So?= =?UTF-8?q?telo=20Doc=C3=ADo?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- recipes/galicia_confidential.recipe | 22 +++++++-------- recipes/luns_a_venres.recipe | 44 +++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 12 deletions(-) create mode 100644 recipes/luns_a_venres.recipe diff --git a/recipes/galicia_confidential.recipe b/recipes/galicia_confidential.recipe index d07946001e..4aaf434b09 100644 --- a/recipes/galicia_confidential.recipe +++ b/recipes/galicia_confidential.recipe @@ -1,5 +1,4 @@ # -*- coding: utf-8 -*- - from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds import Feed @@ -36,14 +35,13 @@ class GC_gl(BasicNewsRecipe): def feed_to_index_append(self, feedObject, masterFeed): - for feed in feedObject: - newArticles = [] - for article in feed.articles: - newArt = { - 'title' : article.title, - 'url' : article.url, - 'date' : article.date - } - newArticles.append(newArt) - masterFeed.append((feed.title,newArticles)) - + for feed in feedObject: + newArticles = [] + for article in feed.articles: + newArt = { + 'title' : article.title, + 'url' : article.url, + 'date' : article.date + } + newArticles.append(newArt) + masterFeed.append((feed.title,newArticles)) diff --git a/recipes/luns_a_venres.recipe b/recipes/luns_a_venres.recipe new file mode 100644 index 0000000000..1d7a2c159f --- /dev/null +++ b/recipes/luns_a_venres.recipe @@ -0,0 +1,44 @@ +# -*- coding: utf-8 -*- +from calibre.web.feeds.news import BasicNewsRecipe + +class LV_gl(BasicNewsRecipe): + title = u'De Luns a Venres (RSS)' + __author__ = u'Susana Sotelo Docío' + description = u'O gratuíto galego' + publisher = u'Galiciaé' + category = u'news' + encoding = 'utf-8' + language = 'gl' + direction = 'ltr' + cover_url = 'http://lv.galiciae.com/new_estilos/lv/logo.gif' + oldest_article = 2 + max_articles_per_feed = 200 + center_navbar = False + + feeds = [ + (u'Galicia', u'http://lv.galiciae.com/cache/rss/sec_galicia_gl.rss'), + (u'Cultura', u'http://lv.galiciae.com/cache/rss/sec_cultura_gl.rss'), + (u'Mundo', u'http://lv.galiciae.com/cache/rss/sec_mundo_gl.rss'), + (u'Cidadanía', u'http://lv.galiciae.com/cache/rss/sec_ciudadania_gl.rss'), + (u'Tecnoloxía', u'http://lv.galiciae.com/cache/rss/sec_tecnologia_gl.rss'), + (u'España', u'http://lv.galiciae.com/cache/rss/sec_espana_gl.rss'), + (u'Deportes', u'http://lv.galiciae.com/cache/rss/sec_deportes_gl.rss'), + (u'Economía', u'http://lv.galiciae.com/cache/rss/sec_economia_gl.rss'), + (u'Lercheo', u'http://lv.galiciae.com/cache/rss/sec_gente_gl.rss'), + (u'Medio ambiente', u'http://lv.galiciae.com/cache/rss/sec_medioambiente_gl.rss'), + (u'España/Mundo', u'http://lv.galiciae.com/cache/rss/sec_espanamundo_gl.rss'), + (u'Sociedade', u'http://lv.galiciae.com/cache/rss/sec_sociedad_gl.rss'), + (u'Ciencia', u'http://lv.galiciae.com/cache/rss/sec_ciencia_gl.rss'), + (u'Motor', u'http://lv.galiciae.com/cache/rss/sec_motor_gl.rss'), + (u'Coches', u'http://lv.galiciae.com/cache/rss/sec_coches_gl.rss'), + (u'Motos', u'http://lv.galiciae.com/cache/rss/sec_motos_gl.rss'), + (u'Industriais', u'http://lv.galiciae.com/cache/rss/sec_industriales_gl.rss') + ] + + extra_css = u' p{text-align:left} ' + html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\nencoding="' + encoding + '"\ntags="' + category + '"\noverride_css=" p {text-align:left; text-indent: 0cm} "' + + def print_version(self, url): + url += '?imprimir&lang=gl' + return url + From 9cc367ae24ee341e507fbd1cf815de527d54195a Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 19 Jul 2011 08:09:11 -0600 Subject: [PATCH 25/30] MOBI Input: When extracting images, ignore records that are known as non images faster --- src/calibre/ebooks/mobi/reader.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/calibre/ebooks/mobi/reader.py b/src/calibre/ebooks/mobi/reader.py index 1173b84266..d704379cf1 100644 --- a/src/calibre/ebooks/mobi/reader.py +++ b/src/calibre/ebooks/mobi/reader.py @@ -933,6 +933,9 @@ class MobiReader(object): continue processed_records.append(i) data = self.sections[i][0] + if data[:4] in (b'FLIS', b'FCIS', b'SRCS', b'\xe9\x8e\r\n'): + # A FLIS, FCIS, SRCS or EOF record, ignore + continue buf = cStringIO.StringIO(data) image_index += 1 try: From c7ea8f4886d84035dde2c99b53cbc1b27c436596 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 19 Jul 2011 09:50:02 -0600 Subject: [PATCH 26/30] Conversion pipeline: When adding/removing entries to the manifest, ignore unparseable URLs instead of erroring out on them --- src/calibre/ebooks/oeb/reader.py | 9 +++++++-- src/calibre/ebooks/oeb/transforms/trimmanifest.py | 5 ++++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/src/calibre/ebooks/oeb/reader.py b/src/calibre/ebooks/oeb/reader.py index 5bb6b193f7..9e4b6238a0 100644 --- a/src/calibre/ebooks/oeb/reader.py +++ b/src/calibre/ebooks/oeb/reader.py @@ -188,8 +188,13 @@ class OEBReader(object): href, _ = urldefrag(href) if not href: continue - href = item.abshref(urlnormalize(href)) - scheme = urlparse(href).scheme + try: + href = item.abshref(urlnormalize(href)) + scheme = urlparse(href).scheme + except: + self.oeb.log.exception( + 'Skipping invalid href: %r'%href) + continue if not scheme and href not in known: new.add(href) elif item.media_type in OEB_STYLES: diff --git a/src/calibre/ebooks/oeb/transforms/trimmanifest.py b/src/calibre/ebooks/oeb/transforms/trimmanifest.py index 95501dbb9b..3d56f0ef3d 100644 --- a/src/calibre/ebooks/oeb/transforms/trimmanifest.py +++ b/src/calibre/ebooks/oeb/transforms/trimmanifest.py @@ -47,7 +47,10 @@ class ManifestTrimmer(object): item.data is not None: hrefs = [r[2] for r in iterlinks(item.data)] for href in hrefs: - href = item.abshref(urlnormalize(href)) + try: + href = item.abshref(urlnormalize(href)) + except: + continue if href in oeb.manifest.hrefs: found = oeb.manifest.hrefs[href] if found not in used: From c0bb5902ac5d712d9549438ff96ad1e7a1e25f51 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 19 Jul 2011 09:50:33 -0600 Subject: [PATCH 27/30] Mobi debug: Dump text/image and unparsed binary records --- src/calibre/ebooks/mobi/debug.py | 125 +++++++++++++++++++++-- src/calibre/ebooks/mobi/writer2/utils.py | 23 +++++ 2 files changed, 142 insertions(+), 6 deletions(-) diff --git a/src/calibre/ebooks/mobi/debug.py b/src/calibre/ebooks/mobi/debug.py index 8ffa3aa15b..2dbe363e7c 100644 --- a/src/calibre/ebooks/mobi/debug.py +++ b/src/calibre/ebooks/mobi/debug.py @@ -7,11 +7,13 @@ __license__ = 'GPL v3' __copyright__ = '2011, Kovid Goyal ' __docformat__ = 'restructuredtext en' -import struct, datetime, sys, os +import struct, datetime, sys, os, shutil from collections import OrderedDict from calibre.utils.date import utc_tz from calibre.ebooks.mobi.langcodes import main_language, sub_language -from calibre.ebooks.mobi.writer2.utils import decode_hex_number, decint +from calibre.ebooks.mobi.writer2.utils import (decode_hex_number, decint, + get_trailing_data) +from calibre.utils.magick.draw import identify_data # PalmDB {{{ class PalmDOCAttributes(object): @@ -278,6 +280,7 @@ class MOBIHeader(object): # {{{ self.has_extra_data_flags = self.length >= 232 and len(self.raw) >= 232+16 self.has_fcis_flis = False self.has_multibytes = self.has_indexing_bytes = self.has_uncrossable_breaks = False + self.extra_data_flags = 0 if self.has_extra_data_flags: self.unknown4 = self.raw[180:192] self.first_content_record, self.last_content_record = \ @@ -726,6 +729,63 @@ class CNCX(object) : # {{{ # }}} +class TextRecord(object): # {{{ + + def __init__(self, idx, record, extra_data_flags, decompress): + self.trailing_data, self.raw = get_trailing_data(record.raw, extra_data_flags) + self.raw = decompress(self.raw) + if 0 in self.trailing_data: + self.trailing_data['multibyte_overlap'] = self.trailing_data.pop(0) + if 1 in self.trailing_data: + self.trailing_data['indexing'] = self.trailing_data.pop(1) + if 2 in self.trailing_data: + self.trailing_data['uncrossable_breaks'] = self.trailing_data.pop(2) + + self.idx = idx + + def dump(self, folder): + name = '%06d'%self.idx + with open(os.path.join(folder, name+'.txt'), 'wb') as f: + f.write(self.raw) + with open(os.path.join(folder, name+'.trailing_data'), 'wb') as f: + for k, v in self.trailing_data.iteritems(): + raw = '%s : %r\n\n'%(k, v) + f.write(raw.encode('utf-8')) + +# }}} + +class ImageRecord(object): # {{{ + + def __init__(self, idx, record, fmt): + self.raw = record.raw + self.fmt = fmt + self.idx = idx + + def dump(self, folder): + name = '%06d'%self.idx + with open(os.path.join(folder, name+'.'+self.fmt), 'wb') as f: + f.write(self.raw) + +# }}} + +class BinaryRecord(object): # {{{ + + def __init__(self, idx, record): + self.raw = record.raw + sig = self.raw[:4] + name = '%06d'%idx + if sig in (b'FCIS', b'FLIS', b'SRCS'): + name += '-' + sig.decode('ascii') + elif sig == b'\xe9\x8e\r\n': + name += '-' + 'EOF' + self.name = name + + def dump(self, folder): + with open(os.path.join(folder, self.name+'.bin'), 'wb') as f: + f.write(self.raw) + +# }}} + class MOBIFile(object): # {{{ def __init__(self, stream): @@ -754,7 +814,22 @@ class MOBIFile(object): # {{{ self.mobi_header = MOBIHeader(self.records[0]) + if 'huff' in self.mobi_header.compression.lower(): + huffrecs = [r.raw for r in + xrange(self.mobi_header.huffman_record_offset, + self.mobi_header.huffman_record_offset + + self.mobi_header.huffman_record_count)] + from calibre.ebooks.mobi.huffcdic import HuffReader + huffs = HuffReader(huffrecs) + decompress = huffs.decompress + elif 'palmdoc' in self.mobi_header.compression.lower(): + from calibre.ebooks.compression.palmdoc import decompress_doc + decompress = decompress_doc + else: + decompress = lambda x: x + self.index_header = None + self.indexing_record_nums = set() pir = self.mobi_header.primary_index_record if pir != 0xffffffff: self.index_header = IndexHeader(self.records[pir]) @@ -763,6 +838,34 @@ class MOBIFile(object): # {{{ self.index_header.index_encoding) self.index_record = IndexRecord(self.records[pir+1], self.index_header, self.cncx) + self.indexing_record_nums = set(xrange(pir, + pir+2+self.index_header.num_of_cncx_blocks)) + + + ntr = self.mobi_header.number_of_text_records + fntbr = self.mobi_header.first_non_book_record + fii = self.mobi_header.first_image_index + if fntbr == 0xffffffff: + fntbr = len(self.records) + self.text_records = [TextRecord(r, self.records[r], + self.mobi_header.extra_data_flags, decompress) for r in xrange(1, + min(len(self.records), ntr+1))] + self.image_records, self.binary_records = [], [] + for i in xrange(fntbr, len(self.records)): + if i in self.indexing_record_nums: + continue + r = self.records[i] + fmt = None + if i >= fii and r.raw[:4] not in (b'FLIS', b'FCIS', b'SRCS', + b'\xe9\x8e\r\n'): + try: + width, height, fmt = identify_data(r.raw) + except: + pass + if fmt is not None: + self.image_records.append(ImageRecord(i, r, fmt)) + else: + self.binary_records.append(BinaryRecord(i, r)) def print_header(self, f=sys.stdout): @@ -776,13 +879,16 @@ class MOBIFile(object): # {{{ print (str(self.mobi_header).encode('utf-8'), file=f) # }}} -def inspect_mobi(path_or_stream): +def inspect_mobi(path_or_stream, prefix='decompiled'): stream = (path_or_stream if hasattr(path_or_stream, 'read') else open(path_or_stream, 'rb')) f = MOBIFile(stream) - ddir = 'debug_' + os.path.splitext(os.path.basename(stream.name))[0] - if not os.path.exists(ddir): - os.mkdir(ddir) + ddir = prefix + '_' + os.path.splitext(os.path.basename(stream.name))[0] + try: + shutil.rmtree(ddir) + except: + pass + os.mkdir(ddir) with open(os.path.join(ddir, 'header.txt'), 'wb') as out: f.print_header(f=out) if f.index_header is not None: @@ -793,6 +899,13 @@ def inspect_mobi(path_or_stream): print('\n\n', file=out) print(str(f.index_record), file=out) + for tdir, attr in [('text', 'text_records'), ('images', 'image_records'), + ('binary', 'binary_records')]: + tdir = os.path.join(ddir, tdir) + os.mkdir(tdir) + for rec in getattr(f, attr): + rec.dump(tdir) + print ('Debug data saved to:', ddir) def main(): diff --git a/src/calibre/ebooks/mobi/writer2/utils.py b/src/calibre/ebooks/mobi/writer2/utils.py index 8166bdf328..708b9152d4 100644 --- a/src/calibre/ebooks/mobi/writer2/utils.py +++ b/src/calibre/ebooks/mobi/writer2/utils.py @@ -8,6 +8,7 @@ __copyright__ = '2011, Kovid Goyal ' __docformat__ = 'restructuredtext en' import struct +from collections import OrderedDict from calibre.utils.magick.draw import Image, save_cover_data_to, thumbnail @@ -150,4 +151,26 @@ def rescale_image(data, maxsizeb=IMAGE_MAX_SIZE, dimen=None): scale -= 0.05 return data +def get_trailing_data(record, extra_data_flags): + ''' + Given a text record as a bytestring and the extra data flags from the MOBI + header, return the trailing data as a dictionary, mapping bit number to + data as bytestring. Also returns the record - all trailing data. + + :return: Trailing data, record - trailing data + ''' + data = OrderedDict() + for i in xrange(16, -1, -1): + flag = 2**i + if flag & extra_data_flags: + if i == 0: + # Only the first two bits are used for the size since there can + # never be more than 3 trailing multibyte chars + sz = ord(record[-1]) & 0b11 + consumed = 1 + else: + sz, consumed = decint(record, forward=False) + data[i] = record[-(sz+consumed):-consumed] + record = record[:-(sz+consumed)] + return data, record From 50f642ec07c12338e12e98b58b66cd7672583953 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 19 Jul 2011 21:15:12 -0600 Subject: [PATCH 28/30] Los Andes by Darko Miletic. Fixes #813278 (New recipe for Argentinian newspaper Los Andes) --- recipes/icons/losandes.png | Bin 0 -> 285 bytes recipes/losandes.recipe | 78 +++++++++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+) create mode 100644 recipes/icons/losandes.png create mode 100644 recipes/losandes.recipe diff --git a/recipes/icons/losandes.png b/recipes/icons/losandes.png new file mode 100644 index 0000000000000000000000000000000000000000..635217e7279702682fa4ae5de202e79b02f20267 GIT binary patch literal 285 zcmV+&0pk9NP)=GF*fShy}P<8#i2j_~wg&oFKC54QC$% z#S537{qXe{iUtFDAs{I)!Xv=VhOA-Z*~bnVVjsT$2I@r7fE55u-Ej8F{8URId*P9L zgf#$71^VZ~o39EYTnBEvdhq%S0Syy(-2i&`!1b378lYfUaQGf}4M3MD7Oe-G_u<>G z{uq;nAie%*6R@l9LKJ|4)v#vkg-0I*I9Uzk1ur~&FTl+T*7 Date: Tue, 19 Jul 2011 23:00:13 -0600 Subject: [PATCH 29/30] Mobi debug: Interpret the TBS index entries for book type documents --- src/calibre/ebooks/mobi/debug.py | 69 ++++++++++++++++++++++-- src/calibre/ebooks/mobi/writer2/utils.py | 5 +- 2 files changed, 69 insertions(+), 5 deletions(-) diff --git a/src/calibre/ebooks/mobi/debug.py b/src/calibre/ebooks/mobi/debug.py index 2dbe363e7c..9bc587c527 100644 --- a/src/calibre/ebooks/mobi/debug.py +++ b/src/calibre/ebooks/mobi/debug.py @@ -618,6 +618,13 @@ class IndexEntry(object): # {{{ vals.append(val) self.tags.append(Tag(tag, vals, self.entry_type, cncx)) + @property + def label(self): + for tag in self.tags: + if tag.attr == 'label_offset': + return tag.cncx_value + return '' + def __str__(self): ans = ['Index Entry(index=%s, entry_type=%s, length=%d)'%( self.index, self.entry_type, len(self.tags))] @@ -731,7 +738,8 @@ class CNCX(object) : # {{{ class TextRecord(object): # {{{ - def __init__(self, idx, record, extra_data_flags, decompress): + def __init__(self, idx, record, extra_data_flags, decompress, index_record, + doc_type): self.trailing_data, self.raw = get_trailing_data(record.raw, extra_data_flags) self.raw = decompress(self.raw) if 0 in self.trailing_data: @@ -743,6 +751,60 @@ class TextRecord(object): # {{{ self.idx = idx + if 'indexing' in self.trailing_data and index_record is not None: + self.interpret_indexing(doc_type, index_record.indices) + + def interpret_indexing(self, doc_type, indices): + raw = self.trailing_data['indexing'] + ident, consumed = decint(raw) + raw = raw[consumed:] + entry_type = ident & 0b111 + index_entry_idx = ident >> 3 + index_entry = None + for i in indices: + if i.index == index_entry_idx: + index_entry = i.label + break + self.trailing_data['interpreted_indexing'] = ( + 'Type: %s, Index Entry: %s'%(entry_type, index_entry)) + if doc_type == 2: # Book + self.interpret_book_indexing(raw, entry_type) + + def interpret_book_indexing(self, raw, entry_type): + arg1, consumed = decint(raw) + raw = raw[consumed:] + if arg1 != 0: + raise ValueError('TBS index entry has unknown arg1: %d'% + arg1) + if entry_type == 2: + desc = ('This record has only a single starting or a single' + ' ending point') + if raw: + raise ValueError('TBS index entry has unknown extra bytes:' + ' %r'%raw) + elif entry_type == 3: + desc = ('This record is spanned by a single node (i.e. it' + ' has no start or end points)') + arg2, consumed = decint(raw) + if arg2 != 0: + raise ValueError('TBS index entry has unknown arg2: %d'% + arg2) + elif entry_type == 6: + if len(raw) != 1: + raise ValueError('TBS index entry has unknown extra bytes:' + ' %r'%raw) + num = ord(raw[0]) + # An unmatched starting or ending point each contributes 1 to + # this count. A matched pair of starting and ending points + # together contribute 1 to this count. Note that you can only + # ever have either 1 unmatched start point or 1 unmatched end + # point, never both (logically impossible). + desc = ('This record has %d starting/ending points and/or complete' + ' nodes.')%num + else: + raise ValueError('Unknown TBS index entry type: %d for book'%entry_type) + self.trailing_data['interpreted_indexing'] += ' :: ' + desc + def dump(self, folder): name = '%06d'%self.idx with open(os.path.join(folder, name+'.txt'), 'wb') as f: @@ -828,7 +890,7 @@ class MOBIFile(object): # {{{ else: decompress = lambda x: x - self.index_header = None + self.index_header = self.index_record = None self.indexing_record_nums = set() pir = self.mobi_header.primary_index_record if pir != 0xffffffff: @@ -848,7 +910,8 @@ class MOBIFile(object): # {{{ if fntbr == 0xffffffff: fntbr = len(self.records) self.text_records = [TextRecord(r, self.records[r], - self.mobi_header.extra_data_flags, decompress) for r in xrange(1, + self.mobi_header.extra_data_flags, decompress, self.index_record, + self.mobi_header.type_raw) for r in xrange(1, min(len(self.records), ntr+1))] self.image_records, self.binary_records = [], [] for i in xrange(fntbr, len(self.records)): diff --git a/src/calibre/ebooks/mobi/writer2/utils.py b/src/calibre/ebooks/mobi/writer2/utils.py index 708b9152d4..1c2d3a110d 100644 --- a/src/calibre/ebooks/mobi/writer2/utils.py +++ b/src/calibre/ebooks/mobi/writer2/utils.py @@ -170,7 +170,8 @@ def get_trailing_data(record, extra_data_flags): consumed = 1 else: sz, consumed = decint(record, forward=False) - data[i] = record[-(sz+consumed):-consumed] - record = record[:-(sz+consumed)] + if sz > consumed: + data[i] = record[-sz:-consumed] + record = record[:-sz] return data, record From 7f5651e0bd61b9fbf4efecb65e8d433f8fc38b1b Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 19 Jul 2011 23:11:03 -0600 Subject: [PATCH 30/30] ... --- src/calibre/ebooks/mobi/writer2/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/mobi/writer2/utils.py b/src/calibre/ebooks/mobi/writer2/utils.py index 1c2d3a110d..dc9526eb77 100644 --- a/src/calibre/ebooks/mobi/writer2/utils.py +++ b/src/calibre/ebooks/mobi/writer2/utils.py @@ -166,7 +166,7 @@ def get_trailing_data(record, extra_data_flags): if i == 0: # Only the first two bits are used for the size since there can # never be more than 3 trailing multibyte chars - sz = ord(record[-1]) & 0b11 + sz = (ord(record[-1]) & 0b11) + 1 consumed = 1 else: sz, consumed = decint(record, forward=False)