From 5ab1e0d1154ecc1e7d2a19893ab8f6bce5e0e850 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomasz=20D=C5=82ugosz?= Date: Fri, 16 Mar 2012 23:59:59 +0100 Subject: [PATCH 01/27] load more than 10 results in nexto --- src/calibre/gui2/store/stores/nexto_plugin.py | 78 ++++++++++--------- 1 file changed, 42 insertions(+), 36 deletions(-) diff --git a/src/calibre/gui2/store/stores/nexto_plugin.py b/src/calibre/gui2/store/stores/nexto_plugin.py index 16004908df..f7572e6522 100644 --- a/src/calibre/gui2/store/stores/nexto_plugin.py +++ b/src/calibre/gui2/store/stores/nexto_plugin.py @@ -3,7 +3,7 @@ from __future__ import (unicode_literals, division, absolute_import, print_function) __license__ = 'GPL 3' -__copyright__ = '2011, Tomasz Długosz ' +__copyright__ = '2011-2012, Tomasz Długosz ' __docformat__ = 'restructuredtext en' import re @@ -47,41 +47,47 @@ class NextoStore(BasicStoreConfig, StorePlugin): url = 'http://www.nexto.pl/szukaj.xml?search-clause=' + urllib.quote_plus(query) + '&scid=1015' br = browser() + offset=0 counter = max_results - with closing(br.open(url, timeout=timeout)) as f: - doc = html.fromstring(f.read()) - for data in doc.xpath('//ul[@class="productslist"]/li'): - if counter <= 0: + + while counter: + with closing(br.open(url + '&_offset=' + str(offset), timeout=timeout)) as f: + doc = html.fromstring(f.read()) + for data in doc.xpath('//ul[@class="productslist"]/li'): + if counter <= 0: + break + + id = ''.join(data.xpath('.//div[@class="cover_container"]/a[1]/@href')) + if not id: + continue + + price = ''.join(data.xpath('.//strong[@class="nprice"]/text()')) + + cover_url = ''.join(data.xpath('.//img[@class="cover"]/@src')) + title = ''.join(data.xpath('.//a[@class="title"]/text()')) + title = re.sub(r' - ebook$', '', title) + formats = ', '.join(data.xpath('.//ul[@class="formats_available"]/li//b/text()')) + DrmFree = re.search(r'bez.DRM', formats) + formats = re.sub(r'\(.+\)', '', formats) + + author = '' + with 
closing(br.open('http://www.nexto.pl/' + id.strip(), timeout=timeout/4)) as nf: + idata = html.fromstring(nf.read()) + author = ', '.join(idata.xpath('//div[@class="basic_data"]/p[1]/b/a/text()')) + + counter -= 1 + + s = SearchResult() + s.cover_url = cover_url + s.title = title.strip() + s.author = author.strip() + s.price = price + s.detail_item = id.strip() + s.drm = SearchResult.DRM_UNLOCKED if DrmFree else SearchResult.DRM_LOCKED + s.formats = formats.upper().strip() + + yield s + if not doc.xpath('//div[@class="listnavigator"]//a[@class="next"]'): break - - id = ''.join(data.xpath('.//div[@class="cover_container"]/a[1]/@href')) - if not id: - continue - - price = ''.join(data.xpath('.//strong[@class="nprice"]/text()')) - - cover_url = ''.join(data.xpath('.//img[@class="cover"]/@src')) - title = ''.join(data.xpath('.//a[@class="title"]/text()')) - title = re.sub(r' - ebook$', '', title) - formats = ', '.join(data.xpath('.//ul[@class="formats_available"]/li//b/text()')) - DrmFree = re.search(r'bez.DRM', formats) - formats = re.sub(r'\(.+\)', '', formats) - - author = '' - with closing(br.open('http://www.nexto.pl/' + id.strip(), timeout=timeout/4)) as nf: - idata = html.fromstring(nf.read()) - author = ', '.join(idata.xpath('//div[@class="basic_data"]/p[1]/b/a/text()')) - - counter -= 1 - - s = SearchResult() - s.cover_url = cover_url - s.title = title.strip() - s.author = author.strip() - s.price = price - s.detail_item = id.strip() - s.drm = SearchResult.DRM_UNLOCKED if DrmFree else SearchResult.DRM_LOCKED - s.formats = formats.upper().strip() - - yield s + offset+=10 From 68216a0f7f5c669bfe55895c85fbbc2a1b3c6c05 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 17 Mar 2012 10:30:53 +0530 Subject: [PATCH 02/27] ... 
--- src/calibre/gui2/widgets.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/calibre/gui2/widgets.py b/src/calibre/gui2/widgets.py index f3badd91c9..6d238c84d3 100644 --- a/src/calibre/gui2/widgets.py +++ b/src/calibre/gui2/widgets.py @@ -15,6 +15,7 @@ from PyQt4.Qt import (QIcon, QFont, QLabel, QListWidget, QAction, QMenu, QStringListModel, QCompleter, QStringList, QTimer, QRect, QFontDatabase, QGraphicsView) +from calibre.constants import iswindows from calibre.gui2 import (NONE, error_dialog, pixmap_to_data, gprefs, warning_dialog) from calibre.gui2.filename_pattern_ui import Ui_Form @@ -365,7 +366,7 @@ class FontFamilyModel(QAbstractListModel): # {{{ self.families = list(qt_families.intersection(set(self.families))) self.families.sort() self.families[:0] = [_('None')] - self.font = QFont('sansserif') + self.font = QFont('verdana' if iswindows else 'sansserif') def rowCount(self, *args): return len(self.families) From 402d28a8f00b2d31d47c8bf599ebb6ed038e67b3 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 17 Mar 2012 10:35:06 +0530 Subject: [PATCH 03/27] RTF Output: Fix bug that broke conversion to RTF when the input document contains tags with no src attribute. --- src/calibre/ebooks/rtf/rtfml.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/calibre/ebooks/rtf/rtfml.py b/src/calibre/ebooks/rtf/rtfml.py index 60f69e2e17..0fdc6cad1d 100644 --- a/src/calibre/ebooks/rtf/rtfml.py +++ b/src/calibre/ebooks/rtf/rtfml.py @@ -234,13 +234,15 @@ class RTFMLizer(object): # Process tags that need special processing and that do not have inner # text. 
Usually these require an argument if tag == 'img': - src = os.path.basename(elem.get('src')) - block_start = '' - block_end = '' - if 'block' not in tag_stack: - block_start = '{\\par\\pard\\hyphpar ' - block_end = '}' - text += '%s SPECIAL_IMAGE-%s-REPLACE_ME %s' % (block_start, src, block_end) + src = elem.get('src') + if src: + src = os.path.basename(elem.get('src')) + block_start = '' + block_end = '' + if 'block' not in tag_stack: + block_start = '{\\par\\pard\\hyphpar ' + block_end = '}' + text += '%s SPECIAL_IMAGE-%s-REPLACE_ME %s' % (block_start, src, block_end) single_tag = SINGLE_TAGS.get(tag, None) if single_tag: From b04e3e74de3d3118ef1b589ba64a81093dac9a71 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 17 Mar 2012 11:07:13 +0530 Subject: [PATCH 04/27] ... --- src/calibre/gui2/widgets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/gui2/widgets.py b/src/calibre/gui2/widgets.py index 6d238c84d3..c9a3061295 100644 --- a/src/calibre/gui2/widgets.py +++ b/src/calibre/gui2/widgets.py @@ -366,7 +366,7 @@ class FontFamilyModel(QAbstractListModel): # {{{ self.families = list(qt_families.intersection(set(self.families))) self.families.sort() self.families[:0] = [_('None')] - self.font = QFont('verdana' if iswindows else 'sansserif') + self.font = QFont('Verdana' if iswindows else 'sansserif') def rowCount(self, *args): return len(self.families) From 91a4bd7d429460c5bba769a4af9862b4d567752a Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 17 Mar 2012 14:51:55 +0530 Subject: [PATCH 05/27] Upgrade version of Qt in windows builds to 4.8.0 --- setup/installer/windows/freeze.py | 2 +- setup/installer/windows/notes.rst | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/setup/installer/windows/freeze.py b/setup/installer/windows/freeze.py index 69e669566d..3e251d2dcf 100644 --- a/setup/installer/windows/freeze.py +++ b/setup/installer/windows/freeze.py @@ -14,7 +14,7 @@ from setup.build_environment import 
msvc, MT, RC from setup.installer.windows.wix import WixMixIn OPENSSL_DIR = r'Q:\openssl' -QT_DIR = 'Q:\\Qt\\4.7.3' +QT_DIR = 'Q:\\Qt\\4.8.0' QT_DLLS = ['Core', 'Gui', 'Network', 'Svg', 'WebKit', 'Xml', 'XmlPatterns'] LIBUNRAR = 'C:\\Program Files\\UnrarDLL\\unrar.dll' SW = r'C:\cygwin\home\kovid\sw' diff --git a/setup/installer/windows/notes.rst b/setup/installer/windows/notes.rst index d063a19249..36acacdb55 100644 --- a/setup/installer/windows/notes.rst +++ b/setup/installer/windows/notes.rst @@ -97,7 +97,9 @@ Now, run configure and make:: -no-plugin-manifests is needed so that loading the plugins does not fail looking for the CRT assembly - configure -opensource -release -qt-zlib -qt-gif -qt-libmng -qt-libpng -qt-libtiff -qt-libjpeg -release -platform win32-msvc2008 -no-qt3support -webkit -xmlpatterns -no-phonon -no-style-plastique -no-style-cleanlooks -no-style-motif -no-style-cde -no-declarative -no-scripttools -no-audio-backend -no-multimedia -no-dbus -no-openvg -no-opengl -no-qt3support -confirm-license -nomake examples -nomake demos -nomake docs -no-plugin-manifests -openssl -I Q:\openssl\include -L Q:\openssl\lib && nmake + configure -opensource -release -qt-zlib -qt-libmng -qt-libpng -qt-libtiff -qt-libjpeg -release -platform win32-msvc2008 -no-qt3support -webkit -xmlpatterns -no-phonon -no-style-plastique -no-style-cleanlooks -no-style-motif -no-style-cde -no-declarative -no-scripttools -no-audio-backend -no-multimedia -no-dbus -no-openvg -no-opengl -no-qt3support -confirm-license -nomake examples -nomake demos -nomake docs -no-plugin-manifests -openssl -I Q:\openssl\include -L Q:\openssl\lib && nmake + +Add the path to the bin folder inside the Qt dir to your system PATH. 
SIP ----- From c87ad6d69f324b32ba0ac375b09955b7b84617f8 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 17 Mar 2012 15:31:05 +0530 Subject: [PATCH 06/27] Refactor inspect MOBI to use the INDX reading code from mobi.reader --- src/calibre/ebooks/mobi/debug.py | 284 +++++------------------- src/calibre/ebooks/mobi/reader/index.py | 163 ++++++++------ src/calibre/ebooks/mobi/reader/mobi8.py | 1 + src/calibre/ebooks/mobi/reader/ncx.py | 59 +++-- src/calibre/ebooks/mobi/utils.py | 23 +- 5 files changed, 193 insertions(+), 337 deletions(-) diff --git a/src/calibre/ebooks/mobi/debug.py b/src/calibre/ebooks/mobi/debug.py index b12c9d2121..35484d0b39 100644 --- a/src/calibre/ebooks/mobi/debug.py +++ b/src/calibre/ebooks/mobi/debug.py @@ -15,6 +15,8 @@ from lxml import html from calibre.utils.date import utc_tz from calibre.ebooks.mobi.langcodes import main_language, sub_language from calibre.ebooks.mobi.reader.headers import NULL_INDEX +from calibre.ebooks.mobi.reader.index import (parse_index_record, + parse_tagx_section) from calibre.ebooks.mobi.utils import (decode_hex_number, decint, get_trailing_data, decode_tbs, read_font_record) from calibre.utils.magick.draw import identify_data @@ -405,14 +407,10 @@ class MOBIHeader(object): # {{{ class TagX(object): # {{{ - def __init__(self, raw): - self.tag = ord(raw[0]) - self.num_values = ord(raw[1]) - self.bitmask = ord(raw[2]) - # End of file = 1 iff last entry - # When it is 1 all others are 0 - self.eof = ord(raw[3]) - + def __init__(self, tag, num_values, bitmask, eof): + self.tag, self.num_values, self.bitmask, self.eof = (tag, num_values, + bitmask, eof) + self.num_of_values = num_values self.is_eof = (self.eof == 1 and self.tag == 0 and self.num_values == 0 and self.bitmask == 0) @@ -459,13 +457,7 @@ class SecondaryIndexHeader(object): # {{{ raise ValueError('Invalid TAGX section') self.tagx_header_length, = struct.unpack('>I', tagx[4:8]) self.tagx_control_byte_count, = struct.unpack('>I', tagx[8:12]) - tag_table 
= tagx[12:self.tagx_header_length] - if len(tag_table) % 4 != 0: - raise ValueError('Invalid Tag table') - num_tagx_entries = len(tag_table) // 4 - self.tagx_entries = [] - for i in range(num_tagx_entries): - self.tagx_entries.append(TagX(tag_table[i*4:(i+1)*4])) + self.tagx_entries = [TagX(*x) for x in parse_tagx_section(tagx)[1]] if self.tagx_entries and not self.tagx_entries[-1].is_eof: raise ValueError('TAGX last entry is not EOF') @@ -533,7 +525,8 @@ class IndexHeader(object): # {{{ raise ValueError('Invalid Primary Index Record') self.header_length, = struct.unpack('>I', raw[4:8]) - self.unknown1 = raw[8:16] + self.unknown1 = raw[8:12] + self.header_type, = struct.unpack('>I', raw[12:16]) self.index_type, = struct.unpack('>I', raw[16:20]) self.index_type_desc = {0: 'normal', 2: 'inflection', 6: 'calibre'}.get(self.index_type, 'unknown') @@ -562,13 +555,7 @@ class IndexHeader(object): # {{{ raise ValueError('Invalid TAGX section') self.tagx_header_length, = struct.unpack('>I', tagx[4:8]) self.tagx_control_byte_count, = struct.unpack('>I', tagx[8:12]) - tag_table = tagx[12:self.tagx_header_length] - if len(tag_table) % 4 != 0: - raise ValueError('Invalid Tag table') - num_tagx_entries = len(tag_table) // 4 - self.tagx_entries = [] - for i in range(num_tagx_entries): - self.tagx_entries.append(TagX(tag_table[i*4:(i+1)*4])) + self.tagx_entries = [TagX(*x) for x in parse_tagx_section(tagx)[1]] if self.tagx_entries and not self.tagx_entries[-1].is_eof: raise ValueError('TAGX last entry is not EOF') @@ -602,6 +589,7 @@ class IndexHeader(object): # {{{ a('Header length: %d'%self.header_length) u(self.unknown1) + a('Header type: %d'%self.header_type) a('Index Type: %s (%d)'%(self.index_type_desc, self.index_type)) a('Offset to IDXT start: %d'%self.idxt_start) a('Number of index records: %d'%self.index_count) @@ -661,19 +649,15 @@ class Tag(object): # {{{ } - def __init__(self, tagx, vals, entry_type, cncx): + def __init__(self, tag_type, vals, cncx): self.value = vals 
if len(vals) > 1 else vals[0] if vals else None - self.entry_type = entry_type - tag_type = tagx.tag self.cncx_value = None if tag_type in self.TAG_MAP: self.attr, self.desc = self.TAG_MAP[tag_type] else: - print ('Unknown tag value: %d in entry type: %s'%(tag_type, - entry_type)) - self.desc = '??Unknown (tag value: %d type: %s)'%( - tag_type, entry_type) + print ('Unknown tag value: %%s'%tag_type) + self.desc = '??Unknown (tag value: %d)'%tag_type self.attr = 'unknown' if '_offset' in self.attr: @@ -695,50 +679,13 @@ class IndexEntry(object): # {{{ used in the navigation UI. ''' - def __init__(self, ident, entry_type, raw, cncx, tagx_entries, - control_byte_count): - self.index = ident - self.raw = raw - self.tags = [] - self.entry_type = entry_type - self.byte_size = len(raw) - - orig_raw = raw - - if control_byte_count not in (1, 2): - raise ValueError('Unknown control byte count: %d'% - control_byte_count) - - self.flags = 0 - - if control_byte_count == 2: - self.flags = ord(raw[0]) - raw = raw[1:] - - expected_tags = [tag for tag in tagx_entries if tag.bitmask & - entry_type] - - flags = self.flags - for tag in expected_tags: - vals = [] - - if tag.tag > 0b1000000: # 0b1000000 = 64 - has_tag = flags & 0b1 - flags = flags >> 1 - if not has_tag: continue - for i in range(tag.num_values): - if not raw: - raise ValueError('Index entry does not match TAGX header') - val, consumed = decint(raw) - raw = raw[consumed:] - vals.append(val) - self.tags.append(Tag(tag, vals, self.entry_type, cncx)) - - self.consumed = len(orig_raw) - len(raw) - self.trailing_bytes = raw - if self.trailing_bytes.replace(b'\0', b''): - raise ValueError('%s has leftover bytes: %s'%(self, format_bytes( - self.trailing_bytes))) + def __init__(self, ident, entry, cncx): + try: + self.index = int(ident, 16) + except ValueError: + self.index = ident + self.tags = [Tag(tag_type, vals, cncx) for tag_type, vals in + entry.iteritems()] @property def label(self): @@ -797,102 +744,14 @@ class 
IndexEntry(object): # {{{ return [0, 0] def __str__(self): - ans = ['Index Entry(index=%s, entry_type=%s, flags=%s, ' - 'length=%d, byte_size=%d)'%( - self.index, bin(self.entry_type), bin(self.flags)[2:], - len(self.tags), self.byte_size)] + ans = ['Index Entry(index=%s, length=%d)'%( + self.index, len(self.tags))] for tag in self.tags: if tag.value is not None: ans.append('\t'+str(tag)) if self.first_child_index != -1: ans.append('\tNumber of children: %d'%(self.last_child_index - self.first_child_index + 1)) - if self.trailing_bytes: - ans.append('\tTrailing bytes: %r'%self.trailing_bytes) - return '\n'.join(ans) - -# }}} - -class SecondaryIndexRecord(object): # {{{ - - def __init__(self, record, index_header, cncx): - self.record = record - raw = self.record.raw - - if raw[:4] != b'INDX': - raise ValueError('Invalid Primary Index Record') - - u = struct.unpack - - self.header_length, = u('>I', raw[4:8]) - self.unknown1 = raw[8:12] - self.header_type, = u('>I', raw[12:16]) - self.unknown2 = raw[16:20] - self.idxt_offset, self.idxt_count = u(b'>II', raw[20:28]) - if self.idxt_offset < 192: - raise ValueError('Unknown Index record structure') - self.unknown3 = raw[28:36] - self.unknown4 = raw[36:192] # Should be 156 bytes - - self.index_offsets = [] - indices = raw[self.idxt_offset:] - if indices[:4] != b'IDXT': - raise ValueError("Invalid IDXT index table") - indices = indices[4:] - for i in range(self.idxt_count): - off, = u(b'>H', indices[i*2:(i+1)*2]) - self.index_offsets.append(off-192) - rest = indices[(i+1)*2:] - if rest.replace(b'\0', ''): # There can be padding null bytes - raise ValueError('Extra bytes after IDXT table: %r'%rest) - - indxt = raw[192:self.idxt_offset] - self.size_of_indxt_block = len(indxt) - - self.indices = [] - for i, off in enumerate(self.index_offsets): - try: - next_off = self.index_offsets[i+1] - except: - next_off = len(indxt) - num = ord(indxt[off]) - index = indxt[off+1:off+1+num] - consumed = 1 + num - entry_type = 
ord(indxt[off+consumed]) - pos = off+consumed+1 - idxe = IndexEntry(index, entry_type, - indxt[pos:next_off], cncx, - index_header.tagx_entries, - index_header.tagx_control_byte_count) - self.indices.append(idxe) - - rest = indxt[pos+self.indices[-1].consumed:] - if rest.replace(b'\0', b''): # There can be padding null bytes - raise ValueError('Extra bytes after IDXT table: %r'%rest) - - - def __str__(self): - ans = ['*'*20 + ' Secondary Index Record (%d bytes) '%len(self.record.raw)+ '*'*20] - a = ans.append - def u(w): - a('Unknown: %r (%d bytes) (All zeros: %r)'%(w, - len(w), not bool(w.replace(b'\0', b'')) )) - a('Header length: %d'%self.header_length) - u(self.unknown1) - a('Unknown (header type? index record number? always 1?): %d'%self.header_type) - u(self.unknown2) - a('IDXT Offset (%d block size): %d'%(self.size_of_indxt_block, - self.idxt_offset)) - a('IDXT Count: %d'%self.idxt_count) - u(self.unknown3) - u(self.unknown4) - a('Index offsets: %r'%self.index_offsets) - a('\nIndex Entries (%d entries):'%len(self.indices)) - for entry in self.indices: - a(str(entry)) - a('') - - return '\n'.join(ans) # }}} @@ -904,58 +763,25 @@ class IndexRecord(object): # {{{ in the trailing data of the text records. 
''' - def __init__(self, record, index_header, cncx): - self.record = record + def __init__(self, records, index_header, cncx): self.alltext = None - raw = self.record.raw + table = OrderedDict() + tags = [TagX(x.tag, x.num_values, x.bitmask, x.eof) for x in + index_header.tagx_entries] + for record in records: + raw = record.raw - if raw[:4] != b'INDX': - raise ValueError('Invalid Primary Index Record') + if raw[:4] != b'INDX': + raise ValueError('Invalid Primary Index Record') - u = struct.unpack + parse_index_record(table, record.raw, + index_header.tagx_control_byte_count, tags, + index_header.index_encoding, strict=True) - self.header_length, = u('>I', raw[4:8]) - self.unknown1 = raw[8:12] - self.header_type, = u('>I', raw[12:16]) - self.unknown2 = raw[16:20] - self.idxt_offset, self.idxt_count = u(b'>II', raw[20:28]) - if self.idxt_offset < 192: - raise ValueError('Unknown Index record structure') - self.unknown3 = raw[28:36] - self.unknown4 = raw[36:192] # Should be 156 bytes - - self.index_offsets = [] - indices = raw[self.idxt_offset:] - if indices[:4] != b'IDXT': - raise ValueError("Invalid IDXT index table") - indices = indices[4:] - for i in range(self.idxt_count): - off, = u(b'>H', indices[i*2:(i+1)*2]) - self.index_offsets.append(off-192) - rest = indices[(i+1)*2:] - if rest.replace(b'\0', ''): # There can be padding null bytes - raise ValueError('Extra bytes after IDXT table: %r'%rest) - - indxt = raw[192:self.idxt_offset] - self.size_of_indxt_block = len(indxt) self.indices = [] - for i, off in enumerate(self.index_offsets): - try: - next_off = self.index_offsets[i+1] - except: - next_off = len(indxt) - index, consumed = decode_hex_number(indxt[off:]) - entry_type = ord(indxt[off+consumed]) - pos = off+consumed+1 - idxe = IndexEntry(index, entry_type, - indxt[pos:next_off], cncx, - index_header.tagx_entries, - index_header.tagx_control_byte_count) - self.indices.append(idxe) - rest = indxt[pos+self.indices[-1].consumed:] - if rest.replace(b'\0', 
b''): # There can be padding null bytes - raise ValueError('Extra bytes after IDXT table: %r'%rest) + for ident, entry in table.iteritems(): + self.indices.append(IndexEntry(ident, entry, cncx)) def get_parent(self, index): if index.depth < 1: @@ -965,24 +791,12 @@ class IndexRecord(object): # {{{ if p.depth != parent_depth: continue - def __str__(self): - ans = ['*'*20 + ' Index Record (%d bytes) '%len(self.record.raw)+ '*'*20] + ans = ['*'*20 + ' Index Entries (%d entries) '%len(self.indices)+ '*'*20] a = ans.append def u(w): a('Unknown: %r (%d bytes) (All zeros: %r)'%(w, len(w), not bool(w.replace(b'\0', b'')) )) - a('Header length: %d'%self.header_length) - u(self.unknown1) - a('Unknown (header type? index record number? always 1?): %d'%self.header_type) - u(self.unknown2) - a('IDXT Offset (%d block size): %d'%(self.size_of_indxt_block, - self.idxt_offset)) - a('IDXT Count: %d'%self.idxt_count) - u(self.unknown3) - u(self.unknown4) - a('Index offsets: %r'%self.index_offsets) - a('\nIndex Entries (%d entries):'%len(self.indices)) for entry in self.indices: offset = entry.offset a(str(entry)) @@ -1157,7 +971,7 @@ class TBSIndexing(object): # {{{ def get_index(self, idx): for i in self.indices: - if i.index == idx: return i + if i.index in {idx, unicode(idx)}: return i raise IndexError('Index %d not found'%idx) def __str__(self): @@ -1190,7 +1004,7 @@ class TBSIndexing(object): # {{{ if entries: ans.append('\t%s:'%typ) for x in entries: - ans.append(('\t\tIndex Entry: %d (Parent index: %d, ' + ans.append(('\t\tIndex Entry: %s (Parent index: %s, ' 'Depth: %d, Offset: %d, Size: %d) [%s]')%( x.index, x.parent_index, x.depth, x.offset, x.size, x.label)) def bin4(num): @@ -1287,18 +1101,18 @@ class TBSIndexing(object): # {{{ ' when reading starting section'%extra) si = self.get_index(si) ans.append('The section at the start of this record is:' - ' %d'%si.index) + ' %s'%si.index) if 0b0100 in extra: num = extra[0b0100] ans.append('The number of articles from the section 
%d' - ' in this record: %d'%(si.index, num)) + ' in this record: %s'%(si.index, num)) elif 0b0001 in extra: eof = extra[0b0001] if eof != 0: raise ValueError('Unknown eof value %s when reading' ' starting section. All bytes: %r'%(eof, orig)) ans.append('??This record has more than one article from ' - ' the section: %d'%si.index) + ' the section: %s'%si.index) return si, byts # }}} @@ -1362,21 +1176,23 @@ class MOBIFile(object): # {{{ pir = self.mobi_header.primary_index_record if pir != NULL_INDEX: self.index_header = IndexHeader(self.records[pir]) + numi = self.index_header.index_count self.cncx = CNCX(self.records[ - pir+2:pir+2+self.index_header.num_of_cncx_blocks], + pir+1+numi:pir+1+numi+self.index_header.num_of_cncx_blocks], self.index_header.index_encoding) - self.index_record = IndexRecord(self.records[pir+1], + self.index_record = IndexRecord(self.records[pir+1:pir+1+numi], self.index_header, self.cncx) self.indexing_record_nums = set(xrange(pir, - pir+2+self.index_header.num_of_cncx_blocks)) + pir+1+numi+self.index_header.num_of_cncx_blocks)) self.secondary_index_record = self.secondary_index_header = None sir = self.mobi_header.secondary_index_record if sir != NULL_INDEX: self.secondary_index_header = SecondaryIndexHeader(self.records[sir]) + numi = self.secondary_index_header.index_count self.indexing_record_nums.add(sir) - self.secondary_index_record = SecondaryIndexRecord( - self.records[sir+1], self.secondary_index_header, self.cncx) - self.indexing_record_nums.add(sir+1) + self.secondary_index_record = IndexRecord( + self.records[sir+1:sir+1+numi], self.secondary_index_header, self.cncx) + self.indexing_record_nums |= set(xrange(sir+1, sir+1+numi)) ntr = self.mobi_header.number_of_text_records diff --git a/src/calibre/ebooks/mobi/reader/index.py b/src/calibre/ebooks/mobi/reader/index.py index b292d55c13..dd85b5a5cb 100644 --- a/src/calibre/ebooks/mobi/reader/index.py +++ b/src/calibre/ebooks/mobi/reader/index.py @@ -8,9 +8,13 @@ __copyright__ = 
'2012, Kovid Goyal ' __docformat__ = 'restructuredtext en' import struct -from collections import OrderedDict +from collections import OrderedDict, namedtuple -from calibre.ebooks.mobi.utils import decint, count_set_bits +from calibre.ebooks.mobi.utils import (decint, count_set_bits, + decode_string) + +TagX = namedtuple('TagX', 'tag num_of_values bitmask eof') +PTagX = namedtuple('PTagX', 'tag value_count value_bytes num_of_values') class InvalidFile(ValueError): pass @@ -37,9 +41,8 @@ def parse_indx_header(data): 'lng', 'total', 'ordt', 'ligt', 'nligt', 'ncncx' ) num = len(words) - values = struct.unpack(b'>%dL' % num, data[4:4*(num+1)]) - header = {words[i]:values[i] for i in xrange(num)} - return header + values = struct.unpack(bytes('>%dL' % num), data[4:4*(num+1)]) + return dict(zip(words, values)) class CNCX(object): # {{{ @@ -77,81 +80,116 @@ class CNCX(object): # {{{ return self.records.get(offset, default) # }}} -def parse_tag_section(data): +def parse_tagx_section(data): check_signature(data, b'TAGX') tags = [] - first_entry_offset, = struct.unpack_from(b'>L', data, 0x04) - control_byte_count, = struct.unpack_from(b'>L', data, 0x08) + first_entry_offset, = struct.unpack_from(b'>L', data, 4) + control_byte_count, = struct.unpack_from(b'>L', data, 8) - # Skip the first 12 bytes already read above. 
for i in xrange(12, first_entry_offset, 4): - pos = i - tags.append((ord(data[pos]), ord(data[pos+1]), ord(data[pos+2]), - ord(data[pos+3]))) + vals = list(bytearray(data[i:i+4])) + tags.append(TagX(*vals)) return control_byte_count, tags -def get_tag_map(control_byte_count, tags, data, start, end): +def get_tag_map(control_byte_count, tagx, data, strict=False): ptags = [] ans = {} - control_byte_index = 0 - data_start = start + control_byte_count + control_bytes = list(bytearray(data[:control_byte_count])) + data = data[control_byte_count:] - for tag, values_per_entry, mask, end_flag in tags: - if end_flag == 0x01: - control_byte_index += 1 + for x in tagx: + if x.eof == 0x01: + control_bytes = control_bytes[1:] continue - value = ord(data[start + control_byte_index]) & mask + value = control_bytes[0] & x.bitmask if value != 0: - if value == mask: - if count_set_bits(mask) > 1: + value_count = value_bytes = None + if value == x.bitmask: + if count_set_bits(x.bitmask) > 1: # If all bits of masked value are set and the mask has more # than one bit, a variable width value will follow after # the control bytes which defines the length of bytes (NOT # the value count!) which will contain the corresponding # variable width values. - value, consumed = decint(data[data_start:]) - data_start += consumed - ptags.append((tag, None, value, values_per_entry)) + value_bytes, consumed = decint(data) + data = data[consumed:] else: - ptags.append((tag, 1, None, values_per_entry)) + value_count = 1 else: # Shift bits to get the masked value. 
- while mask & 0x01 == 0: - mask = mask >> 1 - value = value >> 1 - ptags.append((tag, value, None, values_per_entry)) - for tag, value_count, value_bytes, values_per_entry in ptags: + mask = x.bitmask + while mask & 0b1 == 0: + mask >>= 1 + value >>= 1 + value_count = value + ptags.append(PTagX(x.tag, value_count, value_bytes, + x.num_of_values)) + + for x in ptags: values = [] - if value_count != None: + if x.value_count is not None: # Read value_count * values_per_entry variable width values. - for _ in xrange(value_count*values_per_entry): - byts, consumed = decint(data[data_start:]) - data_start += consumed + for _ in xrange(x.value_count * x.num_of_values): + byts, consumed = decint(data) + data = data[consumed:] values.append(byts) - else: + else: # value_bytes is not None # Convert value_bytes to variable width values. total_consumed = 0 - while total_consumed < value_bytes: + while total_consumed < x.value_bytes: # Does this work for values_per_entry != 1? - byts, consumed = decint(data[data_start:]) - data_start += consumed + byts, consumed = decint(data) + data = data[consumed:] total_consumed += consumed values.append(byts) - if total_consumed != value_bytes: - print ("Error: Should consume %s bytes, but consumed %s" % - (value_bytes, total_consumed)) - ans[tag] = values - # Test that all bytes have been processed if end is given. - if end is not None and data_start < end: - # The last entry might have some zero padding bytes, so complain only if non zero bytes are left. 
- rest = data[data_start:end] - if rest.replace(b'\0', b''): - print ("Warning: There are unprocessed index bytes left: %s" % - format_bytes(rest)) + if total_consumed != x.value_bytes: + err = ("Error: Should consume %s bytes, but consumed %s" % + (x.value_bytes, total_consumed)) + if strict: + raise ValueError(err) + else: + print(err) + ans[x.tag] = values + # Test that all bytes have been processed + if data.replace(b'\0', b''): + err = ("Warning: There are unprocessed index bytes left: %s" % + format_bytes(data)) + if strict: + raise ValueError(err) + else: + print(err) return ans +def parse_index_record(table, data, control_byte_count, tags, codec, + strict=False): + header = parse_indx_header(data) + idxt_pos = header['start'] + if data[idxt_pos:idxt_pos+4] != b'IDXT': + print ('WARNING: Invalid INDX record') + entry_count = header['count'] + + # loop through to build up the IDXT position starts + idx_positions= [] + for j in xrange(entry_count): + pos, = struct.unpack_from(b'>H', data, idxt_pos + 4 + (2 * j)) + idx_positions.append(pos) + # The last entry ends before the IDXT tag (but there might be zero fill + # bytes we need to ignore!) 
+ idx_positions.append(idxt_pos) + + # For each entry in the IDXT build up the tag map and any associated + # text + for j in xrange(entry_count): + start, end = idx_positions[j:j+2] + rec = data[start:end] + ident, consumed = decode_string(rec, codec=codec) + rec = rec[consumed:] + tag_map = get_tag_map(control_byte_count, tags, rec, strict=strict) + table[ident] = tag_map + + def read_index(sections, idx, codec): table, cncx = OrderedDict(), CNCX([], codec) @@ -166,32 +204,11 @@ def read_index(sections, idx, codec): cncx = CNCX(cncx_records, codec) tag_section_start = indx_header['len'] - control_byte_count, tags = parse_tag_section(data[tag_section_start:]) + control_byte_count, tags = parse_tagx_section(data[tag_section_start:]) for i in xrange(idx + 1, idx + 1 + indx_count): + # Index record data = sections[i][0] - header = parse_indx_header(data) - idxt_pos = header['start'] - entry_count = header['count'] - - # loop through to build up the IDXT position starts - idx_positions= [] - for j in xrange(entry_count): - pos, = struct.unpack_from(b'>H', data, idxt_pos + 4 + (2 * j)) - idx_positions.append(pos) - # The last entry ends before the IDXT tag (but there might be zero fill - # bytes we need to ignore!) 
- idx_positions.append(idxt_pos) - - # For each entry in the IDXT build up the tag map and any associated - # text - for j in xrange(entry_count): - start, end = idx_positions[j:j+2] - text_length = ord(data[start]) - text = data[start+1:start+1+text_length] - tag_map = get_tag_map(control_byte_count, tags, data, - start+1+text_length, end) - table[text] = tag_map - + parse_index_record(table, data, control_byte_count, tags, codec) return table, cncx diff --git a/src/calibre/ebooks/mobi/reader/mobi8.py b/src/calibre/ebooks/mobi/reader/mobi8.py index f5421bc9ea..7939f51ccf 100644 --- a/src/calibre/ebooks/mobi/reader/mobi8.py +++ b/src/calibre/ebooks/mobi/reader/mobi8.py @@ -317,6 +317,7 @@ class Mobi8Reader(object): for entry in index_entries: pos = entry['pos'] fi = self.get_file_info(pos) + #print (11111111, fi, entry['pos_fid']) if fi.filename is None: raise ValueError('Index entry has invalid pos: %d'%pos) idtag = self.get_id_tag(pos).decode(self.header.codec) diff --git a/src/calibre/ebooks/mobi/reader/ncx.py b/src/calibre/ebooks/mobi/reader/ncx.py index 96ab4ac70d..ca3255e100 100644 --- a/src/calibre/ebooks/mobi/reader/ncx.py +++ b/src/calibre/ebooks/mobi/reader/ncx.py @@ -10,7 +10,6 @@ __docformat__ = 'restructuredtext en' import os from calibre.ebooks.metadata.toc import TOC -from calibre.ebooks.mobi.utils import to_base from calibre.ebooks.mobi.reader.headers import NULL_INDEX from calibre.ebooks.mobi.reader.index import read_index @@ -23,7 +22,30 @@ tag_fieldname_map = { 6: ['pos_fid',0], 21: ['parent',0], 22: ['child1',0], - 23: ['childn',0] + 23: ['childn',0], + 69: ['image_index',0], + 70 : ['desc_offset', 0], # 'Description offset in cncx' + 71 : ['author_offset', 0], # 'Author offset in cncx' + 72 : ['image_caption_offset', 0], # 'Image caption offset in cncx', + 73 : ['image_attr_offset', 0], # 'Image attribution offset in cncx', + +} + +default_entry = { + 'pos': -1, + 'len': 0, + 'noffs': -1, + 'text' : "Unknown Text", + 'hlvl' : -1, + 'kind' : 
"Unknown Class", + 'pos_fid' : None, + 'parent' : -1, + 'child1' : -1, + 'childn' : -1, + 'description': None, + 'author': None, + 'image_caption': None, + 'image_attribution': None, } def read_ncx(sections, index, codec): @@ -34,32 +56,25 @@ def read_ncx(sections, index, codec): for num, x in enumerate(table.iteritems()): text, tag_map = x - entry = { - 'name': text, - 'pos': -1, - 'len': 0, - 'noffs': -1, - 'text' : "Unknown Text", - 'hlvl' : -1, - 'kind' : "Unknown Kind", - 'pos_fid' : None, - 'parent' : -1, - 'child1' : -1, - 'childn' : -1, - 'num' : num - } + entry = default_entry.copy() + entry['name'] = text + entry['num'] = num - for tag in tag_fieldname_map.keys(): + for tag in tag_fieldname_map.iterkeys(): fieldname, i = tag_fieldname_map[tag] if tag in tag_map: fieldvalue = tag_map[tag][i] if tag == 6: - fieldvalue = to_base(fieldvalue, base=32) + # Appears to be an idx into the KF8 elems table with an + # offset + fieldvalue = tuple(tag_map[tag]) entry[fieldname] = fieldvalue - if tag == 3: - entry['text'] = cncx.get(fieldvalue, 'Unknown Text') - if tag == 5: - entry['kind'] = cncx.get(fieldvalue, 'Unknown Kind') + for which, name in {3:'text', 5:'kind', 70:'description', + 71:'author', 72:'image_caption', + 73:'image_attribution'}.iteritems(): + if tag == which: + entry[name] = cncx.get(fieldvalue, + default_entry[name]) index_entries.append(entry) return index_entries diff --git a/src/calibre/ebooks/mobi/utils.py b/src/calibre/ebooks/mobi/utils.py index 6ec86f77ee..2bab82bc53 100644 --- a/src/calibre/ebooks/mobi/utils.py +++ b/src/calibre/ebooks/mobi/utils.py @@ -15,7 +15,13 @@ from calibre.ebooks import normalize IMAGE_MAX_SIZE = 10 * 1024 * 1024 -def decode_hex_number(raw): +def decode_string(raw, codec='utf-8'): + length, = struct.unpack(b'>B', raw[0]) + raw = raw[1:1+length] + consumed = length+1 + return raw.decode(codec), consumed + +def decode_hex_number(raw, codec='utf-8'): ''' Return a variable length number encoded using hexadecimal 
encoding. These numbers have the first byte which tells the number of bytes that follow. @@ -25,13 +31,16 @@ def decode_hex_number(raw): :param raw: Raw binary data as a bytestring :return: The number and the number of bytes from raw that the number - occupies + occupies. ''' - length, = struct.unpack(b'>B', raw[0]) - raw = raw[1:1+length] - consumed = length+1 + raw, consumed = decode_string(raw, codec=codec) return int(raw, 16), consumed +def encode_string(raw): + ans = bytearray(bytes(raw)) + ans.insert(0, len(ans)) + return bytes(ans) + def encode_number_as_hex(num): ''' Encode num as a variable length encoded hexadecimal number. Returns the @@ -44,9 +53,7 @@ def encode_number_as_hex(num): nlen = len(num) if nlen % 2 != 0: num = b'0'+num - ans = bytearray(num) - ans.insert(0, len(num)) - return bytes(ans) + return encode_string(num) def encint(value, forward=True): ''' From 9e8d691d47a0d9d7eac8cfca23648b416b439693 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 17 Mar 2012 19:05:07 +0530 Subject: [PATCH 07/27] Fix #957527 (Private bug) --- src/calibre/ebooks/conversion/plugins/epub_output.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/conversion/plugins/epub_output.py b/src/calibre/ebooks/conversion/plugins/epub_output.py index 89cf987bb1..45df8ba9d1 100644 --- a/src/calibre/ebooks/conversion/plugins/epub_output.py +++ b/src/calibre/ebooks/conversion/plugins/epub_output.py @@ -190,12 +190,22 @@ class EPUBOutput(OutputFormatPlugin): if x.get(OPF('scheme'), None).lower() == 'uuid' or unicode(x).startswith('urn:uuid:'): uuid = unicode(x).split(':')[-1] break + encrypted_fonts = getattr(input_plugin, 'encrypted_fonts', []) + if uuid is None: self.log.warn('No UUID identifier found') from uuid import uuid4 uuid = str(uuid4()) oeb.metadata.add('identifier', uuid, scheme='uuid', id=uuid) + if encrypted_fonts and not uuid.startswith('urn:uuid:'): + # Apparently ADE requires this value to start with urn:uuid: + 
# for some absurd reason, or it will throw a hissy fit and refuse + # to use the obfuscated fonts. + for x in identifiers: + if unicode(x) == uuid: + x.content = 'urn:uuid:'+uuid + with TemporaryDirectory(u'_epub_output') as tdir: from calibre.customize.ui import plugin_for_output_format metadata_xml = None @@ -210,7 +220,6 @@ class EPUBOutput(OutputFormatPlugin): opf = [x for x in os.listdir(tdir) if x.endswith('.opf')][0] self.condense_ncx([os.path.join(tdir, x) for x in os.listdir(tdir)\ if x.endswith('.ncx')][0]) - encrypted_fonts = getattr(input_plugin, 'encrypted_fonts', []) encryption = None if encrypted_fonts: encryption = self.encrypt_fonts(encrypted_fonts, tdir, uuid) From bf6394c5e8cb5ac6d5ee1b2896f444c47978e1ac Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 17 Mar 2012 19:16:11 +0530 Subject: [PATCH 08/27] When converting KF8 to EPUB preserve obfuscation of fonts --- src/calibre/ebooks/conversion/plugins/mobi_input.py | 5 ++++- src/calibre/ebooks/mobi/reader/mobi8.py | 3 +++ src/calibre/ebooks/mobi/utils.py | 3 ++- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/conversion/plugins/mobi_input.py b/src/calibre/ebooks/conversion/plugins/mobi_input.py index 9d71b69891..49a57cbde1 100644 --- a/src/calibre/ebooks/conversion/plugins/mobi_input.py +++ b/src/calibre/ebooks/conversion/plugins/mobi_input.py @@ -59,7 +59,10 @@ class MOBIInput(InputFormatPlugin): if mr.kf8_type is not None: log('Found KF8 MOBI of type %r'%mr.kf8_type) from calibre.ebooks.mobi.reader.mobi8 import Mobi8Reader - return os.path.abspath(Mobi8Reader(mr, log)()) + mr = Mobi8Reader(mr, log) + opf = os.path.abspath(mr()) + self.encrypted_fonts = mr.encrypted_fonts + return opf raw = parse_cache.pop('calibre_raw_mobi_markup', False) if raw: diff --git a/src/calibre/ebooks/mobi/reader/mobi8.py b/src/calibre/ebooks/mobi/reader/mobi8.py index 7939f51ccf..1e4d63d72e 100644 --- a/src/calibre/ebooks/mobi/reader/mobi8.py +++ 
b/src/calibre/ebooks/mobi/reader/mobi8.py @@ -33,6 +33,7 @@ class Mobi8Reader(object): def __init__(self, mobi6_reader, log): self.mobi6_reader, self.log = mobi6_reader, log self.header = mobi6_reader.book_header + self.encrypted_fonts = [] def __call__(self): self.mobi6_reader.check_for_drm() @@ -351,6 +352,8 @@ class Mobi8Reader(object): with open(href.replace('/', os.sep), 'wb') as f: f.write(font['font_data'] if font['font_data'] else font['raw_data']) + if font['encrypted']: + self.encrypted_fonts.append(href) else: imgtype = imghdr.what(None, data) if imgtype is None: diff --git a/src/calibre/ebooks/mobi/utils.py b/src/calibre/ebooks/mobi/utils.py index 2bab82bc53..4c1e52e119 100644 --- a/src/calibre/ebooks/mobi/utils.py +++ b/src/calibre/ebooks/mobi/utils.py @@ -437,7 +437,7 @@ def read_font_record(data, extent=1040): # {{{ # The zlib compressed data begins with 2 bytes of header and # has 4 bytes of checksum at the end ans = {'raw_data':data, 'font_data':None, 'err':None, 'ext':'failed', - 'headers':None} + 'headers':None, 'encrypted':False} try: usize, flags, dstart, xor_len, xor_start = struct.unpack_from( @@ -460,6 +460,7 @@ def read_font_record(data, extent=1040): # {{{ buf[n] ^= key[n%xor_len] # XOR of buf and key font_data = bytes(buf) + ans['encrypted'] = True if flags & 0b1: # ZLIB compressed data From 2b48f393598dad13787a297a2a58b72792ff396b Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 17 Mar 2012 11:10:56 -0400 Subject: [PATCH 09/27] Fixes for smartypants from http://www.mobileread.com/forums/showthread.php?t=171920 submitted by Leigh Parry. 
--- src/calibre/utils/smartypants.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/calibre/utils/smartypants.py b/src/calibre/utils/smartypants.py index 8763a313fc..fa3119bf53 100644 --- a/src/calibre/utils/smartypants.py +++ b/src/calibre/utils/smartypants.py @@ -591,6 +591,21 @@ def educateQuotes(str): str = re.sub(r'''""''', """””""", str) str = re.sub(r"""''""", """’’""", str) + # Special case for Quotes at inside of other entities, e.g.: + #

A double quote--"within dashes"--would be nice.

+ str = re.sub(r"""(?<=\W)"(?=\w)""", r"""“""", str) + str = re.sub(r"""(?<=\W)'(?=\w)""", r"""‘""", str) + str = re.sub(r"""(?<=\w)"(?=\W)""", r"""”""", str) + str = re.sub(r"""(?<=\w)'(?=\W)""", r"""’""", str) + + # Special case for Quotes at end of line with a preceeding space (may change just to end of line) + str = re.sub(r"""(?<=\s)"$""", r"""”""", str) + str = re.sub(r"""(?<=\s)'$""", r"""’""", str) + + # Special case for Quotes at beginning of line with a space - multiparagraph quoted text: + str = re.sub(r"""^"(?=\s)""", r"""“""", str) + str = re.sub(r"""^'(?=\s)""", r"""‘""", str) + # Special case for decade abbreviations (the '80s): str = re.sub(r"""\b'(?=\d{2}s)""", r"""’""", str) From 8d23a63a46783283d3cb67c93ee7e9611ecbcff2 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 17 Mar 2012 21:35:59 +0530 Subject: [PATCH 10/27] ... --- src/calibre/ebooks/mobi/reader/markup.py | 3 ++- src/calibre/ebooks/mobi/reader/mobi8.py | 10 ++++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/calibre/ebooks/mobi/reader/markup.py b/src/calibre/ebooks/mobi/reader/markup.py index 26583cf30c..721de28ff4 100644 --- a/src/calibre/ebooks/mobi/reader/markup.py +++ b/src/calibre/ebooks/mobi/reader/markup.py @@ -33,7 +33,8 @@ def update_internal_links(mobi8_reader): for m in posfid_index_pattern.finditer(tag): posfid = m.group(1) offset = m.group(2) - filename, idtag = mr.get_id_tag_by_pos_fid(posfid, offset) + filename, idtag = mr.get_id_tag_by_pos_fid(int(posfid, 32), + int(offset, 32)) suffix = (b'#' + idtag) if idtag else b'' replacement = filename.encode(mr.header.codec) + suffix tag = posfid_index_pattern.sub(replacement, tag, 1) diff --git a/src/calibre/ebooks/mobi/reader/mobi8.py b/src/calibre/ebooks/mobi/reader/mobi8.py index 1e4d63d72e..5105e20f0b 100644 --- a/src/calibre/ebooks/mobi/reader/mobi8.py +++ b/src/calibre/ebooks/mobi/reader/mobi8.py @@ -230,11 +230,10 @@ class Mobi8Reader(object): def get_id_tag_by_pos_fid(self, posfid, offset): # 
first convert kindle:pos:fid and offset info to position in file - row = int(posfid, 32) - off = int(offset, 32) - [insertpos, idtext, filenum, seqnm, startpos, length] = self.elems[row] - pos = insertpos + off - fname = self.get_file_info(pos).filename + insertpos, idtext, filenum, seqnm, startpos, length = self.elems[posfid] + pos = insertpos + offset + fi = self.get_file_info(pos) + fname = fi.filename # an existing "id=" must exist in original xhtml otherwise it would not # have worked for linking. Amazon seems to have added its own # additional "aid=" inside tags whose contents seem to represent some @@ -318,7 +317,6 @@ class Mobi8Reader(object): for entry in index_entries: pos = entry['pos'] fi = self.get_file_info(pos) - #print (11111111, fi, entry['pos_fid']) if fi.filename is None: raise ValueError('Index entry has invalid pos: %d'%pos) idtag = self.get_id_tag(pos).decode(self.header.codec) From a83654a4990900f56bb5725adf911e1074a5733c Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 17 Mar 2012 22:57:12 +0530 Subject: [PATCH 11/27] Refactor mobi inspect code in preparation for KF8 support --- src/calibre/debug.py | 2 +- .../ebooks/conversion/plugins/mobi_output.py | 2 +- src/calibre/ebooks/mobi/debug/__init__.py | 16 + src/calibre/ebooks/mobi/debug/headers.py | 474 ++++++++++++++++++ src/calibre/ebooks/mobi/debug/main.py | 39 ++ .../ebooks/mobi/{debug.py => debug/mobi6.py} | 452 +---------------- 6 files changed, 540 insertions(+), 445 deletions(-) create mode 100644 src/calibre/ebooks/mobi/debug/__init__.py create mode 100644 src/calibre/ebooks/mobi/debug/headers.py create mode 100644 src/calibre/ebooks/mobi/debug/main.py rename src/calibre/ebooks/mobi/{debug.py => debug/mobi6.py} (63%) diff --git a/src/calibre/debug.py b/src/calibre/debug.py index 13cccd3e01..f5f803ec84 100644 --- a/src/calibre/debug.py +++ b/src/calibre/debug.py @@ -234,7 +234,7 @@ def main(args=sys.argv): sql_dump = args[-1] reinit_db(opts.reinitialize_db, sql_dump=sql_dump) elif 
opts.inspect_mobi: - from calibre.ebooks.mobi.debug import inspect_mobi + from calibre.ebooks.mobi.debug.main import inspect_mobi for path in args[1:]: prints('Inspecting:', path) inspect_mobi(path) diff --git a/src/calibre/ebooks/conversion/plugins/mobi_output.py b/src/calibre/ebooks/conversion/plugins/mobi_output.py index 7288f095d7..06580be1ba 100644 --- a/src/calibre/ebooks/conversion/plugins/mobi_output.py +++ b/src/calibre/ebooks/conversion/plugins/mobi_output.py @@ -179,7 +179,7 @@ class MOBIOutput(OutputFormatPlugin): writer(oeb, output_path) if opts.extract_to is not None: - from calibre.ebooks.mobi.debug import inspect_mobi + from calibre.ebooks.mobi.debug.main import inspect_mobi ddir = opts.extract_to inspect_mobi(output_path, ddir=ddir) diff --git a/src/calibre/ebooks/mobi/debug/__init__.py b/src/calibre/ebooks/mobi/debug/__init__.py new file mode 100644 index 0000000000..b472bf3148 --- /dev/null +++ b/src/calibre/ebooks/mobi/debug/__init__.py @@ -0,0 +1,16 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2012, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + + +def format_bytes(byts): + byts = bytearray(byts) + byts = [hex(b)[2:] for b in byts] + return ' '.join(byts) + + diff --git a/src/calibre/ebooks/mobi/debug/headers.py b/src/calibre/ebooks/mobi/debug/headers.py new file mode 100644 index 0000000000..7965253be6 --- /dev/null +++ b/src/calibre/ebooks/mobi/debug/headers.py @@ -0,0 +1,474 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2012, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import struct, datetime + +from calibre.utils.date import utc_tz +from calibre.ebooks.mobi.reader.headers import NULL_INDEX +from 
calibre.ebooks.mobi.langcodes import main_language, sub_language +from calibre.ebooks.mobi.debug import format_bytes + +# PalmDB {{{ +class PalmDOCAttributes(object): + + class Attr(object): + + def __init__(self, name, field, val): + self.name = name + self.val = val & field + + def __str__(self): + return '%s: %s'%(self.name, bool(self.val)) + + def __init__(self, raw): + self.val = struct.unpack(b'H', self.raw[34:36])[0] + + palm_epoch = datetime.datetime(1904, 1, 1, tzinfo=utc_tz) + self.creation_date_raw = struct.unpack(b'>I', self.raw[36:40])[0] + self.creation_date = (palm_epoch + + datetime.timedelta(seconds=self.creation_date_raw)) + self.modification_date_raw = struct.unpack(b'>I', self.raw[40:44])[0] + self.modification_date = (palm_epoch + + datetime.timedelta(seconds=self.modification_date_raw)) + self.last_backup_date_raw = struct.unpack(b'>I', self.raw[44:48])[0] + self.last_backup_date = (palm_epoch + + datetime.timedelta(seconds=self.last_backup_date_raw)) + self.modification_number = struct.unpack(b'>I', self.raw[48:52])[0] + self.app_info_id = self.raw[52:56] + self.sort_info_id = self.raw[56:60] + self.type = self.raw[60:64] + self.creator = self.raw[64:68] + self.ident = self.type + self.creator + if self.ident not in (b'BOOKMOBI', b'TEXTREAD'): + raise ValueError('Unknown book ident: %r'%self.ident) + self.last_record_uid, = struct.unpack(b'>I', self.raw[68:72]) + self.next_rec_list_id = self.raw[72:76] + + self.number_of_records, = struct.unpack(b'>H', self.raw[76:78]) + + def __str__(self): + ans = ['*'*20 + ' PalmDB Header '+ '*'*20] + ans.append('Name: %r'%self.name) + ans.append(str(self.attributes)) + ans.append('Version: %s'%self.version) + ans.append('Creation date: %s (%s)'%(self.creation_date.isoformat(), + self.creation_date_raw)) + ans.append('Modification date: %s (%s)'%(self.modification_date.isoformat(), + self.modification_date_raw)) + ans.append('Backup date: %s (%s)'%(self.last_backup_date.isoformat(), + 
self.last_backup_date_raw)) + ans.append('Modification number: %s'%self.modification_number) + ans.append('App Info ID: %r'%self.app_info_id) + ans.append('Sort Info ID: %r'%self.sort_info_id) + ans.append('Type: %r'%self.type) + ans.append('Creator: %r'%self.creator) + ans.append('Last record UID +1: %r'%self.last_record_uid) + ans.append('Next record list id: %r'%self.next_rec_list_id) + ans.append('Number of records: %s'%self.number_of_records) + + return '\n'.join(ans) +# }}} + +class Record(object): # {{{ + + def __init__(self, raw, header): + self.offset, self.flags, self.uid = header + self.raw = raw + + @property + def header(self): + return 'Offset: %d Flags: %d UID: %d First 4 bytes: %r Size: %d'%(self.offset, self.flags, + self.uid, self.raw[:4], len(self.raw)) +# }}} + +# EXTH {{{ +class EXTHRecord(object): + + def __init__(self, type_, data): + self.type = type_ + self.data = data + self.name = { + 1 : 'DRM Server id', + 2 : 'DRM Commerce id', + 3 : 'DRM ebookbase book id', + 100 : 'author', + 101 : 'publisher', + 102 : 'imprint', + 103 : 'description', + 104 : 'isbn', + 105 : 'subject', + 106 : 'publishingdate', + 107 : 'review', + 108 : 'contributor', + 109 : 'rights', + 110 : 'subjectcode', + 111 : 'type', + 112 : 'source', + 113 : 'asin', + 114 : 'versionnumber', + 115 : 'sample', + 116 : 'startreading', + 117 : 'adult', + 118 : 'retailprice', + 119 : 'retailpricecurrency', + 121 : 'KF8 header section index', + 125 : 'KF8 resources (images/fonts) count', + 129 : 'KF8 cover URI', + 131 : 'KF8 unknown count', + 201 : 'coveroffset', + 202 : 'thumboffset', + 203 : 'hasfakecover', + 204 : 'Creator Software', + 205 : 'Creator Major Version', # '>I' + 206 : 'Creator Minor Version', # '>I' + 207 : 'Creator Build Number', # '>I' + 208 : 'watermark', + 209 : 'tamper_proof_keys', + 300 : 'fontsignature', + 301 : 'clippinglimit', # percentage '>B' + 402 : 'publisherlimit', + 404 : 'TTS flag', # '>B' 1 - TTS disabled 0 - TTS enabled + 501 : 'cdetype', # 4 chars 
(PDOC or EBOK) + 502 : 'lastupdatetime', + 503 : 'updatedtitle', + }.get(self.type, repr(self.type)) + + if (self.name in {'coveroffset', 'thumboffset', 'hasfakecover', + 'Creator Major Version', 'Creator Minor Version', + 'Creator Build Number', 'Creator Software', 'startreading'} or + self.type in {121, 125, 131}): + self.data, = struct.unpack(b'>I', self.data) + + def __str__(self): + return '%s (%d): %r'%(self.name, self.type, self.data) + +class EXTHHeader(object): + + def __init__(self, raw): + self.raw = raw + if not self.raw.startswith(b'EXTH'): + raise ValueError('EXTH header does not start with EXTH') + self.length, = struct.unpack(b'>I', self.raw[4:8]) + self.count, = struct.unpack(b'>I', self.raw[8:12]) + + pos = 12 + self.records = [] + for i in xrange(self.count): + pos = self.read_record(pos) + self.records.sort(key=lambda x:x.type) + self.rmap = {x.type:x for x in self.records} + self.get = self.rmap.get + + def __getitem__(self, type_): + return self.rmap.__getitem__(type_) + + def read_record(self, pos): + type_, length = struct.unpack(b'>II', self.raw[pos:pos+8]) + data = self.raw[(pos+8):(pos+length)] + self.records.append(EXTHRecord(type_, data)) + return pos + length + + @property + def kf8_header_index(self): + return self.rmap.get(121, None) + + def __str__(self): + ans = ['*'*20 + ' EXTH Header '+ '*'*20] + ans.append('EXTH header length: %d'%self.length) + ans.append('Number of EXTH records: %d'%self.count) + ans.append('EXTH records...') + for r in self.records: + ans.append(str(r)) + return '\n'.join(ans) +# }}} + +class MOBIHeader(object): # {{{ + + def __init__(self, record0): + self.raw = record0.raw + + self.compression_raw = self.raw[:2] + self.compression = {1: 'No compression', 2: 'PalmDoc compression', + 17480: 'HUFF/CDIC compression'}.get(struct.unpack(b'>H', + self.compression_raw)[0], + repr(self.compression_raw)) + self.unused = self.raw[2:4] + self.text_length, = struct.unpack(b'>I', self.raw[4:8]) + 
self.number_of_text_records, self.text_record_size = \ + struct.unpack(b'>HH', self.raw[8:12]) + self.encryption_type_raw, = struct.unpack(b'>H', self.raw[12:14]) + self.encryption_type = { + 0: 'No encryption', + 1: 'Old mobipocket encryption', + 2: 'Mobipocket encryption' + }.get(self.encryption_type_raw, repr(self.encryption_type_raw)) + self.unknown = self.raw[14:16] + + self.identifier = self.raw[16:20] + if self.identifier != b'MOBI': + raise ValueError('Identifier %r unknown'%self.identifier) + + self.length, = struct.unpack(b'>I', self.raw[20:24]) + self.type_raw, = struct.unpack(b'>I', self.raw[24:28]) + self.type = { + 2 : 'Mobipocket book', + 3 : 'PalmDOC book', + 4 : 'Audio', + 257 : 'News', + 258 : 'News Feed', + 259 : 'News magazine', + 513 : 'PICS', + 514 : 'Word', + 515 : 'XLS', + 516 : 'PPT', + 517 : 'TEXT', + 518 : 'HTML', + }.get(self.type_raw, repr(self.type_raw)) + + self.encoding_raw, = struct.unpack(b'>I', self.raw[28:32]) + self.encoding = { + 1252 : 'cp1252', + 65001: 'utf-8', + }.get(self.encoding_raw, repr(self.encoding_raw)) + self.uid = self.raw[32:36] + self.file_version, = struct.unpack(b'>I', self.raw[36:40]) + self.reserved = self.raw[40:48] + self.secondary_index_record, = struct.unpack(b'>I', self.raw[48:52]) + self.reserved2 = self.raw[52:80] + self.first_non_book_record, = struct.unpack(b'>I', self.raw[80:84]) + self.fullname_offset, = struct.unpack(b'>I', self.raw[84:88]) + self.fullname_length, = struct.unpack(b'>I', self.raw[88:92]) + self.locale_raw, = struct.unpack(b'>I', self.raw[92:96]) + langcode = self.locale_raw + langid = langcode & 0xFF + sublangid = (langcode >> 10) & 0xFF + self.language = main_language.get(langid, 'ENGLISH') + self.sublanguage = sub_language.get(sublangid, 'NEUTRAL') + + self.input_language = self.raw[96:100] + self.output_langauage = self.raw[100:104] + self.min_version, = struct.unpack(b'>I', self.raw[104:108]) + self.first_image_index, = struct.unpack(b'>I', self.raw[108:112]) + 
self.huffman_record_offset, = struct.unpack(b'>I', self.raw[112:116]) + self.huffman_record_count, = struct.unpack(b'>I', self.raw[116:120]) + self.datp_record_offset, = struct.unpack(b'>I', self.raw[120:124]) + self.datp_record_count, = struct.unpack(b'>I', self.raw[124:128]) + self.exth_flags, = struct.unpack(b'>I', self.raw[128:132]) + self.has_exth = bool(self.exth_flags & 0x40) + self.has_drm_data = self.length >= 174 and len(self.raw) >= 180 + if self.has_drm_data: + self.unknown3 = self.raw[132:164] + self.drm_offset, = struct.unpack(b'>I', self.raw[164:168]) + self.drm_count, = struct.unpack(b'>I', self.raw[168:172]) + self.drm_size, = struct.unpack(b'>I', self.raw[172:176]) + self.drm_flags = bin(struct.unpack(b'>I', self.raw[176:180])[0]) + self.has_extra_data_flags = self.length >= 232 and len(self.raw) >= 232+16 + self.has_fcis_flis = False + self.has_multibytes = self.has_indexing_bytes = self.has_uncrossable_breaks = False + self.extra_data_flags = 0 + if self.has_extra_data_flags: + self.unknown4 = self.raw[180:192] + self.first_content_record, self.last_content_record = \ + struct.unpack(b'>HH', self.raw[192:196]) + self.unknown5, = struct.unpack(b'>I', self.raw[196:200]) + (self.fcis_number, self.fcis_count, self.flis_number, + self.flis_count) = struct.unpack(b'>IIII', + self.raw[200:216]) + self.unknown6 = self.raw[216:224] + self.srcs_record_index = struct.unpack(b'>I', + self.raw[224:228])[0] + self.num_srcs_records = struct.unpack(b'>I', + self.raw[228:232])[0] + self.unknown7 = self.raw[232:240] + self.extra_data_flags = struct.unpack(b'>I', + self.raw[240:244])[0] + self.has_multibytes = bool(self.extra_data_flags & 0b1) + self.has_indexing_bytes = bool(self.extra_data_flags & 0b10) + self.has_uncrossable_breaks = bool(self.extra_data_flags & 0b100) + self.primary_index_record, = struct.unpack(b'>I', + self.raw[244:248]) + + if self.file_version >= 8: + (self.unknown8, self.skel_idx, self.sect_idx, self.oth_idx, + self.fdst_idx, 
self.fdst_count) = struct.unpack_from( + b'>LLLLLL', self.raw, 248) + self.unknown9 = self.raw[272:self.length] + + if self.has_exth: + self.exth_offset = 16 + self.length + + self.exth = EXTHHeader(self.raw[self.exth_offset:]) + + self.end_of_exth = self.exth_offset + self.exth.length + self.bytes_after_exth = self.raw[self.end_of_exth:self.fullname_offset] + + def __str__(self): + ans = ['*'*20 + ' MOBI Header '+ '*'*20] + a = ans.append + i = lambda d, x : a('%s (null value: %d): %d'%(d, NULL_INDEX, x)) + ans.append('Compression: %s'%self.compression) + ans.append('Unused: %r'%self.unused) + ans.append('Number of text records: %d'%self.number_of_text_records) + ans.append('Text record size: %d'%self.text_record_size) + ans.append('Encryption: %s'%self.encryption_type) + ans.append('Unknown: %r'%self.unknown) + ans.append('Identifier: %r'%self.identifier) + ans.append('Header length: %d'% self.length) + ans.append('Type: %s'%self.type) + ans.append('Encoding: %s'%self.encoding) + ans.append('UID: %r'%self.uid) + ans.append('File version: %d'%self.file_version) + ans.append('Reserved: %r'%self.reserved) + ans.append('Secondary index record: %d (null val: %d)'%( + self.secondary_index_record, NULL_INDEX)) + ans.append('Reserved2: %r'%self.reserved2) + ans.append('First non-book record (null value: %d): %d'%(NULL_INDEX, + self.first_non_book_record)) + ans.append('Full name offset: %d'%self.fullname_offset) + ans.append('Full name length: %d bytes'%self.fullname_length) + ans.append('Langcode: %r'%self.locale_raw) + ans.append('Language: %s'%self.language) + ans.append('Sub language: %s'%self.sublanguage) + ans.append('Input language: %r'%self.input_language) + ans.append('Output language: %r'%self.output_langauage) + ans.append('Min version: %d'%self.min_version) + ans.append('First Image index: %d'%self.first_image_index) + ans.append('Huffman record offset: %d'%self.huffman_record_offset) + ans.append('Huffman record count: %d'%self.huffman_record_count) + 
ans.append('DATP record offset: %r'%self.datp_record_offset) + ans.append('DATP record count: %r'%self.datp_record_count) + ans.append('EXTH flags: %s (%s)'%(bin(self.exth_flags)[2:], self.has_exth)) + if self.has_drm_data: + ans.append('Unknown3: %r'%self.unknown3) + ans.append('DRM Offset: %s'%self.drm_offset) + ans.append('DRM Count: %s'%self.drm_count) + ans.append('DRM Size: %s'%self.drm_size) + ans.append('DRM Flags: %r'%self.drm_flags) + if self.has_extra_data_flags: + ans.append('Unknown4: %r'%self.unknown4) + ans.append('First content record: %d'% self.first_content_record) + ans.append('Last content record: %d'% self.last_content_record) + ans.append('Unknown5: %d'% self.unknown5) + ans.append('FCIS number: %d'% self.fcis_number) + ans.append('FCIS count: %d'% self.fcis_count) + ans.append('FLIS number: %d'% self.flis_number) + ans.append('FLIS count: %d'% self.flis_count) + ans.append('Unknown6: %r'% self.unknown6) + ans.append('SRCS record index: %d'%self.srcs_record_index) + ans.append('Number of SRCS records?: %d'%self.num_srcs_records) + ans.append('Unknown7: %r'%self.unknown7) + ans.append(('Extra data flags: %s (has multibyte: %s) ' + '(has indexing: %s) (has uncrossable breaks: %s)')%( + bin(self.extra_data_flags), self.has_multibytes, + self.has_indexing_bytes, self.has_uncrossable_breaks )) + ans.append('Primary index record (null value: %d): %d'%(NULL_INDEX, + self.primary_index_record)) + if self.file_version >= 8: + ans.append('Unknown8: %r'%self.unknown8) + i('SKEL Index', self.skel_idx) + i('Sections Index', self.sect_idx) + i('Other Index', self.oth_idx) + i('FDST record', self.fdst_idx) + a('FDST Count: %d'%self.fdst_count) + if self.unknown9: + a('Unknown9: %r'%self.unknown9) + + ans = '\n'.join(ans) + + if self.has_exth: + ans += '\n\n' + str(self.exth) + ans += '\n\nBytes after EXTH (%d bytes): %s'%( + len(self.bytes_after_exth), + format_bytes(self.bytes_after_exth)) + + ans += '\nNumber of bytes after full name: %d' % (len(self.raw) 
- (self.fullname_offset + + self.fullname_length)) + + ans += '\nRecord 0 length: %d'%len(self.raw) + return ans +# }}} + +class MOBIFile(object): + + def __init__(self, stream): + self.raw = stream.read() + self.palmdb = PalmDB(self.raw[:78]) + + self.record_headers = [] + self.records = [] + for i in xrange(self.palmdb.number_of_records): + pos = 78 + i * 8 + offset, a1, a2, a3, a4 = struct.unpack(b'>LBBBB', self.raw[pos:pos+8]) + flags, val = a1, a2 << 16 | a3 << 8 | a4 + self.record_headers.append((offset, flags, val)) + + def section(section_number): + if section_number == self.palmdb.number_of_records - 1: + end_off = len(self.raw) + else: + end_off = self.record_headers[section_number + 1][0] + off = self.record_headers[section_number][0] + return self.raw[off:end_off] + + for i in range(self.palmdb.number_of_records): + self.records.append(Record(section(i), self.record_headers[i])) + + self.mobi_header = MOBIHeader(self.records[0]) + self.huffman_record_nums = [] + + if 'huff' in self.mobi_header.compression.lower(): + self.huffman_record_nums = list(xrange(self.mobi_header.huffman_record_offset, + self.mobi_header.huffman_record_offset + + self.mobi_header.huffman_record_count)) + huffrecs = [self.records[r].raw for r in self.huffman_record_nums] + from calibre.ebooks.mobi.huffcdic import HuffReader + huffs = HuffReader(huffrecs) + decompress = huffs.unpack + elif 'palmdoc' in self.mobi_header.compression.lower(): + from calibre.ebooks.compression.palmdoc import decompress_doc + decompress = decompress_doc + else: + decompress = lambda x: x + + self.decompress = decompress + + self.kf8_type = None + mh = self.mobi_header + if mh.file_version >= 8: + self.kf8_type = 'standalone' + elif mh.has_exth and mh.exth.kf8_header_index is not None: + self.kf8_type = 'joint' + + + diff --git a/src/calibre/ebooks/mobi/debug/main.py b/src/calibre/ebooks/mobi/debug/main.py new file mode 100644 index 0000000000..71844150f1 --- /dev/null +++ 
b/src/calibre/ebooks/mobi/debug/main.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2012, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import sys, os, shutil + +from calibre.ebooks.mobi.debug.headers import MOBIFile +from calibre.ebooks.mobi.debug.mobi6 import inspect_mobi as inspect_mobi6 + +def inspect_mobi(path_or_stream, ddir=None): # {{{ + stream = (path_or_stream if hasattr(path_or_stream, 'read') else + open(path_or_stream, 'rb')) + f = MOBIFile(stream) + if ddir is None: + ddir = 'decompiled_' + os.path.splitext(os.path.basename(stream.name))[0] + try: + shutil.rmtree(ddir) + except: + pass + os.makedirs(ddir) + if f.kf8_type is None: + inspect_mobi6(f, ddir) + elif f.kf8_type == 'joint': + p6 = os.path.join(ddir, 'mobi6') + inspect_mobi6(f, p6) + +# }}} + +def main(): + inspect_mobi(sys.argv[1]) + +if __name__ == '__main__': + main() + diff --git a/src/calibre/ebooks/mobi/debug.py b/src/calibre/ebooks/mobi/debug/mobi6.py similarity index 63% rename from src/calibre/ebooks/mobi/debug.py rename to src/calibre/ebooks/mobi/debug/mobi6.py index 35484d0b39..5f0eda4345 100644 --- a/src/calibre/ebooks/mobi/debug.py +++ b/src/calibre/ebooks/mobi/debug/mobi6.py @@ -7,403 +7,19 @@ __license__ = 'GPL v3' __copyright__ = '2011, Kovid Goyal ' __docformat__ = 'restructuredtext en' -import struct, datetime, sys, os, shutil +import struct, sys, os from collections import OrderedDict, defaultdict from lxml import html -from calibre.utils.date import utc_tz -from calibre.ebooks.mobi.langcodes import main_language, sub_language from calibre.ebooks.mobi.reader.headers import NULL_INDEX from calibre.ebooks.mobi.reader.index import (parse_index_record, parse_tagx_section) from calibre.ebooks.mobi.utils import (decode_hex_number, decint, get_trailing_data, decode_tbs, read_font_record) from 
calibre.utils.magick.draw import identify_data +from calibre.ebooks.mobi.debug import format_bytes -def format_bytes(byts): - byts = bytearray(byts) - byts = [hex(b)[2:] for b in byts] - return ' '.join(byts) - -# PalmDB {{{ -class PalmDOCAttributes(object): - - class Attr(object): - - def __init__(self, name, field, val): - self.name = name - self.val = val & field - - def __str__(self): - return '%s: %s'%(self.name, bool(self.val)) - - def __init__(self, raw): - self.val = struct.unpack(b'H', self.raw[34:36])[0] - - palm_epoch = datetime.datetime(1904, 1, 1, tzinfo=utc_tz) - self.creation_date_raw = struct.unpack(b'>I', self.raw[36:40])[0] - self.creation_date = (palm_epoch + - datetime.timedelta(seconds=self.creation_date_raw)) - self.modification_date_raw = struct.unpack(b'>I', self.raw[40:44])[0] - self.modification_date = (palm_epoch + - datetime.timedelta(seconds=self.modification_date_raw)) - self.last_backup_date_raw = struct.unpack(b'>I', self.raw[44:48])[0] - self.last_backup_date = (palm_epoch + - datetime.timedelta(seconds=self.last_backup_date_raw)) - self.modification_number = struct.unpack(b'>I', self.raw[48:52])[0] - self.app_info_id = self.raw[52:56] - self.sort_info_id = self.raw[56:60] - self.type = self.raw[60:64] - self.creator = self.raw[64:68] - self.ident = self.type + self.creator - if self.ident not in (b'BOOKMOBI', b'TEXTREAD'): - raise ValueError('Unknown book ident: %r'%self.ident) - self.last_record_uid, = struct.unpack(b'>I', self.raw[68:72]) - self.next_rec_list_id = self.raw[72:76] - - self.number_of_records, = struct.unpack(b'>H', self.raw[76:78]) - - def __str__(self): - ans = ['*'*20 + ' PalmDB Header '+ '*'*20] - ans.append('Name: %r'%self.name) - ans.append(str(self.attributes)) - ans.append('Version: %s'%self.version) - ans.append('Creation date: %s (%s)'%(self.creation_date.isoformat(), - self.creation_date_raw)) - ans.append('Modification date: %s (%s)'%(self.modification_date.isoformat(), - self.modification_date_raw)) - 
ans.append('Backup date: %s (%s)'%(self.last_backup_date.isoformat(), - self.last_backup_date_raw)) - ans.append('Modification number: %s'%self.modification_number) - ans.append('App Info ID: %r'%self.app_info_id) - ans.append('Sort Info ID: %r'%self.sort_info_id) - ans.append('Type: %r'%self.type) - ans.append('Creator: %r'%self.creator) - ans.append('Last record UID +1: %r'%self.last_record_uid) - ans.append('Next record list id: %r'%self.next_rec_list_id) - ans.append('Number of records: %s'%self.number_of_records) - - return '\n'.join(ans) -# }}} - -class Record(object): # {{{ - - def __init__(self, raw, header): - self.offset, self.flags, self.uid = header - self.raw = raw - - @property - def header(self): - return 'Offset: %d Flags: %d UID: %d First 4 bytes: %r Size: %d'%(self.offset, self.flags, - self.uid, self.raw[:4], len(self.raw)) -# }}} - -# EXTH {{{ -class EXTHRecord(object): - - def __init__(self, type_, data): - self.type = type_ - self.data = data - self.name = { - 1 : 'DRM Server id', - 2 : 'DRM Commerce id', - 3 : 'DRM ebookbase book id', - 100 : 'author', - 101 : 'publisher', - 102 : 'imprint', - 103 : 'description', - 104 : 'isbn', - 105 : 'subject', - 106 : 'publishingdate', - 107 : 'review', - 108 : 'contributor', - 109 : 'rights', - 110 : 'subjectcode', - 111 : 'type', - 112 : 'source', - 113 : 'asin', - 114 : 'versionnumber', - 115 : 'sample', - 116 : 'startreading', - 117 : 'adult', - 118 : 'retailprice', - 119 : 'retailpricecurrency', - 121 : 'KF8 header section index', - 125 : 'KF8 resources (images/fonts) count', - 129 : 'KF8 cover URI', - 131 : 'KF8 unknown count', - 201 : 'coveroffset', - 202 : 'thumboffset', - 203 : 'hasfakecover', - 204 : 'Creator Software', - 205 : 'Creator Major Version', # '>I' - 206 : 'Creator Minor Version', # '>I' - 207 : 'Creator Build Number', # '>I' - 208 : 'watermark', - 209 : 'tamper_proof_keys', - 300 : 'fontsignature', - 301 : 'clippinglimit', # percentage '>B' - 402 : 'publisherlimit', - 404 : 'TTS 
flag', # '>B' 1 - TTS disabled 0 - TTS enabled - 501 : 'cdetype', # 4 chars (PDOC or EBOK) - 502 : 'lastupdatetime', - 503 : 'updatedtitle', - }.get(self.type, repr(self.type)) - - if (self.name in {'coveroffset', 'thumboffset', 'hasfakecover', - 'Creator Major Version', 'Creator Minor Version', - 'Creator Build Number', 'Creator Software', 'startreading'} or - self.type in {121, 125, 131}): - self.data, = struct.unpack(b'>I', self.data) - - def __str__(self): - return '%s (%d): %r'%(self.name, self.type, self.data) - -class EXTHHeader(object): - - def __init__(self, raw): - self.raw = raw - if not self.raw.startswith(b'EXTH'): - raise ValueError('EXTH header does not start with EXTH') - self.length, = struct.unpack(b'>I', self.raw[4:8]) - self.count, = struct.unpack(b'>I', self.raw[8:12]) - - pos = 12 - self.records = [] - for i in xrange(self.count): - pos = self.read_record(pos) - self.records.sort(key=lambda x:x.type) - - def read_record(self, pos): - type_, length = struct.unpack(b'>II', self.raw[pos:pos+8]) - data = self.raw[(pos+8):(pos+length)] - self.records.append(EXTHRecord(type_, data)) - return pos + length - - def __str__(self): - ans = ['*'*20 + ' EXTH Header '+ '*'*20] - ans.append('EXTH header length: %d'%self.length) - ans.append('Number of EXTH records: %d'%self.count) - ans.append('EXTH records...') - for r in self.records: - ans.append(str(r)) - return '\n'.join(ans) -# }}} - -class MOBIHeader(object): # {{{ - - def __init__(self, record0): - self.raw = record0.raw - - self.compression_raw = self.raw[:2] - self.compression = {1: 'No compression', 2: 'PalmDoc compression', - 17480: 'HUFF/CDIC compression'}.get(struct.unpack(b'>H', - self.compression_raw)[0], - repr(self.compression_raw)) - self.unused = self.raw[2:4] - self.text_length, = struct.unpack(b'>I', self.raw[4:8]) - self.number_of_text_records, self.text_record_size = \ - struct.unpack(b'>HH', self.raw[8:12]) - self.encryption_type_raw, = struct.unpack(b'>H', self.raw[12:14]) - 
self.encryption_type = { - 0: 'No encryption', - 1: 'Old mobipocket encryption', - 2: 'Mobipocket encryption' - }.get(self.encryption_type_raw, repr(self.encryption_type_raw)) - self.unknown = self.raw[14:16] - - self.identifier = self.raw[16:20] - if self.identifier != b'MOBI': - raise ValueError('Identifier %r unknown'%self.identifier) - - self.length, = struct.unpack(b'>I', self.raw[20:24]) - self.type_raw, = struct.unpack(b'>I', self.raw[24:28]) - self.type = { - 2 : 'Mobipocket book', - 3 : 'PalmDOC book', - 4 : 'Audio', - 257 : 'News', - 258 : 'News Feed', - 259 : 'News magazine', - 513 : 'PICS', - 514 : 'Word', - 515 : 'XLS', - 516 : 'PPT', - 517 : 'TEXT', - 518 : 'HTML', - }.get(self.type_raw, repr(self.type_raw)) - - self.encoding_raw, = struct.unpack(b'>I', self.raw[28:32]) - self.encoding = { - 1252 : 'cp1252', - 65001: 'utf-8', - }.get(self.encoding_raw, repr(self.encoding_raw)) - self.uid = self.raw[32:36] - self.file_version = struct.unpack(b'>I', self.raw[36:40]) - self.reserved = self.raw[40:48] - self.secondary_index_record, = struct.unpack(b'>I', self.raw[48:52]) - self.reserved2 = self.raw[52:80] - self.first_non_book_record, = struct.unpack(b'>I', self.raw[80:84]) - self.fullname_offset, = struct.unpack(b'>I', self.raw[84:88]) - self.fullname_length, = struct.unpack(b'>I', self.raw[88:92]) - self.locale_raw, = struct.unpack(b'>I', self.raw[92:96]) - langcode = self.locale_raw - langid = langcode & 0xFF - sublangid = (langcode >> 10) & 0xFF - self.language = main_language.get(langid, 'ENGLISH') - self.sublanguage = sub_language.get(sublangid, 'NEUTRAL') - - self.input_language = self.raw[96:100] - self.output_langauage = self.raw[100:104] - self.min_version, = struct.unpack(b'>I', self.raw[104:108]) - self.first_image_index, = struct.unpack(b'>I', self.raw[108:112]) - self.huffman_record_offset, = struct.unpack(b'>I', self.raw[112:116]) - self.huffman_record_count, = struct.unpack(b'>I', self.raw[116:120]) - self.datp_record_offset, = 
struct.unpack(b'>I', self.raw[120:124]) - self.datp_record_count, = struct.unpack(b'>I', self.raw[124:128]) - self.exth_flags, = struct.unpack(b'>I', self.raw[128:132]) - self.has_exth = bool(self.exth_flags & 0x40) - self.has_drm_data = self.length >= 174 and len(self.raw) >= 180 - if self.has_drm_data: - self.unknown3 = self.raw[132:164] - self.drm_offset, = struct.unpack(b'>I', self.raw[164:168]) - self.drm_count, = struct.unpack(b'>I', self.raw[168:172]) - self.drm_size, = struct.unpack(b'>I', self.raw[172:176]) - self.drm_flags = bin(struct.unpack(b'>I', self.raw[176:180])[0]) - self.has_extra_data_flags = self.length >= 232 and len(self.raw) >= 232+16 - self.has_fcis_flis = False - self.has_multibytes = self.has_indexing_bytes = self.has_uncrossable_breaks = False - self.extra_data_flags = 0 - if self.has_extra_data_flags: - self.unknown4 = self.raw[180:192] - self.first_content_record, self.last_content_record = \ - struct.unpack(b'>HH', self.raw[192:196]) - self.unknown5, = struct.unpack(b'>I', self.raw[196:200]) - (self.fcis_number, self.fcis_count, self.flis_number, - self.flis_count) = struct.unpack(b'>IIII', - self.raw[200:216]) - self.unknown6 = self.raw[216:224] - self.srcs_record_index = struct.unpack(b'>I', - self.raw[224:228])[0] - self.num_srcs_records = struct.unpack(b'>I', - self.raw[228:232])[0] - self.unknown7 = self.raw[232:240] - self.extra_data_flags = struct.unpack(b'>I', - self.raw[240:244])[0] - self.has_multibytes = bool(self.extra_data_flags & 0b1) - self.has_indexing_bytes = bool(self.extra_data_flags & 0b10) - self.has_uncrossable_breaks = bool(self.extra_data_flags & 0b100) - self.primary_index_record, = struct.unpack(b'>I', - self.raw[244:248]) - - if self.has_exth: - self.exth_offset = 16 + self.length - - self.exth = EXTHHeader(self.raw[self.exth_offset:]) - - self.end_of_exth = self.exth_offset + self.exth.length - self.bytes_after_exth = self.raw[self.end_of_exth:self.fullname_offset] - - def __str__(self): - ans = ['*'*20 + ' 
MOBI Header '+ '*'*20] - ans.append('Compression: %s'%self.compression) - ans.append('Unused: %r'%self.unused) - ans.append('Number of text records: %d'%self.number_of_text_records) - ans.append('Text record size: %d'%self.text_record_size) - ans.append('Encryption: %s'%self.encryption_type) - ans.append('Unknown: %r'%self.unknown) - ans.append('Identifier: %r'%self.identifier) - ans.append('Header length: %d'% self.length) - ans.append('Type: %s'%self.type) - ans.append('Encoding: %s'%self.encoding) - ans.append('UID: %r'%self.uid) - ans.append('File version: %d'%self.file_version) - ans.append('Reserved: %r'%self.reserved) - ans.append('Secondary index record: %d (null val: %d)'%( - self.secondary_index_record, NULL_INDEX)) - ans.append('Reserved2: %r'%self.reserved2) - ans.append('First non-book record (null value: %d): %d'%(NULL_INDEX, - self.first_non_book_record)) - ans.append('Full name offset: %d'%self.fullname_offset) - ans.append('Full name length: %d bytes'%self.fullname_length) - ans.append('Langcode: %r'%self.locale_raw) - ans.append('Language: %s'%self.language) - ans.append('Sub language: %s'%self.sublanguage) - ans.append('Input language: %r'%self.input_language) - ans.append('Output language: %r'%self.output_langauage) - ans.append('Min version: %d'%self.min_version) - ans.append('First Image index: %d'%self.first_image_index) - ans.append('Huffman record offset: %d'%self.huffman_record_offset) - ans.append('Huffman record count: %d'%self.huffman_record_count) - ans.append('DATP record offset: %r'%self.datp_record_offset) - ans.append('DATP record count: %r'%self.datp_record_count) - ans.append('EXTH flags: %s (%s)'%(bin(self.exth_flags)[2:], self.has_exth)) - if self.has_drm_data: - ans.append('Unknown3: %r'%self.unknown3) - ans.append('DRM Offset: %s'%self.drm_offset) - ans.append('DRM Count: %s'%self.drm_count) - ans.append('DRM Size: %s'%self.drm_size) - ans.append('DRM Flags: %r'%self.drm_flags) - if self.has_extra_data_flags: - 
ans.append('Unknown4: %r'%self.unknown4) - ans.append('First content record: %d'% self.first_content_record) - ans.append('Last content record: %d'% self.last_content_record) - ans.append('Unknown5: %d'% self.unknown5) - ans.append('FCIS number: %d'% self.fcis_number) - ans.append('FCIS count: %d'% self.fcis_count) - ans.append('FLIS number: %d'% self.flis_number) - ans.append('FLIS count: %d'% self.flis_count) - ans.append('Unknown6: %r'% self.unknown6) - ans.append('SRCS record index: %d'%self.srcs_record_index) - ans.append('Number of SRCS records?: %d'%self.num_srcs_records) - ans.append('Unknown7: %r'%self.unknown7) - ans.append(('Extra data flags: %s (has multibyte: %s) ' - '(has indexing: %s) (has uncrossable breaks: %s)')%( - bin(self.extra_data_flags), self.has_multibytes, - self.has_indexing_bytes, self.has_uncrossable_breaks )) - ans.append('Primary index record (null value: %d): %d'%(NULL_INDEX, - self.primary_index_record)) - - ans = '\n'.join(ans) - - if self.has_exth: - ans += '\n\n' + str(self.exth) - ans += '\n\nBytes after EXTH (%d bytes): %s'%( - len(self.bytes_after_exth), - format_bytes(self.bytes_after_exth)) - - ans += '\nNumber of bytes after full name: %d' % (len(self.raw) - (self.fullname_offset + - self.fullname_length)) - - ans += '\nRecord 0 length: %d'%len(self.raw) - return ans -# }}} class TagX(object): # {{{ @@ -1130,46 +746,10 @@ class TBSIndexing(object): # {{{ class MOBIFile(object): # {{{ - def __init__(self, stream): - self.raw = stream.read() - - self.palmdb = PalmDB(self.raw[:78]) - - self.record_headers = [] - self.records = [] - for i in xrange(self.palmdb.number_of_records): - pos = 78 + i * 8 - offset, a1, a2, a3, a4 = struct.unpack(b'>LBBBB', self.raw[pos:pos+8]) - flags, val = a1, a2 << 16 | a3 << 8 | a4 - self.record_headers.append((offset, flags, val)) - - def section(section_number): - if section_number == self.palmdb.number_of_records - 1: - end_off = len(self.raw) - else: - end_off = 
self.record_headers[section_number + 1][0] - off = self.record_headers[section_number][0] - return self.raw[off:end_off] - - for i in range(self.palmdb.number_of_records): - self.records.append(Record(section(i), self.record_headers[i])) - - self.mobi_header = MOBIHeader(self.records[0]) - self.huffman_record_nums = [] - - if 'huff' in self.mobi_header.compression.lower(): - self.huffman_record_nums = list(xrange(self.mobi_header.huffman_record_offset, - self.mobi_header.huffman_record_offset + - self.mobi_header.huffman_record_count)) - huffrecs = [self.records[r].raw for r in self.huffman_record_nums] - from calibre.ebooks.mobi.huffcdic import HuffReader - huffs = HuffReader(huffrecs) - decompress = huffs.unpack - elif 'palmdoc' in self.mobi_header.compression.lower(): - from calibre.ebooks.compression.palmdoc import decompress_doc - decompress = decompress_doc - else: - decompress = lambda x: x + def __init__(self, mf): + for x in ('raw', 'palmdb', 'record_headers', 'records', 'mobi_header', + 'huffman_record_nums',): + setattr(self, x, getattr(mf, x)) self.index_header = self.index_record = None self.indexing_record_nums = set() @@ -1201,7 +781,7 @@ class MOBIFile(object): # {{{ if fntbr == NULL_INDEX: fntbr = len(self.records) self.text_records = [TextRecord(r, self.records[r], - self.mobi_header.extra_data_flags, decompress) for r in xrange(1, + self.mobi_header.extra_data_flags, mf.decompress) for r in xrange(1, min(len(self.records), ntr+1))] self.image_records, self.binary_records = [], [] self.font_records = [] @@ -1241,17 +821,8 @@ class MOBIFile(object): # {{{ print (str(self.mobi_header).encode('utf-8'), file=f) # }}} -def inspect_mobi(path_or_stream, ddir=None): # {{{ - stream = (path_or_stream if hasattr(path_or_stream, 'read') else - open(path_or_stream, 'rb')) - f = MOBIFile(stream) - if ddir is None: - ddir = 'decompiled_' + os.path.splitext(os.path.basename(stream.name))[0] - try: - shutil.rmtree(ddir) - except: - pass - os.makedirs(ddir) +def 
inspect_mobi(mobi_file, ddir): + f = MOBIFile(mobi_file) with open(os.path.join(ddir, 'header.txt'), 'wb') as out: f.print_header(f=out) @@ -1299,9 +870,4 @@ def inspect_mobi(path_or_stream, ddir=None): # {{{ # }}} -def main(): - inspect_mobi(sys.argv[1]) - -if __name__ == '__main__': - main() From 0479f31a5f072a8c02661c373391eaf498dc1209 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 17 Mar 2012 23:14:55 +0530 Subject: [PATCH 12/27] KF8 Input: Fix some links pointing a little above or below their intended target when viewing or converting KF8 files --- src/calibre/ebooks/mobi/reader/markup.py | 3 ++- src/calibre/ebooks/mobi/reader/mobi8.py | 28 ++++++++++++++---------- 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/src/calibre/ebooks/mobi/reader/markup.py b/src/calibre/ebooks/mobi/reader/markup.py index 721de28ff4..8bb7f211f3 100644 --- a/src/calibre/ebooks/mobi/reader/markup.py +++ b/src/calibre/ebooks/mobi/reader/markup.py @@ -36,7 +36,8 @@ def update_internal_links(mobi8_reader): filename, idtag = mr.get_id_tag_by_pos_fid(int(posfid, 32), int(offset, 32)) suffix = (b'#' + idtag) if idtag else b'' - replacement = filename.encode(mr.header.codec) + suffix + replacement = filename.split('/')[-1].encode( + mr.header.codec) + suffix tag = posfid_index_pattern.sub(replacement, tag, 1) srcpieces[j] = tag part = ''.join([x.decode(mr.header.codec) for x in srcpieces]) diff --git a/src/calibre/ebooks/mobi/reader/mobi8.py b/src/calibre/ebooks/mobi/reader/mobi8.py index 5105e20f0b..ec7166ebb0 100644 --- a/src/calibre/ebooks/mobi/reader/mobi8.py +++ b/src/calibre/ebooks/mobi/reader/mobi8.py @@ -233,7 +233,6 @@ class Mobi8Reader(object): insertpos, idtext, filenum, seqnm, startpos, length = self.elems[posfid] pos = insertpos + offset fi = self.get_file_info(pos) - fname = fi.filename # an existing "id=" must exist in original xhtml otherwise it would not # have worked for linking. 
Amazon seems to have added its own # additional "aid=" inside tags whose contents seem to represent some @@ -242,7 +241,7 @@ class Mobi8Reader(object): # so find the closest "id=" before position the file by actually # searching in that file idtext = self.get_id_tag(pos) - return fname, idtext + return '%s/%s'%(fi.type, fi.filename), idtext def get_id_tag(self, pos): # find the correct tag by actually searching in the destination @@ -253,12 +252,13 @@ class Mobi8Reader(object): textblock = self.parts[fi.num] id_map = [] npos = pos - fi.start - # if npos inside a tag then search all text before the its end of tag - # marker pgt = textblock.find(b'>', npos) plt = textblock.find(b'<', npos) - if pgt < plt: + # if npos inside a tag then search all text before the its end of tag marker + # else not in a tag need to search the preceding tag + if plt == npos or pgt < plt: npos = pgt + 1 + textblock = textblock[0:npos] # find id links only inside of tags # inside any < > pair find all "id=' and return whatever is inside # the quotes @@ -315,12 +315,18 @@ class Mobi8Reader(object): # Add href and anchor info to the index entries for entry in index_entries: - pos = entry['pos'] - fi = self.get_file_info(pos) - if fi.filename is None: - raise ValueError('Index entry has invalid pos: %d'%pos) - idtag = self.get_id_tag(pos).decode(self.header.codec) - entry['href'] = '%s/%s'%(fi.type, fi.filename) + pos_fid = entry['pos_fid'] + if pos_fid is None: + pos = entry['pos'] + fi = self.get_file_info(pos) + if fi.filename is None: + raise ValueError('Index entry has invalid pos: %d'%pos) + idtag = self.get_id_tag(pos).decode(self.header.codec) + href = '%s/%s'%(fi.type, fi.filename) + else: + href, idtag = self.get_id_tag_by_pos_fid(*pos_fid) + + entry['href'] = href entry['idtag'] = idtag # Build the TOC object From b6d02adfe3a52ebb21e602c73a51c82f976ba37f Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 17 Mar 2012 23:38:09 +0530 Subject: [PATCH 13/27] Update FHM UK --- 
recipes/fhm_uk.recipe | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/recipes/fhm_uk.recipe b/recipes/fhm_uk.recipe index ab271ad753..0e2d5c1ebe 100644 --- a/recipes/fhm_uk.recipe +++ b/recipes/fhm_uk.recipe @@ -3,10 +3,11 @@ from calibre.web.feeds.news import BasicNewsRecipe class AdvancedUserRecipe1325006965(BasicNewsRecipe): title = u'FHM UK' description = 'Good News for Men' - cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/373529_38324934806_64930243_n.jpg' + cover_url = 'http://www.greatmagazines.co.uk/covers/large/w197/current/fhm.jpg' + # cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/373529_38324934806_64930243_n.jpg' masthead_url = 'http://www.fhm.com/App_Resources/Images/Site/re-design/logo.gif' __author__ = 'Dave Asbury' - # last updated 27/1/12 + # last updated 17/3/12 language = 'en_GB' oldest_article = 28 max_articles_per_feed = 12 @@ -29,6 +30,8 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe): feeds = [ (u'From the Homepage',u'http://feed43.com/8053226782885416.xml'), (u'Funny - The Very Best Of The Internet',u'http://feed43.com/4538510106331565.xml'), - (u'The Final Countdown', u'http://feed43.com/3576106158530118.xml'), - (u'Gaming',u'http://feed43.com/0755006465351035.xml'), - ] + (u'Upgrade',u'http://feed43.com/0877305847443234.xml'), + #(u'The Final Countdown', u'http://feed43.com/3576106158530118.xml'), + #(u'Gaming',u'http://feed43.com/0755006465351035.xml'), + (u'Gaming',u'http://feed43.com/6537162612465672.xml'), + ] From 06f3a1868463710d019f3878441fc3445fc2458b Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 18 Mar 2012 12:15:37 +0530 Subject: [PATCH 14/27] Some progress on KF8 support in inspect MOBI --- src/calibre/ebooks/mobi/debug/headers.py | 124 +++++++++++++++------- src/calibre/ebooks/mobi/debug/main.py | 9 ++ src/calibre/ebooks/mobi/debug/mobi6.py | 48 ++------- src/calibre/ebooks/mobi/reader/headers.py | 12 +-- 4 files changed, 106 insertions(+), 87 deletions(-) 
diff --git a/src/calibre/ebooks/mobi/debug/headers.py b/src/calibre/ebooks/mobi/debug/headers.py index 7965253be6..06318c4527 100644 --- a/src/calibre/ebooks/mobi/debug/headers.py +++ b/src/calibre/ebooks/mobi/debug/headers.py @@ -7,12 +7,13 @@ __license__ = 'GPL v3' __copyright__ = '2012, Kovid Goyal ' __docformat__ = 'restructuredtext en' -import struct, datetime +import struct, datetime, os from calibre.utils.date import utc_tz from calibre.ebooks.mobi.reader.headers import NULL_INDEX from calibre.ebooks.mobi.langcodes import main_language, sub_language from calibre.ebooks.mobi.debug import format_bytes +from calibre.ebooks.mobi.utils import get_trailing_data # PalmDB {{{ class PalmDOCAttributes(object): @@ -188,10 +189,13 @@ class EXTHHeader(object): pos = self.read_record(pos) self.records.sort(key=lambda x:x.type) self.rmap = {x.type:x for x in self.records} - self.get = self.rmap.get def __getitem__(self, type_): - return self.rmap.__getitem__(type_) + return self.rmap.__getitem__(type_).data + + def get(self, type_, default=None): + ans = self.rmap.get(type_, default) + return getattr(ans, 'data', default) def read_record(self, pos): type_, length = struct.unpack(b'>II', self.raw[pos:pos+8]) @@ -201,7 +205,7 @@ class EXTHHeader(object): @property def kf8_header_index(self): - return self.rmap.get(121, None) + return self.get(121, None) def __str__(self): ans = ['*'*20 + ' EXTH Header '+ '*'*20] @@ -263,9 +267,10 @@ class MOBIHeader(object): # {{{ }.get(self.encoding_raw, repr(self.encoding_raw)) self.uid = self.raw[32:36] self.file_version, = struct.unpack(b'>I', self.raw[36:40]) - self.reserved = self.raw[40:48] + self.meta_orth_indx, self.meta_infl_indx = struct.unpack( + b'>II', self.raw[40:48]) self.secondary_index_record, = struct.unpack(b'>I', self.raw[48:52]) - self.reserved2 = self.raw[52:80] + self.reserved = self.raw[52:80] self.first_non_book_record, = struct.unpack(b'>I', self.raw[80:84]) self.fullname_offset, = struct.unpack(b'>I', 
self.raw[84:88]) self.fullname_length, = struct.unpack(b'>I', self.raw[88:92]) @@ -299,9 +304,8 @@ class MOBIHeader(object): # {{{ self.extra_data_flags = 0 if self.has_extra_data_flags: self.unknown4 = self.raw[180:192] - self.first_content_record, self.last_content_record = \ - struct.unpack(b'>HH', self.raw[192:196]) - self.unknown5, = struct.unpack(b'>I', self.raw[196:200]) + self.fdst_idx, self.fdst_count = struct.unpack_from(b'>II', + self.raw, 192) (self.fcis_number, self.fcis_count, self.flis_number, self.flis_count) = struct.unpack(b'>IIII', self.raw[200:216]) @@ -320,10 +324,9 @@ class MOBIHeader(object): # {{{ self.raw[244:248]) if self.file_version >= 8: - (self.unknown8, self.skel_idx, self.sect_idx, self.oth_idx, - self.fdst_idx, self.fdst_count) = struct.unpack_from( - b'>LLLLLL', self.raw, 248) - self.unknown9 = self.raw[272:self.length] + (self.sect_idx, self.skel_idx, self.datp_idx, self.oth_idx + ) = struct.unpack_from(b'>4L', self.raw, 248) + self.unknown9 = self.raw[264:self.length] if self.has_exth: self.exth_offset = 16 + self.length @@ -334,7 +337,7 @@ class MOBIHeader(object): # {{{ self.bytes_after_exth = self.raw[self.end_of_exth:self.fullname_offset] def __str__(self): - ans = ['*'*20 + ' MOBI Header '+ '*'*20] + ans = ['*'*20 + ' MOBI %d Header '%self.file_version+ '*'*20] a = ans.append i = lambda d, x : a('%s (null value: %d): %d'%(d, NULL_INDEX, x)) ans.append('Compression: %s'%self.compression) @@ -349,10 +352,11 @@ class MOBIHeader(object): # {{{ ans.append('Encoding: %s'%self.encoding) ans.append('UID: %r'%self.uid) ans.append('File version: %d'%self.file_version) - ans.append('Reserved: %r'%self.reserved) + ans.append('Meta Orth Index: %d'%self.meta_orth_indx) + ans.append('Meta Infl Index: %d'%self.meta_infl_indx) ans.append('Secondary index record: %d (null val: %d)'%( self.secondary_index_record, NULL_INDEX)) - ans.append('Reserved2: %r'%self.reserved2) + ans.append('Reserved: %r'%self.reserved) ans.append('First non-book 
record (null value: %d): %d'%(NULL_INDEX, self.first_non_book_record)) ans.append('Full name offset: %d'%self.fullname_offset) @@ -377,9 +381,8 @@ class MOBIHeader(object): # {{{ ans.append('DRM Flags: %r'%self.drm_flags) if self.has_extra_data_flags: ans.append('Unknown4: %r'%self.unknown4) - ans.append('First content record: %d'% self.first_content_record) - ans.append('Last content record: %d'% self.last_content_record) - ans.append('Unknown5: %d'% self.unknown5) + ans.append('FDST Index: %d'% self.fdst_idx) + ans.append('FDST Count: %d'% self.fdst_count) ans.append('FCIS number: %d'% self.fcis_number) ans.append('FCIS count: %d'% self.fcis_count) ans.append('FLIS number: %d'% self.flis_number) @@ -398,6 +401,7 @@ class MOBIHeader(object): # {{{ ans.append('Unknown8: %r'%self.unknown8) i('SKEL Index', self.skel_idx) i('Sections Index', self.sect_idx) + i('Unknown8', self.unknown8) i('Other Index', self.oth_idx) i('FDST record', self.fdst_idx) a('FDST Count: %d'%self.fdst_count) @@ -447,28 +451,74 @@ class MOBIFile(object): self.mobi_header = MOBIHeader(self.records[0]) self.huffman_record_nums = [] - if 'huff' in self.mobi_header.compression.lower(): - self.huffman_record_nums = list(xrange(self.mobi_header.huffman_record_offset, - self.mobi_header.huffman_record_offset + - self.mobi_header.huffman_record_count)) - huffrecs = [self.records[r].raw for r in self.huffman_record_nums] - from calibre.ebooks.mobi.huffcdic import HuffReader - huffs = HuffReader(huffrecs) - decompress = huffs.unpack - elif 'palmdoc' in self.mobi_header.compression.lower(): - from calibre.ebooks.compression.palmdoc import decompress_doc - decompress = decompress_doc - else: - decompress = lambda x: x - - self.decompress = decompress - self.kf8_type = None - mh = self.mobi_header + mh = mh8 = self.mobi_header if mh.file_version >= 8: self.kf8_type = 'standalone' elif mh.has_exth and mh.exth.kf8_header_index is not None: self.kf8_type = 'joint' - + kf8i = mh.exth.kf8_header_index + mh8 = 
MOBIHeader(self.records[kf8i]) + self.mobi8_header = mh8 + + if 'huff' in self.mobi_header.compression.lower(): + from calibre.ebooks.mobi.huffcdic import HuffReader + + def huffit(off, cnt): + huffman_record_nums = list(xrange(off, off+cnt)) + huffrecs = [self.records[r].raw for r in huffman_record_nums] + huffs = HuffReader(huffrecs) + return huffman_record_nums, huffs.unpack + + if self.kf8_type == 'joint': + recs6, d6 = huffit(mh.huffman_record_offset, + mh.huffman_record_count) + recs8, d8 = huffit(mh8.huffman_record_offset + kf8i, + mh8.huffman_record_count) + self.huffman_record_nums = recs6 + recs8 + else: + self.huffman_record_nums, d6 = huffit(mh.huffman_record_offset, + mh.huffman_record_count) + d8 = d6 + elif 'palmdoc' in self.mobi_header.compression.lower(): + from calibre.ebooks.compression.palmdoc import decompress_doc + d8 = d6 = decompress_doc + else: + d8 = d6 = lambda x: x + + self.decompress6, self.decompress8 = d6, d8 + +class TextRecord(object): # {{{ + + def __init__(self, idx, record, extra_data_flags, decompress): + self.trailing_data, self.raw = get_trailing_data(record.raw, extra_data_flags) + raw_trailing_bytes = record.raw[len(self.raw):] + self.raw = decompress(self.raw) + + if 0 in self.trailing_data: + self.trailing_data['multibyte_overlap'] = self.trailing_data.pop(0) + if 1 in self.trailing_data: + self.trailing_data['indexing'] = self.trailing_data.pop(1) + if 2 in self.trailing_data: + self.trailing_data['uncrossable_breaks'] = self.trailing_data.pop(2) + self.trailing_data['raw_bytes'] = raw_trailing_bytes + + for typ, val in self.trailing_data.iteritems(): + if isinstance(typ, int): + print ('Record %d has unknown trailing data of type: %d : %r'% + (idx, typ, val)) + + self.idx = idx + + def dump(self, folder): + name = '%06d'%self.idx + with open(os.path.join(folder, name+'.txt'), 'wb') as f: + f.write(self.raw) + with open(os.path.join(folder, name+'.trailing_data'), 'wb') as f: + for k, v in self.trailing_data.iteritems(): 
+ raw = '%s : %r\n\n'%(k, v) + f.write(raw.encode('utf-8')) + +# }}} diff --git a/src/calibre/ebooks/mobi/debug/main.py b/src/calibre/ebooks/mobi/debug/main.py index 71844150f1..624da65846 100644 --- a/src/calibre/ebooks/mobi/debug/main.py +++ b/src/calibre/ebooks/mobi/debug/main.py @@ -11,6 +11,7 @@ import sys, os, shutil from calibre.ebooks.mobi.debug.headers import MOBIFile from calibre.ebooks.mobi.debug.mobi6 import inspect_mobi as inspect_mobi6 +from calibre.ebooks.mobi.debug.mobi8 import inspect_mobi as inspect_mobi8 def inspect_mobi(path_or_stream, ddir=None): # {{{ stream = (path_or_stream if hasattr(path_or_stream, 'read') else @@ -27,7 +28,15 @@ def inspect_mobi(path_or_stream, ddir=None): # {{{ inspect_mobi6(f, ddir) elif f.kf8_type == 'joint': p6 = os.path.join(ddir, 'mobi6') + os.mkdir(p6) inspect_mobi6(f, p6) + p8 = os.path.join(ddir, 'mobi8') + os.mkdir(p8) + inspect_mobi8(f, p8) + else: + inspect_mobi8(f, ddir) + + print ('Debug data saved to:', ddir) # }}} diff --git a/src/calibre/ebooks/mobi/debug/mobi6.py b/src/calibre/ebooks/mobi/debug/mobi6.py index 5f0eda4345..640f58c661 100644 --- a/src/calibre/ebooks/mobi/debug/mobi6.py +++ b/src/calibre/ebooks/mobi/debug/mobi6.py @@ -16,9 +16,10 @@ from calibre.ebooks.mobi.reader.headers import NULL_INDEX from calibre.ebooks.mobi.reader.index import (parse_index_record, parse_tagx_section) from calibre.ebooks.mobi.utils import (decode_hex_number, decint, - get_trailing_data, decode_tbs, read_font_record) + decode_tbs, read_font_record) from calibre.utils.magick.draw import identify_data from calibre.ebooks.mobi.debug import format_bytes +from calibre.ebooks.mobi.debug.headers import TextRecord class TagX(object): # {{{ @@ -472,39 +473,6 @@ class CNCX(object): # {{{ # }}} -class TextRecord(object): # {{{ - - def __init__(self, idx, record, extra_data_flags, decompress): - self.trailing_data, self.raw = get_trailing_data(record.raw, extra_data_flags) - raw_trailing_bytes = record.raw[len(self.raw):] - 
self.raw = decompress(self.raw) - - if 0 in self.trailing_data: - self.trailing_data['multibyte_overlap'] = self.trailing_data.pop(0) - if 1 in self.trailing_data: - self.trailing_data['indexing'] = self.trailing_data.pop(1) - if 2 in self.trailing_data: - self.trailing_data['uncrossable_breaks'] = self.trailing_data.pop(2) - self.trailing_data['raw_bytes'] = raw_trailing_bytes - - for typ, val in self.trailing_data.iteritems(): - if isinstance(typ, int): - print ('Record %d has unknown trailing data of type: %d : %r'% - (idx, typ, val)) - - self.idx = idx - - def dump(self, folder): - name = '%06d'%self.idx - with open(os.path.join(folder, name+'.txt'), 'wb') as f: - f.write(self.raw) - with open(os.path.join(folder, name+'.trailing_data'), 'wb') as f: - for k, v in self.trailing_data.iteritems(): - raw = '%s : %r\n\n'%(k, v) - f.write(raw.encode('utf-8')) - -# }}} - class ImageRecord(object): # {{{ def __init__(self, idx, record, fmt): @@ -781,7 +749,7 @@ class MOBIFile(object): # {{{ if fntbr == NULL_INDEX: fntbr = len(self.records) self.text_records = [TextRecord(r, self.records[r], - self.mobi_header.extra_data_flags, mf.decompress) for r in xrange(1, + self.mobi_header.extra_data_flags, mf.decompress6) for r in xrange(1, min(len(self.records), ntr+1))] self.image_records, self.binary_records = [], [] self.font_records = [] @@ -833,12 +801,11 @@ def inspect_mobi(mobi_file, ddir): of.write(rec.raw) alltext += rec.raw of.seek(0) - if f.mobi_header.file_version < 8: - root = html.fromstring(alltext.decode('utf-8')) - with open(os.path.join(ddir, 'pretty.html'), 'wb') as of: - of.write(html.tostring(root, pretty_print=True, encoding='utf-8', - include_meta_content_type=True)) + root = html.fromstring(alltext.decode('utf-8')) + with open(os.path.join(ddir, 'pretty.html'), 'wb') as of: + of.write(html.tostring(root, pretty_print=True, encoding='utf-8', + include_meta_content_type=True)) if f.index_header is not None: f.index_record.alltext = alltext @@ -866,7 +833,6 
@@ def inspect_mobi(mobi_file, ddir): rec.dump(tdir) - print ('Debug data saved to:', ddir) # }}} diff --git a/src/calibre/ebooks/mobi/reader/headers.py b/src/calibre/ebooks/mobi/reader/headers.py index eaad81730d..db2b07e53a 100644 --- a/src/calibre/ebooks/mobi/reader/headers.py +++ b/src/calibre/ebooks/mobi/reader/headers.py @@ -187,19 +187,13 @@ class BookHeader(object): self.ncxidx, = struct.unpack_from(b'>L', raw, 0xF4) if self.mobi_version >= 8: - self.skelidx, = struct.unpack_from('>L', raw, 0xFC) - - # Index into
sections in raw_ml - self.dividx, = struct.unpack_from('>L', raw, 0xF8) - - # Index into Other files - self.othidx, = struct.unpack_from('>L', raw, 0x104) + self.dividx, self.skelidx, self.datpidx, self.othidx = \ + struct.unpack_from(b'>4L', raw, 0xF8) # need to use the FDST record to find out how to properly # unpack the raw_ml into pieces it is simply a table of start # and end locations for each flow piece - self.fdstidx, = struct.unpack_from('>L', raw, 0xC0) - self.fdstcnt, = struct.unpack_from('>L', raw, 0xC4) + self.fdstidx, self.fdstcnt = struct.unpack_from(b'>2L', raw, 0xC0) # if cnt is 1 or less, fdst section number can be garbage if self.fdstcnt <= 1: self.fdstidx = NULL_INDEX From 43cf8faebc59b94b5965c14b829b9694f8c15c0b Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 18 Mar 2012 12:48:01 +0530 Subject: [PATCH 15/27] ... --- src/calibre/ebooks/mobi/debug/headers.py | 33 ++++++++++++++++-------- src/calibre/ebooks/mobi/reader/mobi6.py | 9 +++++-- 2 files changed, 29 insertions(+), 13 deletions(-) diff --git a/src/calibre/ebooks/mobi/debug/headers.py b/src/calibre/ebooks/mobi/debug/headers.py index 06318c4527..2cc7954559 100644 --- a/src/calibre/ebooks/mobi/debug/headers.py +++ b/src/calibre/ebooks/mobi/debug/headers.py @@ -219,8 +219,9 @@ class EXTHHeader(object): class MOBIHeader(object): # {{{ - def __init__(self, record0): + def __init__(self, record0, offset): self.raw = record0.raw + self.header_offset = offset self.compression_raw = self.raw[:2] self.compression = {1: 'No compression', 2: 'PalmDoc compression', @@ -327,6 +328,19 @@ class MOBIHeader(object): # {{{ (self.sect_idx, self.skel_idx, self.datp_idx, self.oth_idx ) = struct.unpack_from(b'>4L', self.raw, 248) self.unknown9 = self.raw[264:self.length] + if self.meta_orth_indx != self.sect_idx: + raise ValueError('KF8 header has different Meta orth and ' + 'section indices') + + # The following are all relative to the position of the header record + # make them absolute for ease of 
debugging + for x in ('sect_idx', 'skel_idx', 'datp_idx', 'oth_idx', + 'meta_orth_indx', 'huffman_record_offset', + 'first_non_book_record', 'datp_record_offset', 'fcis_number', + 'flis_number', 'primary_index_record', 'fdst_idx', + 'first_image_index'): + if hasattr(self, x): + setattr(self, x, self.header_offset+getattr(self, x)) if self.has_exth: self.exth_offset = 16 + self.length @@ -352,8 +366,8 @@ class MOBIHeader(object): # {{{ ans.append('Encoding: %s'%self.encoding) ans.append('UID: %r'%self.uid) ans.append('File version: %d'%self.file_version) - ans.append('Meta Orth Index: %d'%self.meta_orth_indx) - ans.append('Meta Infl Index: %d'%self.meta_infl_indx) + i('Meta Orth Index (Sections index in KF8)', self.meta_orth_indx) + i('Meta Infl Index', self.meta_infl_indx) ans.append('Secondary index record: %d (null val: %d)'%( self.secondary_index_record, NULL_INDEX)) ans.append('Reserved: %r'%self.reserved) @@ -398,13 +412,10 @@ class MOBIHeader(object): # {{{ ans.append('Primary index record (null value: %d): %d'%(NULL_INDEX, self.primary_index_record)) if self.file_version >= 8: - ans.append('Unknown8: %r'%self.unknown8) - i('SKEL Index', self.skel_idx) i('Sections Index', self.sect_idx) - i('Unknown8', self.unknown8) + i('SKEL Index', self.skel_idx) + i('DATP Index', self.datp_idx) i('Other Index', self.oth_idx) - i('FDST record', self.fdst_idx) - a('FDST Count: %d'%self.fdst_count) if self.unknown9: a('Unknown9: %r'%self.unknown9) @@ -448,7 +459,7 @@ class MOBIFile(object): for i in range(self.palmdb.number_of_records): self.records.append(Record(section(i), self.record_headers[i])) - self.mobi_header = MOBIHeader(self.records[0]) + self.mobi_header = MOBIHeader(self.records[0], 0) self.huffman_record_nums = [] self.kf8_type = None @@ -458,7 +469,7 @@ class MOBIFile(object): elif mh.has_exth and mh.exth.kf8_header_index is not None: self.kf8_type = 'joint' kf8i = mh.exth.kf8_header_index - mh8 = MOBIHeader(self.records[kf8i]) + mh8 = 
MOBIHeader(self.records[kf8i], kf8i) self.mobi8_header = mh8 if 'huff' in self.mobi_header.compression.lower(): @@ -473,7 +484,7 @@ class MOBIFile(object): if self.kf8_type == 'joint': recs6, d6 = huffit(mh.huffman_record_offset, mh.huffman_record_count) - recs8, d8 = huffit(mh8.huffman_record_offset + kf8i, + recs8, d8 = huffit(mh8.huffman_record_offset, mh8.huffman_record_count) self.huffman_record_nums = recs6 + recs8 else: diff --git a/src/calibre/ebooks/mobi/reader/mobi6.py b/src/calibre/ebooks/mobi/reader/mobi6.py index c8dec607c1..92bdd1d3bf 100644 --- a/src/calibre/ebooks/mobi/reader/mobi6.py +++ b/src/calibre/ebooks/mobi/reader/mobi6.py @@ -118,12 +118,17 @@ class MobiReader(object): try: self.book_header = BookHeader(self.sections[k8i][0], self.ident, user_encoding, self.log) - # The following are only correct in the Mobi 6 - # header not the Mobi 8 header + + # Only the first_image_index from the MOBI 6 header is + # useful for x in ('first_image_index',): setattr(self.book_header, x, getattr(bh, x)) + + # We need to do this because the MOBI 6 text extract code + # does not know anything about the kf8 offset if hasattr(self.book_header, 'huff_offset'): self.book_header.huff_offset += k8i + self.kf8_type = 'joint' self.kf8_boundary = k8i-1 except: From 9712175d74aa11a73b83dbb2f759ebda5472acfe Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 18 Mar 2012 13:23:16 +0530 Subject: [PATCH 16/27] MOBI Input: Fix regression caused by KF8 support that broke reading on ancient PRC files from Baen --- src/calibre/ebooks/mobi/reader/headers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/mobi/reader/headers.py b/src/calibre/ebooks/mobi/reader/headers.py index db2b07e53a..20a31cde50 100644 --- a/src/calibre/ebooks/mobi/reader/headers.py +++ b/src/calibre/ebooks/mobi/reader/headers.py @@ -186,7 +186,7 @@ class BookHeader(object): if len(raw) >= 0xF8: self.ncxidx, = struct.unpack_from(b'>L', raw, 0xF4) - if self.mobi_version 
>= 8: + if self.mobi_version == 8 and len(raw) >= (0xF8 + 16): self.dividx, self.skelidx, self.datpidx, self.othidx = \ struct.unpack_from(b'>4L', raw, 0xF8) From dd20e427b5353c7e51fa9aec31bafb75ba1df80c Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 18 Mar 2012 13:27:00 +0530 Subject: [PATCH 17/27] ... --- src/calibre/ebooks/mobi/reader/headers.py | 2 ++ src/calibre/ebooks/mobi/reader/mobi6.py | 5 ++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/mobi/reader/headers.py b/src/calibre/ebooks/mobi/reader/headers.py index 20a31cde50..06d349d5de 100644 --- a/src/calibre/ebooks/mobi/reader/headers.py +++ b/src/calibre/ebooks/mobi/reader/headers.py @@ -186,6 +186,8 @@ class BookHeader(object): if len(raw) >= 0xF8: self.ncxidx, = struct.unpack_from(b'>L', raw, 0xF4) + # Ancient PRC files from Baen can have random values for + # mobi_version, so be conservative if self.mobi_version == 8 and len(raw) >= (0xF8 + 16): self.dividx, self.skelidx, self.datpidx, self.othidx = \ struct.unpack_from(b'>4L', raw, 0xF8) diff --git a/src/calibre/ebooks/mobi/reader/mobi6.py b/src/calibre/ebooks/mobi/reader/mobi6.py index 92bdd1d3bf..6dd789755d 100644 --- a/src/calibre/ebooks/mobi/reader/mobi6.py +++ b/src/calibre/ebooks/mobi/reader/mobi6.py @@ -107,7 +107,10 @@ class MobiReader(object): self.kf8_type = None k8i = getattr(self.book_header.exth, 'kf8_header', None) - if self.book_header.mobi_version == 8: + # Ancient PRC files from Baen can have random values for + # mobi_version, so be conservative + if (self.book_header.mobi_version == 8 and hasattr(self.book_header, + 'skelidx')): self.kf8_type = 'standalone' elif k8i is not None: # Check for joint mobi 6 and kf 8 file try: From be1e281012c3140b76bd2bc4396afd02b4f1bc94 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 18 Mar 2012 19:11:05 +0530 Subject: [PATCH 18/27] ... 
--- src/calibre/ebooks/mobi/debug/mobi8.py | 62 ++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 src/calibre/ebooks/mobi/debug/mobi8.py diff --git a/src/calibre/ebooks/mobi/debug/mobi8.py b/src/calibre/ebooks/mobi/debug/mobi8.py new file mode 100644 index 0000000000..e4a92ee95c --- /dev/null +++ b/src/calibre/ebooks/mobi/debug/mobi8.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2012, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import sys, os + +from calibre.ebooks.mobi.debug.headers import TextRecord + +class MOBIFile(object): + + def __init__(self, mf): + self.mf = mf + h, h8 = mf.mobi_header, mf.mobi8_header + first_text_record = 1 + offset = 0 + res_end = len(mf.records) + if mf.kf8_type == 'joint': + offset = h.exth.kf8_header_index + res_end = offset - 1 + + self.resource_records = mf.records[h.first_non_book_record:res_end] + self.text_records = [TextRecord(i, r, h8.extra_data_flags, + mf.decompress8) for i, r in + enumerate(mf.records[first_text_record+offset: + first_text_record+offset+h8.number_of_text_records])] + + self.raw_text = b''.join(r.raw for r in self.text_records) + + def print_header(self, f=sys.stdout): + print (str(self.mf.palmdb).encode('utf-8'), file=f) + print (file=f) + print ('Record headers:', file=f) + for i, r in enumerate(self.mf.records): + print ('%6d. 
%s'%(i, r.header), file=f) + + print (file=f) + print (str(self.mf.mobi8_header).encode('utf-8'), file=f) + + +def inspect_mobi(mobi_file, ddir): + f = MOBIFile(mobi_file) + with open(os.path.join(ddir, 'header.txt'), 'wb') as out: + f.print_header(f=out) + + alltext = os.path.join(ddir, 'raw_text.html') + with open(alltext, 'wb') as of: + of.write(f.raw_text) + + for tdir, attr in [('text_records', 'text_records'), ('images', + 'image_records'), ('binary', 'binary_records'), ('font', + 'font_records')]: + tdir = os.path.join(ddir, tdir) + os.mkdir(tdir) + for rec in getattr(f, attr, []): + rec.dump(tdir) + + From 86771a52863167d10caf7f4ffd3d8368f8a9bf3f Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 18 Mar 2012 19:14:54 +0530 Subject: [PATCH 19/27] Fix #958320 (Sony PRS not set to accept XMDF (*.zbf) by default) --- src/calibre/devices/prs505/driver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/devices/prs505/driver.py b/src/calibre/devices/prs505/driver.py index bfce4fa1be..3ba3fcf50f 100644 --- a/src/calibre/devices/prs505/driver.py +++ b/src/calibre/devices/prs505/driver.py @@ -27,7 +27,7 @@ class PRS505(USBMS): booklist_class = CollectionsBookList - FORMATS = ['epub', 'lrf', 'lrx', 'rtf', 'pdf', 'txt'] + FORMATS = ['epub', 'lrf', 'lrx', 'rtf', 'pdf', 'txt', 'zbf'] CAN_SET_METADATA = ['title', 'authors', 'collections'] CAN_DO_DEVICE_DB_PLUGBOARD = True From 34da8d4060e5ae63719493f91b59dbd36d1d78e1 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 18 Mar 2012 19:29:26 +0530 Subject: [PATCH 20/27] Fix #958145 ([Enhancement] add a link to 'adding books' preferences in the 'add books' function) --- src/calibre/gui2/actions/add.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/calibre/gui2/actions/add.py b/src/calibre/gui2/actions/add.py index bb695db841..bbdef5b1b5 100644 --- a/src/calibre/gui2/actions/add.py +++ b/src/calibre/gui2/actions/add.py @@ -70,6 +70,9 @@ class AddAction(InterfaceAction): 
self.add_menu.addSeparator() ma('add-formats', _('Add files to selected book records'), triggered=self.add_formats, shortcut=_('Shift+A')) + self.add_menu.addSeparator() + ma('add-config', _('Configure the adding of books'), + triggered=self.add_config) self.qaction.triggered.connect(self.add_books) @@ -78,6 +81,11 @@ class AddAction(InterfaceAction): for action in list(self.add_menu.actions())[1:]: action.setEnabled(enabled) + def add_config(self): + self.gui.iactions['Preferences'].do_config( + initial_plugin=('Import/Export', 'Adding'), + close_after_initial=True) + def add_formats(self, *args): if self.gui.stack.currentIndex() != 0: return From e66f422d9fb37f2ff8551eef8952ea82b515bd9b Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 18 Mar 2012 21:20:02 +0530 Subject: [PATCH 21/27] Fix #958442 (Device Info for Ectaco Jetbook Color) --- src/calibre/customize/builtins.py | 7 +++---- src/calibre/devices/jetbook/driver.py | 25 +++++++++++++++++++++++++ 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 2908444665..55742b3ee3 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -625,7 +625,8 @@ from calibre.devices.eb600.driver import (EB600, COOL_ER, SHINEBOOK, POCKETBOOK701, POCKETBOOK360P, PI2) from calibre.devices.iliad.driver import ILIAD from calibre.devices.irexdr.driver import IREXDR1000, IREXDR800 -from calibre.devices.jetbook.driver import JETBOOK, MIBUK, JETBOOK_MINI +from calibre.devices.jetbook.driver import (JETBOOK, MIBUK, JETBOOK_MINI, + JETBOOK_COLOR) from calibre.devices.kindle.driver import (KINDLE, KINDLE2, KINDLE_DX, KINDLE_FIRE) from calibre.devices.nook.driver import NOOK, NOOK_COLOR @@ -664,9 +665,7 @@ plugins += [ ILIAD, IREXDR1000, IREXDR800, - JETBOOK, - JETBOOK_MINI, - MIBUK, + JETBOOK, JETBOOK_MINI, MIBUK, JETBOOK_COLOR, SHINEBOOK, POCKETBOOK360, POCKETBOOK301, POCKETBOOK602, POCKETBOOK701, POCKETBOOK360P, 
PI2, diff --git a/src/calibre/devices/jetbook/driver.py b/src/calibre/devices/jetbook/driver.py index 0d328ba637..7f2f48a0b4 100644 --- a/src/calibre/devices/jetbook/driver.py +++ b/src/calibre/devices/jetbook/driver.py @@ -125,4 +125,29 @@ class JETBOOK_MINI(USBMS): SUPPORTS_SUB_DIRS = True +class JETBOOK_COLOR(USBMS): + + ''' +set([(u'0x951', + u'0x160b', + u'0x0', + u'Freescale', + u'Mass Storage Device', + u'0802270905553')]) + ''' + + FORMATS = ['epub', 'mobi', 'prc', 'fb2', 'rtf', 'txt', 'pdf', 'djvu'] + + gui_name = 'JetBook Color' + name = 'JetBook Color Device Interface' + description = _('Communicate with the JetBook Color reader.') + author = 'Kovid Goyal' + + VENDOR_ID = [0x951] + PRODUCT_ID = [0x160b] + BCD = [0x0] + EBOOK_DIR_MAIN = 'My Books' + + SUPPORTS_SUB_DIRS = True + From c1bb1ab5c6446622367afbfa04e2aaea5a7cc9e3 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 20 Mar 2012 09:57:59 +0530 Subject: [PATCH 22/27] E-book viewer: When in full screen mode, using the close window keyboard shortcut drops out of full screen instead of closing the window --- src/calibre/gui2/viewer/main.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/calibre/gui2/viewer/main.py b/src/calibre/gui2/viewer/main.py index 64521ecdd7..a83c5d12c0 100644 --- a/src/calibre/gui2/viewer/main.py +++ b/src/calibre/gui2/viewer/main.py @@ -5,11 +5,11 @@ import traceback, os, sys, functools, collections, re from functools import partial from threading import Thread -from PyQt4.Qt import QApplication, Qt, QIcon, QTimer, SIGNAL, QByteArray, \ - QDoubleSpinBox, QLabel, QTextBrowser, \ - QPainter, QBrush, QColor, QStandardItemModel, QPalette, \ - QStandardItem, QUrl, QRegExpValidator, QRegExp, QLineEdit, \ - QToolButton, QMenu, QInputDialog, QAction, QKeySequence +from PyQt4.Qt import (QApplication, Qt, QIcon, QTimer, SIGNAL, QByteArray, + QDoubleSpinBox, QLabel, QTextBrowser, + QPainter, QBrush, QColor, QStandardItemModel, QPalette, + 
QStandardItem, QUrl, QRegExpValidator, QRegExp, QLineEdit, + QToolButton, QMenu, QInputDialog, QAction, QKeySequence) from calibre.gui2.viewer.main_ui import Ui_EbookViewer from calibre.gui2.viewer.printing import Printing @@ -338,6 +338,10 @@ class EbookViewer(MainWindow, Ui_EbookViewer): count += 1 def closeEvent(self, e): + if self.isFullScreen(): + self.showNormal() + e.ignore() + return self.save_state() return MainWindow.closeEvent(self, e) From 7658a536ccff140c22e66e7b432b26e8bc166271 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 20 Mar 2012 18:34:24 +0530 Subject: [PATCH 23/27] E-book viewer: A whole new full screen mode, with no toolbars to distract from the text and the ability to set the width of the column of text via Preferences in the ebook viewer. Fixes #959830 (Feature request for ebook viewer app) --- src/calibre/gui2/viewer/config.ui | 11 ++- src/calibre/gui2/viewer/documentview.py | 48 +++++++++-- src/calibre/gui2/viewer/main.py | 101 +++++++++++++++++++++--- src/calibre/gui2/viewer/main.ui | 3 + 4 files changed, 139 insertions(+), 24 deletions(-) diff --git a/src/calibre/gui2/viewer/config.ui b/src/calibre/gui2/viewer/config.ui index 3158241f28..f876b87fc3 100644 --- a/src/calibre/gui2/viewer/config.ui +++ b/src/calibre/gui2/viewer/config.ui @@ -255,7 +255,10 @@ - + + + Set the maximum width that the book's text and pictures will take when in fullscreen mode. This allows you to read the book text without it becoming too wide.
+ px @@ -270,10 +273,10 @@ - Maximum &view width: + Maximum text width in &fullscreen: - max_view_width + max_fs_width @@ -350,7 +353,7 @@ serif_family sans_family mono_family - max_view_width + max_fs_width opt_remember_window_size buttonBox diff --git a/src/calibre/gui2/viewer/documentview.py b/src/calibre/gui2/viewer/documentview.py index 2f520c1912..7999458004 100644 --- a/src/calibre/gui2/viewer/documentview.py +++ b/src/calibre/gui2/viewer/documentview.py @@ -12,7 +12,7 @@ from PyQt4.Qt import (QSize, QSizePolicy, QUrl, SIGNAL, Qt, QTimer, QPainter, QPalette, QBrush, QFontDatabase, QDialog, QColor, QPoint, QImage, QRegion, QVariant, QIcon, QFont, pyqtSignature, QAction, QByteArray, QMenu, - pyqtSignal, QSwipeGesture) + pyqtSignal, QSwipeGesture, QApplication) from PyQt4.QtWebKit import QWebPage, QWebView, QWebSettings from calibre.utils.config import Config, StringConfig @@ -46,8 +46,10 @@ def config(defaults=None): help=_('Remember last used window size')) c.add_opt('user_css', default='', help=_('Set the user CSS stylesheet. This can be used to customize the look of all books.')) - c.add_opt('max_view_width', default=6000, - help=_('Maximum width of the viewer window, in pixels.')) + c.add_opt('max_fs_width', default=800, + help=_("Set the maximum width that the book's text and pictures will take" + " when in fullscreen mode. This allows you to read the book text" + " without it becoming too wide.")) c.add_opt('fit_images', default=True, help=_('Resize images larger than the viewer window to fit inside it')) c.add_opt('hyphenate', default=False, help=_('Hyphenate text')) @@ -101,7 +103,7 @@ class ConfigDialog(QDialog, Ui_Dialog): self.standard_font.setCurrentIndex({'serif':0, 'sans':1, 'mono':2}[opts.standard_font]) self.css.setPlainText(opts.user_css) self.css.setToolTip(_('Set the user CSS stylesheet. 
This can be used to customize the look of all books.')) - self.max_view_width.setValue(opts.max_view_width) + self.max_fs_width.setValue(opts.max_fs_width) with zipfile.ZipFile(P('viewer/hyphenate/patterns.zip', allow_user_override=False), 'r') as zf: pats = [x.split('.')[0].replace('-', '_') for x in zf.namelist()] @@ -144,7 +146,7 @@ class ConfigDialog(QDialog, Ui_Dialog): c.set('user_css', unicode(self.css.toPlainText())) c.set('remember_window_size', self.opt_remember_window_size.isChecked()) c.set('fit_images', self.opt_fit_images.isChecked()) - c.set('max_view_width', int(self.max_view_width.value())) + c.set('max_fs_width', int(self.max_fs_width.value())) c.set('hyphenate', self.hyphenate.isChecked()) c.set('remember_current_page', self.opt_remember_current_page.isChecked()) c.set('wheel_flips_pages', self.opt_wheel_flips_pages.isChecked()) @@ -192,6 +194,8 @@ class Document(QWebPage): # {{{ self.loaded_javascript = False self.js_loader = JavaScriptLoader( dynamic_coffeescript=self.debug_javascript) + self.initial_left_margin = self.initial_right_margin = u'' + self.in_fullscreen_mode = False self.setLinkDelegationPolicy(self.DelegateAllLinks) self.scroll_marks = [] @@ -239,6 +243,9 @@ class Document(QWebPage): # {{{ self.enable_page_flip = self.page_flip_duration > 0.1 self.font_magnification_step = opts.font_magnification_step self.wheel_flips_pages = opts.wheel_flips_pages + screen_width = QApplication.desktop().screenGeometry().width() + # Leave some space for the scrollbar and some border + self.max_fs_width = min(opts.max_fs_width, screen_width-50) def fit_images(self): if self.do_fit_images: @@ -274,6 +281,30 @@ class Document(QWebPage): # {{{ self.set_bottom_padding(0) self.fit_images() self.init_hyphenate() + self.initial_left_margin = unicode(self.javascript( + 'document.body.style.marginLeft').toString()) + self.initial_right_margin = unicode(self.javascript( + 'document.body.style.marginRight').toString()) + if self.in_fullscreen_mode: + 
self.switch_to_fullscreen_mode() + + def switch_to_fullscreen_mode(self): + self.in_fullscreen_mode = True + self.javascript(''' + var s = document.body.style; + s.maxWidth = "%dpx"; + s.marginLeft = "auto"; + s.marginRight = "auto"; + '''%self.max_fs_width) + + def switch_to_window_mode(self): + self.in_fullscreen_mode = False + self.javascript(''' + var s = document.body.style; + s.maxWidth = "none"; + s.marginLeft = "%s"; + s.marginRight = "%s"; + '''%(self.initial_left_margin, self.initial_right_margin)) @pyqtSignature("QString") def debug(self, msg): @@ -581,8 +612,8 @@ class DocumentView(QWebView): # {{{ def config(self, parent=None): self.document.do_config(parent) - if self.manager is not None: - self.manager.set_max_width() + if self.document.in_fullscreen_mode: + self.document.switch_to_fullscreen_mode() self.setFocus(Qt.OtherFocusReason) def bookmark(self): @@ -602,6 +633,9 @@ class DocumentView(QWebView): # {{{ menu.insertAction(list(menu.actions())[0], self.search_action) menu.addSeparator() menu.addAction(self.goto_location_action) + if self.document.in_fullscreen_mode and self.manager is not None: + menu.addSeparator() + menu.addAction(self.manager.toggle_toolbar_action) menu.exec_(ev.globalPos()) def lookup(self, *args): diff --git a/src/calibre/gui2/viewer/main.py b/src/calibre/gui2/viewer/main.py index a83c5d12c0..c1cb89aeb6 100644 --- a/src/calibre/gui2/viewer/main.py +++ b/src/calibre/gui2/viewer/main.py @@ -6,10 +6,10 @@ from functools import partial from threading import Thread from PyQt4.Qt import (QApplication, Qt, QIcon, QTimer, SIGNAL, QByteArray, - QDoubleSpinBox, QLabel, QTextBrowser, - QPainter, QBrush, QColor, QStandardItemModel, QPalette, - QStandardItem, QUrl, QRegExpValidator, QRegExp, QLineEdit, - QToolButton, QMenu, QInputDialog, QAction, QKeySequence) + QSize, QDoubleSpinBox, QLabel, QTextBrowser, QPropertyAnimation, + QPainter, QBrush, QColor, QStandardItemModel, QPalette, QStandardItem, + QUrl, QRegExpValidator, QRegExp, 
QLineEdit, QToolButton, QMenu, + QInputDialog, QAction, QKeySequence) from calibre.gui2.viewer.main_ui import Ui_EbookViewer from calibre.gui2.viewer.printing import Printing @@ -55,8 +55,6 @@ class TOC(QStandardItemModel): self.appendRow(TOCItem(t)) self.setHorizontalHeaderItem(0, QStandardItem(_('Table of Contents'))) - - class Worker(Thread): def run(self): @@ -292,6 +290,37 @@ class EbookViewer(MainWindow, Ui_EbookViewer): self.tool_bar2.setContextMenuPolicy(Qt.PreventContextMenu) self.tool_bar.widgetForAction(self.action_bookmark).setPopupMode(QToolButton.MenuButtonPopup) self.action_full_screen.setCheckable(True) + self.full_screen_label = QLabel(''' +
+

%s

+

%s

+

%s

+
+ '''%(_('Full screen mode'), + _('Right click to show controls'), + _('Press Esc to quit')), + self) + self.full_screen_label.setVisible(False) + self.full_screen_label.setStyleSheet(''' + QLabel { + text-align: center; + background-color: white; + color: black; + border-width: 1px; + border-style: solid; + border-radius: 20px; + } + ''') + self.toggle_toolbar_action = QAction(_('Show/hide controls'), self) + self.toggle_toolbar_action.triggered.connect(self.toggle_toolbars) + self.addAction(self.toggle_toolbar_action) + self.full_screen_label_anim = QPropertyAnimation( + self.full_screen_label, 'size') + self.esc_full_screen_action = a = QAction(self) + self.addAction(a) + a.setShortcut(Qt.Key_Escape) + a.setEnabled(False) + a.triggered.connect(self.action_full_screen.trigger) self.print_menu = QMenu() self.print_menu.addAction(QIcon(I('print-preview.png')), _('Print Preview')) @@ -299,7 +328,6 @@ class EbookViewer(MainWindow, Ui_EbookViewer): self.tool_bar.widgetForAction(self.action_print).setPopupMode(QToolButton.MenuButtonPopup) self.connect(self.action_print, SIGNAL("triggered(bool)"), partial(self.print_book, preview=False)) self.connect(self.print_menu.actions()[0], SIGNAL("triggered(bool)"), partial(self.print_book, preview=True)) - self.set_max_width() ca = self.view.copy_action ca.setShortcut(QKeySequence.Copy) self.addAction(ca) @@ -313,6 +341,13 @@ class EbookViewer(MainWindow, Ui_EbookViewer): w = self.tool_bar.widgetForAction(self.action_open_ebook) w.setPopupMode(QToolButton.MenuButtonPopup) + for x in ('tool_bar', 'tool_bar2'): + x = getattr(self, x) + for action in x.actions(): + # So that the keyboard shortcuts for these actions will + # continue to function even when the toolbars are hidden + self.addAction(action) + self.restore_state() def set_toc_visible(self, yes): @@ -339,12 +374,17 @@ class EbookViewer(MainWindow, Ui_EbookViewer): def closeEvent(self, e): if self.isFullScreen(): - self.showNormal() + self.action_full_screen.trigger() 
e.ignore() return self.save_state() return MainWindow.closeEvent(self, e) + def toggle_toolbars(self): + for x in ('tool_bar', 'tool_bar2'): + x = getattr(self, x) + x.setVisible(not x.isVisible()) + def save_state(self): state = bytearray(self.saveState(self.STATE_VERSION)) vprefs['viewer_toolbar_state'] = state @@ -386,11 +426,6 @@ class EbookViewer(MainWindow, Ui_EbookViewer): self._lookup = None self.dictionary_view.setHtml(html) - def set_max_width(self): - from calibre.gui2.viewer.documentview import config - c = config().parse() - self.frame.setMaximumWidth(c.max_view_width) - def get_remember_current_page_opt(self): from calibre.gui2.viewer.documentview import config c = config().parse() @@ -405,6 +440,46 @@ class EbookViewer(MainWindow, Ui_EbookViewer): else: self.showFullScreen() + def showFullScreen(self): + self.tool_bar.setVisible(False) + self.tool_bar2.setVisible(False) + self._original_frame_margins = ( + self.centralwidget.layout().contentsMargins(), + self.frame.layout().contentsMargins()) + self.frame.layout().setContentsMargins(0, 0, 0, 0) + self.centralwidget.layout().setContentsMargins(0, 0, 0, 0) + + super(EbookViewer, self).showFullScreen() + QTimer.singleShot(10, self.show_full_screen_label) + + def show_full_screen_label(self): + f = self.full_screen_label + self.esc_full_screen_action.setEnabled(True) + f.setVisible(True) + height = 200 + width = int(0.7*self.view.width()) + f.resize(width, height) + f.move((self.view.width() - width)//2, (self.view.height()-height)//2) + a = self.full_screen_label_anim + a.setDuration(500) + a.setStartValue(QSize(width, 0)) + a.setEndValue(QSize(width, height)) + a.start() + QTimer.singleShot(2750, self.full_screen_label.hide) + self.view.document.switch_to_fullscreen_mode() + + def showNormal(self): + self.esc_full_screen_action.setEnabled(False) + self.tool_bar.setVisible(True) + self.tool_bar2.setVisible(True) + self.full_screen_label.setVisible(False) + if hasattr(self, '_original_frame_margins'): + 
om = self._original_frame_margins + self.centralwidget.layout().setContentsMargins(om[0]) + self.frame.layout().setContentsMargins(om[1]) + super(EbookViewer, self).showNormal() + self.view.document.switch_to_window_mode() + def goto(self, ref): if ref: tokens = ref.split('.') diff --git a/src/calibre/gui2/viewer/main.ui b/src/calibre/gui2/viewer/main.ui index 3137ad2e07..659a534fa8 100644 --- a/src/calibre/gui2/viewer/main.ui +++ b/src/calibre/gui2/viewer/main.ui @@ -284,6 +284,9 @@ Toggle full screen + + Toggle full screen (F11) + From 9915d4b9636ec094f9b501d94ec55b4986df9c7c Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 20 Mar 2012 18:49:00 +0530 Subject: [PATCH 24/27] Fix Le Monde --- recipes/le_monde.recipe | 79 +++++------------------------------------ 1 file changed, 8 insertions(+), 71 deletions(-) diff --git a/recipes/le_monde.recipe b/recipes/le_monde.recipe index 8fcdf9c870..6c7f15cca7 100644 --- a/recipes/le_monde.recipe +++ b/recipes/le_monde.recipe @@ -3,7 +3,6 @@ __copyright__ = '2011' ''' lemonde.fr ''' -import re from calibre.web.feeds.recipes import BasicNewsRecipe class LeMonde(BasicNewsRecipe): @@ -41,77 +40,8 @@ class LeMonde(BasicNewsRecipe): remove_empty_feeds = True - filterDuplicates = True + auto_cleanup = True - def preprocess_html(self, soup): - for alink in soup.findAll('a'): - if alink.string is not None: - tstr = alink.string - alink.replaceWith(tstr) - return self.adeify_images(soup) - - preprocess_regexps = [ - (re.compile(r'([0-9])%'), lambda m: m.group(1) + ' %'), - (re.compile(r'([0-9])([0-9])([0-9]) ([0-9])([0-9])([0-9])'), lambda m: m.group(1) + m.group(2) + m.group(3) + ' ' + m.group(4) + m.group(5) + m.group(6)), - (re.compile(r'([0-9]) ([0-9])([0-9])([0-9])'), lambda m: m.group(1) + ' ' + m.group(2) + m.group(3) + m.group(4)), - (re.compile(r''), lambda match: ' '), - (re.compile(r'\("'), lambda match: '(« '), - (re.compile(r'"\)'), lambda match: ' »)'), - (re.compile(r'“'), lambda match: '(« '), - (re.compile(r'”'), 
lambda match: ' »)'), - (re.compile(r'>\''), lambda match: '>‘'), - (re.compile(r' \''), lambda match: ' ‘'), - (re.compile(r'\''), lambda match: '’'), - (re.compile(r'"'), lambda match: '« '), - (re.compile(r'""'), lambda match: '« '), - (re.compile(r'""'), lambda match: ' »'), - (re.compile(r'"'), lambda match: ' »'), - (re.compile(r'""'), lambda match: '>« '), - (re.compile(r'"<'), lambda match: ' »<'), - (re.compile(r'’"'), lambda match: '’« '), - (re.compile(r' "'), lambda match: ' « '), - (re.compile(r'" '), lambda match: ' » '), - (re.compile(r'"\.'), lambda match: ' ».'), - (re.compile(r'",'), lambda match: ' »,'), - (re.compile(r'"\?'), lambda match: ' »?'), - (re.compile(r'":'), lambda match: ' »:'), - (re.compile(r'";'), lambda match: ' »;'), - (re.compile(r'"\!'), lambda match: ' »!'), - (re.compile(r' :'), lambda match: ' :'), - (re.compile(r' ;'), lambda match: ' ;'), - (re.compile(r' \?'), lambda match: ' ?'), - (re.compile(r' \!'), lambda match: ' !'), - (re.compile(r'\s»'), lambda match: ' »'), - (re.compile(r'«\s'), lambda match: '« '), - (re.compile(r' %'), lambda match: ' %'), - (re.compile(r'\.jpg » border='), lambda match: '.jpg'), - (re.compile(r'\.png » border='), lambda match: '.png'), - (re.compile(r' – '), lambda match: ' – '), - (re.compile(r' – '), lambda match: ' – '), - (re.compile(r' - '), lambda match: ' – '), - (re.compile(r' -,'), lambda match: ' –,'), - (re.compile(r'»:'), lambda match: '» :'), - ] - - - keep_only_tags = [ - dict(name='div', attrs={'class':['contenu']}) - ] - remove_tags = [dict(name='div', attrs={'class':['LM_atome']})] - remove_tags_after = [dict(id='appel_temoignage')] - - def get_article_url(self, article): - url = article.get('guid', None) - if '/chat/' in url or '.blog' in url or '/video/' in url or '/sport/' in url or '/portfolio/' in url or '/visuel/' in url : - url = None - return url - -# def get_article_url(self, article): -# link = article.get('link') -# if 'blog' not in link and ('chat' not in link): 
-# return link feeds = [ ('A la une', 'http://www.lemonde.fr/rss/une.xml'), @@ -137,3 +67,10 @@ class LeMonde(BasicNewsRecipe): return cover_url + def get_article_url(self, article): + url = article.get('guid', None) + if '/chat/' in url or '.blog' in url or '/video/' in url or '/sport/' in url or '/portfolio/' in url or '/visuel/' in url : + url = None + return url + + From 6b12bc2e560458daa07c4ef2ab30b60f6e9fa280 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 20 Mar 2012 18:51:53 +0530 Subject: [PATCH 25/27] Fix calibre not supporting different http and https proxies. Fixes #960173 (Proxy error with multiple "*_proxy" environment variables under GNU/Linux) --- src/calibre/__init__.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/calibre/__init__.py b/src/calibre/__init__.py index ea5e4858ca..2a2242a68f 100644 --- a/src/calibre/__init__.py +++ b/src/calibre/__init__.py @@ -381,12 +381,15 @@ def browser(honor_time=True, max_time=2, mobile_browser=False, user_agent=None): user_agent = USER_AGENT_MOBILE if mobile_browser else USER_AGENT opener.addheaders = [('User-agent', user_agent)] proxies = get_proxies() + to_add = {} http_proxy = proxies.get('http', None) if http_proxy: - opener.set_proxies({'http':http_proxy}) + to_add['http'] = http_proxy https_proxy = proxies.get('https', None) if https_proxy: - opener.set_proxies({'https':https_proxy}) + to_add['https'] = https_proxy + if to_add: + opener.set_proxies(to_add) return opener From 3656de9ea2309977894b8fa04c31e00105145ca6 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 20 Mar 2012 19:47:45 +0530 Subject: [PATCH 26/27] Klub knjige by Darko Miletic. 
Fixes #960197 (New recipe for blog klub knjige) --- recipes/klubknjige.recipe | 42 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 recipes/klubknjige.recipe diff --git a/recipes/klubknjige.recipe b/recipes/klubknjige.recipe new file mode 100644 index 0000000000..dd16c0b3b9 --- /dev/null +++ b/recipes/klubknjige.recipe @@ -0,0 +1,42 @@ + +__license__ = 'GPL v3' +__copyright__ = '2012, Darko Miletic ' +''' +klub-knjige.blogspot.com +''' + +import re +from calibre.web.feeds.news import BasicNewsRecipe + +class KlubKnjige(BasicNewsRecipe): + title = 'Klub knjige' + __author__ = 'Darko Miletic' + description = 'literarni blog' + oldest_article = 30 + max_articles_per_feed = 100 + language = 'sr' + encoding = 'utf-8' + no_stylesheets = True + use_embedded_content = True + publication_type = 'blog' + extra_css = """ + @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} + body{font-family: Arial,Tahoma,Helvetica,FreeSans,sans1,sans-serif} + img{margin-bottom: 0.8em; border: 1px solid #333333; padding: 4px } + """ + + conversion_options = { + 'comment' : description + , 'tags' : 'knjige, blog, srbija, sf' + , 'publisher': 'Klub Knjige' + , 'language' : language + } + + preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] + + feeds = [(u'Posts', u'http://klub-knjige.blogspot.com/feeds/posts/default')] + + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + return self.adeify_images(soup) From 931d46cd84761f55372adbe4ca04a1ffe2d6b73a Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 20 Mar 2012 20:47:51 +0530 Subject: [PATCH 27/27] Ivana Milakovic by Darko Miletic. 
Fixes #960279 (New recipe for blog Ivana Milakovic) --- recipes/ivanamilakovic.recipe | 43 +++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 recipes/ivanamilakovic.recipe diff --git a/recipes/ivanamilakovic.recipe b/recipes/ivanamilakovic.recipe new file mode 100644 index 0000000000..34e00a7ed8 --- /dev/null +++ b/recipes/ivanamilakovic.recipe @@ -0,0 +1,43 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai + +__license__ = 'GPL v3' +__copyright__ = '2012, Darko Miletic ' +''' +ivanamilakovic.blogspot.com +''' + +import re +from calibre.web.feeds.news import BasicNewsRecipe + +class IvanaMilakovic(BasicNewsRecipe): + title = u'Ivana Milaković' + __author__ = 'Darko Miletic' + description = u'Hronika mačijeg škrabala - priče, inspiracija, knjige, pisanje, prevodi...' + oldest_article = 80 + max_articles_per_feed = 100 + language = 'sr' + encoding = 'utf-8' + no_stylesheets = True + use_embedded_content = True + publication_type = 'blog' + extra_css = """ + @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} + body{font-family: Arial,Tahoma,Helvetica,FreeSans,sans1,sans-serif} + img{margin-bottom: 0.8em; border: 1px solid #333333; padding: 4px } + """ + + conversion_options = { + 'comment' : description + , 'tags' : 'knjige, blog, srbija, sf' + , 'publisher': 'Ivana Milakovic' + , 'language' : language + } + + preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] + + feeds = [(u'Posts', u'http://ivanamilakovic.blogspot.com/feeds/posts/default')] + + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + return self.adeify_images(soup)