diff --git a/recipes/fhm_uk.recipe b/recipes/fhm_uk.recipe
index ab271ad753..0e2d5c1ebe 100644
--- a/recipes/fhm_uk.recipe
+++ b/recipes/fhm_uk.recipe
@@ -3,10 +3,11 @@ from calibre.web.feeds.news import BasicNewsRecipe
 class AdvancedUserRecipe1325006965(BasicNewsRecipe):
     title = u'FHM UK'
     description = 'Good News for Men'
-    cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/373529_38324934806_64930243_n.jpg'
+    cover_url = 'http://www.greatmagazines.co.uk/covers/large/w197/current/fhm.jpg'
+    # cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/373529_38324934806_64930243_n.jpg'
     masthead_url = 'http://www.fhm.com/App_Resources/Images/Site/re-design/logo.gif'
     __author__ = 'Dave Asbury'
-    # last updated 27/1/12
+    # last updated 17/3/12
     language = 'en_GB'
     oldest_article = 28
     max_articles_per_feed = 12
@@ -29,6 +30,8 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
     feeds = [
         (u'From the Homepage',u'http://feed43.com/8053226782885416.xml'),
         (u'Funny - The Very Best Of The Internet',u'http://feed43.com/4538510106331565.xml'),
-        (u'The Final Countdown', u'http://feed43.com/3576106158530118.xml'),
-        (u'Gaming',u'http://feed43.com/0755006465351035.xml'),
-    ]
+        (u'Upgrade',u'http://feed43.com/0877305847443234.xml'),
+        #(u'The Final Countdown', u'http://feed43.com/3576106158530118.xml'),
+        #(u'Gaming',u'http://feed43.com/0755006465351035.xml'),
+        (u'Gaming',u'http://feed43.com/6537162612465672.xml'),
+    ]
diff --git a/recipes/ivanamilakovic.recipe b/recipes/ivanamilakovic.recipe
new file mode 100644
index 0000000000..34e00a7ed8
--- /dev/null
+++ b/recipes/ivanamilakovic.recipe
@@ -0,0 +1,43 @@
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+
+__license__ = 'GPL v3'
+__copyright__ = '2012, Darko Miletic '
+'''
+ivanamilakovic.blogspot.com
+'''
+
+import re
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class IvanaMilakovic(BasicNewsRecipe):
+    title = u'Ivana Milaković'
+    __author__ = 'Darko Miletic'
+    description = u'Hronika mačijeg škrabala - priče, inspiracija, knjige, pisanje, prevodi...'
+    oldest_article = 80
+    max_articles_per_feed = 100
+    language = 'sr'
+    encoding = 'utf-8'
+    no_stylesheets = True
+    use_embedded_content = True
+    publication_type = 'blog'
+    extra_css = """
+        @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}
+        body{font-family: Arial,Tahoma,Helvetica,FreeSans,sans1,sans-serif}
+        img{margin-bottom: 0.8em; border: 1px solid #333333; padding: 4px }
+    """
+
+    conversion_options = {
+        'comment'  : description
+      , 'tags'     : 'knjige, blog, srbija, sf'
+      , 'publisher': 'Ivana Milakovic'
+      , 'language' : language
+    }
+
+    preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
+
+    feeds = [(u'Posts', u'http://ivanamilakovic.blogspot.com/feeds/posts/default')]
+
+    def preprocess_html(self, soup):
+        for item in soup.findAll(style=True):
+            del item['style']
+        return self.adeify_images(soup)
diff --git a/recipes/klubknjige.recipe b/recipes/klubknjige.recipe
new file mode 100644
index 0000000000..dd16c0b3b9
--- /dev/null
+++ b/recipes/klubknjige.recipe
@@ -0,0 +1,42 @@
+
+__license__ = 'GPL v3'
+__copyright__ = '2012, Darko Miletic '
+'''
+klub-knjige.blogspot.com
+'''
+
+import re
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class KlubKnjige(BasicNewsRecipe):
+    title = 'Klub knjige'
+    __author__ = 'Darko Miletic'
+    description = 'literarni blog'
+    oldest_article = 30
+    max_articles_per_feed = 100
+    language = 'sr'
+    encoding = 'utf-8'
+    no_stylesheets = True
+    use_embedded_content = True
+    publication_type = 'blog'
+    extra_css = """
+        @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}
+        body{font-family: Arial,Tahoma,Helvetica,FreeSans,sans1,sans-serif}
+        img{margin-bottom: 0.8em; border: 1px solid #333333; padding: 4px }
+    """
+
+    conversion_options = {
+        'comment'  : description
+      , 'tags'     : 'knjige, blog, srbija, sf'
+      , 'publisher': 'Klub Knjige'
+      , 'language' : language
+    }
+
+    preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
+
+    feeds = [(u'Posts', u'http://klub-knjige.blogspot.com/feeds/posts/default')]
+
+    def preprocess_html(self, soup):
+        for item in soup.findAll(style=True):
+            del item['style']
+        return self.adeify_images(soup)
diff --git a/recipes/le_monde.recipe b/recipes/le_monde.recipe
index 8fcdf9c870..6c7f15cca7 100644
--- a/recipes/le_monde.recipe
+++ b/recipes/le_monde.recipe
@@ -3,7 +3,6 @@ __copyright__ = '2011'
 '''
 lemonde.fr
 '''
-import re
 from calibre.web.feeds.recipes import BasicNewsRecipe

 class LeMonde(BasicNewsRecipe):
@@ -41,77 +40,8 @@ class LeMonde(BasicNewsRecipe):

     remove_empty_feeds = True

-    filterDuplicates = True
+    auto_cleanup = True

-    def preprocess_html(self, soup):
-        for alink in soup.findAll('a'):
-            if alink.string is not None:
-                tstr = alink.string
-                alink.replaceWith(tstr)
-        return self.adeify_images(soup)
-
-    preprocess_regexps = [
-        (re.compile(r'([0-9])%'), lambda m: m.group(1) + ' %'),
-        (re.compile(r'([0-9])([0-9])([0-9]) ([0-9])([0-9])([0-9])'), lambda m: m.group(1) + m.group(2) + m.group(3) + ' ' + m.group(4) + m.group(5) + m.group(6)),
-        (re.compile(r'([0-9]) ([0-9])([0-9])([0-9])'), lambda m: m.group(1) + ' ' + m.group(2) + m.group(3) + m.group(4)),
-        (re.compile(r''), lambda match: ' '),
-        (re.compile(r'\("'), lambda match: '(« '),
-        (re.compile(r'"\)'), lambda match: ' »)'),
-        (re.compile(r'“'), lambda match: '(« '),
-        (re.compile(r'”'), lambda match: ' »)'),
-        (re.compile(r'>\''), lambda match: '>‘'),
-        (re.compile(r' \''), lambda match: ' ‘'),
-        (re.compile(r'\''), lambda match: '’'),
-        (re.compile(r'"'), lambda match: '« '),
-        (re.compile(r'""'), lambda match: '« '),
-        (re.compile(r'""'), lambda match: ' »'),
-        (re.compile(r'"'), lambda match: ' »'),
-        (re.compile(r'""'), lambda match: '>« '),
-        (re.compile(r'"<'), lambda match: ' »<'),
-        (re.compile(r'’"'), lambda match: '’« '),
-        (re.compile(r' "'), lambda match: ' « '),
-        (re.compile(r'" '), lambda match: ' » '),
-        (re.compile(r'"\.'), lambda match: ' ».'),
-        (re.compile(r'",'), lambda match: ' »,'),
-        (re.compile(r'"\?'), lambda match: ' »?'),
-        (re.compile(r'":'), lambda match: ' »:'),
-        (re.compile(r'";'), lambda match: ' »;'),
-        (re.compile(r'"\!'), lambda match: ' »!'),
-        (re.compile(r' :'), lambda match: ' :'),
-        (re.compile(r' ;'), lambda match: ' ;'),
-        (re.compile(r' \?'), lambda match: ' ?'),
-        (re.compile(r' \!'), lambda match: ' !'),
-        (re.compile(r'\s»'), lambda match: ' »'),
-        (re.compile(r'«\s'), lambda match: '« '),
-        (re.compile(r' %'), lambda match: ' %'),
-        (re.compile(r'\.jpg » border='), lambda match: '.jpg'),
-        (re.compile(r'\.png » border='), lambda match: '.png'),
-        (re.compile(r' – '), lambda match: ' – '),
-        (re.compile(r' – '), lambda match: ' – '),
-        (re.compile(r' - '), lambda match: ' – '),
-        (re.compile(r' -,'), lambda match: ' –,'),
-        (re.compile(r'»:'), lambda match: '» :'),
-    ]
-
-
-    keep_only_tags = [
-        dict(name='div', attrs={'class':['contenu']})
-    ]
-    remove_tags = [dict(name='div', attrs={'class':['LM_atome']})]
-    remove_tags_after = [dict(id='appel_temoignage')]
-
-    def get_article_url(self, article):
-        url = article.get('guid', None)
-        if '/chat/' in url or '.blog' in url or '/video/' in url or '/sport/' in url or '/portfolio/' in url or '/visuel/' in url :
-            url = None
-        return url
-
-#    def get_article_url(self, article):
-#        link = article.get('link')
-#        if 'blog' not in link and ('chat' not in link):
-#            return link

     feeds = [
         ('A la une', 'http://www.lemonde.fr/rss/une.xml'),
@@ -137,3 +67,10 @@ class LeMonde(BasicNewsRecipe):

         return cover_url

+    def get_article_url(self, article):
+        url = article.get('guid', None)
+        if '/chat/' in url or '.blog' in url or '/video/' in url or '/sport/' in url or '/portfolio/' in url or '/visuel/' in url :
+            url = None
+        return url
+
+
diff --git a/setup/installer/windows/freeze.py b/setup/installer/windows/freeze.py
index 69e669566d..3e251d2dcf 100644
--- a/setup/installer/windows/freeze.py
+++ b/setup/installer/windows/freeze.py
@@ -14,7 +14,7 @@ from setup.build_environment import msvc, MT, RC
 from setup.installer.windows.wix import WixMixIn

 OPENSSL_DIR = r'Q:\openssl'
-QT_DIR = 'Q:\\Qt\\4.7.3'
+QT_DIR = 'Q:\\Qt\\4.8.0'
 QT_DLLS = ['Core', 'Gui', 'Network', 'Svg', 'WebKit', 'Xml', 'XmlPatterns']
 LIBUNRAR = 'C:\\Program Files\\UnrarDLL\\unrar.dll'
 SW = r'C:\cygwin\home\kovid\sw'
diff --git a/setup/installer/windows/notes.rst b/setup/installer/windows/notes.rst
index d063a19249..36acacdb55 100644
--- a/setup/installer/windows/notes.rst
+++ b/setup/installer/windows/notes.rst
@@ -97,7 +97,9 @@ Now, run configure and make::

 -no-plugin-manifests is needed so that loading the plugins does not fail looking for the CRT assembly

-    configure -opensource -release -qt-zlib -qt-gif -qt-libmng -qt-libpng -qt-libtiff -qt-libjpeg -release -platform win32-msvc2008 -no-qt3support -webkit -xmlpatterns -no-phonon -no-style-plastique -no-style-cleanlooks -no-style-motif -no-style-cde -no-declarative -no-scripttools -no-audio-backend -no-multimedia -no-dbus -no-openvg -no-opengl -no-qt3support -confirm-license -nomake examples -nomake demos -nomake docs -no-plugin-manifests -openssl -I Q:\openssl\include -L Q:\openssl\lib && nmake
+    configure -opensource -release -qt-zlib -qt-libmng -qt-libpng -qt-libtiff -qt-libjpeg -release -platform win32-msvc2008 -no-qt3support -webkit -xmlpatterns -no-phonon -no-style-plastique -no-style-cleanlooks -no-style-motif -no-style-cde -no-declarative -no-scripttools -no-audio-backend -no-multimedia -no-dbus -no-openvg -no-opengl -no-qt3support -confirm-license -nomake examples -nomake demos -nomake docs -no-plugin-manifests -openssl -I Q:\openssl\include -L Q:\openssl\lib && nmake
+
+Add the path to the bin folder inside the Qt dir to your system PATH.

 SIP
 -----
diff --git a/src/calibre/__init__.py b/src/calibre/__init__.py
index ea5e4858ca..2a2242a68f 100644
--- a/src/calibre/__init__.py
+++ b/src/calibre/__init__.py
@@ -381,12 +381,15 @@ def browser(honor_time=True, max_time=2, mobile_browser=False, user_agent=None):
         user_agent = USER_AGENT_MOBILE if mobile_browser else USER_AGENT
     opener.addheaders = [('User-agent', user_agent)]
     proxies = get_proxies()
+    to_add = {}
     http_proxy = proxies.get('http', None)
     if http_proxy:
-        opener.set_proxies({'http':http_proxy})
+        to_add['http'] = http_proxy
     https_proxy = proxies.get('https', None)
     if https_proxy:
-        opener.set_proxies({'https':https_proxy})
+        to_add['https'] = https_proxy
+    if to_add:
+        opener.set_proxies(to_add)
     return opener
diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py
index 2908444665..55742b3ee3 100644
--- a/src/calibre/customize/builtins.py
+++ b/src/calibre/customize/builtins.py
@@ -625,7 +625,8 @@ from calibre.devices.eb600.driver import (EB600, COOL_ER, SHINEBOOK,
         POCKETBOOK701, POCKETBOOK360P, PI2)
 from calibre.devices.iliad.driver import ILIAD
 from calibre.devices.irexdr.driver import IREXDR1000, IREXDR800
-from calibre.devices.jetbook.driver import JETBOOK, MIBUK, JETBOOK_MINI
+from calibre.devices.jetbook.driver import (JETBOOK, MIBUK, JETBOOK_MINI,
+        JETBOOK_COLOR)
 from calibre.devices.kindle.driver import (KINDLE, KINDLE2, KINDLE_DX,
         KINDLE_FIRE)
 from calibre.devices.nook.driver import NOOK, NOOK_COLOR
@@ -664,9 +665,7 @@ plugins += [
         ILIAD,
         IREXDR1000,
         IREXDR800,
-        JETBOOK,
-        JETBOOK_MINI,
-        MIBUK,
+        JETBOOK, JETBOOK_MINI, MIBUK, JETBOOK_COLOR,
         SHINEBOOK,
         POCKETBOOK360, POCKETBOOK301, POCKETBOOK602,
         POCKETBOOK701, POCKETBOOK360P, PI2,
diff --git a/src/calibre/debug.py b/src/calibre/debug.py
index 13cccd3e01..f5f803ec84 100644
--- a/src/calibre/debug.py
+++ b/src/calibre/debug.py
@@ -234,7 +234,7 @@ def main(args=sys.argv):
             sql_dump = args[-1]
         reinit_db(opts.reinitialize_db, sql_dump=sql_dump)
     elif opts.inspect_mobi:
-        from calibre.ebooks.mobi.debug import inspect_mobi
+        from calibre.ebooks.mobi.debug.main import inspect_mobi
         for path in args[1:]:
             prints('Inspecting:', path)
             inspect_mobi(path)
diff --git a/src/calibre/devices/jetbook/driver.py b/src/calibre/devices/jetbook/driver.py
index 0d328ba637..7f2f48a0b4 100644
--- a/src/calibre/devices/jetbook/driver.py
+++ b/src/calibre/devices/jetbook/driver.py
@@ -125,4 +125,29 @@ class JETBOOK_MINI(USBMS):

     SUPPORTS_SUB_DIRS = True

+class JETBOOK_COLOR(USBMS):
+
+    '''
+set([(u'0x951',
+      u'0x160b',
+      u'0x0',
+      u'Freescale',
+      u'Mass Storage Device',
+      u'0802270905553')])
+    '''
+
+    FORMATS = ['epub', 'mobi', 'prc', 'fb2', 'rtf', 'txt', 'pdf', 'djvu']
+
+    gui_name = 'JetBook Color'
+    name = 'JetBook Color Device Interface'
+    description = _('Communicate with the JetBook Color reader.')
+    author = 'Kovid Goyal'
+
+    VENDOR_ID = [0x951]
+    PRODUCT_ID = [0x160b]
+    BCD = [0x0]
+    EBOOK_DIR_MAIN = 'My Books'
+
+    SUPPORTS_SUB_DIRS = True
+
diff --git a/src/calibre/devices/prs505/driver.py b/src/calibre/devices/prs505/driver.py
index bfce4fa1be..3ba3fcf50f 100644
--- a/src/calibre/devices/prs505/driver.py
+++ b/src/calibre/devices/prs505/driver.py
@@ -27,7 +27,7 @@ class PRS505(USBMS):

     booklist_class = CollectionsBookList

-    FORMATS = ['epub', 'lrf', 'lrx', 'rtf', 'pdf', 'txt']
+    FORMATS = ['epub', 'lrf', 'lrx', 'rtf', 'pdf', 'txt', 'zbf']
     CAN_SET_METADATA = ['title', 'authors', 'collections']
     CAN_DO_DEVICE_DB_PLUGBOARD = True
diff --git a/src/calibre/ebooks/conversion/plugins/epub_output.py b/src/calibre/ebooks/conversion/plugins/epub_output.py
index 89cf987bb1..45df8ba9d1 100644
--- a/src/calibre/ebooks/conversion/plugins/epub_output.py
+++ b/src/calibre/ebooks/conversion/plugins/epub_output.py
@@ -190,12 +190,22 @@ class EPUBOutput(OutputFormatPlugin):
                 if x.get(OPF('scheme'), None).lower() == 'uuid' or unicode(x).startswith('urn:uuid:'):
                     uuid = unicode(x).split(':')[-1]
                     break
+        encrypted_fonts = getattr(input_plugin, 'encrypted_fonts', [])
+
         if uuid is None:
             self.log.warn('No UUID identifier found')
             from uuid import uuid4
             uuid = str(uuid4())
             oeb.metadata.add('identifier', uuid, scheme='uuid', id=uuid)

+        if encrypted_fonts and not uuid.startswith('urn:uuid:'):
+            # Apparently ADE requires this value to start with urn:uuid:
+            # for some absurd reason, or it will throw a hissy fit and refuse
+            # to use the obfuscated fonts.
+            for x in identifiers:
+                if unicode(x) == uuid:
+                    x.content = 'urn:uuid:'+uuid
+
         with TemporaryDirectory(u'_epub_output') as tdir:
             from calibre.customize.ui import plugin_for_output_format
             metadata_xml = None
@@ -210,7 +220,6 @@ class EPUBOutput(OutputFormatPlugin):
             opf = [x for x in os.listdir(tdir) if x.endswith('.opf')][0]
             self.condense_ncx([os.path.join(tdir, x) for x in os.listdir(tdir)\
                     if x.endswith('.ncx')][0])
-            encrypted_fonts = getattr(input_plugin, 'encrypted_fonts', [])
             encryption = None
             if encrypted_fonts:
                 encryption = self.encrypt_fonts(encrypted_fonts, tdir, uuid)
diff --git a/src/calibre/ebooks/conversion/plugins/mobi_input.py b/src/calibre/ebooks/conversion/plugins/mobi_input.py
index 9d71b69891..49a57cbde1 100644
--- a/src/calibre/ebooks/conversion/plugins/mobi_input.py
+++ b/src/calibre/ebooks/conversion/plugins/mobi_input.py
@@ -59,7 +59,10 @@ class MOBIInput(InputFormatPlugin):
             if mr.kf8_type is not None:
                 log('Found KF8 MOBI of type %r'%mr.kf8_type)
                 from calibre.ebooks.mobi.reader.mobi8 import Mobi8Reader
-                return os.path.abspath(Mobi8Reader(mr, log)())
+                mr = Mobi8Reader(mr, log)
+                opf = os.path.abspath(mr())
+                self.encrypted_fonts = mr.encrypted_fonts
+                return opf
             raw = parse_cache.pop('calibre_raw_mobi_markup', False)
             if raw:
diff --git a/src/calibre/ebooks/conversion/plugins/mobi_output.py b/src/calibre/ebooks/conversion/plugins/mobi_output.py
index 7288f095d7..06580be1ba 100644
--- a/src/calibre/ebooks/conversion/plugins/mobi_output.py
+++ b/src/calibre/ebooks/conversion/plugins/mobi_output.py
@@ -179,7 +179,7 @@ class MOBIOutput(OutputFormatPlugin):
             writer(oeb, output_path)

         if opts.extract_to is not None:
-            from calibre.ebooks.mobi.debug import inspect_mobi
+            from calibre.ebooks.mobi.debug.main import inspect_mobi
             ddir = opts.extract_to
             inspect_mobi(output_path, ddir=ddir)
diff --git a/src/calibre/ebooks/mobi/debug.py b/src/calibre/ebooks/mobi/debug.py
deleted file mode 100644
index b12c9d2121..0000000000
--- a/src/calibre/ebooks/mobi/debug.py
+++ /dev/null
@@ -1,1491 +0,0 @@
-#!/usr/bin/env python
-# 
vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai -from __future__ import (unicode_literals, division, absolute_import, - print_function) - -__license__ = 'GPL v3' -__copyright__ = '2011, Kovid Goyal ' -__docformat__ = 'restructuredtext en' - -import struct, datetime, sys, os, shutil -from collections import OrderedDict, defaultdict - -from lxml import html - -from calibre.utils.date import utc_tz -from calibre.ebooks.mobi.langcodes import main_language, sub_language -from calibre.ebooks.mobi.reader.headers import NULL_INDEX -from calibre.ebooks.mobi.utils import (decode_hex_number, decint, - get_trailing_data, decode_tbs, read_font_record) -from calibre.utils.magick.draw import identify_data - -def format_bytes(byts): - byts = bytearray(byts) - byts = [hex(b)[2:] for b in byts] - return ' '.join(byts) - -# PalmDB {{{ -class PalmDOCAttributes(object): - - class Attr(object): - - def __init__(self, name, field, val): - self.name = name - self.val = val & field - - def __str__(self): - return '%s: %s'%(self.name, bool(self.val)) - - def __init__(self, raw): - self.val = struct.unpack(b'H', self.raw[34:36])[0] - - palm_epoch = datetime.datetime(1904, 1, 1, tzinfo=utc_tz) - self.creation_date_raw = struct.unpack(b'>I', self.raw[36:40])[0] - self.creation_date = (palm_epoch + - datetime.timedelta(seconds=self.creation_date_raw)) - self.modification_date_raw = struct.unpack(b'>I', self.raw[40:44])[0] - self.modification_date = (palm_epoch + - datetime.timedelta(seconds=self.modification_date_raw)) - self.last_backup_date_raw = struct.unpack(b'>I', self.raw[44:48])[0] - self.last_backup_date = (palm_epoch + - datetime.timedelta(seconds=self.last_backup_date_raw)) - self.modification_number = struct.unpack(b'>I', self.raw[48:52])[0] - self.app_info_id = self.raw[52:56] - self.sort_info_id = self.raw[56:60] - self.type = self.raw[60:64] - self.creator = self.raw[64:68] - self.ident = self.type + self.creator - if self.ident not in (b'BOOKMOBI', b'TEXTREAD'): - raise ValueError('Unknown book ident: %r'%self.ident) - self.last_record_uid, = struct.unpack(b'>I', self.raw[68:72]) - self.next_rec_list_id = self.raw[72:76] - - self.number_of_records, = struct.unpack(b'>H', self.raw[76:78]) - - def __str__(self): - ans = ['*'*20 + ' PalmDB Header '+ '*'*20] - ans.append('Name: %r'%self.name) - ans.append(str(self.attributes)) - ans.append('Version: %s'%self.version) - ans.append('Creation date: %s (%s)'%(self.creation_date.isoformat(), - self.creation_date_raw)) - ans.append('Modification date: %s (%s)'%(self.modification_date.isoformat(), - self.modification_date_raw)) - ans.append('Backup date: %s (%s)'%(self.last_backup_date.isoformat(), - self.last_backup_date_raw)) - ans.append('Modification number: %s'%self.modification_number) - ans.append('App Info ID: %r'%self.app_info_id) - ans.append('Sort Info ID: %r'%self.sort_info_id) - ans.append('Type: %r'%self.type) - ans.append('Creator: %r'%self.creator) - ans.append('Last record UID +1: %r'%self.last_record_uid) - ans.append('Next record list id: %r'%self.next_rec_list_id) - ans.append('Number of records: %s'%self.number_of_records) - - return '\n'.join(ans) -# }}} - -class Record(object): # {{{ - - def __init__(self, raw, header): - self.offset, self.flags, self.uid = header - self.raw = raw - - @property - def header(self): - return 'Offset: %d Flags: %d UID: %d First 4 bytes: %r Size: %d'%(self.offset, self.flags, - self.uid, self.raw[:4], len(self.raw)) -# }}} - -# EXTH {{{ -class EXTHRecord(object): - - def __init__(self, type_, data): - self.type 
= type_ - self.data = data - self.name = { - 1 : 'DRM Server id', - 2 : 'DRM Commerce id', - 3 : 'DRM ebookbase book id', - 100 : 'author', - 101 : 'publisher', - 102 : 'imprint', - 103 : 'description', - 104 : 'isbn', - 105 : 'subject', - 106 : 'publishingdate', - 107 : 'review', - 108 : 'contributor', - 109 : 'rights', - 110 : 'subjectcode', - 111 : 'type', - 112 : 'source', - 113 : 'asin', - 114 : 'versionnumber', - 115 : 'sample', - 116 : 'startreading', - 117 : 'adult', - 118 : 'retailprice', - 119 : 'retailpricecurrency', - 121 : 'KF8 header section index', - 125 : 'KF8 resources (images/fonts) count', - 129 : 'KF8 cover URI', - 131 : 'KF8 unknown count', - 201 : 'coveroffset', - 202 : 'thumboffset', - 203 : 'hasfakecover', - 204 : 'Creator Software', - 205 : 'Creator Major Version', # '>I' - 206 : 'Creator Minor Version', # '>I' - 207 : 'Creator Build Number', # '>I' - 208 : 'watermark', - 209 : 'tamper_proof_keys', - 300 : 'fontsignature', - 301 : 'clippinglimit', # percentage '>B' - 402 : 'publisherlimit', - 404 : 'TTS flag', # '>B' 1 - TTS disabled 0 - TTS enabled - 501 : 'cdetype', # 4 chars (PDOC or EBOK) - 502 : 'lastupdatetime', - 503 : 'updatedtitle', - }.get(self.type, repr(self.type)) - - if (self.name in {'coveroffset', 'thumboffset', 'hasfakecover', - 'Creator Major Version', 'Creator Minor Version', - 'Creator Build Number', 'Creator Software', 'startreading'} or - self.type in {121, 125, 131}): - self.data, = struct.unpack(b'>I', self.data) - - def __str__(self): - return '%s (%d): %r'%(self.name, self.type, self.data) - -class EXTHHeader(object): - - def __init__(self, raw): - self.raw = raw - if not self.raw.startswith(b'EXTH'): - raise ValueError('EXTH header does not start with EXTH') - self.length, = struct.unpack(b'>I', self.raw[4:8]) - self.count, = struct.unpack(b'>I', self.raw[8:12]) - - pos = 12 - self.records = [] - for i in xrange(self.count): - pos = self.read_record(pos) - self.records.sort(key=lambda x:x.type) - - def read_record(self, pos): - type_, length = struct.unpack(b'>II', self.raw[pos:pos+8]) - data = self.raw[(pos+8):(pos+length)] - self.records.append(EXTHRecord(type_, data)) - return pos + length - - def __str__(self): - ans = ['*'*20 + ' EXTH Header '+ '*'*20] - ans.append('EXTH header length: %d'%self.length) - ans.append('Number of EXTH records: %d'%self.count) - ans.append('EXTH records...') - for r in self.records: - ans.append(str(r)) - return '\n'.join(ans) -# }}} - -class MOBIHeader(object): # {{{ - - def __init__(self, record0): - self.raw = record0.raw - - self.compression_raw = self.raw[:2] - self.compression = {1: 'No compression', 2: 'PalmDoc compression', - 17480: 'HUFF/CDIC compression'}.get(struct.unpack(b'>H', - self.compression_raw)[0], - repr(self.compression_raw)) - self.unused = self.raw[2:4] - self.text_length, = struct.unpack(b'>I', self.raw[4:8]) - self.number_of_text_records, self.text_record_size = \ - struct.unpack(b'>HH', self.raw[8:12]) - self.encryption_type_raw, = struct.unpack(b'>H', self.raw[12:14]) - self.encryption_type = { - 0: 'No encryption', - 1: 'Old mobipocket encryption', - 2: 'Mobipocket encryption' - }.get(self.encryption_type_raw, repr(self.encryption_type_raw)) - self.unknown = self.raw[14:16] - - self.identifier = self.raw[16:20] - if self.identifier != b'MOBI': - raise ValueError('Identifier %r unknown'%self.identifier) - - self.length, = struct.unpack(b'>I', self.raw[20:24]) - self.type_raw, = struct.unpack(b'>I', self.raw[24:28]) - self.type = { - 2 : 'Mobipocket book', - 3 : 'PalmDOC book', - 
4 : 'Audio', - 257 : 'News', - 258 : 'News Feed', - 259 : 'News magazine', - 513 : 'PICS', - 514 : 'Word', - 515 : 'XLS', - 516 : 'PPT', - 517 : 'TEXT', - 518 : 'HTML', - }.get(self.type_raw, repr(self.type_raw)) - - self.encoding_raw, = struct.unpack(b'>I', self.raw[28:32]) - self.encoding = { - 1252 : 'cp1252', - 65001: 'utf-8', - }.get(self.encoding_raw, repr(self.encoding_raw)) - self.uid = self.raw[32:36] - self.file_version = struct.unpack(b'>I', self.raw[36:40]) - self.reserved = self.raw[40:48] - self.secondary_index_record, = struct.unpack(b'>I', self.raw[48:52]) - self.reserved2 = self.raw[52:80] - self.first_non_book_record, = struct.unpack(b'>I', self.raw[80:84]) - self.fullname_offset, = struct.unpack(b'>I', self.raw[84:88]) - self.fullname_length, = struct.unpack(b'>I', self.raw[88:92]) - self.locale_raw, = struct.unpack(b'>I', self.raw[92:96]) - langcode = self.locale_raw - langid = langcode & 0xFF - sublangid = (langcode >> 10) & 0xFF - self.language = main_language.get(langid, 'ENGLISH') - self.sublanguage = sub_language.get(sublangid, 'NEUTRAL') - - self.input_language = self.raw[96:100] - self.output_langauage = self.raw[100:104] - self.min_version, = struct.unpack(b'>I', self.raw[104:108]) - self.first_image_index, = struct.unpack(b'>I', self.raw[108:112]) - self.huffman_record_offset, = struct.unpack(b'>I', self.raw[112:116]) - self.huffman_record_count, = struct.unpack(b'>I', self.raw[116:120]) - self.datp_record_offset, = struct.unpack(b'>I', self.raw[120:124]) - self.datp_record_count, = struct.unpack(b'>I', self.raw[124:128]) - self.exth_flags, = struct.unpack(b'>I', self.raw[128:132]) - self.has_exth = bool(self.exth_flags & 0x40) - self.has_drm_data = self.length >= 174 and len(self.raw) >= 180 - if self.has_drm_data: - self.unknown3 = self.raw[132:164] - self.drm_offset, = struct.unpack(b'>I', self.raw[164:168]) - self.drm_count, = struct.unpack(b'>I', self.raw[168:172]) - self.drm_size, = struct.unpack(b'>I', self.raw[172:176]) - self.drm_flags = bin(struct.unpack(b'>I', self.raw[176:180])[0]) - self.has_extra_data_flags = self.length >= 232 and len(self.raw) >= 232+16 - self.has_fcis_flis = False - self.has_multibytes = self.has_indexing_bytes = self.has_uncrossable_breaks = False - self.extra_data_flags = 0 - if self.has_extra_data_flags: - self.unknown4 = self.raw[180:192] - self.first_content_record, self.last_content_record = \ - struct.unpack(b'>HH', self.raw[192:196]) - self.unknown5, = struct.unpack(b'>I', self.raw[196:200]) - (self.fcis_number, self.fcis_count, self.flis_number, - self.flis_count) = struct.unpack(b'>IIII', - self.raw[200:216]) - self.unknown6 = self.raw[216:224] - self.srcs_record_index = struct.unpack(b'>I', - self.raw[224:228])[0] - self.num_srcs_records = struct.unpack(b'>I', - self.raw[228:232])[0] - self.unknown7 = self.raw[232:240] - self.extra_data_flags = struct.unpack(b'>I', - self.raw[240:244])[0] - self.has_multibytes = bool(self.extra_data_flags & 0b1) - self.has_indexing_bytes = bool(self.extra_data_flags & 0b10) - self.has_uncrossable_breaks = bool(self.extra_data_flags & 0b100) - self.primary_index_record, = struct.unpack(b'>I', - self.raw[244:248]) - - if self.has_exth: - self.exth_offset = 16 + self.length - - self.exth = EXTHHeader(self.raw[self.exth_offset:]) - - self.end_of_exth = self.exth_offset + self.exth.length - self.bytes_after_exth = self.raw[self.end_of_exth:self.fullname_offset] - - def __str__(self): - ans = ['*'*20 + ' MOBI Header '+ '*'*20] - ans.append('Compression: %s'%self.compression) - 
ans.append('Unused: %r'%self.unused) - ans.append('Number of text records: %d'%self.number_of_text_records) - ans.append('Text record size: %d'%self.text_record_size) - ans.append('Encryption: %s'%self.encryption_type) - ans.append('Unknown: %r'%self.unknown) - ans.append('Identifier: %r'%self.identifier) - ans.append('Header length: %d'% self.length) - ans.append('Type: %s'%self.type) - ans.append('Encoding: %s'%self.encoding) - ans.append('UID: %r'%self.uid) - ans.append('File version: %d'%self.file_version) - ans.append('Reserved: %r'%self.reserved) - ans.append('Secondary index record: %d (null val: %d)'%( - self.secondary_index_record, NULL_INDEX)) - ans.append('Reserved2: %r'%self.reserved2) - ans.append('First non-book record (null value: %d): %d'%(NULL_INDEX, - self.first_non_book_record)) - ans.append('Full name offset: %d'%self.fullname_offset) - ans.append('Full name length: %d bytes'%self.fullname_length) - ans.append('Langcode: %r'%self.locale_raw) - ans.append('Language: %s'%self.language) - ans.append('Sub language: %s'%self.sublanguage) - ans.append('Input language: %r'%self.input_language) - ans.append('Output language: %r'%self.output_langauage) - ans.append('Min version: %d'%self.min_version) - ans.append('First Image index: %d'%self.first_image_index) - ans.append('Huffman record offset: %d'%self.huffman_record_offset) - ans.append('Huffman record count: %d'%self.huffman_record_count) - ans.append('DATP record offset: %r'%self.datp_record_offset) - ans.append('DATP record count: %r'%self.datp_record_count) - ans.append('EXTH flags: %s (%s)'%(bin(self.exth_flags)[2:], self.has_exth)) - if self.has_drm_data: - ans.append('Unknown3: %r'%self.unknown3) - ans.append('DRM Offset: %s'%self.drm_offset) - ans.append('DRM Count: %s'%self.drm_count) - ans.append('DRM Size: %s'%self.drm_size) - ans.append('DRM Flags: %r'%self.drm_flags) - if self.has_extra_data_flags: - ans.append('Unknown4: %r'%self.unknown4) - ans.append('First content record: %d'% self.first_content_record) - ans.append('Last content record: %d'% self.last_content_record) - ans.append('Unknown5: %d'% self.unknown5) - ans.append('FCIS number: %d'% self.fcis_number) - ans.append('FCIS count: %d'% self.fcis_count) - ans.append('FLIS number: %d'% self.flis_number) - ans.append('FLIS count: %d'% self.flis_count) - ans.append('Unknown6: %r'% self.unknown6) - ans.append('SRCS record index: %d'%self.srcs_record_index) - ans.append('Number of SRCS records?: %d'%self.num_srcs_records) - ans.append('Unknown7: %r'%self.unknown7) - ans.append(('Extra data flags: %s (has multibyte: %s) ' - '(has indexing: %s) (has uncrossable breaks: %s)')%( - bin(self.extra_data_flags), self.has_multibytes, - self.has_indexing_bytes, self.has_uncrossable_breaks )) - ans.append('Primary index record (null value: %d): %d'%(NULL_INDEX, - self.primary_index_record)) - - ans = '\n'.join(ans) - - if self.has_exth: - ans += '\n\n' + str(self.exth) - ans += '\n\nBytes after EXTH (%d bytes): %s'%( - len(self.bytes_after_exth), - format_bytes(self.bytes_after_exth)) - - ans += '\nNumber of bytes after full name: %d' % (len(self.raw) - (self.fullname_offset + - self.fullname_length)) - - ans += '\nRecord 0 length: %d'%len(self.raw) - return ans -# }}} - -class TagX(object): # {{{ - - def __init__(self, raw): - self.tag = ord(raw[0]) - self.num_values = ord(raw[1]) - self.bitmask = ord(raw[2]) - # End of file = 1 iff last entry - # When it is 1 all others are 0 - self.eof = ord(raw[3]) - - self.is_eof = (self.eof == 1 and self.tag == 0 and 
self.num_values == 0 - and self.bitmask == 0) - - def __repr__(self): - return 'TAGX(tag=%02d, num_values=%d, bitmask=%r, eof=%d)' % (self.tag, - self.num_values, bin(self.bitmask), self.eof) - # }}} - -class SecondaryIndexHeader(object): # {{{ - - def __init__(self, record): - self.record = record - raw = self.record.raw - #open('/t/index_header.bin', 'wb').write(raw) - if raw[:4] != b'INDX': - raise ValueError('Invalid Secondary Index Record') - self.header_length, = struct.unpack('>I', raw[4:8]) - self.unknown1 = raw[8:16] - self.index_type, = struct.unpack('>I', raw[16:20]) - self.index_type_desc = {0: 'normal', 2: - 'inflection', 6: 'calibre'}.get(self.index_type, 'unknown') - self.idxt_start, = struct.unpack('>I', raw[20:24]) - self.index_count, = struct.unpack('>I', raw[24:28]) - self.index_encoding_num, = struct.unpack('>I', raw[28:32]) - self.index_encoding = {65001: 'utf-8', 1252: - 'cp1252'}.get(self.index_encoding_num, 'unknown') - if self.index_encoding == 'unknown': - raise ValueError( - 'Unknown index encoding: %d'%self.index_encoding_num) - self.unknown2 = raw[32:36] - self.num_index_entries, = struct.unpack('>I', raw[36:40]) - self.ordt_start, = struct.unpack('>I', raw[40:44]) - self.ligt_start, = struct.unpack('>I', raw[44:48]) - self.num_of_ligt_entries, = struct.unpack('>I', raw[48:52]) - self.num_of_cncx_blocks, = struct.unpack('>I', raw[52:56]) - self.unknown3 = raw[56:180] - self.tagx_offset, = struct.unpack(b'>I', raw[180:184]) - if self.tagx_offset != self.header_length: - raise ValueError('TAGX offset and header length disagree') - self.unknown4 = raw[184:self.header_length] - - tagx = raw[self.header_length:] - if not tagx.startswith(b'TAGX'): - raise ValueError('Invalid TAGX section') - self.tagx_header_length, = struct.unpack('>I', tagx[4:8]) - self.tagx_control_byte_count, = struct.unpack('>I', tagx[8:12]) - tag_table = tagx[12:self.tagx_header_length] - if len(tag_table) % 4 != 0: - raise ValueError('Invalid Tag table') - num_tagx_entries = len(tag_table) // 4 - self.tagx_entries = [] - for i in range(num_tagx_entries): - self.tagx_entries.append(TagX(tag_table[i*4:(i+1)*4])) - if self.tagx_entries and not self.tagx_entries[-1].is_eof: - raise ValueError('TAGX last entry is not EOF') - - idxt0_pos = self.header_length+self.tagx_header_length - num = ord(raw[idxt0_pos]) - count_pos = idxt0_pos+1+num - self.last_entry = raw[idxt0_pos+1:count_pos] - self.ncx_count, = struct.unpack(b'>H', raw[count_pos:count_pos+2]) - - # There may be some alignment zero bytes between the end of the idxt0 - # and self.idxt_start - idxt = raw[self.idxt_start:] - if idxt[:4] != b'IDXT': - raise ValueError('Invalid IDXT header') - length_check, = struct.unpack(b'>H', idxt[4:6]) - if length_check != self.header_length + self.tagx_header_length: - raise ValueError('Length check failed') - if idxt[6:].replace(b'\0', b''): - raise ValueError('Non null trailing bytes after IDXT') - - - def __str__(self): - ans = ['*'*20 + ' Secondary Index Header '+ '*'*20] - a = ans.append - def u(w): - a('Unknown: %r (%d bytes) (All zeros: %r)'%(w, - len(w), not bool(w.replace(b'\0', b'')) )) - - a('Header length: %d'%self.header_length) - u(self.unknown1) - a('Index Type: %s (%d)'%(self.index_type_desc, self.index_type)) - a('Offset to IDXT start: %d'%self.idxt_start) - a('Number of index records: %d'%self.index_count) - a('Index encoding: %s (%d)'%(self.index_encoding, - self.index_encoding_num)) - u(self.unknown2) - a('Number of index entries: %d'% self.num_index_entries) - a('ORDT start: 
%d'%self.ordt_start) - a('LIGT start: %d'%self.ligt_start) - a('Number of LIGT entries: %d'%self.num_of_ligt_entries) - a('Number of cncx blocks: %d'%self.num_of_cncx_blocks) - u(self.unknown3) - a('TAGX offset: %d'%self.tagx_offset) - u(self.unknown4) - a('\n\n') - a('*'*20 + ' TAGX Header (%d bytes)'%self.tagx_header_length+ '*'*20) - a('Header length: %d'%self.tagx_header_length) - a('Control byte count: %d'%self.tagx_control_byte_count) - for i in self.tagx_entries: - a('\t' + repr(i)) - a('Index of last IndexEntry in secondary index record: %s'% self.last_entry) - a('Number of entries in the NCX: %d'% self.ncx_count) - - return '\n'.join(ans) - -# }}} - -class IndexHeader(object): # {{{ - - def __init__(self, record): - self.record = record - raw = self.record.raw - #open('/t/index_header.bin', 'wb').write(raw) - if raw[:4] != b'INDX': - raise ValueError('Invalid Primary Index Record') - - self.header_length, = struct.unpack('>I', raw[4:8]) - self.unknown1 = raw[8:16] - self.index_type, = struct.unpack('>I', raw[16:20]) - self.index_type_desc = {0: 'normal', 2: - 'inflection', 6: 'calibre'}.get(self.index_type, 'unknown') - self.idxt_start, = struct.unpack('>I', raw[20:24]) - self.index_count, = struct.unpack('>I', raw[24:28]) - self.index_encoding_num, = struct.unpack('>I', raw[28:32]) - self.index_encoding = {65001: 'utf-8', 1252: - 'cp1252'}.get(self.index_encoding_num, 'unknown') - if self.index_encoding == 'unknown': - raise ValueError( - 'Unknown index encoding: %d'%self.index_encoding_num) - self.possibly_language = raw[32:36] - self.num_index_entries, = struct.unpack('>I', raw[36:40]) - self.ordt_start, = struct.unpack('>I', raw[40:44]) - self.ligt_start, = struct.unpack('>I', raw[44:48]) - self.num_of_ligt_entries, = struct.unpack('>I', raw[48:52]) - self.num_of_cncx_blocks, = struct.unpack('>I', raw[52:56]) - self.unknown2 = raw[56:180] - self.tagx_offset, = struct.unpack(b'>I', raw[180:184]) - if self.tagx_offset != self.header_length: - raise ValueError('TAGX offset and header length disagree') - self.unknown3 = raw[184:self.header_length] - - tagx = raw[self.header_length:] - if not tagx.startswith(b'TAGX'): - raise ValueError('Invalid TAGX section') - self.tagx_header_length, = struct.unpack('>I', tagx[4:8]) - self.tagx_control_byte_count, = struct.unpack('>I', tagx[8:12]) - tag_table = tagx[12:self.tagx_header_length] - if len(tag_table) % 4 != 0: - raise ValueError('Invalid Tag table') - num_tagx_entries = len(tag_table) // 4 - self.tagx_entries = [] - for i in range(num_tagx_entries): - self.tagx_entries.append(TagX(tag_table[i*4:(i+1)*4])) - if self.tagx_entries and not self.tagx_entries[-1].is_eof: - raise ValueError('TAGX last entry is not EOF') - - idxt0_pos = self.header_length+self.tagx_header_length - last_num, consumed = decode_hex_number(raw[idxt0_pos:]) - count_pos = idxt0_pos + consumed - self.ncx_count, = struct.unpack(b'>H', raw[count_pos:count_pos+2]) - self.last_entry = last_num - - if last_num != self.ncx_count - 1: - raise ValueError('Last id number in the NCX != NCX count - 1') - # There may be some alignment zero bytes between the end of the idxt0 - # and self.idxt_start - - idxt = raw[self.idxt_start:] - if idxt[:4] != b'IDXT': - raise ValueError('Invalid IDXT header') - length_check, = struct.unpack(b'>H', idxt[4:6]) - if length_check != self.header_length + self.tagx_header_length: - raise ValueError('Length check failed') - if idxt[6:].replace(b'\0', b''): - raise ValueError('Non null trailing bytes after IDXT') - - - def __str__(self): - ans = 
['*'*20 + ' Index Header (%d bytes)'%len(self.record.raw)+ '*'*20] - a = ans.append - def u(w): - a('Unknown: %r (%d bytes) (All zeros: %r)'%(w, - len(w), not bool(w.replace(b'\0', b'')) )) - - a('Header length: %d'%self.header_length) - u(self.unknown1) - a('Index Type: %s (%d)'%(self.index_type_desc, self.index_type)) - a('Offset to IDXT start: %d'%self.idxt_start) - a('Number of index records: %d'%self.index_count) - a('Index encoding: %s (%d)'%(self.index_encoding, - self.index_encoding_num)) - a('Unknown (possibly language?): %r'%(self.possibly_language)) - a('Number of index entries: %d'% self.num_index_entries) - a('ORDT start: %d'%self.ordt_start) - a('LIGT start: %d'%self.ligt_start) - a('Number of LIGT entries: %d'%self.num_of_ligt_entries) - a('Number of cncx blocks: %d'%self.num_of_cncx_blocks) - u(self.unknown2) - a('TAGX offset: %d'%self.tagx_offset) - u(self.unknown3) - a('\n\n') - a('*'*20 + ' TAGX Header (%d bytes)'%self.tagx_header_length+ '*'*20) - a('Header length: %d'%self.tagx_header_length) - a('Control byte count: %d'%self.tagx_control_byte_count) - for i in self.tagx_entries: - a('\t' + repr(i)) - a('Index of last IndexEntry in primary index record: %s'% self.last_entry) - a('Number of entries in the NCX: %d'% self.ncx_count) - - return '\n'.join(ans) - # }}} - -class Tag(object): # {{{ - - ''' - Index entries are a collection of tags. Each tag is represented by this - class. - ''' - - TAG_MAP = { - 1: ('offset', 'Offset in HTML'), - 2: ('size', 'Size in HTML'), - 3: ('label_offset', 'Label offset in CNCX'), - 4: ('depth', 'Depth of this entry in TOC'), - 5: ('class_offset', 'Class offset in CNCX'), - 6: ('pos_fid', 'File Index'), - - 11: ('secondary', '[unknown, unknown, ' - 'tag type from TAGX in primary index header]'), - - 21: ('parent_index', 'Parent'), - 22: ('first_child_index', 'First child'), - 23: ('last_child_index', 'Last child'), - - 69 : ('image_index', 'Offset from first image record to the' - ' image record associated with this entry' - ' (masthead for periodical or thumbnail for' - ' article entry).'), - 70 : ('desc_offset', 'Description offset in cncx'), - 71 : ('author_offset', 'Author offset in cncx'), - 72 : ('image_caption_offset', 'Image caption offset in cncx'), - 73 : ('image_attr_offset', 'Image attribution offset in cncx'), - - } - - def __init__(self, tagx, vals, entry_type, cncx): - self.value = vals if len(vals) > 1 else vals[0] if vals else None - self.entry_type = entry_type - tag_type = tagx.tag - - self.cncx_value = None - if tag_type in self.TAG_MAP: - self.attr, self.desc = self.TAG_MAP[tag_type] - else: - print ('Unknown tag value: %d in entry type: %s'%(tag_type, - entry_type)) - self.desc = '??Unknown (tag value: %d type: %s)'%( - tag_type, entry_type) - self.attr = 'unknown' - - if '_offset' in self.attr: - self.cncx_value = cncx[self.value] - - def __str__(self): - if self.cncx_value is not None: - return '%s : %r [%r]'%(self.desc, self.value, self.cncx_value) - return '%s : %r'%(self.desc, self.value) - -# }}} - -class IndexEntry(object): # {{{ - - ''' - The index is made up of entries, each of which is represented by an - instance of this class. Index entries typically point to offsets in the - HTML, specify HTML sizes and point to text strings in the CNCX that are - used in the navigation UI. 
- ''' - - def __init__(self, ident, entry_type, raw, cncx, tagx_entries, - control_byte_count): - self.index = ident - self.raw = raw - self.tags = [] - self.entry_type = entry_type - self.byte_size = len(raw) - - orig_raw = raw - - if control_byte_count not in (1, 2): - raise ValueError('Unknown control byte count: %d'% - control_byte_count) - - self.flags = 0 - - if control_byte_count == 2: - self.flags = ord(raw[0]) - raw = raw[1:] - - expected_tags = [tag for tag in tagx_entries if tag.bitmask & - entry_type] - - flags = self.flags - for tag in expected_tags: - vals = [] - - if tag.tag > 0b1000000: # 0b1000000 = 64 - has_tag = flags & 0b1 - flags = flags >> 1 - if not has_tag: continue - for i in range(tag.num_values): - if not raw: - raise ValueError('Index entry does not match TAGX header') - val, consumed = decint(raw) - raw = raw[consumed:] - vals.append(val) - self.tags.append(Tag(tag, vals, self.entry_type, cncx)) - - self.consumed = len(orig_raw) - len(raw) - self.trailing_bytes = raw - if self.trailing_bytes.replace(b'\0', b''): - raise ValueError('%s has leftover bytes: %s'%(self, format_bytes( - self.trailing_bytes))) - - @property - def label(self): - for tag in self.tags: - if tag.attr == 'label_offset': - return tag.cncx_value - return '' - - @property - def offset(self): - for tag in self.tags: - if tag.attr == 'offset': - return tag.value - return 0 - - @property - def size(self): - for tag in self.tags: - if tag.attr == 'size': - return tag.value - return 0 - - @property - def depth(self): - for tag in self.tags: - if tag.attr == 'depth': - return tag.value - return 0 - - @property - def parent_index(self): - for tag in self.tags: - if tag.attr == 'parent_index': - return tag.value - return -1 - - @property - def first_child_index(self): - for tag in self.tags: - if tag.attr == 'first_child_index': - return tag.value - return -1 - - @property - def last_child_index(self): - for tag in self.tags: - if tag.attr == 'last_child_index': - return tag.value - return -1 - - @property - def pos_fid(self): - for tag in self.tags: - if tag.attr == 'pos_fid': - return tag.value - return [0, 0] - - def __str__(self): - ans = ['Index Entry(index=%s, entry_type=%s, flags=%s, ' - 'length=%d, byte_size=%d)'%( - self.index, bin(self.entry_type), bin(self.flags)[2:], - len(self.tags), self.byte_size)] - for tag in self.tags: - if tag.value is not None: - ans.append('\t'+str(tag)) - if self.first_child_index != -1: - ans.append('\tNumber of children: %d'%(self.last_child_index - - self.first_child_index + 1)) - if self.trailing_bytes: - ans.append('\tTrailing bytes: %r'%self.trailing_bytes) - return '\n'.join(ans) - -# }}} - -class SecondaryIndexRecord(object): # {{{ - - def __init__(self, record, index_header, cncx): - self.record = record - raw = self.record.raw - - if raw[:4] != b'INDX': - raise ValueError('Invalid Primary Index Record') - - u = struct.unpack - - self.header_length, = u('>I', raw[4:8]) - self.unknown1 = raw[8:12] - self.header_type, = u('>I', raw[12:16]) - self.unknown2 = raw[16:20] - self.idxt_offset, self.idxt_count = u(b'>II', raw[20:28]) - if self.idxt_offset < 192: - raise ValueError('Unknown Index record structure') - self.unknown3 = raw[28:36] - self.unknown4 = raw[36:192] # Should be 156 bytes - - self.index_offsets = [] - indices = raw[self.idxt_offset:] - if indices[:4] != b'IDXT': - raise ValueError("Invalid IDXT index table") - indices = indices[4:] - for i in range(self.idxt_count): - off, = u(b'>H', indices[i*2:(i+1)*2]) - 
self.index_offsets.append(off-192) - rest = indices[(i+1)*2:] - if rest.replace(b'\0', ''): # There can be padding null bytes - raise ValueError('Extra bytes after IDXT table: %r'%rest) - - indxt = raw[192:self.idxt_offset] - self.size_of_indxt_block = len(indxt) - - self.indices = [] - for i, off in enumerate(self.index_offsets): - try: - next_off = self.index_offsets[i+1] - except: - next_off = len(indxt) - num = ord(indxt[off]) - index = indxt[off+1:off+1+num] - consumed = 1 + num - entry_type = ord(indxt[off+consumed]) - pos = off+consumed+1 - idxe = IndexEntry(index, entry_type, - indxt[pos:next_off], cncx, - index_header.tagx_entries, - index_header.tagx_control_byte_count) - self.indices.append(idxe) - - rest = indxt[pos+self.indices[-1].consumed:] - if rest.replace(b'\0', b''): # There can be padding null bytes - raise ValueError('Extra bytes after IDXT table: %r'%rest) - - - def __str__(self): - ans = ['*'*20 + ' Secondary Index Record (%d bytes) '%len(self.record.raw)+ '*'*20] - a = ans.append - def u(w): - a('Unknown: %r (%d bytes) (All zeros: %r)'%(w, - len(w), not bool(w.replace(b'\0', b'')) )) - a('Header length: %d'%self.header_length) - u(self.unknown1) - a('Unknown (header type? index record number? always 1?): %d'%self.header_type) - u(self.unknown2) - a('IDXT Offset (%d block size): %d'%(self.size_of_indxt_block, - self.idxt_offset)) - a('IDXT Count: %d'%self.idxt_count) - u(self.unknown3) - u(self.unknown4) - a('Index offsets: %r'%self.index_offsets) - a('\nIndex Entries (%d entries):'%len(self.indices)) - for entry in self.indices: - a(str(entry)) - a('') - - - return '\n'.join(ans) - -# }}} - -class IndexRecord(object): # {{{ - - ''' - Represents all indexing information in the MOBI, apart from indexing info - in the trailing data of the text records. 
- ''' - - def __init__(self, record, index_header, cncx): - self.record = record - self.alltext = None - raw = self.record.raw - - if raw[:4] != b'INDX': - raise ValueError('Invalid Primary Index Record') - - u = struct.unpack - - self.header_length, = u('>I', raw[4:8]) - self.unknown1 = raw[8:12] - self.header_type, = u('>I', raw[12:16]) - self.unknown2 = raw[16:20] - self.idxt_offset, self.idxt_count = u(b'>II', raw[20:28]) - if self.idxt_offset < 192: - raise ValueError('Unknown Index record structure') - self.unknown3 = raw[28:36] - self.unknown4 = raw[36:192] # Should be 156 bytes - - self.index_offsets = [] - indices = raw[self.idxt_offset:] - if indices[:4] != b'IDXT': - raise ValueError("Invalid IDXT index table") - indices = indices[4:] - for i in range(self.idxt_count): - off, = u(b'>H', indices[i*2:(i+1)*2]) - self.index_offsets.append(off-192) - rest = indices[(i+1)*2:] - if rest.replace(b'\0', ''): # There can be padding null bytes - raise ValueError('Extra bytes after IDXT table: %r'%rest) - - indxt = raw[192:self.idxt_offset] - self.size_of_indxt_block = len(indxt) - self.indices = [] - for i, off in enumerate(self.index_offsets): - try: - next_off = self.index_offsets[i+1] - except: - next_off = len(indxt) - index, consumed = decode_hex_number(indxt[off:]) - entry_type = ord(indxt[off+consumed]) - pos = off+consumed+1 - idxe = IndexEntry(index, entry_type, - indxt[pos:next_off], cncx, - index_header.tagx_entries, - index_header.tagx_control_byte_count) - self.indices.append(idxe) - - rest = indxt[pos+self.indices[-1].consumed:] - if rest.replace(b'\0', b''): # There can be padding null bytes - raise ValueError('Extra bytes after IDXT table: %r'%rest) - - def get_parent(self, index): - if index.depth < 1: - return None - parent_depth = index.depth - 1 - for p in self.indices: - if p.depth != parent_depth: - continue - - - def __str__(self): - ans = ['*'*20 + ' Index Record (%d bytes) '%len(self.record.raw)+ '*'*20] - a = ans.append - def u(w): - a('Unknown: %r (%d bytes) (All zeros: %r)'%(w, - len(w), not bool(w.replace(b'\0', b'')) )) - a('Header length: %d'%self.header_length) - u(self.unknown1) - a('Unknown (header type? index record number? always 1?): %d'%self.header_type) - u(self.unknown2) - a('IDXT Offset (%d block size): %d'%(self.size_of_indxt_block, - self.idxt_offset)) - a('IDXT Count: %d'%self.idxt_count) - u(self.unknown3) - u(self.unknown4) - a('Index offsets: %r'%self.index_offsets) - a('\nIndex Entries (%d entries):'%len(self.indices)) - for entry in self.indices: - offset = entry.offset - a(str(entry)) - t = self.alltext - if offset is not None and self.alltext is not None: - a('\tHTML before offset: %r'%t[offset-50:offset]) - a('\tHTML after offset: %r'%t[offset:offset+50]) - p = offset+entry.size - a('\tHTML before end: %r'%t[p-50:p]) - a('\tHTML after end: %r'%t[p:p+50]) - - a('') - - return '\n'.join(ans) - -# }}} - -class CNCX(object): # {{{ - - ''' - Parses the records that contain the compiled NCX (all strings from the - NCX). Presents a simple offset : string mapping interface to access the - data. 
- ''' - - def __init__(self, records, codec): - self.records = OrderedDict() - record_offset = 0 - for record in records: - raw = record.raw - pos = 0 - while pos < len(raw): - length, consumed = decint(raw[pos:]) - if length > 0: - try: - self.records[pos+record_offset] = raw[ - pos+consumed:pos+consumed+length].decode(codec) - except: - byts = raw[pos:] - r = format_bytes(byts) - print ('CNCX entry at offset %d has unknown format %s'%( - pos+record_offset, r)) - self.records[pos+record_offset] = r - pos = len(raw) - pos += consumed+length - record_offset += 0x10000 - - def __getitem__(self, offset): - return self.records.get(offset) - - def __str__(self): - ans = ['*'*20 + ' cncx (%d strings) '%len(self.records)+ '*'*20] - for k, v in self.records.iteritems(): - ans.append('%10d : %s'%(k, v)) - return '\n'.join(ans) - - -# }}} - -class TextRecord(object): # {{{ - - def __init__(self, idx, record, extra_data_flags, decompress): - self.trailing_data, self.raw = get_trailing_data(record.raw, extra_data_flags) - raw_trailing_bytes = record.raw[len(self.raw):] - self.raw = decompress(self.raw) - - if 0 in self.trailing_data: - self.trailing_data['multibyte_overlap'] = self.trailing_data.pop(0) - if 1 in self.trailing_data: - self.trailing_data['indexing'] = self.trailing_data.pop(1) - if 2 in self.trailing_data: - self.trailing_data['uncrossable_breaks'] = self.trailing_data.pop(2) - self.trailing_data['raw_bytes'] = raw_trailing_bytes - - for typ, val in self.trailing_data.iteritems(): - if isinstance(typ, int): - print ('Record %d has unknown trailing data of type: %d : %r'% - (idx, typ, val)) - - self.idx = idx - - def dump(self, folder): - name = '%06d'%self.idx - with open(os.path.join(folder, name+'.txt'), 'wb') as f: - f.write(self.raw) - with open(os.path.join(folder, name+'.trailing_data'), 'wb') as f: - for k, v in self.trailing_data.iteritems(): - raw = '%s : %r\n\n'%(k, v) - f.write(raw.encode('utf-8')) - -# }}} - -class ImageRecord(object): # {{{ - - def __init__(self, idx, record, fmt): - self.raw = record.raw - self.fmt = fmt - self.idx = idx - - def dump(self, folder): - name = '%06d'%self.idx - with open(os.path.join(folder, name+'.'+self.fmt), 'wb') as f: - f.write(self.raw) - -# }}} - -class BinaryRecord(object): # {{{ - - def __init__(self, idx, record): - self.raw = record.raw - sig = self.raw[:4] - name = '%06d'%idx - if sig in {b'FCIS', b'FLIS', b'SRCS', b'DATP', b'RESC', b'BOUN', - b'FDST', b'AUDI', b'VIDE',}: - name += '-' + sig.decode('ascii') - elif sig == b'\xe9\x8e\r\n': - name += '-' + 'EOF' - self.name = name - - def dump(self, folder): - with open(os.path.join(folder, self.name+'.bin'), 'wb') as f: - f.write(self.raw) - -# }}} - -class FontRecord(object): # {{{ - - def __init__(self, idx, record): - self.raw = record.raw - name = '%06d'%idx - self.font = read_font_record(self.raw) - if self.font['err']: - raise ValueError('Failed to read font record: %s Headers: %s'%( - self.font['err'], self.font['headers'])) - self.payload = (self.font['font_data'] if self.font['font_data'] else - self.font['raw_data']) - self.name = '%s.%s'%(name, self.font['ext']) - - def dump(self, folder): - with open(os.path.join(folder, self.name), 'wb') as f: - f.write(self.payload) - -# }}} - -class TBSIndexing(object): # {{{ - - def __init__(self, text_records, indices, doc_type): - self.record_indices = OrderedDict() - self.doc_type = doc_type - self.indices = indices - pos = 0 - for r in text_records: - start = pos - pos += len(r.raw) - end = pos - 1 - self.record_indices[r] = x = 
{'starts':[], 'ends':[], - 'complete':[], 'geom': (start, end)} - for entry in indices: - istart, sz = entry.offset, entry.size - iend = istart + sz - 1 - has_start = istart >= start and istart <= end - has_end = iend >= start and iend <= end - rec = None - if has_start and has_end: - rec = 'complete' - elif has_start and not has_end: - rec = 'starts' - elif not has_start and has_end: - rec = 'ends' - if rec: - x[rec].append(entry) - - def get_index(self, idx): - for i in self.indices: - if i.index == idx: return i - raise IndexError('Index %d not found'%idx) - - def __str__(self): - ans = ['*'*20 + ' TBS Indexing (%d records) '%len(self.record_indices)+ '*'*20] - for r, dat in self.record_indices.iteritems(): - ans += self.dump_record(r, dat)[-1] - return '\n'.join(ans) - - def dump(self, bdir): - types = defaultdict(list) - for r, dat in self.record_indices.iteritems(): - tbs_type, strings = self.dump_record(r, dat) - if tbs_type == 0: continue - types[tbs_type] += strings - for typ, strings in types.iteritems(): - with open(os.path.join(bdir, 'tbs_type_%d.txt'%typ), 'wb') as f: - f.write('\n'.join(strings)) - - def dump_record(self, r, dat): - ans = [] - ans.append('\nRecord #%d: Starts at: %d Ends at: %d'%(r.idx, - dat['geom'][0], dat['geom'][1])) - s, e, c = dat['starts'], dat['ends'], dat['complete'] - ans.append(('\tContains: %d index entries ' - '(%d ends, %d complete, %d starts)')%tuple(map(len, (s+e+c, e, - c, s)))) - byts = bytearray(r.trailing_data.get('indexing', b'')) - ans.append('TBS bytes: %s'%format_bytes(byts)) - for typ, entries in (('Ends', e), ('Complete', c), ('Starts', s)): - if entries: - ans.append('\t%s:'%typ) - for x in entries: - ans.append(('\t\tIndex Entry: %d (Parent index: %d, ' - 'Depth: %d, Offset: %d, Size: %d) [%s]')%( - x.index, x.parent_index, x.depth, x.offset, x.size, x.label)) - def bin4(num): - ans = bin(num)[2:] - return bytes('0'*(4-len(ans)) + ans) - - def repr_extra(x): - return str({bin4(k):v for k, v in extra.iteritems()}) - - tbs_type = 0 - is_periodical = self.doc_type in (257, 258, 259) - if len(byts): - outermost_index, extra, consumed = decode_tbs(byts, flag_size=3) - byts = byts[consumed:] - for k in extra: - tbs_type |= k - ans.append('\nTBS: %d (%s)'%(tbs_type, bin4(tbs_type))) - ans.append('Outermost index: %d'%outermost_index) - ans.append('Unknown extra start bytes: %s'%repr_extra(extra)) - if is_periodical: # Hierarchical periodical - try: - byts, a = self.interpret_periodical(tbs_type, byts, - dat['geom'][0]) - except: - import traceback - traceback.print_exc() - a = [] - print ('Failed to decode TBS bytes for record: %d'%r.idx) - ans += a - if byts: - sbyts = tuple(hex(b)[2:] for b in byts) - ans.append('Remaining bytes: %s'%' '.join(sbyts)) - - ans.append('') - return tbs_type, ans - - def interpret_periodical(self, tbs_type, byts, record_offset): - ans = [] - - def read_section_transitions(byts, psi=None): # {{{ - if psi is None: - # Assume previous section is 1 - psi = self.get_index(1) - - while byts: - ai, extra, consumed = decode_tbs(byts) - byts = byts[consumed:] - if extra.get(0b0010, None) is not None: - raise ValueError('Dont know how to interpret flag 0b0010' - ' while reading section transitions') - if extra.get(0b1000, None) is not None: - if len(extra) > 1: - raise ValueError('Dont know how to interpret flags' - ' %r while reading section transitions'%extra) - nsi = self.get_index(psi.index+1) - ans.append('Last article in this record of section %d' - ' (relative to next section index [%d]): ' - '%d [%d absolute 
index]'%(psi.index, nsi.index, ai, - ai+nsi.index)) - psi = nsi - continue - - ans.append('First article in this record of section %d' - ' (relative to its parent section): ' - '%d [%d absolute index]'%(psi.index, ai, ai+psi.index)) - - num = extra.get(0b0100, None) - if num is None: - msg = ('The section %d has at most one article' - ' in this record')%psi.index - else: - msg = ('Number of articles in this record of ' - 'section %d: %d')%(psi.index, num) - ans.append(msg) - - offset = extra.get(0b0001, None) - if offset is not None: - if offset == 0: - ans.append('This record is spanned by the article:' - '%d'%(ai+psi.index)) - else: - ans.append('->Offset to start of next section (%d) from start' - ' of record: %d [%d absolute offset]'%(psi.index+1, - offset, offset+record_offset)) - return byts - # }}} - - def read_starting_section(byts): # {{{ - orig = byts - si, extra, consumed = decode_tbs(byts) - byts = byts[consumed:] - if len(extra) > 1 or 0b0010 in extra or 0b1000 in extra: - raise ValueError('Dont know how to interpret flags %r' - ' when reading starting section'%extra) - si = self.get_index(si) - ans.append('The section at the start of this record is:' - ' %d'%si.index) - if 0b0100 in extra: - num = extra[0b0100] - ans.append('The number of articles from the section %d' - ' in this record: %d'%(si.index, num)) - elif 0b0001 in extra: - eof = extra[0b0001] - if eof != 0: - raise ValueError('Unknown eof value %s when reading' - ' starting section. All bytes: %r'%(eof, orig)) - ans.append('??This record has more than one article from ' - ' the section: %d'%si.index) - return si, byts - # }}} - - if tbs_type & 0b0100: - # Starting section is the first section - ssi = self.get_index(1) - else: - ssi, byts = read_starting_section(byts) - - byts = read_section_transitions(byts, ssi) - - return byts, ans - -# }}} - -class MOBIFile(object): # {{{ - - def __init__(self, stream): - self.raw = stream.read() - - self.palmdb = PalmDB(self.raw[:78]) - - self.record_headers = [] - self.records = [] - for i in xrange(self.palmdb.number_of_records): - pos = 78 + i * 8 - offset, a1, a2, a3, a4 = struct.unpack(b'>LBBBB', self.raw[pos:pos+8]) - flags, val = a1, a2 << 16 | a3 << 8 | a4 - self.record_headers.append((offset, flags, val)) - - def section(section_number): - if section_number == self.palmdb.number_of_records - 1: - end_off = len(self.raw) - else: - end_off = self.record_headers[section_number + 1][0] - off = self.record_headers[section_number][0] - return self.raw[off:end_off] - - for i in range(self.palmdb.number_of_records): - self.records.append(Record(section(i), self.record_headers[i])) - - self.mobi_header = MOBIHeader(self.records[0]) - self.huffman_record_nums = [] - - if 'huff' in self.mobi_header.compression.lower(): - self.huffman_record_nums = list(xrange(self.mobi_header.huffman_record_offset, - self.mobi_header.huffman_record_offset + - self.mobi_header.huffman_record_count)) - huffrecs = [self.records[r].raw for r in self.huffman_record_nums] - from calibre.ebooks.mobi.huffcdic import HuffReader - huffs = HuffReader(huffrecs) - decompress = huffs.unpack - elif 'palmdoc' in self.mobi_header.compression.lower(): - from calibre.ebooks.compression.palmdoc import decompress_doc - decompress = decompress_doc - else: - decompress = lambda x: x - - self.index_header = self.index_record = None - self.indexing_record_nums = set() - pir = self.mobi_header.primary_index_record - if pir != NULL_INDEX: - self.index_header = IndexHeader(self.records[pir]) - self.cncx = CNCX(self.records[ 
- pir+2:pir+2+self.index_header.num_of_cncx_blocks], - self.index_header.index_encoding) - self.index_record = IndexRecord(self.records[pir+1], - self.index_header, self.cncx) - self.indexing_record_nums = set(xrange(pir, - pir+2+self.index_header.num_of_cncx_blocks)) - self.secondary_index_record = self.secondary_index_header = None - sir = self.mobi_header.secondary_index_record - if sir != NULL_INDEX: - self.secondary_index_header = SecondaryIndexHeader(self.records[sir]) - self.indexing_record_nums.add(sir) - self.secondary_index_record = SecondaryIndexRecord( - self.records[sir+1], self.secondary_index_header, self.cncx) - self.indexing_record_nums.add(sir+1) - - - ntr = self.mobi_header.number_of_text_records - fntbr = self.mobi_header.first_non_book_record - fii = self.mobi_header.first_image_index - if fntbr == NULL_INDEX: - fntbr = len(self.records) - self.text_records = [TextRecord(r, self.records[r], - self.mobi_header.extra_data_flags, decompress) for r in xrange(1, - min(len(self.records), ntr+1))] - self.image_records, self.binary_records = [], [] - self.font_records = [] - image_index = 0 - for i in xrange(fntbr, len(self.records)): - if i in self.indexing_record_nums or i in self.huffman_record_nums: - continue - image_index += 1 - r = self.records[i] - fmt = None - if i >= fii and r.raw[:4] not in {b'FLIS', b'FCIS', b'SRCS', - b'\xe9\x8e\r\n', b'RESC', b'BOUN', b'FDST', b'DATP', - b'AUDI', b'VIDE', b'FONT'}: - try: - width, height, fmt = identify_data(r.raw) - except: - pass - if fmt is not None: - self.image_records.append(ImageRecord(image_index, r, fmt)) - elif r.raw[:4] == b'FONT': - self.font_records.append(FontRecord(i, r)) - else: - self.binary_records.append(BinaryRecord(i, r)) - - if self.index_record is not None: - self.tbs_indexing = TBSIndexing(self.text_records, - self.index_record.indices, self.mobi_header.type_raw) - - def print_header(self, f=sys.stdout): - print (str(self.palmdb).encode('utf-8'), file=f) - print (file=f) - print ('Record headers:', file=f) - for i, r in enumerate(self.records): - print ('%6d. 
%s'%(i, r.header), file=f) - - print (file=f) - print (str(self.mobi_header).encode('utf-8'), file=f) -# }}} - -def inspect_mobi(path_or_stream, ddir=None): # {{{ - stream = (path_or_stream if hasattr(path_or_stream, 'read') else - open(path_or_stream, 'rb')) - f = MOBIFile(stream) - if ddir is None: - ddir = 'decompiled_' + os.path.splitext(os.path.basename(stream.name))[0] - try: - shutil.rmtree(ddir) - except: - pass - os.makedirs(ddir) - with open(os.path.join(ddir, 'header.txt'), 'wb') as out: - f.print_header(f=out) - - alltext = os.path.join(ddir, 'text.html') - with open(alltext, 'wb') as of: - alltext = b'' - for rec in f.text_records: - of.write(rec.raw) - alltext += rec.raw - of.seek(0) - if f.mobi_header.file_version < 8: - root = html.fromstring(alltext.decode('utf-8')) - with open(os.path.join(ddir, 'pretty.html'), 'wb') as of: - of.write(html.tostring(root, pretty_print=True, encoding='utf-8', - include_meta_content_type=True)) - - - if f.index_header is not None: - f.index_record.alltext = alltext - with open(os.path.join(ddir, 'index.txt'), 'wb') as out: - print(str(f.index_header), file=out) - print('\n\n', file=out) - if f.secondary_index_header is not None: - print(str(f.secondary_index_header).encode('utf-8'), file=out) - print('\n\n', file=out) - if f.secondary_index_record is not None: - print(str(f.secondary_index_record).encode('utf-8'), file=out) - print('\n\n', file=out) - print(str(f.cncx).encode('utf-8'), file=out) - print('\n\n', file=out) - print(str(f.index_record), file=out) - with open(os.path.join(ddir, 'tbs_indexing.txt'), 'wb') as out: - print(str(f.tbs_indexing), file=out) - f.tbs_indexing.dump(ddir) - - for tdir, attr in [('text', 'text_records'), ('images', 'image_records'), - ('binary', 'binary_records'), ('font', 'font_records')]: - tdir = os.path.join(ddir, tdir) - os.mkdir(tdir) - for rec in getattr(f, attr): - rec.dump(tdir) - - - print ('Debug data saved to:', ddir) - -# }}} - -def main(): - inspect_mobi(sys.argv[1]) - -if __name__ == '__main__': - main() - diff --git a/src/calibre/ebooks/mobi/debug/__init__.py b/src/calibre/ebooks/mobi/debug/__init__.py new file mode 100644 index 0000000000..b472bf3148 --- /dev/null +++ b/src/calibre/ebooks/mobi/debug/__init__.py @@ -0,0 +1,16 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2012, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + + +def format_bytes(byts): + byts = bytearray(byts) + byts = [hex(b)[2:] for b in byts] + return ' '.join(byts) + + diff --git a/src/calibre/ebooks/mobi/debug/headers.py b/src/calibre/ebooks/mobi/debug/headers.py new file mode 100644 index 0000000000..2cc7954559 --- /dev/null +++ b/src/calibre/ebooks/mobi/debug/headers.py @@ -0,0 +1,535 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2012, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import struct, datetime, os + +from calibre.utils.date import utc_tz +from calibre.ebooks.mobi.reader.headers import NULL_INDEX +from calibre.ebooks.mobi.langcodes import main_language, sub_language +from calibre.ebooks.mobi.debug import format_bytes +from calibre.ebooks.mobi.utils import get_trailing_data + +# PalmDB {{{ +class PalmDOCAttributes(object): + + class Attr(object): + + def __init__(self, 
name, field, val):
+            self.name = name
+            self.val = val & field
+
+        def __str__(self):
+            return '%s: %s'%(self.name, bool(self.val))
+
+    def __init__(self, raw):
+        self.val = struct.unpack(b'<H', raw)[0]
+        self.attributes = []
+        for name, field in [('Read Only', 0x02), ('Dirty AppInfoArea', 0x04),
+                ('Backup this database', 0x08),
+                ('Okay to install newer over existing copy, if present on PalmPilot', 0x10),
+                ('Force the PalmPilot to reset after this database is installed', 0x12),
+                ('Don\'t allow copy of file to be beamed to other Pilot', 0x14)]:
+            self.attributes.append(PalmDOCAttributes.Attr(name, field,
+                self.val))
+
+    def __str__(self):
+        attrs = '\n\t'.join([str(x) for x in self.attributes])
+        return 'PalmDOC Attributes: %s\n\t%s'%(bin(self.val), attrs)
+
+class PalmDB(object):
+
+    def __init__(self, raw):
+        self.raw = raw
+
+        if self.raw.startswith(b'TPZ'):
+            raise ValueError('This is a Topaz file')
+
+        self.name = self.raw[:32].replace(b'\x00', b'')
+        self.attributes = PalmDOCAttributes(self.raw[32:34])
+        self.version = struct.unpack(b'>H', self.raw[34:36])[0]
+
+        palm_epoch = datetime.datetime(1904, 1, 1, tzinfo=utc_tz)
+        self.creation_date_raw = struct.unpack(b'>I', self.raw[36:40])[0]
+        self.creation_date = (palm_epoch +
+                datetime.timedelta(seconds=self.creation_date_raw))
+        self.modification_date_raw = struct.unpack(b'>I', self.raw[40:44])[0]
+        self.modification_date = (palm_epoch +
+                datetime.timedelta(seconds=self.modification_date_raw))
+        self.last_backup_date_raw = struct.unpack(b'>I', self.raw[44:48])[0]
+        self.last_backup_date = (palm_epoch +
+                datetime.timedelta(seconds=self.last_backup_date_raw))
+        self.modification_number = struct.unpack(b'>I', self.raw[48:52])[0]
+        self.app_info_id = self.raw[52:56]
+        self.sort_info_id = self.raw[56:60]
+        self.type = self.raw[60:64]
+        self.creator = self.raw[64:68]
+        self.ident = self.type + self.creator
+        if self.ident not in (b'BOOKMOBI', b'TEXTREAD'):
+            raise ValueError('Unknown book ident: %r'%self.ident)
+        self.last_record_uid, = struct.unpack(b'>I', self.raw[68:72])
+        self.next_rec_list_id = self.raw[72:76]
+
+        self.number_of_records, = struct.unpack(b'>H', self.raw[76:78])
+
+    def __str__(self):
+        ans = ['*'*20 + ' PalmDB Header '+ '*'*20]
+        ans.append('Name: %r'%self.name)
+        ans.append(str(self.attributes))
+        ans.append('Version: %s'%self.version)
+        ans.append('Creation date: %s (%s)'%(self.creation_date.isoformat(),
+            self.creation_date_raw))
+        ans.append('Modification date: %s (%s)'%(self.modification_date.isoformat(),
+            self.modification_date_raw))
+        ans.append('Backup date: %s (%s)'%(self.last_backup_date.isoformat(),
+            self.last_backup_date_raw))
+        ans.append('Modification number: %s'%self.modification_number)
+        ans.append('App Info ID: %r'%self.app_info_id)
+        ans.append('Sort Info ID: %r'%self.sort_info_id)
+        ans.append('Type: %r'%self.type)
+        ans.append('Creator: %r'%self.creator)
+        ans.append('Last record UID +1: %r'%self.last_record_uid)
+        ans.append('Next record list id: %r'%self.next_rec_list_id)
+        ans.append('Number of records: %s'%self.number_of_records)
+
+        return '\n'.join(ans)
+# }}}
+
+class Record(object): # {{{
+
+    def __init__(self, raw, header):
+        self.offset, self.flags, self.uid = header
+        self.raw = raw
+
+    @property
+    def header(self):
+        return 'Offset: %d Flags: %d UID: %d First 4 bytes: %r Size: %d'%(self.offset, self.flags,
+                self.uid, self.raw[:4], len(self.raw))
+# }}}
+
+# EXTH {{{
+class EXTHRecord(object):
+
+    def __init__(self, type_, data):
+        self.type = type_
+        self.data = data
+        self.name = {
+                1 : 'DRM Server id',
+                2 : 'DRM Commerce id',
+                3 : 'DRM ebookbase book id',
+                100 : 'author',
+                101 : 'publisher',
+                102 : 'imprint',
+                103 : 'description',
+                104 : 'isbn',
+                105 : 'subject',
+                106 : 'publishingdate',
+                107 : 'review',
+                108 : 'contributor',
+                109 : 'rights',
+                110 : 'subjectcode',
+                111 : 'type',
+                112 : 'source',
+                113 : 'asin',
+                114 : 'versionnumber',
+                115 : 'sample',
+                116 : 'startreading',
+                117 : 'adult',
+                118 : 'retailprice',
+                119 : 'retailpricecurrency',
+                121 : 'KF8 header section index',
+                125 : 'KF8 resources (images/fonts) count',
+                129 : 'KF8 cover URI',
+                131 : 'KF8 unknown count',
+                201 : 'coveroffset',
+                202 : 'thumboffset',
+                203 : 'hasfakecover',
+                204 : 'Creator Software',
+                205 : 'Creator Major Version', # '>I'
+                206 : 'Creator Minor Version', # '>I'
+                207 : 'Creator Build Number', # '>I'
+                208 : 'watermark',
+
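# ---------------------------------------------------------------------------
# [Editor's sketch - illustrative only, not part of this patch] The EXTHHeader
# class that follows this table reads the EXTH block as: a b'EXTH' magic, two
# big-endian uint32s (total length, record count), then `count` records, each
# a (type, total record length) pair followed by the payload. A self-contained
# model of that loop, with an invented sample record of type 100 (author):
import struct

def walk_exth(raw):
    if not raw.startswith(b'EXTH'):
        raise ValueError('EXTH header does not start with EXTH')
    length, count = struct.unpack(b'>II', raw[4:12])
    pos, records = 12, []
    for _ in range(count):
        type_, rec_len = struct.unpack(b'>II', raw[pos:pos+8])
        # rec_len includes the 8 header bytes, so the payload is rec_len - 8
        records.append((type_, raw[pos+8:pos+rec_len]))
        pos += rec_len
    return records

sample = (b'EXTH' + struct.pack(b'>II', 12 + 19, 1) +
          struct.pack(b'>II', 100, 8 + 11) + b'Some Author')
print(walk_exth(sample))  # -> [(100, b'Some Author')]
# --------------------------------------------------------------- end sketch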
209 : 'tamper_proof_keys', + 300 : 'fontsignature', + 301 : 'clippinglimit', # percentage '>B' + 402 : 'publisherlimit', + 404 : 'TTS flag', # '>B' 1 - TTS disabled 0 - TTS enabled + 501 : 'cdetype', # 4 chars (PDOC or EBOK) + 502 : 'lastupdatetime', + 503 : 'updatedtitle', + }.get(self.type, repr(self.type)) + + if (self.name in {'coveroffset', 'thumboffset', 'hasfakecover', + 'Creator Major Version', 'Creator Minor Version', + 'Creator Build Number', 'Creator Software', 'startreading'} or + self.type in {121, 125, 131}): + self.data, = struct.unpack(b'>I', self.data) + + def __str__(self): + return '%s (%d): %r'%(self.name, self.type, self.data) + +class EXTHHeader(object): + + def __init__(self, raw): + self.raw = raw + if not self.raw.startswith(b'EXTH'): + raise ValueError('EXTH header does not start with EXTH') + self.length, = struct.unpack(b'>I', self.raw[4:8]) + self.count, = struct.unpack(b'>I', self.raw[8:12]) + + pos = 12 + self.records = [] + for i in xrange(self.count): + pos = self.read_record(pos) + self.records.sort(key=lambda x:x.type) + self.rmap = {x.type:x for x in self.records} + + def __getitem__(self, type_): + return self.rmap.__getitem__(type_).data + + def get(self, type_, default=None): + ans = self.rmap.get(type_, default) + return getattr(ans, 'data', default) + + def read_record(self, pos): + type_, length = struct.unpack(b'>II', self.raw[pos:pos+8]) + data = self.raw[(pos+8):(pos+length)] + self.records.append(EXTHRecord(type_, data)) + return pos + length + + @property + def kf8_header_index(self): + return self.get(121, None) + + def __str__(self): + ans = ['*'*20 + ' EXTH Header '+ '*'*20] + ans.append('EXTH header length: %d'%self.length) + ans.append('Number of EXTH records: %d'%self.count) + ans.append('EXTH records...') + for r in self.records: + ans.append(str(r)) + return '\n'.join(ans) +# }}} + +class MOBIHeader(object): # {{{ + + def __init__(self, record0, offset): + self.raw = record0.raw + self.header_offset = offset + + self.compression_raw = self.raw[:2] + self.compression = {1: 'No compression', 2: 'PalmDoc compression', + 17480: 'HUFF/CDIC compression'}.get(struct.unpack(b'>H', + self.compression_raw)[0], + repr(self.compression_raw)) + self.unused = self.raw[2:4] + self.text_length, = struct.unpack(b'>I', self.raw[4:8]) + self.number_of_text_records, self.text_record_size = \ + struct.unpack(b'>HH', self.raw[8:12]) + self.encryption_type_raw, = struct.unpack(b'>H', self.raw[12:14]) + self.encryption_type = { + 0: 'No encryption', + 1: 'Old mobipocket encryption', + 2: 'Mobipocket encryption' + }.get(self.encryption_type_raw, repr(self.encryption_type_raw)) + self.unknown = self.raw[14:16] + + self.identifier = self.raw[16:20] + if self.identifier != b'MOBI': + raise ValueError('Identifier %r unknown'%self.identifier) + + self.length, = struct.unpack(b'>I', self.raw[20:24]) + self.type_raw, = struct.unpack(b'>I', self.raw[24:28]) + self.type = { + 2 : 'Mobipocket book', + 3 : 'PalmDOC book', + 4 : 'Audio', + 257 : 'News', + 258 : 'News Feed', + 259 : 'News magazine', + 513 : 'PICS', + 514 : 'Word', + 515 : 'XLS', + 516 : 'PPT', + 517 : 'TEXT', + 518 : 'HTML', + }.get(self.type_raw, repr(self.type_raw)) + + self.encoding_raw, = struct.unpack(b'>I', self.raw[28:32]) + self.encoding = { + 1252 : 'cp1252', + 65001: 'utf-8', + }.get(self.encoding_raw, repr(self.encoding_raw)) + self.uid = self.raw[32:36] + self.file_version, = struct.unpack(b'>I', self.raw[36:40]) + self.meta_orth_indx, self.meta_infl_indx = struct.unpack( + b'>II', 
self.raw[40:48]) + self.secondary_index_record, = struct.unpack(b'>I', self.raw[48:52]) + self.reserved = self.raw[52:80] + self.first_non_book_record, = struct.unpack(b'>I', self.raw[80:84]) + self.fullname_offset, = struct.unpack(b'>I', self.raw[84:88]) + self.fullname_length, = struct.unpack(b'>I', self.raw[88:92]) + self.locale_raw, = struct.unpack(b'>I', self.raw[92:96]) + langcode = self.locale_raw + langid = langcode & 0xFF + sublangid = (langcode >> 10) & 0xFF + self.language = main_language.get(langid, 'ENGLISH') + self.sublanguage = sub_language.get(sublangid, 'NEUTRAL') + + self.input_language = self.raw[96:100] + self.output_langauage = self.raw[100:104] + self.min_version, = struct.unpack(b'>I', self.raw[104:108]) + self.first_image_index, = struct.unpack(b'>I', self.raw[108:112]) + self.huffman_record_offset, = struct.unpack(b'>I', self.raw[112:116]) + self.huffman_record_count, = struct.unpack(b'>I', self.raw[116:120]) + self.datp_record_offset, = struct.unpack(b'>I', self.raw[120:124]) + self.datp_record_count, = struct.unpack(b'>I', self.raw[124:128]) + self.exth_flags, = struct.unpack(b'>I', self.raw[128:132]) + self.has_exth = bool(self.exth_flags & 0x40) + self.has_drm_data = self.length >= 174 and len(self.raw) >= 180 + if self.has_drm_data: + self.unknown3 = self.raw[132:164] + self.drm_offset, = struct.unpack(b'>I', self.raw[164:168]) + self.drm_count, = struct.unpack(b'>I', self.raw[168:172]) + self.drm_size, = struct.unpack(b'>I', self.raw[172:176]) + self.drm_flags = bin(struct.unpack(b'>I', self.raw[176:180])[0]) + self.has_extra_data_flags = self.length >= 232 and len(self.raw) >= 232+16 + self.has_fcis_flis = False + self.has_multibytes = self.has_indexing_bytes = self.has_uncrossable_breaks = False + self.extra_data_flags = 0 + if self.has_extra_data_flags: + self.unknown4 = self.raw[180:192] + self.fdst_idx, self.fdst_count = struct.unpack_from(b'>II', + self.raw, 192) + (self.fcis_number, self.fcis_count, self.flis_number, + self.flis_count) = struct.unpack(b'>IIII', + self.raw[200:216]) + self.unknown6 = self.raw[216:224] + self.srcs_record_index = struct.unpack(b'>I', + self.raw[224:228])[0] + self.num_srcs_records = struct.unpack(b'>I', + self.raw[228:232])[0] + self.unknown7 = self.raw[232:240] + self.extra_data_flags = struct.unpack(b'>I', + self.raw[240:244])[0] + self.has_multibytes = bool(self.extra_data_flags & 0b1) + self.has_indexing_bytes = bool(self.extra_data_flags & 0b10) + self.has_uncrossable_breaks = bool(self.extra_data_flags & 0b100) + self.primary_index_record, = struct.unpack(b'>I', + self.raw[244:248]) + + if self.file_version >= 8: + (self.sect_idx, self.skel_idx, self.datp_idx, self.oth_idx + ) = struct.unpack_from(b'>4L', self.raw, 248) + self.unknown9 = self.raw[264:self.length] + if self.meta_orth_indx != self.sect_idx: + raise ValueError('KF8 header has different Meta orth and ' + 'section indices') + + # The following are all relative to the position of the header record + # make them absolute for ease of debugging + for x in ('sect_idx', 'skel_idx', 'datp_idx', 'oth_idx', + 'meta_orth_indx', 'huffman_record_offset', + 'first_non_book_record', 'datp_record_offset', 'fcis_number', + 'flis_number', 'primary_index_record', 'fdst_idx', + 'first_image_index'): + if hasattr(self, x): + setattr(self, x, self.header_offset+getattr(self, x)) + + if self.has_exth: + self.exth_offset = 16 + self.length + + self.exth = EXTHHeader(self.raw[self.exth_offset:]) + + self.end_of_exth = self.exth_offset + self.exth.length + self.bytes_after_exth 
= self.raw[self.end_of_exth:self.fullname_offset] + + def __str__(self): + ans = ['*'*20 + ' MOBI %d Header '%self.file_version+ '*'*20] + a = ans.append + i = lambda d, x : a('%s (null value: %d): %d'%(d, NULL_INDEX, x)) + ans.append('Compression: %s'%self.compression) + ans.append('Unused: %r'%self.unused) + ans.append('Number of text records: %d'%self.number_of_text_records) + ans.append('Text record size: %d'%self.text_record_size) + ans.append('Encryption: %s'%self.encryption_type) + ans.append('Unknown: %r'%self.unknown) + ans.append('Identifier: %r'%self.identifier) + ans.append('Header length: %d'% self.length) + ans.append('Type: %s'%self.type) + ans.append('Encoding: %s'%self.encoding) + ans.append('UID: %r'%self.uid) + ans.append('File version: %d'%self.file_version) + i('Meta Orth Index (Sections index in KF8)', self.meta_orth_indx) + i('Meta Infl Index', self.meta_infl_indx) + ans.append('Secondary index record: %d (null val: %d)'%( + self.secondary_index_record, NULL_INDEX)) + ans.append('Reserved: %r'%self.reserved) + ans.append('First non-book record (null value: %d): %d'%(NULL_INDEX, + self.first_non_book_record)) + ans.append('Full name offset: %d'%self.fullname_offset) + ans.append('Full name length: %d bytes'%self.fullname_length) + ans.append('Langcode: %r'%self.locale_raw) + ans.append('Language: %s'%self.language) + ans.append('Sub language: %s'%self.sublanguage) + ans.append('Input language: %r'%self.input_language) + ans.append('Output language: %r'%self.output_langauage) + ans.append('Min version: %d'%self.min_version) + ans.append('First Image index: %d'%self.first_image_index) + ans.append('Huffman record offset: %d'%self.huffman_record_offset) + ans.append('Huffman record count: %d'%self.huffman_record_count) + ans.append('DATP record offset: %r'%self.datp_record_offset) + ans.append('DATP record count: %r'%self.datp_record_count) + ans.append('EXTH flags: %s (%s)'%(bin(self.exth_flags)[2:], self.has_exth)) + if self.has_drm_data: + ans.append('Unknown3: %r'%self.unknown3) + ans.append('DRM Offset: %s'%self.drm_offset) + ans.append('DRM Count: %s'%self.drm_count) + ans.append('DRM Size: %s'%self.drm_size) + ans.append('DRM Flags: %r'%self.drm_flags) + if self.has_extra_data_flags: + ans.append('Unknown4: %r'%self.unknown4) + ans.append('FDST Index: %d'% self.fdst_idx) + ans.append('FDST Count: %d'% self.fdst_count) + ans.append('FCIS number: %d'% self.fcis_number) + ans.append('FCIS count: %d'% self.fcis_count) + ans.append('FLIS number: %d'% self.flis_number) + ans.append('FLIS count: %d'% self.flis_count) + ans.append('Unknown6: %r'% self.unknown6) + ans.append('SRCS record index: %d'%self.srcs_record_index) + ans.append('Number of SRCS records?: %d'%self.num_srcs_records) + ans.append('Unknown7: %r'%self.unknown7) + ans.append(('Extra data flags: %s (has multibyte: %s) ' + '(has indexing: %s) (has uncrossable breaks: %s)')%( + bin(self.extra_data_flags), self.has_multibytes, + self.has_indexing_bytes, self.has_uncrossable_breaks )) + ans.append('Primary index record (null value: %d): %d'%(NULL_INDEX, + self.primary_index_record)) + if self.file_version >= 8: + i('Sections Index', self.sect_idx) + i('SKEL Index', self.skel_idx) + i('DATP Index', self.datp_idx) + i('Other Index', self.oth_idx) + if self.unknown9: + a('Unknown9: %r'%self.unknown9) + + ans = '\n'.join(ans) + + if self.has_exth: + ans += '\n\n' + str(self.exth) + ans += '\n\nBytes after EXTH (%d bytes): %s'%( + len(self.bytes_after_exth), + format_bytes(self.bytes_after_exth)) + + ans += 
'\nNumber of bytes after full name: %d' % (len(self.raw) - (self.fullname_offset + + self.fullname_length)) + + ans += '\nRecord 0 length: %d'%len(self.raw) + return ans +# }}} + +class MOBIFile(object): + + def __init__(self, stream): + self.raw = stream.read() + self.palmdb = PalmDB(self.raw[:78]) + + self.record_headers = [] + self.records = [] + for i in xrange(self.palmdb.number_of_records): + pos = 78 + i * 8 + offset, a1, a2, a3, a4 = struct.unpack(b'>LBBBB', self.raw[pos:pos+8]) + flags, val = a1, a2 << 16 | a3 << 8 | a4 + self.record_headers.append((offset, flags, val)) + + def section(section_number): + if section_number == self.palmdb.number_of_records - 1: + end_off = len(self.raw) + else: + end_off = self.record_headers[section_number + 1][0] + off = self.record_headers[section_number][0] + return self.raw[off:end_off] + + for i in range(self.palmdb.number_of_records): + self.records.append(Record(section(i), self.record_headers[i])) + + self.mobi_header = MOBIHeader(self.records[0], 0) + self.huffman_record_nums = [] + + self.kf8_type = None + mh = mh8 = self.mobi_header + if mh.file_version >= 8: + self.kf8_type = 'standalone' + elif mh.has_exth and mh.exth.kf8_header_index is not None: + self.kf8_type = 'joint' + kf8i = mh.exth.kf8_header_index + mh8 = MOBIHeader(self.records[kf8i], kf8i) + self.mobi8_header = mh8 + + if 'huff' in self.mobi_header.compression.lower(): + from calibre.ebooks.mobi.huffcdic import HuffReader + + def huffit(off, cnt): + huffman_record_nums = list(xrange(off, off+cnt)) + huffrecs = [self.records[r].raw for r in huffman_record_nums] + huffs = HuffReader(huffrecs) + return huffman_record_nums, huffs.unpack + + if self.kf8_type == 'joint': + recs6, d6 = huffit(mh.huffman_record_offset, + mh.huffman_record_count) + recs8, d8 = huffit(mh8.huffman_record_offset, + mh8.huffman_record_count) + self.huffman_record_nums = recs6 + recs8 + else: + self.huffman_record_nums, d6 = huffit(mh.huffman_record_offset, + mh.huffman_record_count) + d8 = d6 + elif 'palmdoc' in self.mobi_header.compression.lower(): + from calibre.ebooks.compression.palmdoc import decompress_doc + d8 = d6 = decompress_doc + else: + d8 = d6 = lambda x: x + + self.decompress6, self.decompress8 = d6, d8 + +class TextRecord(object): # {{{ + + def __init__(self, idx, record, extra_data_flags, decompress): + self.trailing_data, self.raw = get_trailing_data(record.raw, extra_data_flags) + raw_trailing_bytes = record.raw[len(self.raw):] + self.raw = decompress(self.raw) + + if 0 in self.trailing_data: + self.trailing_data['multibyte_overlap'] = self.trailing_data.pop(0) + if 1 in self.trailing_data: + self.trailing_data['indexing'] = self.trailing_data.pop(1) + if 2 in self.trailing_data: + self.trailing_data['uncrossable_breaks'] = self.trailing_data.pop(2) + self.trailing_data['raw_bytes'] = raw_trailing_bytes + + for typ, val in self.trailing_data.iteritems(): + if isinstance(typ, int): + print ('Record %d has unknown trailing data of type: %d : %r'% + (idx, typ, val)) + + self.idx = idx + + def dump(self, folder): + name = '%06d'%self.idx + with open(os.path.join(folder, name+'.txt'), 'wb') as f: + f.write(self.raw) + with open(os.path.join(folder, name+'.trailing_data'), 'wb') as f: + for k, v in self.trailing_data.iteritems(): + raw = '%s : %r\n\n'%(k, v) + f.write(raw.encode('utf-8')) + +# }}} + + diff --git a/src/calibre/ebooks/mobi/debug/main.py b/src/calibre/ebooks/mobi/debug/main.py new file mode 100644 index 0000000000..624da65846 --- /dev/null +++ 
b/src/calibre/ebooks/mobi/debug/main.py @@ -0,0 +1,48 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2012, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import sys, os, shutil + +from calibre.ebooks.mobi.debug.headers import MOBIFile +from calibre.ebooks.mobi.debug.mobi6 import inspect_mobi as inspect_mobi6 +from calibre.ebooks.mobi.debug.mobi8 import inspect_mobi as inspect_mobi8 + +def inspect_mobi(path_or_stream, ddir=None): # {{{ + stream = (path_or_stream if hasattr(path_or_stream, 'read') else + open(path_or_stream, 'rb')) + f = MOBIFile(stream) + if ddir is None: + ddir = 'decompiled_' + os.path.splitext(os.path.basename(stream.name))[0] + try: + shutil.rmtree(ddir) + except: + pass + os.makedirs(ddir) + if f.kf8_type is None: + inspect_mobi6(f, ddir) + elif f.kf8_type == 'joint': + p6 = os.path.join(ddir, 'mobi6') + os.mkdir(p6) + inspect_mobi6(f, p6) + p8 = os.path.join(ddir, 'mobi8') + os.mkdir(p8) + inspect_mobi8(f, p8) + else: + inspect_mobi8(f, ddir) + + print ('Debug data saved to:', ddir) + +# }}} + +def main(): + inspect_mobi(sys.argv[1]) + +if __name__ == '__main__': + main() + diff --git a/src/calibre/ebooks/mobi/debug/mobi6.py b/src/calibre/ebooks/mobi/debug/mobi6.py new file mode 100644 index 0000000000..640f58c661 --- /dev/null +++ b/src/calibre/ebooks/mobi/debug/mobi6.py @@ -0,0 +1,839 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2011, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import struct, sys, os +from collections import OrderedDict, defaultdict + +from lxml import html + +from calibre.ebooks.mobi.reader.headers import NULL_INDEX +from calibre.ebooks.mobi.reader.index import (parse_index_record, + parse_tagx_section) +from calibre.ebooks.mobi.utils import (decode_hex_number, decint, + decode_tbs, read_font_record) +from calibre.utils.magick.draw import identify_data +from calibre.ebooks.mobi.debug import format_bytes +from calibre.ebooks.mobi.debug.headers import TextRecord + + +class TagX(object): # {{{ + + def __init__(self, tag, num_values, bitmask, eof): + self.tag, self.num_values, self.bitmask, self.eof = (tag, num_values, + bitmask, eof) + self.num_of_values = num_values + self.is_eof = (self.eof == 1 and self.tag == 0 and self.num_values == 0 + and self.bitmask == 0) + + def __repr__(self): + return 'TAGX(tag=%02d, num_values=%d, bitmask=%r, eof=%d)' % (self.tag, + self.num_values, bin(self.bitmask), self.eof) + # }}} + +class SecondaryIndexHeader(object): # {{{ + + def __init__(self, record): + self.record = record + raw = self.record.raw + #open('/t/index_header.bin', 'wb').write(raw) + if raw[:4] != b'INDX': + raise ValueError('Invalid Secondary Index Record') + self.header_length, = struct.unpack('>I', raw[4:8]) + self.unknown1 = raw[8:16] + self.index_type, = struct.unpack('>I', raw[16:20]) + self.index_type_desc = {0: 'normal', 2: + 'inflection', 6: 'calibre'}.get(self.index_type, 'unknown') + self.idxt_start, = struct.unpack('>I', raw[20:24]) + self.index_count, = struct.unpack('>I', raw[24:28]) + self.index_encoding_num, = struct.unpack('>I', raw[28:32]) + self.index_encoding = {65001: 'utf-8', 1252: + 'cp1252'}.get(self.index_encoding_num, 'unknown') + if self.index_encoding == 'unknown': + raise 
ValueError( + 'Unknown index encoding: %d'%self.index_encoding_num) + self.unknown2 = raw[32:36] + self.num_index_entries, = struct.unpack('>I', raw[36:40]) + self.ordt_start, = struct.unpack('>I', raw[40:44]) + self.ligt_start, = struct.unpack('>I', raw[44:48]) + self.num_of_ligt_entries, = struct.unpack('>I', raw[48:52]) + self.num_of_cncx_blocks, = struct.unpack('>I', raw[52:56]) + self.unknown3 = raw[56:180] + self.tagx_offset, = struct.unpack(b'>I', raw[180:184]) + if self.tagx_offset != self.header_length: + raise ValueError('TAGX offset and header length disagree') + self.unknown4 = raw[184:self.header_length] + + tagx = raw[self.header_length:] + if not tagx.startswith(b'TAGX'): + raise ValueError('Invalid TAGX section') + self.tagx_header_length, = struct.unpack('>I', tagx[4:8]) + self.tagx_control_byte_count, = struct.unpack('>I', tagx[8:12]) + self.tagx_entries = [TagX(*x) for x in parse_tagx_section(tagx)[1]] + if self.tagx_entries and not self.tagx_entries[-1].is_eof: + raise ValueError('TAGX last entry is not EOF') + + idxt0_pos = self.header_length+self.tagx_header_length + num = ord(raw[idxt0_pos]) + count_pos = idxt0_pos+1+num + self.last_entry = raw[idxt0_pos+1:count_pos] + self.ncx_count, = struct.unpack(b'>H', raw[count_pos:count_pos+2]) + + # There may be some alignment zero bytes between the end of the idxt0 + # and self.idxt_start + idxt = raw[self.idxt_start:] + if idxt[:4] != b'IDXT': + raise ValueError('Invalid IDXT header') + length_check, = struct.unpack(b'>H', idxt[4:6]) + if length_check != self.header_length + self.tagx_header_length: + raise ValueError('Length check failed') + if idxt[6:].replace(b'\0', b''): + raise ValueError('Non null trailing bytes after IDXT') + + + def __str__(self): + ans = ['*'*20 + ' Secondary Index Header '+ '*'*20] + a = ans.append + def u(w): + a('Unknown: %r (%d bytes) (All zeros: %r)'%(w, + len(w), not bool(w.replace(b'\0', b'')) )) + + a('Header length: %d'%self.header_length) + u(self.unknown1) + a('Index Type: %s (%d)'%(self.index_type_desc, self.index_type)) + a('Offset to IDXT start: %d'%self.idxt_start) + a('Number of index records: %d'%self.index_count) + a('Index encoding: %s (%d)'%(self.index_encoding, + self.index_encoding_num)) + u(self.unknown2) + a('Number of index entries: %d'% self.num_index_entries) + a('ORDT start: %d'%self.ordt_start) + a('LIGT start: %d'%self.ligt_start) + a('Number of LIGT entries: %d'%self.num_of_ligt_entries) + a('Number of cncx blocks: %d'%self.num_of_cncx_blocks) + u(self.unknown3) + a('TAGX offset: %d'%self.tagx_offset) + u(self.unknown4) + a('\n\n') + a('*'*20 + ' TAGX Header (%d bytes)'%self.tagx_header_length+ '*'*20) + a('Header length: %d'%self.tagx_header_length) + a('Control byte count: %d'%self.tagx_control_byte_count) + for i in self.tagx_entries: + a('\t' + repr(i)) + a('Index of last IndexEntry in secondary index record: %s'% self.last_entry) + a('Number of entries in the NCX: %d'% self.ncx_count) + + return '\n'.join(ans) + +# }}} + +class IndexHeader(object): # {{{ + + def __init__(self, record): + self.record = record + raw = self.record.raw + #open('/t/index_header.bin', 'wb').write(raw) + if raw[:4] != b'INDX': + raise ValueError('Invalid Primary Index Record') + + self.header_length, = struct.unpack('>I', raw[4:8]) + self.unknown1 = raw[8:12] + self.header_type, = struct.unpack('>I', raw[12:16]) + self.index_type, = struct.unpack('>I', raw[16:20]) + self.index_type_desc = {0: 'normal', 2: + 'inflection', 6: 'calibre'}.get(self.index_type, 'unknown') + self.idxt_start, = 
struct.unpack('>I', raw[20:24]) + self.index_count, = struct.unpack('>I', raw[24:28]) + self.index_encoding_num, = struct.unpack('>I', raw[28:32]) + self.index_encoding = {65001: 'utf-8', 1252: + 'cp1252'}.get(self.index_encoding_num, 'unknown') + if self.index_encoding == 'unknown': + raise ValueError( + 'Unknown index encoding: %d'%self.index_encoding_num) + self.possibly_language = raw[32:36] + self.num_index_entries, = struct.unpack('>I', raw[36:40]) + self.ordt_start, = struct.unpack('>I', raw[40:44]) + self.ligt_start, = struct.unpack('>I', raw[44:48]) + self.num_of_ligt_entries, = struct.unpack('>I', raw[48:52]) + self.num_of_cncx_blocks, = struct.unpack('>I', raw[52:56]) + self.unknown2 = raw[56:180] + self.tagx_offset, = struct.unpack(b'>I', raw[180:184]) + if self.tagx_offset != self.header_length: + raise ValueError('TAGX offset and header length disagree') + self.unknown3 = raw[184:self.header_length] + + tagx = raw[self.header_length:] + if not tagx.startswith(b'TAGX'): + raise ValueError('Invalid TAGX section') + self.tagx_header_length, = struct.unpack('>I', tagx[4:8]) + self.tagx_control_byte_count, = struct.unpack('>I', tagx[8:12]) + self.tagx_entries = [TagX(*x) for x in parse_tagx_section(tagx)[1]] + if self.tagx_entries and not self.tagx_entries[-1].is_eof: + raise ValueError('TAGX last entry is not EOF') + + idxt0_pos = self.header_length+self.tagx_header_length + last_num, consumed = decode_hex_number(raw[idxt0_pos:]) + count_pos = idxt0_pos + consumed + self.ncx_count, = struct.unpack(b'>H', raw[count_pos:count_pos+2]) + self.last_entry = last_num + + if last_num != self.ncx_count - 1: + raise ValueError('Last id number in the NCX != NCX count - 1') + # There may be some alignment zero bytes between the end of the idxt0 + # and self.idxt_start + + idxt = raw[self.idxt_start:] + if idxt[:4] != b'IDXT': + raise ValueError('Invalid IDXT header') + length_check, = struct.unpack(b'>H', idxt[4:6]) + if length_check != self.header_length + self.tagx_header_length: + raise ValueError('Length check failed') + if idxt[6:].replace(b'\0', b''): + raise ValueError('Non null trailing bytes after IDXT') + + + def __str__(self): + ans = ['*'*20 + ' Index Header (%d bytes)'%len(self.record.raw)+ '*'*20] + a = ans.append + def u(w): + a('Unknown: %r (%d bytes) (All zeros: %r)'%(w, + len(w), not bool(w.replace(b'\0', b'')) )) + + a('Header length: %d'%self.header_length) + u(self.unknown1) + a('Header type: %d'%self.header_type) + a('Index Type: %s (%d)'%(self.index_type_desc, self.index_type)) + a('Offset to IDXT start: %d'%self.idxt_start) + a('Number of index records: %d'%self.index_count) + a('Index encoding: %s (%d)'%(self.index_encoding, + self.index_encoding_num)) + a('Unknown (possibly language?): %r'%(self.possibly_language)) + a('Number of index entries: %d'% self.num_index_entries) + a('ORDT start: %d'%self.ordt_start) + a('LIGT start: %d'%self.ligt_start) + a('Number of LIGT entries: %d'%self.num_of_ligt_entries) + a('Number of cncx blocks: %d'%self.num_of_cncx_blocks) + u(self.unknown2) + a('TAGX offset: %d'%self.tagx_offset) + u(self.unknown3) + a('\n\n') + a('*'*20 + ' TAGX Header (%d bytes)'%self.tagx_header_length+ '*'*20) + a('Header length: %d'%self.tagx_header_length) + a('Control byte count: %d'%self.tagx_control_byte_count) + for i in self.tagx_entries: + a('\t' + repr(i)) + a('Index of last IndexEntry in primary index record: %s'% self.last_entry) + a('Number of entries in the NCX: %d'% self.ncx_count) + + return '\n'.join(ans) + # }}} + +class Tag(object): # {{{ 
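# ---------------------------------------------------------------------------
# [Editor's sketch - illustrative only, not part of this patch] IndexHeader
# above reads the INDX header as big-endian uint32 fields at fixed offsets.
# A compact model of its first seven fields, using the same names; the demo
# bytes are fabricated:
import struct

INDX_FIELDS = ('header_length', 'unknown1', 'header_type', 'index_type',
               'idxt_start', 'index_count', 'index_encoding_num')

def sketch_indx_header(raw):
    if raw[:4] != b'INDX':
        raise ValueError('Invalid INDX record')
    return dict(zip(INDX_FIELDS, struct.unpack(b'>7I', raw[4:32])))

demo = b'INDX' + struct.pack(b'>7I', 192, 0, 2, 0, 0xB8, 1, 65001)
print(sketch_indx_header(demo))  # index_type 0 -> 'normal', 65001 -> utf-8
# --------------------------------------------------------------- end sketch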
+
+    '''
+    Index entries are a collection of tags. Each tag is represented by this
+    class.
+    '''
+
+    TAG_MAP = {
+            1: ('offset', 'Offset in HTML'),
+            2: ('size', 'Size in HTML'),
+            3: ('label_offset', 'Label offset in CNCX'),
+            4: ('depth', 'Depth of this entry in TOC'),
+            5: ('class_offset', 'Class offset in CNCX'),
+            6: ('pos_fid', 'File Index'),
+
+            11: ('secondary', '[unknown, unknown, '
+                'tag type from TAGX in primary index header]'),
+
+            21: ('parent_index', 'Parent'),
+            22: ('first_child_index', 'First child'),
+            23: ('last_child_index', 'Last child'),
+
+            69 : ('image_index', 'Offset from first image record to the'
+                ' image record associated with this entry'
+                ' (masthead for periodical or thumbnail for'
+                ' article entry).'),
+            70 : ('desc_offset', 'Description offset in cncx'),
+            71 : ('author_offset', 'Author offset in cncx'),
+            72 : ('image_caption_offset', 'Image caption offset in cncx'),
+            73 : ('image_attr_offset', 'Image attribution offset in cncx'),
+
+    }
+
+    def __init__(self, tag_type, vals, cncx):
+        self.value = vals if len(vals) > 1 else vals[0] if vals else None
+
+        self.cncx_value = None
+        if tag_type in self.TAG_MAP:
+            self.attr, self.desc = self.TAG_MAP[tag_type]
+        else:
+            print ('Unknown tag value: %s'%tag_type)
+            self.desc = '??Unknown (tag value: %d)'%tag_type
+            self.attr = 'unknown'
+
+        if '_offset' in self.attr:
+            self.cncx_value = cncx[self.value]
+
+    def __str__(self):
+        if self.cncx_value is not None:
+            return '%s : %r [%r]'%(self.desc, self.value, self.cncx_value)
+        return '%s : %r'%(self.desc, self.value)
+
+# }}}
+
+class IndexEntry(object): # {{{
+
+    '''
+    The index is made up of entries, each of which is represented by an
+    instance of this class. Index entries typically point to offsets in the
+    HTML, specify HTML sizes and point to text strings in the CNCX that are
+    used in the navigation UI.
+ ''' + + def __init__(self, ident, entry, cncx): + try: + self.index = int(ident, 16) + except ValueError: + self.index = ident + self.tags = [Tag(tag_type, vals, cncx) for tag_type, vals in + entry.iteritems()] + + @property + def label(self): + for tag in self.tags: + if tag.attr == 'label_offset': + return tag.cncx_value + return '' + + @property + def offset(self): + for tag in self.tags: + if tag.attr == 'offset': + return tag.value + return 0 + + @property + def size(self): + for tag in self.tags: + if tag.attr == 'size': + return tag.value + return 0 + + @property + def depth(self): + for tag in self.tags: + if tag.attr == 'depth': + return tag.value + return 0 + + @property + def parent_index(self): + for tag in self.tags: + if tag.attr == 'parent_index': + return tag.value + return -1 + + @property + def first_child_index(self): + for tag in self.tags: + if tag.attr == 'first_child_index': + return tag.value + return -1 + + @property + def last_child_index(self): + for tag in self.tags: + if tag.attr == 'last_child_index': + return tag.value + return -1 + + @property + def pos_fid(self): + for tag in self.tags: + if tag.attr == 'pos_fid': + return tag.value + return [0, 0] + + def __str__(self): + ans = ['Index Entry(index=%s, length=%d)'%( + self.index, len(self.tags))] + for tag in self.tags: + if tag.value is not None: + ans.append('\t'+str(tag)) + if self.first_child_index != -1: + ans.append('\tNumber of children: %d'%(self.last_child_index - + self.first_child_index + 1)) + return '\n'.join(ans) + +# }}} + +class IndexRecord(object): # {{{ + + ''' + Represents all indexing information in the MOBI, apart from indexing info + in the trailing data of the text records. + ''' + + def __init__(self, records, index_header, cncx): + self.alltext = None + table = OrderedDict() + tags = [TagX(x.tag, x.num_values, x.bitmask, x.eof) for x in + index_header.tagx_entries] + for record in records: + raw = record.raw + + if raw[:4] != b'INDX': + raise ValueError('Invalid Primary Index Record') + + parse_index_record(table, record.raw, + index_header.tagx_control_byte_count, tags, + index_header.index_encoding, strict=True) + + self.indices = [] + + for ident, entry in table.iteritems(): + self.indices.append(IndexEntry(ident, entry, cncx)) + + def get_parent(self, index): + if index.depth < 1: + return None + parent_depth = index.depth - 1 + for p in self.indices: + if p.depth != parent_depth: + continue + + def __str__(self): + ans = ['*'*20 + ' Index Entries (%d entries) '%len(self.indices)+ '*'*20] + a = ans.append + def u(w): + a('Unknown: %r (%d bytes) (All zeros: %r)'%(w, + len(w), not bool(w.replace(b'\0', b'')) )) + for entry in self.indices: + offset = entry.offset + a(str(entry)) + t = self.alltext + if offset is not None and self.alltext is not None: + a('\tHTML before offset: %r'%t[offset-50:offset]) + a('\tHTML after offset: %r'%t[offset:offset+50]) + p = offset+entry.size + a('\tHTML before end: %r'%t[p-50:p]) + a('\tHTML after end: %r'%t[p:p+50]) + + a('') + + return '\n'.join(ans) + +# }}} + +class CNCX(object): # {{{ + + ''' + Parses the records that contain the compiled NCX (all strings from the + NCX). Presents a simple offset : string mapping interface to access the + data. 
+ ''' + + def __init__(self, records, codec): + self.records = OrderedDict() + record_offset = 0 + for record in records: + raw = record.raw + pos = 0 + while pos < len(raw): + length, consumed = decint(raw[pos:]) + if length > 0: + try: + self.records[pos+record_offset] = raw[ + pos+consumed:pos+consumed+length].decode(codec) + except: + byts = raw[pos:] + r = format_bytes(byts) + print ('CNCX entry at offset %d has unknown format %s'%( + pos+record_offset, r)) + self.records[pos+record_offset] = r + pos = len(raw) + pos += consumed+length + record_offset += 0x10000 + + def __getitem__(self, offset): + return self.records.get(offset) + + def __str__(self): + ans = ['*'*20 + ' cncx (%d strings) '%len(self.records)+ '*'*20] + for k, v in self.records.iteritems(): + ans.append('%10d : %s'%(k, v)) + return '\n'.join(ans) + + +# }}} + +class ImageRecord(object): # {{{ + + def __init__(self, idx, record, fmt): + self.raw = record.raw + self.fmt = fmt + self.idx = idx + + def dump(self, folder): + name = '%06d'%self.idx + with open(os.path.join(folder, name+'.'+self.fmt), 'wb') as f: + f.write(self.raw) + +# }}} + +class BinaryRecord(object): # {{{ + + def __init__(self, idx, record): + self.raw = record.raw + sig = self.raw[:4] + name = '%06d'%idx + if sig in {b'FCIS', b'FLIS', b'SRCS', b'DATP', b'RESC', b'BOUN', + b'FDST', b'AUDI', b'VIDE',}: + name += '-' + sig.decode('ascii') + elif sig == b'\xe9\x8e\r\n': + name += '-' + 'EOF' + self.name = name + + def dump(self, folder): + with open(os.path.join(folder, self.name+'.bin'), 'wb') as f: + f.write(self.raw) + +# }}} + +class FontRecord(object): # {{{ + + def __init__(self, idx, record): + self.raw = record.raw + name = '%06d'%idx + self.font = read_font_record(self.raw) + if self.font['err']: + raise ValueError('Failed to read font record: %s Headers: %s'%( + self.font['err'], self.font['headers'])) + self.payload = (self.font['font_data'] if self.font['font_data'] else + self.font['raw_data']) + self.name = '%s.%s'%(name, self.font['ext']) + + def dump(self, folder): + with open(os.path.join(folder, self.name), 'wb') as f: + f.write(self.payload) + +# }}} + +class TBSIndexing(object): # {{{ + + def __init__(self, text_records, indices, doc_type): + self.record_indices = OrderedDict() + self.doc_type = doc_type + self.indices = indices + pos = 0 + for r in text_records: + start = pos + pos += len(r.raw) + end = pos - 1 + self.record_indices[r] = x = {'starts':[], 'ends':[], + 'complete':[], 'geom': (start, end)} + for entry in indices: + istart, sz = entry.offset, entry.size + iend = istart + sz - 1 + has_start = istart >= start and istart <= end + has_end = iend >= start and iend <= end + rec = None + if has_start and has_end: + rec = 'complete' + elif has_start and not has_end: + rec = 'starts' + elif not has_start and has_end: + rec = 'ends' + if rec: + x[rec].append(entry) + + def get_index(self, idx): + for i in self.indices: + if i.index in {idx, unicode(idx)}: return i + raise IndexError('Index %d not found'%idx) + + def __str__(self): + ans = ['*'*20 + ' TBS Indexing (%d records) '%len(self.record_indices)+ '*'*20] + for r, dat in self.record_indices.iteritems(): + ans += self.dump_record(r, dat)[-1] + return '\n'.join(ans) + + def dump(self, bdir): + types = defaultdict(list) + for r, dat in self.record_indices.iteritems(): + tbs_type, strings = self.dump_record(r, dat) + if tbs_type == 0: continue + types[tbs_type] += strings + for typ, strings in types.iteritems(): + with open(os.path.join(bdir, 'tbs_type_%d.txt'%typ), 'wb') as f: 
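# ---------------------------------------------------------------------------
# [Editor's sketch - illustrative only, not part of this patch] The CNCX
# parser above and the TBS code below both lean on decint(), MOBI's
# variable-width integer: big-endian, 7 payload bits per byte, with the high
# bit set on the final byte (forward encoding). It returns (value, consumed):
def decode_varint_forward(raw):
    value = 0
    for i, b in enumerate(bytearray(raw)):
        value = (value << 7) | (b & 0x7F)
        if b & 0x80:  # the terminating byte carries the high bit
            return value, i + 1
    raise ValueError('Truncated variable-width integer')

# 1042 == 0b0001000_0010010 encodes as 0x08 0x92 (0x92 has the stop bit set);
# the trailing 0xff is untouched, so only 2 bytes are consumed
assert decode_varint_forward(b'\x08\x92\xff') == (1042, 2)
# --------------------------------------------------------------- end sketch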
+ f.write('\n'.join(strings)) + + def dump_record(self, r, dat): + ans = [] + ans.append('\nRecord #%d: Starts at: %d Ends at: %d'%(r.idx, + dat['geom'][0], dat['geom'][1])) + s, e, c = dat['starts'], dat['ends'], dat['complete'] + ans.append(('\tContains: %d index entries ' + '(%d ends, %d complete, %d starts)')%tuple(map(len, (s+e+c, e, + c, s)))) + byts = bytearray(r.trailing_data.get('indexing', b'')) + ans.append('TBS bytes: %s'%format_bytes(byts)) + for typ, entries in (('Ends', e), ('Complete', c), ('Starts', s)): + if entries: + ans.append('\t%s:'%typ) + for x in entries: + ans.append(('\t\tIndex Entry: %s (Parent index: %s, ' + 'Depth: %d, Offset: %d, Size: %d) [%s]')%( + x.index, x.parent_index, x.depth, x.offset, x.size, x.label)) + def bin4(num): + ans = bin(num)[2:] + return bytes('0'*(4-len(ans)) + ans) + + def repr_extra(x): + return str({bin4(k):v for k, v in extra.iteritems()}) + + tbs_type = 0 + is_periodical = self.doc_type in (257, 258, 259) + if len(byts): + outermost_index, extra, consumed = decode_tbs(byts, flag_size=3) + byts = byts[consumed:] + for k in extra: + tbs_type |= k + ans.append('\nTBS: %d (%s)'%(tbs_type, bin4(tbs_type))) + ans.append('Outermost index: %d'%outermost_index) + ans.append('Unknown extra start bytes: %s'%repr_extra(extra)) + if is_periodical: # Hierarchical periodical + try: + byts, a = self.interpret_periodical(tbs_type, byts, + dat['geom'][0]) + except: + import traceback + traceback.print_exc() + a = [] + print ('Failed to decode TBS bytes for record: %d'%r.idx) + ans += a + if byts: + sbyts = tuple(hex(b)[2:] for b in byts) + ans.append('Remaining bytes: %s'%' '.join(sbyts)) + + ans.append('') + return tbs_type, ans + + def interpret_periodical(self, tbs_type, byts, record_offset): + ans = [] + + def read_section_transitions(byts, psi=None): # {{{ + if psi is None: + # Assume previous section is 1 + psi = self.get_index(1) + + while byts: + ai, extra, consumed = decode_tbs(byts) + byts = byts[consumed:] + if extra.get(0b0010, None) is not None: + raise ValueError('Dont know how to interpret flag 0b0010' + ' while reading section transitions') + if extra.get(0b1000, None) is not None: + if len(extra) > 1: + raise ValueError('Dont know how to interpret flags' + ' %r while reading section transitions'%extra) + nsi = self.get_index(psi.index+1) + ans.append('Last article in this record of section %d' + ' (relative to next section index [%d]): ' + '%d [%d absolute index]'%(psi.index, nsi.index, ai, + ai+nsi.index)) + psi = nsi + continue + + ans.append('First article in this record of section %d' + ' (relative to its parent section): ' + '%d [%d absolute index]'%(psi.index, ai, ai+psi.index)) + + num = extra.get(0b0100, None) + if num is None: + msg = ('The section %d has at most one article' + ' in this record')%psi.index + else: + msg = ('Number of articles in this record of ' + 'section %d: %d')%(psi.index, num) + ans.append(msg) + + offset = extra.get(0b0001, None) + if offset is not None: + if offset == 0: + ans.append('This record is spanned by the article:' + '%d'%(ai+psi.index)) + else: + ans.append('->Offset to start of next section (%d) from start' + ' of record: %d [%d absolute offset]'%(psi.index+1, + offset, offset+record_offset)) + return byts + # }}} + + def read_starting_section(byts): # {{{ + orig = byts + si, extra, consumed = decode_tbs(byts) + byts = byts[consumed:] + if len(extra) > 1 or 0b0010 in extra or 0b1000 in extra: + raise ValueError('Dont know how to interpret flags %r' + ' when reading starting section'%extra) + 
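# ---------------------------------------------------------------------------
# [Editor's sketch - illustrative only, not part of this patch; the real
# decoder is decode_tbs() in calibre.ebooks.mobi.utils] As used above, a TBS
# entry begins with one variable-width integer whose low `flag_size` bits are
# flags (0b0001: an offset follows, 0b0100: an article count follows, ...)
# and whose remaining high bits are the index value. A simplified model of
# that split only:
def split_tbs_value(varint_value, flag_size=4):
    mask = (1 << flag_size) - 1
    return varint_value >> flag_size, varint_value & mask

# 0b1010110 -> value 0b101 == 5, flags 0b0110
assert split_tbs_value(0b1010110) == (5, 0b0110)
# --------------------------------------------------------------- end sketch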
si = self.get_index(si) + ans.append('The section at the start of this record is:' + ' %s'%si.index) + if 0b0100 in extra: + num = extra[0b0100] + ans.append('The number of articles from the section %d' + ' in this record: %s'%(si.index, num)) + elif 0b0001 in extra: + eof = extra[0b0001] + if eof != 0: + raise ValueError('Unknown eof value %s when reading' + ' starting section. All bytes: %r'%(eof, orig)) + ans.append('??This record has more than one article from ' + ' the section: %s'%si.index) + return si, byts + # }}} + + if tbs_type & 0b0100: + # Starting section is the first section + ssi = self.get_index(1) + else: + ssi, byts = read_starting_section(byts) + + byts = read_section_transitions(byts, ssi) + + return byts, ans + +# }}} + +class MOBIFile(object): # {{{ + + def __init__(self, mf): + for x in ('raw', 'palmdb', 'record_headers', 'records', 'mobi_header', + 'huffman_record_nums',): + setattr(self, x, getattr(mf, x)) + + self.index_header = self.index_record = None + self.indexing_record_nums = set() + pir = self.mobi_header.primary_index_record + if pir != NULL_INDEX: + self.index_header = IndexHeader(self.records[pir]) + numi = self.index_header.index_count + self.cncx = CNCX(self.records[ + pir+1+numi:pir+1+numi+self.index_header.num_of_cncx_blocks], + self.index_header.index_encoding) + self.index_record = IndexRecord(self.records[pir+1:pir+1+numi], + self.index_header, self.cncx) + self.indexing_record_nums = set(xrange(pir, + pir+1+numi+self.index_header.num_of_cncx_blocks)) + self.secondary_index_record = self.secondary_index_header = None + sir = self.mobi_header.secondary_index_record + if sir != NULL_INDEX: + self.secondary_index_header = SecondaryIndexHeader(self.records[sir]) + numi = self.secondary_index_header.index_count + self.indexing_record_nums.add(sir) + self.secondary_index_record = IndexRecord( + self.records[sir+1:sir+1+numi], self.secondary_index_header, self.cncx) + self.indexing_record_nums |= set(xrange(sir+1, sir+1+numi)) + + + ntr = self.mobi_header.number_of_text_records + fntbr = self.mobi_header.first_non_book_record + fii = self.mobi_header.first_image_index + if fntbr == NULL_INDEX: + fntbr = len(self.records) + self.text_records = [TextRecord(r, self.records[r], + self.mobi_header.extra_data_flags, mf.decompress6) for r in xrange(1, + min(len(self.records), ntr+1))] + self.image_records, self.binary_records = [], [] + self.font_records = [] + image_index = 0 + for i in xrange(fntbr, len(self.records)): + if i in self.indexing_record_nums or i in self.huffman_record_nums: + continue + image_index += 1 + r = self.records[i] + fmt = None + if i >= fii and r.raw[:4] not in {b'FLIS', b'FCIS', b'SRCS', + b'\xe9\x8e\r\n', b'RESC', b'BOUN', b'FDST', b'DATP', + b'AUDI', b'VIDE', b'FONT'}: + try: + width, height, fmt = identify_data(r.raw) + except: + pass + if fmt is not None: + self.image_records.append(ImageRecord(image_index, r, fmt)) + elif r.raw[:4] == b'FONT': + self.font_records.append(FontRecord(i, r)) + else: + self.binary_records.append(BinaryRecord(i, r)) + + if self.index_record is not None: + self.tbs_indexing = TBSIndexing(self.text_records, + self.index_record.indices, self.mobi_header.type_raw) + + def print_header(self, f=sys.stdout): + print (str(self.palmdb).encode('utf-8'), file=f) + print (file=f) + print ('Record headers:', file=f) + for i, r in enumerate(self.records): + print ('%6d. 
%s'%(i, r.header), file=f) + + print (file=f) + print (str(self.mobi_header).encode('utf-8'), file=f) +# }}} + +def inspect_mobi(mobi_file, ddir): + f = MOBIFile(mobi_file) + with open(os.path.join(ddir, 'header.txt'), 'wb') as out: + f.print_header(f=out) + + alltext = os.path.join(ddir, 'text.html') + with open(alltext, 'wb') as of: + alltext = b'' + for rec in f.text_records: + of.write(rec.raw) + alltext += rec.raw + of.seek(0) + + root = html.fromstring(alltext.decode('utf-8')) + with open(os.path.join(ddir, 'pretty.html'), 'wb') as of: + of.write(html.tostring(root, pretty_print=True, encoding='utf-8', + include_meta_content_type=True)) + + if f.index_header is not None: + f.index_record.alltext = alltext + with open(os.path.join(ddir, 'index.txt'), 'wb') as out: + print(str(f.index_header), file=out) + print('\n\n', file=out) + if f.secondary_index_header is not None: + print(str(f.secondary_index_header).encode('utf-8'), file=out) + print('\n\n', file=out) + if f.secondary_index_record is not None: + print(str(f.secondary_index_record).encode('utf-8'), file=out) + print('\n\n', file=out) + print(str(f.cncx).encode('utf-8'), file=out) + print('\n\n', file=out) + print(str(f.index_record), file=out) + with open(os.path.join(ddir, 'tbs_indexing.txt'), 'wb') as out: + print(str(f.tbs_indexing), file=out) + f.tbs_indexing.dump(ddir) + + for tdir, attr in [('text', 'text_records'), ('images', 'image_records'), + ('binary', 'binary_records'), ('font', 'font_records')]: + tdir = os.path.join(ddir, tdir) + os.mkdir(tdir) + for rec in getattr(f, attr): + rec.dump(tdir) + + + +# }}} + + diff --git a/src/calibre/ebooks/mobi/debug/mobi8.py b/src/calibre/ebooks/mobi/debug/mobi8.py new file mode 100644 index 0000000000..e4a92ee95c --- /dev/null +++ b/src/calibre/ebooks/mobi/debug/mobi8.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2012, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import sys, os + +from calibre.ebooks.mobi.debug.headers import TextRecord + +class MOBIFile(object): + + def __init__(self, mf): + self.mf = mf + h, h8 = mf.mobi_header, mf.mobi8_header + first_text_record = 1 + offset = 0 + res_end = len(mf.records) + if mf.kf8_type == 'joint': + offset = h.exth.kf8_header_index + res_end = offset - 1 + + self.resource_records = mf.records[h.first_non_book_record:res_end] + self.text_records = [TextRecord(i, r, h8.extra_data_flags, + mf.decompress8) for i, r in + enumerate(mf.records[first_text_record+offset: + first_text_record+offset+h8.number_of_text_records])] + + self.raw_text = b''.join(r.raw for r in self.text_records) + + def print_header(self, f=sys.stdout): + print (str(self.mf.palmdb).encode('utf-8'), file=f) + print (file=f) + print ('Record headers:', file=f) + for i, r in enumerate(self.mf.records): + print ('%6d. 
%s'%(i, r.header), file=f) + + print (file=f) + print (str(self.mf.mobi8_header).encode('utf-8'), file=f) + + +def inspect_mobi(mobi_file, ddir): + f = MOBIFile(mobi_file) + with open(os.path.join(ddir, 'header.txt'), 'wb') as out: + f.print_header(f=out) + + alltext = os.path.join(ddir, 'raw_text.html') + with open(alltext, 'wb') as of: + of.write(f.raw_text) + + for tdir, attr in [('text_records', 'text_records'), ('images', + 'image_records'), ('binary', 'binary_records'), ('font', + 'font_records')]: + tdir = os.path.join(ddir, tdir) + os.mkdir(tdir) + for rec in getattr(f, attr, []): + rec.dump(tdir) + + diff --git a/src/calibre/ebooks/mobi/reader/headers.py b/src/calibre/ebooks/mobi/reader/headers.py index eaad81730d..06d349d5de 100644 --- a/src/calibre/ebooks/mobi/reader/headers.py +++ b/src/calibre/ebooks/mobi/reader/headers.py @@ -186,20 +186,16 @@ class BookHeader(object): if len(raw) >= 0xF8: self.ncxidx, = struct.unpack_from(b'>L', raw, 0xF4) - if self.mobi_version >= 8: - self.skelidx, = struct.unpack_from('>L', raw, 0xFC) - - # Index into
sections in raw_ml - self.dividx, = struct.unpack_from('>L', raw, 0xF8) - - # Index into Other files - self.othidx, = struct.unpack_from('>L', raw, 0x104) + # Ancient PRC files from Baen can have random values for + # mobi_version, so be conservative + if self.mobi_version == 8 and len(raw) >= (0xF8 + 16): + self.dividx, self.skelidx, self.datpidx, self.othidx = \ + struct.unpack_from(b'>4L', raw, 0xF8) # need to use the FDST record to find out how to properly # unpack the raw_ml into pieces it is simply a table of start # and end locations for each flow piece - self.fdstidx, = struct.unpack_from('>L', raw, 0xC0) - self.fdstcnt, = struct.unpack_from('>L', raw, 0xC4) + self.fdstidx, self.fdstcnt = struct.unpack_from(b'>2L', raw, 0xC0) # if cnt is 1 or less, fdst section number can be garbage if self.fdstcnt <= 1: self.fdstidx = NULL_INDEX diff --git a/src/calibre/ebooks/mobi/reader/index.py b/src/calibre/ebooks/mobi/reader/index.py index b292d55c13..dd85b5a5cb 100644 --- a/src/calibre/ebooks/mobi/reader/index.py +++ b/src/calibre/ebooks/mobi/reader/index.py @@ -8,9 +8,13 @@ __copyright__ = '2012, Kovid Goyal ' __docformat__ = 'restructuredtext en' import struct -from collections import OrderedDict +from collections import OrderedDict, namedtuple -from calibre.ebooks.mobi.utils import decint, count_set_bits +from calibre.ebooks.mobi.utils import (decint, count_set_bits, + decode_string) + +TagX = namedtuple('TagX', 'tag num_of_values bitmask eof') +PTagX = namedtuple('PTagX', 'tag value_count value_bytes num_of_values') class InvalidFile(ValueError): pass @@ -37,9 +41,8 @@ def parse_indx_header(data): 'lng', 'total', 'ordt', 'ligt', 'nligt', 'ncncx' ) num = len(words) - values = struct.unpack(b'>%dL' % num, data[4:4*(num+1)]) - header = {words[i]:values[i] for i in xrange(num)} - return header + values = struct.unpack(bytes('>%dL' % num), data[4:4*(num+1)]) + return dict(zip(words, values)) class CNCX(object): # {{{ @@ -77,81 +80,116 @@ class CNCX(object): # {{{ return self.records.get(offset, default) # }}} -def parse_tag_section(data): +def parse_tagx_section(data): check_signature(data, b'TAGX') tags = [] - first_entry_offset, = struct.unpack_from(b'>L', data, 0x04) - control_byte_count, = struct.unpack_from(b'>L', data, 0x08) + first_entry_offset, = struct.unpack_from(b'>L', data, 4) + control_byte_count, = struct.unpack_from(b'>L', data, 8) - # Skip the first 12 bytes already read above. 
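# ---------------------------------------------------------------------------
# [Editor's sketch - illustrative only, not part of this patch] Each TAGX
# entry read by the loop below is exactly four bytes - (tag, values per
# entry, bitmask, end-of-field flag) - matching the TagX namedtuple defined
# above. Fabricated section with one real tag and the terminating EOF entry:
import struct
from collections import namedtuple

TagX = namedtuple('TagX', 'tag num_of_values bitmask eof')

def read_tagx(data):
    assert data[:4] == b'TAGX'
    first_entry_offset, control_byte_count = struct.unpack_from(b'>LL', data, 4)
    entries = [TagX(*bytearray(data[i:i+4]))
               for i in range(12, first_entry_offset, 4)]
    return control_byte_count, entries

demo = (b'TAGX' + struct.pack(b'>LL', 20, 1) +
        b'\x01\x01\x01\x00' +   # tag 1 (offset), 1 value, bitmask 0b1
        b'\x00\x00\x00\x01')    # EOF entry
print(read_tagx(demo))
# --------------------------------------------------------------- end sketch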
for i in xrange(12, first_entry_offset, 4): - pos = i - tags.append((ord(data[pos]), ord(data[pos+1]), ord(data[pos+2]), - ord(data[pos+3]))) + vals = list(bytearray(data[i:i+4])) + tags.append(TagX(*vals)) return control_byte_count, tags -def get_tag_map(control_byte_count, tags, data, start, end): +def get_tag_map(control_byte_count, tagx, data, strict=False): ptags = [] ans = {} - control_byte_index = 0 - data_start = start + control_byte_count + control_bytes = list(bytearray(data[:control_byte_count])) + data = data[control_byte_count:] - for tag, values_per_entry, mask, end_flag in tags: - if end_flag == 0x01: - control_byte_index += 1 + for x in tagx: + if x.eof == 0x01: + control_bytes = control_bytes[1:] continue - value = ord(data[start + control_byte_index]) & mask + value = control_bytes[0] & x.bitmask if value != 0: - if value == mask: - if count_set_bits(mask) > 1: + value_count = value_bytes = None + if value == x.bitmask: + if count_set_bits(x.bitmask) > 1: # If all bits of masked value are set and the mask has more # than one bit, a variable width value will follow after # the control bytes which defines the length of bytes (NOT # the value count!) which will contain the corresponding # variable width values. - value, consumed = decint(data[data_start:]) - data_start += consumed - ptags.append((tag, None, value, values_per_entry)) + value_bytes, consumed = decint(data) + data = data[consumed:] else: - ptags.append((tag, 1, None, values_per_entry)) + value_count = 1 else: # Shift bits to get the masked value. - while mask & 0x01 == 0: - mask = mask >> 1 - value = value >> 1 - ptags.append((tag, value, None, values_per_entry)) - for tag, value_count, value_bytes, values_per_entry in ptags: + mask = x.bitmask + while mask & 0b1 == 0: + mask >>= 1 + value >>= 1 + value_count = value + ptags.append(PTagX(x.tag, value_count, value_bytes, + x.num_of_values)) + + for x in ptags: values = [] - if value_count != None: + if x.value_count is not None: # Read value_count * values_per_entry variable width values. - for _ in xrange(value_count*values_per_entry): - byts, consumed = decint(data[data_start:]) - data_start += consumed + for _ in xrange(x.value_count * x.num_of_values): + byts, consumed = decint(data) + data = data[consumed:] values.append(byts) - else: + else: # value_bytes is not None # Convert value_bytes to variable width values. total_consumed = 0 - while total_consumed < value_bytes: + while total_consumed < x.value_bytes: # Does this work for values_per_entry != 1? - byts, consumed = decint(data[data_start:]) - data_start += consumed + byts, consumed = decint(data) + data = data[consumed:] total_consumed += consumed values.append(byts) - if total_consumed != value_bytes: - print ("Error: Should consume %s bytes, but consumed %s" % - (value_bytes, total_consumed)) - ans[tag] = values - # Test that all bytes have been processed if end is given. - if end is not None and data_start < end: - # The last entry might have some zero padding bytes, so complain only if non zero bytes are left. 
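# ---------------------------------------------------------------------------
# [Editor's sketch - illustrative only, not part of this patch] The core move
# in get_tag_map() above: mask the control byte with the tag's bitmask, then
# shift both down so the masked bits read as a value count. (The special case
# where *all* masked bits are set - meaning a byte length follows instead -
# is handled separately above.)
def masked_value_count(control_byte, bitmask):
    value = control_byte & bitmask
    while bitmask & 0b1 == 0:  # align mask and value to bit 0
        bitmask >>= 1
        value >>= 1
    return value

# bitmask 0b1100, control byte 0b1000 -> two values follow for this tag
assert masked_value_count(0b00001000, 0b00001100) == 2
# --------------------------------------------------------------- end sketch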
- rest = data[data_start:end] - if rest.replace(b'\0', b''): - print ("Warning: There are unprocessed index bytes left: %s" % - format_bytes(rest)) + if total_consumed != x.value_bytes: + err = ("Error: Should consume %s bytes, but consumed %s" % + (x.value_bytes, total_consumed)) + if strict: + raise ValueError(err) + else: + print(err) + ans[x.tag] = values + # Test that all bytes have been processed + if data.replace(b'\0', b''): + err = ("Warning: There are unprocessed index bytes left: %s" % + format_bytes(data)) + if strict: + raise ValueError(err) + else: + print(err) return ans +def parse_index_record(table, data, control_byte_count, tags, codec, + strict=False): + header = parse_indx_header(data) + idxt_pos = header['start'] + if data[idxt_pos:idxt_pos+4] != b'IDXT': + print ('WARNING: Invalid INDX record') + entry_count = header['count'] + + # loop through to build up the IDXT position starts + idx_positions= [] + for j in xrange(entry_count): + pos, = struct.unpack_from(b'>H', data, idxt_pos + 4 + (2 * j)) + idx_positions.append(pos) + # The last entry ends before the IDXT tag (but there might be zero fill + # bytes we need to ignore!) + idx_positions.append(idxt_pos) + + # For each entry in the IDXT build up the tag map and any associated + # text + for j in xrange(entry_count): + start, end = idx_positions[j:j+2] + rec = data[start:end] + ident, consumed = decode_string(rec, codec=codec) + rec = rec[consumed:] + tag_map = get_tag_map(control_byte_count, tags, rec, strict=strict) + table[ident] = tag_map + + def read_index(sections, idx, codec): table, cncx = OrderedDict(), CNCX([], codec) @@ -166,32 +204,11 @@ def read_index(sections, idx, codec): cncx = CNCX(cncx_records, codec) tag_section_start = indx_header['len'] - control_byte_count, tags = parse_tag_section(data[tag_section_start:]) + control_byte_count, tags = parse_tagx_section(data[tag_section_start:]) for i in xrange(idx + 1, idx + 1 + indx_count): + # Index record data = sections[i][0] - header = parse_indx_header(data) - idxt_pos = header['start'] - entry_count = header['count'] - - # loop through to build up the IDXT position starts - idx_positions= [] - for j in xrange(entry_count): - pos, = struct.unpack_from(b'>H', data, idxt_pos + 4 + (2 * j)) - idx_positions.append(pos) - # The last entry ends before the IDXT tag (but there might be zero fill - # bytes we need to ignore!) 
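parse_index_record above finds entry boundaries by walking the IDXT table: one big-endian 16-bit start offset per entry, with the position of the IDXT tag itself closing the last entry. A sketch against a synthetic record (the layout mirrors the code; the offsets are invented):

import struct

# Synthetic record: 16 filler bytes, the IDXT signature, then two
# 16-bit big-endian entry start offsets pointing into the filler.
idxt_pos = 16
data = b'\0' * idxt_pos + b'IDXT' + struct.pack(b'>2H', 4, 10)
entry_count = 2

idx_positions = []
for j in range(entry_count):
    pos, = struct.unpack_from(b'>H', data, idxt_pos + 4 + 2 * j)
    idx_positions.append(pos)
idx_positions.append(idxt_pos)  # last entry ends at the IDXT tag

# Entry j spans data[idx_positions[j]:idx_positions[j+1]]
assert idx_positions == [4, 10, 16]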
- idx_positions.append(idxt_pos) - - # For each entry in the IDXT build up the tag map and any associated - # text - for j in xrange(entry_count): - start, end = idx_positions[j:j+2] - text_length = ord(data[start]) - text = data[start+1:start+1+text_length] - tag_map = get_tag_map(control_byte_count, tags, data, - start+1+text_length, end) - table[text] = tag_map - + parse_index_record(table, data, control_byte_count, tags, codec) return table, cncx diff --git a/src/calibre/ebooks/mobi/reader/markup.py b/src/calibre/ebooks/mobi/reader/markup.py index 26583cf30c..8bb7f211f3 100644 --- a/src/calibre/ebooks/mobi/reader/markup.py +++ b/src/calibre/ebooks/mobi/reader/markup.py @@ -33,9 +33,11 @@ def update_internal_links(mobi8_reader): for m in posfid_index_pattern.finditer(tag): posfid = m.group(1) offset = m.group(2) - filename, idtag = mr.get_id_tag_by_pos_fid(posfid, offset) + filename, idtag = mr.get_id_tag_by_pos_fid(int(posfid, 32), + int(offset, 32)) suffix = (b'#' + idtag) if idtag else b'' - replacement = filename.encode(mr.header.codec) + suffix + replacement = filename.split('/')[-1].encode( + mr.header.codec) + suffix tag = posfid_index_pattern.sub(replacement, tag, 1) srcpieces[j] = tag part = ''.join([x.decode(mr.header.codec) for x in srcpieces]) diff --git a/src/calibre/ebooks/mobi/reader/mobi6.py b/src/calibre/ebooks/mobi/reader/mobi6.py index c8dec607c1..6dd789755d 100644 --- a/src/calibre/ebooks/mobi/reader/mobi6.py +++ b/src/calibre/ebooks/mobi/reader/mobi6.py @@ -107,7 +107,10 @@ class MobiReader(object): self.kf8_type = None k8i = getattr(self.book_header.exth, 'kf8_header', None) - if self.book_header.mobi_version == 8: + # Ancient PRC files from Baen can have random values for + # mobi_version, so be conservative + if (self.book_header.mobi_version == 8 and hasattr(self.book_header, + 'skelidx')): self.kf8_type = 'standalone' elif k8i is not None: # Check for joint mobi 6 and kf 8 file try: @@ -118,12 +121,17 @@ class MobiReader(object): try: self.book_header = BookHeader(self.sections[k8i][0], self.ident, user_encoding, self.log) - # The following are only correct in the Mobi 6 - # header not the Mobi 8 header + + # Only the first_image_index from the MOBI 6 header is + # useful for x in ('first_image_index',): setattr(self.book_header, x, getattr(bh, x)) + + # We need to do this because the MOBI 6 text extract code + # does not know anything about the kf8 offset if hasattr(self.book_header, 'huff_offset'): self.book_header.huff_offset += k8i + self.kf8_type = 'joint' self.kf8_boundary = k8i-1 except: diff --git a/src/calibre/ebooks/mobi/reader/mobi8.py b/src/calibre/ebooks/mobi/reader/mobi8.py index f5421bc9ea..ec7166ebb0 100644 --- a/src/calibre/ebooks/mobi/reader/mobi8.py +++ b/src/calibre/ebooks/mobi/reader/mobi8.py @@ -33,6 +33,7 @@ class Mobi8Reader(object): def __init__(self, mobi6_reader, log): self.mobi6_reader, self.log = mobi6_reader, log self.header = mobi6_reader.book_header + self.encrypted_fonts = [] def __call__(self): self.mobi6_reader.check_for_drm() @@ -229,11 +230,9 @@ class Mobi8Reader(object): def get_id_tag_by_pos_fid(self, posfid, offset): # first convert kindle:pos:fid and offset info to position in file - row = int(posfid, 32) - off = int(offset, 32) - [insertpos, idtext, filenum, seqnm, startpos, length] = self.elems[row] - pos = insertpos + off - fname = self.get_file_info(pos).filename + insertpos, idtext, filenum, seqnm, startpos, length = self.elems[posfid] + pos = insertpos + offset + fi = self.get_file_info(pos) # an existing "id=" 
must exist in original xhtml otherwise it would not # have worked for linking. Amazon seems to have added its own # additional "aid=" inside tags whose contents seem to represent some @@ -242,7 +241,7 @@ class Mobi8Reader(object): # so find the closest "id=" before position the file by actually # searching in that file idtext = self.get_id_tag(pos) - return fname, idtext + return '%s/%s'%(fi.type, fi.filename), idtext def get_id_tag(self, pos): # find the correct tag by actually searching in the destination @@ -253,12 +252,13 @@ class Mobi8Reader(object): textblock = self.parts[fi.num] id_map = [] npos = pos - fi.start - # if npos inside a tag then search all text before the its end of tag - # marker pgt = textblock.find(b'>', npos) plt = textblock.find(b'<', npos) - if pgt < plt: + # if npos inside a tag then search all text before the its end of tag marker + # else not in a tag need to search the preceding tag + if plt == npos or pgt < plt: npos = pgt + 1 + textblock = textblock[0:npos] # find id links only inside of tags # inside any < > pair find all "id=' and return whatever is inside # the quotes @@ -315,12 +315,18 @@ class Mobi8Reader(object): # Add href and anchor info to the index entries for entry in index_entries: - pos = entry['pos'] - fi = self.get_file_info(pos) - if fi.filename is None: - raise ValueError('Index entry has invalid pos: %d'%pos) - idtag = self.get_id_tag(pos).decode(self.header.codec) - entry['href'] = '%s/%s'%(fi.type, fi.filename) + pos_fid = entry['pos_fid'] + if pos_fid is None: + pos = entry['pos'] + fi = self.get_file_info(pos) + if fi.filename is None: + raise ValueError('Index entry has invalid pos: %d'%pos) + idtag = self.get_id_tag(pos).decode(self.header.codec) + href = '%s/%s'%(fi.type, fi.filename) + else: + href, idtag = self.get_id_tag_by_pos_fid(*pos_fid) + + entry['href'] = href entry['idtag'] = idtag # Build the TOC object @@ -350,6 +356,8 @@ class Mobi8Reader(object): with open(href.replace('/', os.sep), 'wb') as f: f.write(font['font_data'] if font['font_data'] else font['raw_data']) + if font['encrypted']: + self.encrypted_fonts.append(href) else: imgtype = imghdr.what(None, data) if imgtype is None: diff --git a/src/calibre/ebooks/mobi/reader/ncx.py b/src/calibre/ebooks/mobi/reader/ncx.py index 96ab4ac70d..ca3255e100 100644 --- a/src/calibre/ebooks/mobi/reader/ncx.py +++ b/src/calibre/ebooks/mobi/reader/ncx.py @@ -10,7 +10,6 @@ __docformat__ = 'restructuredtext en' import os from calibre.ebooks.metadata.toc import TOC -from calibre.ebooks.mobi.utils import to_base from calibre.ebooks.mobi.reader.headers import NULL_INDEX from calibre.ebooks.mobi.reader.index import read_index @@ -23,7 +22,30 @@ tag_fieldname_map = { 6: ['pos_fid',0], 21: ['parent',0], 22: ['child1',0], - 23: ['childn',0] + 23: ['childn',0], + 69: ['image_index',0], + 70 : ['desc_offset', 0], # 'Description offset in cncx' + 71 : ['author_offset', 0], # 'Author offset in cncx' + 72 : ['image_caption_offset', 0], # 'Image caption offset in cncx', + 73 : ['image_attr_offset', 0], # 'Image attribution offset in cncx', + +} + +default_entry = { + 'pos': -1, + 'len': 0, + 'noffs': -1, + 'text' : "Unknown Text", + 'hlvl' : -1, + 'kind' : "Unknown Class", + 'pos_fid' : None, + 'parent' : -1, + 'child1' : -1, + 'childn' : -1, + 'description': None, + 'author': None, + 'image_caption': None, + 'image_attribution': None, } def read_ncx(sections, index, codec): @@ -34,32 +56,25 @@ def read_ncx(sections, index, codec): for num, x in enumerate(table.iteritems()): text, tag_map = x - 
entry = { - 'name': text, - 'pos': -1, - 'len': 0, - 'noffs': -1, - 'text' : "Unknown Text", - 'hlvl' : -1, - 'kind' : "Unknown Kind", - 'pos_fid' : None, - 'parent' : -1, - 'child1' : -1, - 'childn' : -1, - 'num' : num - } + entry = default_entry.copy() + entry['name'] = text + entry['num'] = num - for tag in tag_fieldname_map.keys(): + for tag in tag_fieldname_map.iterkeys(): fieldname, i = tag_fieldname_map[tag] if tag in tag_map: fieldvalue = tag_map[tag][i] if tag == 6: - fieldvalue = to_base(fieldvalue, base=32) + # Appears to be an idx into the KF8 elems table with an + # offset + fieldvalue = tuple(tag_map[tag]) entry[fieldname] = fieldvalue - if tag == 3: - entry['text'] = cncx.get(fieldvalue, 'Unknown Text') - if tag == 5: - entry['kind'] = cncx.get(fieldvalue, 'Unknown Kind') + for which, name in {3:'text', 5:'kind', 70:'description', + 71:'author', 72:'image_caption', + 73:'image_attribution'}.iteritems(): + if tag == which: + entry[name] = cncx.get(fieldvalue, + default_entry[name]) index_entries.append(entry) return index_entries diff --git a/src/calibre/ebooks/mobi/utils.py b/src/calibre/ebooks/mobi/utils.py index 6ec86f77ee..4c1e52e119 100644 --- a/src/calibre/ebooks/mobi/utils.py +++ b/src/calibre/ebooks/mobi/utils.py @@ -15,7 +15,13 @@ from calibre.ebooks import normalize IMAGE_MAX_SIZE = 10 * 1024 * 1024 -def decode_hex_number(raw): +def decode_string(raw, codec='utf-8'): + length, = struct.unpack(b'>B', raw[0]) + raw = raw[1:1+length] + consumed = length+1 + return raw.decode(codec), consumed + +def decode_hex_number(raw, codec='utf-8'): ''' Return a variable length number encoded using hexadecimal encoding. These numbers have the first byte which tells the number of bytes that follow. @@ -25,13 +31,16 @@ def decode_hex_number(raw): :param raw: Raw binary data as a bytestring :return: The number and the number of bytes from raw that the number - occupies + occupies. ''' - length, = struct.unpack(b'>B', raw[0]) - raw = raw[1:1+length] - consumed = length+1 + raw, consumed = decode_string(raw, codec=codec) return int(raw, 16), consumed +def encode_string(raw): + ans = bytearray(bytes(raw)) + ans.insert(0, len(ans)) + return bytes(ans) + def encode_number_as_hex(num): ''' Encode num as a variable length encoded hexadecimal number. Returns the @@ -44,9 +53,7 @@ def encode_number_as_hex(num): nlen = len(num) if nlen % 2 != 0: num = b'0'+num - ans = bytearray(num) - ans.insert(0, len(num)) - return bytes(ans) + return encode_string(num) def encint(value, forward=True): ''' @@ -430,7 +437,7 @@ def read_font_record(data, extent=1040): # {{{ # The zlib compressed data begins with 2 bytes of header and # has 4 bytes of checksum at the end ans = {'raw_data':data, 'font_data':None, 'err':None, 'ext':'failed', - 'headers':None} + 'headers':None, 'encrypted':False} try: usize, flags, dstart, xor_len, xor_start = struct.unpack_from( @@ -453,6 +460,7 @@ def read_font_record(data, extent=1040): # {{{ buf[n] ^= key[n%xor_len] # XOR of buf and key font_data = bytes(buf) + ans['encrypted'] = True if flags & 0b1: # ZLIB compressed data diff --git a/src/calibre/ebooks/rtf/rtfml.py b/src/calibre/ebooks/rtf/rtfml.py index 60f69e2e17..0fdc6cad1d 100644 --- a/src/calibre/ebooks/rtf/rtfml.py +++ b/src/calibre/ebooks/rtf/rtfml.py @@ -234,13 +234,15 @@ class RTFMLizer(object): # Process tags that need special processing and that do not have inner # text. 
Usually these require an argument if tag == 'img': - src = os.path.basename(elem.get('src')) - block_start = '' - block_end = '' - if 'block' not in tag_stack: - block_start = '{\\par\\pard\\hyphpar ' - block_end = '}' - text += '%s SPECIAL_IMAGE-%s-REPLACE_ME %s' % (block_start, src, block_end) + src = elem.get('src') + if src: + src = os.path.basename(elem.get('src')) + block_start = '' + block_end = '' + if 'block' not in tag_stack: + block_start = '{\\par\\pard\\hyphpar ' + block_end = '}' + text += '%s SPECIAL_IMAGE-%s-REPLACE_ME %s' % (block_start, src, block_end) single_tag = SINGLE_TAGS.get(tag, None) if single_tag: diff --git a/src/calibre/gui2/actions/add.py b/src/calibre/gui2/actions/add.py index bb695db841..bbdef5b1b5 100644 --- a/src/calibre/gui2/actions/add.py +++ b/src/calibre/gui2/actions/add.py @@ -70,6 +70,9 @@ class AddAction(InterfaceAction): self.add_menu.addSeparator() ma('add-formats', _('Add files to selected book records'), triggered=self.add_formats, shortcut=_('Shift+A')) + self.add_menu.addSeparator() + ma('add-config', _('Configure the adding of books'), + triggered=self.add_config) self.qaction.triggered.connect(self.add_books) @@ -78,6 +81,11 @@ class AddAction(InterfaceAction): for action in list(self.add_menu.actions())[1:]: action.setEnabled(enabled) + def add_config(self): + self.gui.iactions['Preferences'].do_config( + initial_plugin=('Import/Export', 'Adding'), + close_after_initial=True) + def add_formats(self, *args): if self.gui.stack.currentIndex() != 0: return diff --git a/src/calibre/gui2/store/stores/nexto_plugin.py b/src/calibre/gui2/store/stores/nexto_plugin.py index 16004908df..f7572e6522 100644 --- a/src/calibre/gui2/store/stores/nexto_plugin.py +++ b/src/calibre/gui2/store/stores/nexto_plugin.py @@ -3,7 +3,7 @@ from __future__ import (unicode_literals, division, absolute_import, print_function) __license__ = 'GPL 3' -__copyright__ = '2011, Tomasz Długosz ' +__copyright__ = '2011-2012, Tomasz Długosz ' __docformat__ = 'restructuredtext en' import re @@ -47,41 +47,47 @@ class NextoStore(BasicStoreConfig, StorePlugin): url = 'http://www.nexto.pl/szukaj.xml?search-clause=' + urllib.quote_plus(query) + '&scid=1015' br = browser() + offset=0 counter = max_results - with closing(br.open(url, timeout=timeout)) as f: - doc = html.fromstring(f.read()) - for data in doc.xpath('//ul[@class="productslist"]/li'): - if counter <= 0: + + while counter: + with closing(br.open(url + '&_offset=' + str(offset), timeout=timeout)) as f: + doc = html.fromstring(f.read()) + for data in doc.xpath('//ul[@class="productslist"]/li'): + if counter <= 0: + break + + id = ''.join(data.xpath('.//div[@class="cover_container"]/a[1]/@href')) + if not id: + continue + + price = ''.join(data.xpath('.//strong[@class="nprice"]/text()')) + + cover_url = ''.join(data.xpath('.//img[@class="cover"]/@src')) + title = ''.join(data.xpath('.//a[@class="title"]/text()')) + title = re.sub(r' - ebook$', '', title) + formats = ', '.join(data.xpath('.//ul[@class="formats_available"]/li//b/text()')) + DrmFree = re.search(r'bez.DRM', formats) + formats = re.sub(r'\(.+\)', '', formats) + + author = '' + with closing(br.open('http://www.nexto.pl/' + id.strip(), timeout=timeout/4)) as nf: + idata = html.fromstring(nf.read()) + author = ', '.join(idata.xpath('//div[@class="basic_data"]/p[1]/b/a/text()')) + + counter -= 1 + + s = SearchResult() + s.cover_url = cover_url + s.title = title.strip() + s.author = author.strip() + s.price = price + s.detail_item = id.strip() + s.drm = 
SearchResult.DRM_UNLOCKED if DrmFree else SearchResult.DRM_LOCKED + s.formats = formats.upper().strip() + + yield s + if not doc.xpath('//div[@class="listnavigator"]//a[@class="next"]'): break - - id = ''.join(data.xpath('.//div[@class="cover_container"]/a[1]/@href')) - if not id: - continue - - price = ''.join(data.xpath('.//strong[@class="nprice"]/text()')) - - cover_url = ''.join(data.xpath('.//img[@class="cover"]/@src')) - title = ''.join(data.xpath('.//a[@class="title"]/text()')) - title = re.sub(r' - ebook$', '', title) - formats = ', '.join(data.xpath('.//ul[@class="formats_available"]/li//b/text()')) - DrmFree = re.search(r'bez.DRM', formats) - formats = re.sub(r'\(.+\)', '', formats) - - author = '' - with closing(br.open('http://www.nexto.pl/' + id.strip(), timeout=timeout/4)) as nf: - idata = html.fromstring(nf.read()) - author = ', '.join(idata.xpath('//div[@class="basic_data"]/p[1]/b/a/text()')) - - counter -= 1 - - s = SearchResult() - s.cover_url = cover_url - s.title = title.strip() - s.author = author.strip() - s.price = price - s.detail_item = id.strip() - s.drm = SearchResult.DRM_UNLOCKED if DrmFree else SearchResult.DRM_LOCKED - s.formats = formats.upper().strip() - - yield s + offset+=10 diff --git a/src/calibre/gui2/viewer/config.ui b/src/calibre/gui2/viewer/config.ui index 3158241f28..f876b87fc3 100644 --- a/src/calibre/gui2/viewer/config.ui +++ b/src/calibre/gui2/viewer/config.ui @@ -255,7 +255,10 @@ - + + + Set the maximum width that the book's text and pictures will take when in fullscreen mode. This allows you to read the book text without it becoming too wide. + px @@ -270,10 +273,10 @@ - Maximum &view width: + Maximum text width in &fullscreen: - max_view_width + max_fs_width @@ -350,7 +353,7 @@ serif_family sans_family mono_family - max_view_width + max_fs_width opt_remember_window_size buttonBox diff --git a/src/calibre/gui2/viewer/documentview.py b/src/calibre/gui2/viewer/documentview.py index 2f520c1912..7999458004 100644 --- a/src/calibre/gui2/viewer/documentview.py +++ b/src/calibre/gui2/viewer/documentview.py @@ -12,7 +12,7 @@ from PyQt4.Qt import (QSize, QSizePolicy, QUrl, SIGNAL, Qt, QTimer, QPainter, QPalette, QBrush, QFontDatabase, QDialog, QColor, QPoint, QImage, QRegion, QVariant, QIcon, QFont, pyqtSignature, QAction, QByteArray, QMenu, - pyqtSignal, QSwipeGesture) + pyqtSignal, QSwipeGesture, QApplication) from PyQt4.QtWebKit import QWebPage, QWebView, QWebSettings from calibre.utils.config import Config, StringConfig @@ -46,8 +46,10 @@ def config(defaults=None): help=_('Remember last used window size')) c.add_opt('user_css', default='', help=_('Set the user CSS stylesheet. This can be used to customize the look of all books.')) - c.add_opt('max_view_width', default=6000, - help=_('Maximum width of the viewer window, in pixels.')) + c.add_opt('max_fs_width', default=800, + help=_("Set the maximum width that the book's text and pictures will take" + " when in fullscreen mode. This allows you to read the book text" + " without it becoming too wide.")) c.add_opt('fit_images', default=True, help=_('Resize images larger than the viewer window to fit inside it')) c.add_opt('hyphenate', default=False, help=_('Hyphenate text')) @@ -101,7 +103,7 @@ class ConfigDialog(QDialog, Ui_Dialog): self.standard_font.setCurrentIndex({'serif':0, 'sans':1, 'mono':2}[opts.standard_font]) self.css.setPlainText(opts.user_css) self.css.setToolTip(_('Set the user CSS stylesheet. 
This can be used to customize the look of all books.')) - self.max_view_width.setValue(opts.max_view_width) + self.max_fs_width.setValue(opts.max_fs_width) with zipfile.ZipFile(P('viewer/hyphenate/patterns.zip', allow_user_override=False), 'r') as zf: pats = [x.split('.')[0].replace('-', '_') for x in zf.namelist()] @@ -144,7 +146,7 @@ class ConfigDialog(QDialog, Ui_Dialog): c.set('user_css', unicode(self.css.toPlainText())) c.set('remember_window_size', self.opt_remember_window_size.isChecked()) c.set('fit_images', self.opt_fit_images.isChecked()) - c.set('max_view_width', int(self.max_view_width.value())) + c.set('max_fs_width', int(self.max_fs_width.value())) c.set('hyphenate', self.hyphenate.isChecked()) c.set('remember_current_page', self.opt_remember_current_page.isChecked()) c.set('wheel_flips_pages', self.opt_wheel_flips_pages.isChecked()) @@ -192,6 +194,8 @@ class Document(QWebPage): # {{{ self.loaded_javascript = False self.js_loader = JavaScriptLoader( dynamic_coffeescript=self.debug_javascript) + self.initial_left_margin = self.initial_right_margin = u'' + self.in_fullscreen_mode = False self.setLinkDelegationPolicy(self.DelegateAllLinks) self.scroll_marks = [] @@ -239,6 +243,9 @@ class Document(QWebPage): # {{{ self.enable_page_flip = self.page_flip_duration > 0.1 self.font_magnification_step = opts.font_magnification_step self.wheel_flips_pages = opts.wheel_flips_pages + screen_width = QApplication.desktop().screenGeometry().width() + # Leave some space for the scrollbar and some border + self.max_fs_width = min(opts.max_fs_width, screen_width-50) def fit_images(self): if self.do_fit_images: @@ -274,6 +281,30 @@ class Document(QWebPage): # {{{ self.set_bottom_padding(0) self.fit_images() self.init_hyphenate() + self.initial_left_margin = unicode(self.javascript( + 'document.body.style.marginLeft').toString()) + self.initial_right_margin = unicode(self.javascript( + 'document.body.style.marginRight').toString()) + if self.in_fullscreen_mode: + self.switch_to_fullscreen_mode() + + def switch_to_fullscreen_mode(self): + self.in_fullscreen_mode = True + self.javascript(''' + var s = document.body.style; + s.maxWidth = "%dpx"; + s.marginLeft = "auto"; + s.marginRight = "auto"; + '''%self.max_fs_width) + + def switch_to_window_mode(self): + self.in_fullscreen_mode = False + self.javascript(''' + var s = document.body.style; + s.maxWidth = "none"; + s.marginLeft = "%s"; + s.marginRight = "%s"; + '''%(self.initial_left_margin, self.initial_right_margin)) @pyqtSignature("QString") def debug(self, msg): @@ -581,8 +612,8 @@ class DocumentView(QWebView): # {{{ def config(self, parent=None): self.document.do_config(parent) - if self.manager is not None: - self.manager.set_max_width() + if self.document.in_fullscreen_mode: + self.document.switch_to_fullscreen_mode() self.setFocus(Qt.OtherFocusReason) def bookmark(self): @@ -602,6 +633,9 @@ class DocumentView(QWebView): # {{{ menu.insertAction(list(menu.actions())[0], self.search_action) menu.addSeparator() menu.addAction(self.goto_location_action) + if self.document.in_fullscreen_mode and self.manager is not None: + menu.addSeparator() + menu.addAction(self.manager.toggle_toolbar_action) menu.exec_(ev.globalPos()) def lookup(self, *args): diff --git a/src/calibre/gui2/viewer/main.py b/src/calibre/gui2/viewer/main.py index 64521ecdd7..c1cb89aeb6 100644 --- a/src/calibre/gui2/viewer/main.py +++ b/src/calibre/gui2/viewer/main.py @@ -5,11 +5,11 @@ import traceback, os, sys, functools, collections, re from functools import partial from 
threading import Thread -from PyQt4.Qt import QApplication, Qt, QIcon, QTimer, SIGNAL, QByteArray, \ - QDoubleSpinBox, QLabel, QTextBrowser, \ - QPainter, QBrush, QColor, QStandardItemModel, QPalette, \ - QStandardItem, QUrl, QRegExpValidator, QRegExp, QLineEdit, \ - QToolButton, QMenu, QInputDialog, QAction, QKeySequence +from PyQt4.Qt import (QApplication, Qt, QIcon, QTimer, SIGNAL, QByteArray, + QSize, QDoubleSpinBox, QLabel, QTextBrowser, QPropertyAnimation, + QPainter, QBrush, QColor, QStandardItemModel, QPalette, QStandardItem, + QUrl, QRegExpValidator, QRegExp, QLineEdit, QToolButton, QMenu, + QInputDialog, QAction, QKeySequence) from calibre.gui2.viewer.main_ui import Ui_EbookViewer from calibre.gui2.viewer.printing import Printing @@ -55,8 +55,6 @@ class TOC(QStandardItemModel): self.appendRow(TOCItem(t)) self.setHorizontalHeaderItem(0, QStandardItem(_('Table of Contents'))) - - class Worker(Thread): def run(self): @@ -292,6 +290,37 @@ class EbookViewer(MainWindow, Ui_EbookViewer): self.tool_bar2.setContextMenuPolicy(Qt.PreventContextMenu) self.tool_bar.widgetForAction(self.action_bookmark).setPopupMode(QToolButton.MenuButtonPopup) self.action_full_screen.setCheckable(True) + self.full_screen_label = QLabel(''' +
<center>
+ <h1>%s</h1>
+ <h3>%s</h3>
+ <h3>%s</h3>
+ </center>
+ '''%(_('Full screen mode'), + _('Right click to show controls'), + _('Press Esc to quit')), + self) + self.full_screen_label.setVisible(False) + self.full_screen_label.setStyleSheet(''' + QLabel { + text-align: center; + background-color: white; + color: black; + border-width: 1px; + border-style: solid; + border-radius: 20px; + } + ''') + self.toggle_toolbar_action = QAction(_('Show/hide controls'), self) + self.toggle_toolbar_action.triggered.connect(self.toggle_toolbars) + self.addAction(self.toggle_toolbar_action) + self.full_screen_label_anim = QPropertyAnimation( + self.full_screen_label, 'size') + self.esc_full_screen_action = a = QAction(self) + self.addAction(a) + a.setShortcut(Qt.Key_Escape) + a.setEnabled(False) + a.triggered.connect(self.action_full_screen.trigger) self.print_menu = QMenu() self.print_menu.addAction(QIcon(I('print-preview.png')), _('Print Preview')) @@ -299,7 +328,6 @@ class EbookViewer(MainWindow, Ui_EbookViewer): self.tool_bar.widgetForAction(self.action_print).setPopupMode(QToolButton.MenuButtonPopup) self.connect(self.action_print, SIGNAL("triggered(bool)"), partial(self.print_book, preview=False)) self.connect(self.print_menu.actions()[0], SIGNAL("triggered(bool)"), partial(self.print_book, preview=True)) - self.set_max_width() ca = self.view.copy_action ca.setShortcut(QKeySequence.Copy) self.addAction(ca) @@ -313,6 +341,13 @@ class EbookViewer(MainWindow, Ui_EbookViewer): w = self.tool_bar.widgetForAction(self.action_open_ebook) w.setPopupMode(QToolButton.MenuButtonPopup) + for x in ('tool_bar', 'tool_bar2'): + x = getattr(self, x) + for action in x.actions(): + # So that the keyboard shortcuts for these actions will + # continue to function even when the toolbars are hidden + self.addAction(action) + self.restore_state() def set_toc_visible(self, yes): @@ -338,9 +373,18 @@ class EbookViewer(MainWindow, Ui_EbookViewer): count += 1 def closeEvent(self, e): + if self.isFullScreen(): + self.action_full_screen.trigger() + e.ignore() + return self.save_state() return MainWindow.closeEvent(self, e) + def toggle_toolbars(self): + for x in ('tool_bar', 'tool_bar2'): + x = getattr(self, x) + x.setVisible(not x.isVisible()) + def save_state(self): state = bytearray(self.saveState(self.STATE_VERSION)) vprefs['viewer_toolbar_state'] = state @@ -382,11 +426,6 @@ class EbookViewer(MainWindow, Ui_EbookViewer): self._lookup = None self.dictionary_view.setHtml(html) - def set_max_width(self): - from calibre.gui2.viewer.documentview import config - c = config().parse() - self.frame.setMaximumWidth(c.max_view_width) - def get_remember_current_page_opt(self): from calibre.gui2.viewer.documentview import config c = config().parse() @@ -401,6 +440,46 @@ class EbookViewer(MainWindow, Ui_EbookViewer): else: self.showFullScreen() + def showFullScreen(self): + self.tool_bar.setVisible(False) + self.tool_bar2.setVisible(False) + self._original_frame_margins = ( + self.centralwidget.layout().contentsMargins(), + self.frame.layout().contentsMargins()) + self.frame.layout().setContentsMargins(0, 0, 0, 0) + self.centralwidget.layout().setContentsMargins(0, 0, 0, 0) + + super(EbookViewer, self).showFullScreen() + QTimer.singleShot(10, self.show_full_screen_label) + + def show_full_screen_label(self): + f = self.full_screen_label + self.esc_full_screen_action.setEnabled(True) + f.setVisible(True) + height = 200 + width = int(0.7*self.view.width()) + f.resize(width, height) + f.move((self.view.width() - width)//2, (self.view.height()-height)//2) + a = self.full_screen_label_anim + 
a.setDuration(500) + a.setStartValue(QSize(width, 0)) + a.setEndValue(QSize(width, height)) + a.start() + QTimer.singleShot(2750, self.full_screen_label.hide) + self.view.document.switch_to_fullscreen_mode() + + def showNormal(self): + self.esc_full_screen_action.setEnabled(False) + self.tool_bar.setVisible(True) + self.tool_bar2.setVisible(True) + self.full_screen_label.setVisible(False) + if hasattr(self, '_original_frame_margins'): + om = self._original_frame_margins + self.centralwidget.layout().setContentsMargins(om[0]) + self.frame.layout().setContentsMargins(om[1]) + super(EbookViewer, self).showNormal() + self.view.document.switch_to_window_mode() + def goto(self, ref): if ref: tokens = ref.split('.') diff --git a/src/calibre/gui2/viewer/main.ui b/src/calibre/gui2/viewer/main.ui index 3137ad2e07..659a534fa8 100644 --- a/src/calibre/gui2/viewer/main.ui +++ b/src/calibre/gui2/viewer/main.ui @@ -284,6 +284,9 @@ Toggle full screen + + Toggle full screen (F11) + diff --git a/src/calibre/gui2/widgets.py b/src/calibre/gui2/widgets.py index f3badd91c9..c9a3061295 100644 --- a/src/calibre/gui2/widgets.py +++ b/src/calibre/gui2/widgets.py @@ -15,6 +15,7 @@ from PyQt4.Qt import (QIcon, QFont, QLabel, QListWidget, QAction, QMenu, QStringListModel, QCompleter, QStringList, QTimer, QRect, QFontDatabase, QGraphicsView) +from calibre.constants import iswindows from calibre.gui2 import (NONE, error_dialog, pixmap_to_data, gprefs, warning_dialog) from calibre.gui2.filename_pattern_ui import Ui_Form @@ -365,7 +366,7 @@ class FontFamilyModel(QAbstractListModel): # {{{ self.families = list(qt_families.intersection(set(self.families))) self.families.sort() self.families[:0] = [_('None')] - self.font = QFont('sansserif') + self.font = QFont('Verdana' if iswindows else 'sansserif') def rowCount(self, *args): return len(self.families) diff --git a/src/calibre/utils/smartypants.py b/src/calibre/utils/smartypants.py index 8763a313fc..fa3119bf53 100644 --- a/src/calibre/utils/smartypants.py +++ b/src/calibre/utils/smartypants.py @@ -591,6 +591,21 @@ def educateQuotes(str): str = re.sub(r'''""''', """””""", str) str = re.sub(r"""''""", """’’""", str) + # Special case for Quotes at inside of other entities, e.g.: + #
<p>A double quote--"within dashes"--would be nice.</p>
+ str = re.sub(r"""(?<=\W)"(?=\w)""", r"""“""", str) + str = re.sub(r"""(?<=\W)'(?=\w)""", r"""‘""", str) + str = re.sub(r"""(?<=\w)"(?=\W)""", r"""”""", str) + str = re.sub(r"""(?<=\w)'(?=\W)""", r"""’""", str) + + # Special case for Quotes at end of line with a preceeding space (may change just to end of line) + str = re.sub(r"""(?<=\s)"$""", r"""”""", str) + str = re.sub(r"""(?<=\s)'$""", r"""’""", str) + + # Special case for Quotes at beginning of line with a space - multiparagraph quoted text: + str = re.sub(r"""^"(?=\s)""", r"""“""", str) + str = re.sub(r"""^'(?=\s)""", r"""‘""", str) + # Special case for decade abbreviations (the '80s): str = re.sub(r"""\b'(?=\d{2}s)""", r"""’""", str)
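As a quick check of the first pair of new rules, applied here in isolation (the full educateQuotes pass runs many other substitutions around them):

import re

s = u'A double quote--"within dashes"--would be nice.'
# A quote preceded by a non-word character and followed by a word
# character opens; the mirror image closes.
s = re.sub(r"""(?<=\W)"(?=\w)""", u"\u201c", s)
s = re.sub(r"""(?<=\w)"(?=\W)""", u"\u201d", s)
assert s == u'A double quote--\u201cwithin dashes\u201d--would be nice.'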