Merge from trunk

2025-07-09 03:04:10 -04:00 · 2012-03-20 19:01:20 +01:00 · 2012-03-20 19:01:20 +01:00 · ab19edb96f
commit ab19edb96f
parent d93c5f39ef 931d46cd84
36 changed files with 2062 additions and 1782 deletions
--- a/recipes/fhm_uk.recipe
+++ b/recipes/fhm_uk.recipe
@ -3,10 +3,11 @@ from calibre.web.feeds.news import BasicNewsRecipe
 class AdvancedUserRecipe1325006965(BasicNewsRecipe):
    title          = u'FHM UK'
    description = 'Good News for Men'
-    cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/373529_38324934806_64930243_n.jpg'
+    cover_url = 'http://www.greatmagazines.co.uk/covers/large/w197/current/fhm.jpg'
+    #   cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/373529_38324934806_64930243_n.jpg'
    masthead_url = 'http://www.fhm.com/App_Resources/Images/Site/re-design/logo.gif'
    __author__ = 'Dave Asbury'
-    # last updated 27/1/12
+    # last updated 17/3/12
    language = 'en_GB'
    oldest_article = 28
    max_articles_per_feed = 12
@ -29,6 +30,8 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
    feeds          = [
    (u'From the Homepage',u'http://feed43.com/8053226782885416.xml'),
    (u'Funny - The Very Best Of The Internet',u'http://feed43.com/4538510106331565.xml'),
-    (u'The Final Countdown', u'http://feed43.com/3576106158530118.xml'),
-    (u'Gaming',u'http://feed43.com/0755006465351035.xml'),
+                           (u'Upgrade',u'http://feed43.com/0877305847443234.xml'),
+    #(u'The Final Countdown', u'http://feed43.com/3576106158530118.xml'),
+    #(u'Gaming',u'http://feed43.com/0755006465351035.xml'),
+            (u'Gaming',u'http://feed43.com/6537162612465672.xml'),
                           ]
--- a/recipes/ivanamilakovic.recipe
+++ b/recipes/ivanamilakovic.recipe
@ -0,0 +1,43 @@
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+
+__license__   = 'GPL v3'
+__copyright__ = '2012, Darko Miletic <darko.miletic at gmail.com>'
+'''
+ivanamilakovic.blogspot.com
+'''
+
+import re
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class IvanaMilakovic(BasicNewsRecipe):
+    # Recipe for the blog ivanamilakovic.blogspot.com, built from the single
+    # posts feed listed in ``feeds`` below.
+    title                 = u'Ivana Milaković'
+    __author__            = 'Darko Miletic'
+    description           = u'Hronika mačijeg škrabala - priče, inspiracija, knjige, pisanje, prevodi...'
+    oldest_article        = 80
+    max_articles_per_feed = 100
+    language              = 'sr'
+    encoding              = 'utf-8'
+    no_stylesheets        = True
+    use_embedded_content  = True
+    publication_type      = 'blog'
+    # CSS added to the output: a sans-serif font stack (with a font file
+    # referenced by a res:// path as the "sans1" fallback) and a thin border
+    # around images.
+    extra_css             = """
+                               @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}
+                               body{font-family: Arial,Tahoma,Helvetica,FreeSans,sans1,sans-serif}
+                               img{margin-bottom: 0.8em; border: 1px solid #333333; padding: 4px }
+                            """
+
+    # Metadata handed to the conversion step; comment and language reuse the
+    # class attributes defined above.
+    conversion_options = {
+                          'comment'  : description
+                        , 'tags'     : 'knjige, blog, srbija, sf'
+                        , 'publisher': 'Ivana Milakovic'
+                        , 'language' : language
+                        }
+
+    # Replace U+0110 (LATIN CAPITAL LETTER D WITH STROKE) with U+00D0 (LATIN
+    # CAPITAL LETTER ETH) in the downloaded markup.
+    # NOTE(review): presumably a workaround for fonts lacking U+0110 - confirm.
+    preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
+
+    feeds = [(u'Posts', u'http://ivanamilakovic.blogspot.com/feeds/posts/default')]
+
+    def preprocess_html(self, soup):
+        # Drop every inline ``style`` attribute, then return the result of
+        # passing the soup through adeify_images().
+        for item in soup.findAll(style=True):
+            del item['style']
+        return self.adeify_images(soup)
--- a/recipes/klubknjige.recipe
+++ b/recipes/klubknjige.recipe
@ -0,0 +1,42 @@
+
+__license__   = 'GPL v3'
+__copyright__ = '2012, Darko Miletic <darko.miletic at gmail.com>'
+'''
+klub-knjige.blogspot.com
+'''
+
+import re
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class KlubKnjige(BasicNewsRecipe):
+    # Recipe for the blog klub-knjige.blogspot.com, built from the single
+    # posts feed listed in ``feeds`` below.
+    title                 = 'Klub knjige'
+    __author__            = 'Darko Miletic'
+    description           = 'literarni blog'    
+    oldest_article        = 30
+    max_articles_per_feed = 100
+    language              = 'sr'
+    encoding              = 'utf-8'
+    no_stylesheets        = True
+    use_embedded_content  = True
+    publication_type      = 'blog'    
+    # CSS added to the output: a sans-serif font stack (with a font file
+    # referenced by a res:// path as the "sans1" fallback) and a thin border
+    # around images.
+    extra_css             = """ 
+                               @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} 
+                               body{font-family: Arial,Tahoma,Helvetica,FreeSans,sans1,sans-serif} 
+                               img{margin-bottom: 0.8em; border: 1px solid #333333; padding: 4px } 
+                            """
+
+    # Metadata handed to the conversion step; comment and language reuse the
+    # class attributes defined above.
+    conversion_options = {
+                          'comment'  : description
+                        , 'tags'     : 'knjige, blog, srbija, sf'
+                        , 'publisher': 'Klub Knjige'
+                        , 'language' : language
+                        }
+
+    # Replace U+0110 (LATIN CAPITAL LETTER D WITH STROKE) with U+00D0 (LATIN
+    # CAPITAL LETTER ETH) in the downloaded markup.
+    # NOTE(review): presumably a workaround for fonts lacking U+0110 - confirm.
+    preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
+
+    feeds = [(u'Posts', u'http://klub-knjige.blogspot.com/feeds/posts/default')]
+
+    def preprocess_html(self, soup):
+        # Drop every inline ``style`` attribute, then return the result of
+        # passing the soup through adeify_images().
+        for item in soup.findAll(style=True):
+            del item['style']
+        return self.adeify_images(soup)
--- a/recipes/le_monde.recipe
+++ b/recipes/le_monde.recipe
@ -3,7 +3,6 @@ __copyright__ = '2011'
 '''
 lemonde.fr
 '''
-import re
 from calibre.web.feeds.recipes import BasicNewsRecipe

 class LeMonde(BasicNewsRecipe):
@ -41,77 +40,8 @@ class LeMonde(BasicNewsRecipe):

    remove_empty_feeds = True

-    filterDuplicates = True
+    auto_cleanup = True

-    def preprocess_html(self, soup):
-        for alink in soup.findAll('a'):
-            if alink.string is not None:
-               tstr = alink.string
-               alink.replaceWith(tstr)
-        return self.adeify_images(soup)
-
-    preprocess_regexps = [
-        (re.compile(r'([0-9])%'), lambda m: m.group(1) + '&nbsp;%'),
-        (re.compile(r'([0-9])([0-9])([0-9]) ([0-9])([0-9])([0-9])'), lambda m: m.group(1) + m.group(2) + m.group(3) + '&nbsp;' + m.group(4) + m.group(5) + m.group(6)),
-        (re.compile(r'([0-9]) ([0-9])([0-9])([0-9])'), lambda m: m.group(1) + '&nbsp;' + m.group(2) + m.group(3) + m.group(4)),
-        (re.compile(r'<span>'), lambda match: ' <span>'),
-        (re.compile(r'\("'), lambda match: '(&laquo;&nbsp;'),
-        (re.compile(r'"\)'), lambda match: '&nbsp;&raquo;)'),
-        (re.compile(r'&ldquo;'), lambda match: '(&laquo;&nbsp;'),
-        (re.compile(r'&rdquo;'), lambda match: '&nbsp;&raquo;)'),
-        (re.compile(r'>\''), lambda match: '>&lsquo;'),
-        (re.compile(r' \''), lambda match: ' &lsquo;'),
-        (re.compile(r'\''), lambda match: '&rsquo;'),
-        (re.compile(r'"<em>'), lambda match: '<em>&laquo;&nbsp;'),
-        (re.compile(r'"<em>"</em><em>'), lambda match: '<em>&laquo;&nbsp;'),
-        (re.compile(r'"<a href='), lambda match: '&laquo;&nbsp;<a href='),
-        (re.compile(r'</em>"'), lambda match: '&nbsp;&raquo;</em>'),
-        (re.compile(r'</a>"'), lambda match: '&nbsp;&raquo;</a>'),
-        (re.compile(r'"</'), lambda match: '&nbsp;&raquo;</'),
-        (re.compile(r'>"'), lambda match: '>&laquo;&nbsp;'),
-        (re.compile(r'"<'), lambda match: '&nbsp;&raquo;<'),
-        (re.compile(r'&rsquo;"'), lambda match: '&rsquo;«&nbsp;'),
-        (re.compile(r' "'), lambda match: ' &laquo;&nbsp;'),
-        (re.compile(r'" '), lambda match: '&nbsp;&raquo; '),
-        (re.compile(r'"\.'), lambda match: '&nbsp;&raquo;.'),
-        (re.compile(r'",'), lambda match: '&nbsp;&raquo;,'),
-        (re.compile(r'"\?'), lambda match: '&nbsp;&raquo;?'),
-        (re.compile(r'":'), lambda match: '&nbsp;&raquo;:'),
-        (re.compile(r'";'), lambda match: '&nbsp;&raquo;;'),
-        (re.compile(r'"\!'), lambda match: '&nbsp;&raquo;!'),
-        (re.compile(r' :'), lambda match: '&nbsp;:'),
-        (re.compile(r' ;'), lambda match: '&nbsp;;'),
-        (re.compile(r' \?'), lambda match: '&nbsp;?'),
-        (re.compile(r' \!'), lambda match: '&nbsp;!'),
-        (re.compile(r'\s»'), lambda match: '&nbsp;»'),
-        (re.compile(r'«\s'), lambda match: '«&nbsp;'),
-        (re.compile(r' %'), lambda match: '&nbsp;%'),
-        (re.compile(r'\.jpg&nbsp;&raquo; border='), lambda match: '.jpg'),
-        (re.compile(r'\.png&nbsp;&raquo; border='), lambda match: '.png'),
-        (re.compile(r' &ndash; '), lambda match: '&nbsp;&ndash; '),
-        (re.compile(r' – '), lambda match: '&nbsp;&ndash; '),
-        (re.compile(r' - '), lambda match: '&nbsp;&ndash; '),
-        (re.compile(r' -,'), lambda match: '&nbsp;&ndash;,'),
-        (re.compile(r'&raquo;:'), lambda match: '&raquo;&nbsp;:'),
-        ]
-
-
-    keep_only_tags    = [
-                       dict(name='div', attrs={'class':['contenu']})
-                        ]
-    remove_tags = [dict(name='div', attrs={'class':['LM_atome']})]
-    remove_tags_after = [dict(id='appel_temoignage')]
-
-    def get_article_url(self, article):
-          url = article.get('guid', None)
-          if '/chat/' in url or '.blog' in url or '/video/' in url or '/sport/' in url or '/portfolio/' in url or '/visuel/' in url :
-              url = None
-          return url
-
-#    def get_article_url(self, article):
-#        link = article.get('link')
-#        if 'blog' not in link and ('chat' not in link):
-#             return link

    feeds          = [
                      ('A la une', 'http://www.lemonde.fr/rss/une.xml'),
@ -137,3 +67,10 @@ class LeMonde(BasicNewsRecipe):

        return cover_url

+    def get_article_url(self, article):
+        '''
+        Return the URL of ``article`` taken from its ``guid`` entry, or None
+        for entries that are not wanted.
+
+        None is returned when the entry has no guid, or when the URL points
+        at a chat, blog, video, sport, portfolio or visuel page.
+        '''
+        url = article.get('guid', None)
+        if url is None:
+            # Without this guard the substring tests below raise TypeError
+            # for feed entries that carry no guid.
+            return None
+        unwanted = ('/chat/', '.blog', '/video/', '/sport/', '/portfolio/',
+                '/visuel/')
+        if any(marker in url for marker in unwanted):
+            return None
+        return url
+
+
--- a/setup/installer/windows/freeze.py
+++ b/setup/installer/windows/freeze.py
@ -14,7 +14,7 @@ from setup.build_environment import msvc, MT, RC
 from setup.installer.windows.wix import WixMixIn

 OPENSSL_DIR = r'Q:\openssl'
-QT_DIR = 'Q:\\Qt\\4.7.3'
+QT_DIR = 'Q:\\Qt\\4.8.0'
 QT_DLLS = ['Core', 'Gui', 'Network', 'Svg', 'WebKit', 'Xml', 'XmlPatterns']
 LIBUNRAR         = 'C:\\Program Files\\UnrarDLL\\unrar.dll'
 SW               = r'C:\cygwin\home\kovid\sw'
--- a/setup/installer/windows/notes.rst
+++ b/setup/installer/windows/notes.rst
@ -97,7 +97,9 @@ Now, run configure and make::

 -no-plugin-manifests is needed so that loading the plugins does not fail looking for the CRT assembly

-    configure -opensource -release -qt-zlib -qt-gif -qt-libmng -qt-libpng -qt-libtiff -qt-libjpeg -release -platform win32-msvc2008 -no-qt3support -webkit -xmlpatterns -no-phonon -no-style-plastique -no-style-cleanlooks -no-style-motif -no-style-cde -no-declarative -no-scripttools -no-audio-backend -no-multimedia -no-dbus -no-openvg -no-opengl -no-qt3support -confirm-license -nomake examples -nomake demos -nomake docs -no-plugin-manifests -openssl -I Q:\openssl\include -L Q:\openssl\lib && nmake
+    configure -opensource -release -qt-zlib -qt-libmng -qt-libpng -qt-libtiff -qt-libjpeg -release -platform win32-msvc2008 -no-qt3support -webkit -xmlpatterns -no-phonon -no-style-plastique -no-style-cleanlooks -no-style-motif -no-style-cde -no-declarative -no-scripttools -no-audio-backend -no-multimedia -no-dbus -no-openvg -no-opengl -no-qt3support -confirm-license -nomake examples -nomake demos -nomake docs -no-plugin-manifests -openssl -I Q:\openssl\include -L Q:\openssl\lib && nmake
+
+Add the path to the bin folder inside the Qt dir to your system PATH.

 SIP
 -----
--- a/src/calibre/init.py
+++ b/src/calibre/init.py
@ -381,12 +381,15 @@ def browser(honor_time=True, max_time=2, mobile_browser=False, user_agent=None):
        user_agent = USER_AGENT_MOBILE if mobile_browser else USER_AGENT
    opener.addheaders = [('User-agent', user_agent)]
    proxies = get_proxies()
+    to_add = {}
    http_proxy = proxies.get('http', None)
    if http_proxy:
-        opener.set_proxies({'http':http_proxy})
+        to_add['http'] = http_proxy
    https_proxy = proxies.get('https', None)
    if https_proxy:
-        opener.set_proxies({'https':https_proxy})
+        to_add['https'] = https_proxy
+    if to_add:
+        opener.set_proxies(to_add)

    return opener

--- a/src/calibre/customize/builtins.py
+++ b/src/calibre/customize/builtins.py
@ -625,7 +625,8 @@ from calibre.devices.eb600.driver import (EB600, COOL_ER, SHINEBOOK,
                POCKETBOOK701, POCKETBOOK360P, PI2)
 from calibre.devices.iliad.driver import ILIAD
 from calibre.devices.irexdr.driver import IREXDR1000, IREXDR800
-from calibre.devices.jetbook.driver import JETBOOK, MIBUK, JETBOOK_MINI
+from calibre.devices.jetbook.driver import (JETBOOK, MIBUK, JETBOOK_MINI,
+        JETBOOK_COLOR)
 from calibre.devices.kindle.driver import (KINDLE, KINDLE2, KINDLE_DX,
        KINDLE_FIRE)
 from calibre.devices.nook.driver import NOOK, NOOK_COLOR
@ -664,9 +665,7 @@ plugins += [
    ILIAD,
    IREXDR1000,
    IREXDR800,
-    JETBOOK,
-    JETBOOK_MINI,
-    MIBUK,
+    JETBOOK, JETBOOK_MINI, MIBUK, JETBOOK_COLOR,
    SHINEBOOK,
    POCKETBOOK360, POCKETBOOK301, POCKETBOOK602, POCKETBOOK701, POCKETBOOK360P,
    PI2,
--- a/src/calibre/debug.py
+++ b/src/calibre/debug.py
@ -234,7 +234,7 @@ def main(args=sys.argv):
            sql_dump = args[-1]
        reinit_db(opts.reinitialize_db, sql_dump=sql_dump)
    elif opts.inspect_mobi:
-        from calibre.ebooks.mobi.debug import inspect_mobi
+        from calibre.ebooks.mobi.debug.main import inspect_mobi
        for path in args[1:]:
            prints('Inspecting:', path)
            inspect_mobi(path)
--- a/src/calibre/devices/jetbook/driver.py
+++ b/src/calibre/devices/jetbook/driver.py
@ -125,4 +125,29 @@ class JETBOOK_MINI(USBMS):

    SUPPORTS_SUB_DIRS = True

+class JETBOOK_COLOR(USBMS):
+
+    # The string below is the class docstring; it records a captured USB
+    # device description tuple for this reader. Its first three values are
+    # the same numbers used for VENDOR_ID, PRODUCT_ID and BCD further down.
+    '''
+set([(u'0x951',
+      u'0x160b',
+      u'0x0',
+      u'Freescale',
+      u'Mass Storage Device',
+      u'0802270905553')])
+    '''
+
+    # File extensions listed as supported for this device.
+    FORMATS = ['epub', 'mobi', 'prc', 'fb2', 'rtf', 'txt', 'pdf', 'djvu']
+
+    gui_name = 'JetBook Color'
+    name = 'JetBook Color Device Interface'
+    description    = _('Communicate with the JetBook Color reader.')
+    author         = 'Kovid Goyal'
+
+    # USB identifiers used to match the device.
+    VENDOR_ID = [0x951]
+    PRODUCT_ID = [0x160b]
+    BCD = [0x0]
+    # NOTE(review): presumably the folder on the device that books are sent
+    # to - confirm against the USBMS base class.
+    EBOOK_DIR_MAIN = 'My Books'
+
+    SUPPORTS_SUB_DIRS = True
+

--- a/src/calibre/devices/prs505/driver.py
+++ b/src/calibre/devices/prs505/driver.py
@ -27,7 +27,7 @@ class PRS505(USBMS):
    booklist_class = CollectionsBookList


-    FORMATS      = ['epub', 'lrf', 'lrx', 'rtf', 'pdf', 'txt']
+    FORMATS      = ['epub', 'lrf', 'lrx', 'rtf', 'pdf', 'txt', 'zbf']
    CAN_SET_METADATA = ['title', 'authors', 'collections']
    CAN_DO_DEVICE_DB_PLUGBOARD = True

--- a/src/calibre/ebooks/conversion/plugins/epub_output.py
+++ b/src/calibre/ebooks/conversion/plugins/epub_output.py
@ -190,12 +190,22 @@ class EPUBOutput(OutputFormatPlugin):
            if x.get(OPF('scheme'), None).lower() == 'uuid' or unicode(x).startswith('urn:uuid:'):
                uuid = unicode(x).split(':')[-1]
                break
+        encrypted_fonts = getattr(input_plugin, 'encrypted_fonts', [])
+
        if uuid is None:
            self.log.warn('No UUID identifier found')
            from uuid import uuid4
            uuid = str(uuid4())
            oeb.metadata.add('identifier', uuid, scheme='uuid', id=uuid)

+        if encrypted_fonts and not uuid.startswith('urn:uuid:'):
+            # Apparently ADE requires this value to start with urn:uuid:
+            # for some absurd reason, or it will throw a hissy fit and refuse
+            # to use the obfuscated fonts.
+            for x in identifiers:
+                if unicode(x) == uuid:
+                    x.content = 'urn:uuid:'+uuid
+
        with TemporaryDirectory(u'_epub_output') as tdir:
            from calibre.customize.ui import plugin_for_output_format
            metadata_xml = None
@ -210,7 +220,6 @@ class EPUBOutput(OutputFormatPlugin):
            opf = [x for x in os.listdir(tdir) if x.endswith('.opf')][0]
            self.condense_ncx([os.path.join(tdir, x) for x in os.listdir(tdir)\
                    if x.endswith('.ncx')][0])
-            encrypted_fonts = getattr(input_plugin, 'encrypted_fonts', [])
            encryption = None
            if encrypted_fonts:
                encryption = self.encrypt_fonts(encrypted_fonts, tdir, uuid)
--- a/src/calibre/ebooks/conversion/plugins/mobi_input.py
+++ b/src/calibre/ebooks/conversion/plugins/mobi_input.py
@ -59,7 +59,10 @@ class MOBIInput(InputFormatPlugin):
        if mr.kf8_type is not None:
            log('Found KF8 MOBI of type %r'%mr.kf8_type)
            from calibre.ebooks.mobi.reader.mobi8 import Mobi8Reader
-            return os.path.abspath(Mobi8Reader(mr, log)())
+            mr = Mobi8Reader(mr, log)
+            opf = os.path.abspath(mr())
+            self.encrypted_fonts = mr.encrypted_fonts
+            return opf

        raw = parse_cache.pop('calibre_raw_mobi_markup', False)
        if raw:
--- a/src/calibre/ebooks/conversion/plugins/mobi_output.py
+++ b/src/calibre/ebooks/conversion/plugins/mobi_output.py
@ -179,7 +179,7 @@ class MOBIOutput(OutputFormatPlugin):
        writer(oeb, output_path)

        if opts.extract_to is not None:
-            from calibre.ebooks.mobi.debug import inspect_mobi
+            from calibre.ebooks.mobi.debug.main import inspect_mobi
            ddir = opts.extract_to
            inspect_mobi(output_path, ddir=ddir)

--- a/src/calibre/ebooks/mobi/debug.py
+++ b/src/calibre/ebooks/mobi/debug.py
--- a/src/calibre/ebooks/mobi/debug/init.py
+++ b/src/calibre/ebooks/mobi/debug/init.py
@ -0,0 +1,16 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__   = 'GPL v3'
+__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+
+def format_bytes(byts):
+    '''
+    Render ``byts`` as space separated lowercase hexadecimal numbers, one
+    per byte and without zero padding (for example ``1 a ff``).
+    '''
+    return ' '.join('%x' % octet for octet in bytearray(byts))
+
+
--- a/src/calibre/ebooks/mobi/debug/headers.py
+++ b/src/calibre/ebooks/mobi/debug/headers.py
@ -0,0 +1,535 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__   = 'GPL v3'
+__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import struct, datetime, os
+
+from calibre.utils.date import utc_tz
+from calibre.ebooks.mobi.reader.headers import NULL_INDEX
+from calibre.ebooks.mobi.langcodes import main_language, sub_language
+from calibre.ebooks.mobi.debug import format_bytes
+from calibre.ebooks.mobi.utils import get_trailing_data
+
+# PalmDB {{{
+class PalmDOCAttributes(object):
+    '''
+    Decode the two byte attributes field of a PalmDB header into named flags.
+
+    ``raw`` must be exactly the two bytes of the field. The decoded integer
+    is kept in ``val`` and ``attributes`` holds one :class:`Attr` per entry
+    of ``FLAGS``.
+    '''
+
+    class Attr(object):
+        ''' One named flag; ``val`` is non-zero when the flag is set. '''
+
+        def __init__(self, name, field, val):
+            self.name = name
+            self.val = val & field
+
+        def __str__(self):
+            return '%s: %s'%(self.name, bool(self.val))
+
+    # Every flag is a distinct single bit. The last two masks were previously
+    # 0x12 and 0x14, which are not single bits (0x10|0x02 and 0x10|0x04): they
+    # were reported as set whenever one of the other flags was, and never
+    # tested the intended bits 0x20 and 0x40.
+    FLAGS = (
+            ('Read Only', 0x02),
+            ('Dirty AppInfoArea', 0x04),
+            ('Backup this database', 0x08),
+            ('Okay to install newer over existing copy, if present on PalmPilot', 0x10),
+            ('Force the PalmPilot to reset after this database is installed', 0x20),
+            ('Don\'t allow copy of file to be beamed to other Pilot', 0x40),
+    )
+
+    def __init__(self, raw):
+        # NOTE(review): this field is unpacked little endian while the other
+        # PalmDB header fields are unpacked big endian ('>') - confirm which
+        # byte order is intended.
+        self.val = struct.unpack(b'<H', raw)[0]
+        self.attributes = [PalmDOCAttributes.Attr(name, field, self.val)
+                for name, field in self.FLAGS]
+
+    def __str__(self):
+        attrs = '\n\t'.join([str(x) for x in self.attributes])
+        return 'PalmDOC Attributes: %s\n\t%s'%(bin(self.val), attrs)
+
+class PalmDB(object):
+    '''
+    Parse the first 78 bytes of ``raw`` as a PalmDB header and expose the
+    fields as attributes. ``str()`` of the object gives a multi-line dump.
+
+    Raises ValueError when ``raw`` starts with ``TPZ`` (a Topaz file) or
+    when the type+creator ident is neither ``BOOKMOBI`` nor ``TEXTREAD``.
+    '''
+
+    def __init__(self, raw):
+        self.raw = raw
+
+        if self.raw.startswith(b'TPZ'):
+            raise ValueError('This is a Topaz file')
+
+        # The name field is 32 bytes long; NUL bytes are stripped out
+        self.name     = self.raw[:32].replace(b'\x00', b'')
+        self.attributes = PalmDOCAttributes(self.raw[32:34])
+        self.version = struct.unpack(b'>H', self.raw[34:36])[0]
+
+        # The three date fields are added, as a number of seconds, to
+        # 1904-01-01 UTC. The undecoded integer is kept in the *_raw
+        # attributes so it can be shown next to the decoded date.
+        palm_epoch = datetime.datetime(1904, 1, 1, tzinfo=utc_tz)
+        self.creation_date_raw = struct.unpack(b'>I', self.raw[36:40])[0]
+        self.creation_date = (palm_epoch +
+                datetime.timedelta(seconds=self.creation_date_raw))
+        self.modification_date_raw = struct.unpack(b'>I', self.raw[40:44])[0]
+        self.modification_date = (palm_epoch +
+                datetime.timedelta(seconds=self.modification_date_raw))
+        self.last_backup_date_raw = struct.unpack(b'>I', self.raw[44:48])[0]
+        self.last_backup_date = (palm_epoch +
+                datetime.timedelta(seconds=self.last_backup_date_raw))
+        self.modification_number = struct.unpack(b'>I', self.raw[48:52])[0]
+        # These ids are kept as raw 4 byte strings, they are not decoded
+        self.app_info_id = self.raw[52:56]
+        self.sort_info_id = self.raw[56:60]
+        self.type = self.raw[60:64]
+        self.creator = self.raw[64:68]
+        # type + creator together identify the kind of book
+        self.ident = self.type + self.creator
+        if self.ident not in (b'BOOKMOBI', b'TEXTREAD'):
+            raise ValueError('Unknown book ident: %r'%self.ident)
+        self.last_record_uid, = struct.unpack(b'>I', self.raw[68:72])
+        self.next_rec_list_id = self.raw[72:76]
+
+        self.number_of_records, = struct.unpack(b'>H', self.raw[76:78])
+
+    def __str__(self):
+        # One line per header field, preceded by a banner line
+        ans = ['*'*20 + ' PalmDB Header '+ '*'*20]
+        ans.append('Name: %r'%self.name)
+        ans.append(str(self.attributes))
+        ans.append('Version: %s'%self.version)
+        ans.append('Creation date: %s (%s)'%(self.creation_date.isoformat(),
+            self.creation_date_raw))
+        ans.append('Modification date: %s (%s)'%(self.modification_date.isoformat(),
+            self.modification_date_raw))
+        ans.append('Backup date: %s (%s)'%(self.last_backup_date.isoformat(),
+            self.last_backup_date_raw))
+        ans.append('Modification number: %s'%self.modification_number)
+        ans.append('App Info ID: %r'%self.app_info_id)
+        ans.append('Sort Info ID: %r'%self.sort_info_id)
+        ans.append('Type: %r'%self.type)
+        ans.append('Creator: %r'%self.creator)
+        ans.append('Last record UID +1: %r'%self.last_record_uid)
+        ans.append('Next record list id: %r'%self.next_rec_list_id)
+        ans.append('Number of records: %s'%self.number_of_records)
+
+        return '\n'.join(ans)
+# }}}
+
+class Record(object): # {{{
+    '''
+    A single record: its raw bytes together with the (offset, flags, uid)
+    triple passed in as ``header``.
+    '''
+
+    def __init__(self, raw, header):
+        self.raw = raw
+        (self.offset, self.flags, self.uid) = header
+
+    @property
+    def header(self):
+        ''' One line summary of the record, for debug output. '''
+        template = 'Offset: %d Flags: %d UID: %d First 4 bytes: %r Size: %d'
+        fields = (self.offset, self.flags, self.uid, self.raw[:4],
+                len(self.raw))
+        return template % fields
+# }}}
+
+# EXTH {{{
+class EXTHRecord(object):
+    '''
+    A single EXTH record: the numeric ``type``, a human readable ``name``
+    for it and the record ``data``.
+
+    Types missing from the table below get ``repr(type)`` as their name.
+    The data of the records listed at the end of ``__init__`` is decoded as
+    a big endian unsigned 32 bit integer; for all other records it is left
+    as the raw bytes that were passed in.
+    '''
+
+    def __init__(self, type_, data):
+        self.type = type_
+        self.data = data
+        self.name = {
+                1 : 'DRM Server id',
+                2 : 'DRM Commerce id',
+                3 : 'DRM ebookbase book id',
+                100 : 'author',
+                101 : 'publisher',
+                102 : 'imprint',
+                103 : 'description',
+                104 : 'isbn',
+                105 : 'subject',
+                106 : 'publishingdate',
+                107 : 'review',
+                108 : 'contributor',
+                109 : 'rights',
+                110 : 'subjectcode',
+                111 : 'type',
+                112 : 'source',
+                113 : 'asin',
+                114 : 'versionnumber',
+                115 : 'sample',
+                116 : 'startreading',
+                117 : 'adult',
+                118 : 'retailprice',
+                119 : 'retailpricecurrency',
+                121 : 'KF8 header section index',
+                125 : 'KF8 resources (images/fonts) count',
+                129 : 'KF8 cover URI',
+                131 : 'KF8 unknown count',
+                201 : 'coveroffset',
+                202 : 'thumboffset',
+                203 : 'hasfakecover',
+                204 : 'Creator Software',
+                205 : 'Creator Major Version', # '>I'
+                206 : 'Creator Minor Version', # '>I'
+                207 : 'Creator Build Number', # '>I'
+                208 : 'watermark',
+                209 : 'tamper_proof_keys',
+                300 : 'fontsignature',
+                301 : 'clippinglimit', # percentage '>B'
+                402 : 'publisherlimit',
+                404 : 'TTS flag', # '>B' 1 - TTS disabled 0 - TTS enabled
+                501 : 'cdetype', # 4 chars (PDOC or EBOK)
+                502 : 'lastupdatetime',
+                503 : 'updatedtitle',
+        }.get(self.type, repr(self.type))
+
+        # These records hold a single 32 bit integer. struct.unpack raises
+        # struct.error if the data is not exactly four bytes long.
+        if (self.name in {'coveroffset', 'thumboffset', 'hasfakecover',
+                'Creator Major Version', 'Creator Minor Version',
+                'Creator Build Number', 'Creator Software', 'startreading'} or
+                self.type in {121, 125, 131}):
+            self.data, = struct.unpack(b'>I', self.data)
+
+    def __str__(self):
+        return '%s (%d): %r'%(self.name, self.type, self.data)
+
+class EXTHHeader(object):
+    '''
+    Parse an EXTH header from ``raw``, which must begin with the bytes
+    ``EXTH`` (ValueError is raised otherwise).
+
+    The parsed records are available sorted by type in ``records`` and
+    indexed by type in ``rmap``. ``header[type_]`` and ``header.get(type_)``
+    return the data of the record with that type.
+    '''
+
+    def __init__(self, raw):
+        self.raw = raw
+        if not self.raw.startswith(b'EXTH'):
+            raise ValueError('EXTH header does not start with EXTH')
+        self.length, = struct.unpack(b'>I', self.raw[4:8])
+        self.count,  = struct.unpack(b'>I', self.raw[8:12])
+
+        # Records start right after the 12 byte identifier/length/count
+        pos = 12
+        self.records = []
+        for i in xrange(self.count):
+            pos = self.read_record(pos)
+        self.records.sort(key=lambda x:x.type)
+        # If a type occurs more than once, the last such record in the
+        # sorted list is the one kept in the map
+        self.rmap = {x.type:x for x in self.records}
+
+    def __getitem__(self, type_):
+        # Raises KeyError if there is no record of this type
+        return self.rmap.__getitem__(type_).data
+
+    def get(self, type_, default=None):
+        # Return the data of the record of this type, or ``default`` if
+        # there is no such record
+        ans = self.rmap.get(type_, default)
+        return getattr(ans, 'data', default)
+
+    def read_record(self, pos):
+        # Parse the record starting at ``pos`` and return the position of
+        # the next one. The record length read here covers the 8 bytes of
+        # type and length as well as the data.
+        type_, length = struct.unpack(b'>II', self.raw[pos:pos+8])
+        data = self.raw[(pos+8):(pos+length)]
+        self.records.append(EXTHRecord(type_, data))
+        return pos + length
+
+    @property
+    def kf8_header_index(self):
+        # Data of the record of type 121 ('KF8 header section index'), or
+        # None if it is absent
+        return self.get(121, None)
+
+    def __str__(self):
+        ans = ['*'*20 + ' EXTH Header '+ '*'*20]
+        ans.append('EXTH header length: %d'%self.length)
+        ans.append('Number of EXTH records: %d'%self.count)
+        ans.append('EXTH records...')
+        for r in self.records:
+            ans.append(str(r))
+        return '\n'.join(ans)
+# }}}
+
+class MOBIHeader(object): # {{{
+
+    def __init__(self, record0, offset):
+        self.raw = record0.raw
+        self.header_offset = offset
+
+        self.compression_raw = self.raw[:2]
+        self.compression = {1: 'No compression', 2: 'PalmDoc compression',
+                17480: 'HUFF/CDIC compression'}.get(struct.unpack(b'>H',
+                    self.compression_raw)[0],
+                    repr(self.compression_raw))
+        self.unused = self.raw[2:4]
+        self.text_length, = struct.unpack(b'>I', self.raw[4:8])
+        self.number_of_text_records, self.text_record_size = \
+                struct.unpack(b'>HH', self.raw[8:12])
+        self.encryption_type_raw, = struct.unpack(b'>H', self.raw[12:14])
+        self.encryption_type = {
+                0: 'No encryption',
+                1: 'Old mobipocket encryption',
+                2: 'Mobipocket encryption'
+            }.get(self.encryption_type_raw, repr(self.encryption_type_raw))
+        self.unknown = self.raw[14:16]
+
+        self.identifier = self.raw[16:20]
+        if self.identifier != b'MOBI':
+            raise ValueError('Identifier %r unknown'%self.identifier)
+
+        self.length, = struct.unpack(b'>I', self.raw[20:24])
+        self.type_raw, = struct.unpack(b'>I', self.raw[24:28])
+        self.type = {
+                2 : 'Mobipocket book',
+                3 : 'PalmDOC book',
+                4 : 'Audio',
+                257 : 'News',
+                258 : 'News Feed',
+                259 : 'News magazine',
+                513 : 'PICS',
+                514 : 'Word',
+                515 : 'XLS',
+                516 : 'PPT',
+                517 : 'TEXT',
+                518 : 'HTML',
+            }.get(self.type_raw, repr(self.type_raw))
+
+        self.encoding_raw, = struct.unpack(b'>I', self.raw[28:32])
+        self.encoding = {
+                1252 : 'cp1252',
+                65001: 'utf-8',
+            }.get(self.encoding_raw, repr(self.encoding_raw))
+        self.uid = self.raw[32:36]
+        self.file_version, = struct.unpack(b'>I', self.raw[36:40])
+        self.meta_orth_indx, self.meta_infl_indx = struct.unpack(
+                b'>II', self.raw[40:48])
+        self.secondary_index_record, = struct.unpack(b'>I', self.raw[48:52])
+        self.reserved = self.raw[52:80]
+        self.first_non_book_record, = struct.unpack(b'>I', self.raw[80:84])
+        self.fullname_offset, = struct.unpack(b'>I', self.raw[84:88])
+        self.fullname_length, = struct.unpack(b'>I', self.raw[88:92])
+        self.locale_raw, = struct.unpack(b'>I', self.raw[92:96])
+        langcode = self.locale_raw
+        langid    = langcode & 0xFF
+        sublangid = (langcode >> 10) & 0xFF
+        self.language = main_language.get(langid, 'ENGLISH')
+        self.sublanguage = sub_language.get(sublangid, 'NEUTRAL')
+
+        self.input_language = self.raw[96:100]
+        self.output_langauage = self.raw[100:104]
+        self.min_version, = struct.unpack(b'>I', self.raw[104:108])
+        self.first_image_index, = struct.unpack(b'>I', self.raw[108:112])
+        self.huffman_record_offset, = struct.unpack(b'>I', self.raw[112:116])
+        self.huffman_record_count, = struct.unpack(b'>I', self.raw[116:120])
+        self.datp_record_offset, = struct.unpack(b'>I', self.raw[120:124])
+        self.datp_record_count, = struct.unpack(b'>I', self.raw[124:128])
+        self.exth_flags, = struct.unpack(b'>I', self.raw[128:132])
+        self.has_exth = bool(self.exth_flags & 0x40)
+        self.has_drm_data = self.length >= 174 and len(self.raw) >= 180
+        if self.has_drm_data:
+            self.unknown3 = self.raw[132:164]
+            self.drm_offset, = struct.unpack(b'>I', self.raw[164:168])
+            self.drm_count, = struct.unpack(b'>I', self.raw[168:172])
+            self.drm_size, = struct.unpack(b'>I', self.raw[172:176])
+            self.drm_flags = bin(struct.unpack(b'>I', self.raw[176:180])[0])
+        self.has_extra_data_flags = self.length >= 232 and len(self.raw) >= 232+16
+        self.has_fcis_flis = False
+        self.has_multibytes = self.has_indexing_bytes = self.has_uncrossable_breaks = False
+        self.extra_data_flags = 0
+        if self.has_extra_data_flags:
+            self.unknown4 = self.raw[180:192]
+            self.fdst_idx, self.fdst_count = struct.unpack_from(b'>II',
+                    self.raw, 192)
+            (self.fcis_number, self.fcis_count, self.flis_number,
+                    self.flis_count) = struct.unpack(b'>IIII',
+                            self.raw[200:216])
+            self.unknown6 = self.raw[216:224]
+            self.srcs_record_index = struct.unpack(b'>I',
+                self.raw[224:228])[0]
+            self.num_srcs_records = struct.unpack(b'>I',
+                self.raw[228:232])[0]
+            self.unknown7 = self.raw[232:240]
+            self.extra_data_flags = struct.unpack(b'>I',
+                self.raw[240:244])[0]
+            self.has_multibytes = bool(self.extra_data_flags & 0b1)
+            self.has_indexing_bytes = bool(self.extra_data_flags & 0b10)
+            self.has_uncrossable_breaks = bool(self.extra_data_flags & 0b100)
+            self.primary_index_record, = struct.unpack(b'>I',
+                    self.raw[244:248])
+
+        if self.file_version >= 8:
+            (self.sect_idx, self.skel_idx, self.datp_idx, self.oth_idx
+                    ) = struct.unpack_from(b'>4L', self.raw, 248)
+            self.unknown9 = self.raw[264:self.length]
+            if self.meta_orth_indx != self.sect_idx:
+                raise ValueError('KF8 header has different Meta orth and '
+                        'section indices')
+
+        # The following are all relative to the position of the header record
+        # make them absolute for ease of debugging
+        for x in ('sect_idx', 'skel_idx', 'datp_idx', 'oth_idx',
+                'meta_orth_indx', 'huffman_record_offset',
+                'first_non_book_record', 'datp_record_offset', 'fcis_number',
+                'flis_number', 'primary_index_record', 'fdst_idx',
+                'first_image_index'):
+            if hasattr(self, x):
+                setattr(self, x, self.header_offset+getattr(self, x))
+
+        if self.has_exth:
+            self.exth_offset = 16 + self.length
+
+            self.exth = EXTHHeader(self.raw[self.exth_offset:])
+
+            self.end_of_exth = self.exth_offset + self.exth.length
+            self.bytes_after_exth = self.raw[self.end_of_exth:self.fullname_offset]
+
+    def __str__(self):
+        '''
+        Render the parsed header as a multi-line, human readable report: one
+        line per field, followed by the EXTH header (when present) and some
+        size statistics for record 0.
+        '''
+        lines = ['*'*20 + ' MOBI %d Header '%self.file_version+ '*'*20]
+        a = lines.append
+
+        def i(d, x):
+            # Index style fields are printed next to the null marker value
+            a('%s (null value: %d): %d'%(d, NULL_INDEX, x))
+
+        a('Compression: %s'%self.compression)
+        a('Unused: %r'%self.unused)
+        a('Number of text records: %d'%self.number_of_text_records)
+        a('Text record size: %d'%self.text_record_size)
+        a('Encryption: %s'%self.encryption_type)
+        a('Unknown: %r'%self.unknown)
+        a('Identifier: %r'%self.identifier)
+        a('Header length: %d'% self.length)
+        a('Type: %s'%self.type)
+        a('Encoding: %s'%self.encoding)
+        a('UID: %r'%self.uid)
+        a('File version: %d'%self.file_version)
+        i('Meta Orth Index (Sections index in KF8)', self.meta_orth_indx)
+        i('Meta Infl Index', self.meta_infl_indx)
+        a('Secondary index record: %d (null val: %d)'%(
+            self.secondary_index_record, NULL_INDEX))
+        a('Reserved: %r'%self.reserved)
+        a('First non-book record (null value: %d): %d'%(NULL_INDEX,
+            self.first_non_book_record))
+        a('Full name offset: %d'%self.fullname_offset)
+        a('Full name length: %d bytes'%self.fullname_length)
+        a('Langcode: %r'%self.locale_raw)
+        a('Language: %s'%self.language)
+        a('Sub language: %s'%self.sublanguage)
+        a('Input language: %r'%self.input_language)
+        # NOTE(review): the attribute really is spelled "output_langauage"
+        # here; presumably it matches the assignment in __init__ -- confirm
+        # before renaming
+        a('Output language: %r'%self.output_langauage)
+        a('Min version: %d'%self.min_version)
+        a('First Image index: %d'%self.first_image_index)
+        a('Huffman record offset: %d'%self.huffman_record_offset)
+        a('Huffman record count: %d'%self.huffman_record_count)
+        a('DATP record offset: %r'%self.datp_record_offset)
+        a('DATP record count: %r'%self.datp_record_count)
+        a('EXTH flags: %s (%s)'%(bin(self.exth_flags)[2:], self.has_exth))
+
+        if self.has_drm_data:
+            a('Unknown3: %r'%self.unknown3)
+            a('DRM Offset: %s'%self.drm_offset)
+            a('DRM Count: %s'%self.drm_count)
+            a('DRM Size: %s'%self.drm_size)
+            a('DRM Flags: %r'%self.drm_flags)
+
+        if self.has_extra_data_flags:
+            a('Unknown4: %r'%self.unknown4)
+            a('FDST Index: %d'% self.fdst_idx)
+            a('FDST Count: %d'% self.fdst_count)
+            a('FCIS number: %d'% self.fcis_number)
+            a('FCIS count: %d'% self.fcis_count)
+            a('FLIS number: %d'% self.flis_number)
+            a('FLIS count: %d'% self.flis_count)
+            a('Unknown6: %r'% self.unknown6)
+            a('SRCS record index: %d'%self.srcs_record_index)
+            a('Number of SRCS records?: %d'%self.num_srcs_records)
+            a('Unknown7: %r'%self.unknown7)
+            a(('Extra data flags: %s (has multibyte: %s) '
+                '(has indexing: %s) (has uncrossable breaks: %s)')%(
+                    bin(self.extra_data_flags), self.has_multibytes,
+                    self.has_indexing_bytes, self.has_uncrossable_breaks ))
+            a('Primary index record (null value: %d): %d'%(NULL_INDEX,
+                self.primary_index_record))
+
+        if self.file_version >= 8:
+            i('Sections Index', self.sect_idx)
+            i('SKEL Index', self.skel_idx)
+            i('DATP Index', self.datp_idx)
+            i('Other Index', self.oth_idx)
+            if self.unknown9:
+                a('Unknown9: %r'%self.unknown9)
+
+        # The field lines are newline separated, the trailing sections carry
+        # their own leading newlines
+        parts = ['\n'.join(lines)]
+
+        if self.has_exth:
+            parts.append('\n\n' + str(self.exth))
+            parts.append('\n\nBytes after EXTH (%d bytes): %s'%(
+                    len(self.bytes_after_exth),
+                    format_bytes(self.bytes_after_exth)))
+
+        trailing = len(self.raw) - (self.fullname_offset + self.fullname_length)
+        parts.append('\nNumber of bytes after full name: %d' % (trailing))
+        parts.append('\nRecord 0 length: %d'%len(self.raw))
+        return ''.join(parts)
+# }}}
+
+class MOBIFile(object):
+
+    '''
+    In-memory view of a MOBI file. The whole stream is read, the PalmDB
+    header is parsed from the first 78 bytes, the 8 byte record headers that
+    follow it are decoded and the file is cut up into records. The MOBI
+    header(s) are then parsed and a decompression function is selected for
+    the MOBI 6 and KF8 text (``decompress6`` and ``decompress8``).
+    '''
+
+    def __init__(self, stream):
+        self.raw = stream.read()
+        self.palmdb = PalmDB(self.raw[:78])
+        num_records = self.palmdb.number_of_records
+
+        # Record headers: 4 byte offset, 1 byte of flags and a 3 byte value
+        self.record_headers = []
+        self.records = []
+        for num in xrange(num_records):
+            pos = 78 + num * 8
+            offset, flags, hi, mid, lo = struct.unpack(b'>LBBBB',
+                    self.raw[pos:pos+8])
+            self.record_headers.append((offset, flags,
+                hi << 16 | mid << 8 | lo))
+
+        # A record runs up to the start of the next one, the last record
+        # runs to the end of the file
+        for num in xrange(num_records):
+            if num == num_records - 1:
+                stop = len(self.raw)
+            else:
+                stop = self.record_headers[num + 1][0]
+            start = self.record_headers[num][0]
+            self.records.append(Record(self.raw[start:stop],
+                self.record_headers[num]))
+
+        self.mobi_header = MOBIHeader(self.records[0], 0)
+        self.huffman_record_nums = []
+
+        mh = self.mobi_header
+        mh8 = mh
+        self.kf8_type = None
+        if mh.file_version >= 8:
+            self.kf8_type = 'standalone'
+        elif mh.has_exth and mh.exth.kf8_header_index is not None:
+            # Joint file: the KF8 header lives in its own record
+            self.kf8_type = 'joint'
+            kf8i = mh.exth.kf8_header_index
+            mh8 = MOBIHeader(self.records[kf8i], kf8i)
+        self.mobi8_header = mh8
+
+        compression = self.mobi_header.compression.lower()
+        if 'huff' in compression:
+            from calibre.ebooks.mobi.huffcdic import HuffReader
+
+            def huffit(off, cnt):
+                nums = list(xrange(off, off+cnt))
+                reader = HuffReader([self.records[r].raw for r in nums])
+                return nums, reader.unpack
+
+            recs6, d6 = huffit(mh.huffman_record_offset,
+                    mh.huffman_record_count)
+            if self.kf8_type == 'joint':
+                recs8, d8 = huffit(mh8.huffman_record_offset,
+                        mh8.huffman_record_count)
+                self.huffman_record_nums = recs6 + recs8
+            else:
+                self.huffman_record_nums = recs6
+                d8 = d6
+        elif 'palmdoc' in compression:
+            from calibre.ebooks.compression.palmdoc import decompress_doc
+            d6 = d8 = decompress_doc
+        else:
+            # No compression, hand the data back unchanged
+            d6 = d8 = lambda x: x
+
+        self.decompress6, self.decompress8 = d6, d8
+
+class TextRecord(object): # {{{
+
+    '''
+    A single text record: the decompressed text in ``raw`` and the trailing
+    data entries in ``trailing_data``. The three known trailing data types
+    are stored under descriptive names, the undecoded trailing bytes under
+    ``raw_bytes``.
+    '''
+
+    def __init__(self, idx, record, extra_data_flags, decompress):
+        trailing, payload = get_trailing_data(record.raw, extra_data_flags)
+        self.trailing_data = trailing
+        # Whatever follows the payload in the record is trailing data
+        leftover = record.raw[len(payload):]
+        self.raw = decompress(payload)
+
+        for num, name in ((0, 'multibyte_overlap'), (1, 'indexing'),
+                (2, 'uncrossable_breaks')):
+            if num in trailing:
+                trailing[name] = trailing.pop(num)
+        trailing['raw_bytes'] = leftover
+
+        # Anything still keyed by a number is a type we know nothing about
+        for typ, val in trailing.iteritems():
+            if isinstance(typ, int):
+                print ('Record %d has unknown trailing data of type: %d : %r'%
+                        (idx, typ, val))
+
+        self.idx = idx
+
+    def dump(self, folder):
+        '''
+        Write the text to ``<idx>.txt`` and the trailing data entries to
+        ``<idx>.trailing_data`` inside folder.
+        '''
+        base = os.path.join(folder, '%06d'%self.idx)
+        with open(base+'.txt', 'wb') as f:
+            f.write(self.raw)
+        with open(base+'.trailing_data', 'wb') as f:
+            for k, v in self.trailing_data.iteritems():
+                raw = '%s : %r\n\n'%(k, v)
+                f.write(raw.encode('utf-8'))
+
+# }}}
+
+
--- a/src/calibre/ebooks/mobi/debug/main.py
+++ b/src/calibre/ebooks/mobi/debug/main.py
@ -0,0 +1,48 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__   = 'GPL v3'
+__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import sys, os, shutil
+
+from calibre.ebooks.mobi.debug.headers import MOBIFile
+from calibre.ebooks.mobi.debug.mobi6 import inspect_mobi as inspect_mobi6
+from calibre.ebooks.mobi.debug.mobi8 import inspect_mobi as inspect_mobi8
+
+def inspect_mobi(path_or_stream, ddir=None): # {{{
+    '''
+    Decompile a MOBI file into a directory full of debugging data.
+
+    :param path_or_stream: Either the path to a MOBI file or an object with a
+        ``read()`` method that returns the contents of one.
+    :param ddir: Directory to write the debug data to. If it already exists
+        it is deleted first. Defaults to ``decompiled_<name>``, where name is
+        the base name of the input file without its extension.
+    '''
+    opened_here = not hasattr(path_or_stream, 'read')
+    stream = open(path_or_stream, 'rb') if opened_here else path_or_stream
+    try:
+        f = MOBIFile(stream)
+        if ddir is None:
+            # In-memory streams have no name, use a generic one for them
+            name = getattr(stream, 'name', None) or 'mobi'
+            ddir = 'decompiled_' + os.path.splitext(os.path.basename(name))[0]
+    finally:
+        # MOBIFile reads the complete stream in its constructor, so a file
+        # that was opened here is no longer needed. Streams passed in by the
+        # caller are left open.
+        if opened_here:
+            stream.close()
+
+    try:
+        shutil.rmtree(ddir)
+    except EnvironmentError:
+        # Best effort, most commonly the directory does not exist yet
+        pass
+    os.makedirs(ddir)
+    if f.kf8_type is None:
+        inspect_mobi6(f, ddir)
+    elif f.kf8_type == 'joint':
+        # Joint files get one sub-directory per contained book
+        p6 = os.path.join(ddir, 'mobi6')
+        os.mkdir(p6)
+        inspect_mobi6(f, p6)
+        p8 = os.path.join(ddir, 'mobi8')
+        os.mkdir(p8)
+        inspect_mobi8(f, p8)
+    else:
+        inspect_mobi8(f, ddir)
+
+    print ('Debug data saved to:', ddir)
+
+# }}}
+
+def main():
+    '''
+    Command line entry point: inspect the MOBI file whose path is given as
+    the first command line argument.
+    '''
+    if len(sys.argv) < 2:
+        # Exit with a usage message instead of an IndexError traceback
+        raise SystemExit('Usage: main.py /path/to/mobi_file')
+    inspect_mobi(sys.argv[1])
+
+if __name__ == '__main__':
+    main()
+
--- a/src/calibre/ebooks/mobi/debug/mobi6.py
+++ b/src/calibre/ebooks/mobi/debug/mobi6.py
@ -0,0 +1,839 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__   = 'GPL v3'
+__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import struct, sys, os
+from collections import OrderedDict, defaultdict
+
+from lxml import html
+
+from calibre.ebooks.mobi.reader.headers import NULL_INDEX
+from calibre.ebooks.mobi.reader.index import (parse_index_record,
+        parse_tagx_section)
+from calibre.ebooks.mobi.utils import (decode_hex_number, decint,
+        decode_tbs, read_font_record)
+from calibre.utils.magick.draw import identify_data
+from calibre.ebooks.mobi.debug import format_bytes
+from calibre.ebooks.mobi.debug.headers import TextRecord
+
+
+class TagX(object): # {{{
+
+    '''
+    Holds the four fields of one TAGX entry: tag, num_values, bitmask and
+    eof. ``is_eof`` is True for an entry that has eof set to 1 and all other
+    fields set to zero.
+    '''
+
+    def __init__(self, tag, num_values, bitmask, eof):
+        self.tag = tag
+        # The count is available under both names
+        self.num_values = self.num_of_values = num_values
+        self.bitmask = bitmask
+        self.eof = eof
+        self.is_eof = (eof, tag, num_values, bitmask) == (1, 0, 0, 0)
+
+    def __repr__(self):
+        fields = (self.tag, self.num_values, bin(self.bitmask), self.eof)
+        return 'TAGX(tag=%02d, num_values=%d, bitmask=%r, eof=%d)' % fields
+    # }}}
+
+class SecondaryIndexHeader(object): # {{{
+
+    '''
+    Parses the header of the secondary index: the INDX header fields, the
+    TAGX section that follows them and the IDXT block at ``idxt_start``.
+    ValueError is raised if any of the structural checks fail.
+    '''
+
+    def __init__(self, record):
+        self.record = record
+        raw = self.record.raw
+        #open('/t/index_header.bin', 'wb').write(raw)
+        if raw[:4] != b'INDX':
+            raise ValueError('Invalid Secondary Index Record')
+
+        # All struct formats are byte strings. This module uses
+        # unicode_literals and unicode format strings are not reliably
+        # accepted by the struct module of older python 2 releases.
+        def u32(offset):
+            return struct.unpack_from(b'>I', raw, offset)[0]
+
+        self.header_length = u32(4)
+        self.unknown1 = raw[8:16]
+        self.index_type = u32(16)
+        self.index_type_desc = {0: 'normal', 2:
+                'inflection', 6: 'calibre'}.get(self.index_type, 'unknown')
+        self.idxt_start = u32(20)
+        self.index_count = u32(24)
+        self.index_encoding_num = u32(28)
+        self.index_encoding = {65001: 'utf-8', 1252:
+                'cp1252'}.get(self.index_encoding_num, 'unknown')
+        if self.index_encoding == 'unknown':
+            raise ValueError(
+                'Unknown index encoding: %d'%self.index_encoding_num)
+        self.unknown2 = raw[32:36]
+        self.num_index_entries = u32(36)
+        self.ordt_start = u32(40)
+        self.ligt_start = u32(44)
+        self.num_of_ligt_entries = u32(48)
+        self.num_of_cncx_blocks = u32(52)
+        self.unknown3 = raw[56:180]
+        self.tagx_offset = u32(180)
+        if self.tagx_offset != self.header_length:
+            raise ValueError('TAGX offset and header length disagree')
+        self.unknown4 = raw[184:self.header_length]
+
+        tagx = raw[self.header_length:]
+        if not tagx.startswith(b'TAGX'):
+            raise ValueError('Invalid TAGX section')
+        (self.tagx_header_length,
+                self.tagx_control_byte_count) = struct.unpack_from(b'>II',
+                        tagx, 4)
+        self.tagx_entries = [TagX(*x) for x in parse_tagx_section(tagx)[1]]
+        if self.tagx_entries and not self.tagx_entries[-1].is_eof:
+            raise ValueError('TAGX last entry is not EOF')
+
+        # The TAGX section is followed by a length prefixed identifier and a
+        # two byte count
+        idxt0_pos = self.header_length+self.tagx_header_length
+        num = ord(raw[idxt0_pos])
+        count_pos = idxt0_pos+1+num
+        self.last_entry = raw[idxt0_pos+1:count_pos]
+        self.ncx_count, = struct.unpack(b'>H', raw[count_pos:count_pos+2])
+
+        # There may be some alignment zero bytes between the end of the idxt0
+        # and self.idxt_start
+        idxt = raw[self.idxt_start:]
+        if idxt[:4] != b'IDXT':
+            raise ValueError('Invalid IDXT header')
+        length_check, = struct.unpack(b'>H', idxt[4:6])
+        if length_check != self.header_length + self.tagx_header_length:
+            raise ValueError('Length check failed')
+        if idxt[6:].replace(b'\0', b''):
+            raise ValueError('Non null trailing bytes after IDXT')
+
+
+    def __str__(self):
+        '''
+        Render all parsed fields, including the TAGX entries, as a multi-line
+        report.
+        '''
+        ans = ['*'*20 + ' Secondary Index Header '+ '*'*20]
+        a = ans.append
+        def u(w):
+            # Unknown byte ranges are shown raw, with their length and
+            # whether they consist entirely of null bytes
+            a('Unknown: %r (%d bytes) (All zeros: %r)'%(w,
+                len(w), not bool(w.replace(b'\0', b'')) ))
+
+        a('Header length: %d'%self.header_length)
+        u(self.unknown1)
+        a('Index Type: %s (%d)'%(self.index_type_desc, self.index_type))
+        a('Offset to IDXT start: %d'%self.idxt_start)
+        a('Number of index records: %d'%self.index_count)
+        a('Index encoding: %s (%d)'%(self.index_encoding,
+                self.index_encoding_num))
+        u(self.unknown2)
+        a('Number of index entries: %d'% self.num_index_entries)
+        a('ORDT start: %d'%self.ordt_start)
+        a('LIGT start: %d'%self.ligt_start)
+        a('Number of LIGT entries: %d'%self.num_of_ligt_entries)
+        a('Number of cncx blocks: %d'%self.num_of_cncx_blocks)
+        u(self.unknown3)
+        a('TAGX offset: %d'%self.tagx_offset)
+        u(self.unknown4)
+        a('\n\n')
+        a('*'*20 + ' TAGX Header (%d bytes)'%self.tagx_header_length+ '*'*20)
+        a('Header length: %d'%self.tagx_header_length)
+        a('Control byte count: %d'%self.tagx_control_byte_count)
+        for i in self.tagx_entries:
+            a('\t' + repr(i))
+        a('Index of last IndexEntry in secondary index record: %s'% self.last_entry)
+        a('Number of entries in the NCX: %d'% self.ncx_count)
+
+        return '\n'.join(ans)
+
+# }}}
+
+class IndexHeader(object): # {{{
+
+    '''
+    Parses the header of the primary index: the INDX header fields, the TAGX
+    section that follows them and the IDXT block at ``idxt_start``.
+    ValueError is raised if any of the structural checks fail.
+    '''
+
+    def __init__(self, record):
+        self.record = record
+        raw = self.record.raw
+        #open('/t/index_header.bin', 'wb').write(raw)
+        if raw[:4] != b'INDX':
+            raise ValueError('Invalid Primary Index Record')
+
+        # All struct formats are byte strings. This module uses
+        # unicode_literals and unicode format strings are not reliably
+        # accepted by the struct module of older python 2 releases.
+        def u32(offset):
+            return struct.unpack_from(b'>I', raw, offset)[0]
+
+        self.header_length = u32(4)
+        self.unknown1 = raw[8:12]
+        self.header_type = u32(12)
+        self.index_type = u32(16)
+        self.index_type_desc = {0: 'normal', 2:
+                'inflection', 6: 'calibre'}.get(self.index_type, 'unknown')
+        self.idxt_start = u32(20)
+        self.index_count = u32(24)
+        self.index_encoding_num = u32(28)
+        self.index_encoding = {65001: 'utf-8', 1252:
+                'cp1252'}.get(self.index_encoding_num, 'unknown')
+        if self.index_encoding == 'unknown':
+            raise ValueError(
+                'Unknown index encoding: %d'%self.index_encoding_num)
+        self.possibly_language = raw[32:36]
+        self.num_index_entries = u32(36)
+        self.ordt_start = u32(40)
+        self.ligt_start = u32(44)
+        self.num_of_ligt_entries = u32(48)
+        self.num_of_cncx_blocks = u32(52)
+        self.unknown2 = raw[56:180]
+        self.tagx_offset = u32(180)
+        if self.tagx_offset != self.header_length:
+            raise ValueError('TAGX offset and header length disagree')
+        self.unknown3 = raw[184:self.header_length]
+
+        tagx = raw[self.header_length:]
+        if not tagx.startswith(b'TAGX'):
+            raise ValueError('Invalid TAGX section')
+        (self.tagx_header_length,
+                self.tagx_control_byte_count) = struct.unpack_from(b'>II',
+                        tagx, 4)
+        self.tagx_entries = [TagX(*x) for x in parse_tagx_section(tagx)[1]]
+        if self.tagx_entries and not self.tagx_entries[-1].is_eof:
+            raise ValueError('TAGX last entry is not EOF')
+
+        # The TAGX section is followed by the id of the last entry (decoded
+        # with decode_hex_number) and a two byte count
+        idxt0_pos = self.header_length+self.tagx_header_length
+        last_num, consumed = decode_hex_number(raw[idxt0_pos:])
+        count_pos = idxt0_pos + consumed
+        self.ncx_count, = struct.unpack(b'>H', raw[count_pos:count_pos+2])
+        self.last_entry = last_num
+
+        if last_num != self.ncx_count - 1:
+            raise ValueError('Last id number in the NCX != NCX count - 1')
+        # There may be some alignment zero bytes between the end of the idxt0
+        # and self.idxt_start
+
+        idxt = raw[self.idxt_start:]
+        if idxt[:4] != b'IDXT':
+            raise ValueError('Invalid IDXT header')
+        length_check, = struct.unpack(b'>H', idxt[4:6])
+        if length_check != self.header_length + self.tagx_header_length:
+            raise ValueError('Length check failed')
+        if idxt[6:].replace(b'\0', b''):
+            raise ValueError('Non null trailing bytes after IDXT')
+
+
+    def __str__(self):
+        '''
+        Render all parsed fields, including the TAGX entries, as a multi-line
+        report.
+        '''
+        ans = ['*'*20 + ' Index Header (%d bytes)'%len(self.record.raw)+ '*'*20]
+        a = ans.append
+        def u(w):
+            # Unknown byte ranges are shown raw, with their length and
+            # whether they consist entirely of null bytes
+            a('Unknown: %r (%d bytes) (All zeros: %r)'%(w,
+                len(w), not bool(w.replace(b'\0', b'')) ))
+
+        a('Header length: %d'%self.header_length)
+        u(self.unknown1)
+        a('Header type: %d'%self.header_type)
+        a('Index Type: %s (%d)'%(self.index_type_desc, self.index_type))
+        a('Offset to IDXT start: %d'%self.idxt_start)
+        a('Number of index records: %d'%self.index_count)
+        a('Index encoding: %s (%d)'%(self.index_encoding,
+                self.index_encoding_num))
+        a('Unknown (possibly language?): %r'%(self.possibly_language))
+        a('Number of index entries: %d'% self.num_index_entries)
+        a('ORDT start: %d'%self.ordt_start)
+        a('LIGT start: %d'%self.ligt_start)
+        a('Number of LIGT entries: %d'%self.num_of_ligt_entries)
+        a('Number of cncx blocks: %d'%self.num_of_cncx_blocks)
+        u(self.unknown2)
+        a('TAGX offset: %d'%self.tagx_offset)
+        u(self.unknown3)
+        a('\n\n')
+        a('*'*20 + ' TAGX Header (%d bytes)'%self.tagx_header_length+ '*'*20)
+        a('Header length: %d'%self.tagx_header_length)
+        a('Control byte count: %d'%self.tagx_control_byte_count)
+        for i in self.tagx_entries:
+            a('\t' + repr(i))
+        a('Index of last IndexEntry in primary index record: %s'% self.last_entry)
+        a('Number of entries in the NCX: %d'% self.ncx_count)
+
+        return '\n'.join(ans)
+    # }}}
+
+class Tag(object): # {{{
+
+    '''
+    Index entries are a collection of tags. Each tag is represented by this
+    class.
+    '''
+
+    # Maps the numeric tag type to (attribute name, description)
+    TAG_MAP = {
+            1: ('offset', 'Offset in HTML'),
+            2: ('size', 'Size in HTML'),
+            3: ('label_offset', 'Label offset in CNCX'),
+            4: ('depth', 'Depth of this entry in TOC'),
+            5: ('class_offset', 'Class offset in CNCX'),
+            6: ('pos_fid', 'File Index'),
+
+            11: ('secondary', '[unknown, unknown, '
+                'tag type from TAGX in primary index header]'),
+
+            21: ('parent_index', 'Parent'),
+            22: ('first_child_index', 'First child'),
+            23: ('last_child_index', 'Last child'),
+
+            69 : ('image_index', 'Offset from first image record to the'
+                                ' image record associated with this entry'
+                                ' (masthead for periodical or thumbnail for'
+                                ' article entry).'),
+            70 : ('desc_offset', 'Description offset in cncx'),
+            71 : ('author_offset', 'Author offset in cncx'),
+            72 : ('image_caption_offset', 'Image caption offset in cncx'),
+            73 : ('image_attr_offset', 'Image attribution offset in cncx'),
+
+    }
+
+    def __init__(self, tag_type, vals, cncx):
+        '''
+        :param tag_type: Numeric tag type, looked up in TAG_MAP. Unknown types
+            are reported on stdout and get the attribute name ``unknown``.
+        :param vals: Sequence of values of the tag. A single value is stored
+            as is, several values are stored as the sequence and an empty
+            sequence is stored as None.
+        :param cncx: Object indexable by offset, used to resolve the value of
+            tags whose attribute name contains ``_offset``.
+        '''
+        self.value = vals if len(vals) > 1 else vals[0] if vals else None
+
+        self.cncx_value = None
+        if tag_type in self.TAG_MAP:
+            self.attr, self.desc = self.TAG_MAP[tag_type]
+        else:
+            print ('Unknown tag value: %s'%tag_type)
+            self.desc = '??Unknown (tag value: %d)'%tag_type
+            self.attr = 'unknown'
+
+        if '_offset' in self.attr:
+            self.cncx_value = cncx[self.value]
+
+    def __str__(self):
+        if self.cncx_value is not None:
+            return '%s : %r [%r]'%(self.desc, self.value, self.cncx_value)
+        return '%s : %r'%(self.desc, self.value)
+
+# }}}
+
+class IndexEntry(object): # {{{
+
+    '''
+    One entry of the index: an identifier plus a list of Tag objects. The
+    properties look up the first tag with the matching attribute name and
+    fall back to a default when the entry has no such tag.
+    '''
+
+    def __init__(self, ident, entry, cncx):
+        # Identifiers that are not hexadecimal numbers are kept unchanged
+        try:
+            self.index = int(ident, 16)
+        except ValueError:
+            self.index = ident
+        self.tags = []
+        for tag_type, vals in entry.iteritems():
+            self.tags.append(Tag(tag_type, vals, cncx))
+
+    def _first(self, attr, default, field='value'):
+        # Return the named field of the first tag called attr, or default
+        for tag in self.tags:
+            if tag.attr == attr:
+                return getattr(tag, field)
+        return default
+
+    @property
+    def label(self):
+        return self._first('label_offset', '', field='cncx_value')
+
+    @property
+    def offset(self):
+        return self._first('offset', 0)
+
+    @property
+    def size(self):
+        return self._first('size', 0)
+
+    @property
+    def depth(self):
+        return self._first('depth', 0)
+
+    @property
+    def parent_index(self):
+        return self._first('parent_index', -1)
+
+    @property
+    def first_child_index(self):
+        return self._first('first_child_index', -1)
+
+    @property
+    def last_child_index(self):
+        return self._first('last_child_index', -1)
+
+    @property
+    def pos_fid(self):
+        return self._first('pos_fid', [0, 0])
+
+    def __str__(self):
+        header = 'Index Entry(index=%s, length=%d)'%(
+            self.index, len(self.tags))
+        lines = [header]
+        lines.extend('\t'+str(tag) for tag in self.tags
+                if tag.value is not None)
+        if self.first_child_index != -1:
+            lines.append('\tNumber of children: %d'%(self.last_child_index -
+                self.first_child_index + 1))
+        return '\n'.join(lines)
+
+# }}}
+
+class IndexRecord(object): # {{{
+
+    '''
+    Represents all indexing information in the MOBI, apart from indexing info
+    in the trailing data of the text records.
+
+    ``alltext`` starts out as None. When it is set, the string form of this
+    object shows the text around the start and end of every entry.
+    '''
+
+    def __init__(self, records, index_header, cncx):
+        '''
+        :param records: The index records, each must start with ``INDX``
+        :param index_header: Parsed index header, supplies the TAGX entries,
+            the control byte count and the index encoding
+        :param cncx: Passed on to every IndexEntry that is created
+        '''
+        self.alltext = None
+        table = OrderedDict()
+        tags = [TagX(x.tag, x.num_values, x.bitmask, x.eof) for x in
+                index_header.tagx_entries]
+        for record in records:
+            raw = record.raw
+
+            if raw[:4] != b'INDX':
+                raise ValueError('Invalid Primary Index Record')
+
+            parse_index_record(table, record.raw,
+                    index_header.tagx_control_byte_count, tags,
+                    index_header.index_encoding, strict=True)
+
+        self.indices = []
+
+        for ident, entry in table.iteritems():
+            self.indices.append(IndexEntry(ident, entry, cncx))
+
+    def get_parent(self, index):
+        # NOTE(review): this lookup is unfinished, the loop never selects an
+        # entry and the method therefore always returns None
+        if index.depth < 1:
+            return None
+        parent_depth = index.depth - 1
+        for p in self.indices:
+            if p.depth != parent_depth:
+                continue
+
+    def __str__(self):
+        ans = ['*'*20 + ' Index Entries (%d entries) '%len(self.indices)+ '*'*20]
+        a = ans.append
+        t = self.alltext
+        for entry in self.indices:
+            offset = entry.offset
+            a(str(entry))
+            if offset is not None and t is not None:
+                # The window starts are clamped at zero, a negative start
+                # would be counted from the end of the text
+                a('\tHTML before offset: %r'%t[max(0, offset-50):offset])
+                a('\tHTML after offset: %r'%t[offset:offset+50])
+                p = offset+entry.size
+                a('\tHTML before end: %r'%t[max(0, p-50):p])
+                a('\tHTML after end: %r'%t[p:p+50])
+
+            a('')
+
+        return '\n'.join(ans)
+
+# }}}
+
+class CNCX(object): # {{{
+
+    '''
+    Parses the records that contain the compiled NCX (all strings from the
+    NCX). Presents a simple offset : string mapping interface to access the
+    data.
+    '''
+
+    def __init__(self, records, codec):
+        '''
+        :param records: The CNCX records, in order
+        :param codec: Name of the codec used to decode the strings
+        '''
+        self.records = OrderedDict()
+        record_offset = 0
+        for record in records:
+            raw = record.raw
+            pos = 0
+            while pos < len(raw):
+                # Every string is prefixed by its length
+                length, consumed = decint(raw[pos:])
+                if length > 0:
+                    try:
+                        self.records[pos+record_offset] = raw[
+                            pos+consumed:pos+consumed+length].decode(codec)
+                    except Exception:
+                        # Store the rest of the record in formatted form and
+                        # stop parsing this record
+                        byts = raw[pos:]
+                        r = format_bytes(byts)
+                        print ('CNCX entry at offset %d has unknown format %s'%(
+                            pos+record_offset, r))
+                        self.records[pos+record_offset] = r
+                        pos = len(raw)
+                pos += consumed+length
+            # Offsets into the next record start 0x10000 higher
+            record_offset += 0x10000
+
+    def __getitem__(self, offset):
+        # Unknown offsets yield None instead of raising KeyError
+        return self.records.get(offset)
+
+    def __str__(self):
+        ans = ['*'*20 + ' cncx (%d strings) '%len(self.records)+ '*'*20]
+        for k, v in self.records.iteritems():
+            ans.append('%10d : %s'%(k, v))
+        return '\n'.join(ans)
+
+
+# }}}
+
+class ImageRecord(object): # {{{
+
+    '''
+    An image record: the raw record data together with the record number and
+    the file extension to use when dumping it.
+    '''
+
+    def __init__(self, idx, record, fmt):
+        self.idx, self.fmt = idx, fmt
+        self.raw = record.raw
+
+    def dump(self, folder):
+        '''
+        Write the raw data to ``<idx>.<fmt>`` inside folder.
+        '''
+        fname = '%06d'%self.idx + '.' + self.fmt
+        with open(os.path.join(folder, fname), 'wb') as out:
+            out.write(self.raw)
+
+# }}}
+
+class BinaryRecord(object): # {{{
+
+    '''
+    A record that is dumped unchanged. The dump name is the record number,
+    followed by the four byte signature of the record if it is one of the
+    recognised ones.
+    '''
+
+    def __init__(self, idx, record):
+        self.raw = record.raw
+        sig = self.raw[:4]
+        suffix = ''
+        if sig in {b'FCIS', b'FLIS', b'SRCS', b'DATP', b'RESC', b'BOUN',
+                b'FDST', b'AUDI', b'VIDE',}:
+            suffix = '-' + sig.decode('ascii')
+        elif sig == b'\xe9\x8e\r\n':
+            suffix = '-' + 'EOF'
+        self.name = '%06d'%idx + suffix
+
+    def dump(self, folder):
+        '''
+        Write the raw data to ``<name>.bin`` inside folder.
+        '''
+        target = os.path.join(folder, self.name+'.bin')
+        with open(target, 'wb') as out:
+            out.write(self.raw)
+
+# }}}
+
+class FontRecord(object): # {{{
+
+    '''
+    A font record, decoded with read_font_record. The payload that gets
+    dumped is the decoded font data when there is any and the raw data
+    otherwise.
+    '''
+
+    def __init__(self, idx, record):
+        self.raw = record.raw
+        self.font = font = read_font_record(self.raw)
+        if font['err']:
+            raise ValueError('Failed to read font record: %s Headers: %s'%(
+                font['err'], font['headers']))
+        self.payload = font['font_data'] or font['raw_data']
+        self.name = '%s.%s'%('%06d'%idx, font['ext'])
+
+    def dump(self, folder):
+        '''
+        Write the payload to a file called ``name`` inside folder.
+        '''
+        target = os.path.join(folder, self.name)
+        with open(target, 'wb') as out:
+            out.write(self.payload)
+
+# }}}
+
+class TBSIndexing(object): # {{{
+
+    def __init__(self, text_records, indices, doc_type):
+        '''
+        For every text record work out which index entries start in it, end
+        in it or lie completely inside it. The result is stored in
+        ``record_indices``, keyed by record, together with the first and last
+        text position covered by the record (``geom``).
+        '''
+        self.record_indices = OrderedDict()
+        self.doc_type = doc_type
+        self.indices = indices
+        pos = 0
+        for r in text_records:
+            start, pos = pos, pos + len(r.raw)
+            end = pos - 1
+            buckets = {'starts':[], 'ends':[], 'complete':[],
+                    'geom': (start, end)}
+            self.record_indices[r] = buckets
+            for entry in indices:
+                first, sz = entry.offset, entry.size
+                last = first + sz - 1
+                has_start = start <= first <= end
+                has_end = start <= last <= end
+                if has_start and has_end:
+                    buckets['complete'].append(entry)
+                elif has_start:
+                    buckets['starts'].append(entry)
+                elif has_end:
+                    buckets['ends'].append(entry)
+
+    def get_index(self, idx):
+        '''
+        Return the first entry in self.indices whose ``index`` attribute is
+        equal to idx or to the text form of idx.
+
+        Raises IndexError when there is no such entry.
+        '''
+        wanted = {idx, unicode(idx)}
+        for entry in self.indices:
+            if entry.index in wanted:
+                return entry
+        raise IndexError('Index %d not found'%idx)
+
+    def __str__(self):
+        '''
+        Return a banner line followed by the lines that dump_record() produces
+        for every record, joined with newlines.
+        '''
+        banner = ('*'*20 + ' TBS Indexing (%d records) '%len(self.record_indices)
+                + '*'*20)
+        lines = [banner]
+        for record, info in self.record_indices.iteritems():
+            lines.extend(self.dump_record(record, info)[-1])
+        return '\n'.join(lines)
+
+    def dump(self, bdir):
+        '''
+        Write one ``tbs_type_<N>.txt`` file into bdir for every non zero TBS
+        type, holding the dump_record() lines of all records of that type.
+        '''
+        grouped = defaultdict(list)
+        for record, info in self.record_indices.iteritems():
+            tbs_type, strings = self.dump_record(record, info)
+            if tbs_type != 0:
+                grouped[tbs_type].extend(strings)
+        for typ, strings in grouped.iteritems():
+            destination = os.path.join(bdir, 'tbs_type_%d.txt'%typ)
+            with open(destination, 'wb') as f:
+                f.write('\n'.join(strings))
+
+    def dump_record(self, r, dat):
+        '''
+        Describe the index entries and the trailing TBS bytes of one text
+        record.
+
+        :param r: The text record. Its ``idx`` and ``trailing_data``
+                  attributes are read.
+        :param dat: The dictionary stored for r in self.record_indices, with
+                    the keys 'starts', 'ends', 'complete' and 'geom'.
+        :return: ``(tbs_type, lines)``. tbs_type is the bitwise OR of the flag
+                 keys decoded from the start of the TBS bytes and stays 0 when
+                 the record has no TBS bytes. lines is a list of strings.
+        '''
+        ans = []
+        ans.append('\nRecord #%d: Starts at: %d Ends at: %d'%(r.idx,
+            dat['geom'][0], dat['geom'][1]))
+        s, e, c = dat['starts'], dat['ends'], dat['complete']
+        ans.append(('\tContains: %d index entries '
+            '(%d ends, %d complete, %d starts)')%tuple(map(len, (s+e+c, e,
+                c, s))))
+        byts = bytearray(r.trailing_data.get('indexing', b''))
+        ans.append('TBS bytes: %s'%format_bytes(byts))
+        for typ, entries in (('Ends', e), ('Complete', c), ('Starts', s)):
+            if entries:
+                ans.append('\t%s:'%typ)
+                for x in entries:
+                    ans.append(('\t\tIndex Entry: %s (Parent index: %s, '
+                            'Depth: %d, Offset: %d, Size: %d) [%s]')%(
+                        x.index, x.parent_index, x.depth, x.offset, x.size, x.label))
+        def bin4(num):
+            # Binary digits of num, left padded with zeros to four digits
+            digits = bin(num)[2:]
+            return bytes('0'*(4-len(digits)) + digits)
+
+        def repr_extra(x):
+            # Show the flag keys of x in binary form
+            return str({bin4(k):v for k, v in x.iteritems()})
+
+        tbs_type = 0
+        is_periodical = self.doc_type in (257, 258, 259)
+        if len(byts):
+            outermost_index, extra, consumed = decode_tbs(byts, flag_size=3)
+            byts = byts[consumed:]
+            for k in extra:
+                tbs_type |= k
+            ans.append('\nTBS: %d (%s)'%(tbs_type, bin4(tbs_type)))
+            ans.append('Outermost index: %d'%outermost_index)
+            ans.append('Unknown extra start bytes: %s'%repr_extra(extra))
+            if is_periodical: # Hierarchical periodical
+                try:
+                    byts, a = self.interpret_periodical(tbs_type, byts,
+                        dat['geom'][0])
+                except Exception:
+                    # Best effort: report the failure and carry on with the
+                    # bytes that are left, without swallowing
+                    # KeyboardInterrupt or SystemExit
+                    import traceback
+                    traceback.print_exc()
+                    a = []
+                    print ('Failed to decode TBS bytes for record: %d'%r.idx)
+                ans += a
+            if byts:
+                sbyts = tuple(hex(b)[2:] for b in byts)
+                ans.append('Remaining bytes: %s'%' '.join(sbyts))
+
+        ans.append('')
+        return tbs_type, ans
+
+    def interpret_periodical(self, tbs_type, byts, record_offset):
+        '''
+        Turn the TBS bytes of a periodical record into descriptive lines.
+
+        :param tbs_type: Flags of the record, only the bit 0b0100 is tested
+        :param byts: The TBS bytes that are still to be decoded
+        :param record_offset: Added to offsets relative to the start of the
+                              record to report absolute offsets
+        :return: ``(byts, ans)`` where byts is what read_section_transitions()
+                 returns and ans is the list of descriptive lines
+
+        ValueError is raised for flag combinations that are not understood,
+        and get_index() raises IndexError for unknown indices.
+        '''
+        ans = []
+
+        def read_section_transitions(byts, psi=None): # {{{
+            # Decode entries from byts until none are left, appending a
+            # description of each one to ans. psi is the index entry of the
+            # section that the next decoded entry is taken to belong to.
+            if psi is None:
+                # Assume previous section is 1
+                psi = self.get_index(1)
+
+            while byts:
+                ai, extra, consumed = decode_tbs(byts)
+                byts = byts[consumed:]
+                if extra.get(0b0010, None) is not None:
+                    raise ValueError('Dont know how to interpret flag 0b0010'
+                            ' while reading section transitions')
+                if extra.get(0b1000, None) is not None:
+                    # Flag 0b1000 must be the only flag. The entry is reported
+                    # relative to the next section, which then becomes the
+                    # current section
+                    if len(extra) > 1:
+                        raise ValueError('Dont know how to interpret flags'
+                                ' %r while reading section transitions'%extra)
+                    nsi = self.get_index(psi.index+1)
+                    ans.append('Last article in this record of section %d'
+                            ' (relative to next section index [%d]): '
+                            '%d [%d absolute index]'%(psi.index, nsi.index, ai,
+                                ai+nsi.index))
+                    psi = nsi
+                    continue
+
+                ans.append('First article in this record of section %d'
+                        ' (relative to its parent section): '
+                        '%d [%d absolute index]'%(psi.index, ai, ai+psi.index))
+
+                # Flag 0b0100 is reported as the number of articles
+                num = extra.get(0b0100, None)
+                if num is None:
+                    msg = ('The section %d has at most one article'
+                            ' in this record')%psi.index
+                else:
+                    msg = ('Number of articles in this record of '
+                        'section %d: %d')%(psi.index, num)
+                ans.append(msg)
+
+                # Flag 0b0001 is reported as an offset, the value 0 is
+                # reported as an article spanning the record
+                offset = extra.get(0b0001, None)
+                if offset is not None:
+                    if offset == 0:
+                        ans.append('This record is spanned by the article:'
+                                '%d'%(ai+psi.index))
+                    else:
+                        ans.append('->Offset to start of next section (%d) from start'
+                            ' of record: %d [%d absolute offset]'%(psi.index+1,
+                                offset, offset+record_offset))
+            return byts
+        # }}}
+
+        def read_starting_section(byts): # {{{
+            # Decode a single entry from byts and look it up as the section at
+            # the start of the record. Returns that index entry and the bytes
+            # that are left.
+            orig = byts
+            si, extra, consumed = decode_tbs(byts)
+            byts = byts[consumed:]
+            # At most one flag is accepted and it must not be 0b0010 or 0b1000
+            if len(extra) > 1 or 0b0010 in extra or 0b1000 in extra:
+                raise ValueError('Dont know how to interpret flags %r'
+                        ' when reading starting section'%extra)
+            si = self.get_index(si)
+            ans.append('The section at the start of this record is:'
+                    ' %s'%si.index)
+            if 0b0100 in extra:
+                num = extra[0b0100]
+                ans.append('The number of articles from the section %d'
+                        ' in this record: %s'%(si.index, num))
+            elif 0b0001 in extra:
+                # Only the value 0 is understood for this flag
+                eof = extra[0b0001]
+                if eof != 0:
+                    raise ValueError('Unknown eof value %s when reading'
+                            ' starting section. All bytes: %r'%(eof, orig))
+                ans.append('??This record has more than one article from '
+                        ' the section: %s'%si.index)
+            return si, byts
+        # }}}
+
+        if tbs_type & 0b0100:
+            # Starting section is the first section
+            ssi = self.get_index(1)
+        else:
+            ssi, byts = read_starting_section(byts)
+
+        byts = read_section_transitions(byts, ssi)
+
+        return byts, ans
+
+# }}}
+
+class MOBIFile(object): # {{{
+
+    def __init__(self, mf):
+        '''
+        Sort the records of a parsed MOBI file into index, text, image, font
+        and binary records.
+
+        :param mf: The parsed file. Its raw, palmdb, record_headers, records,
+                   mobi_header and huffman_record_nums attributes are copied
+                   onto this object and its decompress6 attribute is passed to
+                   every TextRecord.
+        '''
+        for x in ('raw', 'palmdb', 'record_headers', 'records', 'mobi_header',
+                'huffman_record_nums',):
+            setattr(self, x, getattr(mf, x))
+
+        # Primary index: the header record is followed by index_count index
+        # records, which are followed by num_of_cncx_blocks CNCX records
+        self.index_header = self.index_record = None
+        self.indexing_record_nums = set()
+        pir = self.mobi_header.primary_index_record
+        if pir != NULL_INDEX:
+            self.index_header = IndexHeader(self.records[pir])
+            numi = self.index_header.index_count
+            self.cncx = CNCX(self.records[
+                pir+1+numi:pir+1+numi+self.index_header.num_of_cncx_blocks],
+                self.index_header.index_encoding)
+            self.index_record = IndexRecord(self.records[pir+1:pir+1+numi],
+                    self.index_header, self.cncx)
+            self.indexing_record_nums = set(xrange(pir,
+                pir+1+numi+self.index_header.num_of_cncx_blocks))
+        self.secondary_index_record = self.secondary_index_header = None
+        sir = self.mobi_header.secondary_index_record
+        if sir != NULL_INDEX:
+            # NOTE(review): self.cncx is only assigned when a primary index
+            # exists, so a file with a secondary index but no primary index
+            # raises AttributeError below -- confirm whether such files occur
+            self.secondary_index_header = SecondaryIndexHeader(self.records[sir])
+            numi = self.secondary_index_header.index_count
+            self.indexing_record_nums.add(sir)
+            self.secondary_index_record = IndexRecord(
+                    self.records[sir+1:sir+1+numi], self.secondary_index_header, self.cncx)
+            self.indexing_record_nums |= set(xrange(sir+1, sir+1+numi))
+
+
+        ntr = self.mobi_header.number_of_text_records
+        fntbr = self.mobi_header.first_non_book_record
+        fii = self.mobi_header.first_image_index
+        if fntbr == NULL_INDEX:
+            fntbr = len(self.records)
+        self.text_records = [TextRecord(r, self.records[r],
+            self.mobi_header.extra_data_flags, mf.decompress6) for r in xrange(1,
+            min(len(self.records), ntr+1))]
+        self.image_records, self.binary_records = [], []
+        self.font_records = []
+        # image_index counts every record from the first non book record on
+        # that is neither an indexing nor a huffman record
+        image_index = 0
+        for i in xrange(fntbr, len(self.records)):
+            if i in self.indexing_record_nums or i in self.huffman_record_nums:
+                continue
+            image_index += 1
+            r = self.records[i]
+            fmt = None
+            if i >= fii and r.raw[:4] not in {b'FLIS', b'FCIS', b'SRCS',
+                    b'\xe9\x8e\r\n', b'RESC', b'BOUN', b'FDST', b'DATP',
+                    b'AUDI', b'VIDE', b'FONT'}:
+                try:
+                    width, height, fmt = identify_data(r.raw)
+                except Exception:
+                    # Data that cannot be identified as an image is treated
+                    # as a font or binary record below. KeyboardInterrupt and
+                    # SystemExit are no longer swallowed.
+                    pass
+            if fmt is not None:
+                self.image_records.append(ImageRecord(image_index, r, fmt))
+            elif r.raw[:4] == b'FONT':
+                self.font_records.append(FontRecord(i, r))
+            else:
+                self.binary_records.append(BinaryRecord(i, r))
+
+        # tbs_indexing only exists when the file has a primary index
+        if self.index_record is not None:
+            self.tbs_indexing = TBSIndexing(self.text_records,
+                    self.index_record.indices, self.mobi_header.type_raw)
+
+    def print_header(self, f=sys.stdout):
+        '''
+        Print the PalmDB header, the header of every record and the MOBI
+        header to the file like object f.
+        '''
+        def emit(*args):
+            print(*args, file=f)
+
+        emit(str(self.palmdb).encode('utf-8'))
+        emit()
+        emit('Record headers:')
+        for num, rec in enumerate(self.records):
+            emit('%6d. %s'%(num, rec.header))
+
+        emit()
+        emit(str(self.mobi_header).encode('utf-8'))
+# }}}
+
+def inspect_mobi(mobi_file, ddir):
+    '''
+    Dump the contents of a MOBI file into the directory ddir.
+
+    The following are written: header.txt, text.html (the raw data of all
+    text records), pretty.html (the same text parsed and pretty printed) and,
+    when the file has an index, index.txt, tbs_indexing.txt and the
+    tbs_type_*.txt files. Every text, image, binary and font record is dumped
+    into the sub-directories text, images, binary and font, which are created
+    with os.mkdir() and so must not exist yet.
+
+    :param mobi_file: The parsed file, passed to MOBIFile()
+    :param ddir: Path of an existing directory to write into
+    '''
+    f = MOBIFile(mobi_file)
+    with open(os.path.join(ddir, 'header.txt'), 'wb') as out:
+        f.print_header(f=out)
+
+    # Join the records once instead of growing a bytestring record by record
+    alltext = b''.join(rec.raw for rec in f.text_records)
+    with open(os.path.join(ddir, 'text.html'), 'wb') as of:
+        of.write(alltext)
+
+    # NOTE(review): the text is decoded as UTF-8 regardless of the encoding
+    # declared by the file -- confirm that this is intended
+    root = html.fromstring(alltext.decode('utf-8'))
+    with open(os.path.join(ddir, 'pretty.html'), 'wb') as of:
+        of.write(html.tostring(root, pretty_print=True, encoding='utf-8',
+            include_meta_content_type=True))
+
+    if f.index_header is not None:
+        f.index_record.alltext = alltext
+        with open(os.path.join(ddir, 'index.txt'), 'wb') as out:
+            print(str(f.index_header), file=out)
+            print('\n\n', file=out)
+            if f.secondary_index_header is not None:
+                print(str(f.secondary_index_header).encode('utf-8'), file=out)
+                print('\n\n', file=out)
+            if f.secondary_index_record is not None:
+                print(str(f.secondary_index_record).encode('utf-8'), file=out)
+                print('\n\n', file=out)
+            print(str(f.cncx).encode('utf-8'), file=out)
+            print('\n\n', file=out)
+            print(str(f.index_record), file=out)
+        with open(os.path.join(ddir, 'tbs_indexing.txt'), 'wb') as out:
+            print(str(f.tbs_indexing), file=out)
+        f.tbs_indexing.dump(ddir)
+
+    for tdir, attr in [('text', 'text_records'), ('images', 'image_records'),
+            ('binary', 'binary_records'), ('font', 'font_records')]:
+        tdir = os.path.join(ddir, tdir)
+        os.mkdir(tdir)
+        for rec in getattr(f, attr):
+            rec.dump(tdir)
+
+
+
+# }}}
+
+
--- a/src/calibre/ebooks/mobi/debug/mobi8.py
+++ b/src/calibre/ebooks/mobi/debug/mobi8.py
@ -0,0 +1,62 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__   = 'GPL v3'
+__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import sys, os
+
+from calibre.ebooks.mobi.debug.headers import TextRecord
+
+class MOBIFile(object):
+    '''
+    Debugging view of the KF8 part of a parsed MOBI file.
+
+    resource_records holds the records from first_non_book_record of the MOBI
+    header up to the end of the file, or up to the record before the KF8
+    boundary for a joint file. text_records holds one TextRecord for each of
+    the text records counted by the KF8 header and raw_text is the
+    concatenation of their raw data.
+    '''
+
+    def __init__(self, mf):
+        self.mf = mf
+        h, h8 = mf.mobi_header, mf.mobi8_header
+        if mf.kf8_type == 'joint':
+            # The KF8 records start at the index stored in the EXTH header
+            offset = h.exth.kf8_header_index
+            res_end = offset - 1
+        else:
+            offset = 0
+            res_end = len(mf.records)
+
+        self.resource_records = mf.records[h.first_non_book_record:res_end]
+
+        text_start = 1 + offset
+        text_end = text_start + h8.number_of_text_records
+        self.text_records = [
+            TextRecord(i, r, h8.extra_data_flags, mf.decompress8)
+            for i, r in enumerate(mf.records[text_start:text_end])]
+
+        self.raw_text = b''.join(r.raw for r in self.text_records)
+
+    def print_header(self, f=sys.stdout):
+        '''
+        Print the PalmDB header, the header of every record and the KF8
+        header to the file like object f.
+        '''
+        def emit(*args):
+            print(*args, file=f)
+
+        emit(str(self.mf.palmdb).encode('utf-8'))
+        emit()
+        emit('Record headers:')
+        for num, rec in enumerate(self.mf.records):
+            emit('%6d. %s'%(num, rec.header))
+
+        emit()
+        emit(str(self.mf.mobi8_header).encode('utf-8'))
+
+
+def inspect_mobi(mobi_file, ddir):
+    '''
+    Dump the KF8 contents of a MOBI file into the directory ddir.
+
+    header.txt and raw_text.html are written and the sub-directories
+    text_records, images, binary and font are created. Records are dumped
+    into a sub-directory only if the MOBIFile object has the matching
+    attribute, otherwise the sub-directory is left empty.
+    '''
+    f = MOBIFile(mobi_file)
+    header_path = os.path.join(ddir, 'header.txt')
+    with open(header_path, 'wb') as out:
+        f.print_header(f=out)
+
+    with open(os.path.join(ddir, 'raw_text.html'), 'wb') as of:
+        of.write(f.raw_text)
+
+    targets = (
+        ('text_records', 'text_records'),
+        ('images', 'image_records'),
+        ('binary', 'binary_records'),
+        ('font', 'font_records'),
+    )
+    for dirname, attr in targets:
+        destination = os.path.join(ddir, dirname)
+        os.mkdir(destination)
+        for rec in getattr(f, attr, []):
+            rec.dump(destination)
+
+
--- a/src/calibre/ebooks/mobi/reader/headers.py
+++ b/src/calibre/ebooks/mobi/reader/headers.py
@ -186,20 +186,16 @@ class BookHeader(object):
            if len(raw) >= 0xF8:
                self.ncxidx, = struct.unpack_from(b'>L', raw, 0xF4)

-            if self.mobi_version >= 8:
-                self.skelidx, = struct.unpack_from('>L', raw, 0xFC)
-
-                # Index into <div> sections in raw_ml
-                self.dividx, = struct.unpack_from('>L', raw, 0xF8)
-
-                # Index into Other files
-                self.othidx, = struct.unpack_from('>L', raw, 0x104)
+            # Ancient PRC files from Baen can have random values for
+            # mobi_version, so be conservative
+            if self.mobi_version == 8 and len(raw) >= (0xF8 + 16):
+                self.dividx, self.skelidx, self.datpidx, self.othidx = \
+                        struct.unpack_from(b'>4L', raw, 0xF8)

                # need to use the FDST record to find out how to properly
                # unpack the raw_ml into pieces it is simply a table of start
                # and end locations for each flow piece
-                self.fdstidx, = struct.unpack_from('>L', raw, 0xC0)
-                self.fdstcnt, = struct.unpack_from('>L', raw, 0xC4)
+                self.fdstidx, self.fdstcnt = struct.unpack_from(b'>2L', raw, 0xC0)
                # if cnt is 1 or less, fdst section number can be garbage
                if self.fdstcnt <= 1:
                    self.fdstidx = NULL_INDEX
--- a/src/calibre/ebooks/mobi/reader/index.py
+++ b/src/calibre/ebooks/mobi/reader/index.py
@ -8,9 +8,13 @@ __copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'

 import struct
-from collections import OrderedDict
+from collections import OrderedDict, namedtuple

-from calibre.ebooks.mobi.utils import decint, count_set_bits
+from calibre.ebooks.mobi.utils import (decint, count_set_bits,
+        decode_string)
+
+TagX = namedtuple('TagX', 'tag num_of_values bitmask eof')
+PTagX = namedtuple('PTagX', 'tag value_count value_bytes num_of_values')

 class InvalidFile(ValueError):
    pass
@ -37,9 +41,8 @@ def parse_indx_header(data):
            'lng', 'total', 'ordt', 'ligt', 'nligt', 'ncncx'
    )
    num = len(words)
-    values = struct.unpack(b'>%dL' % num, data[4:4*(num+1)])
-    header = {words[i]:values[i] for i in xrange(num)}
-    return header
+    values = struct.unpack(bytes('>%dL' % num), data[4:4*(num+1)])
+    return dict(zip(words, values))

 class CNCX(object): # {{{

@ -77,101 +80,94 @@ class CNCX(object): # {{{
        return self.records.get(offset, default)
 # }}}

-def parse_tag_section(data):
+def parse_tagx_section(data):
    check_signature(data, b'TAGX')

    tags = []
-    first_entry_offset, = struct.unpack_from(b'>L', data, 0x04)
-    control_byte_count, = struct.unpack_from(b'>L', data, 0x08)
+    first_entry_offset, = struct.unpack_from(b'>L', data, 4)
+    control_byte_count, = struct.unpack_from(b'>L', data, 8)

-    # Skip the first 12 bytes already read above.
    for i in xrange(12, first_entry_offset, 4):
-        pos = i
-        tags.append((ord(data[pos]), ord(data[pos+1]), ord(data[pos+2]),
-            ord(data[pos+3])))
+        vals = list(bytearray(data[i:i+4]))
+        tags.append(TagX(*vals))
    return control_byte_count, tags

-def get_tag_map(control_byte_count, tags, data, start, end):
+def get_tag_map(control_byte_count, tagx, data, strict=False):
    ptags = []
    ans = {}
-    control_byte_index = 0
-    data_start = start + control_byte_count
+    control_bytes = list(bytearray(data[:control_byte_count]))
+    data = data[control_byte_count:]

-    for tag, values_per_entry, mask, end_flag in tags:
-        if end_flag == 0x01:
-            control_byte_index += 1
+    for x in tagx:
+        if x.eof == 0x01:
+            control_bytes = control_bytes[1:]
            continue
-        value = ord(data[start + control_byte_index]) & mask
+        value = control_bytes[0] & x.bitmask
        if value != 0:
-            if value == mask:
-                if count_set_bits(mask) > 1:
+            value_count = value_bytes = None
+            if value == x.bitmask:
+                if count_set_bits(x.bitmask) > 1:
                    # If all bits of masked value are set and the mask has more
                    # than one bit, a variable width value will follow after
                    # the control bytes which defines the length of bytes (NOT
                    # the value count!) which will contain the corresponding
                    # variable width values.
-                    value, consumed = decint(data[data_start:])
-                    data_start += consumed
-                    ptags.append((tag, None, value, values_per_entry))
+                    value_bytes, consumed = decint(data)
+                    data = data[consumed:]
                else:
-                    ptags.append((tag, 1, None, values_per_entry))
+                    value_count = 1
            else:
                # Shift bits to get the masked value.
-                while mask & 0x01 == 0:
-                    mask = mask >> 1
-                    value = value >> 1
-                ptags.append((tag, value, None, values_per_entry))
-    for tag, value_count, value_bytes, values_per_entry in ptags:
+                mask = x.bitmask
+                while mask & 0b1 == 0:
+                    mask >>= 1
+                    value >>= 1
+                value_count = value
+            ptags.append(PTagX(x.tag, value_count, value_bytes,
+                x.num_of_values))
+
+    for x in ptags:
        values = []
-        if value_count != None:
+        if x.value_count is not None:
            # Read value_count * values_per_entry variable width values.
-            for _ in xrange(value_count*values_per_entry):
-                byts, consumed = decint(data[data_start:])
-                data_start += consumed
+            for _ in xrange(x.value_count * x.num_of_values):
+                byts, consumed = decint(data)
+                data = data[consumed:]
                values.append(byts)
-        else:
+        else: # value_bytes is not None
            # Convert value_bytes to variable width values.
            total_consumed = 0
-            while total_consumed < value_bytes:
+            while total_consumed < x.value_bytes:
                # Does this work for values_per_entry != 1?
-                byts, consumed = decint(data[data_start:])
-                data_start += consumed
+                byts, consumed = decint(data)
+                data = data[consumed:]
                total_consumed += consumed
                values.append(byts)
-            if total_consumed != value_bytes:
-                print ("Error: Should consume %s bytes, but consumed %s" %
-                        (value_bytes, total_consumed))
-        ans[tag] = values
-    # Test that all bytes have been processed if end is given.
-    if end is not None and data_start < end:
-        # The last entry might have some zero padding bytes, so complain only if non zero bytes are left.
-        rest = data[data_start:end]
-        if rest.replace(b'\0', b''):
-            print ("Warning: There are unprocessed index bytes left: %s" %
-                    format_bytes(rest))
+            if total_consumed != x.value_bytes:
+                err = ("Error: Should consume %s bytes, but consumed %s" %
+                        (x.value_bytes, total_consumed))
+                if strict:
+                    raise ValueError(err)
+                else:
+                    print(err)
+        ans[x.tag] = values
+    # Test that all bytes have been processed
+    if data.replace(b'\0', b''):
+        err = ("Warning: There are unprocessed index bytes left: %s" %
+                format_bytes(data))
+        if strict:
+            raise ValueError(err)
+        else:
+            print(err)

    return ans

-def read_index(sections, idx, codec):
-    table, cncx = OrderedDict(), CNCX([], codec)
-
-    data = sections[idx][0]
-
-    indx_header = parse_indx_header(data)
-    indx_count = indx_header['count']
-
-    if indx_header['ncncx'] > 0:
-        off = idx + indx_count + 1
-        cncx_records = [x[0] for x in sections[off:off+indx_header['ncncx']]]
-        cncx = CNCX(cncx_records, codec)
-
-    tag_section_start = indx_header['len']
-    control_byte_count, tags = parse_tag_section(data[tag_section_start:])
-
-    for i in xrange(idx + 1, idx + 1 + indx_count):
-        data = sections[i][0]
+def parse_index_record(table, data, control_byte_count, tags, codec,
+        strict=False):
    header = parse_indx_header(data)
    idxt_pos = header['start']
+    if data[idxt_pos:idxt_pos+4] != b'IDXT':
+        print ('WARNING: Invalid INDX record')
    entry_count = header['count']

    # loop through to build up the IDXT position starts
@ -187,11 +183,32 @@ def read_index(sections, idx, codec):
    # text
    for j in xrange(entry_count):
        start, end = idx_positions[j:j+2]
-            text_length = ord(data[start])
-            text = data[start+1:start+1+text_length]
-            tag_map = get_tag_map(control_byte_count, tags, data,
-                    start+1+text_length, end)
-            table[text] = tag_map
+        rec = data[start:end]
+        ident, consumed = decode_string(rec, codec=codec)
+        rec = rec[consumed:]
+        tag_map = get_tag_map(control_byte_count, tags, rec, strict=strict)
+        table[ident] = tag_map

+
+def read_index(sections, idx, codec):
+    '''
+    Read the index whose header record is at position idx in sections.
+
+    :param sections: Sequence of records, the raw data of a record is its
+                     first item
+    :param idx: Position in sections of the index header record
+    :param codec: Passed on to CNCX and parse_index_record()
+    :return: ``(table, cncx)``. table is the OrderedDict filled in by
+             parse_index_record() for every index record. cncx is built from
+             the records that follow the index records and is empty when the
+             header reports no such records.
+    '''
+    table, cncx = OrderedDict(), CNCX([], codec)
+
+    data = sections[idx][0]
+
+    indx_header = parse_indx_header(data)
+    indx_count = indx_header['count']
+
+    if indx_header['ncncx'] > 0:
+        # The CNCX records come directly after the index records
+        off = idx + indx_count + 1
+        cncx_records = [x[0] for x in sections[off:off+indx_header['ncncx']]]
+        cncx = CNCX(cncx_records, codec)
+
+    # The TAGX section is read from the header record at offset 'len'
+    tag_section_start = indx_header['len']
+    control_byte_count, tags = parse_tagx_section(data[tag_section_start:])
+
+    for i in xrange(idx + 1, idx + 1 + indx_count):
+        # Index record
+        data = sections[i][0]
+        parse_index_record(table, data, control_byte_count, tags, codec)
    return table, cncx

--- a/src/calibre/ebooks/mobi/reader/markup.py
+++ b/src/calibre/ebooks/mobi/reader/markup.py
@ -33,9 +33,11 @@ def update_internal_links(mobi8_reader):
                for m in posfid_index_pattern.finditer(tag):
                    posfid = m.group(1)
                    offset = m.group(2)
-                    filename, idtag = mr.get_id_tag_by_pos_fid(posfid, offset)
+                    filename, idtag = mr.get_id_tag_by_pos_fid(int(posfid, 32),
+                            int(offset, 32))
                    suffix = (b'#' + idtag) if idtag else b''
-                    replacement = filename.encode(mr.header.codec) + suffix
+                    replacement = filename.split('/')[-1].encode(
+                            mr.header.codec) + suffix
                    tag = posfid_index_pattern.sub(replacement, tag, 1)
                srcpieces[j] = tag
        part = ''.join([x.decode(mr.header.codec) for x in srcpieces])
--- a/src/calibre/ebooks/mobi/reader/mobi6.py
+++ b/src/calibre/ebooks/mobi/reader/mobi6.py
@ -107,7 +107,10 @@ class MobiReader(object):
        self.kf8_type = None
        k8i = getattr(self.book_header.exth, 'kf8_header', None)

-        if self.book_header.mobi_version == 8:
+        # Ancient PRC files from Baen can have random values for
+        # mobi_version, so be conservative
+        if (self.book_header.mobi_version == 8 and hasattr(self.book_header,
+            'skelidx')):
            self.kf8_type = 'standalone'
        elif k8i is not None: # Check for joint mobi 6 and kf 8 file
            try:
@ -118,12 +121,17 @@ class MobiReader(object):
                try:
                    self.book_header = BookHeader(self.sections[k8i][0],
                            self.ident, user_encoding, self.log)
-                    # The following are only correct in the Mobi 6
-                    # header not the Mobi 8 header
+
+                    # Only the first_image_index from the MOBI 6 header is
+                    # useful
                    for x in ('first_image_index',):
                        setattr(self.book_header, x, getattr(bh, x))
+
+                    # We need to do this because the MOBI 6 text extract code
+                    # does not know anything about the kf8 offset
                    if hasattr(self.book_header, 'huff_offset'):
                        self.book_header.huff_offset += k8i
+
                    self.kf8_type = 'joint'
                    self.kf8_boundary = k8i-1
                except:
--- a/src/calibre/ebooks/mobi/reader/mobi8.py
+++ b/src/calibre/ebooks/mobi/reader/mobi8.py
@ -33,6 +33,7 @@ class Mobi8Reader(object):
    def __init__(self, mobi6_reader, log):
        self.mobi6_reader, self.log = mobi6_reader, log
        self.header = mobi6_reader.book_header
+        self.encrypted_fonts = []

    def __call__(self):
        self.mobi6_reader.check_for_drm()
@ -229,11 +230,9 @@ class Mobi8Reader(object):

    def get_id_tag_by_pos_fid(self, posfid, offset):
        # first convert kindle:pos:fid and offset info to position in file
-        row = int(posfid, 32)
-        off = int(offset, 32)
-        [insertpos, idtext, filenum, seqnm, startpos, length] = self.elems[row]
-        pos = insertpos + off
-        fname = self.get_file_info(pos).filename
+        insertpos, idtext, filenum, seqnm, startpos, length = self.elems[posfid]
+        pos = insertpos + offset
+        fi = self.get_file_info(pos)
        # an existing "id=" must exist in original xhtml otherwise it would not
        # have worked for linking.  Amazon seems to have added its own
        # additional "aid=" inside tags whose contents seem to represent some
@ -242,7 +241,7 @@ class Mobi8Reader(object):
        # so find the closest "id=" before position the file by actually
        # searching in that file
        idtext = self.get_id_tag(pos)
-        return fname, idtext
+        return '%s/%s'%(fi.type, fi.filename), idtext

    def get_id_tag(self, pos):
        # find the correct tag by actually searching in the destination
@ -253,12 +252,13 @@ class Mobi8Reader(object):
        textblock = self.parts[fi.num]
        id_map = []
        npos = pos - fi.start
-        # if npos inside a tag then search all text before the its end of tag
-        # marker
        pgt = textblock.find(b'>', npos)
        plt = textblock.find(b'<', npos)
-        if pgt < plt:
+        # if npos is inside a tag, search all the text up to the end of that
+        # tag, otherwise search only the text that precedes npos
+        if plt == npos or pgt < plt:
            npos = pgt + 1
+        textblock = textblock[0:npos]
        # find id links only inside of tags
        #    inside any < > pair find all "id=' and return whatever is inside
        #    the quotes
@ -315,12 +315,18 @@ class Mobi8Reader(object):

        # Add href and anchor info to the index entries
        for entry in index_entries:
+            pos_fid = entry['pos_fid']
+            if pos_fid is None:
                pos = entry['pos']
                fi = self.get_file_info(pos)
                if fi.filename is None:
                    raise ValueError('Index entry has invalid pos: %d'%pos)
                idtag = self.get_id_tag(pos).decode(self.header.codec)
-            entry['href'] = '%s/%s'%(fi.type, fi.filename)
+                href = '%s/%s'%(fi.type, fi.filename)
+            else:
+                href, idtag = self.get_id_tag_by_pos_fid(*pos_fid)
+
+            entry['href'] = href
            entry['idtag'] = idtag

        # Build the TOC object
@ -350,6 +356,8 @@ class Mobi8Reader(object):
                with open(href.replace('/', os.sep), 'wb') as f:
                    f.write(font['font_data'] if font['font_data'] else
                            font['raw_data'])
+                if font['encrypted']:
+                    self.encrypted_fonts.append(href)
            else:
                imgtype = imghdr.what(None, data)
                if imgtype is None:
--- a/src/calibre/ebooks/mobi/reader/ncx.py
+++ b/src/calibre/ebooks/mobi/reader/ncx.py
@ -10,7 +10,6 @@ __docformat__ = 'restructuredtext en'
 import os

 from calibre.ebooks.metadata.toc import TOC
-from calibre.ebooks.mobi.utils import to_base
 from calibre.ebooks.mobi.reader.headers import NULL_INDEX
 from calibre.ebooks.mobi.reader.index import read_index

@ -23,7 +22,30 @@ tag_fieldname_map = {
        6:  ['pos_fid',0],
        21: ['parent',0],
        22: ['child1',0],
-        23: ['childn',0]
+        23: ['childn',0],
+        69: ['image_index',0],
+        70 : ['desc_offset', 0], # 'Description offset in cncx'
+        71 : ['author_offset', 0], # 'Author offset in cncx'
+        72 : ['image_caption_offset', 0], # 'Image caption offset in cncx',
+        73 : ['image_attr_offset', 0], # 'Image attribution offset in cncx',
+
+}
+
+default_entry = {
+                    'pos':  -1,
+                    'len':  0,
+                    'noffs': -1,
+                    'text' : "Unknown Text",
+                    'hlvl' : -1,
+                    'kind' : "Unknown Class",
+                    'pos_fid' : None,
+                    'parent' : -1,
+                    'child1' : -1,
+                    'childn' : -1,
+                    'description': None,
+                    'author': None,
+                    'image_caption': None,
+                    'image_attribution': None,
 }

 def read_ncx(sections, index, codec):
@ -34,32 +56,25 @@ def read_ncx(sections, index, codec):

        for num, x in enumerate(table.iteritems()):
            text, tag_map = x
-            entry = {
-                    'name': text,
-                    'pos':  -1,
-                    'len':  0,
-                    'noffs': -1,
-                    'text' : "Unknown Text",
-                    'hlvl' : -1,
-                    'kind' : "Unknown Kind",
-                    'pos_fid' : None,
-                    'parent' : -1,
-                    'child1' : -1,
-                    'childn' : -1,
-                    'num'  : num
-            }
+            entry = default_entry.copy()
+            entry['name'] = text
+            entry['num'] = num

-            for tag in tag_fieldname_map.keys():
+            for tag in tag_fieldname_map.iterkeys():
                fieldname, i = tag_fieldname_map[tag]
                if tag in tag_map:
                    fieldvalue = tag_map[tag][i]
                    if tag == 6:
-                        fieldvalue = to_base(fieldvalue, base=32)
+                        # Appears to be an idx into the KF8 elems table with an
+                        # offset
+                        fieldvalue = tuple(tag_map[tag])
                    entry[fieldname] = fieldvalue
-                    if tag == 3:
-                        entry['text'] = cncx.get(fieldvalue, 'Unknown Text')
-                    if tag == 5:
-                        entry['kind'] = cncx.get(fieldvalue, 'Unknown Kind')
+                    for which, name in {3:'text', 5:'kind', 70:'description',
+                            71:'author', 72:'image_caption',
+                            73:'image_attribution'}.iteritems():
+                        if tag == which:
+                            entry[name] = cncx.get(fieldvalue,
+                                    default_entry[name])
            index_entries.append(entry)

    return index_entries
--- a/src/calibre/ebooks/mobi/utils.py
+++ b/src/calibre/ebooks/mobi/utils.py
@ -15,7 +15,13 @@ from calibre.ebooks import normalize

 IMAGE_MAX_SIZE = 10 * 1024 * 1024

-def decode_hex_number(raw):
+def decode_string(raw, codec='utf-8'):
+    length, = struct.unpack(b'>B', raw[0])
+    raw = raw[1:1+length]
+    consumed = length+1
+    return raw.decode(codec), consumed
+
+def decode_hex_number(raw, codec='utf-8'):
    '''
    Return a variable length number encoded using hexadecimal encoding. These
    numbers have the first byte which tells the number of bytes that follow.
@ -25,13 +31,16 @@ def decode_hex_number(raw):
    :param raw: Raw binary data as a bytestring

    :return: The number and the number of bytes from raw that the number
-    occupies
+    occupies.
    '''
-    length, = struct.unpack(b'>B', raw[0])
-    raw = raw[1:1+length]
-    consumed = length+1
+    raw, consumed = decode_string(raw, codec=codec)
    return int(raw, 16), consumed

+def encode_string(raw):
+    ans = bytearray(bytes(raw))
+    ans.insert(0, len(ans))
+    return bytes(ans)
+
 def encode_number_as_hex(num):
    '''
    Encode num as a variable length encoded hexadecimal number. Returns the
@ -44,9 +53,7 @@ def encode_number_as_hex(num):
    nlen = len(num)
    if nlen % 2 != 0:
        num = b'0'+num
-    ans = bytearray(num)
-    ans.insert(0, len(num))
-    return bytes(ans)
+    return encode_string(num)

 def encint(value, forward=True):
    '''
@ -430,7 +437,7 @@ def read_font_record(data, extent=1040): # {{{
    # The zlib compressed data begins with 2 bytes of header and
    # has 4 bytes of checksum at the end
    ans = {'raw_data':data, 'font_data':None, 'err':None, 'ext':'failed',
-            'headers':None}
+            'headers':None, 'encrypted':False}

    try:
        usize, flags, dstart, xor_len, xor_start = struct.unpack_from(
@ -453,6 +460,7 @@ def read_font_record(data, extent=1040): # {{{
            buf[n] ^= key[n%xor_len] # XOR of buf and key

        font_data = bytes(buf)
+        ans['encrypted'] = True

    if flags & 0b1:
        # ZLIB compressed data
--- a/src/calibre/ebooks/rtf/rtfml.py
+++ b/src/calibre/ebooks/rtf/rtfml.py
@ -234,6 +234,8 @@ class RTFMLizer(object):
        # Process tags that need special processing and that do not have inner
        # text. Usually these require an argument
        if tag == 'img':
+            src = elem.get('src')
+            if src:
                src = os.path.basename(elem.get('src'))
                block_start = ''
                block_end = ''
--- a/src/calibre/gui2/actions/add.py
+++ b/src/calibre/gui2/actions/add.py
@ -70,6 +70,9 @@ class AddAction(InterfaceAction):
        self.add_menu.addSeparator()
        ma('add-formats', _('Add files to selected book records'),
                triggered=self.add_formats, shortcut=_('Shift+A'))
+        self.add_menu.addSeparator()
+        ma('add-config', _('Configure the adding of books'),
+                triggered=self.add_config)

        self.qaction.triggered.connect(self.add_books)

@ -78,6 +81,11 @@ class AddAction(InterfaceAction):
        for action in list(self.add_menu.actions())[1:]:
            action.setEnabled(enabled)

+    def add_config(self):
+        self.gui.iactions['Preferences'].do_config(
+            initial_plugin=('Import/Export', 'Adding'),
+            close_after_initial=True)
+
    def add_formats(self, *args):
        if self.gui.stack.currentIndex() != 0:
            return
--- a/src/calibre/gui2/store/stores/nexto_plugin.py
+++ b/src/calibre/gui2/store/stores/nexto_plugin.py
@ -3,7 +3,7 @@
 from __future__ import (unicode_literals, division, absolute_import, print_function)

 __license__ = 'GPL 3'
-__copyright__ = '2011, Tomasz Długosz <tomek3d@gmail.com>'
+__copyright__ = '2011-2012, Tomasz Długosz <tomek3d@gmail.com>'
 __docformat__ = 'restructuredtext en'

 import re
@ -47,9 +47,12 @@ class NextoStore(BasicStoreConfig, StorePlugin):
        url = 'http://www.nexto.pl/szukaj.xml?search-clause=' + urllib.quote_plus(query) + '&scid=1015'

        br = browser()
+        offset=0

        counter = max_results
-        with closing(br.open(url, timeout=timeout)) as f:
+
+        while counter:
+            with closing(br.open(url + '&_offset=' + str(offset), timeout=timeout)) as f:
                doc = html.fromstring(f.read())
                for data in doc.xpath('//ul[@class="productslist"]/li'):
                    if counter <= 0:
@ -85,3 +88,6 @@ class NextoStore(BasicStoreConfig, StorePlugin):
                    s.formats = formats.upper().strip()

                    yield s
+                if not doc.xpath('//div[@class="listnavigator"]//a[@class="next"]'):
+                    break
+            offset+=10
--- a/src/calibre/gui2/viewer/config.ui
+++ b/src/calibre/gui2/viewer/config.ui
@ -255,7 +255,10 @@
          </widget>
         </item>
         <item row="3" column="1">
-          <widget class="QSpinBox" name="max_view_width">
+          <widget class="QSpinBox" name="max_fs_width">
+           <property name="toolTip">
+            <string>Set the maximum width that the book's text and pictures will take when in fullscreen mode. This allows you to read the book text without it becoming too wide.</string>
+           </property>
           <property name="suffix">
            <string> px</string>
           </property>
@ -270,10 +273,10 @@
         <item row="3" column="0">
          <widget class="QLabel" name="label_7">
           <property name="text">
-            <string>Maximum &amp;view width:</string>
+            <string>Maximum text width in &amp;fullscreen:</string>
           </property>
           <property name="buddy">
-            <cstring>max_view_width</cstring>
+            <cstring>max_fs_width</cstring>
           </property>
          </widget>
         </item>
@ -350,7 +353,7 @@
  <tabstop>serif_family</tabstop>
  <tabstop>sans_family</tabstop>
  <tabstop>mono_family</tabstop>
-  <tabstop>max_view_width</tabstop>
+  <tabstop>max_fs_width</tabstop>
  <tabstop>opt_remember_window_size</tabstop>
  <tabstop>buttonBox</tabstop>
 </tabstops>
--- a/src/calibre/gui2/viewer/documentview.py
+++ b/src/calibre/gui2/viewer/documentview.py
@ -12,7 +12,7 @@ from PyQt4.Qt import (QSize, QSizePolicy, QUrl, SIGNAL, Qt, QTimer,
                     QPainter, QPalette, QBrush, QFontDatabase, QDialog,
                     QColor, QPoint, QImage, QRegion, QVariant, QIcon,
                     QFont, pyqtSignature, QAction, QByteArray, QMenu,
-                     pyqtSignal, QSwipeGesture)
+                     pyqtSignal, QSwipeGesture, QApplication)
 from PyQt4.QtWebKit import QWebPage, QWebView, QWebSettings

 from calibre.utils.config import Config, StringConfig
@ -46,8 +46,10 @@ def config(defaults=None):
        help=_('Remember last used window size'))
    c.add_opt('user_css', default='',
              help=_('Set the user CSS stylesheet. This can be used to customize the look of all books.'))
-    c.add_opt('max_view_width', default=6000,
-            help=_('Maximum width of the viewer window, in pixels.'))
+    c.add_opt('max_fs_width', default=800,
+        help=_("Set the maximum width that the book's text and pictures will take"
+        " when in fullscreen mode. This allows you to read the book text"
+        " without it becoming too wide."))
    c.add_opt('fit_images', default=True,
            help=_('Resize images larger than the viewer window to fit inside it'))
    c.add_opt('hyphenate', default=False, help=_('Hyphenate text'))
@ -101,7 +103,7 @@ class ConfigDialog(QDialog, Ui_Dialog):
        self.standard_font.setCurrentIndex({'serif':0, 'sans':1, 'mono':2}[opts.standard_font])
        self.css.setPlainText(opts.user_css)
        self.css.setToolTip(_('Set the user CSS stylesheet. This can be used to customize the look of all books.'))
-        self.max_view_width.setValue(opts.max_view_width)
+        self.max_fs_width.setValue(opts.max_fs_width)
        with zipfile.ZipFile(P('viewer/hyphenate/patterns.zip',
            allow_user_override=False), 'r') as zf:
            pats = [x.split('.')[0].replace('-', '_') for x in zf.namelist()]
@ -144,7 +146,7 @@ class ConfigDialog(QDialog, Ui_Dialog):
        c.set('user_css', unicode(self.css.toPlainText()))
        c.set('remember_window_size', self.opt_remember_window_size.isChecked())
        c.set('fit_images', self.opt_fit_images.isChecked())
-        c.set('max_view_width', int(self.max_view_width.value()))
+        c.set('max_fs_width', int(self.max_fs_width.value()))
        c.set('hyphenate', self.hyphenate.isChecked())
        c.set('remember_current_page', self.opt_remember_current_page.isChecked())
        c.set('wheel_flips_pages', self.opt_wheel_flips_pages.isChecked())
@ -192,6 +194,8 @@ class Document(QWebPage): # {{{
        self.loaded_javascript = False
        self.js_loader = JavaScriptLoader(
                    dynamic_coffeescript=self.debug_javascript)
+        self.initial_left_margin = self.initial_right_margin = u''
+        self.in_fullscreen_mode = False

        self.setLinkDelegationPolicy(self.DelegateAllLinks)
        self.scroll_marks = []
@ -239,6 +243,9 @@ class Document(QWebPage): # {{{
        self.enable_page_flip = self.page_flip_duration > 0.1
        self.font_magnification_step = opts.font_magnification_step
        self.wheel_flips_pages = opts.wheel_flips_pages
+        screen_width = QApplication.desktop().screenGeometry().width()
+        # Leave some space for the scrollbar and some border
+        self.max_fs_width = min(opts.max_fs_width, screen_width-50)

    def fit_images(self):
        if self.do_fit_images:
@ -274,6 +281,30 @@ class Document(QWebPage): # {{{
        self.set_bottom_padding(0)
        self.fit_images()
        self.init_hyphenate()
+        self.initial_left_margin = unicode(self.javascript(
+                        'document.body.style.marginLeft').toString())
+        self.initial_right_margin = unicode(self.javascript(
+                        'document.body.style.marginRight').toString())
+        if self.in_fullscreen_mode:
+            self.switch_to_fullscreen_mode()
+
+    def switch_to_fullscreen_mode(self):
+        self.in_fullscreen_mode = True
+        self.javascript('''
+                var s = document.body.style;
+                s.maxWidth = "%dpx";
+                s.marginLeft = "auto";
+                s.marginRight = "auto";
+            '''%self.max_fs_width)
+
+    def switch_to_window_mode(self):
+        self.in_fullscreen_mode = False
+        self.javascript('''
+                var s = document.body.style;
+                s.maxWidth = "none";
+                s.marginLeft = "%s";
+                s.marginRight = "%s";
+            '''%(self.initial_left_margin, self.initial_right_margin))

    @pyqtSignature("QString")
    def debug(self, msg):
@ -581,8 +612,8 @@ class DocumentView(QWebView): # {{{

    def config(self, parent=None):
        self.document.do_config(parent)
-        if self.manager is not None:
-            self.manager.set_max_width()
+        if self.document.in_fullscreen_mode:
+            self.document.switch_to_fullscreen_mode()
        self.setFocus(Qt.OtherFocusReason)

    def bookmark(self):
@ -602,6 +633,9 @@ class DocumentView(QWebView): # {{{
            menu.insertAction(list(menu.actions())[0], self.search_action)
        menu.addSeparator()
        menu.addAction(self.goto_location_action)
+        if self.document.in_fullscreen_mode and self.manager is not None:
+            menu.addSeparator()
+            menu.addAction(self.manager.toggle_toolbar_action)
        menu.exec_(ev.globalPos())

    def lookup(self, *args):
--- a/src/calibre/gui2/viewer/main.py
+++ b/src/calibre/gui2/viewer/main.py
@ -5,11 +5,11 @@ import traceback, os, sys, functools, collections, re
 from functools import partial
 from threading import Thread

-from PyQt4.Qt import QApplication, Qt, QIcon, QTimer, SIGNAL, QByteArray, \
-                     QDoubleSpinBox, QLabel, QTextBrowser, \
-                     QPainter, QBrush, QColor, QStandardItemModel, QPalette, \
-                     QStandardItem, QUrl, QRegExpValidator, QRegExp, QLineEdit, \
-                     QToolButton, QMenu, QInputDialog, QAction, QKeySequence
+from PyQt4.Qt import (QApplication, Qt, QIcon, QTimer, SIGNAL, QByteArray,
+        QSize, QDoubleSpinBox, QLabel, QTextBrowser, QPropertyAnimation,
+        QPainter, QBrush, QColor, QStandardItemModel, QPalette, QStandardItem,
+        QUrl, QRegExpValidator, QRegExp, QLineEdit, QToolButton, QMenu,
+        QInputDialog, QAction, QKeySequence)

 from calibre.gui2.viewer.main_ui import Ui_EbookViewer
 from calibre.gui2.viewer.printing import Printing
@ -55,8 +55,6 @@ class TOC(QStandardItemModel):
            self.appendRow(TOCItem(t))
        self.setHorizontalHeaderItem(0, QStandardItem(_('Table of Contents')))

-
-
 class Worker(Thread):

    def run(self):
@ -292,6 +290,37 @@ class EbookViewer(MainWindow, Ui_EbookViewer):
        self.tool_bar2.setContextMenuPolicy(Qt.PreventContextMenu)
        self.tool_bar.widgetForAction(self.action_bookmark).setPopupMode(QToolButton.MenuButtonPopup)
        self.action_full_screen.setCheckable(True)
+        self.full_screen_label = QLabel('''
+                <center>
+                <h1>%s</h1>
+                <h3>%s</h3>
+                <h3>%s</h3>
+                </center>
+                '''%(_('Full screen mode'),
+                    _('Right click to show controls'),
+                    _('Press Esc to quit')),
+                    self)
+        self.full_screen_label.setVisible(False)
+        self.full_screen_label.setStyleSheet('''
+        QLabel {
+            text-align: center;
+            background-color: white;
+            color: black;
+            border-width: 1px;
+            border-style: solid;
+            border-radius: 20px;
+        }
+        ''')
+        self.toggle_toolbar_action = QAction(_('Show/hide controls'), self)
+        self.toggle_toolbar_action.triggered.connect(self.toggle_toolbars)
+        self.addAction(self.toggle_toolbar_action)
+        self.full_screen_label_anim = QPropertyAnimation(
+                self.full_screen_label, 'size')
+        self.esc_full_screen_action = a = QAction(self)
+        self.addAction(a)
+        a.setShortcut(Qt.Key_Escape)
+        a.setEnabled(False)
+        a.triggered.connect(self.action_full_screen.trigger)

        self.print_menu = QMenu()
        self.print_menu.addAction(QIcon(I('print-preview.png')), _('Print Preview'))
@ -299,7 +328,6 @@ class EbookViewer(MainWindow, Ui_EbookViewer):
        self.tool_bar.widgetForAction(self.action_print).setPopupMode(QToolButton.MenuButtonPopup)
        self.connect(self.action_print, SIGNAL("triggered(bool)"), partial(self.print_book, preview=False))
        self.connect(self.print_menu.actions()[0], SIGNAL("triggered(bool)"), partial(self.print_book, preview=True))
-        self.set_max_width()
        ca = self.view.copy_action
        ca.setShortcut(QKeySequence.Copy)
        self.addAction(ca)
@ -313,6 +341,13 @@ class EbookViewer(MainWindow, Ui_EbookViewer):
        w = self.tool_bar.widgetForAction(self.action_open_ebook)
        w.setPopupMode(QToolButton.MenuButtonPopup)

+        for x in ('tool_bar', 'tool_bar2'):
+            x = getattr(self, x)
+            for action in x.actions():
+                # So that the keyboard shortcuts for these actions will
+                # continue to function even when the toolbars are hidden
+                self.addAction(action)
+
        self.restore_state()

    def set_toc_visible(self, yes):
@ -338,9 +373,18 @@ class EbookViewer(MainWindow, Ui_EbookViewer):
                count += 1

    def closeEvent(self, e):
+        if self.isFullScreen():
+            self.action_full_screen.trigger()
+            e.ignore()
+            return
        self.save_state()
        return MainWindow.closeEvent(self, e)

+    def toggle_toolbars(self):
+        for x in ('tool_bar', 'tool_bar2'):
+            x = getattr(self, x)
+            x.setVisible(not x.isVisible())
+
    def save_state(self):
        state = bytearray(self.saveState(self.STATE_VERSION))
        vprefs['viewer_toolbar_state'] = state
@ -382,11 +426,6 @@ class EbookViewer(MainWindow, Ui_EbookViewer):
        self._lookup = None
        self.dictionary_view.setHtml(html)

-    def set_max_width(self):
-        from calibre.gui2.viewer.documentview import config
-        c = config().parse()
-        self.frame.setMaximumWidth(c.max_view_width)
-
    def get_remember_current_page_opt(self):
        from calibre.gui2.viewer.documentview import config
        c = config().parse()
@ -401,6 +440,46 @@ class EbookViewer(MainWindow, Ui_EbookViewer):
        else:
            self.showFullScreen()

+    def showFullScreen(self):
+        self.tool_bar.setVisible(False)
+        self.tool_bar2.setVisible(False)
+        self._original_frame_margins = (
+            self.centralwidget.layout().contentsMargins(),
+            self.frame.layout().contentsMargins())
+        self.frame.layout().setContentsMargins(0, 0, 0, 0)
+        self.centralwidget.layout().setContentsMargins(0, 0, 0, 0)
+
+        super(EbookViewer, self).showFullScreen()
+        QTimer.singleShot(10, self.show_full_screen_label)
+
+    def show_full_screen_label(self):
+        f = self.full_screen_label
+        self.esc_full_screen_action.setEnabled(True)
+        f.setVisible(True)
+        height = 200
+        width = int(0.7*self.view.width())
+        f.resize(width, height)
+        f.move((self.view.width() - width)//2, (self.view.height()-height)//2)
+        a = self.full_screen_label_anim
+        a.setDuration(500)
+        a.setStartValue(QSize(width, 0))
+        a.setEndValue(QSize(width, height))
+        a.start()
+        QTimer.singleShot(2750, self.full_screen_label.hide)
+        self.view.document.switch_to_fullscreen_mode()
+
+    def showNormal(self):
+        self.esc_full_screen_action.setEnabled(False)
+        self.tool_bar.setVisible(True)
+        self.tool_bar2.setVisible(True)
+        self.full_screen_label.setVisible(False)
+        if hasattr(self, '_original_frame_margins'):
+            om = self._original_frame_margins
+            self.centralwidget.layout().setContentsMargins(om[0])
+            self.frame.layout().setContentsMargins(om[1])
+        super(EbookViewer, self).showNormal()
+        self.view.document.switch_to_window_mode()
+
    def goto(self, ref):
        if ref:
            tokens = ref.split('.')
--- a/src/calibre/gui2/viewer/main.ui
+++ b/src/calibre/gui2/viewer/main.ui
@ -284,6 +284,9 @@
   <property name="text">
    <string>Toggle full screen</string>
   </property>
+   <property name="toolTip">
+    <string>Toggle full screen (F11)</string>
+   </property>
  </action>
  <action name="action_print">
   <property name="icon">
--- a/src/calibre/gui2/widgets.py
+++ b/src/calibre/gui2/widgets.py
@ -15,6 +15,7 @@ from PyQt4.Qt import (QIcon, QFont, QLabel, QListWidget, QAction,
                        QMenu, QStringListModel, QCompleter, QStringList,
                        QTimer, QRect, QFontDatabase, QGraphicsView)

+from calibre.constants import iswindows
 from calibre.gui2 import (NONE, error_dialog, pixmap_to_data, gprefs,
        warning_dialog)
 from calibre.gui2.filename_pattern_ui import Ui_Form
@ -365,7 +366,7 @@ class FontFamilyModel(QAbstractListModel): # {{{
        self.families = list(qt_families.intersection(set(self.families)))
        self.families.sort()
        self.families[:0] = [_('None')]
-        self.font = QFont('sansserif')
+        self.font = QFont('Verdana' if iswindows else 'sansserif')

    def rowCount(self, *args):
        return len(self.families)
--- a/src/calibre/utils/smartypants.py
+++ b/src/calibre/utils/smartypants.py
@ -591,6 +591,21 @@ def educateQuotes(str):
 	str = re.sub(r'''""''',       """&#8221;&#8221;""", str)
 	str = re.sub(r"""''""",       """&#8217;&#8217;""", str)

+	# Special case for quotes inside of other entities, e.g.:
+	#   <p>A double quote--"within dashes"--would be nice.</p>
+	str = re.sub(r"""(?<=\W)"(?=\w)""", r"""&#8220;""", str)
+	str = re.sub(r"""(?<=\W)'(?=\w)""", r"""&#8216;""", str)
+	str = re.sub(r"""(?<=\w)"(?=\W)""", r"""&#8221;""", str)
+	str = re.sub(r"""(?<=\w)'(?=\W)""", r"""&#8217;""", str)
+	
+	# Special case for Quotes at end of line with a preceding space (may change just to end of line)
+	str = re.sub(r"""(?<=\s)"$""", r"""&#8221;""", str)
+	str = re.sub(r"""(?<=\s)'$""", r"""&#8217;""", str)
+	
+	# Special case for Quotes at beginning of line with a space - multiparagraph quoted text:
+	str = re.sub(r"""^"(?=\s)""", r"""&#8220;""", str)
+	str = re.sub(r"""^'(?=\s)""", r"""&#8216;""", str)
+
 	# Special case for decade abbreviations (the '80s):
 	str = re.sub(r"""\b'(?=\d{2}s)""", r"""&#8217;""", str)