Support for reading KF8

Kovid Goyal 2012-03-09 21:30:24 +05:30
parent 93bf57e6c4
commit d1b6bb705d
13 changed files with 1426 additions and 293 deletions

View File

@@ -263,7 +263,7 @@ class MOBIMetadataReader(MetadataReaderPlugin):
description = _('Read metadata from %s files')%'MOBI'
def get_metadata(self, stream, ftype):
from calibre.ebooks.mobi.reader import get_metadata
from calibre.ebooks.metadata.mobi import get_metadata
return get_metadata(stream)
class ODTMetadataReader(MetadataReaderPlugin):

View File

@@ -10,7 +10,7 @@ Generates and writes an APNX page mapping file.
import struct
from calibre.ebooks.mobi.reader import MobiReader
from calibre.ebooks.mobi.reader.mobi6 import MobiReader
from calibre.ebooks.pdb.header import PdbHeaderReader
from calibre.utils.logging import default_log

View File

@@ -3,7 +3,10 @@ __license__ = 'GPL 3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os
from calibre.customize.conversion import InputFormatPlugin
from calibre.ptempfile import PersistentTemporaryDirectory
class MOBIInput(InputFormatPlugin):
@@ -14,17 +17,43 @@ class MOBIInput(InputFormatPlugin):
def convert(self, stream, options, file_ext, log,
accelerators):
from calibre.ebooks.mobi.reader import MobiReader
if os.environ.get('USE_MOBIUNPACK', None) is not None:
try:
from mobiunpack.mobi_unpack import Mobi8Reader
from calibre.customize.ui import plugin_for_input_format
wdir = PersistentTemporaryDirectory('_unpack_space')
m8r = Mobi8Reader(stream, wdir)
if m8r.isK8():
epub_path = m8r.processMobi8()
epub_input = plugin_for_input_format('epub')
for opt in epub_input.options:
setattr(options, opt.option.name, opt.recommended_value)
options.input_encoding = m8r.getCodec()
return epub_input.convert(open(epub_path,'rb'), options,
'epub', log, accelerators)
except Exception:
log.exception('mobi_unpack code not working')
from calibre.ebooks.mobi.reader.mobi6 import MobiReader
from lxml import html
parse_cache = {}
try:
mr = MobiReader(stream, log, options.input_encoding,
options.debug_pipeline)
mr.extract_content(u'.', parse_cache)
if mr.kf8_type is None:
mr.extract_content(u'.', parse_cache)
except:
mr = MobiReader(stream, log, options.input_encoding,
options.debug_pipeline, try_extra_data_fix=True)
mr.extract_content(u'.', parse_cache)
if mr.kf8_type is None:
mr.extract_content(u'.', parse_cache)
if mr.kf8_type is not None:
from calibre.ebooks.mobi.reader.mobi8 import Mobi8Reader
return os.path.abspath(Mobi8Reader(mr, log)())
raw = parse_cache.pop('calibre_raw_mobi_markup', False)
if raw:

View File

@@ -9,6 +9,7 @@ __copyright__ = '2009, Kovid Goyal kovid@kovidgoyal.net and ' \
'Marshall T. Vandegrift <llasram@gmail.com>'
__docformat__ = 'restructuredtext en'
import os, cStringIO
from struct import pack, unpack
from cStringIO import StringIO
@@ -433,3 +434,75 @@ def set_metadata(stream, mi):
mu = MetadataUpdater(stream)
mu.update(mi)
return
def get_metadata(stream):
from calibre.ebooks.metadata import MetaInformation
from calibre.ptempfile import TemporaryDirectory
from calibre.ebooks.mobi.reader.headers import MetadataHeader
from calibre.ebooks.mobi.reader.mobi6 import MobiReader
from calibre import CurrentDir
try:
from PIL import Image as PILImage
PILImage
except ImportError:
import Image as PILImage
stream.seek(0)
try:
raw = stream.read(3)
except:
raw = ''
stream.seek(0)
if raw == b'TPZ':
from calibre.ebooks.metadata.topaz import get_metadata
return get_metadata(stream)
from calibre.utils.logging import Log
log = Log()
try:
mi = MetaInformation(os.path.basename(stream.name), [_('Unknown')])
except:
mi = MetaInformation(_('Unknown'), [_('Unknown')])
mh = MetadataHeader(stream, log)
if mh.title and mh.title != _('Unknown'):
mi.title = mh.title
if mh.exth is not None:
if mh.exth.mi is not None:
mi = mh.exth.mi
else:
size = 1024**3
if hasattr(stream, 'seek') and hasattr(stream, 'tell'):
pos = stream.tell()
stream.seek(0, 2)
size = stream.tell()
stream.seek(pos)
if size < 4*1024*1024:
with TemporaryDirectory('_mobi_meta_reader') as tdir:
with CurrentDir(tdir):
mr = MobiReader(stream, log)
parse_cache = {}
mr.extract_content(tdir, parse_cache)
if mr.embedded_mi is not None:
mi = mr.embedded_mi
if hasattr(mh.exth, 'cover_offset'):
cover_index = mh.first_image_index + mh.exth.cover_offset
data = mh.section_data(int(cover_index))
else:
try:
data = mh.section_data(mh.first_image_index)
except:
data = ''
buf = cStringIO.StringIO(data)
try:
im = PILImage.open(buf)
except:
log.exception('Failed to read MOBI cover')
else:
obuf = cStringIO.StringIO()
im.convert('RGB').save(obuf, format='JPEG')
mi.cover_data = ('jpg', obuf.getvalue())
return mi
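For reference, a hedged usage sketch of the relocated get_metadata() ('book.mobi' is a placeholder path, not from this commit):

from calibre.ebooks.metadata.mobi import get_metadata

with open('book.mobi', 'rb') as stream:
    mi = get_metadata(stream)
    print(mi.title, mi.authors)  # a found cover lands in mi.cover_data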

View File

@@ -46,7 +46,7 @@ class TOC(list):
self.toc_thumbnail = toc_thumbnail
def __str__(self):
lines = ['TOC: %s#%s'%(self.href, self.fragment)]
lines = ['TOC: %s#%s %s'%(self.href, self.fragment, self.text)]
for child in self:
c = str(child).splitlines()
for l in c:

View File

@@ -0,0 +1,11 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

View File

@@ -0,0 +1,258 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (absolute_import, print_function)
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import struct, re, os
from calibre import replace_entities
from calibre.utils.date import parse_date
from calibre.ebooks.mobi import MobiError
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.mobi.langcodes import main_language, sub_language, mobi2iana
NULL_INDEX = 0xffffffff
class EXTHHeader(object): # {{{
def __init__(self, raw, codec, title):
self.doctype = raw[:4]
self.length, self.num_items = struct.unpack('>LL', raw[4:12])
raw = raw[12:]
pos = 0
self.mi = MetaInformation(_('Unknown'), [_('Unknown')])
self.has_fake_cover = True
self.start_offset = None
left = self.num_items
while left > 0:
left -= 1
id, size = struct.unpack('>LL', raw[pos:pos + 8])
content = raw[pos + 8:pos + size]
pos += size
if id >= 100 and id < 200:
self.process_metadata(id, content, codec)
elif id == 203:
self.has_fake_cover = bool(struct.unpack('>L', content)[0])
elif id == 201:
co, = struct.unpack('>L', content)
if co < NULL_INDEX:
self.cover_offset = co
elif id == 202:
self.thumbnail_offset, = struct.unpack('>L', content)
elif id == 501:
# cdetype
pass
elif id == 502:
# last update time
pass
elif id == 503: # Long title
# Amazon seems to regard this as the definitive book title
# rather than the title from the PDB header. In fact, when
# sending MOBI files through Amazon's email service, titles
# containing non-ASCII or non-filename-safe characters get
# mangled in the PDB header
try:
title = content.decode(codec)
except:
pass
#else:
# print 'unknown record', id, repr(content)
if title:
self.mi.title = replace_entities(title)
def process_metadata(self, id, content, codec):
if id == 100:
if self.mi.authors == [_('Unknown')]:
self.mi.authors = []
au = content.decode(codec, 'ignore').strip()
self.mi.authors.append(au)
if re.match(r'\S+?\s*,\s+\S+', au.strip()):
self.mi.author_sort = au.strip()
elif id == 101:
self.mi.publisher = content.decode(codec, 'ignore').strip()
elif id == 103:
self.mi.comments = content.decode(codec, 'ignore')
elif id == 104:
self.mi.isbn = content.decode(codec, 'ignore').strip().replace('-', '')
elif id == 105:
if not self.mi.tags:
self.mi.tags = []
self.mi.tags.extend([x.strip() for x in content.decode(codec,
'ignore').split(';')])
self.mi.tags = list(set(self.mi.tags))
elif id == 106:
try:
self.mi.pubdate = parse_date(content, as_utc=False)
except:
pass
elif id == 108:
pass # Producer
elif id == 113:
pass # ASIN or UUID
elif id == 116:
self.start_offset, = struct.unpack(b'>L', content)
#else:
# print 'unhandled metadata record', id, repr(content)
# }}}
class BookHeader(object):
def __init__(self, raw, ident, user_encoding, log, try_extra_data_fix=False):
self.log = log
self.compression_type = raw[:2]
self.records, self.records_size = struct.unpack('>HH', raw[8:12])
self.encryption_type, = struct.unpack('>H', raw[12:14])
if ident == 'TEXTREAD':
self.codepage = 1252
if len(raw) <= 16:
self.codec = 'cp1252'
self.extra_flags = 0
self.title = _('Unknown')
self.language = 'ENGLISH'
self.sublanguage = 'NEUTRAL'
self.exth_flag, self.exth = 0, None
self.ancient = True
self.first_image_index = -1
self.mobi_version = 1
else:
self.ancient = False
self.doctype = raw[16:20]
self.length, self.type, self.codepage, self.unique_id, \
self.version = struct.unpack('>LLLLL', raw[20:40])
try:
self.codec = {
1252: 'cp1252',
65001: 'utf-8',
}[self.codepage]
except (IndexError, KeyError):
self.codec = 'cp1252' if not user_encoding else user_encoding
log.warn('Unknown codepage %d. Assuming %s' % (self.codepage,
self.codec))
# There exists some broken DRM removal tool that removes DRM but
# leaves the DRM fields in the header yielding a header size of
# 0xF8. The actual value of max_header_length should be 0xE8 but
# it's changed to accommodate this silly tool. Hopefully that will
# not break anything else.
max_header_length = 0xF8
if (ident == 'TEXTREAD' or self.length < 0xE4 or
self.length > max_header_length or
(try_extra_data_fix and self.length == 0xE4)):
self.extra_flags = 0
else:
self.extra_flags, = struct.unpack('>H', raw[0xF2:0xF4])
if self.compression_type == 'DH':
self.huff_offset, self.huff_number = struct.unpack('>LL',
raw[0x70:0x78])
toff, tlen = struct.unpack('>II', raw[0x54:0x5c])
tend = toff + tlen
self.title = raw[toff:tend] if tend < len(raw) else _('Unknown')
langcode = struct.unpack('!L', raw[0x5C:0x60])[0]
langid = langcode & 0xFF
sublangid = (langcode >> 10) & 0xFF
self.language = main_language.get(langid, 'ENGLISH')
self.sublanguage = sub_language.get(sublangid, 'NEUTRAL')
self.mobi_version = struct.unpack('>I', raw[0x68:0x6c])[0]
self.first_image_index = struct.unpack('>L', raw[0x6c:0x6c + 4])[0]
self.exth_flag, = struct.unpack('>L', raw[0x80:0x84])
self.exth = None
if not isinstance(self.title, unicode):
self.title = self.title.decode(self.codec, 'replace')
if self.exth_flag & 0x40:
try:
self.exth = EXTHHeader(raw[16 + self.length:], self.codec,
self.title)
self.exth.mi.uid = self.unique_id
try:
self.exth.mi.language = mobi2iana(langid, sublangid)
except:
self.log.exception('Unknown language code')
except:
self.log.exception('Invalid EXTH header')
self.exth_flag = 0
self.ncxidx = NULL_INDEX
if len(raw) >= 0xF8:
self.ncxidx, = struct.unpack_from(b'>L', raw, 0xF4)
if self.mobi_version >= 8:
self.skelidx, = struct.unpack_from('>L', raw, 0xFC)
# Index into <div> sections in raw_ml
self.dividx, = struct.unpack_from('>L', raw, 0xF8)
# Index into Other files
self.othidx, = struct.unpack_from('>L', raw, 0x104)
# need to use the FDST record to find out how to properly
# unpack the raw_ml into pieces; it is simply a table of start
# and end locations for each flow piece
self.fdstidx, = struct.unpack_from('>L', raw, 0xC0)
self.fdstcnt, = struct.unpack_from('>L', raw, 0xC4)
# if cnt is 1 or less, fdst section number can be garbage
if self.fdstcnt <= 1:
self.fdstidx = NULL_INDEX
else: # Null values
self.skelidx = self.dividx = self.othidx = self.fdstidx = \
NULL_INDEX
class MetadataHeader(BookHeader):
def __init__(self, stream, log):
self.stream = stream
self.ident = self.identity()
self.num_sections = self.section_count()
if self.num_sections >= 2:
header = self.header()
BookHeader.__init__(self, header, self.ident, None, log)
else:
self.exth = None
def identity(self):
self.stream.seek(60)
ident = self.stream.read(8).upper()
if ident not in ['BOOKMOBI', 'TEXTREAD']:
raise MobiError('Unknown book type: %s' % ident)
return ident
def section_count(self):
self.stream.seek(76)
return struct.unpack('>H', self.stream.read(2))[0]
def section_offset(self, number):
self.stream.seek(78 + number * 8)
return struct.unpack('>LBBBB', self.stream.read(8))[0]
def header(self):
section_headers = []
# First section with the metadata
section_headers.append(self.section_offset(0))
# Second section used to get the length of the first
section_headers.append(self.section_offset(1))
end_off = section_headers[1]
off = section_headers[0]
self.stream.seek(off)
return self.stream.read(end_off - off)
def section_data(self, number):
start = self.section_offset(number)
if number == self.num_sections -1:
end = os.stat(self.stream.name).st_size
else:
end = self.section_offset(number + 1)
self.stream.seek(start)
try:
return self.stream.read(end - start)
except OverflowError:
self.stream.seek(start)
return self.stream.read()
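The EXTH walk in EXTHHeader.__init__ is the fiddly part of this file; reduced to a self-contained sketch (the helper name is mine), the layout is exactly as parsed above: b'EXTH', a length, a record count, then (id, size, payload) records where size includes the 8-byte record header:

import struct

def iter_exth_records(raw):
    # raw starts at the EXTH header, i.e. at offset 16 + length in the
    # MOBI header, exactly as EXTHHeader receives it above.
    num_items, = struct.unpack('>L', raw[8:12])
    raw, pos = raw[12:], 0
    for _ in range(num_items):
        rec_id, size = struct.unpack('>LL', raw[pos:pos + 8])
        yield rec_id, raw[pos + 8:pos + size]
        pos += size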

View File

@@ -0,0 +1,195 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import struct
from collections import OrderedDict
from calibre.ebooks.mobi.utils import decint, count_set_bits
class InvalidFile(ValueError):
pass
def check_signature(data, signature):
if data[:len(signature)] != signature:
raise InvalidFile('Not a valid %r section'%signature)
class NotAnINDXRecord(InvalidFile):
pass
class NotATAGXSection(InvalidFile):
pass
def format_bytes(byts):
byts = bytearray(byts)
byts = [hex(b)[2:] for b in byts]
return ' '.join(byts)
def parse_indx_header(data):
check_signature(data, b'INDX')
words = (
'len', 'nul1', 'type', 'gen', 'start', 'count', 'code',
'lng', 'total', 'ordt', 'ligt', 'nligt', 'ncncx'
)
num = len(words)
values = struct.unpack(b'>%dL' % num, data[4:4*(num+1)])
header = {words[i]:values[i] for i in xrange(num)}
return header
class CNCX(object): # {{{
'''
Parses the records that contain the compiled NCX (all strings from the
NCX). Presents a simple offset : string mapping interface to access the
data.
'''
def __init__(self, records, codec):
self.records = OrderedDict()
record_offset = 0
for raw in records:
pos = 0
while pos < len(raw):
length, consumed = decint(raw[pos:])
if length > 0:
try:
self.records[pos+record_offset] = raw[
pos+consumed:pos+consumed+length].decode(codec)
except:
byts = raw[pos:]
r = format_bytes(byts)
print ('CNCX entry at offset %d has unknown format %s'%(
pos+record_offset, r))
self.records[pos+record_offset] = r
pos = len(raw)
pos += consumed+length
record_offset += 0x10000
def __getitem__(self, offset):
return self.records.get(offset)
def get(self, offset, default=None):
return self.records.get(offset, default)
# }}}
def parse_tag_section(data):
check_signature(data, b'TAGX')
tags = []
first_entry_offset, = struct.unpack_from(b'>L', data, 0x04)
control_byte_count, = struct.unpack_from(b'>L', data, 0x08)
# Skip the first 12 bytes already read above.
for i in xrange(12, first_entry_offset, 4):
pos = i
tags.append((ord(data[pos]), ord(data[pos+1]), ord(data[pos+2]),
ord(data[pos+3])))
return control_byte_count, tags
def get_tag_map(control_byte_count, tags, data, start, end):
ptags = []
ans = {}
control_byte_index = 0
data_start = start + control_byte_count
for tag, values_per_entry, mask, end_flag in tags:
if end_flag == 0x01:
control_byte_index += 1
continue
value = ord(data[start + control_byte_index]) & mask
if value != 0:
if value == mask:
if count_set_bits(mask) > 1:
# If all bits of masked value are set and the mask has more than one bit, a variable width value
# will follow after the control bytes which defines the length of bytes (NOT the value count!)
# which will contain the corresponding variable width values.
value, consumed = decint(data[data_start:])
data_start += consumed
ptags.append((tag, None, value, values_per_entry))
else:
ptags.append((tag, 1, None, values_per_entry))
else:
# Shift bits to get the masked value.
while mask & 0x01 == 0:
mask = mask >> 1
value = value >> 1
ptags.append((tag, value, None, values_per_entry))
for tag, value_count, value_bytes, values_per_entry in ptags:
values = []
if value_count is not None:
# Read value_count * values_per_entry variable width values.
for _ in xrange(value_count*values_per_entry):
byts, consumed = decint(data[data_start:])
data_start += consumed
values.append(byts)
else:
# Convert value_bytes to variable width values.
total_consumed = 0
while total_consumed < value_bytes:
# Does this work for values_per_entry != 1?
byts, consumed = decint(data[data_start:])
data_start += consumed
total_consumed += consumed
values.append(byts)
if total_consumed != value_bytes:
print ("Error: Should consume %s bytes, but consumed %s" %
(value_bytes, total_consumed))
ans[tag] = values
# Test that all bytes have been processed if end is given.
if end is not None and data_start < end:
# The last entry might have some zero padding bytes, so complain only if non-zero bytes are left.
rest = data[data_start:end]
if rest.replace(b'\0', b''):
print ("Warning: There are unprocessed index bytes left: %s" %
format_bytes(rest))
return ans
def read_index(sections, idx, codec):
table, cncx = OrderedDict(), CNCX([], codec)
data = sections[idx][0]
indx_header = parse_indx_header(data)
indx_count = indx_header['count']
if indx_header['ncncx'] > 0:
off = idx + indx_count + 1
cncx_records = [x[0] for x in sections[off:off+indx_header['ncncx']]]
cncx = CNCX(cncx_records, codec)
tag_section_start = indx_header['len']
control_byte_count, tags = parse_tag_section(data[tag_section_start:])
for i in xrange(idx + 1, idx + 1 + indx_count):
data = sections[i][0]
header = parse_indx_header(data)
idxt_pos = header['start']
entry_count = header['count']
# loop through to build up the IDXT position starts
idx_positions= []
for j in xrange(entry_count):
pos, = struct.unpack_from(b'>H', data, idxt_pos + 4 + (2 * j))
idx_positions.append(pos)
# The last entry ends before the IDXT tag (but there might be zero fill
# bytes we need to ignore!)
idx_positions.append(idxt_pos)
# For each entry in the IDXT build up the tag map and any associated
# text
for j in xrange(entry_count):
start, end = idx_positions[j:j+2]
text_length = ord(data[start])
text = data[start+1:start+1+text_length]
tag_map = get_tag_map(control_byte_count, tags, data,
start+1+text_length, end)
table[text] = tag_map
return table, cncx
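Everything here leans on decint() from calibre.ebooks.mobi.utils for MOBI's variable-width integers. A sketch of the forward variant these callers rely on, under the assumption (not shown in this commit) that values are big-endian, 7 bits per byte, with the high bit marking the final byte:

def decint(raw):
    # Returns (value, bytes_consumed), the shape the callers above expect.
    val = 0
    for i, b in enumerate(bytearray(raw)):
        val = (val << 7) | (b & 0x7f)
        if b & 0x80:
            return val, i + 1
    raise ValueError('no terminating byte in variable-width integer')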

View File

@@ -0,0 +1,307 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import re, os
def update_internal_links(mobi8_reader):
# need to update all internal links that are based on positions
# within the xhtml files **BEFORE** cutting and pasting any
# pieces into the xhtml text files
# kindle:pos:fid:XXXX:off:YYYYYYYYYY (used for internal links within xhtml)
# XXXX is the offset in records into the divtbl
# YYYYYYYYYY is a base32 number you add to the divtbl insertpos
# to get the final position
mr = mobi8_reader
# pos:fid pattern
posfid_pattern = re.compile(br'''(<a.*?href=.*?>)''', re.IGNORECASE)
posfid_index_pattern = re.compile(br'''['"]kindle:pos:fid:([0-9|A-V]+):off:([0-9|A-V]+).*?["']''')
parts = []
for part in mr.parts:
srcpieces = posfid_pattern.split(part)
for j in xrange(1, len(srcpieces), 2):
tag = srcpieces[j]
if tag.startswith(b'<'):
for m in posfid_index_pattern.finditer(tag):
posfid = m.group(1)
offset = m.group(2)
filename, idtag = mr.get_id_tag_by_pos_fid(posfid, offset)
suffix = (b'#' + idtag) if idtag else b''
replacement = filename.encode(mr.header.codec) + suffix
tag = posfid_index_pattern.sub(replacement, tag, 1)
srcpieces[j] = tag
part = ''.join([x.decode(mr.header.codec) for x in srcpieces])
parts.append(part)
# All parts are now unicode and have no internal links
return parts
def remove_kindlegen_markup(parts):
# we can safely remove all of the Kindlegen generated aid tags
find_tag_with_aid_pattern = re.compile(r'''(<[^>]*\said\s*=[^>]*>)''',
re.IGNORECASE)
within_tag_aid_position_pattern = re.compile(r'''\said\s*=['"][^'"]*['"]''')
for i in xrange(len(parts)):
part = parts[i]
srcpieces = find_tag_with_aid_pattern.split(part)
for j in range(len(srcpieces)):
tag = srcpieces[j]
if tag.startswith('<'):
for m in within_tag_aid_position_pattern.finditer(tag):
replacement = ''
tag = within_tag_aid_position_pattern.sub(replacement, tag,
1)
srcpieces[j] = tag
part = "".join(srcpieces)
parts[i] = part
# we can safely remove all of the Kindlegen generated data-AmznPageBreak tags
find_tag_with_AmznPageBreak_pattern = re.compile(
r'''(<[^>]*\sdata-AmznPageBreak=[^>]*>)''', re.IGNORECASE)
within_tag_AmznPageBreak_position_pattern = re.compile(
r'''\sdata-AmznPageBreak=['"][^'"]*['"]''')
for i in xrange(len(parts)):
part = parts[i]
srcpieces = find_tag_with_AmznPageBreak_pattern.split(part)
for j in range(len(srcpieces)):
tag = srcpieces[j]
if tag.startswith('<'):
for m in within_tag_AmznPageBreak_position_pattern.finditer(tag):
replacement = ''
tag = within_tag_AmznPageBreak_position_pattern.sub(replacement, tag, 1)
srcpieces[j] = tag
part = "".join(srcpieces)
parts[i] = part
def update_flow_links(mobi8_reader, resource_map, log):
# kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images)
# kindle:flow:XXXX?mime=YYYY/ZZZ (used for style sheets, svg images, etc)
# kindle:embed:XXXX (used for fonts)
mr = mobi8_reader
flows = []
img_pattern = re.compile(r'''(<[img\s|image\s][^>]*>)''', re.IGNORECASE)
img_index_pattern = re.compile(r'''['"]kindle:embed:([0-9|A-V]+)[^'"]*['"]''', re.IGNORECASE)
tag_pattern = re.compile(r'''(<[^>]*>)''')
flow_pattern = re.compile(r'''['"]kindle:flow:([0-9|A-V]+)\?mime=([^'"]+)['"]''', re.IGNORECASE)
url_pattern = re.compile(r'''(url\(.*?\))''', re.IGNORECASE)
url_img_index_pattern = re.compile(r'''kindle:embed:([0-9|A-V]+)\?mime=image/[^\)]*''', re.IGNORECASE)
font_index_pattern = re.compile(r'''kindle:embed:([0-9|A-V]+)''', re.IGNORECASE)
url_css_index_pattern = re.compile(r'''kindle:flow:([0-9|A-V]+)\?mime=text/css[^\)]*''', re.IGNORECASE)
for flow in mr.flows:
if flow is None: # 0th flow is None
flows.append(flow)
continue
if not isinstance(flow, unicode):
flow = flow.decode(mr.header.codec)
# links to raster image files from image tags
# image_pattern
srcpieces = img_pattern.split(flow)
for j in range(1, len(srcpieces), 2):
tag = srcpieces[j]
if tag.startswith('<im'):
for m in img_index_pattern.finditer(tag):
num = int(m.group(1), 32)
href = resource_map[num-1]
if href:
replacement = '"%s"'%('../'+ href)
tag = img_index_pattern.sub(replacement, tag, 1)
else:
log.warn('Referenced image %s was not recognized '
'as a valid image in %s' % (num, tag))
srcpieces[j] = tag
flow = "".join(srcpieces)
# replacements inside css url():
srcpieces = url_pattern.split(flow)
for j in range(1, len(srcpieces), 2):
tag = srcpieces[j]
# process links to raster image files
for m in url_img_index_pattern.finditer(tag):
num = int(m.group(1), 32)
href = resource_map[num-1]
if href:
replacement = '"%s"'%('../'+ href)
tag = url_img_index_pattern.sub(replacement, tag, 1)
else:
log.warn('Referenced image %s was not recognized as a '
'valid image in %s' % (num, tag))
# process links to fonts
for m in font_index_pattern.finditer(tag):
num = int(m.group(1), 32)
href = resource_map[num-1]
if href is None:
log.warn('Referenced font %s was not recognized as a '
'valid font in %s' % (num, tag))
else:
replacement = '"%s"'%('../'+ href)
tag = font_index_pattern.sub(replacement, tag, 1)
# process links to other css pieces
for m in url_css_index_pattern.finditer(tag):
num = int(m.group(1), 32)
fi = mr.flowinfo[num]
replacement = '"../' + fi.dir + '/' + fi.fname + '"'
tag = url_css_index_pattern.sub(replacement, tag, 1)
srcpieces[j] = tag
flow = "".join(srcpieces)
# flow pattern not inside url()
srcpieces = re.split(tag_pattern, flow)
for j in range(1, len(srcpieces), 2):
tag = srcpieces[j]
if tag.startswith('<'):
for m in re.finditer(flow_pattern, tag):
num = int(m.group(1), 32)
fi = mr.flowinfo[num]
if fi.format == 'inline':
flowtext = mr.flows[num]
tag = flowtext
else:
replacement = '"../' + fi.dir + '/' + fi.fname + '"'
tag = flow_pattern.sub(replacement, tag, 1)
srcpieces[j] = tag
flow = "".join(srcpieces)
flows.append(flow)
# All flows are now unicode and have links resolved
return flows
def insert_flows_into_markup(parts, flows, mobi8_reader):
mr = mobi8_reader
# kindle:flow:XXXX?mime=YYYY/ZZZ (used for style sheets, svg images, etc)
tag_pattern = re.compile(r'''(<[^>]*>)''')
flow_pattern = re.compile(r'''['"]kindle:flow:([0-9|A-V]+)\?mime=([^'"]+)['"]''', re.IGNORECASE)
for i in xrange(len(parts)):
part = parts[i]
# flow pattern
srcpieces = tag_pattern.split(part)
for j in range(1, len(srcpieces),2):
tag = srcpieces[j]
if tag.startswith('<'):
for m in flow_pattern.finditer(tag):
num = int(m.group(1), 32)
fi = mr.flowinfo[num]
if fi.format == 'inline':
tag = flows[num]
else:
replacement = '"../' + fi.dir + '/' + fi.fname + '"'
tag = flow_pattern.sub(replacement, tag, 1)
srcpieces[j] = tag
part = "".join(srcpieces)
# store away modified version
parts[i] = part
def insert_images_into_markup(parts, resource_map, log):
# Handle any embedded raster images links in the xhtml text
# kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images)
img_pattern = re.compile(r'''(<[img\s|image\s][^>]*>)''', re.IGNORECASE)
img_index_pattern = re.compile(r'''['"]kindle:embed:([0-9|A-V]+)[^'"]*['"]''')
for i in xrange(len(parts)):
part = parts[i]
#[partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i]
# links to raster image files
# image_pattern
srcpieces = img_pattern.split(part)
for j in range(1, len(srcpieces), 2):
tag = srcpieces[j]
if tag.startswith('<im'):
for m in img_index_pattern.finditer(tag):
num = int(m.group(1), 32)
href = resource_map[num-1]
if href:
replacement = '"%s"'%('../' + href)
tag = img_index_pattern.sub(replacement, tag, 1)
else:
log.warn('Referenced image %s was not recognized as '
'a valid image in %s' % (num, tag))
srcpieces[j] = tag
part = "".join(srcpieces)
# store away modified version
parts[i] = part
def upshift_markup(parts):
tag_pattern = re.compile(r'''(<(svg)[^>]*>)''', re.IGNORECASE)
for i in xrange(len(parts)):
part = parts[i]
# tag pattern
srcpieces = re.split(tag_pattern, part)
for j in range(1, len(srcpieces),2):
tag = srcpieces[j]
if tag[:4].lower() == '<svg':
tag = tag.replace('preserveaspectratio','preserveAspectRatio')
tag = tag.replace('viewbox','viewBox')
srcpieces[j] = tag
part = "".join(srcpieces)
# store away modified version
parts[i] = part
def expand_mobi8_markup(mobi8_reader, resource_map, log):
# First update all internal links that are based on offsets
parts = update_internal_links(mobi8_reader)
# Remove pointless markup inserted by kindlegen
remove_kindlegen_markup(parts)
# Handle substitutions for the flows pieces first as they may
# be inlined into the xhtml text
flows = update_flow_links(mobi8_reader, resource_map, log)
# Insert inline flows into the markup
insert_flows_into_markup(parts, flows, mobi8_reader)
# Insert raster images into markup
insert_images_into_markup(parts, resource_map, log)
# Perform general markup cleanups
upshift_markup(parts)
# Update the parts and flows stored in the reader
mobi8_reader.parts = parts
mobi8_reader.flows = flows
# write out the parts and file flows
os.mkdir('text') # directory containing all parts
spine = []
for i, part in enumerate(parts):
pi = mobi8_reader.partinfo[i]
with open(os.path.join(pi.type, pi.filename), 'wb') as f:
f.write(part.encode('utf-8'))
spine.append(f.name)
for i, flow in enumerate(flows):
fi = mobi8_reader.flowinfo[i]
if fi.format == 'file':
if not os.path.exists(fi.dir):
os.mkdir(fi.dir)
with open(os.path.join(fi.dir, fi.fname), 'wb') as f:
f.write(flow.encode('utf-8'))
return spine
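A note on all the int(m.group(1), 32) calls above: the kindle:pos:fid, kindle:flow and kindle:embed indices are base-32 numbers over the digits 0-9A-V, which Python's int() happens to parse directly (case-insensitively), for example:

int('0001', 32)        # -> 1: row in the div (elems) table
int('000000000A', 32)  # -> 10: offset added to the row's insert position
int('V', 32)           # -> 31: the highest single digit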

View File

@@ -1,10 +1,12 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
Read data from .mobi files
'''
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (absolute_import, print_function)
import shutil, os, re, struct, textwrap, cStringIO, sys
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import shutil, os, re, struct, textwrap, cStringIO
try:
from PIL import Image as PILImage
@@ -14,235 +16,22 @@ except ImportError:
from lxml import html, etree
from calibre import xml_entity_to_unicode, CurrentDir, entity_to_unicode, \
replace_entities
from calibre import (xml_entity_to_unicode, entity_to_unicode)
from calibre.utils.filenames import ascii_filename
from calibre.utils.date import parse_date
from calibre.utils.cleantext import clean_ascii_chars
from calibre.ptempfile import TemporaryDirectory
from calibre.ebooks import DRMError, unit_convert
from calibre.ebooks.chardet import ENCODING_PATS
from calibre.ebooks.mobi import MobiError
from calibre.ebooks.mobi.huffcdic import HuffReader
from calibre.ebooks.mobi.langcodes import main_language, sub_language, mobi2iana
from calibre.ebooks.compression.palmdoc import decompress_doc
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.metadata.opf2 import OPFCreator, OPF
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.mobi.reader.headers import BookHeader
class TopazError(ValueError):
pass
class EXTHHeader(object):
def __init__(self, raw, codec, title):
self.doctype = raw[:4]
self.length, self.num_items = struct.unpack('>LL', raw[4:12])
raw = raw[12:]
pos = 0
self.mi = MetaInformation(_('Unknown'), [_('Unknown')])
self.has_fake_cover = True
left = self.num_items
while left > 0:
left -= 1
id, size = struct.unpack('>LL', raw[pos:pos + 8])
content = raw[pos + 8:pos + size]
pos += size
if id >= 100 and id < 200:
self.process_metadata(id, content, codec)
elif id == 203:
self.has_fake_cover = bool(struct.unpack('>L', content)[0])
elif id == 201:
co, = struct.unpack('>L', content)
if co < 1e7:
self.cover_offset = co
elif id == 202:
self.thumbnail_offset, = struct.unpack('>L', content)
elif id == 501:
# cdetype
pass
elif id == 502:
# last update time
pass
elif id == 503: # Long title
# Amazon seems to regard this as the definitive book title
# rather than the title from the PDB header. In fact when
# sending MOBI files through Amazon's email service if the
# title contains non ASCII chars or non filename safe chars
# they are messed up in the PDB header
try:
title = content.decode(codec)
except:
pass
#else:
# print 'unknown record', id, repr(content)
if title:
self.mi.title = replace_entities(title)
def process_metadata(self, id, content, codec):
if id == 100:
if self.mi.authors == [_('Unknown')]:
self.mi.authors = []
au = content.decode(codec, 'ignore').strip()
self.mi.authors.append(au)
if re.match(r'\S+?\s*,\s+\S+', au.strip()):
self.mi.author_sort = au.strip()
elif id == 101:
self.mi.publisher = content.decode(codec, 'ignore').strip()
elif id == 103:
self.mi.comments = content.decode(codec, 'ignore')
elif id == 104:
self.mi.isbn = content.decode(codec, 'ignore').strip().replace('-', '')
elif id == 105:
if not self.mi.tags:
self.mi.tags = []
self.mi.tags.extend([x.strip() for x in content.decode(codec,
'ignore').split(';')])
self.mi.tags = list(set(self.mi.tags))
elif id == 106:
try:
self.mi.pubdate = parse_date(content, as_utc=False)
except:
pass
elif id == 108:
pass # Producer
elif id == 113:
pass # ASIN or UUID
#else:
# print 'unhandled metadata record', id, repr(content)
class BookHeader(object):
def __init__(self, raw, ident, user_encoding, log, try_extra_data_fix=False):
self.log = log
self.compression_type = raw[:2]
self.records, self.records_size = struct.unpack('>HH', raw[8:12])
self.encryption_type, = struct.unpack('>H', raw[12:14])
if ident == 'TEXTREAD':
self.codepage = 1252
if len(raw) <= 16:
self.codec = 'cp1252'
self.extra_flags = 0
self.title = _('Unknown')
self.language = 'ENGLISH'
self.sublanguage = 'NEUTRAL'
self.exth_flag, self.exth = 0, None
self.ancient = True
self.first_image_index = -1
self.mobi_version = 1
else:
self.ancient = False
self.doctype = raw[16:20]
self.length, self.type, self.codepage, self.unique_id, \
self.version = struct.unpack('>LLLLL', raw[20:40])
try:
self.codec = {
1252: 'cp1252',
65001: 'utf-8',
}[self.codepage]
except (IndexError, KeyError):
self.codec = 'cp1252' if not user_encoding else user_encoding
log.warn('Unknown codepage %d. Assuming %s' % (self.codepage,
self.codec))
# There exists some broken DRM removal tool that removes DRM but
# leaves the DRM fields in the header yielding a header size of
# 0xF8. The actual value of max_header_length should be 0xE8 but
# it's changed to accommodate this silly tool. Hopefully that will
# not break anything else.
max_header_length = 0xF8
if (ident == 'TEXTREAD' or self.length < 0xE4 or
self.length > max_header_length or
(try_extra_data_fix and self.length == 0xE4)):
self.extra_flags = 0
else:
self.extra_flags, = struct.unpack('>H', raw[0xF2:0xF4])
if self.compression_type == 'DH':
self.huff_offset, self.huff_number = struct.unpack('>LL', raw[0x70:0x78])
toff, tlen = struct.unpack('>II', raw[0x54:0x5c])
tend = toff + tlen
self.title = raw[toff:tend] if tend < len(raw) else _('Unknown')
langcode = struct.unpack('!L', raw[0x5C:0x60])[0]
langid = langcode & 0xFF
sublangid = (langcode >> 10) & 0xFF
self.language = main_language.get(langid, 'ENGLISH')
self.sublanguage = sub_language.get(sublangid, 'NEUTRAL')
self.mobi_version = struct.unpack('>I', raw[0x68:0x6c])[0]
self.first_image_index = struct.unpack('>L', raw[0x6c:0x6c + 4])[0]
self.exth_flag, = struct.unpack('>L', raw[0x80:0x84])
self.exth = None
if not isinstance(self.title, unicode):
self.title = self.title.decode(self.codec, 'replace')
if self.exth_flag & 0x40:
try:
self.exth = EXTHHeader(raw[16 + self.length:], self.codec, self.title)
self.exth.mi.uid = self.unique_id
try:
self.exth.mi.language = mobi2iana(langid, sublangid)
except:
self.log.exception('Unknown language code')
except:
self.log.exception('Invalid EXTH header')
self.exth_flag = 0
class MetadataHeader(BookHeader):
def __init__(self, stream, log):
self.stream = stream
self.ident = self.identity()
self.num_sections = self.section_count()
if self.num_sections >= 2:
header = self.header()
BookHeader.__init__(self, header, self.ident, None, log)
else:
self.exth = None
def identity(self):
self.stream.seek(60)
ident = self.stream.read(8).upper()
if ident not in ['BOOKMOBI', 'TEXTREAD']:
raise MobiError('Unknown book type: %s' % ident)
return ident
def section_count(self):
self.stream.seek(76)
return struct.unpack('>H', self.stream.read(2))[0]
def section_offset(self, number):
self.stream.seek(78 + number * 8)
return struct.unpack('>LBBBB', self.stream.read(8))[0]
def header(self):
section_headers = []
# First section with the metadata
section_headers.append(self.section_offset(0))
# Second section used to get the length of the first
section_headers.append(self.section_offset(1))
end_off = section_headers[1]
off = section_headers[0]
self.stream.seek(off)
return self.stream.read(end_off - off)
def section_data(self, number):
start = self.section_offset(number)
if number == self.num_sections -1:
end = os.stat(self.stream.name).st_size
else:
end = self.section_offset(number + 1)
self.stream.seek(start)
try:
return self.stream.read(end - start)
except OverflowError:
return self.stream.read(os.stat(self.stream.name).st_size - start)
class MobiReader(object):
PAGE_BREAK_PAT = re.compile(
r'<\s*/{0,1}\s*mbp:pagebreak((?:\s+[^/>]*){0,1})/{0,1}\s*>\s*(?:<\s*/{0,1}\s*mbp:pagebreak\s*/{0,1}\s*>)*',
@@ -312,15 +101,46 @@ class MobiReader(object):
self.sections.append((section(i), self.section_headers[i]))
self.book_header = BookHeader(self.sections[0][0], self.ident,
self.book_header = bh = BookHeader(self.sections[0][0], self.ident,
user_encoding, self.log, try_extra_data_fix=try_extra_data_fix)
self.name = self.name.decode(self.book_header.codec, 'replace')
self.kf8_type = None
is_kf8 = self.book_header.mobi_version == 8
if is_kf8:
self.kf8_type = 'standalone'
else: # Check for joint mobi 6 and kf 8 file
KF8_BOUNDARY = b'BOUNDARY'
for i, x in enumerate(self.sections[:-1]):
sec = x[0]
if (len(sec) == len(KF8_BOUNDARY) and sec ==
KF8_BOUNDARY):
try:
self.book_header = BookHeader(self.sections[i+1][0],
self.ident, user_encoding, self.log)
# The following are only correct in the Mobi 6
# header, not the Mobi 8 header
for x in ('first_image_index',):
setattr(self.book_header, x, getattr(bh, x))
self.book_header.huff_offset += i + 1
self.kf8_type = 'joint'
self.kf8_boundary = i
except:
pass
break
def check_for_drm(self):
if self.book_header.encryption_type != 0:
try:
name = self.book_header.exth.mi.title
except:
name = self.name
if not name:
name = self.name
raise DRMError(name)
def extract_content(self, output_dir, parse_cache):
output_dir = os.path.abspath(output_dir)
if self.book_header.encryption_type != 0:
raise DRMError(self.name)
self.check_for_drm()
processed_records = self.extract_text()
if self.debug is not None:
parse_cache['calibre_raw_mobi_markup'] = self.mobi_html
@@ -916,11 +736,12 @@ class MobiReader(object):
trail_size = self.sizeof_trailing_entries(data)
return data[:len(data)-trail_size]
def extract_text(self):
def extract_text(self, offset=1):
self.log.debug('Extracting text...')
text_sections = [self.text_section(i) for i in range(1,
min(self.book_header.records + 1, len(self.sections)))]
processed_records = list(range(0, self.book_header.records + 1))
text_sections = [self.text_section(i) for i in xrange(offset,
min(self.book_header.records + offset, len(self.sections)))]
processed_records = list(range(offset-1, self.book_header.records +
offset))
self.mobi_html = ''
@@ -1027,63 +848,6 @@ class MobiReader(object):
self.image_names.append(os.path.basename(path))
im.save(open(path, 'wb'), format='JPEG')
def get_metadata(stream):
stream.seek(0)
try:
raw = stream.read(3)
except:
raw = ''
stream.seek(0)
if raw == 'TPZ':
from calibre.ebooks.metadata.topaz import get_metadata
return get_metadata(stream)
from calibre.utils.logging import Log
log = Log()
try:
mi = MetaInformation(os.path.basename(stream.name), [_('Unknown')])
except:
mi = MetaInformation(_('Unknown'), [_('Unknown')])
mh = MetadataHeader(stream, log)
if mh.title and mh.title != _('Unknown'):
mi.title = mh.title
if mh.exth is not None:
if mh.exth.mi is not None:
mi = mh.exth.mi
else:
size = sys.maxint
if hasattr(stream, 'seek') and hasattr(stream, 'tell'):
pos = stream.tell()
stream.seek(0, 2)
size = stream.tell()
stream.seek(pos)
if size < 4*1024*1024:
with TemporaryDirectory('_mobi_meta_reader') as tdir:
with CurrentDir(tdir):
mr = MobiReader(stream, log)
parse_cache = {}
mr.extract_content(tdir, parse_cache)
if mr.embedded_mi is not None:
mi = mr.embedded_mi
if hasattr(mh.exth, 'cover_offset'):
cover_index = mh.first_image_index + mh.exth.cover_offset
data = mh.section_data(int(cover_index))
else:
try:
data = mh.section_data(mh.first_image_index)
except:
data = ''
buf = cStringIO.StringIO(data)
try:
im = PILImage.open(buf)
except:
log.exception('Failed to read MOBI cover')
else:
obuf = cStringIO.StringIO()
im.convert('RGB').save(obuf, format='JPEG')
mi.cover_data = ('jpg', obuf.getvalue())
return mi
def test_mbp_regex():
for raw, m in {
'<mbp:pagebreak></mbp:pagebreak>':'',
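For joint files, the KF8 book header sits in the section immediately after the b'BOUNDARY' record, which is why book_header() above parses sections[i+1] and why Mobi8Reader later extracts text starting at kf8_boundary + 2. The boundary scan, as a standalone sketch (helper name mine):

def find_kf8_boundary(sections):
    # sections is the list of (raw_data, section_header) tuples built by
    # MobiReader; returns the BOUNDARY record's index, or -1 if absent.
    for i, (raw, _header) in enumerate(sections[:-1]):
        if raw == b'BOUNDARY':
            return i
    return -1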

View File

@@ -0,0 +1,390 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import struct, re, os, zlib, imghdr
from collections import namedtuple
from itertools import repeat
from calibre.ebooks.mobi.reader.headers import NULL_INDEX
from calibre.ebooks.mobi.reader.index import read_index
from calibre.ebooks.mobi.reader.ncx import read_ncx, build_toc
from calibre.ebooks.mobi.reader.markup import expand_mobi8_markup
from calibre.ebooks.metadata.opf2 import Guide, OPFCreator
Part = namedtuple('Part',
'num type filename start end aid')
Elem = namedtuple('Elem',
'insert_pos toc_text file_number sequence_number start_pos '
'length')
FlowInfo = namedtuple('FlowInfo',
'type format dir fname')
class Mobi8Reader(object):
def __init__(self, mobi6_reader, log):
self.mobi6_reader, self.log = mobi6_reader, log
self.header = mobi6_reader.book_header
def __call__(self):
self.mobi6_reader.check_for_drm()
offset = 1
res_end = len(self.mobi6_reader.sections)
if self.mobi6_reader.kf8_type == 'joint':
offset = self.mobi6_reader.kf8_boundary + 2
res_end = self.mobi6_reader.kf8_boundary
self.processed_records = self.mobi6_reader.extract_text(offset=offset)
self.raw_ml = self.mobi6_reader.mobi_html
with open('debug-raw.html', 'wb') as f:
f.write(self.raw_ml)
self.kf8_sections = self.mobi6_reader.sections[offset-1:]
first_resource_index = self.header.first_image_index
if first_resource_index in {-1, NULL_INDEX}:
first_resource_index = self.header.records + 1
self.resource_sections = \
self.mobi6_reader.sections[first_resource_index:res_end]
self.cover_offset = getattr(self.header.exth, 'cover_offset', None)
self.read_indices()
self.build_parts()
guide = self.create_guide()
ncx = self.create_ncx()
resource_map = self.extract_resources()
spine = self.expand_text(resource_map)
return self.write_opf(guide, ncx, spine, resource_map)
def read_indices(self):
self.flow_table = (0, NULL_INDEX)
if self.header.fdstidx != NULL_INDEX:
header = self.kf8_sections[self.header.fdstidx][0]
if header[:4] != b'FDST':
raise ValueError('KF8 does not have a valid FDST record')
num_sections, = struct.unpack_from(b'>L', header, 0x08)
sections = header[0x0c:]
self.flow_table = struct.unpack_from(b'>%dL' % (num_sections*2),
sections, 0)[::2] + (NULL_INDEX,)
self.files = []
if self.header.skelidx != NULL_INDEX:
table = read_index(self.kf8_sections, self.header.skelidx,
self.header.codec)[0]
File = namedtuple('File',
'file_number name divtbl_count start_position length')
for i, text in enumerate(table.iterkeys()):
tag_map = table[text]
self.files.append(File(i, text, tag_map[1][0],
tag_map[6][0], tag_map[6][1]))
self.elems = []
if self.header.dividx != NULL_INDEX:
table, cncx = read_index(self.kf8_sections, self.header.dividx,
self.header.codec)
for i, text in enumerate(table.iterkeys()):
tag_map = table[text]
toc_text = cncx[tag_map[2][0]]
self.elems.append(Elem(int(text), toc_text, tag_map[3][0],
tag_map[4][0], tag_map[6][0], tag_map[6][1]))
self.guide = []
if self.header.othidx != NULL_INDEX:
table, cncx = read_index(self.kf8_sections, self.header.othidx,
self.header.codec)
Item = namedtuple('Item',
'type title div_frag_num')
for i, ref_type in enumerate(table.iterkeys()):
tag_map = table[ref_type]
# ref_type, ref_title, div/frag number
title = cncx[tag_map[1][0]]
fileno = None
if 3 in tag_map.keys():
fileno = tag_map[3][0]
if 6 in tag_map.keys():
fileno = tag_map[6][0]
self.guide.append(Item(ref_type.decode(self.header.codec),
title, fileno))
def build_parts(self):
raw_ml = self.mobi6_reader.mobi_html
self.flows = []
self.flowinfo = []
# now split the raw_ml into its flow pieces
for j in xrange(0, len(self.flow_table)-1):
start = self.flow_table[j]
end = self.flow_table[j+1]
if end == NULL_INDEX:
end = len(raw_ml)
self.flows.append(raw_ml[start:end])
# the first piece represents the xhtml text
text = self.flows[0]
self.flows[0] = b''
# walk the <skeleton> and <div> tables to build the original source
# xhtml files *without* destroying any file position information
# needed for later href processing, and create the final list of
# file separation start:stop points etc. in partinfo
self.parts = []
self.partinfo = []
divptr = 0
baseptr = 0
for skelnum, skelname, divcnt, skelpos, skellen in self.files:
baseptr = skelpos + skellen
skeleton = text[skelpos:baseptr]
for i in xrange(divcnt):
insertpos, idtext, filenum, seqnum, startpos, length = \
self.elems[divptr]
if i == 0:
aidtext = idtext[12:-2]
filename = 'part%04d.html' % filenum
part = text[baseptr:baseptr + length]
insertpos = insertpos - skelpos
skeleton = skeleton[0:insertpos] + part + skeleton[insertpos:]
baseptr = baseptr + length
divptr += 1
self.parts.append(skeleton)
self.partinfo.append(Part(skelnum, 'text', filename, skelpos,
baseptr, aidtext))
# The primary css style sheet is typically stored next, followed by
# any snippets of code that were previously inlined in the
# original xhtml but have been stripped out and placed here.
# This can include local CDATA snippets and svg sections.
# The problem is that for most browsers and ereaders, you cannot
# use <img src="imageXXXX.svg" /> to import an svg image that itself
# properly uses an <image/> tag to import some raster image - it
# should work according to the spec but does not for almost all
# browsers and ereaders, and causes epub validation issues because
# those raster images are in the manifest but not in the xhtml text,
# since they are only referenced from an svg image.
# So we need to check the remaining flow pieces to see if they are
# css or svg images. If svg images, we must check if they have an
# <image/> tag and, if so, inline them into the xhtml text pieces.
# There may be other sorts of pieces stored here, but until we see
# one in the wild to reverse engineer we won't be able to tell.
self.flowinfo.append(FlowInfo(None, None, None, None))
svg_tag_pattern = re.compile(br'''(<svg[^>]*>)''', re.IGNORECASE)
image_tag_pattern = re.compile(br'''(<image[^>]*>)''', re.IGNORECASE)
for j in xrange(1, len(self.flows)):
flowpart = self.flows[j]
nstr = '%04d' % j
m = svg_tag_pattern.search(flowpart)
if m is not None:
# svg
typ = 'svg'
start = m.start()
m2 = image_tag_pattern.search(flowpart)
if m2 is not None:
format = 'inline'
dir = None
fname = None
# strip off anything before <svg if inlining
flowpart = flowpart[start:]
else:
format = 'file'
dir = "images"
fname = 'svgimg' + nstr + '.svg'
else:
# search for CDATA and if exists inline it
if flowpart.find('[CDATA[') >= 0:
typ = 'css'
flowpart = '<style type="text/css">\n' + flowpart + '\n</style>\n'
format = 'inline'
dir = None
fname = None
else:
# css - assume as standalone css file
typ = 'css'
format = 'file'
dir = "styles"
fname = nstr + '.css'
self.flows[j] = flowpart
self.flowinfo.append(FlowInfo(typ, format, dir, fname))
def get_file_info(self, pos):
''' Get information about the part (file) that exists at pos in
the raw markup '''
for part in self.partinfo:
if pos >= part.start and pos < part.end:
return part
return Part(*repeat(None, len(Part._fields)))
def get_id_tag_by_pos_fid(self, posfid, offset):
# first convert kindle:pos:fid and offset info to position in file
row = int(posfid, 32)
off = int(offset, 32)
[insertpos, idtext, filenum, seqnm, startpos, length] = self.elems[row]
pos = insertpos + off
fname = self.get_file_info(pos).filename
# an existing "id=" must exist in original xhtml otherwise it would not
# have worked for linking. Amazon seems to have added its own
# additional "aid=" inside tags whose contents seem to represent some
# position information encoded into Base32 name.
# so find the closest "id=" before position the file by actually
# searching in that file
idtext = self.get_id_tag(pos)
return fname, idtext
def get_id_tag(self, pos):
# find the correct tag by actually searching in the destination
# textblock at position
fi = self.get_file_info(pos)
if fi.num is None and fi.start is None:
raise ValueError('No file contains pos: %d'%pos)
textblock = self.parts[fi.num]
id_map = []
npos = pos - fi.start
# if npos is inside a tag, then search all text before its
# end-of-tag marker
pgt = textblock.find(b'>', npos)
plt = textblock.find(b'<', npos)
if pgt < plt:
npos = pgt + 1
# find id links only inside of tags
# inside any < > pair, find all "id=" attributes and return
# whatever is inside the quotes
id_pattern = re.compile(br'''<[^>]*\sid\s*=\s*['"]([^'"]*)['"][^>]*>''',
re.IGNORECASE)
for m in re.finditer(id_pattern, textblock):
id_map.append((m.start(), m.group(1)))
if not id_map:
# Found no id in the textblock, link must be to top of file
return b''
# if npos is before first id= inside a tag, return the first
if npos < id_map[0][0]:
return id_map[0][1]
# if npos is after the last id= inside a tag, return the last
if npos > id_map[-1][0]:
return id_map[-1][1]
# otherwise find last id before npos
for i, item in enumerate(id_map):
if npos < item[0]:
return id_map[i-1][1]
return id_map[0][1]
def create_guide(self):
guide = Guide()
for ref_type, ref_title, fileno in self.guide:
elem = self.elems[fileno]
fi = self.get_file_info(elem.insert_pos)
idtext = self.get_id_tag(elem.insert_pos).decode(self.header.codec)
linktgt = fi.filename
if idtext:
linktgt += '#' + idtext
g = Guide.Reference('%s/%s'%(fi.type, linktgt), os.getcwdu())
g.title, g.type = ref_title, ref_type
guide.append(g)
so = self.header.exth.start_offset
if so not in {None, NULL_INDEX}:
fi = self.get_file_info(so)
if fi.filename is not None:
idtext = self.get_id_tag(so).decode(self.header.codec)
linktgt = fi.filename
if idtext:
linktgt += '#' + idtext
g = Guide.Reference('%s/%s'%(fi.type, linktgt), os.getcwdu())
g.title, g.type = 'start', 'text'
guide.append(g)
return guide
def create_ncx(self):
index_entries = read_ncx(self.kf8_sections, self.header.ncxidx,
self.header.codec)
# Add href and anchor info to the index entries
for entry in index_entries:
pos = entry['pos']
fi = self.get_file_info(pos)
if fi.filename is None:
raise ValueError('Index entry has invalid pos: %d'%pos)
idtag = self.get_id_tag(pos).decode(self.header.codec)
entry['href'] = '%s/%s'%(fi.type, fi.filename)
entry['idtag'] = idtag
# Build the TOC object
return build_toc(index_entries)
def extract_resources(self):
resource_map = []
for x in ('fonts', 'images'):
os.mkdir(x)
for i, sec in enumerate(self.resource_sections):
fname_idx = i+1
data = sec[0]
typ = data[:4]
href = None
if typ in {b'FLIS', b'FCIS', b'SRCS', b'\xe9\x8e\r\n',
b'RESC', b'BOUN', b'FDST', b'DATP', b'AUDI', b'VIDE'}:
pass # Ignore these records
elif typ == b'FONT':
# fonts only exist in K8 ebooks
# Format:
# bytes 0 - 3: 'FONT'
# bytes 4 - 7: ?? Expanded size in bytes ??
# bytes 8 - 11: ?? number of files ??
# bytes 12 - 15: ?? offset to start of compressed data ?? (typically 0x00000018 = 24)
# bytes 16 - 23: ?? typically all 0x00 ?? Are these compression flags from zlib?
# The compressed data begins with 2 bytes of header and has 4 bytes of checksum at the end
data = data[26:-4]
uncompressed_data = zlib.decompress(data, -15)
hdr = uncompressed_data[0:4]
ext = 'dat'
if hdr == b'\0\1\0\0' or hdr == b'true' or hdr == b'ttcf':
ext = 'ttf'
href = "fonts/%05d.%s" % (fname_idx, ext)
with open(href.replace('/', os.sep), 'wb') as f:
f.write(uncompressed_data)
else:
imgtype = imghdr.what(None, data)
if imgtype is None:
imgtype = 'unknown'
href = 'images/%05d.%s'%(fname_idx, imgtype)
with open(href.replace('/', os.sep), 'wb') as f:
f.write(data)
resource_map.append(href)
return resource_map
def expand_text(self, resource_map):
return expand_mobi8_markup(self, resource_map, self.log)
def write_opf(self, guide, toc, spine, resource_map):
mi = self.header.exth.mi
if (self.cover_offset is not None and self.cover_offset <
len(resource_map)):
mi.cover = resource_map[self.cover_offset]
opf = OPFCreator(os.getcwdu(), mi)
opf.guide = guide
opf.create_manifest_from_files_in([os.getcwdu()])
opf.create_spine(spine)
opf.set_toc(toc)
with open('metadata.opf', 'wb') as of, open('toc.ncx', 'wb') as ncx:
opf.render(of, ncx, 'toc.ncx')
return 'metadata.opf'
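The FONT branch of extract_resources() packs several magic numbers; as a hedged sketch using the same constants (helper name mine): skip the 24-byte record header plus the 2 zlib stream header bytes, drop the 4-byte trailing checksum, raw-inflate the rest (wbits=-15, since the stream header was skipped), then sniff the font type from the first four decompressed bytes:

import zlib

def decode_font_record(data):
    assert data[:4] == b'FONT'
    payload = zlib.decompress(data[26:-4], -15)
    ttf_magics = (b'\x00\x01\x00\x00', b'true', b'ttcf')
    ext = 'ttf' if payload[:4] in ttf_magics else 'dat'
    return ext, payload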

View File

@@ -0,0 +1,84 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.mobi.utils import to_base
from calibre.ebooks.mobi.reader.headers import NULL_INDEX
from calibre.ebooks.mobi.reader.index import read_index
tag_fieldname_map = {
1: ['pos',0],
2: ['len',0],
3: ['noffs',0],
4: ['hlvl',0],
5: ['koffs',0],
6: ['pos_fid',0],
21: ['parent',0],
22: ['child1',0],
23: ['childn',0]
}
def read_ncx(sections, index, codec):
index_entries = []
if index != NULL_INDEX:
table, cncx = read_index(sections, index, codec)
for num, x in enumerate(table.iteritems()):
text, tag_map = x
entry = {
'name': text,
'pos': -1,
'len': 0,
'noffs': -1,
'text' : "Unknown Text",
'hlvl' : -1,
'kind' : "Unknown Kind",
'pos_fid' : None,
'parent' : -1,
'child1' : -1,
'childn' : -1,
'num' : num
}
for tag in tag_fieldname_map.keys():
fieldname, i = tag_fieldname_map[tag]
if tag in tag_map:
fieldvalue = tag_map[tag][i]
if tag == 6:
fieldvalue = to_base(fieldvalue, base=32)
entry[fieldname] = fieldvalue
if tag == 3:
entry['text'] = cncx.get(fieldvalue, 'Unknown Text')
if tag == 5:
entry['kind'] = cncx.get(fieldvalue, 'Unknown Kind')
index_entries.append(entry)
return index_entries
def build_toc(index_entries):
ans = TOC(base_path=os.getcwdu())
levels = {x['hlvl'] for x in index_entries}
num_map = {-1: ans}
level_map = {l:[x for x in index_entries if x['hlvl'] == l] for l in
levels}
for lvl in sorted(levels):
for item in level_map[lvl]:
parent = num_map[item['parent']]
child = parent.add_item(item['href'], item['idtag'], item['text'])
num_map[item['num']] = child
# Set play orders in depth first order
for i, item in enumerate(ans.flat()):
item.play_order = i
return ans
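To make the data flow concrete, a hedged sketch of what build_toc() consumes once Mobi8Reader.create_ncx() has added 'href' and 'idtag' to the read_ncx() entries (values illustrative, not from a real book):

entries = [
    {'num': 0, 'parent': -1, 'hlvl': 0, 'text': 'Chapter 1',
     'href': 'text/part0000.html', 'idtag': 'ch01'},
    {'num': 1, 'parent': 0, 'hlvl': 1, 'text': 'Section 1.1',
     'href': 'text/part0000.html', 'idtag': 'sec11'},
]
# build_toc(entries) nests Section 1.1 under Chapter 1 via num_map,
# then assigns play_order in depth-first order.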

View File

@@ -7,7 +7,7 @@ __license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import struct
import struct, string
from collections import OrderedDict
from calibre.utils.magick.draw import Image, save_cover_data_to, thumbnail
@@ -340,4 +340,26 @@ def detect_periodical(toc, log=None):
return False
return True
def count_set_bits(num):
if num < 0:
num = -num
ans = 0
while num > 0:
ans += (num & 0b1)
num >>= 1
return ans
def to_base(num, base=32):
digits = string.digits + string.ascii_uppercase
sign = 1 if num >= 0 else -1
if num == 0: return '0'
num *= sign
ans = []
while num:
ans.append(digits[(num % base)])
num //= base
if sign < 0:
ans.append('-')
ans.reverse()
return ''.join(ans)
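Quick sanity checks for the two new helpers, worked out by hand:

from calibre.ebooks.mobi.utils import count_set_bits, to_base

assert count_set_bits(0b1011) == 3
assert to_base(1023, base=32) == 'VV'  # digits are 0-9 then A-V
assert int('VV', 32) == 1023           # round-trips with Python's int()
assert to_base(-5) == '-5'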