Start refactoring of MobiWriter

This commit is contained in:
Kovid Goyal 2011-07-16 22:50:35 -06:00
parent 48929a4dbd
commit 4805fa7c77
4 changed files with 846 additions and 1 deletion

View File

@ -184,7 +184,12 @@ class MOBIOutput(OutputFormatPlugin):
mobimlizer(oeb, opts)
self.check_for_periodical()
write_page_breaks_after_item = input_plugin is not plugin_for_input_format('cbz')
from calibre.ebooks.mobi.writer import MobiWriter
from calibre.utils.config import tweaks
if tweaks.get('new_mobi_writer', False):
from calibre.ebooks.mobi.writer2.main import MobiWriter
MobiWriter
else:
from calibre.ebooks.mobi.writer import MobiWriter
writer = MobiWriter(opts,
write_page_breaks_after_item=write_page_breaks_after_item)
writer(oeb, output_path)

View File

@ -0,0 +1,15 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
UNCOMPRESSED = 1
PALMDOC = 2
HUFFDIC = 17480
PALM_MAX_IMAGE_SIZE = 63 * 1024

View File

@ -0,0 +1,579 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import re, random, time
from cStringIO import StringIO
from struct import pack
from calibre.ebooks import normalize
from calibre.ebooks.oeb.base import OEB_RASTER_IMAGES
from calibre.ebooks.mobi.writer2.serializer import Serializer
from calibre.ebooks.compression.palmdoc import compress_doc
from calibre.utils.magick.draw import Image, save_cover_data_to, thumbnail
from calibre.ebooks.mobi.langcodes import iana2mobi
from calibre.utils.filenames import ascii_filename
from calibre.ebooks.mobi.writer2 import PALMDOC, UNCOMPRESSED
EXTH_CODES = {
'creator': 100,
'publisher': 101,
'description': 103,
'identifier': 104,
'subject': 105,
'pubdate': 106,
'date': 106,
'review': 107,
'contributor': 108,
'rights': 109,
'type': 111,
'source': 112,
'title': 503,
}
# Disabled as I dont care about uncrossable breaks
WRITE_UNCROSSABLE_BREAKS = False
RECORD_SIZE = 0x1000 # 4096
IMAGE_MAX_SIZE = 10 * 1024 * 1024
MAX_THUMB_SIZE = 16 * 1024
MAX_THUMB_DIMEN = (180, 240)
# Almost like the one for MS LIT, but not quite.
DECINT_FORWARD = 0
DECINT_BACKWARD = 1
def decint(value, direction):
'''
Some parts of the Mobipocket format encode data as variable-width integers.
These integers are represented big-endian with 7 bits per byte in bits 1-7.
They may be either forward-encoded, in which case only the LSB has bit 8 set,
or backward-encoded, in which case only the MSB has bit 8 set.
For example, the number 0x11111 would be represented forward-encoded as:
0x04 0x22 0x91
And backward-encoded as:
0x84 0x22 0x11
This function encodes the integer ``value`` as a variable width integer and
returns the bytestring corresponding to it.
'''
# Encode vwi
byts = bytearray()
while True:
b = value & 0x7f
value >>= 7
byts.append(b)
if value == 0:
break
if direction == DECINT_FORWARD:
byts[0] |= 0x80
elif direction == DECINT_BACKWARD:
byts[-1] |= 0x80
return bytes(byts)
def rescale_image(data, maxsizeb=IMAGE_MAX_SIZE, dimen=None):
'''
Convert image setting all transparent pixels to white and changing format
to JPEG. Ensure the resultant image has a byte size less than
maxsizeb.
If dimen is not None, generate a thumbnail of width=dimen, height=dimen
Returns the image as a bytestring
'''
if dimen is not None:
data = thumbnail(data, width=dimen, height=dimen,
compression_quality=90)[-1]
else:
# Replace transparent pixels with white pixels and convert to JPEG
data = save_cover_data_to(data, 'img.jpg', return_data=True)
if len(data) <= maxsizeb:
return data
orig_data = data
img = Image()
quality = 95
img.load(data)
while len(data) >= maxsizeb and quality >= 10:
quality -= 5
img.set_compression_quality(quality)
data = img.export('jpg')
if len(data) <= maxsizeb:
return data
orig_data = data
scale = 0.9
while len(data) >= maxsizeb and scale >= 0.05:
img = Image()
img.load(orig_data)
w, h = img.size
img.size = (int(scale*w), int(scale*h))
img.set_compression_quality(quality)
data = img.export('jpg')
scale -= 0.05
return data
class MobiWriter(object):
COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+')
def __init__(self, opts, write_page_breaks_after_item=True):
self.opts = opts
self.write_page_breaks_after_item = write_page_breaks_after_item
self.compression = UNCOMPRESSED if opts.dont_compress else PALMDOC
self.prefer_author_sort = opts.prefer_author_sort
def __call__(self, oeb, path_or_stream):
if hasattr(path_or_stream, 'write'):
return self.dump_stream(oeb, path_or_stream)
with open(path_or_stream, 'w+b') as stream:
return self.dump_stream(oeb, stream)
def write(self, *args):
for datum in args:
self.stream.write(datum)
def tell(self):
return self.stream.tell()
def dump_stream(self, oeb, stream):
self.oeb = oeb
self.stream = stream
self.records = [None]
self.generate_content()
self.generate_record0()
self.write_header()
self.write_content()
def generate_content(self):
self.map_image_names()
self.generate_text()
# Image records come after text records
self.generate_images()
def map_image_names(self):
'''
Map image names to record indices, ensuring that the masthead image if
present has index number 1.
'''
index = 1
self.images = images = {}
mh_href = None
if 'masthead' in self.oeb.guide:
mh_href = self.oeb.guide['masthead'].href
images[mh_href] = 1
index += 1
for item in self.oeb.manifest.values():
if item.media_type in OEB_RASTER_IMAGES:
if item.href == mh_href: continue
images[item.href] = index
index += 1
def generate_images(self):
self.oeb.logger.info('Serializing images...')
images = [(index, href) for href, index in self.images.iteritems()]
images.sort()
self.first_image_record = None
for _, href in images:
item = self.oeb.manifest.hrefs[href]
try:
data = rescale_image(item.data)
except:
self.oeb.logger.warn('Bad image file %r' % item.href)
continue
finally:
item.unload_data_from_memory()
self.records.append(data)
if self.first_image_record is None:
self.first_image_record = len(self.records) - 1
def generate_text(self):
self.oeb.logger.info('Serializing markup content...')
serializer = Serializer(self.oeb, self.images,
write_page_breaks_after_item=self.write_page_breaks_after_item)
text = serializer()
breaks = serializer.breaks
self.anchor_offset_kindle = serializer.anchor_offset_kindle
self.id_offsets = serializer.id_offsets
self.content_length = len(text)
self.text_length = len(text)
text = StringIO(text)
buf = []
nrecords = 0
offset = 0
if self.compression != UNCOMPRESSED:
self.oeb.logger.info(' Compressing markup content...')
data, overlap = self.read_text_record(text)
while len(data) > 0:
if self.compression == PALMDOC:
data = compress_doc(data)
record = StringIO()
record.write(data)
self.records.append(record.getvalue())
buf.append(self.records[-1])
nrecords += 1
offset += RECORD_SIZE
data, overlap = self.read_text_record(text)
# Write information about the mutibyte character overlap, if any
record.write(overlap)
record.write(pack('>B', len(overlap)))
# Write information about uncrossable breaks (non linear items in
# the spine)
if WRITE_UNCROSSABLE_BREAKS:
nextra = 0
pbreak = 0
running = offset
# Write information about every uncrossable break that occurs in
# the next record.
while breaks and (breaks[0] - offset) < RECORD_SIZE:
pbreak = (breaks.pop(0) - running) >> 3
encoded = decint(pbreak, DECINT_FORWARD)
record.write(encoded)
running += pbreak << 3
nextra += len(encoded)
lsize = 1
while True:
size = decint(nextra + lsize, DECINT_BACKWARD)
if len(size) == lsize:
break
lsize += 1
record.write(size)
self.text_nrecords = nrecords + 1
def read_text_record(self, text):
'''
Return a Palmdoc record of size RECORD_SIZE from the text file object.
In case the record ends in the middle of a multibyte character return
the overlap as well.
Returns data, overlap: where both are byte strings. overlap is the
extra bytes needed to complete the truncated multibyte character.
'''
opos = text.tell()
text.seek(0, 2)
# npos is the position of the next record
npos = min((opos + RECORD_SIZE, text.tell()))
# Number of bytes from the next record needed to complete the last
# character in this record
extra = 0
last = b''
while not last.decode('utf-8', 'ignore'):
# last contains no valid utf-8 characters
size = len(last) + 1
text.seek(npos - size)
last = text.read(size)
# last now has one valid utf-8 char and possibly some bytes that belong
# to a truncated char
try:
last.decode('utf-8', 'strict')
except UnicodeDecodeError:
# There are some truncated bytes in last
prev = len(last)
while True:
text.seek(npos - prev)
last = text.read(len(last) + 1)
try:
last.decode('utf-8')
except UnicodeDecodeError:
pass
else:
break
extra = len(last) - prev
text.seek(opos)
data = text.read(RECORD_SIZE)
overlap = text.read(extra)
text.seek(npos)
return data, overlap
def generate_end_records(self):
self.flis_number = len(self.records)
self.records.append('\xE9\x8E\x0D\x0A')
def generate_record0(self): # {{{
metadata = self.oeb.metadata
exth = self.build_exth()
last_content_record = len(self.records) - 1
self.generate_end_records()
record0 = StringIO()
# The PalmDOC Header
record0.write(pack('>HHIHHHH', self.compression, 0,
self.text_length,
self.text_nrecords-1, RECORD_SIZE, 0, 0)) # 0 - 15 (0x0 - 0xf)
uid = random.randint(0, 0xffffffff)
title = normalize(unicode(metadata.title[0])).encode('utf-8')
# The MOBI Header
# 0x0 - 0x3
record0.write(b'MOBI')
# 0x4 - 0x7 : Length of header
# 0x8 - 0x11 : MOBI type
# type meaning
# 0x002 MOBI book (chapter - chapter navigation)
# 0x101 News - Hierarchical navigation with sections and articles
# 0x102 News feed - Flat navigation
# 0x103 News magazine - same as 0x101
# 0xC - 0xF : Text encoding (65001 is utf-8)
# 0x10 - 0x13 : UID
# 0x14 - 0x17 : Generator version
record0.write(pack('>IIIII',
0xe8, 0x002, 65001, uid, 6))
# 0x18 - 0x1f : Unknown
record0.write(b'\xff' * 8)
# 0x20 - 0x23 : Secondary index record
record0.write(pack('>I', 0xffffffff))
# 0x24 - 0x3f : Unknown
record0.write(b'\xff' * 28)
# 0x40 - 0x43 : Offset of first non-text record
record0.write(pack('>I',
self.text_nrecords + 1))
# 0x44 - 0x4b : title offset, title length
record0.write(pack('>II',
0xe8 + 16 + len(exth), len(title)))
# 0x4c - 0x4f : Language specifier
record0.write(iana2mobi(
str(metadata.language[0])))
# 0x50 - 0x57 : Unknown
record0.write(b'\0' * 8)
# 0x58 - 0x5b : Format version
# 0x5c - 0x5f : First image record number
record0.write(pack('>II',
6, self.first_image_record if self.first_image_record else 0))
# 0x60 - 0x63 : First HUFF/CDIC record number
# 0x64 - 0x67 : Number of HUFF/CDIC records
# 0x68 - 0x6b : First DATP record number
# 0x6c - 0x6f : Number of DATP records
record0.write(b'\0' * 16)
# 0x70 - 0x73 : EXTH flags
record0.write(pack('>I', 0x50))
# 0x74 - 0x93 : Unknown
record0.write(b'\0' * 32)
# 0x94 - 0x97 : DRM offset
# 0x98 - 0x9b : DRM count
# 0x9c - 0x9f : DRM size
# 0xa0 - 0xa3 : DRM flags
record0.write(pack('>IIII',
0xffffffff, 0xffffffff, 0, 0))
# 0xa4 - 0xaf : Unknown
record0.write(b'\0'*12)
# 0xb0 - 0xb1 : First content record number
# 0xb2 - 0xb3 : last content record number
# (Includes Image, DATP, HUFF, DRM)
record0.write(pack('>HH', 1, last_content_record))
# 0xb4 - 0xb7 : Unknown
record0.write(b'\0\0\0\x01')
# 0xb8 - 0xbb : FCIS record number
record0.write(pack('>I', 0xffffffff))
# 0xbc - 0xbf : Unknown (FCIS record count?)
record0.write(pack('>I', 0xffffffff))
# 0xc0 - 0xc3 : FLIS record number
record0.write(pack('>I', 0xffffffff))
# 0xc4 - 0xc7 : Unknown (FLIS record count?)
record0.write(pack('>I', 1))
# 0xc8 - 0xcf : Unknown
record0.write(b'\0'*8)
# 0xd0 - 0xdf : Unknown
record0.write(pack('>IIII', 0xffffffff, 0, 0xffffffff, 0xffffffff))
# 0xe0 - 0xe3 : Extra record data
# Extra record data flags:
# - 0x1: <extra multibyte bytes><size> (?)
# - 0x2: <TBS indexing description of this HTML record><size> GR
# - 0x4: <uncrossable breaks><size>
# GR: Use 7 for indexed files, 5 for unindexed
# Setting bit 2 (0x4) disables <guide><reference type="start"> functionality
trailingDataFlags = 1
if WRITE_UNCROSSABLE_BREAKS:
trailingDataFlags |= 4
record0.write(pack('>I', trailingDataFlags))
# 0xe4 - 0xe7 : Primary index record
record0.write(pack('>I', 0xffffffff))
record0.write(exth)
record0.write(title)
record0 = record0.getvalue()
# Add some buffer so that Amazon can add encryption information if this
# MOBI is submitted for publication
record0 += (b'\0' * (1024*8))
self.records[0] = record0
# }}}
def build_exth(self): # {{{
oeb = self.oeb
exth = StringIO()
nrecs = 0
for term in oeb.metadata:
if term not in EXTH_CODES: continue
code = EXTH_CODES[term]
items = oeb.metadata[term]
if term == 'creator':
if self.prefer_author_sort:
creators = [normalize(unicode(c.file_as or c)) for c in items]
else:
creators = [normalize(unicode(c)) for c in items]
items = ['; '.join(creators)]
for item in items:
data = self.COLLAPSE_RE.sub(' ', normalize(unicode(item)))
if term == 'identifier':
if data.lower().startswith('urn:isbn:'):
data = data[9:]
elif item.scheme.lower() == 'isbn':
pass
else:
continue
data = data.encode('utf-8')
exth.write(pack('>II', code, len(data) + 8))
exth.write(data)
nrecs += 1
if term == 'rights' :
try:
rights = normalize(unicode(oeb.metadata.rights[0])).encode('utf-8')
except:
rights = b'Unknown'
exth.write(pack('>II', EXTH_CODES['rights'], len(rights) + 8))
exth.write(rights)
nrecs += 1
# Write UUID as ASIN
uuid = None
from calibre.ebooks.oeb.base import OPF
for x in oeb.metadata['identifier']:
if (x.get(OPF('scheme'), None).lower() == 'uuid' or
unicode(x).startswith('urn:uuid:')):
uuid = unicode(x).split(':')[-1]
break
if uuid is None:
from uuid import uuid4
uuid = str(uuid4())
if isinstance(uuid, unicode):
uuid = uuid.encode('utf-8')
exth.write(pack('>II', 113, len(uuid) + 8))
exth.write(uuid)
nrecs += 1
# Write cdetype
if not self.opts.mobi_periodical:
data = b'EBOK'
exth.write(pack('>II', 501, len(data)+8))
exth.write(data)
nrecs += 1
# Add a publication date entry
if oeb.metadata['date'] != [] :
datestr = str(oeb.metadata['date'][0])
elif oeb.metadata['timestamp'] != [] :
datestr = str(oeb.metadata['timestamp'][0])
if datestr is not None:
exth.write(pack('>II', EXTH_CODES['pubdate'], len(datestr) + 8))
exth.write(datestr)
nrecs += 1
else:
raise NotImplementedError("missing date or timestamp needed for mobi_periodical")
if (oeb.metadata.cover and
unicode(oeb.metadata.cover[0]) in oeb.manifest.ids):
id = unicode(oeb.metadata.cover[0])
item = oeb.manifest.ids[id]
href = item.href
if href in self.images:
index = self.images[href] - 1
exth.write(pack('>III', 0xc9, 0x0c, index))
exth.write(pack('>III', 0xcb, 0x0c, 0))
nrecs += 2
index = self.add_thumbnail(item)
if index is not None:
exth.write(pack('>III', 0xca, 0x0c, index - 1))
nrecs += 1
exth = exth.getvalue()
trail = len(exth) % 4
pad = b'\0' * (4 - trail) # Always pad w/ at least 1 byte
exth = [b'EXTH', pack('>II', len(exth) + 12, nrecs), exth, pad]
return b''.join(exth)
# }}}
def add_thumbnail(self, item):
try:
data = rescale_image(item.data, dimen=MAX_THUMB_DIMEN,
maxsizeb=MAX_THUMB_SIZE)
except IOError:
self.oeb.logger.warn('Bad image file %r' % item.href)
return None
manifest = self.oeb.manifest
id, href = manifest.generate('thumbnail', 'thumbnail.jpeg')
manifest.add(id, href, 'image/jpeg', data=data)
index = len(self.images) + 1
self.images[href] = index
self.records.append(data)
return index
def write_header(self):
title = ascii_filename(unicode(self.oeb.metadata.title[0]))
title = title + (b'\0' * (32 - len(title)))
now = int(time.time())
nrecords = len(self.records)
self.write(title, pack('>HHIIIIII', 0, 0, now, now, 0, 0, 0, 0),
b'BOOK', b'MOBI', pack('>IIH', nrecords, 0, nrecords))
offset = self.tell() + (8 * nrecords) + 2
for i, record in enumerate(self.records):
self.write(pack('>I', offset), b'\0', pack('>I', 2*i)[1:])
offset += len(record)
self.write(b'\0\0')
def write_content(self):
for record in self.records:
self.write(record)

View File

@ -0,0 +1,246 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from calibre.ebooks.oeb.base import (OEB_DOCS, XHTML, XHTML_NS, XML_NS,
namespace, prefixname, urlnormalize)
from calibre.ebooks.mobi.mobiml import MBP_NS
from collections import defaultdict
from urlparse import urldefrag
from cStringIO import StringIO
class Serializer(object):
NSRMAP = {'': None, XML_NS: 'xml', XHTML_NS: '', MBP_NS: 'mbp'}
def __init__(self, oeb, images, write_page_breaks_after_item=True):
'''
Write all the HTML markup in oeb into a single in memory buffer
containing a single html document with links replaced by offsets into
the buffer.
:param oeb: OEBBook object that encapsulates the document to be
processed.
:param images: Mapping of image hrefs (urlnormalized) to image record
indices.
:param write_page_breaks_after_item: If True a MOBIpocket pagebreak tag
is written after every element of the spine in ``oeb``.
'''
self.oeb = oeb
self.images = images
self.logger = oeb.logger
self.write_page_breaks_after_item = write_page_breaks_after_item
# Mapping of hrefs (urlnormalized) to the offset in the buffer where
# the resource pointed to by the href lives. Used at the end to fill in
# the correct values into all filepos="..." links.
self.id_offsets = {}
# Mapping of hrefs (urlnormalized) to a list of offsets into the buffer
# where filepos="..." elements are written corresponding to links that
# point to the href. This is used at the end to fill in the correct values.
self.href_offsets = defaultdict(list)
# List of offsets in the buffer of non linear items in the spine. These
# become uncrossable breaks in the MOBI
self.breaks = []
def __call__(self):
'''
Return the document serialized as a single UTF-8 encoded bytestring.
'''
buf = self.buf = StringIO()
buf.write(b'<html>')
self.serialize_head()
self.serialize_body()
buf.write(b'</html>')
self.fixup_links()
return buf.getvalue()
def serialize_head(self):
buf = self.buf
buf.write(b'<head>')
if len(self.oeb.guide) > 0:
self.serialize_guide()
buf.write(b'</head>')
def serialize_guide(self):
'''
The Kindle decides where to open a book based on the presence of
an item in the guide that looks like
<reference type="text" title="Start" href="chapter-one.xhtml"/>
Similarly an item with type="toc" controls where the Goto Table of
Contents operation on the kindle goes.
'''
buf = self.buf
hrefs = self.oeb.manifest.hrefs
buf.write(b'<guide>')
for ref in self.oeb.guide.values():
path = urldefrag(ref.href)[0]
if path not in hrefs or hrefs[path].media_type not in OEB_DOCS:
continue
buf.write(b'<reference type="')
if ref.type.startswith('other.') :
self.serialize_text(ref.type.replace('other.',''), quot=True)
else:
self.serialize_text(ref.type, quot=True)
buf.write(b'" ')
if ref.title is not None:
buf.write(b'title="')
self.serialize_text(ref.title, quot=True)
buf.write(b'" ')
self.serialize_href(ref.href)
# Space required or won't work, I kid you not
buf.write(b' />')
buf.write(b'</guide>')
def serialize_href(self, href, base=None):
'''
Serialize the href attribute of an <a> or <reference> tag. It is
serialized as filepos="000000000" and a pointer to its location is
stored in self.href_offsets so that the correct value can be filled in
at the end.
'''
hrefs = self.oeb.manifest.hrefs
path, frag = urldefrag(urlnormalize(href))
if path and base:
path = base.abshref(path)
if path and path not in hrefs:
return False
buf = self.buf
item = hrefs[path] if path else None
if item and item.spine_position is None:
return False
path = item.href if item else base.href
href = '#'.join((path, frag)) if frag else path
buf.write(b'filepos=')
self.href_offsets[href].append(buf.tell())
buf.write(b'0000000000')
return True
def serialize_body(self):
'''
Serialize all items in the spine of the document. Non linear items are
moved to the end.
'''
buf = self.buf
self.anchor_offset = buf.tell()
buf.write(b'<body>')
self.anchor_offset_kindle = buf.tell()
spine = [item for item in self.oeb.spine if item.linear]
spine.extend([item for item in self.oeb.spine if not item.linear])
for item in spine:
self.serialize_item(item)
buf.write(b'</body>')
def serialize_item(self, item):
'''
Serialize an individual item from the spine of the input document.
A reference to this item is stored in self.href_offsets
'''
buf = self.buf
if not item.linear:
self.breaks.append(buf.tell() - 1)
self.id_offsets[urlnormalize(item.href)] = buf.tell()
# Kindle periodical articles are contained in a <div> tag
buf.write(b'<div>')
for elem in item.data.find(XHTML('body')):
self.serialize_elem(elem, item)
# Kindle periodical article end marker
buf.write(b'<div></div>')
if self.write_page_breaks_after_item:
buf.write(b'<mbp:pagebreak/>')
buf.write(b'</div>')
self.anchor_offset = None
def serialize_elem(self, elem, item, nsrmap=NSRMAP):
buf = self.buf
if not isinstance(elem.tag, basestring) \
or namespace(elem.tag) not in nsrmap:
return
tag = prefixname(elem.tag, nsrmap)
# Previous layers take care of @name
id_ = elem.attrib.pop('id', None)
if id_:
href = '#'.join((item.href, id_))
offset = self.anchor_offset or buf.tell()
self.id_offsets[urlnormalize(href)] = offset
if self.anchor_offset is not None and \
tag == 'a' and not elem.attrib and \
not len(elem) and not elem.text:
return
self.anchor_offset = buf.tell()
buf.write(b'<')
buf.write(tag.encode('utf-8'))
if elem.attrib:
for attr, val in elem.attrib.items():
if namespace(attr) not in nsrmap:
continue
attr = prefixname(attr, nsrmap)
buf.write(b' ')
if attr == 'href':
if self.serialize_href(val, item):
continue
elif attr == 'src':
href = urlnormalize(item.abshref(val))
if href in self.images:
index = self.images[href]
buf.write(b'recindex="%05d"' % index)
continue
buf.write(attr.encode('utf-8'))
buf.write(b'="')
self.serialize_text(val, quot=True)
buf.write(b'"')
buf.write(b'>')
if elem.text or len(elem) > 0:
if elem.text:
self.anchor_offset = None
self.serialize_text(elem.text)
for child in elem:
self.serialize_elem(child, item)
if child.tail:
self.anchor_offset = None
self.serialize_text(child.tail)
buf.write(b'</%s>' % tag.encode('utf-8'))
def serialize_text(self, text, quot=False):
text = text.replace('&', '&amp;')
text = text.replace('<', '&lt;')
text = text.replace('>', '&gt;')
text = text.replace(u'\u00AD', '') # Soft-hyphen
if quot:
text = text.replace('"', '&quot;')
self.buf.write(text.encode('utf-8'))
def fixup_links(self):
'''
Fill in the correct values for all filepos="..." links with the offsets
of the linked to content (as stored in id_offsets).
'''
buf = self.buf
id_offsets = self.id_offsets
for href, hoffs in self.href_offsets.items():
# Iterate over all filepos items
if href not in id_offsets:
self.logger.warn('Hyperlink target %r not found' % href)
# Link to the top of the document, better than just ignoring
href, _ = urldefrag(href)
if href in self.id_offsets:
ioff = self.id_offsets[href]
for hoff in hoffs:
buf.seek(hoff)
buf.write(b'%010d' % ioff)