diff --git a/src/calibre/ebooks/mobi/output.py b/src/calibre/ebooks/mobi/output.py index c70f36b8c2..982ee74a2b 100644 --- a/src/calibre/ebooks/mobi/output.py +++ b/src/calibre/ebooks/mobi/output.py @@ -184,7 +184,12 @@ class MOBIOutput(OutputFormatPlugin): mobimlizer(oeb, opts) self.check_for_periodical() write_page_breaks_after_item = input_plugin is not plugin_for_input_format('cbz') - from calibre.ebooks.mobi.writer import MobiWriter + from calibre.utils.config import tweaks + if tweaks.get('new_mobi_writer', False): + from calibre.ebooks.mobi.writer2.main import MobiWriter + MobiWriter + else: + from calibre.ebooks.mobi.writer import MobiWriter writer = MobiWriter(opts, write_page_breaks_after_item=write_page_breaks_after_item) writer(oeb, output_path) diff --git a/src/calibre/ebooks/mobi/writer2/__init__.py b/src/calibre/ebooks/mobi/writer2/__init__.py new file mode 100644 index 0000000000..df3dcefb94 --- /dev/null +++ b/src/calibre/ebooks/mobi/writer2/__init__.py @@ -0,0 +1,15 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2011, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + + +UNCOMPRESSED = 1 +PALMDOC = 2 +HUFFDIC = 17480 +PALM_MAX_IMAGE_SIZE = 63 * 1024 + diff --git a/src/calibre/ebooks/mobi/writer2/main.py b/src/calibre/ebooks/mobi/writer2/main.py new file mode 100644 index 0000000000..bb50d6cb59 --- /dev/null +++ b/src/calibre/ebooks/mobi/writer2/main.py @@ -0,0 +1,579 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2011, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import re, random, time +from cStringIO import StringIO +from struct import pack + +from calibre.ebooks import normalize +from calibre.ebooks.oeb.base 
# Disabled as I don't care about uncrossable breaks
WRITE_UNCROSSABLE_BREAKS = False

RECORD_SIZE = 0x1000 # 4096

IMAGE_MAX_SIZE = 10 * 1024 * 1024
MAX_THUMB_SIZE = 16 * 1024
MAX_THUMB_DIMEN = (180, 240)

# Almost like the one for MS LIT, but not quite.
DECINT_FORWARD = 0
DECINT_BACKWARD = 1

def decint(value, direction):
    '''
    Some parts of the Mobipocket format encode data as variable-width integers.
    These integers are represented big-endian with 7 bits per byte in bits 1-7.
    They may be either forward-encoded, in which case only the LSB has bit 8 set,
    or backward-encoded, in which case only the MSB has bit 8 set.
    For example, the number 0x11111 would be represented forward-encoded as:

        0x04 0x22 0x91

    And backward-encoded as:

        0x84 0x22 0x11

    This function encodes the integer ``value`` as a variable width integer and
    returns the bytestring corresponding to it.

    :param value: Non-negative integer to encode.
    :param direction: ``DECINT_FORWARD`` or ``DECINT_BACKWARD``. Any other
        value leaves every byte unflagged.
    :raises ValueError: If ``value`` is negative (a right shift never brings a
        negative number to zero, so encoding would not terminate).
    '''
    if value < 0:
        raise ValueError('Cannot encode negative value %r as a variable '
                'width integer' % (value,))
    # Collect the 7-bit groups, least significant group first
    byts = bytearray()
    while True:
        byts.append(value & 0x7f)
        value >>= 7
        if value == 0:
            break
    # At this point byts[0] is the least significant group and byts[-1] the
    # most significant one
    if direction == DECINT_FORWARD:
        byts[0] |= 0x80
    elif direction == DECINT_BACKWARD:
        byts[-1] |= 0x80
    # The on-disk representation is big-endian: most significant group first
    byts.reverse()
    return bytes(byts)

def rescale_image(data, maxsizeb=IMAGE_MAX_SIZE, dimen=None):
    '''
    Convert image setting all transparent pixels to white and changing format
    to JPEG. Ensure the resultant image has a byte size less than
    maxsizeb.

    If dimen is not None, generate a thumbnail instead. dimen may be a single
    number (used as both width and height) or a (width, height) pair such as
    MAX_THUMB_DIMEN.

    Returns the image as a bytestring. If neither lowering the JPEG quality
    nor shrinking the image gets below maxsizeb, the last attempt is returned
    anyway.
    '''
    if dimen is not None:
        if hasattr(dimen, '__len__'):
            width, height = dimen
        else:
            width = height = dimen
        data = thumbnail(data, width=width, height=height,
                compression_quality=90)[-1]
    else:
        # Replace transparent pixels with white pixels and convert to JPEG
        data = save_cover_data_to(data, 'img.jpg', return_data=True)
    if len(data) <= maxsizeb:
        return data

    # First try: progressively lower the JPEG quality
    img = Image()
    quality = 95
    img.load(data)
    while len(data) >= maxsizeb and quality >= 10:
        quality -= 5
        img.set_compression_quality(quality)
        data = img.export('jpg')
    if len(data) <= maxsizeb:
        return data

    # Second try: keep the lowest quality reached and shrink the dimensions,
    # always scaling from the same base image so errors do not accumulate
    orig_data = data
    scale = 0.9
    while len(data) >= maxsizeb and scale >= 0.05:
        img = Image()
        img.load(orig_data)
        w, h = img.size
        img.size = (int(scale*w), int(scale*h))
        img.set_compression_quality(quality)
        data = img.export('jpg')
        scale -= 0.05
    return data
def generate_text(self):
    '''
    Serialize the book markup and split it into text records.

    The serialized text is cut into chunks of RECORD_SIZE bytes (see
    read_text_record), each chunk is PalmDoc compressed unless compression
    is disabled, and the result is appended to self.records.

    Every text record ends with the bytes needed to complete a multibyte
    character truncated at the record boundary, followed by one byte holding
    the number of such bytes. generate_record0 advertises this trailing entry
    by setting bit 0 of the extra record data flags, so it must be part of
    the stored record.

    Sets self.anchor_offset_kindle, self.id_offsets, self.content_length,
    self.text_length and self.text_nrecords.
    '''
    self.oeb.logger.info('Serializing markup content...')
    serializer = Serializer(self.oeb, self.images,
            write_page_breaks_after_item=self.write_page_breaks_after_item)
    text = serializer()
    breaks = serializer.breaks
    self.anchor_offset_kindle = serializer.anchor_offset_kindle
    self.id_offsets = serializer.id_offsets
    self.content_length = len(text)
    self.text_length = len(text)
    text = StringIO(text)
    nrecords = 0
    offset = 0

    if self.compression != UNCOMPRESSED:
        self.oeb.logger.info(' Compressing markup content...')
    data, overlap = self.read_text_record(text)

    while len(data) > 0:
        if self.compression == PALMDOC:
            data = compress_doc(data)
        record = StringIO()
        record.write(data)

        # Write information about the multibyte character overlap, if any.
        # This has to happen before the next record is read, as reading
        # replaces overlap with the value for the following record.
        record.write(overlap)
        record.write(pack('>B', len(overlap)))

        nrecords += 1
        offset += RECORD_SIZE

        # Write information about uncrossable breaks (non linear items in
        # the spine)
        if WRITE_UNCROSSABLE_BREAKS:
            nextra = 0
            pbreak = 0
            running = offset

            # Write information about every uncrossable break that occurs in
            # the next record.
            while breaks and (breaks[0] - offset) < RECORD_SIZE:
                pbreak = (breaks.pop(0) - running) >> 3
                encoded = decint(pbreak, DECINT_FORWARD)
                record.write(encoded)
                running += pbreak << 3
                nextra += len(encoded)
            # The size field counts itself, so grow it until it is
            # self-consistent
            lsize = 1
            while True:
                size = decint(nextra + lsize, DECINT_BACKWARD)
                if len(size) == lsize:
                    break
                lsize += 1
            record.write(size)

        # Only store the record once all its trailing entries are written
        self.records.append(record.getvalue())
        data, overlap = self.read_text_record(text)

    self.text_nrecords = nrecords + 1
def generate_end_records(self):
    '''
    Append the four byte end record to self.records and store its index in
    self.flis_number.

    The record is written as an explicit bytestring: this module enables
    unicode_literals, so an unprefixed literal would be a unicode string
    with non-ASCII characters, which cannot be written to the binary
    output stream.
    '''
    # NOTE(review): the attribute is called flis_number although the bytes
    # appended here do not start with FLIS -- confirm what is intended.
    self.flis_number = len(self.records)
    self.records.append(b'\xE9\x8E\x0D\x0A')
record0.write(pack('>IIIII', + 0xe8, 0x002, 65001, uid, 6)) + + # 0x18 - 0x1f : Unknown + record0.write(b'\xff' * 8) + + + # 0x20 - 0x23 : Secondary index record + record0.write(pack('>I', 0xffffffff)) + + # 0x24 - 0x3f : Unknown + record0.write(b'\xff' * 28) + + # 0x40 - 0x43 : Offset of first non-text record + record0.write(pack('>I', + self.text_nrecords + 1)) + + # 0x44 - 0x4b : title offset, title length + record0.write(pack('>II', + 0xe8 + 16 + len(exth), len(title))) + + # 0x4c - 0x4f : Language specifier + record0.write(iana2mobi( + str(metadata.language[0]))) + + # 0x50 - 0x57 : Unknown + record0.write(b'\0' * 8) + + # 0x58 - 0x5b : Format version + # 0x5c - 0x5f : First image record number + record0.write(pack('>II', + 6, self.first_image_record if self.first_image_record else 0)) + + # 0x60 - 0x63 : First HUFF/CDIC record number + # 0x64 - 0x67 : Number of HUFF/CDIC records + # 0x68 - 0x6b : First DATP record number + # 0x6c - 0x6f : Number of DATP records + record0.write(b'\0' * 16) + + # 0x70 - 0x73 : EXTH flags + record0.write(pack('>I', 0x50)) + + # 0x74 - 0x93 : Unknown + record0.write(b'\0' * 32) + + # 0x94 - 0x97 : DRM offset + # 0x98 - 0x9b : DRM count + # 0x9c - 0x9f : DRM size + # 0xa0 - 0xa3 : DRM flags + record0.write(pack('>IIII', + 0xffffffff, 0xffffffff, 0, 0)) + + + # 0xa4 - 0xaf : Unknown + record0.write(b'\0'*12) + + # 0xb0 - 0xb1 : First content record number + # 0xb2 - 0xb3 : last content record number + # (Includes Image, DATP, HUFF, DRM) + record0.write(pack('>HH', 1, last_content_record)) + + # 0xb4 - 0xb7 : Unknown + record0.write(b'\0\0\0\x01') + + # 0xb8 - 0xbb : FCIS record number + record0.write(pack('>I', 0xffffffff)) + + # 0xbc - 0xbf : Unknown (FCIS record count?) + record0.write(pack('>I', 0xffffffff)) + + # 0xc0 - 0xc3 : FLIS record number + record0.write(pack('>I', 0xffffffff)) + + # 0xc4 - 0xc7 : Unknown (FLIS record count?) 
def build_exth(self): # {{{
    '''
    Build the EXTH header from the metadata of self.oeb.

    Writes one EXTH record for every metadata item whose term is listed in
    EXTH_CODES, followed by records for the book UUID (code 113), the
    cdetype (code 501, only when not generating a periodical), the
    publication date and, if the cover image is one of the images in
    self.images, the cover and thumbnail record indices.

    Returns the complete header as a bytestring: the ``EXTH`` signature,
    the length and record count, the records and 1 to 4 bytes of zero
    padding.

    :raises NotImplementedError: If the metadata has neither a date nor a
        timestamp.
    '''
    oeb = self.oeb
    exth = StringIO()
    nrecs = 0
    for term in oeb.metadata:
        if term not in EXTH_CODES: continue
        code = EXTH_CODES[term]
        items = oeb.metadata[term]
        if term == 'creator':
            # All creators are merged into a single record
            if self.prefer_author_sort:
                creators = [normalize(unicode(c.file_as or c)) for c in items]
            else:
                creators = [normalize(unicode(c)) for c in items]
            items = ['; '.join(creators)]
        for item in items:
            data = self.COLLAPSE_RE.sub(' ', normalize(unicode(item)))
            if term == 'identifier':
                # Only ISBN identifiers are written
                if data.lower().startswith('urn:isbn:'):
                    data = data[9:]
                elif item.scheme.lower() == 'isbn':
                    pass
                else:
                    continue
            data = data.encode('utf-8')
            exth.write(pack('>II', code, len(data) + 8))
            exth.write(data)
            nrecs += 1
        if term == 'rights' :
            try:
                rights = normalize(unicode(oeb.metadata.rights[0])).encode('utf-8')
            except Exception:
                rights = b'Unknown'
            exth.write(pack('>II', EXTH_CODES['rights'], len(rights) + 8))
            exth.write(rights)
            nrecs += 1

    # Write UUID as ASIN
    uuid = None
    from calibre.ebooks.oeb.base import OPF
    for x in oeb.metadata['identifier']:
        # The scheme attribute may be missing, in which case get() returns
        # None
        scheme = x.get(OPF('scheme'), None) or ''
        if (scheme.lower() == 'uuid' or
                unicode(x).startswith('urn:uuid:')):
            uuid = unicode(x).split(':')[-1]
            break
    if uuid is None:
        from uuid import uuid4
        uuid = str(uuid4())

    if isinstance(uuid, unicode):
        uuid = uuid.encode('utf-8')
    exth.write(pack('>II', 113, len(uuid) + 8))
    exth.write(uuid)
    nrecs += 1

    # Write cdetype
    if not self.opts.mobi_periodical:
        data = b'EBOK'
        exth.write(pack('>II', 501, len(data)+8))
        exth.write(data)
        nrecs += 1

    # Add a publication date entry
    datestr = None
    if oeb.metadata['date'] != [] :
        datestr = str(oeb.metadata['date'][0])
    elif oeb.metadata['timestamp'] != [] :
        datestr = str(oeb.metadata['timestamp'][0])

    if datestr is not None:
        exth.write(pack('>II', EXTH_CODES['pubdate'], len(datestr) + 8))
        exth.write(datestr)
        nrecs += 1
    else:
        raise NotImplementedError("missing date or timestamp needed for mobi_periodical")

    if (oeb.metadata.cover and
            unicode(oeb.metadata.cover[0]) in oeb.manifest.ids):
        cover_id = unicode(oeb.metadata.cover[0])
        item = oeb.manifest.ids[cover_id]
        href = item.href
        if href in self.images:
            # Image indices in self.images start at 1, the values stored
            # in the EXTH records start at 0
            index = self.images[href] - 1
            exth.write(pack('>III', 0xc9, 0x0c, index))
            exth.write(pack('>III', 0xcb, 0x0c, 0))
            nrecs += 2
            index = self.add_thumbnail(item)
            if index is not None:
                exth.write(pack('>III', 0xca, 0x0c, index - 1))
                nrecs += 1

    exth = exth.getvalue()
    trail = len(exth) % 4
    pad = b'\0' * (4 - trail) # Always pad w/ at least 1 byte
    exth = [b'EXTH', pack('>II', len(exth) + 12, nrecs), exth, pad]
    return b''.join(exth)
# }}}
def write_header(self):
    '''
    Write the database header and the record list to self.stream.

    The header starts with the book title in a fixed 32 byte field,
    followed by the creation and modification times, the ``BOOK``/``MOBI``
    type and creator codes and the number of records. After the header
    comes one eight byte entry per record in self.records (offset of the
    record in the file, one zero byte and a three byte id) and two bytes of
    padding.

    Must be called after all records have been generated, as the record
    offsets are computed from the lengths of the entries in self.records.
    '''
    title = ascii_filename(unicode(self.oeb.metadata.title[0]))
    # The name field is exactly 32 bytes. Keep at most 31 bytes of title so
    # that the field is always NUL terminated and a long title cannot
    # push the rest of the header out of position.
    title = title[:31]
    title = title + (b'\0' * (32 - len(title)))
    now = int(time.time())
    nrecords = len(self.records)
    self.write(title, pack('>HHIIIIII', 0, 0, now, now, 0, 0, 0, 0),
        b'BOOK', b'MOBI', pack('>IIH', nrecords, 0, nrecords))
    # The first record starts right after the record list (8 bytes per
    # record) and the two padding bytes
    offset = self.tell() + (8 * nrecords) + 2
    for i, record in enumerate(self.records):
        self.write(pack('>I', offset), b'\0', pack('>I', 2*i)[1:])
        offset += len(record)
    self.write(b'\0\0')
+ + :param write_page_breaks_after_item: If True a MOBIpocket pagebreak tag + is written after every element of the spine in ``oeb``. + ''' + self.oeb = oeb + self.images = images + self.logger = oeb.logger + self.write_page_breaks_after_item = write_page_breaks_after_item + + # Mapping of hrefs (urlnormalized) to the offset in the buffer where + # the resource pointed to by the href lives. Used at the end to fill in + # the correct values into all filepos="..." links. + self.id_offsets = {} + + # Mapping of hrefs (urlnormalized) to a list of offsets into the buffer + # where filepos="..." elements are written corresponding to links that + # point to the href. This is used at the end to fill in the correct values. + self.href_offsets = defaultdict(list) + + # List of offsets in the buffer of non linear items in the spine. These + # become uncrossable breaks in the MOBI + self.breaks = [] + + def __call__(self): + ''' + Return the document serialized as a single UTF-8 encoded bytestring. + ''' + buf = self.buf = StringIO() + buf.write(b'') + self.serialize_head() + self.serialize_body() + buf.write(b'') + self.fixup_links() + return buf.getvalue() + + def serialize_head(self): + buf = self.buf + buf.write(b'') + if len(self.oeb.guide) > 0: + self.serialize_guide() + buf.write(b'') + + def serialize_guide(self): + ''' + The Kindle decides where to open a book based on the presence of + an item in the guide that looks like + + + Similarly an item with type="toc" controls where the Goto Table of + Contents operation on the kindle goes. + ''' + + buf = self.buf + hrefs = self.oeb.manifest.hrefs + buf.write(b'') + for ref in self.oeb.guide.values(): + path = urldefrag(ref.href)[0] + if path not in hrefs or hrefs[path].media_type not in OEB_DOCS: + continue + + buf.write(b'') + + buf.write(b'') + + def serialize_href(self, href, base=None): + ''' + Serialize the href attribute of an or tag. 
It is + serialized as filepos="000000000" and a pointer to its location is + stored in self.href_offsets so that the correct value can be filled in + at the end. + ''' + hrefs = self.oeb.manifest.hrefs + path, frag = urldefrag(urlnormalize(href)) + if path and base: + path = base.abshref(path) + if path and path not in hrefs: + return False + buf = self.buf + item = hrefs[path] if path else None + if item and item.spine_position is None: + return False + path = item.href if item else base.href + href = '#'.join((path, frag)) if frag else path + buf.write(b'filepos=') + self.href_offsets[href].append(buf.tell()) + buf.write(b'0000000000') + return True + + def serialize_body(self): + ''' + Serialize all items in the spine of the document. Non linear items are + moved to the end. + ''' + buf = self.buf + self.anchor_offset = buf.tell() + buf.write(b'') + self.anchor_offset_kindle = buf.tell() + spine = [item for item in self.oeb.spine if item.linear] + spine.extend([item for item in self.oeb.spine if not item.linear]) + for item in spine: + self.serialize_item(item) + buf.write(b'') + + def serialize_item(self, item): + ''' + Serialize an individual item from the spine of the input document. + A reference to this item is stored in self.href_offsets + ''' + buf = self.buf + if not item.linear: + self.breaks.append(buf.tell() - 1) + self.id_offsets[urlnormalize(item.href)] = buf.tell() + # Kindle periodical articles are contained in a
tag + buf.write(b'
') + for elem in item.data.find(XHTML('body')): + self.serialize_elem(elem, item) + # Kindle periodical article end marker + buf.write(b'
') + if self.write_page_breaks_after_item: + buf.write(b'') + buf.write(b'
') + self.anchor_offset = None + + def serialize_elem(self, elem, item, nsrmap=NSRMAP): + buf = self.buf + if not isinstance(elem.tag, basestring) \ + or namespace(elem.tag) not in nsrmap: + return + tag = prefixname(elem.tag, nsrmap) + # Previous layers take care of @name + id_ = elem.attrib.pop('id', None) + if id_: + href = '#'.join((item.href, id_)) + offset = self.anchor_offset or buf.tell() + self.id_offsets[urlnormalize(href)] = offset + if self.anchor_offset is not None and \ + tag == 'a' and not elem.attrib and \ + not len(elem) and not elem.text: + return + self.anchor_offset = buf.tell() + buf.write(b'<') + buf.write(tag.encode('utf-8')) + if elem.attrib: + for attr, val in elem.attrib.items(): + if namespace(attr) not in nsrmap: + continue + attr = prefixname(attr, nsrmap) + buf.write(b' ') + if attr == 'href': + if self.serialize_href(val, item): + continue + elif attr == 'src': + href = urlnormalize(item.abshref(val)) + if href in self.images: + index = self.images[href] + buf.write(b'recindex="%05d"' % index) + continue + buf.write(attr.encode('utf-8')) + buf.write(b'="') + self.serialize_text(val, quot=True) + buf.write(b'"') + buf.write(b'>') + if elem.text or len(elem) > 0: + if elem.text: + self.anchor_offset = None + self.serialize_text(elem.text) + for child in elem: + self.serialize_elem(child, item) + if child.tail: + self.anchor_offset = None + self.serialize_text(child.tail) + buf.write(b'' % tag.encode('utf-8')) + + def serialize_text(self, text, quot=False): + text = text.replace('&', '&') + text = text.replace('<', '<') + text = text.replace('>', '>') + text = text.replace(u'\u00AD', '') # Soft-hyphen + if quot: + text = text.replace('"', '"') + self.buf.write(text.encode('utf-8')) + + def fixup_links(self): + ''' + Fill in the correct values for all filepos="..." links with the offsets + of the linked to content (as stored in id_offsets). 
+ ''' + buf = self.buf + id_offsets = self.id_offsets + for href, hoffs in self.href_offsets.items(): + # Iterate over all filepos items + if href not in id_offsets: + self.logger.warn('Hyperlink target %r not found' % href) + # Link to the top of the document, better than just ignoring + href, _ = urldefrag(href) + if href in self.id_offsets: + ioff = self.id_offsets[href] + for hoff in hoffs: + buf.seek(hoff) + buf.write(b'%010d' % ioff) + +