From d1b6bb705dd0ea2700a9f0cb5feadbf0b576263f Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 9 Mar 2012 21:30:24 +0530 Subject: [PATCH] Support for reading KF8 --- src/calibre/customize/builtins.py | 2 +- src/calibre/devices/kindle/apnx.py | 2 +- .../ebooks/conversion/plugins/mobi_input.py | 35 +- src/calibre/ebooks/metadata/mobi.py | 73 ++++ src/calibre/ebooks/metadata/toc.py | 2 +- src/calibre/ebooks/mobi/reader/__init__.py | 11 + src/calibre/ebooks/mobi/reader/headers.py | 258 ++++++++++++ src/calibre/ebooks/mobi/reader/index.py | 195 +++++++++ src/calibre/ebooks/mobi/reader/markup.py | 307 ++++++++++++++ .../mobi/{reader.py => reader/mobi6.py} | 336 +++------------ src/calibre/ebooks/mobi/reader/mobi8.py | 390 ++++++++++++++++++ src/calibre/ebooks/mobi/reader/ncx.py | 84 ++++ src/calibre/ebooks/mobi/utils.py | 24 +- 13 files changed, 1426 insertions(+), 293 deletions(-) create mode 100644 src/calibre/ebooks/mobi/reader/__init__.py create mode 100644 src/calibre/ebooks/mobi/reader/headers.py create mode 100644 src/calibre/ebooks/mobi/reader/index.py create mode 100644 src/calibre/ebooks/mobi/reader/markup.py rename src/calibre/ebooks/mobi/{reader.py => reader/mobi6.py} (75%) create mode 100644 src/calibre/ebooks/mobi/reader/mobi8.py create mode 100644 src/calibre/ebooks/mobi/reader/ncx.py diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 50f77108d9..2908444665 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -263,7 +263,7 @@ class MOBIMetadataReader(MetadataReaderPlugin): description = _('Read metadata from %s files')%'MOBI' def get_metadata(self, stream, ftype): - from calibre.ebooks.mobi.reader import get_metadata + from calibre.ebooks.metadata.mobi import get_metadata return get_metadata(stream) class ODTMetadataReader(MetadataReaderPlugin): diff --git a/src/calibre/devices/kindle/apnx.py b/src/calibre/devices/kindle/apnx.py index 75b0804e6a..4336df81a6 100644 --- 
a/src/calibre/devices/kindle/apnx.py +++ b/src/calibre/devices/kindle/apnx.py @@ -10,7 +10,7 @@ Generates and writes an APNX page mapping file. import struct -from calibre.ebooks.mobi.reader import MobiReader +from calibre.ebooks.mobi.reader.mobi6 import MobiReader from calibre.ebooks.pdb.header import PdbHeaderReader from calibre.utils.logging import default_log diff --git a/src/calibre/ebooks/conversion/plugins/mobi_input.py b/src/calibre/ebooks/conversion/plugins/mobi_input.py index e5a3c6ac10..2cf4ef50a2 100644 --- a/src/calibre/ebooks/conversion/plugins/mobi_input.py +++ b/src/calibre/ebooks/conversion/plugins/mobi_input.py @@ -3,7 +3,10 @@ __license__ = 'GPL 3' __copyright__ = '2009, Kovid Goyal ' __docformat__ = 'restructuredtext en' +import os + from calibre.customize.conversion import InputFormatPlugin +from calibre.ptempfile import PersistentTemporaryDirectory class MOBIInput(InputFormatPlugin): @@ -14,17 +17,43 @@ class MOBIInput(InputFormatPlugin): def convert(self, stream, options, file_ext, log, accelerators): - from calibre.ebooks.mobi.reader import MobiReader + + if os.environ.get('USE_MOBIUNPACK', None) is not None: + try: + from mobiunpack.mobi_unpack import Mobi8Reader + from calibre.customize.ui import plugin_for_input_format + + wdir = PersistentTemporaryDirectory('_unpack_space') + m8r = Mobi8Reader(stream, wdir) + if m8r.isK8(): + epub_path = m8r.processMobi8() + epub_input = plugin_for_input_format('epub') + for opt in epub_input.options: + setattr(options, opt.option.name, opt.recommended_value) + options.input_encoding = m8r.getCodec() + return epub_input.convert(open(epub_path,'rb'), options, + 'epub', log, accelerators) + except Exception: + log.exception('mobi_unpack code not working') + + from calibre.ebooks.mobi.reader.mobi6 import MobiReader from lxml import html parse_cache = {} try: mr = MobiReader(stream, log, options.input_encoding, options.debug_pipeline) - mr.extract_content(u'.', parse_cache) + if mr.kf8_type is None: + 
mr.extract_content(u'.', parse_cache) + except: mr = MobiReader(stream, log, options.input_encoding, options.debug_pipeline, try_extra_data_fix=True) - mr.extract_content(u'.', parse_cache) + if mr.kf8_type is None: + mr.extract_content(u'.', parse_cache) + + if mr.kf8_type is not None: + from calibre.ebooks.mobi.reader.mobi8 import Mobi8Reader + return os.path.abspath(Mobi8Reader(mr, log)()) raw = parse_cache.pop('calibre_raw_mobi_markup', False) if raw: diff --git a/src/calibre/ebooks/metadata/mobi.py b/src/calibre/ebooks/metadata/mobi.py index cf8fdbd7e8..911421a6ce 100644 --- a/src/calibre/ebooks/metadata/mobi.py +++ b/src/calibre/ebooks/metadata/mobi.py @@ -9,6 +9,7 @@ __copyright__ = '2009, Kovid Goyal kovid@kovidgoyal.net and ' \ 'Marshall T. Vandegrift ' __docformat__ = 'restructuredtext en' +import os, cStringIO from struct import pack, unpack from cStringIO import StringIO @@ -433,3 +434,75 @@ def set_metadata(stream, mi): mu = MetadataUpdater(stream) mu.update(mi) return + +def get_metadata(stream): + from calibre.ebooks.metadata import MetaInformation + from calibre.ptempfile import TemporaryDirectory + from calibre.ebooks.mobi.reader.headers import MetadataHeader + from calibre.ebooks.mobi.reader.mobi6 import MobiReader + from calibre import CurrentDir + + try: + from PIL import Image as PILImage + PILImage + except ImportError: + import Image as PILImage + + + stream.seek(0) + try: + raw = stream.read(3) + except: + raw = '' + stream.seek(0) + if raw == b'TPZ': + from calibre.ebooks.metadata.topaz import get_metadata + return get_metadata(stream) + from calibre.utils.logging import Log + log = Log() + try: + mi = MetaInformation(os.path.basename(stream.name), [_('Unknown')]) + except: + mi = MetaInformation(_('Unknown'), [_('Unknown')]) + mh = MetadataHeader(stream, log) + if mh.title and mh.title != _('Unknown'): + mi.title = mh.title + + if mh.exth is not None: + if mh.exth.mi is not None: + mi = mh.exth.mi + else: + size = 1024**3 + if 
hasattr(stream, 'seek') and hasattr(stream, 'tell'): + pos = stream.tell() + stream.seek(0, 2) + size = stream.tell() + stream.seek(pos) + if size < 4*1024*1024: + with TemporaryDirectory('_mobi_meta_reader') as tdir: + with CurrentDir(tdir): + mr = MobiReader(stream, log) + parse_cache = {} + mr.extract_content(tdir, parse_cache) + if mr.embedded_mi is not None: + mi = mr.embedded_mi + if hasattr(mh.exth, 'cover_offset'): + cover_index = mh.first_image_index + mh.exth.cover_offset + data = mh.section_data(int(cover_index)) + else: + try: + data = mh.section_data(mh.first_image_index) + except: + data = '' + buf = cStringIO.StringIO(data) + try: + im = PILImage.open(buf) + except: + log.exception('Failed to read MOBI cover') + else: + obuf = cStringIO.StringIO() + im.convert('RGB').save(obuf, format='JPEG') + mi.cover_data = ('jpg', obuf.getvalue()) + return mi + + diff --git a/src/calibre/ebooks/metadata/toc.py b/src/calibre/ebooks/metadata/toc.py index 370246b288..a95ff9f44c 100644 --- a/src/calibre/ebooks/metadata/toc.py +++ b/src/calibre/ebooks/metadata/toc.py @@ -46,7 +46,7 @@ class TOC(list): self.toc_thumbnail = toc_thumbnail def __str__(self): - lines = ['TOC: %s#%s'%(self.href, self.fragment)] + lines = ['TOC: %s#%s %s'%(self.href, self.fragment, self.text)] for child in self: c = str(child).splitlines() for l in c: diff --git a/src/calibre/ebooks/mobi/reader/__init__.py b/src/calibre/ebooks/mobi/reader/__init__.py new file mode 100644 index 0000000000..dd9615356c --- /dev/null +++ b/src/calibre/ebooks/mobi/reader/__init__.py @@ -0,0 +1,11 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2012, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + + + diff --git a/src/calibre/ebooks/mobi/reader/headers.py b/src/calibre/ebooks/mobi/reader/headers.py new file mode 100644 index 0000000000..8cff1360de 
--- /dev/null +++ b/src/calibre/ebooks/mobi/reader/headers.py @@ -0,0 +1,258 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import (absolute_import, print_function) + +__license__ = 'GPL v3' +__copyright__ = '2012, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import struct, re, os + +from calibre import replace_entities +from calibre.utils.date import parse_date +from calibre.ebooks.mobi import MobiError +from calibre.ebooks.metadata import MetaInformation +from calibre.ebooks.mobi.langcodes import main_language, sub_language, mobi2iana + +NULL_INDEX = 0xffffffff + +class EXTHHeader(object): # {{{ + + def __init__(self, raw, codec, title): + self.doctype = raw[:4] + self.length, self.num_items = struct.unpack('>LL', raw[4:12]) + raw = raw[12:] + pos = 0 + self.mi = MetaInformation(_('Unknown'), [_('Unknown')]) + self.has_fake_cover = True + self.start_offset = None + left = self.num_items + + while left > 0: + left -= 1 + id, size = struct.unpack('>LL', raw[pos:pos + 8]) + content = raw[pos + 8:pos + size] + pos += size + if id >= 100 and id < 200: + self.process_metadata(id, content, codec) + elif id == 203: + self.has_fake_cover = bool(struct.unpack('>L', content)[0]) + elif id == 201: + co, = struct.unpack('>L', content) + if co < NULL_INDEX: + self.cover_offset = co + elif id == 202: + self.thumbnail_offset, = struct.unpack('>L', content) + elif id == 501: + # cdetype + pass + elif id == 502: + # last update time + pass + elif id == 503: # Long title + # Amazon seems to regard this as the definitive book title + # rather than the title from the PDB header. 
In fact when + # sending MOBI files through Amazon's email service if the + # title contains non ASCII chars or non filename safe chars + # they are messed up in the PDB header + try: + title = content.decode(codec) + except: + pass + #else: + # print 'unknown record', id, repr(content) + if title: + self.mi.title = replace_entities(title) + + def process_metadata(self, id, content, codec): + if id == 100: + if self.mi.authors == [_('Unknown')]: + self.mi.authors = [] + au = content.decode(codec, 'ignore').strip() + self.mi.authors.append(au) + if re.match(r'\S+?\s*,\s+\S+', au.strip()): + self.mi.author_sort = au.strip() + elif id == 101: + self.mi.publisher = content.decode(codec, 'ignore').strip() + elif id == 103: + self.mi.comments = content.decode(codec, 'ignore') + elif id == 104: + self.mi.isbn = content.decode(codec, 'ignore').strip().replace('-', '') + elif id == 105: + if not self.mi.tags: + self.mi.tags = [] + self.mi.tags.extend([x.strip() for x in content.decode(codec, + 'ignore').split(';')]) + self.mi.tags = list(set(self.mi.tags)) + elif id == 106: + try: + self.mi.pubdate = parse_date(content, as_utc=False) + except: + pass + elif id == 108: + pass # Producer + elif id == 113: + pass # ASIN or UUID + elif id == 116: + self.start_offset, = struct.unpack(b'>L', content) + #else: + # print 'unhandled metadata record', id, repr(content) +# }}} + +class BookHeader(object): + + def __init__(self, raw, ident, user_encoding, log, try_extra_data_fix=False): + self.log = log + self.compression_type = raw[:2] + self.records, self.records_size = struct.unpack('>HH', raw[8:12]) + self.encryption_type, = struct.unpack('>H', raw[12:14]) + if ident == 'TEXTREAD': + self.codepage = 1252 + if len(raw) <= 16: + self.codec = 'cp1252' + self.extra_flags = 0 + self.title = _('Unknown') + self.language = 'ENGLISH' + self.sublanguage = 'NEUTRAL' + self.exth_flag, self.exth = 0, None + self.ancient = True + self.first_image_index = -1 + self.mobi_version = 1 + else: + 
self.ancient = False + self.doctype = raw[16:20] + self.length, self.type, self.codepage, self.unique_id, \ + self.version = struct.unpack('>LLLLL', raw[20:40]) + + try: + self.codec = { + 1252: 'cp1252', + 65001: 'utf-8', + }[self.codepage] + except (IndexError, KeyError): + self.codec = 'cp1252' if not user_encoding else user_encoding + log.warn('Unknown codepage %d. Assuming %s' % (self.codepage, + self.codec)) + # There exists some broken DRM removal tool that removes DRM but + # leaves the DRM fields in the header yielding a header size of + # 0xF8. The actual value of max_header_length should be 0xE8 but + # it's changed to accommodate this silly tool. Hopefully that will + # not break anything else. + max_header_length = 0xF8 + + if (ident == 'TEXTREAD' or self.length < 0xE4 or + self.length > max_header_length or + (try_extra_data_fix and self.length == 0xE4)): + self.extra_flags = 0 + else: + self.extra_flags, = struct.unpack('>H', raw[0xF2:0xF4]) + + if self.compression_type == 'DH': + self.huff_offset, self.huff_number = struct.unpack('>LL', + raw[0x70:0x78]) + + toff, tlen = struct.unpack('>II', raw[0x54:0x5c]) + tend = toff + tlen + self.title = raw[toff:tend] if tend < len(raw) else _('Unknown') + langcode = struct.unpack('!L', raw[0x5C:0x60])[0] + langid = langcode & 0xFF + sublangid = (langcode >> 10) & 0xFF + self.language = main_language.get(langid, 'ENGLISH') + self.sublanguage = sub_language.get(sublangid, 'NEUTRAL') + self.mobi_version = struct.unpack('>I', raw[0x68:0x6c])[0] + self.first_image_index = struct.unpack('>L', raw[0x6c:0x6c + 4])[0] + + self.exth_flag, = struct.unpack('>L', raw[0x80:0x84]) + self.exth = None + if not isinstance(self.title, unicode): + self.title = self.title.decode(self.codec, 'replace') + if self.exth_flag & 0x40: + try: + self.exth = EXTHHeader(raw[16 + self.length:], self.codec, + self.title) + self.exth.mi.uid = self.unique_id + try: + self.exth.mi.language = mobi2iana(langid, sublangid) + except: + 
self.log.exception('Unknown language code') + except: + self.log.exception('Invalid EXTH header') + self.exth_flag = 0 + + self.ncxidx = NULL_INDEX + if len(raw) >= 0xF8: + self.ncxidx, = struct.unpack_from(b'>L', raw, 0xF4) + + if self.mobi_version >= 8: + self.skelidx, = struct.unpack_from('>L', raw, 0xFC) + + # Index into
sections in raw_ml + self.dividx, = struct.unpack_from('>L', raw, 0xF8) + + # Index into Other files + self.othidx, = struct.unpack_from('>L', raw, 0x104) + + # need to use the FDST record to find out how to properly + # unpack the raw_ml into pieces it is simply a table of start + # and end locations for each flow piece + self.fdstidx, = struct.unpack_from('>L', raw, 0xC0) + self.fdstcnt, = struct.unpack_from('>L', raw, 0xC4) + # if cnt is 1 or less, fdst section number can be garbage + if self.fdstcnt <= 1: + self.fdstidx = NULL_INDEX + else: # Null values + self.skelidx = self.dividx = self.othidx = self.fdstidx = \ + NULL_INDEX + +class MetadataHeader(BookHeader): + + def __init__(self, stream, log): + self.stream = stream + self.ident = self.identity() + self.num_sections = self.section_count() + if self.num_sections >= 2: + header = self.header() + BookHeader.__init__(self, header, self.ident, None, log) + else: + self.exth = None + + def identity(self): + self.stream.seek(60) + ident = self.stream.read(8).upper() + if ident not in ['BOOKMOBI', 'TEXTREAD']: + raise MobiError('Unknown book type: %s' % ident) + return ident + + def section_count(self): + self.stream.seek(76) + return struct.unpack('>H', self.stream.read(2))[0] + + def section_offset(self, number): + self.stream.seek(78 + number * 8) + return struct.unpack('>LBBBB', self.stream.read(8))[0] + + def header(self): + section_headers = [] + # First section with the metadata + section_headers.append(self.section_offset(0)) + # Second section used to get the length of the first + section_headers.append(self.section_offset(1)) + + end_off = section_headers[1] + off = section_headers[0] + self.stream.seek(off) + return self.stream.read(end_off - off) + + def section_data(self, number): + start = self.section_offset(number) + if number == self.num_sections -1: + end = os.stat(self.stream.name).st_size + else: + end = self.section_offset(number + 1) + self.stream.seek(start) + try: + return 
self.stream.read(end - start) + except OverflowError: + self.stream.seek(start) + return self.stream.read() + diff --git a/src/calibre/ebooks/mobi/reader/index.py b/src/calibre/ebooks/mobi/reader/index.py new file mode 100644 index 0000000000..41177873b7 --- /dev/null +++ b/src/calibre/ebooks/mobi/reader/index.py @@ -0,0 +1,195 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2012, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import struct +from collections import OrderedDict + +from calibre.ebooks.mobi.utils import decint, count_set_bits + +class InvalidFile(ValueError): + pass + +def check_signature(data, signature): + if data[:len(signature)] != signature: + raise InvalidFile('Not a valid %r section'%signature) + +class NotAnINDXRecord(InvalidFile): + pass + +class NotATAGXSection(InvalidFile): + pass + +def format_bytes(byts): + byts = bytearray(byts) + byts = [hex(b)[2:] for b in byts] + return ' '.join(byts) + +def parse_indx_header(data): + check_signature(data, b'INDX') + words = ( + 'len', 'nul1', 'type', 'gen', 'start', 'count', 'code', + 'lng', 'total', 'ordt', 'ligt', 'nligt', 'ncncx' + ) + num = len(words) + values = struct.unpack(b'>%dL' % num, data[4:4*(num+1)]) + header = {words[i]:values[i] for i in xrange(num)} + return header + +class CNCX(object): # {{{ + + ''' + Parses the records that contain the compiled NCX (all strings from the + NCX). Presents a simple offset : string mapping interface to access the + data. 
+ ''' + + def __init__(self, records, codec): + self.records = OrderedDict() + record_offset = 0 + for raw in records: + pos = 0 + while pos < len(raw): + length, consumed = decint(raw[pos:]) + if length > 0: + try: + self.records[pos+record_offset] = raw[ + pos+consumed:pos+consumed+length].decode(codec) + except: + byts = raw[pos:] + r = format_bytes(byts) + print ('CNCX entry at offset %d has unknown format %s'%( + pos+record_offset, r)) + self.records[pos+record_offset] = r + pos = len(raw) + pos += consumed+length + record_offset += 0x10000 + + def __getitem__(self, offset): + return self.records.get(offset) + + def get(self, offset, default=None): + return self.records.get(offset, default) +# }}} + +def parse_tag_section(data): + check_signature(data, b'TAGX') + + tags = [] + first_entry_offset, = struct.unpack_from(b'>L', data, 0x04) + control_byte_count, = struct.unpack_from(b'>L', data, 0x08) + + # Skip the first 12 bytes already read above. + for i in xrange(12, first_entry_offset, 4): + pos = i + tags.append((ord(data[pos]), ord(data[pos+1]), ord(data[pos+2]), + ord(data[pos+3]))) + return control_byte_count, tags + +def get_tag_map(control_byte_count, tags, data, start, end): + ptags = [] + ans = {} + control_byte_index = 0 + data_start = start + control_byte_count + + for tag, values_per_entry, mask, end_flag in tags: + if end_flag == 0x01: + control_byte_index += 1 + continue + value = ord(data[start + control_byte_index]) & mask + if value != 0: + if value == mask: + if count_set_bits(mask) > 1: + # If all bits of masked value are set and the mask has more than one bit, a variable width value + # will follow after the control bytes which defines the length of bytes (NOT the value count!) + # which will contain the corresponding variable width values. 
+ value, consumed = decint(data[data_start:]) + data_start += consumed + ptags.append((tag, None, value, values_per_entry)) + else: + ptags.append((tag, 1, None, values_per_entry)) + else: + # Shift bits to get the masked value. + while mask & 0x01 == 0: + mask = mask >> 1 + value = value >> 1 + ptags.append((tag, value, None, values_per_entry)) + for tag, value_count, value_bytes, values_per_entry in ptags: + values = [] + if value_count != None: + # Read value_count * values_per_entry variable width values. + for _ in xrange(value_count*values_per_entry): + byts, consumed = decint(data[data_start:]) + data_start += consumed + values.append(byts) + else: + # Convert value_bytes to variable width values. + total_consumed = 0 + while total_consumed < value_bytes: + # Does this work for values_per_entry != 1? + byts, consumed = decint(data[data_start:]) + data_start += consumed + total_consumed += consumed + values.append(byts) + if total_consumed != value_bytes: + print ("Error: Should consume %s bytes, but consumed %s" % + (value_bytes, total_consumed)) + ans[tag] = values + # Test that all bytes have been processed if end is given. + if end is not None and data_start < end: + # The last entry might have some zero padding bytes, so complain only if non zero bytes are left. 
+ rest = data[data_start:end] + if rest.replace(b'\0', b''): + print ("Warning: There are unprocessed index bytes left: %s" % + format_bytes(rest)) + + return ans + +def read_index(sections, idx, codec): + table, cncx = OrderedDict(), CNCX([], codec) + + data = sections[idx][0] + + indx_header = parse_indx_header(data) + indx_count = indx_header['count'] + + if indx_header['ncncx'] > 0: + off = idx + indx_count + 1 + cncx_records = [x[0] for x in sections[off:off+indx_header['ncncx']]] + cncx = CNCX(cncx_records, codec) + + tag_section_start = indx_header['len'] + control_byte_count, tags = parse_tag_section(data[tag_section_start:]) + + for i in xrange(idx + 1, idx + 1 + indx_count): + data = sections[i][0] + header = parse_indx_header(data) + idxt_pos = header['start'] + entry_count = header['count'] + + # loop through to build up the IDXT position starts + idx_positions= [] + for j in xrange(entry_count): + pos, = struct.unpack_from(b'>H', data, idxt_pos + 4 + (2 * j)) + idx_positions.append(pos) + # The last entry ends before the IDXT tag (but there might be zero fill + # bytes we need to ignore!) 
+ idx_positions.append(idxt_pos) + + # For each entry in the IDXT build up the tag map and any associated + # text + for j in xrange(entry_count): + start, end = idx_positions[j:j+2] + text_length = ord(data[start]) + text = data[start+1:start+1+text_length] + tag_map = get_tag_map(control_byte_count, tags, data, + start+1+text_length, end) + table[text] = tag_map + + return table, cncx + diff --git a/src/calibre/ebooks/mobi/reader/markup.py b/src/calibre/ebooks/mobi/reader/markup.py new file mode 100644 index 0000000000..cac0e71515 --- /dev/null +++ b/src/calibre/ebooks/mobi/reader/markup.py @@ -0,0 +1,307 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2012, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import re, os + +def update_internal_links(mobi8_reader): + # need to update all links that are internal which + # are based on positions within the xhtml files **BEFORE** + # cutting and pasting any pieces into the xhtml text files + + # kindle:pos:fid:XXXX:off:YYYYYYYYYY (used for internal link within xhtml) + # XXXX is the offset in records into divtbl + # YYYYYYYYYYYY is a base32 number you add to the divtbl insertpos to get final position + + mr = mobi8_reader + + # pos:fid pattern + posfid_pattern = re.compile(br'''()''', re.IGNORECASE) + posfid_index_pattern = re.compile(br'''['"]kindle:pos:fid:([0-9|A-V]+):off:([0-9|A-V]+).*?["']''') + + parts = [] + for part in mr.parts: + srcpieces = posfid_pattern.split(part) + for j in xrange(1, len(srcpieces), 2): + tag = srcpieces[j] + if tag.startswith(b'<'): + for m in posfid_index_pattern.finditer(tag): + posfid = m.group(1) + offset = m.group(2) + filename, idtag = mr.get_id_tag_by_pos_fid(posfid, offset) + suffix = (b'#' + idtag) if idtag else b'' + replacement = filename.encode(mr.header.codec) + suffix + tag = 
posfid_index_pattern.sub(replacement, tag, 1) + srcpieces[j] = tag + part = ''.join([x.decode(mr.header.codec) for x in srcpieces]) + parts.append(part) + + # All parts are now unicode and have no internal links + return parts + +def remove_kindlegen_markup(parts): + + # we can safely remove all of the Kindlegen generated aid tags + find_tag_with_aid_pattern = re.compile(r'''(<[^>]*\said\s*=[^>]*>)''', + re.IGNORECASE) + within_tag_aid_position_pattern = re.compile(r'''\said\s*=['"][^'"]*['"]''') + + for i in xrange(len(parts)): + part = parts[i] + srcpieces = find_tag_with_aid_pattern.split(part) + for j in range(len(srcpieces)): + tag = srcpieces[j] + if tag.startswith('<'): + for m in within_tag_aid_position_pattern.finditer(tag): + replacement = '' + tag = within_tag_aid_position_pattern.sub(replacement, tag, + 1) + srcpieces[j] = tag + part = "".join(srcpieces) + parts[i] = part + + # we can safely remove all of the Kindlegen generated data-AmznPageBreak tags + find_tag_with_AmznPageBreak_pattern = re.compile( + r'''(<[^>]*\sdata-AmznPageBreak=[^>]*>)''', re.IGNORECASE) + within_tag_AmznPageBreak_position_pattern = re.compile( + r'''\sdata-AmznPageBreak=['"][^'"]*['"]''') + + for i in xrange(len(parts)): + part = parts[i] + srcpieces = find_tag_with_AmznPageBreak_pattern.split(part) + for j in range(len(srcpieces)): + tag = srcpieces[j] + if tag.startswith('<'): + for m in within_tag_AmznPageBreak_position_pattern.finditer(tag): + replacement = '' + tag = within_tag_AmznPageBreak_position_pattern.sub(replacement, tag, 1) + srcpieces[j] = tag + part = "".join(srcpieces) + parts[i] = part + +def update_flow_links(mobi8_reader, resource_map, log): + # kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images) + # kindle:flow:XXXX?mime=YYYY/ZZZ (used for style sheets, svg images, etc) + # kindle:embed:XXXX (used for fonts) + + mr = mobi8_reader + flows = [] + + img_pattern = re.compile(r'''(<[img\s|image\s][^>]*>)''', re.IGNORECASE) + img_index_pattern = 
re.compile(r'''['"]kindle:embed:([0-9|A-V]+)[^'"]*['"]''', re.IGNORECASE) + + tag_pattern = re.compile(r'''(<[^>]*>)''') + flow_pattern = re.compile(r'''['"]kindle:flow:([0-9|A-V]+)\?mime=([^'"]+)['"]''', re.IGNORECASE) + + url_pattern = re.compile(r'''(url\(.*?\))''', re.IGNORECASE) + url_img_index_pattern = re.compile(r'''kindle:embed:([0-9|A-V]+)\?mime=image/[^\)]*''', re.IGNORECASE) + font_index_pattern = re.compile(r'''kindle:embed:([0-9|A-V]+)''', re.IGNORECASE) + url_css_index_pattern = re.compile(r'''kindle:flow:([0-9|A-V]+)\?mime=text/css[^\)]*''', re.IGNORECASE) + + for flow in mr.flows: + if flow is None: # 0th flow is None + flows.append(flow) + continue + + if not isinstance(flow, unicode): + flow = flow.decode(mr.header.codec) + + # links to raster image files from image tags + # image_pattern + srcpieces = img_pattern.split(flow) + for j in range(1, len(srcpieces), 2): + tag = srcpieces[j] + if tag.startswith(']*>)''') + flow_pattern = re.compile(r'''['"]kindle:flow:([0-9|A-V]+)\?mime=([^'"]+)['"]''', re.IGNORECASE) + for i in xrange(len(parts)): + part = parts[i] + + # flow pattern + srcpieces = tag_pattern.split(part) + for j in range(1, len(srcpieces),2): + tag = srcpieces[j] + if tag.startswith('<'): + for m in flow_pattern.finditer(tag): + num = int(m.group(1), 32) + fi = mr.flowinfo[num] + if fi.format == 'inline': + tag = flows[num] + else: + replacement = '"../' + fi.dir + '/' + fi.fname + '"' + tag = flow_pattern.sub(replacement, tag, 1) + srcpieces[j] = tag + part = "".join(srcpieces) + # store away modified version + parts[i] = part + +def insert_images_into_markup(parts, resource_map, log): + # Handle any embedded raster images links in the xhtml text + # kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images) + img_pattern = re.compile(r'''(<[img\s|image\s][^>]*>)''', re.IGNORECASE) + img_index_pattern = re.compile(r'''['"]kindle:embed:([0-9|A-V]+)[^'"]*['"]''') + for i in xrange(len(parts)): + part = parts[i] + #[partnum, 
dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i] + + # links to raster image files + # image_pattern + srcpieces = img_pattern.split(part) + for j in range(1, len(srcpieces), 2): + tag = srcpieces[j] + if tag.startswith(']*>)''', re.IGNORECASE) + + for i in xrange(len(parts)): + part = parts[i] + + # tag pattern + srcpieces = re.split(tag_pattern, part) + for j in range(1, len(srcpieces),2): + tag = srcpieces[j] + if tag[:4].lower() == 'LL', raw[4:12]) - raw = raw[12:] - pos = 0 - self.mi = MetaInformation(_('Unknown'), [_('Unknown')]) - self.has_fake_cover = True - left = self.num_items - - while left > 0: - left -= 1 - id, size = struct.unpack('>LL', raw[pos:pos + 8]) - content = raw[pos + 8:pos + size] - pos += size - if id >= 100 and id < 200: - self.process_metadata(id, content, codec) - elif id == 203: - self.has_fake_cover = bool(struct.unpack('>L', content)[0]) - elif id == 201: - co, = struct.unpack('>L', content) - if co < 1e7: - self.cover_offset = co - elif id == 202: - self.thumbnail_offset, = struct.unpack('>L', content) - elif id == 501: - # cdetype - pass - elif id == 502: - # last update time - pass - elif id == 503: # Long title - # Amazon seems to regard this as the definitive book title - # rather than the title from the PDB header. 
class BookHeader(object):
    """Parsed representation of the MOBI header stored in PDB record 0.

    ``raw`` is the contents of the first PDB section, ``ident`` the PDB
    type identifier (``'BOOKMOBI'`` or ``'TEXTREAD'``).  All multi-byte
    fields are big-endian; offsets follow the MOBI header layout.
    """

    def __init__(self, raw, ident, user_encoding, log, try_extra_data_fix=False):
        self.log = log
        # PalmDOC fields: compression type, record count/size, encryption
        self.compression_type = raw[:2]
        self.records, self.records_size = struct.unpack('>HH', raw[8:12])
        self.encryption_type, = struct.unpack('>H', raw[12:14])
        if ident == 'TEXTREAD':
            # Plain PalmDOC text is always Windows-1252
            self.codepage = 1252
        if len(raw) <= 16:
            # Ancient header with no MOBI extension block: use defaults
            self.codec = 'cp1252'
            self.extra_flags = 0
            self.title = _('Unknown')
            self.language = 'ENGLISH'
            self.sublanguage = 'NEUTRAL'
            self.exth_flag, self.exth = 0, None
            self.ancient = True
            self.first_image_index = -1
            self.mobi_version = 1
        else:
            self.ancient = False
            self.doctype = raw[16:20]
            self.length, self.type, self.codepage, self.unique_id, \
                self.version = struct.unpack('>LLLLL', raw[20:40])

            try:
                self.codec = {
                    1252: 'cp1252',
                    65001: 'utf-8',
                    }[self.codepage]
            except (IndexError, KeyError):
                # Unknown codepage: honour a user supplied encoding if any
                self.codec = 'cp1252' if not user_encoding else user_encoding
                log.warn('Unknown codepage %d. Assuming %s' % (self.codepage,
                    self.codec))
            # There exists some broken DRM removal tool that removes DRM but
            # leaves the DRM fields in the header yielding a header size of
            # 0xF8. The actual value of max_header_length should be 0xE8 but
            # it's changed to accommodate this silly tool. Hopefully that will
            # not break anything else.
            max_header_length = 0xF8

            if (ident == 'TEXTREAD' or self.length < 0xE4 or
                    self.length > max_header_length or
                    (try_extra_data_fix and self.length == 0xE4)):
                # Header too short/long to contain trustworthy extra flags
                self.extra_flags = 0
            else:
                self.extra_flags, = struct.unpack('>H', raw[0xF2:0xF4])

            if self.compression_type == 'DH':
                # Huffdic compression: record index and count of HUFF records
                self.huff_offset, self.huff_number = struct.unpack('>LL',
                        raw[0x70:0x78])

            # Full title is stored at an offset/length pair at 0x54
            toff, tlen = struct.unpack('>II', raw[0x54:0x5c])
            tend = toff + tlen
            self.title = raw[toff:tend] if tend < len(raw) else _('Unknown')
            # Language code packs main language in the low byte and the
            # sub-language (dialect) starting at bit 10
            langcode = struct.unpack('!L', raw[0x5C:0x60])[0]
            langid = langcode & 0xFF
            sublangid = (langcode >> 10) & 0xFF
            self.language = main_language.get(langid, 'ENGLISH')
            self.sublanguage = sub_language.get(sublangid, 'NEUTRAL')
            self.mobi_version = struct.unpack('>I', raw[0x68:0x6c])[0]
            self.first_image_index = struct.unpack('>L', raw[0x6c:0x6c + 4])[0]

            self.exth_flag, = struct.unpack('>L', raw[0x80:0x84])
            self.exth = None
            if not isinstance(self.title, unicode):
                self.title = self.title.decode(self.codec, 'replace')
            if self.exth_flag & 0x40:
                # Bit 6 of the EXTH flag signals an EXTH metadata block
                # immediately after the MOBI header
                try:
                    self.exth = EXTHHeader(raw[16 + self.length:], self.codec, self.title)
                    self.exth.mi.uid = self.unique_id
                    try:
                        self.exth.mi.language = mobi2iana(langid, sublangid)
                    except:
                        self.log.exception('Unknown language code')
                except:
                    # A broken EXTH block is not fatal; just drop metadata
                    self.log.exception('Invalid EXTH header')
                    self.exth_flag = 0


class MetadataHeader(BookHeader):
    """BookHeader subclass that reads just enough of a MOBI file from a
    seekable stream to extract metadata, without parsing the whole book."""

    def __init__(self, stream, log):
        self.stream = stream
        self.ident = self.identity()
        self.num_sections = self.section_count()
        if self.num_sections >= 2:
            header = self.header()
            BookHeader.__init__(self, header, self.ident, None, log)
        else:
            self.exth = None

    def identity(self):
        # PDB type/creator string lives at offset 60
        self.stream.seek(60)
        ident = self.stream.read(8).upper()
        if ident not in ['BOOKMOBI', 'TEXTREAD']:
            raise MobiError('Unknown book type: %s' % ident)
        return ident

    def section_count(self):
        # Number of PDB sections at offset 76
        self.stream.seek(76)
        return struct.unpack('>H', self.stream.read(2))[0]

    def section_offset(self, number):
        # Each PDB section table entry is 8 bytes, starting at offset 78;
        # only the 4-byte offset is of interest
        self.stream.seek(78 + number * 8)
        return struct.unpack('>LBBBB', self.stream.read(8))[0]

    def header(self):
        section_headers = []
        # First section with the metadata
        section_headers.append(self.section_offset(0))
        # Second section used to get the length of the first
        section_headers.append(self.section_offset(1))

        end_off = section_headers[1]
        off = section_headers[0]
        self.stream.seek(off)
        return self.stream.read(end_off - off)

    def section_data(self, number):
        """Return the raw bytes of PDB section ``number``."""
        start = self.section_offset(number)
        if number == self.num_sections -1:
            # Last section runs to the end of the file
            end = os.stat(self.stream.name).st_size
        else:
            end = self.section_offset(number + 1)
        self.stream.seek(start)
        try:
            return self.stream.read(end - start)
        except OverflowError:
            # Corrupt section table: clamp the read to the file size
            return self.stream.read(os.stat(self.stream.name).st_size - start)
self.name.decode(self.book_header.codec, 'replace') + self.kf8_type = None + is_kf8 = self.book_header.mobi_version == 8 + if is_kf8: + self.kf8_type = 'standalone' + else: # Check for joint mobi 6 and kf 8 file + KF8_BOUNDARY = b'BOUNDARY' + for i, x in enumerate(self.sections[:-1]): + sec = x[0] + if (len(sec) == len(KF8_BOUNDARY) and sec == + KF8_BOUNDARY): + try: + self.book_header = BookHeader(self.sections[i+1][0], + self.ident, user_encoding, self.log) + # The following are only correct in the Mobi 6 + # header not the Mobi 8 header + for x in ('first_image_index',): + setattr(self.book_header, x, getattr(bh, x)) + self.book_header.huff_offset += i + 1 + self.kf8_type = 'joint' + self.kf8_boundary = i + except: + pass + break + + def check_for_drm(self): + if self.book_header.encryption_type != 0: + try: + name = self.book_header.exth.mi.title + except: + name = self.name + if not name: + name = self.name + raise DRMError(name) def extract_content(self, output_dir, parse_cache): output_dir = os.path.abspath(output_dir) - if self.book_header.encryption_type != 0: - raise DRMError(self.name) - + self.check_for_drm() processed_records = self.extract_text() if self.debug is not None: parse_cache['calibre_raw_mobi_markup'] = self.mobi_html @@ -916,11 +736,12 @@ class MobiReader(object): trail_size = self.sizeof_trailing_entries(data) return data[:len(data)-trail_size] - def extract_text(self): + def extract_text(self, offset=1): self.log.debug('Extracting text...') - text_sections = [self.text_section(i) for i in range(1, - min(self.book_header.records + 1, len(self.sections)))] - processed_records = list(range(0, self.book_header.records + 1)) + text_sections = [self.text_section(i) for i in xrange(offset, + min(self.book_header.records + offset, len(self.sections)))] + processed_records = list(range(offset-1, self.book_header.records + + offset)) self.mobi_html = '' @@ -1027,63 +848,6 @@ class MobiReader(object): self.image_names.append(os.path.basename(path)) 
def get_metadata(stream):
    """Read metadata (and cover) from a MOBI/TEXTREAD stream.

    Returns a MetaInformation object. For Topaz files, delegates to the
    topaz metadata reader. For small files, the whole book is parsed in a
    temporary directory to pick up metadata embedded in the book text.
    """
    stream.seek(0)
    try:
        raw = stream.read(3)
    except:
        raw = ''
    stream.seek(0)
    if raw == 'TPZ':
        # Topaz file masquerading with a .mobi extension
        from calibre.ebooks.metadata.topaz import get_metadata
        return get_metadata(stream)
    from calibre.utils.logging import Log
    log = Log()
    try:
        # Default the title to the file name; stream may not have a name
        mi = MetaInformation(os.path.basename(stream.name), [_('Unknown')])
    except:
        mi = MetaInformation(_('Unknown'), [_('Unknown')])
    mh = MetadataHeader(stream, log)
    if mh.title and mh.title != _('Unknown'):
        mi.title = mh.title

    if mh.exth is not None:
        # Prefer the full EXTH metadata when available
        if mh.exth.mi is not None:
            mi = mh.exth.mi
    else:
        # No EXTH block: fall back to parsing the book itself, but only for
        # reasonably small files since a full parse is expensive
        size = sys.maxint
        if hasattr(stream, 'seek') and hasattr(stream, 'tell'):
            pos = stream.tell()
            stream.seek(0, 2)
            size = stream.tell()
            stream.seek(pos)
        if size < 4*1024*1024:
            with TemporaryDirectory('_mobi_meta_reader') as tdir:
                with CurrentDir(tdir):
                    mr = MobiReader(stream, log)
                    parse_cache = {}
                    mr.extract_content(tdir, parse_cache)
                    if mr.embedded_mi is not None:
                        mi = mr.embedded_mi
    # Cover: EXTH cover_offset is relative to the first image record
    if hasattr(mh.exth, 'cover_offset'):
        cover_index = mh.first_image_index + mh.exth.cover_offset
        data = mh.section_data(int(cover_index))
    else:
        try:
            data = mh.section_data(mh.first_image_index)
        except:
            data = ''
    buf = cStringIO.StringIO(data)
    try:
        im = PILImage.open(buf)
    except:
        log.exception('Failed to read MOBI cover')
    else:
        # Normalize the cover to JPEG regardless of the stored format
        obuf = cStringIO.StringIO()
        im.convert('RGB').save(obuf, format='JPEG')
        mi.cover_data = ('jpg', obuf.getvalue())
    return mi
    def __call__(self):
        """Extract the KF8 book into the current directory and return the
        path of the generated OPF file."""
        self.mobi6_reader.check_for_drm()
        # In a joint MOBI6/KF8 file the KF8 records start after the
        # BOUNDARY section; in a standalone KF8 file text starts at record 1
        offset = 1
        res_end = len(self.mobi6_reader.sections)
        if self.mobi6_reader.kf8_type == 'joint':
            offset = self.mobi6_reader.kf8_boundary + 2
            res_end = self.mobi6_reader.kf8_boundary

        self.processed_records = self.mobi6_reader.extract_text(offset=offset)
        self.raw_ml = self.mobi6_reader.mobi_html
        # NOTE(review): debug dump written unconditionally into the cwd
        with open('debug-raw.html', 'wb') as f:
            f.write(self.raw_ml)

        self.kf8_sections = self.mobi6_reader.sections[offset-1:]
        first_resource_index = self.header.first_image_index
        if first_resource_index in {-1, NULL_INDEX}:
            # No explicit first image record: resources follow the text
            first_resource_index = self.header.records + 1
        self.resource_sections = \
                self.mobi6_reader.sections[first_resource_index:res_end]
        self.cover_offset = getattr(self.header.exth, 'cover_offset', None)

        # Pipeline: indices -> xhtml parts -> guide/NCX -> resources ->
        # expanded markup -> OPF. Order matters: later steps consume the
        # state built by earlier ones.
        self.read_indices()
        self.build_parts()
        guide = self.create_guide()
        ncx = self.create_ncx()
        resource_map = self.extract_resources()
        spine = self.expand_text(resource_map)
        return self.write_opf(guide, ncx, spine, resource_map)
    def build_parts(self):
        """Split the raw KF8 markup into flows and reassemble the original
        xhtml part files from the skeleton and div (fragment) tables."""
        raw_ml = self.mobi6_reader.mobi_html
        self.flows = []
        self.flowinfo = []

        # now split the raw_ml into its flow pieces using the FDST table
        for j in xrange(0, len(self.flow_table)-1):
            start = self.flow_table[j]
            end = self.flow_table[j+1]
            if end == NULL_INDEX:
                end = len(raw_ml)
            self.flows.append(raw_ml[start:end])

        # the first piece represents the xhtml text
        text = self.flows[0]
        self.flows[0] = b''

        # walk the skeleton and div tables to build original source xhtml
        # files *without* destroying any file position information needed for
        # later href processing and create final list of file separation
        # start:stop points and etc in partinfo
        self.parts = []
        self.partinfo = []
        divptr = 0
        baseptr = 0
        for skelnum, skelname, divcnt, skelpos, skellen in self.files:
            baseptr = skelpos + skellen
            skeleton = text[skelpos:baseptr]
            for i in xrange(divcnt):
                insertpos, idtext, filenum, seqnum, startpos, length = \
                                    self.elems[divptr]
                if i == 0:
                    # first fragment carries the aid and the part filename
                    aidtext = idtext[12:-2]
                    filename = 'part%04d.html' % filenum
                part = text[baseptr:baseptr + length]
                # insert position is recorded relative to the raw text;
                # rebase it onto this skeleton
                insertpos = insertpos - skelpos
                skeleton = skeleton[0:insertpos] + part + skeleton[insertpos:]
                baseptr = baseptr + length
                divptr += 1
            self.parts.append(skeleton)
            self.partinfo.append(Part(skelnum, 'text', filename, skelpos,
                baseptr, aidtext))

        # The primary css style sheet is typically stored next, followed by
        # any snippets that were previously inlined in the original xhtml
        # but have been stripped out and placed here. This can include local
        # CDATA snippets and svg sections.

        # Most browsers and ereaders can not import an svg image that itself
        # uses an image tag to pull in a raster image - it should work
        # according to the spec but does not in practice, and it causes epub
        # validation issues because those raster images are in the manifest
        # but not in the xhtml text, since they are only referenced from an
        # svg image.

        # So we check the remaining flow pieces to see if they are css or
        # svg images; if svg images contain an image tag we inline them into
        # the xhtml text pieces.

        # there may be other sorts of pieces stored here but until we see one
        # in the wild to reverse engineer we won't be able to tell

        self.flowinfo.append(FlowInfo(None, None, None, None))
        # NOTE(review): these literals were reconstructed from a mangled
        # source - confirm against the original before relying on them
        svg_tag_pattern = re.compile(br'''(<svg[^>]*>)''', re.IGNORECASE)
        image_tag_pattern = re.compile(br'''(<image[^>]*>)''', re.IGNORECASE)
        for j in xrange(1, len(self.flows)):
            flowpart = self.flows[j]
            nstr = '%04d' % j
            m = svg_tag_pattern.search(flowpart)
            if m != None:
                # svg
                typ = 'svg'
                start = m.start()
                m2 = image_tag_pattern.search(flowpart)
                if m2 != None:
                    format = 'inline'
                    dir = None
                    fname = None
                    # strip off anything before the svg tag when inlining
                    flowpart = flowpart[start:]
                else:
                    format = 'file'
                    dir = "images"
                    fname = 'svgimg' + nstr + '.svg'
            else:
                # search for CDATA and if it exists inline the css
                if flowpart.find('[CDATA[') >= 0:
                    typ = 'css'
                    flowpart = '<style type="text/css">\n' + flowpart + '\n</style>\n'
                    format = 'inline'
                    dir = None
                    fname = None
                else:
                    # css - assume as standalone css file
                    typ = 'css'
                    format = 'file'
                    dir = "styles"
                    fname = nstr + '.css'

            self.flows[j] = flowpart
            self.flowinfo.append(FlowInfo(typ, format, dir, fname))

    def get_file_info(self, pos):
        ''' Get information about the part (file) that exists at pos in
        the raw markup '''
        for part in self.partinfo:
            if pos >= part.start and pos < part.end:
                return part
        # No part matched: return a Part of all Nones
        return Part(*repeat(None, len(Part._fields)))
    def get_id_tag(self, pos):
        """Return the id attribute (as bytes) of the nearest tag at or
        before position ``pos`` in the raw markup, or ``b''`` if the part
        containing pos has no id attributes at all."""
        # find the correct tag by actually searching in the destination
        # textblock at position
        fi = self.get_file_info(pos)
        if fi.num is None and fi.start is None:
            raise ValueError('No file contains pos: %d'%pos)
        textblock = self.parts[fi.num]
        id_map = []
        npos = pos - fi.start
        # if npos inside a tag then search all text before the its end of tag
        # marker
        pgt = textblock.find(b'>', npos)
        plt = textblock.find(b'<', npos)
        if pgt < plt:
            npos = pgt + 1
        # find id links only inside of tags
        # inside any < > pair find all "id=' and return whatever is inside
        # the quotes
        id_pattern = re.compile(br'''<[^>]*\sid\s*=\s*['"]([^'"]*)['"][^>]*>''',
            re.IGNORECASE)
        for m in re.finditer(id_pattern, textblock):
            id_map.append((m.start(), m.group(1)))

        if not id_map:
            # Found no id in the textblock, link must be to top of file
            return b''
        # if npos is before first id= inside a tag, return the first
        if npos < id_map[0][0]:
            return id_map[0][1]
        # if npos is after the last id= inside a tag, return the last
        if npos > id_map[-1][0]:
            return id_map[-1][1]
        # otherwise find last id before npos
        for i, item in enumerate(id_map):
            if npos < item[0]:
                return id_map[i-1][1]
        return id_map[0][1]
    def create_ncx(self):
        """Read the NCX index records and return a TOC object with href and
        anchor information resolved against the extracted part files."""
        index_entries = read_ncx(self.kf8_sections, self.header.ncxidx,
                self.header.codec)

        # Add href and anchor info to the index entries
        for entry in index_entries:
            pos = entry['pos']
            fi = self.get_file_info(pos)
            if fi.filename is None:
                raise ValueError('Index entry has invalid pos: %d'%pos)
            idtag = self.get_id_tag(pos).decode(self.header.codec)
            entry['href'] = '%s/%s'%(fi.type, fi.filename)
            entry['idtag'] = idtag

        # Build the TOC object
        return build_toc(index_entries)
    def write_opf(self, guide, toc, spine, resource_map):
        """Write metadata.opf and toc.ncx into the current directory and
        return the OPF path."""
        mi = self.header.exth.mi
        if (self.cover_offset is not None and self.cover_offset <
                len(resource_map)):
            # cover_offset indexes into the extracted resources
            mi.cover = resource_map[self.cover_offset]

        opf = OPFCreator(os.getcwdu(), mi)
        opf.guide = guide
        opf.create_manifest_from_files_in([os.getcwdu()])
        opf.create_spine(spine)
        opf.set_toc(toc)

        with open('metadata.opf', 'wb') as of, open('toc.ncx', 'wb') as ncx:
            opf.render(of, ncx, 'toc.ncx')
        return 'metadata.opf'
# Maps NCX index tag numbers to (entry field name, value index in tag_map)
tag_fieldname_map = {
        1: ['pos',0],
        2: ['len',0],
        3: ['noffs',0],
        4: ['hlvl',0],
        5: ['koffs',0],
        6: ['pos_fid',0],
        21: ['parent',0],
        22: ['child1',0],
        23: ['childn',0]
}

def read_ncx(sections, index, codec):
    """Parse the NCX index records and return a list of entry dicts.

    Each entry carries position, length, text, hierarchy level, kind and
    parent/child linkage; missing fields keep their defaults below.
    """
    index_entries = []

    if index != NULL_INDEX:
        table, cncx = read_index(sections, index, codec)

        for num, x in enumerate(table.iteritems()):
            text, tag_map = x
            # Defaults for fields that may be absent from the tag map
            entry = {
                    'name': text,
                    'pos': -1,
                    'len': 0,
                    'noffs': -1,
                    'text' : "Unknown Text",
                    'hlvl' : -1,
                    'kind' : "Unknown Kind",
                    'pos_fid' : None,
                    'parent' : -1,
                    'child1' : -1,
                    'childn' : -1,
                    'num' : num
            }

            for tag in tag_fieldname_map.keys():
                fieldname, i = tag_fieldname_map[tag]
                if tag in tag_map:
                    fieldvalue = tag_map[tag][i]
                    if tag == 6:
                        # pos_fid values are base-32 encoded strings
                        fieldvalue = to_base(fieldvalue, base=32)
                    entry[fieldname] = fieldvalue
                    if tag == 3:
                        # name offset points into the CNCX string table
                        entry['text'] = cncx.get(fieldvalue, 'Unknown Text')
                    if tag == 5:
                        entry['kind'] = cncx.get(fieldvalue, 'Unknown Kind')
            index_entries.append(entry)

    return index_entries

def build_toc(index_entries):
    """Build a TOC tree from the flat NCX entries using their parent
    numbers, then assign play orders depth first."""
    ans = TOC(base_path=os.getcwdu())
    levels = {x['hlvl'] for x in index_entries}
    # num -> TOC node; -1 maps to the root so top level entries attach there
    num_map = {-1: ans}
    level_map = {l:[x for x in index_entries if x['hlvl'] == l] for l in
            levels}
    # Process shallower levels first so parents exist before their children
    for lvl in sorted(levels):
        for item in level_map[lvl]:
            parent = num_map[item['parent']]
            child = parent.add_item(item['href'], item['idtag'], item['text'])
            num_map[item['num']] = child

    # Set play orders in depth first order
    for i, item in enumerate(ans.flat()):
        item.play_order = i

    return ans
def count_set_bits(num):
    """Return the number of 1 bits in the binary representation of *num*.

    Negative numbers are counted by absolute value, preserving the
    original sign-agnostic behaviour.
    """
    # bin() + str.count runs in C and replaces the manual
    # shift-and-mask loop of the original implementation.
    return bin(abs(num)).count('1')

def to_base(num, base=32):
    """Convert the integer *num* to a string in the given *base* (2-36),
    using the digits 0-9A-Z. A negative num yields a leading '-'.

    >>> to_base(255, base=16)
    'FF'
    """
    digits = string.digits + string.ascii_uppercase
    sign = 1 if num >= 0 else -1
    if num == 0:
        return '0'
    num *= sign
    ans = []
    # Collect digits least-significant first, then reverse
    while num:
        ans.append(digits[(num % base)])
        num //= base
    if sign < 0:
        ans.append('-')
    ans.reverse()
    return ''.join(ans)