Support for reading KF8

This commit is contained in:
Kovid Goyal 2012-03-09 21:30:24 +05:30
parent 93bf57e6c4
commit d1b6bb705d
13 changed files with 1426 additions and 293 deletions

View File

@ -263,7 +263,7 @@ class MOBIMetadataReader(MetadataReaderPlugin):
description = _('Read metadata from %s files')%'MOBI' description = _('Read metadata from %s files')%'MOBI'
def get_metadata(self, stream, ftype): def get_metadata(self, stream, ftype):
from calibre.ebooks.mobi.reader import get_metadata from calibre.ebooks.metadata.mobi import get_metadata
return get_metadata(stream) return get_metadata(stream)
class ODTMetadataReader(MetadataReaderPlugin): class ODTMetadataReader(MetadataReaderPlugin):

View File

@ -10,7 +10,7 @@ Generates and writes an APNX page mapping file.
import struct import struct
from calibre.ebooks.mobi.reader import MobiReader from calibre.ebooks.mobi.reader.mobi6 import MobiReader
from calibre.ebooks.pdb.header import PdbHeaderReader from calibre.ebooks.pdb.header import PdbHeaderReader
from calibre.utils.logging import default_log from calibre.utils.logging import default_log

View File

@ -3,7 +3,10 @@ __license__ = 'GPL 3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>' __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import os
from calibre.customize.conversion import InputFormatPlugin from calibre.customize.conversion import InputFormatPlugin
from calibre.ptempfile import PersistentTemporaryDirectory
class MOBIInput(InputFormatPlugin): class MOBIInput(InputFormatPlugin):
@ -14,17 +17,43 @@ class MOBIInput(InputFormatPlugin):
def convert(self, stream, options, file_ext, log, def convert(self, stream, options, file_ext, log,
accelerators): accelerators):
from calibre.ebooks.mobi.reader import MobiReader
if os.environ.get('USE_MOBIUNPACK', None) is not None:
try:
from mobiunpack.mobi_unpack import Mobi8Reader
from calibre.customize.ui import plugin_for_input_format
wdir = PersistentTemporaryDirectory('_unpack_space')
m8r = Mobi8Reader(stream, wdir)
if m8r.isK8():
epub_path = m8r.processMobi8()
epub_input = plugin_for_input_format('epub')
for opt in epub_input.options:
setattr(options, opt.option.name, opt.recommended_value)
options.input_encoding = m8r.getCodec()
return epub_input.convert(open(epub_path,'rb'), options,
'epub', log, accelerators)
except Exception:
log.exception('mobi_unpack code not working')
from calibre.ebooks.mobi.reader.mobi6 import MobiReader
from lxml import html from lxml import html
parse_cache = {} parse_cache = {}
try: try:
mr = MobiReader(stream, log, options.input_encoding, mr = MobiReader(stream, log, options.input_encoding,
options.debug_pipeline) options.debug_pipeline)
mr.extract_content(u'.', parse_cache) if mr.kf8_type is None:
mr.extract_content(u'.', parse_cache)
except: except:
mr = MobiReader(stream, log, options.input_encoding, mr = MobiReader(stream, log, options.input_encoding,
options.debug_pipeline, try_extra_data_fix=True) options.debug_pipeline, try_extra_data_fix=True)
mr.extract_content(u'.', parse_cache) if mr.kf8_type is None:
mr.extract_content(u'.', parse_cache)
if mr.kf8_type is not None:
from calibre.ebooks.mobi.reader.mobi8 import Mobi8Reader
return os.path.abspath(Mobi8Reader(mr, log)())
raw = parse_cache.pop('calibre_raw_mobi_markup', False) raw = parse_cache.pop('calibre_raw_mobi_markup', False)
if raw: if raw:

View File

@ -9,6 +9,7 @@ __copyright__ = '2009, Kovid Goyal kovid@kovidgoyal.net and ' \
'Marshall T. Vandegrift <llasram@gmail.com>' 'Marshall T. Vandegrift <llasram@gmail.com>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import os, cStringIO
from struct import pack, unpack from struct import pack, unpack
from cStringIO import StringIO from cStringIO import StringIO
@ -433,3 +434,75 @@ def set_metadata(stream, mi):
mu = MetadataUpdater(stream) mu = MetadataUpdater(stream)
mu.update(mi) mu.update(mi)
return return
def get_metadata(stream):
    '''
    Read the metadata, and the cover when one can be decoded, from a MOBI
    file.

    Files that start with ``TPZ`` are passed to the Topaz metadata reader
    instead.

    :param stream: Seekable file-like object. It is rewound before reading.
    :return: The metadata object from the EXTH header when the file has one.
        Otherwise, for files smaller than 4MB, the ``embedded_mi`` found by
        fully extracting the book (when not None). Otherwise a stub whose
        title comes from the stream name or the header title.
    '''
    from calibre.ebooks.metadata import MetaInformation
    from calibre.ptempfile import TemporaryDirectory
    from calibre.ebooks.mobi.reader.headers import MetadataHeader
    from calibre.ebooks.mobi.reader.mobi6 import MobiReader
    from calibre import CurrentDir

    try:
        from PIL import Image as PILImage
        PILImage
    except ImportError:
        import Image as PILImage

    stream.seek(0)
    try:
        raw = stream.read(3)
    except Exception:
        raw = b''
    stream.seek(0)
    if raw == b'TPZ':
        # Imported under another name so it does not shadow this function
        from calibre.ebooks.metadata.topaz import (
                get_metadata as get_topaz_metadata)
        return get_topaz_metadata(stream)

    from calibre.utils.logging import Log
    log = Log()
    try:
        mi = MetaInformation(os.path.basename(stream.name), [_('Unknown')])
    except Exception:
        # The stream has no usable name
        mi = MetaInformation(_('Unknown'), [_('Unknown')])

    mh = MetadataHeader(stream, log)
    # MetadataHeader does not run BookHeader.__init__ for files with fewer
    # than two sections, so the title attribute may not exist at all.
    title = getattr(mh, 'title', None)
    if title and title != _('Unknown'):
        mi.title = title

    if mh.exth is not None:
        if mh.exth.mi is not None:
            mi = mh.exth.mi
    else:
        # No EXTH header: fall back to a full extraction, but only for small
        # files
        size = 1024**3
        if hasattr(stream, 'seek') and hasattr(stream, 'tell'):
            pos = stream.tell()
            stream.seek(0, 2)
            size = stream.tell()
            stream.seek(pos)
        if size < 4*1024*1024:
            with TemporaryDirectory('_mobi_meta_reader') as tdir:
                with CurrentDir(tdir):
                    mr = MobiReader(stream, log)
                    parse_cache = {}
                    mr.extract_content(tdir, parse_cache)
                    if mr.embedded_mi is not None:
                        mi = mr.embedded_mi

    # A bogus cover offset must not prevent the rest of the metadata from
    # being returned, so both lookups are best-effort.
    try:
        if hasattr(mh.exth, 'cover_offset'):
            cover_index = mh.first_image_index + mh.exth.cover_offset
            data = mh.section_data(int(cover_index))
        else:
            data = mh.section_data(mh.first_image_index)
    except Exception:
        data = b''

    buf = cStringIO.StringIO(data)
    try:
        im = PILImage.open(buf)
        obuf = cStringIO.StringIO()
        im.convert('RGB').save(obuf, format='JPEG')
    except Exception:
        log.exception('Failed to read MOBI cover')
    else:
        mi.cover_data = ('jpg', obuf.getvalue())
    return mi

View File

@ -46,7 +46,7 @@ class TOC(list):
self.toc_thumbnail = toc_thumbnail self.toc_thumbnail = toc_thumbnail
def __str__(self): def __str__(self):
lines = ['TOC: %s#%s'%(self.href, self.fragment)] lines = ['TOC: %s#%s %s'%(self.href, self.fragment, self.text)]
for child in self: for child in self:
c = str(child).splitlines() c = str(child).splitlines()
for l in c: for l in c:

View File

@ -0,0 +1,11 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

View File

@ -0,0 +1,258 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (absolute_import, print_function)
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import struct, re, os
from calibre import replace_entities
from calibre.utils.date import parse_date
from calibre.ebooks.mobi import MobiError
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.mobi.langcodes import main_language, sub_language, mobi2iana
NULL_INDEX = 0xffffffff
class EXTHHeader(object): # {{{

    '''
    Parses the EXTH records of a MOBI header and collects the metadata they
    carry in ``self.mi``.
    '''

    def __init__(self, raw, codec, title):
        '''
        :param raw: Bytes starting at the EXTH block: a 4 byte identifier,
            two big-endian 32 bit numbers (block length, record count) and
            then the records themselves.
        :param codec: Codec used to decode textual records.
        :param title: Title to use unless a record of type 503 replaces it.
        '''
        self.doctype = raw[:4]
        self.length, self.num_items = struct.unpack('>LL', raw[4:12])
        raw = raw[12:]
        pos = 0
        self.mi = MetaInformation(_('Unknown'), [_('Unknown')])
        self.has_fake_cover = True
        self.start_offset = None
        left = self.num_items

        while left > 0:
            left -= 1
            # Each record is: type (4 bytes), size (4 bytes), payload. The
            # size counts the 8 byte prefix as well.
            rec_id, size = struct.unpack('>LL', raw[pos:pos + 8])
            content = raw[pos + 8:pos + size]
            pos += size
            if 100 <= rec_id < 200:
                self.process_metadata(rec_id, content, codec)
            elif rec_id == 203:
                self.has_fake_cover = bool(struct.unpack('>L', content)[0])
            elif rec_id == 201:
                co, = struct.unpack('>L', content)
                # 0xffffffff means "no cover"
                if co < NULL_INDEX:
                    self.cover_offset = co
            elif rec_id == 202:
                self.thumbnail_offset, = struct.unpack('>L', content)
            elif rec_id == 501:
                # cdetype
                pass
            elif rec_id == 502:
                # last update time
                pass
            elif rec_id == 503: # Long title
                # Amazon seems to regard this as the definitive book title
                # rather than the title from the PDB header. In fact when
                # sending MOBI files through Amazon's email service if the
                # title contains non ASCII chars or non filename safe chars
                # they are messed up in the PDB header
                try:
                    title = content.decode(codec)
                except Exception:
                    # Keep the title we already have
                    pass
            #else:
            #    print 'unknown record', rec_id, repr(content)
        if title:
            self.mi.title = replace_entities(title)

    def process_metadata(self, id, content, codec):
        '''
        Store the value of one metadata record (types 100 to 199) in
        ``self.mi``. Unknown types are ignored.

        :param id: The EXTH record type.
        :param content: The raw payload of the record.
        :param codec: Codec used to decode textual payloads; undecodable
            bytes are dropped.
        '''
        if id == 100:
            if self.mi.authors == [_('Unknown')]:
                self.mi.authors = []
            au = content.decode(codec, 'ignore').strip()
            self.mi.authors.append(au)
            # An author of the form "Last, First" doubles as the sort string
            if re.match(r'\S+?\s*,\s+\S+', au):
                self.mi.author_sort = au
        elif id == 101:
            self.mi.publisher = content.decode(codec, 'ignore').strip()
        elif id == 103:
            self.mi.comments = content.decode(codec, 'ignore')
        elif id == 104:
            self.mi.isbn = content.decode(codec, 'ignore').strip().replace('-', '')
        elif id == 105:
            new_tags = [x.strip() for x in content.decode(codec,
                'ignore').split(';')]
            # Drop blank entries (left by a trailing or doubled ';') and
            # duplicates, keeping the order in which tags first appear.
            seen, tags = set(), []
            for tag in list(self.mi.tags or ()) + new_tags:
                if tag and tag not in seen:
                    seen.add(tag)
                    tags.append(tag)
            self.mi.tags = tags
        elif id == 106:
            try:
                self.mi.pubdate = parse_date(content, as_utc=False)
            except Exception:
                # An unparseable date is simply not recorded
                pass
        elif id == 108:
            pass # Producer
        elif id == 113:
            pass # ASIN or UUID
        elif id == 116:
            self.start_offset, = struct.unpack(b'>L', content)
        #else:
        #    print 'unhandled metadata record', id, repr(content)
# }}}
# Parses the header record of a MOBI/PalmDoc book: compression and encryption
# type, text codec, title, language, MOBI version, the optional EXTH metadata
# block and, for version 8 and later, the numbers of the KF8 index records.
# NOTE(review): this listing has lost its indentation, so the nesting of the
# statements below must be confirmed against the real file before editing.
class BookHeader(object):
# raw: bytes of the header record. ident: the identifier string, compared
# against 'TEXTREAD' below. user_encoding: codec to assume when the codepage
# is not recognised (cp1252 if empty). log: object providing warn() and
# exception(). try_extra_data_fix: when true, a header length of exactly 0xE4
# is treated as having no extra data flags.
def __init__(self, raw, ident, user_encoding, log, try_extra_data_fix=False):
self.log = log
self.compression_type = raw[:2]
self.records, self.records_size = struct.unpack('>HH', raw[8:12])
self.encryption_type, = struct.unpack('>H', raw[12:14])
if ident == 'TEXTREAD':
self.codepage = 1252
# A record of at most 16 bytes has no MOBI header at all, so use fixed
# defaults
if len(raw) <= 16:
self.codec = 'cp1252'
self.extra_flags = 0
self.title = _('Unknown')
self.language = 'ENGLISH'
self.sublanguage = 'NEUTRAL'
self.exth_flag, self.exth = 0, None
self.ancient = True
self.first_image_index = -1
self.mobi_version = 1
else:
self.ancient = False
self.doctype = raw[16:20]
self.length, self.type, self.codepage, self.unique_id, \
self.version = struct.unpack('>LLLLL', raw[20:40])
# Only the two codepages below are recognised. A dict lookup raises
# KeyError; the IndexError in the except clause is never raised by it.
try:
self.codec = {
1252: 'cp1252',
65001: 'utf-8',
}[self.codepage]
except (IndexError, KeyError):
self.codec = 'cp1252' if not user_encoding else user_encoding
log.warn('Unknown codepage %d. Assuming %s' % (self.codepage,
self.codec))
# There exists some broken DRM removal tool that removes DRM but
# leaves the DRM fields in the header yielding a header size of
# 0xF8. The actual value of max_header_length should be 0xE8 but
# it's changed to accommodate this silly tool. Hopefully that will
# not break anything else.
max_header_length = 0xF8
if (ident == 'TEXTREAD' or self.length < 0xE4 or
self.length > max_header_length or
(try_extra_data_fix and self.length == 0xE4)):
self.extra_flags = 0
else:
self.extra_flags, = struct.unpack('>H', raw[0xF2:0xF4])
# For 'DH' compression two further 32 bit values are read from 0x70
if self.compression_type == 'DH':
self.huff_offset, self.huff_number = struct.unpack('>LL',
raw[0x70:0x78])
# Offset and length of the title inside raw. A title that would reach the
# end of raw is replaced by 'Unknown'.
toff, tlen = struct.unpack('>II', raw[0x54:0x5c])
tend = toff + tlen
self.title = raw[toff:tend] if tend < len(raw) else _('Unknown')
# The main language id is the low byte of the language code, the
# sub-language id is the byte starting at bit 10
langcode = struct.unpack('!L', raw[0x5C:0x60])[0]
langid = langcode & 0xFF
sublangid = (langcode >> 10) & 0xFF
self.language = main_language.get(langid, 'ENGLISH')
self.sublanguage = sub_language.get(sublangid, 'NEUTRAL')
self.mobi_version = struct.unpack('>I', raw[0x68:0x6c])[0]
self.first_image_index = struct.unpack('>L', raw[0x6c:0x6c + 4])[0]
self.exth_flag, = struct.unpack('>L', raw[0x80:0x84])
self.exth = None
if not isinstance(self.title, unicode):
self.title = self.title.decode(self.codec, 'replace')
# The EXTH block is only parsed when bit 0x40 of exth_flag is set. It is
# read from offset 16 + self.length. Any failure is logged rather than
# raised.
if self.exth_flag & 0x40:
try:
self.exth = EXTHHeader(raw[16 + self.length:], self.codec,
self.title)
self.exth.mi.uid = self.unique_id
try:
self.exth.mi.language = mobi2iana(langid, sublangid)
except:
self.log.exception('Unknown language code')
except:
self.log.exception('Invalid EXTH header')
self.exth_flag = 0
# The NCX index number is only read when raw has at least 0xF8 bytes
self.ncxidx = NULL_INDEX
if len(raw) >= 0xF8:
self.ncxidx, = struct.unpack_from(b'>L', raw, 0xF4)
# NOTE(review): the reads below do not check len(raw) first; unpack_from
# raises struct.error if raw is shorter than the offsets used.
if self.mobi_version >= 8:
self.skelidx, = struct.unpack_from('>L', raw, 0xFC)
# Index into <div> sections in raw_ml
self.dividx, = struct.unpack_from('>L', raw, 0xF8)
# Index into Other files
self.othidx, = struct.unpack_from('>L', raw, 0x104)
# need to use the FDST record to find out how to properly
# unpack the raw_ml into pieces it is simply a table of start
# and end locations for each flow piece
self.fdstidx, = struct.unpack_from('>L', raw, 0xC0)
self.fdstcnt, = struct.unpack_from('>L', raw, 0xC4)
# if cnt is 1 or less, fdst section number can be garbage
if self.fdstcnt <= 1:
self.fdstidx = NULL_INDEX
else: # Null values
self.skelidx = self.dividx = self.othidx = self.fdstidx = \
NULL_INDEX
class MetadataHeader(BookHeader):

    '''
    A BookHeader that reads what it needs directly from a stream, together
    with helpers to fetch the raw data of individual sections.
    '''

    def __init__(self, stream, log):
        '''
        :param stream: Seekable file-like object containing the book.
        :param log: Log object, passed on to BookHeader.
        '''
        self.stream = stream
        self.ident = self.identity()
        self.num_sections = self.section_count()
        if self.num_sections >= 2:
            header = self.header()
            BookHeader.__init__(self, header, self.ident, None, log)
        else:
            # The header cannot be measured without a second section, so
            # BookHeader.__init__ is skipped. Provide the attributes that
            # callers read unconditionally.
            self.exth = None
            self.title = _('Unknown')

    def identity(self):
        '''
        Return the upper-cased 8 byte identifier found at offset 60.

        :raises MobiError: If it is neither BOOKMOBI nor TEXTREAD.
        '''
        self.stream.seek(60)
        ident = self.stream.read(8).upper()
        if ident not in ('BOOKMOBI', 'TEXTREAD'):
            raise MobiError('Unknown book type: %s' % ident)
        return ident

    def section_count(self):
        ''' Return the 16 bit section count stored at offset 76. '''
        self.stream.seek(76)
        return struct.unpack('>H', self.stream.read(2))[0]

    def section_offset(self, number):
        '''
        Return the file offset of section ``number``, i.e. the first 32 bit
        value of its 8 byte entry in the table that starts at offset 78.
        '''
        self.stream.seek(78 + number * 8)
        return struct.unpack('>LBBBB', self.stream.read(8))[0]

    def header(self):
        '''
        Return the raw bytes of the first section. Its length is derived
        from the offset of the second section.
        '''
        off = self.section_offset(0)
        end_off = self.section_offset(1)
        self.stream.seek(off)
        return self.stream.read(end_off - off)

    def section_data(self, number):
        ''' Return the raw bytes of section ``number``. '''
        start = self.section_offset(number)
        if number == self.num_sections - 1:
            # The last section runs to the end of the file. Measure the
            # stream itself, so that streams without a file name on disk
            # work as well.
            self.stream.seek(0, 2)
            end = self.stream.tell()
        else:
            end = self.section_offset(number + 1)
        self.stream.seek(start)
        try:
            return self.stream.read(end - start)
        except OverflowError:
            self.stream.seek(start)
            return self.stream.read()

View File

@ -0,0 +1,195 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import struct
from collections import OrderedDict
from calibre.ebooks.mobi.utils import decint, count_set_bits
class InvalidFile(ValueError):
    ''' Raised when data does not start with the signature it should have. '''
def check_signature(data, signature):
    '''
    Verify that ``data`` begins with ``signature``.

    :raises InvalidFile: If it does not.
    '''
    if not data.startswith(signature):
        raise InvalidFile('Not a valid %r section'%signature)
class NotAnINDXRecord(InvalidFile):
    ''' InvalidFile variant for INDX records. Nothing visible here raises it. '''
class NotATAGXSection(InvalidFile):
    ''' InvalidFile variant for TAGX sections. Nothing visible here raises it. '''
def format_bytes(byts):
    '''
    Return the bytes as lowercase hexadecimal numbers separated by spaces.
    The numbers are not zero padded.
    '''
    return ' '.join(['%x' % b for b in bytearray(byts)])
def parse_indx_header(data):
    '''
    Parse the header of an INDX record.

    :return: A dict mapping each field name to the big-endian 32 bit number
        read for it. The fields are stored one after another, directly
        after the 4 byte signature.
    :raises InvalidFile: If ``data`` does not start with ``INDX``.
    '''
    check_signature(data, b'INDX')
    fields = (
        'len', 'nul1', 'type', 'gen', 'start', 'count', 'code',
        'lng', 'total', 'ordt', 'ligt', 'nligt', 'ncncx'
    )
    count = len(fields)
    numbers = struct.unpack(b'>%dL' % count, data[4:4*(count+1)])
    return dict(zip(fields, numbers))
class CNCX(object): # {{{

    '''
    Reads the records holding the compiled NCX, that is, all the strings of
    the NCX, and makes them available by offset. Strings from the second and
    later records have 0x10000 added to their offset for every record that
    precedes them.
    '''

    def __init__(self, records, codec):
        self.records = OrderedDict()
        base = 0
        for record in records:
            cursor, limit = 0, len(record)
            while cursor < limit:
                # Every string is preceded by its length, stored as a
                # variable width integer
                length, consumed = decint(record[cursor:])
                if length > 0:
                    key = cursor + base
                    text_start = cursor + consumed
                    try:
                        self.records[key] = record[
                            text_start:text_start+length].decode(codec)
                    except:
                        # Store a hex dump of the remainder of the record
                        # in place of the string and give up on the record
                        dump = format_bytes(record[cursor:])
                        print ('CNCX entry at offset %d has unknown format %s'%(
                            key, dump))
                        self.records[key] = dump
                        cursor = limit
                cursor += consumed+length
            base += 0x10000

    def __getitem__(self, offset):
        # Unknown offsets yield None, no KeyError is raised
        return self.records.get(offset)

    def get(self, offset, default=None):
        return self.records.get(offset, default)
# }}}
def parse_tag_section(data):
    '''
    Parse a TAGX section.

    :return: ``(control_byte_count, tags)`` where tags is a list with one
        tuple of four byte values for every 4 byte entry between offset 12
        and the first entry offset stored in the section.
    :raises InvalidFile: If ``data`` does not start with ``TAGX``.
    '''
    check_signature(data, b'TAGX')

    # Two 32 bit numbers follow the signature
    first_entry_offset, control_byte_count = struct.unpack_from(b'>LL',
            data, 0x04)

    # The tag table starts after the 12 bytes read so far
    tags = [
        tuple(ord(data[pos + k]) for k in range(4))
        for pos in range(12, first_entry_offset, 4)
    ]
    return control_byte_count, tags
def get_tag_map(control_byte_count, tags, data, start, end):
    '''
    Decode the values of a single index entry.

    :param control_byte_count: Number of control bytes at the start of the
        entry.
    :param tags: ``(tag, values_per_entry, mask, end_flag)`` tuples, as
        returned by parse_tag_section.
    :param data: The bytes containing the entry.
    :param start: Offset in ``data`` of the first control byte.
    :param end: Offset at which the entry ends, or None to skip the check
        for leftover bytes.
    :return: A dict mapping each tag that is present to its list of values.
    '''
    pending = []
    tag_map = {}
    control_idx = 0
    cursor = start + control_byte_count

    # First pass: use the control bytes to find out which tags are present
    # and how many values, or bytes of values, each one has.
    for tag, values_per_entry, mask, end_flag in tags:
        if end_flag == 0x01:
            control_idx += 1
            continue
        value = ord(data[start + control_idx]) & mask
        if value == 0:
            continue
        if value != mask:
            # Shift bits to get the masked value.
            while mask & 0x01 == 0:
                mask >>= 1
                value >>= 1
            pending.append((tag, value, None, values_per_entry))
        elif count_set_bits(mask) > 1:
            # If all bits of masked value are set and the mask has more than
            # one bit, a variable width value will follow after the control
            # bytes which defines the length of bytes (NOT the value count!)
            # which will contain the corresponding variable width values.
            value, consumed = decint(data[cursor:])
            cursor += consumed
            pending.append((tag, None, value, values_per_entry))
        else:
            pending.append((tag, 1, None, values_per_entry))

    # Second pass: read the values themselves
    for tag, value_count, value_bytes, values_per_entry in pending:
        values = []
        if value_count is None:
            # Convert value_bytes to variable width values.
            total_consumed = 0
            while total_consumed < value_bytes:
                # Does this work for values_per_entry != 1?
                number, consumed = decint(data[cursor:])
                cursor += consumed
                total_consumed += consumed
                values.append(number)
            if total_consumed != value_bytes:
                print ("Error: Should consume %s bytes, but consumed %s" %
                        (value_bytes, total_consumed))
        else:
            # Read value_count * values_per_entry variable width values.
            for _i in range(value_count*values_per_entry):
                number, consumed = decint(data[cursor:])
                cursor += consumed
                values.append(number)
        tag_map[tag] = values

    # Test that all bytes have been processed if end is given.
    if end is not None and cursor < end:
        # The last entry might have some zero padding bytes, so complain
        # only if non zero bytes are left.
        leftover = data[cursor:end]
        if leftover.replace(b'\0', b''):
            print ("Warning: There are unprocessed index bytes left: %s" %
                    format_bytes(leftover))

    return tag_map
def read_index(sections, idx, codec):
    '''
    Read the index whose header record is ``sections[idx]``.

    :param sections: Sequence whose items hold the raw record data as their
        first element.
    :param idx: Number of the header record of the index.
    :param codec: Codec used to decode the CNCX strings.
    :return: ``(table, cncx)``. ``table`` maps the text of every entry to
        its tag map, in the order read. ``cncx`` is the CNCX built from the
        records that follow the index records (empty if there are none).
    '''
    table = OrderedDict()
    cncx = CNCX([], codec)

    data = sections[idx][0]
    indx_header = parse_indx_header(data)
    indx_count = indx_header['count']
    ncncx = indx_header['ncncx']

    if ncncx > 0:
        # The CNCX records come directly after the index records
        first = idx + indx_count + 1
        cncx = CNCX([sec[0] for sec in sections[first:first+ncncx]], codec)

    control_byte_count, tags = parse_tag_section(data[indx_header['len']:])

    for recnum in range(idx + 1, idx + 1 + indx_count):
        data = sections[recnum][0]
        header = parse_indx_header(data)
        idxt_pos = header['start']
        entry_count = header['count']

        # The IDXT block lists the start of every entry as a 16 bit number,
        # beginning 4 bytes into the block
        boundaries = [
            struct.unpack_from(b'>H', data, idxt_pos + 4 + (2 * j))[0]
            for j in range(entry_count)]
        # The last entry ends before the IDXT tag (but there might be zero
        # fill bytes we need to ignore!)
        boundaries.append(idxt_pos)

        # Every entry is: length of its text (one byte), the text, then the
        # tag data
        for start, end in zip(boundaries, boundaries[1:]):
            text_end = start + 1 + ord(data[start])
            text = data[start+1:text_end]
            table[text] = get_tag_map(control_byte_count, tags, data,
                    text_end, end)

    return table, cncx

View File

@ -0,0 +1,307 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import re, os
def update_internal_links(mobi8_reader):
    '''
    Replace the position based links between parts with ordinary
    ``filename#id`` links.

    This has to run **BEFORE** any pieces are cut from or pasted into the
    xhtml text files, as the links refer to positions in the unmodified
    text. A link looks like ``kindle:pos:fid:XXXX:off:YYYYYYYYYY``, where
    XXXX is the offset in records into divtbl and YYYYYYYYYY is a base32
    number you add to the divtbl insertpos to get the final position.

    :param mobi8_reader: Reader providing ``parts`` (bytestrings),
        ``header.codec`` and ``get_id_tag_by_pos_fid(posfid, offset)``,
        which returns ``(filename, idtag)``.
    :return: The parts, decoded to unicode, with the links resolved.
    '''
    mr = mobi8_reader
    codec = mr.header.codec

    anchor_pattern = re.compile(br'''(<a.*?href=.*?>)''', re.IGNORECASE)
    # Base32 digits are 0-9 and A-V
    posfid_index_pattern = re.compile(
        br'''['"]kindle:pos:fid:([0-9A-V]+):off:([0-9A-V]+).*?["']''')

    def resolve(match):
        filename, idtag = mr.get_id_tag_by_pos_fid(match.group(1),
                match.group(2))
        target = filename.encode(codec)
        if idtag:
            target += b'#' + idtag
        # The pattern consumes the quotes around the old value, so the new
        # value has to be quoted again
        return b'"' + target.replace(b'"', b'&quot;') + b'"'

    parts = []
    for part in mr.parts:
        # Because of the capturing group the anchor tags are the odd
        # numbered pieces
        srcpieces = anchor_pattern.split(part)
        for j in range(1, len(srcpieces), 2):
            # A function is used as the replacement so that the link is
            # inserted verbatim, without any escape processing
            srcpieces[j] = posfid_index_pattern.sub(resolve, srcpieces[j])
        parts.append(''.join([x.decode(codec) for x in srcpieces]))

    # All parts are now unicode and have no internal links
    return parts
def remove_kindlegen_markup(parts):
    '''
    Strip the ``aid`` and ``data-AmznPageBreak`` attributes generated by
    Kindlegen from all tags. ``parts`` is modified in place.
    '''
    # For each attribute: a pattern that finds the tags carrying it and a
    # pattern that matches the attribute inside such a tag
    passes = (
        # we can safely remove all of the Kindlegen generated aid tags
        (re.compile(r'''(<[^>]*\said\s*=[^>]*>)''', re.IGNORECASE),
         re.compile(r'''\said\s*=['"][^'"]*['"]''')),
        # we can safely remove all of the Kindlegen generated
        # data-AmznPageBreak tags
        (re.compile(r'''(<[^>]*\sdata-AmznPageBreak=[^>]*>)''',
            re.IGNORECASE),
         re.compile(r'''\sdata-AmznPageBreak=['"][^'"]*['"]''')),
    )

    for tag_pattern, attr_pattern in passes:
        for i, part in enumerate(parts):
            pieces = tag_pattern.split(part)
            for j, piece in enumerate(pieces):
                if not piece.startswith('<'):
                    continue
                # One removal for every occurrence found in the tag
                for _match in attr_pattern.finditer(piece):
                    piece = attr_pattern.sub('', piece, 1)
                pieces[j] = piece
            parts[i] = "".join(pieces)
def update_flow_links(mobi8_reader, resource_map, log):
    '''
    Resolve the ``kindle:`` references inside the flows of the book.

    The following kinds of reference are handled::

        kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images)
        kindle:flow:XXXX?mime=YYYY/ZZZ (used for style sheets, svg images, etc)
        kindle:embed:XXXX (used for fonts)

    XXXX is a base32 number. Embedded resources are looked up in
    ``resource_map`` at XXXX - 1, flows in ``mobi8_reader.flowinfo`` at
    XXXX. A reference that cannot be resolved is left untouched and a
    warning is logged.

    :param mobi8_reader: Reader providing ``flows``, ``flowinfo`` and
        ``header.codec``.
    :param resource_map: Sequence of hrefs, with a false value for resources
        that were not recognized.
    :param log: Object with a ``warn`` method.
    :return: A new list of flows, decoded to unicode, with links resolved.
        Flows that are None are passed through.
    '''
    mr = mobi8_reader
    codec = mr.header.codec

    img_pattern = re.compile(r'''<(?:img|image)\b[^>]*>''', re.IGNORECASE)
    img_index_pattern = re.compile(r'''['"]kindle:embed:([0-9A-V]+)[^'"]*['"]''', re.IGNORECASE)

    tag_pattern = re.compile(r'''(<[^>]*>)''')
    flow_pattern = re.compile(r'''['"]kindle:flow:([0-9A-V]+)\?mime=([^'"]+)['"]''', re.IGNORECASE)

    url_pattern = re.compile(r'''url\(.*?\)''', re.IGNORECASE)
    url_img_index_pattern = re.compile(r'''kindle:embed:([0-9A-V]+)\?mime=image/[^\)]*''', re.IGNORECASE)
    font_index_pattern = re.compile(r'''kindle:embed:([0-9A-V]+)''', re.IGNORECASE)
    url_css_index_pattern = re.compile(r'''kindle:flow:([0-9A-V]+)\?mime=text/css[^\)]*''', re.IGNORECASE)

    # The replacements below are functions. That way every reference is
    # replaced by its own target, even if an earlier reference in the same
    # tag could not be resolved, and hrefs are inserted verbatim.

    def fix_img_tag(tag_match):
        # links to raster image files from image tags
        tag = tag_match.group()

        def image_href(m):
            num = int(m.group(1), 32)
            href = resource_map[num-1]
            if href:
                return '"%s"'%('../'+ href)
            log.warn('Referenced image %s was not recognized '
                    'as a valid image in %s' % (num, tag))
            return m.group()

        return img_index_pattern.sub(image_href, tag)

    def fix_css_url(url_match):
        # replacements inside css url():
        original = url_match.group()

        def image_href(m):
            # process links to raster image files
            num = int(m.group(1), 32)
            href = resource_map[num-1]
            if href:
                return '"%s"'%('../'+ href)
            log.warn('Referenced image %s was not recognized as a '
                    'valid image in %s' % (num, original))
            return m.group()

        def font_href(m):
            # process links to fonts
            num = int(m.group(1), 32)
            href = resource_map[num-1]
            if href is None:
                log.warn('Referenced font %s was not recognized as a '
                        'valid font in %s' % (num, original))
                return m.group()
            return '"%s"'%('../'+ href)

        def css_href(m):
            # process links to other css pieces
            fi = mr.flowinfo[int(m.group(1), 32)]
            return '"../' + fi.dir + '/' + fi.fname + '"'

        # Images first: the font pattern would match image references too
        url = url_img_index_pattern.sub(image_href, original)
        url = font_index_pattern.sub(font_href, url)
        return url_css_index_pattern.sub(css_href, url)

    flows = []
    for flow in mr.flows:
        if flow is None: # 0th flow is None
            flows.append(flow)
            continue

        if isinstance(flow, bytes):
            flow = flow.decode(codec)

        flow = img_pattern.sub(fix_img_tag, flow)
        flow = url_pattern.sub(fix_css_url, flow)

        # flow pattern not inside url()
        srcpieces = tag_pattern.split(flow)
        for j in range(1, len(srcpieces), 2):
            tag = srcpieces[j]
            for m in flow_pattern.finditer(tag):
                num = int(m.group(1), 32)
                fi = mr.flowinfo[num]
                if fi.format == 'inline':
                    # The whole tag is replaced by the referenced flow.
                    # NOTE(review): this is the flow as stored in the
                    # reader, not the copy with resolved links built here.
                    tag = mr.flows[num]
                else:
                    replacement = '"../' + fi.dir + '/' + fi.fname + '"'
                    tag = flow_pattern.sub(replacement, tag, 1)
            srcpieces[j] = tag
        flow = "".join(srcpieces)

        flows.append(flow)

    # All flows are now unicode and have links resolved
    return flows
def insert_flows_into_markup(parts, flows, mobi8_reader):
    '''
    Resolve the ``kindle:flow:XXXX?mime=YYYY/ZZZ`` references (used for
    style sheets, svg images, etc) found in the tags of the parts.

    A tag that refers to an inline flow is replaced by that flow. A
    reference to any other flow becomes a link to the file of the flow.
    ``parts`` is modified in place.
    '''
    mr = mobi8_reader

    tag_pattern = re.compile(r'''(<[^>]*>)''')
    flow_pattern = re.compile(r'''['"]kindle:flow:([0-9|A-V]+)\?mime=([^'"]+)['"]''', re.IGNORECASE)

    for i, part in enumerate(parts):
        # Splitting on a capturing pattern puts the tags at the odd indices
        pieces = tag_pattern.split(part)
        for j in range(1, len(pieces), 2):
            piece = pieces[j]
            if not piece.startswith('<'):
                continue
            for m in flow_pattern.finditer(piece):
                num = int(m.group(1), 32)
                fi = mr.flowinfo[num]
                if fi.format == 'inline':
                    piece = flows[num]
                else:
                    piece = flow_pattern.sub(
                        '"../' + fi.dir + '/' + fi.fname + '"', piece, 1)
            pieces[j] = piece
        # store away modified version
        parts[i] = "".join(pieces)
def insert_images_into_markup(parts, resource_map, log):
    '''
    Handle any embedded raster images links in the xhtml text, which look
    like ``kindle:embed:XXXX?mime=image/gif`` (png, jpeg, etc).

    XXXX is a base32 number and the image is the entry XXXX - 1 of
    ``resource_map``. A reference whose entry is false is left untouched
    and a warning is logged. ``parts`` is modified in place.

    :param parts: List of unicode strings.
    :param resource_map: Sequence of hrefs.
    :param log: Object with a ``warn`` method.
    '''
    img_pattern = re.compile(r'''<(?:img|image)\b[^>]*>''', re.IGNORECASE)
    img_index_pattern = re.compile(r'''['"]kindle:embed:([0-9A-V]+)[^'"]*['"]''')

    def fix_img_tag(tag_match):
        tag = tag_match.group()

        # A function is used as the replacement so that every reference is
        # replaced by its own image, even if an earlier reference in the
        # same tag could not be resolved.
        def image_href(m):
            num = int(m.group(1), 32)
            href = resource_map[num-1]
            if href:
                return '"%s"'%('../' + href)
            log.warn('Referenced image %s was not recognized as '
                    'a valid image in %s' % (num, tag))
            return m.group()

        return img_index_pattern.sub(image_href, tag)

    for i, part in enumerate(parts):
        # store away modified version
        parts[i] = img_pattern.sub(fix_img_tag, part)
def upshift_markup(parts):
    '''
    Restore the camel case of the ``preserveAspectRatio`` and ``viewBox``
    attributes in all svg tags. ``parts`` is modified in place.
    '''
    # The pattern must not contain capturing groups of its own: the text
    # around the tags has to survive unchanged and every tag has to be
    # visited.
    svg_tag_pattern = re.compile(r'''<svg[^>]*>''', re.IGNORECASE)

    def fix_case(match):
        tag = match.group()
        tag = tag.replace('preserveaspectratio','preserveAspectRatio')
        return tag.replace('viewbox','viewBox')

    for i, part in enumerate(parts):
        # store away modified version
        parts[i] = svg_tag_pattern.sub(fix_case, part)
def expand_mobi8_markup(mobi8_reader, resource_map, log):
    '''
    Turn the raw parts and flows of a KF8 book into files.

    All links are resolved and the results are stored back on the reader as
    ``parts`` and ``flows``. Every part is then written, UTF-8 encoded, to
    the location given by its entry in ``partinfo``, and every flow whose
    format is ``'file'`` to the location given by its entry in
    ``flowinfo``. All paths are relative to the current directory.

    :return: The paths of the files written for the parts, in order.
    '''
    # The order of these steps matters. Links based on offsets have to be
    # updated first, and the flows have to be processed before the parts as
    # they may be inlined into the xhtml text.
    parts = update_internal_links(mobi8_reader)
    # Remove pointless markup inserted by kindlegen
    remove_kindlegen_markup(parts)
    flows = update_flow_links(mobi8_reader, resource_map, log)
    insert_flows_into_markup(parts, flows, mobi8_reader)
    # Insert raster images into markup
    insert_images_into_markup(parts, resource_map, log)
    # Perform general markup cleanups
    upshift_markup(parts)

    # Update the parts and flows stored in the reader
    mobi8_reader.parts = parts
    mobi8_reader.flows = flows

    # write out the parts and file flows
    # NOTE(review): only the 'text' directory is created for the parts;
    # presumably the type of every part is 'text'.
    os.mkdir('text') # directory containing all parts
    spine = []
    for i, part in enumerate(parts):
        info = mobi8_reader.partinfo[i]
        path = os.path.join(info.type, info.filename)
        with open(path, 'wb') as out:
            out.write(part.encode('utf-8'))
        spine.append(path)

    for i, flow in enumerate(flows):
        info = mobi8_reader.flowinfo[i]
        if info.format != 'file':
            continue
        if not os.path.exists(info.dir):
            os.mkdir(info.dir)
        with open(os.path.join(info.dir, info.fname), 'wb') as out:
            out.write(flow.encode('utf-8'))

    return spine

View File

@ -1,10 +1,12 @@
__license__ = 'GPL v3' #!/usr/bin/env python
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>' # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
''' from __future__ import (absolute_import, print_function)
Read data from .mobi files
'''
import shutil, os, re, struct, textwrap, cStringIO, sys __license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import shutil, os, re, struct, textwrap, cStringIO
try: try:
from PIL import Image as PILImage from PIL import Image as PILImage
@ -14,235 +16,22 @@ except ImportError:
from lxml import html, etree from lxml import html, etree
from calibre import xml_entity_to_unicode, CurrentDir, entity_to_unicode, \ from calibre import (xml_entity_to_unicode, entity_to_unicode)
replace_entities
from calibre.utils.filenames import ascii_filename from calibre.utils.filenames import ascii_filename
from calibre.utils.date import parse_date
from calibre.utils.cleantext import clean_ascii_chars from calibre.utils.cleantext import clean_ascii_chars
from calibre.ptempfile import TemporaryDirectory
from calibre.ebooks import DRMError, unit_convert from calibre.ebooks import DRMError, unit_convert
from calibre.ebooks.chardet import ENCODING_PATS from calibre.ebooks.chardet import ENCODING_PATS
from calibre.ebooks.mobi import MobiError from calibre.ebooks.mobi import MobiError
from calibre.ebooks.mobi.huffcdic import HuffReader from calibre.ebooks.mobi.huffcdic import HuffReader
from calibre.ebooks.mobi.langcodes import main_language, sub_language, mobi2iana
from calibre.ebooks.compression.palmdoc import decompress_doc from calibre.ebooks.compression.palmdoc import decompress_doc
from calibre.ebooks.metadata import MetaInformation from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.metadata.opf2 import OPFCreator, OPF from calibre.ebooks.metadata.opf2 import OPFCreator, OPF
from calibre.ebooks.metadata.toc import TOC from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.mobi.reader.headers import BookHeader
class TopazError(ValueError): class TopazError(ValueError):
pass pass
# Pre-commit copy of the EXTH parser (the deleted side of this hunk). It reads
# the EXTH records that follow the MOBI header and collects their metadata in
# self.mi.
class EXTHHeader(object):
# raw: bytes starting at the EXTH block. codec: codec used to decode textual
# records. title: title to use unless a record of type 503 replaces it.
def __init__(self, raw, codec, title):
self.doctype = raw[:4]
self.length, self.num_items = struct.unpack('>LL', raw[4:12])
raw = raw[12:]
pos = 0
self.mi = MetaInformation(_('Unknown'), [_('Unknown')])
self.has_fake_cover = True
left = self.num_items
while left > 0:
left -= 1
# Each record is: type (4 bytes), size (4 bytes), payload. The size counts
# the 8 byte prefix as well.
id, size = struct.unpack('>LL', raw[pos:pos + 8])
content = raw[pos + 8:pos + size]
pos += size
if id >= 100 and id < 200:
self.process_metadata(id, content, codec)
elif id == 203:
self.has_fake_cover = bool(struct.unpack('>L', content)[0])
elif id == 201:
co, = struct.unpack('>L', content)
# Cover offsets of 1e7 or more are ignored
if co < 1e7:
self.cover_offset = co
elif id == 202:
self.thumbnail_offset, = struct.unpack('>L', content)
elif id == 501:
# cdetype
pass
elif id == 502:
# last update time
pass
elif id == 503: # Long title
# Amazon seems to regard this as the definitive book title
# rather than the title from the PDB header. In fact when
# sending MOBI files through Amazon's email service if the
# title contains non ASCII chars or non filename safe chars
# they are messed up in the PDB header
try:
title = content.decode(codec)
except:
pass
#else:
# print 'unknown record', id, repr(content)
if title:
self.mi.title = replace_entities(title)
# Stores the value of one metadata record (types 100 to 199) in self.mi.
# Types that are not listed are ignored.
def process_metadata(self, id, content, codec):
if id == 100:
if self.mi.authors == [_('Unknown')]:
self.mi.authors = []
au = content.decode(codec, 'ignore').strip()
self.mi.authors.append(au)
# An author of the form "Last, First" doubles as the sort string
if re.match(r'\S+?\s*,\s+\S+', au.strip()):
self.mi.author_sort = au.strip()
elif id == 101:
self.mi.publisher = content.decode(codec, 'ignore').strip()
elif id == 103:
self.mi.comments = content.decode(codec, 'ignore')
elif id == 104:
self.mi.isbn = content.decode(codec, 'ignore').strip().replace('-', '')
elif id == 105:
# Tags are separated by ';'. Going through set() removes duplicates but
# does not keep the order of the tags.
if not self.mi.tags:
self.mi.tags = []
self.mi.tags.extend([x.strip() for x in content.decode(codec,
'ignore').split(';')])
self.mi.tags = list(set(self.mi.tags))
elif id == 106:
# An unparseable date is silently skipped
try:
self.mi.pubdate = parse_date(content, as_utc=False)
except:
pass
elif id == 108:
pass # Producer
elif id == 113:
pass # ASIN or UUID
#else:
# print 'unhandled metadata record', id, repr(content)
class BookHeader(object):
    """Decode the first record of a MOBI/PalmDoc file.

    The fields read from the record are stored as attributes (compression
    and encryption type, record count, codec, title, language, MOBI version,
    first image index, EXTH flags and, when present, the parsed EXTH header
    as ``exth``).
    """

    def __init__(self, raw, ident, user_encoding, log, try_extra_data_fix=False):
        """
        :param raw: Bytes of record 0
        :param ident: The PDB identity, e.g. ``'BOOKMOBI'`` or ``'TEXTREAD'``
        :param user_encoding: Codec used when the codepage is not recognized
                              (cp1252 if empty)
        :param log: Logger providing ``warn()`` and ``exception()``
        :param try_extra_data_fix: If True, a header of length 0xE4 is
                                   treated as having no extra data flags
        """
        self.log = log
        self.compression_type = raw[:2]
        self.records, self.records_size = struct.unpack('>HH', raw[8:12])
        self.encryption_type, = struct.unpack('>H', raw[12:14])
        if ident == 'TEXTREAD':
            self.codepage = 1252
        if len(raw) <= 16:
            # Record holds nothing beyond the first 16 bytes: use defaults
            self.codec = 'cp1252'
            self.extra_flags = 0
            self.title = _('Unknown')
            self.language = 'ENGLISH'
            self.sublanguage = 'NEUTRAL'
            self.exth_flag, self.exth = 0, None
            self.ancient = True
            self.first_image_index = -1
            self.mobi_version = 1
        else:
            self.ancient = False
            self.doctype = raw[16:20]
            self.length, self.type, self.codepage, self.unique_id, \
                self.version = struct.unpack('>LLLLL', raw[20:40])

            try:
                self.codec = {
                    1252: 'cp1252',
                    65001: 'utf-8',
                    }[self.codepage]
            except (IndexError, KeyError):
                self.codec = 'cp1252' if not user_encoding else user_encoding
                log.warn('Unknown codepage %d. Assuming %s' % (self.codepage,
                    self.codec))
            # There exists some broken DRM removal tool that removes DRM but
            # leaves the DRM fields in the header yielding a header size of
            # 0xF8. The actual value of max_header_length should be 0xE8 but
            # it's changed to accommodate this silly tool. Hopefully that will
            # not break anything else.
            max_header_length = 0xF8

            if (ident == 'TEXTREAD' or self.length < 0xE4 or
                    self.length > max_header_length or
                    (try_extra_data_fix and self.length == 0xE4)):
                self.extra_flags = 0
            else:
                self.extra_flags, = struct.unpack('>H', raw[0xF2:0xF4])

            if self.compression_type == 'DH':
                self.huff_offset, self.huff_number = struct.unpack('>LL', raw[0x70:0x78])

            # Title is stored as (offset, length) into this record
            toff, tlen = struct.unpack('>II', raw[0x54:0x5c])
            tend = toff + tlen
            self.title = raw[toff:tend] if tend < len(raw) else _('Unknown')
            # Low byte is the main language, bits 10-17 the sub language
            langcode = struct.unpack('!L', raw[0x5C:0x60])[0]
            langid = langcode & 0xFF
            sublangid = (langcode >> 10) & 0xFF
            self.language = main_language.get(langid, 'ENGLISH')
            self.sublanguage = sub_language.get(sublangid, 'NEUTRAL')
            self.mobi_version = struct.unpack('>I', raw[0x68:0x6c])[0]
            self.first_image_index = struct.unpack('>L', raw[0x6c:0x6c + 4])[0]

            self.exth_flag, = struct.unpack('>L', raw[0x80:0x84])
            self.exth = None
            if not isinstance(self.title, unicode):
                self.title = self.title.decode(self.codec, 'replace')
            if self.exth_flag & 0x40:
                # An unparseable EXTH block is logged and then ignored.
                # Exception (not a bare except) so that KeyboardInterrupt
                # and SystemExit are not swallowed.
                try:
                    self.exth = EXTHHeader(raw[16 + self.length:], self.codec, self.title)
                    self.exth.mi.uid = self.unique_id
                    try:
                        self.exth.mi.language = mobi2iana(langid, sublangid)
                    except Exception:
                        self.log.exception('Unknown language code')
                except Exception:
                    self.log.exception('Invalid EXTH header')
                    self.exth_flag = 0
class MetadataHeader(BookHeader):
    """A BookHeader that is read directly from a seekable stream.

    Only the PDB section table and the first record are read up front;
    individual sections can be fetched later with :meth:`section_data`.
    """

    def __init__(self, stream, log):
        """
        :param stream: Seekable binary stream positioned anywhere
        :param log: Logger passed on to ``BookHeader``
        """
        self.stream = stream
        self.ident = self.identity()
        self.num_sections = self.section_count()
        if self.num_sections >= 2:
            header = self.header()
            BookHeader.__init__(self, header, self.ident, None, log)
        else:
            # No second section to delimit the header with. Callers read
            # both exth and title unconditionally, so define both.
            self.exth = None
            self.title = _('Unknown')

    def identity(self):
        """Return the 8 byte identity at offset 60, upper-cased.

        Raises MobiError if it is neither BOOKMOBI nor TEXTREAD.
        """
        self.stream.seek(60)
        ident = self.stream.read(8).upper()
        if ident not in ['BOOKMOBI', 'TEXTREAD']:
            raise MobiError('Unknown book type: %s' % ident)
        return ident

    def section_count(self):
        """Return the number of sections (big-endian 16 bit at offset 76)."""
        self.stream.seek(76)
        return struct.unpack('>H', self.stream.read(2))[0]

    def section_offset(self, number):
        """Return the file offset of section ``number``.

        Section table entries are 8 bytes each and start at offset 78; the
        offset is the first 4 bytes of the entry.
        """
        self.stream.seek(78 + number * 8)
        return struct.unpack('>LBBBB', self.stream.read(8))[0]

    def header(self):
        """Return the raw bytes of the first section."""
        section_headers = []
        # First section with the metadata
        section_headers.append(self.section_offset(0))
        # Second section used to get the length of the first
        section_headers.append(self.section_offset(1))

        end_off = section_headers[1]
        off = section_headers[0]
        self.stream.seek(off)
        return self.stream.read(end_off - off)

    def _stream_size(self):
        """Return the total size of the stream, preserving its position.

        Uses seek/tell instead of os.stat() so that in-memory streams and
        streams without a ``name`` attribute work as well.
        """
        pos = self.stream.tell()
        self.stream.seek(0, os.SEEK_END)
        size = self.stream.tell()
        self.stream.seek(pos)
        return size

    def section_data(self, number):
        """Return the raw bytes of section ``number``.

        The last section extends to the end of the stream.
        """
        start = self.section_offset(number)
        if number == self.num_sections - 1:
            end = self._stream_size()
        else:
            end = self.section_offset(number + 1)
        self.stream.seek(start)
        try:
            return self.stream.read(end - start)
        except OverflowError:
            # Size too large for read(); read to the end of the stream
            size = self._stream_size()
            self.stream.seek(start)
            return self.stream.read(size - start)
class MobiReader(object): class MobiReader(object):
PAGE_BREAK_PAT = re.compile( PAGE_BREAK_PAT = re.compile(
r'<\s*/{0,1}\s*mbp:pagebreak((?:\s+[^/>]*){0,1})/{0,1}\s*>\s*(?:<\s*/{0,1}\s*mbp:pagebreak\s*/{0,1}\s*>)*', r'<\s*/{0,1}\s*mbp:pagebreak((?:\s+[^/>]*){0,1})/{0,1}\s*>\s*(?:<\s*/{0,1}\s*mbp:pagebreak\s*/{0,1}\s*>)*',
@ -312,15 +101,46 @@ class MobiReader(object):
self.sections.append((section(i), self.section_headers[i])) self.sections.append((section(i), self.section_headers[i]))
self.book_header = BookHeader(self.sections[0][0], self.ident, self.book_header = bh = BookHeader(self.sections[0][0], self.ident,
user_encoding, self.log, try_extra_data_fix=try_extra_data_fix) user_encoding, self.log, try_extra_data_fix=try_extra_data_fix)
self.name = self.name.decode(self.book_header.codec, 'replace') self.name = self.name.decode(self.book_header.codec, 'replace')
self.kf8_type = None
is_kf8 = self.book_header.mobi_version == 8
if is_kf8:
self.kf8_type = 'standalone'
else: # Check for joint mobi 6 and kf 8 file
KF8_BOUNDARY = b'BOUNDARY'
for i, x in enumerate(self.sections[:-1]):
sec = x[0]
if (len(sec) == len(KF8_BOUNDARY) and sec ==
KF8_BOUNDARY):
try:
self.book_header = BookHeader(self.sections[i+1][0],
self.ident, user_encoding, self.log)
# The following are only correct in the Mobi 6
# header not the Mobi 8 header
for x in ('first_image_index',):
setattr(self.book_header, x, getattr(bh, x))
self.book_header.huff_offset += i + 1
self.kf8_type = 'joint'
self.kf8_boundary = i
except:
pass
break
def check_for_drm(self):
    """Raise DRMError if the book header has a non-zero encryption type.

    The error carries the title from the EXTH metadata when it is available
    and non-empty, otherwise ``self.name``.
    """
    if self.book_header.encryption_type == 0:
        return
    try:
        name = self.book_header.exth.mi.title
    except Exception:
        # exth may be None or lack metadata; fall back to the PDB name
        name = self.name
    raise DRMError(name or self.name)
def extract_content(self, output_dir, parse_cache): def extract_content(self, output_dir, parse_cache):
output_dir = os.path.abspath(output_dir) output_dir = os.path.abspath(output_dir)
if self.book_header.encryption_type != 0: self.check_for_drm()
raise DRMError(self.name)
processed_records = self.extract_text() processed_records = self.extract_text()
if self.debug is not None: if self.debug is not None:
parse_cache['calibre_raw_mobi_markup'] = self.mobi_html parse_cache['calibre_raw_mobi_markup'] = self.mobi_html
@ -916,11 +736,12 @@ class MobiReader(object):
trail_size = self.sizeof_trailing_entries(data) trail_size = self.sizeof_trailing_entries(data)
return data[:len(data)-trail_size] return data[:len(data)-trail_size]
def extract_text(self): def extract_text(self, offset=1):
self.log.debug('Extracting text...') self.log.debug('Extracting text...')
text_sections = [self.text_section(i) for i in range(1, text_sections = [self.text_section(i) for i in xrange(offset,
min(self.book_header.records + 1, len(self.sections)))] min(self.book_header.records + offset, len(self.sections)))]
processed_records = list(range(0, self.book_header.records + 1)) processed_records = list(range(offset-1, self.book_header.records +
offset))
self.mobi_html = '' self.mobi_html = ''
@ -1027,63 +848,6 @@ class MobiReader(object):
self.image_names.append(os.path.basename(path)) self.image_names.append(os.path.basename(path))
im.save(open(path, 'wb'), format='JPEG') im.save(open(path, 'wb'), format='JPEG')
def get_metadata(stream):
    """Read metadata (and the cover, when it can be decoded) from ``stream``.

    Streams starting with ``TPZ`` are delegated to the Topaz metadata
    reader. Otherwise the MOBI header is parsed; if it has no EXTH block
    and the stream is smaller than 4MB, the book is unpacked into a
    temporary directory to look for embedded metadata.

    :param stream: Seekable binary stream
    :return: A MetaInformation object
    """
    stream.seek(0)
    try:
        raw = stream.read(3)
    except Exception:
        raw = ''
    stream.seek(0)
    if raw == 'TPZ':
        from calibre.ebooks.metadata.topaz import get_metadata
        return get_metadata(stream)
    from calibre.utils.logging import Log
    log = Log()
    try:
        mi = MetaInformation(os.path.basename(stream.name), [_('Unknown')])
    except Exception:
        # Stream has no usable name
        mi = MetaInformation(_('Unknown'), [_('Unknown')])
    mh = MetadataHeader(stream, log)
    # MetadataHeader does not set a title for files with fewer than two
    # sections, so do not assume the attribute exists
    title = getattr(mh, 'title', None)
    if title and title != _('Unknown'):
        mi.title = title

    if mh.exth is not None:
        if mh.exth.mi is not None:
            mi = mh.exth.mi
    else:
        size = sys.maxint
        if hasattr(stream, 'seek') and hasattr(stream, 'tell'):
            pos = stream.tell()
            stream.seek(0, 2)
            size = stream.tell()
            stream.seek(pos)
        # Unpacking the whole book is only attempted for small files
        if size < 4*1024*1024:
            with TemporaryDirectory('_mobi_meta_reader') as tdir:
                with CurrentDir(tdir):
                    mr = MobiReader(stream, log)
                    parse_cache = {}
                    mr.extract_content(tdir, parse_cache)
                    if mr.embedded_mi is not None:
                        mi = mr.embedded_mi
    if hasattr(mh.exth, 'cover_offset'):
        cover_index = mh.first_image_index + mh.exth.cover_offset
        data = mh.section_data(int(cover_index))
    else:
        try:
            data = mh.section_data(mh.first_image_index)
        except Exception:
            data = ''
    buf = cStringIO.StringIO(data)
    try:
        im = PILImage.open(buf)
    except Exception:
        log.exception('Failed to read MOBI cover')
    else:
        obuf = cStringIO.StringIO()
        im.convert('RGB').save(obuf, format='JPEG')
        mi.cover_data = ('jpg', obuf.getvalue())
    return mi
def test_mbp_regex(): def test_mbp_regex():
for raw, m in { for raw, m in {
'<mbp:pagebreak></mbp:pagebreak>':'', '<mbp:pagebreak></mbp:pagebreak>':'',

View File

@ -0,0 +1,390 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import struct, re, os, zlib, imghdr
from collections import namedtuple
from itertools import repeat
from calibre.ebooks.mobi.reader.headers import NULL_INDEX
from calibre.ebooks.mobi.reader.index import read_index
from calibre.ebooks.mobi.reader.ncx import read_ncx, build_toc
from calibre.ebooks.mobi.reader.markup import expand_mobi8_markup
from calibre.ebooks.metadata.opf2 import Guide, OPFCreator
# A generated text file: its number, type, filename, the [start, end) range
# it covers in the raw markup and its aid
Part = namedtuple('Part', ['num', 'type', 'filename', 'start', 'end', 'aid'])

# One entry of the div table
Elem = namedtuple('Elem', ['insert_pos', 'toc_text', 'file_number',
    'sequence_number', 'start_pos', 'length'])

# How a non-text flow is emitted
FlowInfo = namedtuple('FlowInfo', ['type', 'format', 'dir', 'fname'])
class Mobi8Reader(object):
    """Unpack the KF8 part of a MOBI file.

    Calling an instance extracts the text, rebuilds the individual XHTML
    files from the skeleton/div tables, writes out resources and finally
    writes ``metadata.opf`` and ``toc.ncx``. All files are created relative
    to the current working directory.
    """

    def __init__(self, mobi6_reader, log):
        """
        :param mobi6_reader: Reader providing ``sections``, ``book_header``,
                             ``kf8_type``, ``extract_text()`` etc.
        :param log: Logger, passed on to the markup expander
        """
        self.mobi6_reader, self.log = mobi6_reader, log
        self.header = mobi6_reader.book_header

    def __call__(self):
        """Run the conversion and return the name of the OPF file written.

        Raises whatever ``check_for_drm()`` raises for encrypted books.
        """
        self.mobi6_reader.check_for_drm()
        offset = 1
        res_end = len(self.mobi6_reader.sections)
        if self.mobi6_reader.kf8_type == 'joint':
            # Joint file: the KF8 header is the record after the boundary
            # record and resources end at the boundary
            offset = self.mobi6_reader.kf8_boundary + 2
            res_end = self.mobi6_reader.kf8_boundary

        self.processed_records = self.mobi6_reader.extract_text(offset=offset)
        self.raw_ml = self.mobi6_reader.mobi_html
        # NOTE: the raw markup is deliberately not dumped to disk here, any
        # file in the working directory ends up in the OPF manifest

        self.kf8_sections = self.mobi6_reader.sections[offset-1:]
        first_resource_index = self.header.first_image_index
        if first_resource_index in {-1, NULL_INDEX}:
            first_resource_index = self.header.records + 1
        self.resource_sections = \
            self.mobi6_reader.sections[first_resource_index:res_end]
        self.cover_offset = getattr(self.header.exth, 'cover_offset', None)

        self.read_indices()
        self.build_parts()
        guide = self.create_guide()
        ncx = self.create_ncx()
        resource_map = self.extract_resources()
        spine = self.expand_text(resource_map)
        return self.write_opf(guide, ncx, spine, resource_map)

    def read_indices(self):
        """Read the FDST, skeleton, div and guide tables.

        Sets ``flow_table``, ``files``, ``elems`` and ``guide``. Raises
        ValueError if the FDST record does not start with ``FDST``.
        """
        self.flow_table = (0, NULL_INDEX)

        if self.header.fdstidx != NULL_INDEX:
            header = self.kf8_sections[self.header.fdstidx][0]
            if header[:4] != b'FDST':
                raise ValueError('KF8 does not have a valid FDST record')
            num_sections, = struct.unpack_from(b'>L', header, 0x08)
            sections = header[0x0c:]
            # Entries are pairs of integers; only the first of every pair
            # (the flow start) is kept
            self.flow_table = struct.unpack_from(b'>%dL' % (num_sections*2),
                    sections, 0)[::2] + (NULL_INDEX,)

        self.files = []
        if self.header.skelidx != NULL_INDEX:
            table = read_index(self.kf8_sections, self.header.skelidx,
                    self.header.codec)[0]
            File = namedtuple('File',
                'file_number name divtbl_count start_position length')

            for i, text in enumerate(table.iterkeys()):
                tag_map = table[text]
                self.files.append(File(i, text, tag_map[1][0],
                    tag_map[6][0], tag_map[6][1]))

        self.elems = []
        if self.header.dividx != NULL_INDEX:
            table, cncx = read_index(self.kf8_sections, self.header.dividx,
                    self.header.codec)
            for text in table.iterkeys():
                tag_map = table[text]
                toc_text = cncx[tag_map[2][0]]
                self.elems.append(Elem(int(text), toc_text, tag_map[3][0],
                    tag_map[4][0], tag_map[6][0], tag_map[6][1]))

        self.guide = []
        if self.header.othidx != NULL_INDEX:
            table, cncx = read_index(self.kf8_sections, self.header.othidx,
                    self.header.codec)
            Item = namedtuple('Item',
                'type title div_frag_num')

            for ref_type in table.iterkeys():
                tag_map = table[ref_type]
                # ref_type, ref_title, div/frag number
                title = cncx[tag_map[1][0]]
                fileno = None
                if 3 in tag_map:
                    fileno = tag_map[3][0]
                if 6 in tag_map:
                    fileno = tag_map[6][0]
                self.guide.append(Item(ref_type.decode(self.header.codec),
                    title, fileno))

    def build_parts(self):
        """Split the raw markup into flows and rebuild the XHTML files.

        Sets ``flows``, ``flowinfo``, ``parts`` and ``partinfo``.
        """
        raw_ml = self.mobi6_reader.mobi_html

        self.flows = []
        self.flowinfo = []

        # now split the raw_ml into its flow pieces
        for start, end in zip(self.flow_table, self.flow_table[1:]):
            if end == NULL_INDEX:
                end = len(raw_ml)
            self.flows.append(raw_ml[start:end])

        # the first piece represents the xhtml text
        text = self.flows[0]
        self.flows[0] = b''

        # walk the <skeleton> and <div> tables to build original source xhtml
        # files *without* destroying any file position information needed for
        # later href processing and create final list of file separation start:
        # stop points and etc in partinfo
        self.parts = []
        self.partinfo = []
        divptr = 0
        baseptr = 0
        for skelnum, skelname, divcnt, skelpos, skellen in self.files:
            baseptr = skelpos + skellen
            skeleton = text[skelpos:baseptr]
            for i in range(divcnt):
                insertpos, idtext, filenum, seqnum, startpos, length = \
                                    self.elems[divptr]
                if i == 0:
                    # The first div of a skeleton supplies aid and filename
                    aidtext = idtext[12:-2]
                    filename = 'part%04d.html' % filenum
                part = text[baseptr:baseptr + length]
                # insert positions are relative to the whole text
                insertpos = insertpos - skelpos
                skeleton = skeleton[0:insertpos] + part + skeleton[insertpos:]
                baseptr = baseptr + length
                divptr += 1
            self.parts.append(skeleton)
            self.partinfo.append(Part(skelnum, 'text', filename, skelpos,
                baseptr, aidtext))

        # The primary css style sheet is typically stored next followed by any
        # snippets of code that were previously inlined in the
        # original xhtml but have been stripped out and placed here.
        # This can include local CDATA snippets and svg sections.

        # The problem is that for most browsers and ereaders, you can not
        # use <img src="imageXXXX.svg" /> to import any svg image that itself
        # properly uses an <image/> tag to import some raster image - it
        # should work according to the spec but does not for almost all browsers
        # and ereaders and causes epub validation issues because those raster
        # images are in manifest but not in xhtml text - since they are only
        # referenced from an svg image

        # So we need to check the remaining flow pieces to see if they are css
        # or svg images. if svg images, we must check if they have an <image/>
        # and if so inline them into the xhtml text pieces.

        # there may be other sorts of pieces stored here but until we see one
        # in the wild to reverse engineer we won't be able to tell

        self.flowinfo.append(FlowInfo(None, None, None, None))
        svg_tag_pattern = re.compile(br'''(<svg[^>]*>)''', re.IGNORECASE)
        image_tag_pattern = re.compile(br'''(<image[^>]*>)''', re.IGNORECASE)
        for j, flowpart in enumerate(self.flows[1:], 1):
            nstr = '%04d' % j
            m = svg_tag_pattern.search(flowpart)
            if m is not None:
                # svg
                typ = 'svg'
                start = m.start()
                m2 = image_tag_pattern.search(flowpart)
                if m2 is not None:
                    fmt = 'inline'
                    dirname = None
                    fname = None
                    # strip off anything before <svg if inlining
                    flowpart = flowpart[start:]
                else:
                    fmt = 'file'
                    dirname = "images"
                    fname = 'svgimg' + nstr + '.svg'
            else:
                # search for CDATA and if exists inline it
                # Byte literals are used so that flows containing non-ASCII
                # bytes are never implicitly decoded
                if flowpart.find(b'[CDATA[') >= 0:
                    typ = 'css'
                    flowpart = (b'<style type="text/css">\n' + flowpart +
                            b'\n</style>\n')
                    fmt = 'inline'
                    dirname = None
                    fname = None
                else:
                    # css - assume as standalone css file
                    typ = 'css'
                    fmt = 'file'
                    dirname = "styles"
                    fname = nstr + '.css'

            self.flows[j] = flowpart
            self.flowinfo.append(FlowInfo(typ, fmt, dirname, fname))

    def get_file_info(self, pos):
        ''' Get information about the part (file) that exists at pos in
        the raw markup. If no part contains pos, a Part with all fields set
        to None is returned. '''
        for part in self.partinfo:
            if part.start <= pos < part.end:
                return part
        return Part(*repeat(None, len(Part._fields)))

    def get_id_tag_by_pos_fid(self, posfid, offset):
        """Resolve a kindle:pos:fid link to ``(filename, id)``.

        :param posfid: Base 32 encoded index into the div table
        :param offset: Base 32 encoded offset from the div insert position
        """
        # first convert kindle:pos:fid and offset info to position in file
        row = int(posfid, 32)
        off = int(offset, 32)
        pos = self.elems[row][0] + off
        fname = self.get_file_info(pos).filename
        # an existing "id=" must exist in original xhtml otherwise it would not
        # have worked for linking. Amazon seems to have added its own
        # additional "aid=" inside tags whose contents seem to represent some
        # position information encoded into Base32 name.

        # so find the closest "id=" before position the file by actually
        # searching in that file
        idtext = self.get_id_tag(pos)
        return fname, idtext

    def get_id_tag(self, pos):
        """Return the id (bytes) of the closest id-bearing tag for ``pos``.

        The last tag with an id that starts at or before ``pos`` is used; if
        there is none, the first id in the file is used. Returns ``b''``
        when the file has no ids at all. Raises ValueError if no file
        contains ``pos``.
        """
        # find the correct tag by actually searching in the destination
        # textblock at position
        fi = self.get_file_info(pos)
        if fi.num is None and fi.start is None:
            raise ValueError('No file contains pos: %d'%pos)
        textblock = self.parts[fi.num]
        npos = pos - fi.start
        # if npos is inside a tag then search all text before its end of
        # tag marker
        pgt = textblock.find(b'>', npos)
        plt = textblock.find(b'<', npos)
        if pgt < plt:
            npos = pgt + 1
        # find id links only inside of tags
        #    inside any < > pair find all "id=' and return whatever is inside
        #    the quotes
        id_pattern = re.compile(br'''<[^>]*\sid\s*=\s*['"]([^'"]*)['"][^>]*>''',
                re.IGNORECASE)
        id_map = [(m.start(), m.group(1)) for m in
                id_pattern.finditer(textblock)]

        if not id_map:
            # Found no id in the textblock, link must be to top of file
            return b''
        # Default to the first id (npos is before every id), otherwise pick
        # the last id whose tag starts at or before npos. This also covers
        # npos being exactly at the start of the last tag.
        ans = id_map[0][1]
        for start, idval in id_map:
            if start > npos:
                break
            ans = idval
        return ans

    def create_guide(self):
        """Build the OPF guide from the guide table and the start offset."""
        guide = Guide()
        for ref_type, ref_title, fileno in self.guide:
            elem = self.elems[fileno]
            fi = self.get_file_info(elem.insert_pos)
            idtext = self.get_id_tag(elem.insert_pos).decode(self.header.codec)
            linktgt = fi.filename
            if idtext:
                linktgt += '#' + idtext
            g = Guide.Reference('%s/%s'%(fi.type, linktgt), os.getcwdu())
            g.title, g.type = ref_title, ref_type
            guide.append(g)

        # The EXTH header is optional and may not carry a start offset
        so = getattr(self.header.exth, 'start_offset', None)
        if so not in {None, NULL_INDEX}:
            fi = self.get_file_info(so)
            if fi.filename is not None:
                idtext = self.get_id_tag(so).decode(self.header.codec)
                linktgt = fi.filename
                if idtext:
                    linktgt += '#' + idtext
                g = Guide.Reference('%s/%s'%(fi.type, linktgt), os.getcwdu())
                g.title, g.type = 'start', 'text'
                guide.append(g)

        return guide

    def create_ncx(self):
        """Read the NCX index and return the TOC built from it.

        Raises ValueError if an index entry points outside every file.
        """
        index_entries = read_ncx(self.kf8_sections, self.header.ncxidx,
                self.header.codec)

        # Add href and anchor info to the index entries
        for entry in index_entries:
            pos = entry['pos']
            fi = self.get_file_info(pos)
            if fi.filename is None:
                raise ValueError('Index entry has invalid pos: %d'%pos)
            idtag = self.get_id_tag(pos).decode(self.header.codec)
            entry['href'] = '%s/%s'%(fi.type, fi.filename)
            entry['idtag'] = idtag

        # Build the TOC object
        return build_toc(index_entries)

    def extract_resources(self):
        """Write fonts and images to disk.

        :return: A list with one entry per resource section: the href of
                 the file written, or None for ignored records.
        """
        resource_map = []
        for x in ('fonts', 'images'):
            if not os.path.isdir(x):
                os.mkdir(x)

        for i, sec in enumerate(self.resource_sections):
            fname_idx = i+1
            data = sec[0]
            typ = data[:4]
            href = None
            if typ in {b'FLIS', b'FCIS', b'SRCS', b'\xe9\x8e\r\n',
                    b'RESC', b'BOUN', b'FDST', b'DATP', b'AUDI', b'VIDE'}:
                pass  # Ignore these records
            elif typ == b'FONT':
                # fonts only exist in K8 ebooks
                # Format:
                # bytes  0 -  3:  'FONT'
                # bytes  4 -  7:  ?? Expanded size in bytes ??
                # bytes  8 - 11:  ?? number of files ??
                # bytes 12 - 15:  ?? offset to start of compressed data ?? (typically 0x00000018 = 24)
                # bytes 16 - 23:  ?? typically all 0x00 ??  Are these compression flags from zlib?
                # The compressed data begins with 2 bytes of header and has 4 bytes of checksum at the end
                data = data[26:-4]
                uncompressed_data = zlib.decompress(data, -15)
                hdr = uncompressed_data[0:4]
                ext = 'dat'
                if hdr == b'\0\1\0\0' or hdr == b'true' or hdr == b'ttcf':
                    ext = 'ttf'
                href = "fonts/%05d.%s" % (fname_idx, ext)
                with open(href.replace('/', os.sep), 'wb') as f:
                    f.write(uncompressed_data)
            else:
                imgtype = imghdr.what(None, data)
                if imgtype is None:
                    imgtype = 'unknown'
                href = 'images/%05d.%s'%(fname_idx, imgtype)
                with open(href.replace('/', os.sep), 'wb') as f:
                    f.write(data)

            resource_map.append(href)

        return resource_map

    def expand_text(self, resource_map):
        """Expand the KF8 markup and return the spine."""
        return expand_mobi8_markup(self, resource_map, self.log)

    def write_opf(self, guide, toc, spine, resource_map):
        """Write ``metadata.opf`` and ``toc.ncx``; return the OPF file name."""
        mi = self.header.exth.mi
        if (self.cover_offset is not None and self.cover_offset <
                len(resource_map)):
            mi.cover = resource_map[self.cover_offset]

        opf = OPFCreator(os.getcwdu(), mi)
        opf.guide = guide
        opf.create_manifest_from_files_in([os.getcwdu()])
        opf.create_spine(spine)
        opf.set_toc(toc)

        with open('metadata.opf', 'wb') as of, open('toc.ncx', 'wb') as ncx:
            opf.render(of, ncx, 'toc.ncx')
        return 'metadata.opf'

View File

@ -0,0 +1,84 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.mobi.utils import to_base
from calibre.ebooks.mobi.reader.headers import NULL_INDEX
from calibre.ebooks.mobi.reader.index import read_index
# Maps an index tag number to [name of the entry field, index of the value
# to take from the tag's value list]
tag_fieldname_map = {
    1:  ['pos', 0],
    2:  ['len', 0],
    3:  ['noffs', 0],
    4:  ['hlvl', 0],
    5:  ['koffs', 0],
    6:  ['pos_fid', 0],
    21: ['parent', 0],
    22: ['child1', 0],
    23: ['childn', 0],
}
def read_ncx(sections, index, codec):
    """Read the NCX index and return its entries as a list of dicts.

    Every entry starts out with default values; fields are then filled in
    from the tags present for that entry. An empty list is returned when
    ``index`` is NULL_INDEX.
    """
    if index == NULL_INDEX:
        return []

    table, cncx = read_index(sections, index, codec)
    entries = []
    for num, (text, tag_map) in enumerate(table.iteritems()):
        entry = {
                'name': text,
                'pos':  -1,
                'len':  0,
                'noffs': -1,
                'text' : "Unknown Text",
                'hlvl' : -1,
                'kind' : "Unknown Kind",
                'pos_fid' : None,
                'parent' : -1,
                'child1' : -1,
                'childn' : -1,
                'num'  : num
        }

        for tag, (fieldname, i) in tag_fieldname_map.items():
            if tag not in tag_map:
                continue
            value = tag_map[tag][i]
            if tag == 6:
                # pos_fid is stored as a base 32 string
                value = to_base(value, base=32)
            entry[fieldname] = value
            # Tags 3 and 5 are offsets of strings in the CNCX
            if tag == 3:
                entry['text'] = cncx.get(value, 'Unknown Text')
            elif tag == 5:
                entry['kind'] = cncx.get(value, 'Unknown Kind')
        entries.append(entry)

    return entries
def build_toc(index_entries):
    """Build a TOC tree from the entries returned by read_ncx().

    Entries must also carry ``href`` and ``idtag`` keys. They are added
    level by level (lowest ``hlvl`` first) so that the parent of an entry
    exists before the entry itself is added.
    """
    root = TOC(base_path=os.getcwdu())
    # entry number -> TOC node, -1 being the root
    nodes = {-1: root}

    # sorted() is stable, so entries of the same level keep their order
    for entry in sorted(index_entries, key=lambda e: e['hlvl']):
        parent = nodes[entry['parent']]
        nodes[entry['num']] = parent.add_item(entry['href'], entry['idtag'],
                entry['text'])

    # Set play orders in depth first order
    for order, node in enumerate(root.flat()):
        node.play_order = order

    return root

View File

@ -7,7 +7,7 @@ __license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>' __copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import struct import struct, string
from collections import OrderedDict from collections import OrderedDict
from calibre.utils.magick.draw import Image, save_cover_data_to, thumbnail from calibre.utils.magick.draw import Image, save_cover_data_to, thumbnail
@ -340,4 +340,26 @@ def detect_periodical(toc, log=None):
return False return False
return True return True
def count_set_bits(num):
    """Return the number of 1 bits in the binary form of abs(num)."""
    return bin(abs(num)).count('1')
def to_base(num, base=32):
    """Return the integer ``num`` as a string in the given base.

    Digits are 0-9 followed by A-Z; negative numbers get a leading ``-``.

    :param num: The integer to convert
    :param base: Target base, from 2 to 36 inclusive
    :raises ValueError: If base is outside the supported range
    """
    digits = string.digits + string.ascii_uppercase
    # A base below 2 would never terminate and a base above 36 would run
    # out of digit characters
    if not 2 <= base <= len(digits):
        raise ValueError('base must be between 2 and %d, not %r' % (
            len(digits), base))
    if num == 0:
        return '0'
    negative = num < 0
    num = abs(num)
    ans = []
    while num:
        num, rem = divmod(num, base)
        ans.append(digits[rem])
    if negative:
        ans.append('-')
    ans.reverse()
    return ''.join(ans)