Support for reading KF8

Kovid Goyal 2012-03-09 21:30:24 +05:30
parent 93bf57e6c4
commit d1b6bb705d
13 changed files with 1426 additions and 293 deletions

View File

@@ -263,7 +263,7 @@ class MOBIMetadataReader(MetadataReaderPlugin):
description = _('Read metadata from %s files')%'MOBI'
def get_metadata(self, stream, ftype):
from calibre.ebooks.mobi.reader import get_metadata
from calibre.ebooks.metadata.mobi import get_metadata
return get_metadata(stream)
class ODTMetadataReader(MetadataReaderPlugin):

View File

@@ -10,7 +10,7 @@ Generates and writes an APNX page mapping file.
import struct
from calibre.ebooks.mobi.reader import MobiReader
from calibre.ebooks.mobi.reader.mobi6 import MobiReader
from calibre.ebooks.pdb.header import PdbHeaderReader
from calibre.utils.logging import default_log

View File

@@ -3,7 +3,10 @@ __license__ = 'GPL 3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os
from calibre.customize.conversion import InputFormatPlugin
from calibre.ptempfile import PersistentTemporaryDirectory
class MOBIInput(InputFormatPlugin):
@@ -14,17 +17,43 @@ class MOBIInput(InputFormatPlugin):
def convert(self, stream, options, file_ext, log,
accelerators):
from calibre.ebooks.mobi.reader import MobiReader
if os.environ.get('USE_MOBIUNPACK', None) is not None:
try:
from mobiunpack.mobi_unpack import Mobi8Reader
from calibre.customize.ui import plugin_for_input_format
wdir = PersistentTemporaryDirectory('_unpack_space')
m8r = Mobi8Reader(stream, wdir)
if m8r.isK8():
epub_path = m8r.processMobi8()
epub_input = plugin_for_input_format('epub')
for opt in epub_input.options:
setattr(options, opt.option.name, opt.recommended_value)
options.input_encoding = m8r.getCodec()
return epub_input.convert(open(epub_path,'rb'), options,
'epub', log, accelerators)
except Exception:
log.exception('mobi_unpack code not working')
from calibre.ebooks.mobi.reader.mobi6 import MobiReader
from lxml import html
parse_cache = {}
try:
mr = MobiReader(stream, log, options.input_encoding,
options.debug_pipeline)
mr.extract_content(u'.', parse_cache)
if mr.kf8_type is None:
mr.extract_content(u'.', parse_cache)
except:
mr = MobiReader(stream, log, options.input_encoding,
options.debug_pipeline, try_extra_data_fix=True)
mr.extract_content(u'.', parse_cache)
if mr.kf8_type is None:
mr.extract_content(u'.', parse_cache)
if mr.kf8_type is not None:
from calibre.ebooks.mobi.reader.mobi8 import Mobi8Reader
return os.path.abspath(Mobi8Reader(mr, log)())
raw = parse_cache.pop('calibre_raw_mobi_markup', False)
if raw:

View File

@@ -9,6 +9,7 @@ __copyright__ = '2009, Kovid Goyal kovid@kovidgoyal.net and ' \
'Marshall T. Vandegrift <llasram@gmail.com>'
__docformat__ = 'restructuredtext en'
import os, cStringIO
from struct import pack, unpack
from cStringIO import StringIO
@@ -433,3 +434,75 @@ def set_metadata(stream, mi):
mu = MetadataUpdater(stream)
mu.update(mi)
return
def get_metadata(stream):
from calibre.ebooks.metadata import MetaInformation
from calibre.ptempfile import TemporaryDirectory
from calibre.ebooks.mobi.reader.headers import MetadataHeader
from calibre.ebooks.mobi.reader.mobi6 import MobiReader
from calibre import CurrentDir
try:
from PIL import Image as PILImage
PILImage
except ImportError:
import Image as PILImage
stream.seek(0)
try:
raw = stream.read(3)
except:
raw = ''
stream.seek(0)
if raw == b'TPZ':
from calibre.ebooks.metadata.topaz import get_metadata
return get_metadata(stream)
from calibre.utils.logging import Log
log = Log()
try:
mi = MetaInformation(os.path.basename(stream.name), [_('Unknown')])
except:
mi = MetaInformation(_('Unknown'), [_('Unknown')])
mh = MetadataHeader(stream, log)
if mh.title and mh.title != _('Unknown'):
mi.title = mh.title
if mh.exth is not None:
if mh.exth.mi is not None:
mi = mh.exth.mi
else:
size = 1024**3
if hasattr(stream, 'seek') and hasattr(stream, 'tell'):
pos = stream.tell()
stream.seek(0, 2)
size = stream.tell()
stream.seek(pos)
if size < 4*1024*1024:
with TemporaryDirectory('_mobi_meta_reader') as tdir:
with CurrentDir(tdir):
mr = MobiReader(stream, log)
parse_cache = {}
mr.extract_content(tdir, parse_cache)
if mr.embedded_mi is not None:
mi = mr.embedded_mi
if hasattr(mh.exth, 'cover_offset'):
cover_index = mh.first_image_index + mh.exth.cover_offset
data = mh.section_data(int(cover_index))
else:
try:
data = mh.section_data(mh.first_image_index)
except:
data = ''
buf = cStringIO.StringIO(data)
try:
im = PILImage.open(buf)
except:
log.exception('Failed to read MOBI cover')
else:
obuf = cStringIO.StringIO()
im.convert('RGB').save(obuf, format='JPEG')
mi.cover_data = ('jpg', obuf.getvalue())
return mi
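For reference, a hedged usage sketch of the relocated get_metadata() ('book.mobi' is a placeholder path, not from this commit):

from calibre.ebooks.metadata.mobi import get_metadata

with open('book.mobi', 'rb') as stream:
    mi = get_metadata(stream)
    print(mi.title, mi.authors)  # a found cover lands in mi.cover_data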

View File

@@ -46,7 +46,7 @@ class TOC(list):
self.toc_thumbnail = toc_thumbnail
def __str__(self):
lines = ['TOC: %s#%s'%(self.href, self.fragment)]
lines = ['TOC: %s#%s %s'%(self.href, self.fragment, self.text)]
for child in self:
c = str(child).splitlines()
for l in c:

View File

@@ -0,0 +1,11 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

View File

@@ -0,0 +1,258 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (absolute_import, print_function)
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import struct, re, os
from calibre import replace_entities
from calibre.utils.date import parse_date
from calibre.ebooks.mobi import MobiError
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.mobi.langcodes import main_language, sub_language, mobi2iana
NULL_INDEX = 0xffffffff
class EXTHHeader(object): # {{{
def __init__(self, raw, codec, title):
self.doctype = raw[:4]
self.length, self.num_items = struct.unpack('>LL', raw[4:12])
raw = raw[12:]
pos = 0
self.mi = MetaInformation(_('Unknown'), [_('Unknown')])
self.has_fake_cover = True
self.start_offset = None
left = self.num_items
while left > 0:
left -= 1
id, size = struct.unpack('>LL', raw[pos:pos + 8])
content = raw[pos + 8:pos + size]
pos += size
if id >= 100 and id < 200:
self.process_metadata(id, content, codec)
elif id == 203:
self.has_fake_cover = bool(struct.unpack('>L', content)[0])
elif id == 201:
co, = struct.unpack('>L', content)
if co < NULL_INDEX:
self.cover_offset = co
elif id == 202:
self.thumbnail_offset, = struct.unpack('>L', content)
elif id == 501:
# cdetype
pass
elif id == 502:
# last update time
pass
elif id == 503: # Long title
# Amazon seems to regard this as the definitive book title
# rather than the title from the PDB header. In fact, when
# sending MOBI files through Amazon's email service, titles
# containing non-ASCII or non-filename-safe characters get
# mangled in the PDB header
try:
title = content.decode(codec)
except:
pass
#else:
# print 'unknown record', id, repr(content)
if title:
self.mi.title = replace_entities(title)
def process_metadata(self, id, content, codec):
if id == 100:
if self.mi.authors == [_('Unknown')]:
self.mi.authors = []
au = content.decode(codec, 'ignore').strip()
self.mi.authors.append(au)
if re.match(r'\S+?\s*,\s+\S+', au.strip()):
self.mi.author_sort = au.strip()
elif id == 101:
self.mi.publisher = content.decode(codec, 'ignore').strip()
elif id == 103:
self.mi.comments = content.decode(codec, 'ignore')
elif id == 104:
self.mi.isbn = content.decode(codec, 'ignore').strip().replace('-', '')
elif id == 105:
if not self.mi.tags:
self.mi.tags = []
self.mi.tags.extend([x.strip() for x in content.decode(codec,
'ignore').split(';')])
self.mi.tags = list(set(self.mi.tags))
elif id == 106:
try:
self.mi.pubdate = parse_date(content, as_utc=False)
except:
pass
elif id == 108:
pass # Producer
elif id == 113:
pass # ASIN or UUID
elif id == 116:
self.start_offset, = struct.unpack(b'>L', content)
#else:
# print 'unhandled metadata record', id, repr(content)
# }}}
class BookHeader(object):
def __init__(self, raw, ident, user_encoding, log, try_extra_data_fix=False):
self.log = log
self.compression_type = raw[:2]
self.records, self.records_size = struct.unpack('>HH', raw[8:12])
self.encryption_type, = struct.unpack('>H', raw[12:14])
if ident == 'TEXTREAD':
self.codepage = 1252
if len(raw) <= 16:
self.codec = 'cp1252'
self.extra_flags = 0
self.title = _('Unknown')
self.language = 'ENGLISH'
self.sublanguage = 'NEUTRAL'
self.exth_flag, self.exth = 0, None
self.ancient = True
self.first_image_index = -1
self.mobi_version = 1
else:
self.ancient = False
self.doctype = raw[16:20]
self.length, self.type, self.codepage, self.unique_id, \
self.version = struct.unpack('>LLLLL', raw[20:40])
try:
self.codec = {
1252: 'cp1252',
65001: 'utf-8',
}[self.codepage]
except (IndexError, KeyError):
self.codec = 'cp1252' if not user_encoding else user_encoding
log.warn('Unknown codepage %d. Assuming %s' % (self.codepage,
self.codec))
# There exists some broken DRM removal tool that removes DRM but
# leaves the DRM fields in the header yielding a header size of
# 0xF8. The actual value of max_header_length should be 0xE8 but
# it's changed to accommodate this silly tool. Hopefully that will
# not break anything else.
max_header_length = 0xF8
if (ident == 'TEXTREAD' or self.length < 0xE4 or
self.length > max_header_length or
(try_extra_data_fix and self.length == 0xE4)):
self.extra_flags = 0
else:
self.extra_flags, = struct.unpack('>H', raw[0xF2:0xF4])
if self.compression_type == 'DH':
self.huff_offset, self.huff_number = struct.unpack('>LL',
raw[0x70:0x78])
toff, tlen = struct.unpack('>II', raw[0x54:0x5c])
tend = toff + tlen
self.title = raw[toff:tend] if tend < len(raw) else _('Unknown')
langcode = struct.unpack('!L', raw[0x5C:0x60])[0]
langid = langcode & 0xFF
sublangid = (langcode >> 10) & 0xFF
self.language = main_language.get(langid, 'ENGLISH')
self.sublanguage = sub_language.get(sublangid, 'NEUTRAL')
self.mobi_version = struct.unpack('>I', raw[0x68:0x6c])[0]
self.first_image_index = struct.unpack('>L', raw[0x6c:0x6c + 4])[0]
self.exth_flag, = struct.unpack('>L', raw[0x80:0x84])
self.exth = None
if not isinstance(self.title, unicode):
self.title = self.title.decode(self.codec, 'replace')
if self.exth_flag & 0x40:
try:
self.exth = EXTHHeader(raw[16 + self.length:], self.codec,
self.title)
self.exth.mi.uid = self.unique_id
try:
self.exth.mi.language = mobi2iana(langid, sublangid)
except:
self.log.exception('Unknown language code')
except:
self.log.exception('Invalid EXTH header')
self.exth_flag = 0
self.ncxidx = NULL_INDEX
if len(raw) >= 0xF8:
self.ncxidx, = struct.unpack_from(b'>L', raw, 0xF4)
if self.mobi_version >= 8:
self.skelidx, = struct.unpack_from('>L', raw, 0xFC)
# Index into <div> sections in raw_ml
self.dividx, = struct.unpack_from('>L', raw, 0xF8)
# Index into Other files
self.othidx, = struct.unpack_from('>L', raw, 0x104)
# need to use the FDST record to find out how to properly
# unpack the raw_ml into pieces; it is simply a table of start
# and end locations for each flow piece
self.fdstidx, = struct.unpack_from('>L', raw, 0xC0)
self.fdstcnt, = struct.unpack_from('>L', raw, 0xC4)
# if cnt is 1 or less, fdst section number can be garbage
if self.fdstcnt <= 1:
self.fdstidx = NULL_INDEX
else: # Null values
self.skelidx = self.dividx = self.othidx = self.fdstidx = \
NULL_INDEX
class MetadataHeader(BookHeader):
def __init__(self, stream, log):
self.stream = stream
self.ident = self.identity()
self.num_sections = self.section_count()
if self.num_sections >= 2:
header = self.header()
BookHeader.__init__(self, header, self.ident, None, log)
else:
self.exth = None
def identity(self):
self.stream.seek(60)
ident = self.stream.read(8).upper()
if ident not in ['BOOKMOBI', 'TEXTREAD']:
raise MobiError('Unknown book type: %s' % ident)
return ident
def section_count(self):
self.stream.seek(76)
return struct.unpack('>H', self.stream.read(2))[0]
def section_offset(self, number):
self.stream.seek(78 + number * 8)
return struct.unpack('>LBBBB', self.stream.read(8))[0]
def header(self):
section_headers = []
# First section with the metadata
section_headers.append(self.section_offset(0))
# Second section used to get the length of the first
section_headers.append(self.section_offset(1))
end_off = section_headers[1]
off = section_headers[0]
self.stream.seek(off)
return self.stream.read(end_off - off)
def section_data(self, number):
start = self.section_offset(number)
if number == self.num_sections -1:
end = os.stat(self.stream.name).st_size
else:
end = self.section_offset(number + 1)
self.stream.seek(start)
try:
return self.stream.read(end - start)
except OverflowError:
self.stream.seek(start)
return self.stream.read()
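The EXTH walk in EXTHHeader.__init__ is the fiddly part of this file; reduced to a self-contained sketch (the helper name is mine), the layout is exactly as parsed above: b'EXTH', a length, a record count, then (id, size, payload) records where size includes the 8-byte record header:

import struct

def iter_exth_records(raw):
    # raw starts at the EXTH header, i.e. at offset 16 + length in the
    # MOBI header, exactly as EXTHHeader receives it above.
    num_items, = struct.unpack('>L', raw[8:12])
    raw, pos = raw[12:], 0
    for _ in range(num_items):
        rec_id, size = struct.unpack('>LL', raw[pos:pos + 8])
        yield rec_id, raw[pos + 8:pos + size]
        pos += size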

View File

@@ -0,0 +1,195 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import struct
from collections import OrderedDict
from calibre.ebooks.mobi.utils import decint, count_set_bits
class InvalidFile(ValueError):
pass
def check_signature(data, signature):
if data[:len(signature)] != signature:
raise InvalidFile('Not a valid %r section'%signature)
class NotAnINDXRecord(InvalidFile):
pass
class NotATAGXSection(InvalidFile):
pass
def format_bytes(byts):
byts = bytearray(byts)
byts = [hex(b)[2:] for b in byts]
return ' '.join(byts)
def parse_indx_header(data):
check_signature(data, b'INDX')
words = (
'len', 'nul1', 'type', 'gen', 'start', 'count', 'code',
'lng', 'total', 'ordt', 'ligt', 'nligt', 'ncncx'
)
num = len(words)
values = struct.unpack(b'>%dL' % num, data[4:4*(num+1)])
header = {words[i]:values[i] for i in xrange(num)}
return header
class CNCX(object): # {{{
'''
Parses the records that contain the compiled NCX (all strings from the
NCX). Presents a simple offset : string mapping interface to access the
data.
'''
def __init__(self, records, codec):
self.records = OrderedDict()
record_offset = 0
for raw in records:
pos = 0
while pos < len(raw):
length, consumed = decint(raw[pos:])
if length > 0:
try:
self.records[pos+record_offset] = raw[
pos+consumed:pos+consumed+length].decode(codec)
except:
byts = raw[pos:]
r = format_bytes(byts)
print ('CNCX entry at offset %d has unknown format %s'%(
pos+record_offset, r))
self.records[pos+record_offset] = r
pos = len(raw)
pos += consumed+length
record_offset += 0x10000
def __getitem__(self, offset):
return self.records.get(offset)
def get(self, offset, default=None):
return self.records.get(offset, default)
# }}}
def parse_tag_section(data):
check_signature(data, b'TAGX')
tags = []
first_entry_offset, = struct.unpack_from(b'>L', data, 0x04)
control_byte_count, = struct.unpack_from(b'>L', data, 0x08)
# Skip the first 12 bytes already read above.
for i in xrange(12, first_entry_offset, 4):
pos = i
tags.append((ord(data[pos]), ord(data[pos+1]), ord(data[pos+2]),
ord(data[pos+3])))
return control_byte_count, tags
def get_tag_map(control_byte_count, tags, data, start, end):
ptags = []
ans = {}
control_byte_index = 0
data_start = start + control_byte_count
for tag, values_per_entry, mask, end_flag in tags:
if end_flag == 0x01:
control_byte_index += 1
continue
value = ord(data[start + control_byte_index]) & mask
if value != 0:
if value == mask:
if count_set_bits(mask) > 1:
# If all bits of masked value are set and the mask has more than one bit, a variable width value
# will follow after the control bytes which defines the length of bytes (NOT the value count!)
# which will contain the corresponding variable width values.
value, consumed = decint(data[data_start:])
data_start += consumed
ptags.append((tag, None, value, values_per_entry))
else:
ptags.append((tag, 1, None, values_per_entry))
else:
# Shift bits to get the masked value.
while mask & 0x01 == 0:
mask = mask >> 1
value = value >> 1
ptags.append((tag, value, None, values_per_entry))
for tag, value_count, value_bytes, values_per_entry in ptags:
values = []
if value_count is not None:
# Read value_count * values_per_entry variable width values.
for _ in xrange(value_count*values_per_entry):
byts, consumed = decint(data[data_start:])
data_start += consumed
values.append(byts)
else:
# Convert value_bytes to variable width values.
total_consumed = 0
while total_consumed < value_bytes:
# Does this work for values_per_entry != 1?
byts, consumed = decint(data[data_start:])
data_start += consumed
total_consumed += consumed
values.append(byts)
if total_consumed != value_bytes:
print ("Error: Should consume %s bytes, but consumed %s" %
(value_bytes, total_consumed))
ans[tag] = values
# Test that all bytes have been processed if end is given.
if end is not None and data_start < end:
# The last entry might have some zero padding bytes, so complain only if non-zero bytes are left.
rest = data[data_start:end]
if rest.replace(b'\0', b''):
print ("Warning: There are unprocessed index bytes left: %s" %
format_bytes(rest))
return ans
def read_index(sections, idx, codec):
table, cncx = OrderedDict(), CNCX([], codec)
data = sections[idx][0]
indx_header = parse_indx_header(data)
indx_count = indx_header['count']
if indx_header['ncncx'] > 0:
off = idx + indx_count + 1
cncx_records = [x[0] for x in sections[off:off+indx_header['ncncx']]]
cncx = CNCX(cncx_records, codec)
tag_section_start = indx_header['len']
control_byte_count, tags = parse_tag_section(data[tag_section_start:])
for i in xrange(idx + 1, idx + 1 + indx_count):
data = sections[i][0]
header = parse_indx_header(data)
idxt_pos = header['start']
entry_count = header['count']
# loop through to build up the IDXT position starts
idx_positions= []
for j in xrange(entry_count):
pos, = struct.unpack_from(b'>H', data, idxt_pos + 4 + (2 * j))
idx_positions.append(pos)
# The last entry ends before the IDXT tag (but there might be zero fill
# bytes we need to ignore!)
idx_positions.append(idxt_pos)
# For each entry in the IDXT build up the tag map and any associated
# text
for j in xrange(entry_count):
start, end = idx_positions[j:j+2]
text_length = ord(data[start])
text = data[start+1:start+1+text_length]
tag_map = get_tag_map(control_byte_count, tags, data,
start+1+text_length, end)
table[text] = tag_map
return table, cncx
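Everything here leans on decint() from calibre.ebooks.mobi.utils for MOBI's variable-width integers. A sketch of the forward variant these callers rely on, under the assumption (not shown in this commit) that values are big-endian, 7 bits per byte, with the high bit marking the final byte:

def decint(raw):
    # Returns (value, bytes_consumed), the shape the callers above expect.
    val = 0
    for i, b in enumerate(bytearray(raw)):
        val = (val << 7) | (b & 0x7f)
        if b & 0x80:
            return val, i + 1
    raise ValueError('no terminating byte in variable-width integer')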

View File

@@ -0,0 +1,307 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import re, os
def update_internal_links(mobi8_reader):
# need to update all internal links that are based on positions
# within the xhtml files **BEFORE** cutting and pasting any
# pieces into the xhtml text files
# kindle:pos:fid:XXXX:off:YYYYYYYYYY (used for internal links within xhtml)
# XXXX is the offset in records into the divtbl
# YYYYYYYYYY is a base32 number you add to the divtbl insertpos
# to get the final position
mr = mobi8_reader
# pos:fid pattern
posfid_pattern = re.compile(br'''(<a.*?href=.*?>)''', re.IGNORECASE)
posfid_index_pattern = re.compile(br'''['"]kindle:pos:fid:([0-9|A-V]+):off:([0-9|A-V]+).*?["']''')
parts = []
for part in mr.parts:
srcpieces = posfid_pattern.split(part)
for j in xrange(1, len(srcpieces), 2):
tag = srcpieces[j]
if tag.startswith(b'<'):
for m in posfid_index_pattern.finditer(tag):
posfid = m.group(1)
offset = m.group(2)
filename, idtag = mr.get_id_tag_by_pos_fid(posfid, offset)
suffix = (b'#' + idtag) if idtag else b''
replacement = filename.encode(mr.header.codec) + suffix
tag = posfid_index_pattern.sub(replacement, tag, 1)
srcpieces[j] = tag
part = ''.join([x.decode(mr.header.codec) for x in srcpieces])
parts.append(part)
# All parts are now unicode and have no internal links
return parts
def remove_kindlegen_markup(parts):
# we can safely remove all of the Kindlegen generated aid tags
find_tag_with_aid_pattern = re.compile(r'''(<[^>]*\said\s*=[^>]*>)''',
re.IGNORECASE)
within_tag_aid_position_pattern = re.compile(r'''\said\s*=['"][^'"]*['"]''')
for i in xrange(len(parts)):
part = parts[i]
srcpieces = find_tag_with_aid_pattern.split(part)
for j in range(len(srcpieces)):
tag = srcpieces[j]
if tag.startswith('<'):
for m in within_tag_aid_position_pattern.finditer(tag):
replacement = ''
tag = within_tag_aid_position_pattern.sub(replacement, tag,
1)
srcpieces[j] = tag
part = "".join(srcpieces)
parts[i] = part
# we can safely remove all of the Kindlegen generated data-AmznPageBreak tags
find_tag_with_AmznPageBreak_pattern = re.compile(
r'''(<[^>]*\sdata-AmznPageBreak=[^>]*>)''', re.IGNORECASE)
within_tag_AmznPageBreak_position_pattern = re.compile(
r'''\sdata-AmznPageBreak=['"][^'"]*['"]''')
for i in xrange(len(parts)):
part = parts[i]
srcpieces = find_tag_with_AmznPageBreak_pattern.split(part)
for j in range(len(srcpieces)):
tag = srcpieces[j]
if tag.startswith('<'):
for m in within_tag_AmznPageBreak_position_pattern.finditer(tag):
replacement = ''
tag = within_tag_AmznPageBreak_position_pattern.sub(replacement, tag, 1)
srcpieces[j] = tag
part = "".join(srcpieces)
parts[i] = part
def update_flow_links(mobi8_reader, resource_map, log):
# kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images)
# kindle:flow:XXXX?mime=YYYY/ZZZ (used for style sheets, svg images, etc)
# kindle:embed:XXXX (used for fonts)
mr = mobi8_reader
flows = []
img_pattern = re.compile(r'''(<[img\s|image\s][^>]*>)''', re.IGNORECASE)
img_index_pattern = re.compile(r'''['"]kindle:embed:([0-9|A-V]+)[^'"]*['"]''', re.IGNORECASE)
tag_pattern = re.compile(r'''(<[^>]*>)''')
flow_pattern = re.compile(r'''['"]kindle:flow:([0-9|A-V]+)\?mime=([^'"]+)['"]''', re.IGNORECASE)
url_pattern = re.compile(r'''(url\(.*?\))''', re.IGNORECASE)
url_img_index_pattern = re.compile(r'''kindle:embed:([0-9|A-V]+)\?mime=image/[^\)]*''', re.IGNORECASE)
font_index_pattern = re.compile(r'''kindle:embed:([0-9|A-V]+)''', re.IGNORECASE)
url_css_index_pattern = re.compile(r'''kindle:flow:([0-9|A-V]+)\?mime=text/css[^\)]*''', re.IGNORECASE)
for flow in mr.flows:
if flow is None: # 0th flow is None
flows.append(flow)
continue
if not isinstance(flow, unicode):
flow = flow.decode(mr.header.codec)
# links to raster image files from image tags
# image_pattern
srcpieces = img_pattern.split(flow)
for j in range(1, len(srcpieces), 2):
tag = srcpieces[j]
if tag.startswith('<im'):
for m in img_index_pattern.finditer(tag):
num = int(m.group(1), 32)
href = resource_map[num-1]
if href:
replacement = '"%s"'%('../'+ href)
tag = img_index_pattern.sub(replacement, tag, 1)
else:
log.warn('Referenced image %s was not recognized '
'as a valid image in %s' % (num, tag))
srcpieces[j] = tag
flow = "".join(srcpieces)
# replacements inside css url():
srcpieces = url_pattern.split(flow)
for j in range(1, len(srcpieces), 2):
tag = srcpieces[j]
# process links to raster image files
for m in url_img_index_pattern.finditer(tag):
num = int(m.group(1), 32)
href = resource_map[num-1]
if href:
replacement = '"%s"'%('../'+ href)
tag = url_img_index_pattern.sub(replacement, tag, 1)
else:
log.warn('Referenced image %s was not recognized as a '
'valid image in %s' % (num, tag))
# process links to fonts
for m in font_index_pattern.finditer(tag):
num = int(m.group(1), 32)
href = resource_map[num-1]
if href is None:
log.warn('Referenced font %s was not recognized as a '
'valid font in %s' % (num, tag))
else:
replacement = '"%s"'%('../'+ href)
tag = font_index_pattern.sub(replacement, tag, 1)
# process links to other css pieces
for m in url_css_index_pattern.finditer(tag):
num = int(m.group(1), 32)
fi = mr.flowinfo[num]
replacement = '"../' + fi.dir + '/' + fi.fname + '"'
tag = url_css_index_pattern.sub(replacement, tag, 1)
srcpieces[j] = tag
flow = "".join(srcpieces)
# flow pattern not inside url()
srcpieces = re.split(tag_pattern, flow)
for j in range(1, len(srcpieces), 2):
tag = srcpieces[j]
if tag.startswith('<'):
for m in re.finditer(flow_pattern, tag):
num = int(m.group(1), 32)
fi = mr.flowinfo[num]
if fi.format == 'inline':
flowtext = mr.flows[num]
tag = flowtext
else:
replacement = '"../' + fi.dir + '/' + fi.fname + '"'
tag = flow_pattern.sub(replacement, tag, 1)
srcpieces[j] = tag
flow = "".join(srcpieces)
flows.append(flow)
# All flows are now unicode and have links resolved
return flows
def insert_flows_into_markup(parts, flows, mobi8_reader):
mr = mobi8_reader
# kindle:flow:XXXX?mime=YYYY/ZZZ (used for style sheets, svg images, etc)
tag_pattern = re.compile(r'''(<[^>]*>)''')
flow_pattern = re.compile(r'''['"]kindle:flow:([0-9|A-V]+)\?mime=([^'"]+)['"]''', re.IGNORECASE)
for i in xrange(len(parts)):
part = parts[i]
# flow pattern
srcpieces = tag_pattern.split(part)
for j in range(1, len(srcpieces),2):
tag = srcpieces[j]
if tag.startswith('<'):
for m in flow_pattern.finditer(tag):
num = int(m.group(1), 32)
fi = mr.flowinfo[num]
if fi.format == 'inline':
tag = flows[num]
else:
replacement = '"../' + fi.dir + '/' + fi.fname + '"'
tag = flow_pattern.sub(replacement, tag, 1)
srcpieces[j] = tag
part = "".join(srcpieces)
# store away modified version
parts[i] = part
def insert_images_into_markup(parts, resource_map, log):
# Handle any embedded raster images links in the xhtml text
# kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images)
img_pattern = re.compile(r'''(<[img\s|image\s][^>]*>)''', re.IGNORECASE)
img_index_pattern = re.compile(r'''['"]kindle:embed:([0-9|A-V]+)[^'"]*['"]''')
for i in xrange(len(parts)):
part = parts[i]
#[partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i]
# links to raster image files
# image_pattern
srcpieces = img_pattern.split(part)
for j in range(1, len(srcpieces), 2):
tag = srcpieces[j]
if tag.startswith('<im'):
for m in img_index_pattern.finditer(tag):
num = int(m.group(1), 32)
href = resource_map[num-1]
if href:
replacement = '"%s"'%('../' + href)
tag = img_index_pattern.sub(replacement, tag, 1)
else:
log.warn('Referenced image %s was not recognized as '
'a valid image in %s' % (num, tag))
srcpieces[j] = tag
part = "".join(srcpieces)
# store away modified version
parts[i] = part
def upshift_markup(parts):
tag_pattern = re.compile(r'''(<(svg)[^>]*>)''', re.IGNORECASE)
for i in xrange(len(parts)):
part = parts[i]
# tag pattern
srcpieces = re.split(tag_pattern, part)
for j in range(1, len(srcpieces),2):
tag = srcpieces[j]
if tag[:4].lower() == '<svg':
tag = tag.replace('preserveaspectratio','preserveAspectRatio')
tag = tag.replace('viewbox','viewBox')
srcpieces[j] = tag
part = "".join(srcpieces)
# store away modified version
parts[i] = part
def expand_mobi8_markup(mobi8_reader, resource_map, log):
# First update all internal links that are based on offsets
parts = update_internal_links(mobi8_reader)
# Remove pointless markup inserted by kindlegen
remove_kindlegen_markup(parts)
# Handle substitutions for the flows pieces first as they may
# be inlined into the xhtml text
flows = update_flow_links(mobi8_reader, resource_map, log)
# Insert inline flows into the markup
insert_flows_into_markup(parts, flows, mobi8_reader)
# Insert raster images into markup
insert_images_into_markup(parts, resource_map, log)
# Perform general markup cleanups
upshift_markup(parts)
# Update the parts and flows stored in the reader
mobi8_reader.parts = parts
mobi8_reader.flows = flows
# write out the parts and file flows
os.mkdir('text') # directory containing all parts
spine = []
for i, part in enumerate(parts):
pi = mobi8_reader.partinfo[i]
with open(os.path.join(pi.type, pi.filename), 'wb') as f:
f.write(part.encode('utf-8'))
spine.append(f.name)
for i, flow in enumerate(flows):
fi = mobi8_reader.flowinfo[i]
if fi.format == 'file':
if not os.path.exists(fi.dir):
os.mkdir(fi.dir)
with open(os.path.join(fi.dir, fi.fname), 'wb') as f:
f.write(flow.encode('utf-8'))
return spine
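A note on all the int(m.group(1), 32) calls above: the kindle:pos:fid, kindle:flow and kindle:embed indices are base-32 numbers over the digits 0-9A-V, which Python's int() happens to parse directly (case-insensitively), for example:

int('0001', 32)        # -> 1: row in the div (elems) table
int('000000000A', 32)  # -> 10: offset added to the row's insert position
int('V', 32)           # -> 31: the highest single digit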

View File

@@ -1,10 +1,12 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
Read data from .mobi files
'''
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (absolute_import, print_function)
import shutil, os, re, struct, textwrap, cStringIO, sys
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import shutil, os, re, struct, textwrap, cStringIO
try:
from PIL import Image as PILImage
@@ -14,235 +16,22 @@ except ImportError:
from lxml import html, etree
from calibre import xml_entity_to_unicode, CurrentDir, entity_to_unicode, \
replace_entities
from calibre import (xml_entity_to_unicode, entity_to_unicode)
from calibre.utils.filenames import ascii_filename
from calibre.utils.date import parse_date
from calibre.utils.cleantext import clean_ascii_chars
from calibre.ptempfile import TemporaryDirectory
from calibre.ebooks import DRMError, unit_convert
from calibre.ebooks.chardet import ENCODING_PATS
from calibre.ebooks.mobi import MobiError
from calibre.ebooks.mobi.huffcdic import HuffReader
from calibre.ebooks.mobi.langcodes import main_language, sub_language, mobi2iana
from calibre.ebooks.compression.palmdoc import decompress_doc
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.metadata.opf2 import OPFCreator, OPF
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.mobi.reader.headers import BookHeader
class TopazError(ValueError):
pass
class EXTHHeader(object):
def __init__(self, raw, codec, title):
self.doctype = raw[:4]
self.length, self.num_items = struct.unpack('>LL', raw[4:12])
raw = raw[12:]
pos = 0
self.mi = MetaInformation(_('Unknown'), [_('Unknown')])
self.has_fake_cover = True
left = self.num_items
while left > 0:
left -= 1
id, size = struct.unpack('>LL', raw[pos:pos + 8])
content = raw[pos + 8:pos + size]
pos += size
if id >= 100 and id < 200:
self.process_metadata(id, content, codec)
elif id == 203:
self.has_fake_cover = bool(struct.unpack('>L', content)[0])
elif id == 201:
co, = struct.unpack('>L', content)
if co < 1e7:
self.cover_offset = co
elif id == 202:
self.thumbnail_offset, = struct.unpack('>L', content)
elif id == 501:
# cdetype
pass
elif id == 502:
# last update time
pass
elif id == 503: # Long title
# Amazon seems to regard this as the definitive book title
# rather than the title from the PDB header. In fact when
# sending MOBI files through Amazon's email service if the
# title contains non ASCII chars or non filename safe chars
# they are messed up in the PDB header
try:
title = content.decode(codec)
except:
pass
#else:
# print 'unknown record', id, repr(content)
if title:
self.mi.title = replace_entities(title)
def process_metadata(self, id, content, codec):
if id == 100:
if self.mi.authors == [_('Unknown')]:
self.mi.authors = []
au = content.decode(codec, 'ignore').strip()
self.mi.authors.append(au)
if re.match(r'\S+?\s*,\s+\S+', au.strip()):
self.mi.author_sort = au.strip()
elif id == 101:
self.mi.publisher = content.decode(codec, 'ignore').strip()
elif id == 103:
self.mi.comments = content.decode(codec, 'ignore')
elif id == 104:
self.mi.isbn = content.decode(codec, 'ignore').strip().replace('-', '')
elif id == 105:
if not self.mi.tags:
self.mi.tags = []
self.mi.tags.extend([x.strip() for x in content.decode(codec,
'ignore').split(';')])
self.mi.tags = list(set(self.mi.tags))
elif id == 106:
try:
self.mi.pubdate = parse_date(content, as_utc=False)
except:
pass
elif id == 108:
pass # Producer
elif id == 113:
pass # ASIN or UUID
#else:
# print 'unhandled metadata record', id, repr(content)
class BookHeader(object):
def __init__(self, raw, ident, user_encoding, log, try_extra_data_fix=False):
self.log = log
self.compression_type = raw[:2]
self.records, self.records_size = struct.unpack('>HH', raw[8:12])
self.encryption_type, = struct.unpack('>H', raw[12:14])
if ident == 'TEXTREAD':
self.codepage = 1252
if len(raw) <= 16:
self.codec = 'cp1252'
self.extra_flags = 0
self.title = _('Unknown')
self.language = 'ENGLISH'
self.sublanguage = 'NEUTRAL'
self.exth_flag, self.exth = 0, None
self.ancient = True
self.first_image_index = -1
self.mobi_version = 1
else:
self.ancient = False
self.doctype = raw[16:20]
self.length, self.type, self.codepage, self.unique_id, \
self.version = struct.unpack('>LLLLL', raw[20:40])
try:
self.codec = {
1252: 'cp1252',
65001: 'utf-8',
}[self.codepage]
except (IndexError, KeyError):
self.codec = 'cp1252' if not user_encoding else user_encoding
log.warn('Unknown codepage %d. Assuming %s' % (self.codepage,
self.codec))
# There exists some broken DRM removal tool that removes DRM but
# leaves the DRM fields in the header yielding a header size of
# 0xF8. The actual value of max_header_length should be 0xE8 but
# it's changed to accommodate this silly tool. Hopefully that will
# not break anything else.
max_header_length = 0xF8
if (ident == 'TEXTREAD' or self.length < 0xE4 or
self.length > max_header_length or
(try_extra_data_fix and self.length == 0xE4)):
self.extra_flags = 0
else:
self.extra_flags, = struct.unpack('>H', raw[0xF2:0xF4])
if self.compression_type == 'DH':
self.huff_offset, self.huff_number = struct.unpack('>LL', raw[0x70:0x78])
toff, tlen = struct.unpack('>II', raw[0x54:0x5c])
tend = toff + tlen
self.title = raw[toff:tend] if tend < len(raw) else _('Unknown')
langcode = struct.unpack('!L', raw[0x5C:0x60])[0]
langid = langcode & 0xFF
sublangid = (langcode >> 10) & 0xFF
self.language = main_language.get(langid, 'ENGLISH')
self.sublanguage = sub_language.get(sublangid, 'NEUTRAL')
self.mobi_version = struct.unpack('>I', raw[0x68:0x6c])[0]
self.first_image_index = struct.unpack('>L', raw[0x6c:0x6c + 4])[0]
self.exth_flag, = struct.unpack('>L', raw[0x80:0x84])
self.exth = None
if not isinstance(self.title, unicode):
self.title = self.title.decode(self.codec, 'replace')
if self.exth_flag & 0x40:
try:
self.exth = EXTHHeader(raw[16 + self.length:], self.codec, self.title)
self.exth.mi.uid = self.unique_id
try:
self.exth.mi.language = mobi2iana(langid, sublangid)
except:
self.log.exception('Unknown language code')
except:
self.log.exception('Invalid EXTH header')
self.exth_flag = 0
class MetadataHeader(BookHeader):
def __init__(self, stream, log):
self.stream = stream
self.ident = self.identity()
self.num_sections = self.section_count()
if self.num_sections >= 2:
header = self.header()
BookHeader.__init__(self, header, self.ident, None, log)
else:
self.exth = None
def identity(self):
self.stream.seek(60)
ident = self.stream.read(8).upper()
if ident not in ['BOOKMOBI', 'TEXTREAD']:
raise MobiError('Unknown book type: %s' % ident)
return ident
def section_count(self):
self.stream.seek(76)
return struct.unpack('>H', self.stream.read(2))[0]
def section_offset(self, number):
self.stream.seek(78 + number * 8)
return struct.unpack('>LBBBB', self.stream.read(8))[0]
def header(self):
section_headers = []
# First section with the metadata
section_headers.append(self.section_offset(0))
# Second section used to get the length of the first
section_headers.append(self.section_offset(1))
end_off = section_headers[1]
off = section_headers[0]
self.stream.seek(off)
return self.stream.read(end_off - off)
def section_data(self, number):
start = self.section_offset(number)
if number == self.num_sections -1:
end = os.stat(self.stream.name).st_size
else:
end = self.section_offset(number + 1)
self.stream.seek(start)
try:
return self.stream.read(end - start)
except OverflowError:
return self.stream.read(os.stat(self.stream.name).st_size - start)
class MobiReader(object):
PAGE_BREAK_PAT = re.compile(
r'<\s*/{0,1}\s*mbp:pagebreak((?:\s+[^/>]*){0,1})/{0,1}\s*>\s*(?:<\s*/{0,1}\s*mbp:pagebreak\s*/{0,1}\s*>)*',
@@ -312,15 +101,46 @@ class MobiReader(object):
self.sections.append((section(i), self.section_headers[i]))
self.book_header = BookHeader(self.sections[0][0], self.ident,
self.book_header = bh = BookHeader(self.sections[0][0], self.ident,
user_encoding, self.log, try_extra_data_fix=try_extra_data_fix)
self.name = self.name.decode(self.book_header.codec, 'replace')
self.kf8_type = None
is_kf8 = self.book_header.mobi_version == 8
if is_kf8:
self.kf8_type = 'standalone'
else: # Check for joint mobi 6 and kf 8 file
KF8_BOUNDARY = b'BOUNDARY'
for i, x in enumerate(self.sections[:-1]):
sec = x[0]
if (len(sec) == len(KF8_BOUNDARY) and sec ==
KF8_BOUNDARY):
try:
self.book_header = BookHeader(self.sections[i+1][0],
self.ident, user_encoding, self.log)
# The following are only correct in the Mobi 6
# header, not the Mobi 8 header
for x in ('first_image_index',):
setattr(self.book_header, x, getattr(bh, x))
self.book_header.huff_offset += i + 1
self.kf8_type = 'joint'
self.kf8_boundary = i
except:
pass
break
def check_for_drm(self):
if self.book_header.encryption_type != 0:
try:
name = self.book_header.exth.mi.title
except:
name = self.name
if not name:
name = self.name
raise DRMError(name)
def extract_content(self, output_dir, parse_cache):
output_dir = os.path.abspath(output_dir)
if self.book_header.encryption_type != 0:
raise DRMError(self.name)
self.check_for_drm()
processed_records = self.extract_text()
if self.debug is not None:
parse_cache['calibre_raw_mobi_markup'] = self.mobi_html
@@ -916,11 +736,12 @@ class MobiReader(object):
trail_size = self.sizeof_trailing_entries(data)
return data[:len(data)-trail_size]
def extract_text(self):
def extract_text(self, offset=1):
self.log.debug('Extracting text...')
text_sections = [self.text_section(i) for i in range(1,
min(self.book_header.records + 1, len(self.sections)))]
processed_records = list(range(0, self.book_header.records + 1))
text_sections = [self.text_section(i) for i in xrange(offset,
min(self.book_header.records + offset, len(self.sections)))]
processed_records = list(range(offset-1, self.book_header.records +
offset))
self.mobi_html = ''
@@ -1027,63 +848,6 @@ class MobiReader(object):
self.image_names.append(os.path.basename(path))
im.save(open(path, 'wb'), format='JPEG')
def get_metadata(stream):
stream.seek(0)
try:
raw = stream.read(3)
except:
raw = ''
stream.seek(0)
if raw == 'TPZ':
from calibre.ebooks.metadata.topaz import get_metadata
return get_metadata(stream)
from calibre.utils.logging import Log
log = Log()
try:
mi = MetaInformation(os.path.basename(stream.name), [_('Unknown')])
except:
mi = MetaInformation(_('Unknown'), [_('Unknown')])
mh = MetadataHeader(stream, log)
if mh.title and mh.title != _('Unknown'):
mi.title = mh.title
if mh.exth is not None:
if mh.exth.mi is not None:
mi = mh.exth.mi
else:
size = sys.maxint
if hasattr(stream, 'seek') and hasattr(stream, 'tell'):
pos = stream.tell()
stream.seek(0, 2)
size = stream.tell()
stream.seek(pos)
if size < 4*1024*1024:
with TemporaryDirectory('_mobi_meta_reader') as tdir:
with CurrentDir(tdir):
mr = MobiReader(stream, log)
parse_cache = {}
mr.extract_content(tdir, parse_cache)
if mr.embedded_mi is not None:
mi = mr.embedded_mi
if hasattr(mh.exth, 'cover_offset'):
cover_index = mh.first_image_index + mh.exth.cover_offset
data = mh.section_data(int(cover_index))
else:
try:
data = mh.section_data(mh.first_image_index)
except:
data = ''
buf = cStringIO.StringIO(data)
try:
im = PILImage.open(buf)
except:
log.exception('Failed to read MOBI cover')
else:
obuf = cStringIO.StringIO()
im.convert('RGB').save(obuf, format='JPEG')
mi.cover_data = ('jpg', obuf.getvalue())
return mi
def test_mbp_regex():
for raw, m in {
'<mbp:pagebreak></mbp:pagebreak>':'',
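For joint files, the KF8 book header sits in the section immediately after the b'BOUNDARY' record, which is why book_header() above parses sections[i+1] and why Mobi8Reader later extracts text starting at kf8_boundary + 2. The boundary scan, as a standalone sketch (helper name mine):

def find_kf8_boundary(sections):
    # sections is the list of (raw_data, section_header) tuples built by
    # MobiReader; returns the BOUNDARY record's index, or -1 if absent.
    for i, (raw, _header) in enumerate(sections[:-1]):
        if raw == b'BOUNDARY':
            return i
    return -1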

View File

@@ -0,0 +1,390 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import struct, re, os, zlib, imghdr
from collections import namedtuple
from itertools import repeat
from calibre.ebooks.mobi.reader.headers import NULL_INDEX
from calibre.ebooks.mobi.reader.index import read_index
from calibre.ebooks.mobi.reader.ncx import read_ncx, build_toc
from calibre.ebooks.mobi.reader.markup import expand_mobi8_markup
from calibre.ebooks.metadata.opf2 import Guide, OPFCreator
Part = namedtuple('Part',
'num type filename start end aid')
Elem = namedtuple('Elem',
'insert_pos toc_text file_number sequence_number start_pos '
'length')
FlowInfo = namedtuple('FlowInfo',
'type format dir fname')
class Mobi8Reader(object):
def __init__(self, mobi6_reader, log):
self.mobi6_reader, self.log = mobi6_reader, log
self.header = mobi6_reader.book_header
def __call__(self):
self.mobi6_reader.check_for_drm()
offset = 1
res_end = len(self.mobi6_reader.sections)
if self.mobi6_reader.kf8_type == 'joint':
offset = self.mobi6_reader.kf8_boundary + 2
res_end = self.mobi6_reader.kf8_boundary
self.processed_records = self.mobi6_reader.extract_text(offset=offset)
self.raw_ml = self.mobi6_reader.mobi_html
with open('debug-raw.html', 'wb') as f:
f.write(self.raw_ml)
self.kf8_sections = self.mobi6_reader.sections[offset-1:]
first_resource_index = self.header.first_image_index
if first_resource_index in {-1, NULL_INDEX}:
first_resource_index = self.header.records + 1
self.resource_sections = \
self.mobi6_reader.sections[first_resource_index:res_end]
self.cover_offset = getattr(self.header.exth, 'cover_offset', None)
self.read_indices()
self.build_parts()
guide = self.create_guide()
ncx = self.create_ncx()
resource_map = self.extract_resources()
spine = self.expand_text(resource_map)
return self.write_opf(guide, ncx, spine, resource_map)
def read_indices(self):
self.flow_table = (0, NULL_INDEX)
if self.header.fdstidx != NULL_INDEX:
header = self.kf8_sections[self.header.fdstidx][0]
if header[:4] != b'FDST':
raise ValueError('KF8 does not have a valid FDST record')
num_sections, = struct.unpack_from(b'>L', header, 0x08)
sections = header[0x0c:]
self.flow_table = struct.unpack_from(b'>%dL' % (num_sections*2),
sections, 0)[::2] + (NULL_INDEX,)
self.files = []
if self.header.skelidx != NULL_INDEX:
table = read_index(self.kf8_sections, self.header.skelidx,
self.header.codec)[0]
File = namedtuple('File',
'file_number name divtbl_count start_position length')
for i, text in enumerate(table.iterkeys()):
tag_map = table[text]
self.files.append(File(i, text, tag_map[1][0],
tag_map[6][0], tag_map[6][1]))
self.elems = []
if self.header.dividx != NULL_INDEX:
table, cncx = read_index(self.kf8_sections, self.header.dividx,
self.header.codec)
for i, text in enumerate(table.iterkeys()):
tag_map = table[text]
toc_text = cncx[tag_map[2][0]]
self.elems.append(Elem(int(text), toc_text, tag_map[3][0],
tag_map[4][0], tag_map[6][0], tag_map[6][1]))
self.guide = []
if self.header.othidx != NULL_INDEX:
table, cncx = read_index(self.kf8_sections, self.header.othidx,
self.header.codec)
Item = namedtuple('Item',
'type title div_frag_num')
for i, ref_type in enumerate(table.iterkeys()):
tag_map = table[ref_type]
# ref_type, ref_title, div/frag number
title = cncx[tag_map[1][0]]
fileno = None
if 3 in tag_map.keys():
fileno = tag_map[3][0]
if 6 in tag_map.keys():
fileno = tag_map[6][0]
self.guide.append(Item(ref_type.decode(self.header.codec),
title, fileno))
def build_parts(self):
raw_ml = self.mobi6_reader.mobi_html
self.flows = []
self.flowinfo = []
# now split the raw_ml into its flow pieces
for j in xrange(0, len(self.flow_table)-1):
start = self.flow_table[j]
end = self.flow_table[j+1]
if end == NULL_INDEX:
end = len(raw_ml)
self.flows.append(raw_ml[start:end])
# the first piece represents the xhtml text
text = self.flows[0]
self.flows[0] = b''
# walk the <skeleton> and <div> tables to build the original source
# xhtml files *without* destroying any file position information
# needed for later href processing, and create the final list of
# file separation start:stop points etc. in partinfo
self.parts = []
self.partinfo = []
divptr = 0
baseptr = 0
for skelnum, skelname, divcnt, skelpos, skellen in self.files:
baseptr = skelpos + skellen
skeleton = text[skelpos:baseptr]
for i in xrange(divcnt):
insertpos, idtext, filenum, seqnum, startpos, length = \
self.elems[divptr]
if i == 0:
aidtext = idtext[12:-2]
filename = 'part%04d.html' % filenum
part = text[baseptr:baseptr + length]
insertpos = insertpos - skelpos
skeleton = skeleton[0:insertpos] + part + skeleton[insertpos:]
baseptr = baseptr + length
divptr += 1
self.parts.append(skeleton)
self.partinfo.append(Part(skelnum, 'text', filename, skelpos,
baseptr, aidtext))
# The primary css style sheet is typically stored next, followed by
# any snippets of code that were previously inlined in the
# original xhtml but have been stripped out and placed here.
# This can include local CDATA snippets and svg sections.
# The problem is that for most browsers and ereaders, you cannot
# use <img src="imageXXXX.svg" /> to import an svg image that itself
# properly uses an <image/> tag to import some raster image - it
# should work according to the spec but does not for almost all
# browsers and ereaders, and causes epub validation issues because
# those raster images are in the manifest but not in the xhtml text,
# since they are only referenced from an svg image.
# So we need to check the remaining flow pieces to see if they are
# css or svg images. If svg images, we must check if they have an
# <image/> tag and, if so, inline them into the xhtml text pieces.
# There may be other sorts of pieces stored here, but until we see
# one in the wild to reverse engineer we won't be able to tell.
self.flowinfo.append(FlowInfo(None, None, None, None))
svg_tag_pattern = re.compile(br'''(<svg[^>]*>)''', re.IGNORECASE)
image_tag_pattern = re.compile(br'''(<image[^>]*>)''', re.IGNORECASE)
for j in xrange(1, len(self.flows)):
flowpart = self.flows[j]
nstr = '%04d' % j
m = svg_tag_pattern.search(flowpart)
if m is not None:
# svg
typ = 'svg'
start = m.start()
m2 = image_tag_pattern.search(flowpart)
if m2 is not None:
format = 'inline'
dir = None
fname = None
# strip off anything before <svg if inlining
flowpart = flowpart[start:]
else:
format = 'file'
dir = "images"
fname = 'svgimg' + nstr + '.svg'
else:
# search for CDATA and if exists inline it
if flowpart.find('[CDATA[') >= 0:
typ = 'css'
flowpart = '<style type="text/css">\n' + flowpart + '\n</style>\n'
format = 'inline'
dir = None
fname = None
else:
# css - assume as standalone css file
typ = 'css'
format = 'file'
dir = "styles"
fname = nstr + '.css'
self.flows[j] = flowpart
self.flowinfo.append(FlowInfo(typ, format, dir, fname))
def get_file_info(self, pos):
''' Get information about the part (file) that exists at pos in
the raw markup '''
for part in self.partinfo:
if pos >= part.start and pos < part.end:
return part
return Part(*repeat(None, len(Part._fields)))
def get_id_tag_by_pos_fid(self, posfid, offset):
# first convert kindle:pos:fid and offset info to position in file
row = int(posfid, 32)
off = int(offset, 32)
[insertpos, idtext, filenum, seqnm, startpos, length] = self.elems[row]
pos = insertpos + off
fname = self.get_file_info(pos).filename
# an existing "id=" must exist in original xhtml otherwise it would not
# have worked for linking. Amazon seems to have added its own
# additional "aid=" inside tags whose contents seem to represent some
# position information encoded into Base32 name.
# so find the closest "id=" before position the file by actually
# searching in that file
idtext = self.get_id_tag(pos)
return fname, idtext
def get_id_tag(self, pos):
# find the correct tag by actually searching in the destination
# textblock at position
fi = self.get_file_info(pos)
if fi.num is None and fi.start is None:
raise ValueError('No file contains pos: %d'%pos)
textblock = self.parts[fi.num]
id_map = []
npos = pos - fi.start
# if npos is inside a tag, then search all text before its
# end-of-tag marker
pgt = textblock.find(b'>', npos)
plt = textblock.find(b'<', npos)
if pgt < plt:
npos = pgt + 1
# find id links only inside of tags
# inside any < > pair, find all "id=" attributes and return
# whatever is inside the quotes
id_pattern = re.compile(br'''<[^>]*\sid\s*=\s*['"]([^'"]*)['"][^>]*>''',
re.IGNORECASE)
for m in re.finditer(id_pattern, textblock):
id_map.append((m.start(), m.group(1)))
if not id_map:
# Found no id in the textblock, link must be to top of file
return b''
# if npos is before first id= inside a tag, return the first
if npos < id_map[0][0]:
return id_map[0][1]
# if npos is after the last id= inside a tag, return the last
if npos > id_map[-1][0]:
return id_map[-1][1]
# otherwise find last id before npos
for i, item in enumerate(id_map):
if npos < item[0]:
return id_map[i-1][1]
return id_map[0][1]
def create_guide(self):
guide = Guide()
for ref_type, ref_title, fileno in self.guide:
elem = self.elems[fileno]
fi = self.get_file_info(elem.insert_pos)
idtext = self.get_id_tag(elem.insert_pos).decode(self.header.codec)
linktgt = fi.filename
if idtext:
linktgt += '#' + idtext
g = Guide.Reference('%s/%s'%(fi.type, linktgt), os.getcwdu())
g.title, g.type = ref_title, ref_type
guide.append(g)
so = self.header.exth.start_offset
if so not in {None, NULL_INDEX}:
fi = self.get_file_info(so)
if fi.filename is not None:
idtext = self.get_id_tag(so).decode(self.header.codec)
linktgt = fi.filename
if idtext:
linktgt += '#' + idtext
g = Guide.Reference('%s/%s'%(fi.type, linktgt), os.getcwdu())
g.title, g.type = 'start', 'text'
guide.append(g)
return guide
def create_ncx(self):
index_entries = read_ncx(self.kf8_sections, self.header.ncxidx,
self.header.codec)
# Add href and anchor info to the index entries
for entry in index_entries:
pos = entry['pos']
fi = self.get_file_info(pos)
if fi.filename is None:
raise ValueError('Index entry has invalid pos: %d'%pos)
idtag = self.get_id_tag(pos).decode(self.header.codec)
entry['href'] = '%s/%s'%(fi.type, fi.filename)
entry['idtag'] = idtag
# Build the TOC object
return build_toc(index_entries)
def extract_resources(self):
resource_map = []
for x in ('fonts', 'images'):
os.mkdir(x)
for i, sec in enumerate(self.resource_sections):
fname_idx = i+1
data = sec[0]
typ = data[:4]
href = None
if typ in {b'FLIS', b'FCIS', b'SRCS', b'\xe9\x8e\r\n',
b'RESC', b'BOUN', b'FDST', b'DATP', b'AUDI', b'VIDE'}:
pass # Ignore these records
elif typ == b'FONT':
# fonts only exist in K8 ebooks
# Format:
# bytes 0 - 3: 'FONT'
# bytes 4 - 7: ?? Expanded size in bytes ??
# bytes 8 - 11: ?? number of files ??
# bytes 12 - 15: ?? offset to start of compressed data ?? (typically 0x00000018 = 24)
# bytes 16 - 23: ?? typically all 0x00 ?? Are these compression flags from zlib?
# The compressed data begins with 2 bytes of header and has 4 bytes of checksum at the end
data = data[26:-4]
uncompressed_data = zlib.decompress(data, -15)
hdr = uncompressed_data[0:4]
ext = 'dat'
if hdr == b'\0\1\0\0' or hdr == b'true' or hdr == b'ttcf':
ext = 'ttf'
href = "fonts/%05d.%s" % (fname_idx, ext)
with open(href.replace('/', os.sep), 'wb') as f:
f.write(uncompressed_data)
else:
imgtype = imghdr.what(None, data)
if imgtype is None:
imgtype = 'unknown'
href = 'images/%05d.%s'%(fname_idx, imgtype)
with open(href.replace('/', os.sep), 'wb') as f:
f.write(data)
resource_map.append(href)
return resource_map
def expand_text(self, resource_map):
return expand_mobi8_markup(self, resource_map, self.log)
def write_opf(self, guide, toc, spine, resource_map):
mi = self.header.exth.mi
if (self.cover_offset is not None and self.cover_offset <
len(resource_map)):
mi.cover = resource_map[self.cover_offset]
opf = OPFCreator(os.getcwdu(), mi)
opf.guide = guide
opf.create_manifest_from_files_in([os.getcwdu()])
opf.create_spine(spine)
opf.set_toc(toc)
with open('metadata.opf', 'wb') as of, open('toc.ncx', 'wb') as ncx:
opf.render(of, ncx, 'toc.ncx')
return 'metadata.opf'
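The FONT branch of extract_resources() packs several magic numbers; as a hedged sketch using the same constants (helper name mine): skip the 24-byte record header plus the 2 zlib stream header bytes, drop the 4-byte trailing checksum, raw-inflate the rest (wbits=-15, since the stream header was skipped), then sniff the font type from the first four decompressed bytes:

import zlib

def decode_font_record(data):
    assert data[:4] == b'FONT'
    payload = zlib.decompress(data[26:-4], -15)
    ttf_magics = (b'\x00\x01\x00\x00', b'true', b'ttcf')
    ext = 'ttf' if payload[:4] in ttf_magics else 'dat'
    return ext, payload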

View File

@@ -0,0 +1,84 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.mobi.utils import to_base
from calibre.ebooks.mobi.reader.headers import NULL_INDEX
from calibre.ebooks.mobi.reader.index import read_index
tag_fieldname_map = {
1: ['pos',0],
2: ['len',0],
3: ['noffs',0],
4: ['hlvl',0],
5: ['koffs',0],
6: ['pos_fid',0],
21: ['parent',0],
22: ['child1',0],
23: ['childn',0]
}
def read_ncx(sections, index, codec):
index_entries = []
if index != NULL_INDEX:
table, cncx = read_index(sections, index, codec)
for num, x in enumerate(table.iteritems()):
text, tag_map = x
entry = {
'name': text,
'pos': -1,
'len': 0,
'noffs': -1,
'text' : "Unknown Text",
'hlvl' : -1,
'kind' : "Unknown Kind",
'pos_fid' : None,
'parent' : -1,
'child1' : -1,
'childn' : -1,
'num' : num
}
for tag in tag_fieldname_map.keys():
fieldname, i = tag_fieldname_map[tag]
if tag in tag_map:
fieldvalue = tag_map[tag][i]
if tag == 6:
fieldvalue = to_base(fieldvalue, base=32)
entry[fieldname] = fieldvalue
if tag == 3:
entry['text'] = cncx.get(fieldvalue, 'Unknown Text')
if tag == 5:
entry['kind'] = cncx.get(fieldvalue, 'Unknown Kind')
index_entries.append(entry)
return index_entries
def build_toc(index_entries):
ans = TOC(base_path=os.getcwdu())
levels = {x['hlvl'] for x in index_entries}
num_map = {-1: ans}
level_map = {l:[x for x in index_entries if x['hlvl'] == l] for l in
levels}
for lvl in sorted(levels):
for item in level_map[lvl]:
parent = num_map[item['parent']]
child = parent.add_item(item['href'], item['idtag'], item['text'])
num_map[item['num']] = child
# Set play orders in depth first order
for i, item in enumerate(ans.flat()):
item.play_order = i
return ans
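To make the data flow concrete, a hedged sketch of what build_toc() consumes once Mobi8Reader.create_ncx() has added 'href' and 'idtag' to the read_ncx() entries (values illustrative, not from a real book):

entries = [
    {'num': 0, 'parent': -1, 'hlvl': 0, 'text': 'Chapter 1',
     'href': 'text/part0000.html', 'idtag': 'ch01'},
    {'num': 1, 'parent': 0, 'hlvl': 1, 'text': 'Section 1.1',
     'href': 'text/part0000.html', 'idtag': 'sec11'},
]
# build_toc(entries) nests Section 1.1 under Chapter 1 via num_map,
# then assigns play_order in depth-first order.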

View File

@@ -7,7 +7,7 @@ __license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import struct
import struct, string
from collections import OrderedDict
from calibre.utils.magick.draw import Image, save_cover_data_to, thumbnail
@@ -340,4 +340,26 @@ def detect_periodical(toc, log=None):
return False
return True
def count_set_bits(num):
if num < 0:
num = -num
ans = 0
while num > 0:
ans += (num & 0b1)
num >>= 1
return ans
def to_base(num, base=32):
digits = string.digits + string.ascii_uppercase
sign = 1 if num >= 0 else -1
if num == 0: return '0'
num *= sign
ans = []
while num:
ans.append(digits[(num % base)])
num //= base
if sign < 0:
ans.append('-')
ans.reverse()
return ''.join(ans)
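Quick sanity checks for the two new helpers, worked out by hand:

from calibre.ebooks.mobi.utils import count_set_bits, to_base

assert count_set_bits(0b1011) == 3
assert to_base(1023, base=32) == 'VV'  # digits are 0-9 then A-V
assert int('VV', 32) == 1023           # round-trips with Python's int()
assert to_base(-5) == '-5'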