Mirror of https://github.com/kovidgoyal/calibre.git, synced 2025-07-09 03:04:10 -04:00

Support for reading KF8

commit d1b6bb705d
parent 93bf57e6c4
@@ -263,7 +263,7 @@ class MOBIMetadataReader(MetadataReaderPlugin):
     description = _('Read metadata from %s files')%'MOBI'
 
     def get_metadata(self, stream, ftype):
-        from calibre.ebooks.mobi.reader import get_metadata
+        from calibre.ebooks.metadata.mobi import get_metadata
         return get_metadata(stream)
 
 class ODTMetadataReader(MetadataReaderPlugin):
@@ -10,7 +10,7 @@ Generates and writes an APNX page mapping file.
 
 import struct
 
-from calibre.ebooks.mobi.reader import MobiReader
+from calibre.ebooks.mobi.reader.mobi6 import MobiReader
 from calibre.ebooks.pdb.header import PdbHeaderReader
 from calibre.utils.logging import default_log
 
@@ -3,7 +3,10 @@ __license__ = 'GPL 3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
+import os
+
 from calibre.customize.conversion import InputFormatPlugin
+from calibre.ptempfile import PersistentTemporaryDirectory
 
 class MOBIInput(InputFormatPlugin):
 
@@ -14,18 +17,44 @@ class MOBIInput(InputFormatPlugin):
 
     def convert(self, stream, options, file_ext, log,
                 accelerators):
-        from calibre.ebooks.mobi.reader import MobiReader
+        if os.environ.get('USE_MOBIUNPACK', None) is not None:
+            try:
+                from mobiunpack.mobi_unpack import Mobi8Reader
+                from calibre.customize.ui import plugin_for_input_format
+
+                wdir = PersistentTemporaryDirectory('_unpack_space')
+                m8r = Mobi8Reader(stream, wdir)
+                if m8r.isK8():
+                    epub_path = m8r.processMobi8()
+                    epub_input = plugin_for_input_format('epub')
+                    for opt in epub_input.options:
+                        setattr(options, opt.option.name, opt.recommended_value)
+                    options.input_encoding = m8r.getCodec()
+                    return epub_input.convert(open(epub_path,'rb'), options,
+                            'epub', log, accelerators)
+            except Exception:
+                log.exception('mobi_unpack code not working')
+
+        from calibre.ebooks.mobi.reader.mobi6 import MobiReader
         from lxml import html
         parse_cache = {}
         try:
             mr = MobiReader(stream, log, options.input_encoding,
                     options.debug_pipeline)
-            mr.extract_content(u'.', parse_cache)
+            if mr.kf8_type is None:
+                mr.extract_content(u'.', parse_cache)
 
         except:
             mr = MobiReader(stream, log, options.input_encoding,
                     options.debug_pipeline, try_extra_data_fix=True)
-            mr.extract_content(u'.', parse_cache)
+            if mr.kf8_type is None:
+                mr.extract_content(u'.', parse_cache)
+
+        if mr.kf8_type is not None:
+            from calibre.ebooks.mobi.reader.mobi8 import Mobi8Reader
+            return os.path.abspath(Mobi8Reader(mr, log)())
 
         raw = parse_cache.pop('calibre_raw_mobi_markup', False)
         if raw:
             if isinstance(raw, unicode):
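Note on the dispatch above: for a classic MOBI 6 file mr.kf8_type stays None and extract_content() writes the exploded HTML into the current directory as before, while for a KF8 file (standalone or joint) the plugin hands the parsed MobiReader to Mobi8Reader, whose __call__ returns the path of a generated OPF. A condensed sketch of just that branch (the retry and error handling above are omitted):

    mr = MobiReader(stream, log, options.input_encoding, options.debug_pipeline)
    if mr.kf8_type is None:
        mr.extract_content(u'.', parse_cache)           # MOBI 6: HTML lands in cwd
    else:
        return os.path.abspath(Mobi8Reader(mr, log)())  # KF8: path to an OPF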
@@ -9,6 +9,7 @@ __copyright__ = '2009, Kovid Goyal kovid@kovidgoyal.net and ' \
                 'Marshall T. Vandegrift <llasram@gmail.com>'
 __docformat__ = 'restructuredtext en'
 
+import os, cStringIO
 from struct import pack, unpack
 from cStringIO import StringIO
 
@@ -433,3 +434,75 @@ def set_metadata(stream, mi):
     mu = MetadataUpdater(stream)
     mu.update(mi)
     return
+
+def get_metadata(stream):
+    from calibre.ebooks.metadata import MetaInformation
+    from calibre.ptempfile import TemporaryDirectory
+    from calibre.ebooks.mobi.reader.headers import MetadataHeader
+    from calibre.ebooks.mobi.reader.mobi6 import MobiReader
+    from calibre import CurrentDir
+
+    try:
+        from PIL import Image as PILImage
+        PILImage
+    except ImportError:
+        import Image as PILImage
+
+
+    stream.seek(0)
+    try:
+        raw = stream.read(3)
+    except:
+        raw = ''
+    stream.seek(0)
+    if raw == b'TPZ':
+        from calibre.ebooks.metadata.topaz import get_metadata
+        return get_metadata(stream)
+    from calibre.utils.logging import Log
+    log = Log()
+    try:
+        mi = MetaInformation(os.path.basename(stream.name), [_('Unknown')])
+    except:
+        mi = MetaInformation(_('Unknown'), [_('Unknown')])
+    mh = MetadataHeader(stream, log)
+    if mh.title and mh.title != _('Unknown'):
+        mi.title = mh.title
+
+    if mh.exth is not None:
+        if mh.exth.mi is not None:
+            mi = mh.exth.mi
+    else:
+        size = 1024**3
+        if hasattr(stream, 'seek') and hasattr(stream, 'tell'):
+            pos = stream.tell()
+            stream.seek(0, 2)
+            size = stream.tell()
+            stream.seek(pos)
+        if size < 4*1024*1024:
+            with TemporaryDirectory('_mobi_meta_reader') as tdir:
+                with CurrentDir(tdir):
+                    mr = MobiReader(stream, log)
+                    parse_cache = {}
+                    mr.extract_content(tdir, parse_cache)
+                    if mr.embedded_mi is not None:
+                        mi = mr.embedded_mi
+    if hasattr(mh.exth, 'cover_offset'):
+        cover_index = mh.first_image_index + mh.exth.cover_offset
+        data = mh.section_data(int(cover_index))
+    else:
+        try:
+            data = mh.section_data(mh.first_image_index)
+        except:
+            data = ''
+    buf = cStringIO.StringIO(data)
+    try:
+        im = PILImage.open(buf)
+    except:
+        log.exception('Failed to read MOBI cover')
+    else:
+        obuf = cStringIO.StringIO()
+        im.convert('RGB').save(obuf, format='JPEG')
+        mi.cover_data = ('jpg', obuf.getvalue())
+    return mi
+
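A short usage sketch for the new get_metadata() (the path is a placeholder, and this assumes a calibre environment where the _() translation builtin is installed):

    from calibre.ebooks.metadata.mobi import get_metadata

    with open('book.mobi', 'rb') as stream:  # placeholder file name
        mi = get_metadata(stream)            # returns a MetaInformation object
    print(mi.title, mi.authors)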
@@ -46,7 +46,7 @@ class TOC(list):
         self.toc_thumbnail = toc_thumbnail
 
     def __str__(self):
-        lines = ['TOC: %s#%s'%(self.href, self.fragment)]
+        lines = ['TOC: %s#%s %s'%(self.href, self.fragment, self.text)]
         for child in self:
             c = str(child).splitlines()
             for l in c:
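A quick illustration of the changed output: a node with href='index.html', fragment='ch1' and text='Chapter 1' now prints as 'TOC: index.html#ch1 Chapter 1' instead of 'TOC: index.html#ch1', so nested TOC dumps show the entry titles.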
src/calibre/ebooks/mobi/reader/__init__.py (new file, 11 lines)
@@ -0,0 +1,11 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__ = 'GPL v3'
+__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+
+
src/calibre/ebooks/mobi/reader/headers.py (new file, 258 lines)
@@ -0,0 +1,258 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (absolute_import, print_function)
+
+__license__ = 'GPL v3'
+__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import struct, re, os
+
+from calibre import replace_entities
+from calibre.utils.date import parse_date
+from calibre.ebooks.mobi import MobiError
+from calibre.ebooks.metadata import MetaInformation
+from calibre.ebooks.mobi.langcodes import main_language, sub_language, mobi2iana
+
+NULL_INDEX = 0xffffffff
+
+class EXTHHeader(object): # {{{
+
+    def __init__(self, raw, codec, title):
+        self.doctype = raw[:4]
+        self.length, self.num_items = struct.unpack('>LL', raw[4:12])
+        raw = raw[12:]
+        pos = 0
+        self.mi = MetaInformation(_('Unknown'), [_('Unknown')])
+        self.has_fake_cover = True
+        self.start_offset = None
+        left = self.num_items
+
+        while left > 0:
+            left -= 1
+            id, size = struct.unpack('>LL', raw[pos:pos + 8])
+            content = raw[pos + 8:pos + size]
+            pos += size
+            if id >= 100 and id < 200:
+                self.process_metadata(id, content, codec)
+            elif id == 203:
+                self.has_fake_cover = bool(struct.unpack('>L', content)[0])
+            elif id == 201:
+                co, = struct.unpack('>L', content)
+                if co < NULL_INDEX:
+                    self.cover_offset = co
+            elif id == 202:
+                self.thumbnail_offset, = struct.unpack('>L', content)
+            elif id == 501:
+                # cdetype
+                pass
+            elif id == 502:
+                # last update time
+                pass
+            elif id == 503: # Long title
+                # Amazon seems to regard this as the definitive book title
+                # rather than the title from the PDB header. In fact when
+                # sending MOBI files through Amazon's email service if the
+                # title contains non ASCII chars or non filename safe chars
+                # they are messed up in the PDB header
+                try:
+                    title = content.decode(codec)
+                except:
+                    pass
+            #else:
+            #    print 'unknown record', id, repr(content)
+        if title:
+            self.mi.title = replace_entities(title)
+
+    def process_metadata(self, id, content, codec):
+        if id == 100:
+            if self.mi.authors == [_('Unknown')]:
+                self.mi.authors = []
+            au = content.decode(codec, 'ignore').strip()
+            self.mi.authors.append(au)
+            if re.match(r'\S+?\s*,\s+\S+', au.strip()):
+                self.mi.author_sort = au.strip()
+        elif id == 101:
+            self.mi.publisher = content.decode(codec, 'ignore').strip()
+        elif id == 103:
+            self.mi.comments = content.decode(codec, 'ignore')
+        elif id == 104:
+            self.mi.isbn = content.decode(codec, 'ignore').strip().replace('-', '')
+        elif id == 105:
+            if not self.mi.tags:
+                self.mi.tags = []
+            self.mi.tags.extend([x.strip() for x in content.decode(codec,
+                'ignore').split(';')])
+            self.mi.tags = list(set(self.mi.tags))
+        elif id == 106:
+            try:
+                self.mi.pubdate = parse_date(content, as_utc=False)
+            except:
+                pass
+        elif id == 108:
+            pass # Producer
+        elif id == 113:
+            pass # ASIN or UUID
+        elif id == 116:
+            self.start_offset, = struct.unpack(b'>L', content)
+        #else:
+        #    print 'unhandled metadata record', id, repr(content)
+# }}}
+
+class BookHeader(object):
+
+    def __init__(self, raw, ident, user_encoding, log, try_extra_data_fix=False):
+        self.log = log
+        self.compression_type = raw[:2]
+        self.records, self.records_size = struct.unpack('>HH', raw[8:12])
+        self.encryption_type, = struct.unpack('>H', raw[12:14])
+        if ident == 'TEXTREAD':
+            self.codepage = 1252
+        if len(raw) <= 16:
+            self.codec = 'cp1252'
+            self.extra_flags = 0
+            self.title = _('Unknown')
+            self.language = 'ENGLISH'
+            self.sublanguage = 'NEUTRAL'
+            self.exth_flag, self.exth = 0, None
+            self.ancient = True
+            self.first_image_index = -1
+            self.mobi_version = 1
+        else:
+            self.ancient = False
+            self.doctype = raw[16:20]
+            self.length, self.type, self.codepage, self.unique_id, \
+                self.version = struct.unpack('>LLLLL', raw[20:40])
+
+            try:
+                self.codec = {
+                    1252: 'cp1252',
+                    65001: 'utf-8',
+                    }[self.codepage]
+            except (IndexError, KeyError):
+                self.codec = 'cp1252' if not user_encoding else user_encoding
+                log.warn('Unknown codepage %d. Assuming %s' % (self.codepage,
+                    self.codec))
+            # There exists some broken DRM removal tool that removes DRM but
+            # leaves the DRM fields in the header yielding a header size of
+            # 0xF8. The actual value of max_header_length should be 0xE8 but
+            # it's changed to accommodate this silly tool. Hopefully that will
+            # not break anything else.
+            max_header_length = 0xF8
+
+            if (ident == 'TEXTREAD' or self.length < 0xE4 or
+                    self.length > max_header_length or
+                    (try_extra_data_fix and self.length == 0xE4)):
+                self.extra_flags = 0
+            else:
+                self.extra_flags, = struct.unpack('>H', raw[0xF2:0xF4])
+
+            if self.compression_type == 'DH':
+                self.huff_offset, self.huff_number = struct.unpack('>LL',
+                        raw[0x70:0x78])
+
+            toff, tlen = struct.unpack('>II', raw[0x54:0x5c])
+            tend = toff + tlen
+            self.title = raw[toff:tend] if tend < len(raw) else _('Unknown')
+            langcode = struct.unpack('!L', raw[0x5C:0x60])[0]
+            langid = langcode & 0xFF
+            sublangid = (langcode >> 10) & 0xFF
+            self.language = main_language.get(langid, 'ENGLISH')
+            self.sublanguage = sub_language.get(sublangid, 'NEUTRAL')
+            self.mobi_version = struct.unpack('>I', raw[0x68:0x6c])[0]
+            self.first_image_index = struct.unpack('>L', raw[0x6c:0x6c + 4])[0]
+
+            self.exth_flag, = struct.unpack('>L', raw[0x80:0x84])
+            self.exth = None
+            if not isinstance(self.title, unicode):
+                self.title = self.title.decode(self.codec, 'replace')
+            if self.exth_flag & 0x40:
+                try:
+                    self.exth = EXTHHeader(raw[16 + self.length:], self.codec,
+                            self.title)
+                    self.exth.mi.uid = self.unique_id
+                    try:
+                        self.exth.mi.language = mobi2iana(langid, sublangid)
+                    except:
+                        self.log.exception('Unknown language code')
+                except:
+                    self.log.exception('Invalid EXTH header')
+                    self.exth_flag = 0
+
+        self.ncxidx = NULL_INDEX
+        if len(raw) >= 0xF8:
+            self.ncxidx, = struct.unpack_from(b'>L', raw, 0xF4)
+
+        if self.mobi_version >= 8:
+            self.skelidx, = struct.unpack_from('>L', raw, 0xFC)
+
+            # Index into <div> sections in raw_ml
+            self.dividx, = struct.unpack_from('>L', raw, 0xF8)
+
+            # Index into Other files
+            self.othidx, = struct.unpack_from('>L', raw, 0x104)
+
+            # need to use the FDST record to find out how to properly
+            # unpack the raw_ml into pieces it is simply a table of start
+            # and end locations for each flow piece
+            self.fdstidx, = struct.unpack_from('>L', raw, 0xC0)
+            self.fdstcnt, = struct.unpack_from('>L', raw, 0xC4)
+            # if cnt is 1 or less, fdst section number can be garbage
+            if self.fdstcnt <= 1:
+                self.fdstidx = NULL_INDEX
+        else: # Null values
+            self.skelidx = self.dividx = self.othidx = self.fdstidx = \
+                    NULL_INDEX
+
+class MetadataHeader(BookHeader):
+
+    def __init__(self, stream, log):
+        self.stream = stream
+        self.ident = self.identity()
+        self.num_sections = self.section_count()
+        if self.num_sections >= 2:
+            header = self.header()
+            BookHeader.__init__(self, header, self.ident, None, log)
+        else:
+            self.exth = None
+
+    def identity(self):
+        self.stream.seek(60)
+        ident = self.stream.read(8).upper()
+        if ident not in ['BOOKMOBI', 'TEXTREAD']:
+            raise MobiError('Unknown book type: %s' % ident)
+        return ident
+
+    def section_count(self):
+        self.stream.seek(76)
+        return struct.unpack('>H', self.stream.read(2))[0]
+
+    def section_offset(self, number):
+        self.stream.seek(78 + number * 8)
+        return struct.unpack('>LBBBB', self.stream.read(8))[0]
+
+    def header(self):
+        section_headers = []
+        # First section with the metadata
+        section_headers.append(self.section_offset(0))
+        # Second section used to get the length of the first
+        section_headers.append(self.section_offset(1))
+
+        end_off = section_headers[1]
+        off = section_headers[0]
+        self.stream.seek(off)
+        return self.stream.read(end_off - off)
+
+    def section_data(self, number):
+        start = self.section_offset(number)
+        if number == self.num_sections -1:
+            end = os.stat(self.stream.name).st_size
+        else:
+            end = self.section_offset(number + 1)
+        self.stream.seek(start)
+        try:
+            return self.stream.read(end - start)
+        except OverflowError:
+            self.stream.seek(start)
+            return self.stream.read()
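A minimal sketch of reading just the metadata header with the new module (placeholder path; assumes a calibre environment providing the _() builtin, and a well-formed book so the header attributes exist):

    from calibre.utils.logging import default_log
    from calibre.ebooks.mobi.reader.headers import MetadataHeader

    with open('book.azw3', 'rb') as stream:  # placeholder file name
        mh = MetadataHeader(stream, default_log)
        print(mh.ident, mh.codec, mh.mobi_version)  # mobi_version is 8 for KF8
        if mh.exth is not None:
            print(mh.exth.mi.title, mh.exth.mi.authors)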
src/calibre/ebooks/mobi/reader/index.py (new file, 195 lines)
@@ -0,0 +1,195 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__ = 'GPL v3'
+__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import struct
+from collections import OrderedDict
+
+from calibre.ebooks.mobi.utils import decint, count_set_bits
+
+class InvalidFile(ValueError):
+    pass
+
+def check_signature(data, signature):
+    if data[:len(signature)] != signature:
+        raise InvalidFile('Not a valid %r section'%signature)
+
+class NotAnINDXRecord(InvalidFile):
+    pass
+
+class NotATAGXSection(InvalidFile):
+    pass
+
+def format_bytes(byts):
+    byts = bytearray(byts)
+    byts = [hex(b)[2:] for b in byts]
+    return ' '.join(byts)
+
+def parse_indx_header(data):
+    check_signature(data, b'INDX')
+    words = (
+        'len', 'nul1', 'type', 'gen', 'start', 'count', 'code',
+        'lng', 'total', 'ordt', 'ligt', 'nligt', 'ncncx'
+    )
+    num = len(words)
+    values = struct.unpack(b'>%dL' % num, data[4:4*(num+1)])
+    header = {words[i]:values[i] for i in xrange(num)}
+    return header
+
+class CNCX(object): # {{{
+
+    '''
+    Parses the records that contain the compiled NCX (all strings from the
+    NCX). Presents a simple offset : string mapping interface to access the
+    data.
+    '''
+
+    def __init__(self, records, codec):
+        self.records = OrderedDict()
+        record_offset = 0
+        for raw in records:
+            pos = 0
+            while pos < len(raw):
+                length, consumed = decint(raw[pos:])
+                if length > 0:
+                    try:
+                        self.records[pos+record_offset] = raw[
+                            pos+consumed:pos+consumed+length].decode(codec)
+                    except:
+                        byts = raw[pos:]
+                        r = format_bytes(byts)
+                        print ('CNCX entry at offset %d has unknown format %s'%(
+                            pos+record_offset, r))
+                        self.records[pos+record_offset] = r
+                        pos = len(raw)
+                pos += consumed+length
+            record_offset += 0x10000
+
+    def __getitem__(self, offset):
+        return self.records.get(offset)
+
+    def get(self, offset, default=None):
+        return self.records.get(offset, default)
+# }}}
+
+def parse_tag_section(data):
+    check_signature(data, b'TAGX')
+
+    tags = []
+    first_entry_offset, = struct.unpack_from(b'>L', data, 0x04)
+    control_byte_count, = struct.unpack_from(b'>L', data, 0x08)
+
+    # Skip the first 12 bytes already read above.
+    for i in xrange(12, first_entry_offset, 4):
+        pos = i
+        tags.append((ord(data[pos]), ord(data[pos+1]), ord(data[pos+2]),
+            ord(data[pos+3])))
+    return control_byte_count, tags
+
+def get_tag_map(control_byte_count, tags, data, start, end):
+    ptags = []
+    ans = {}
+    control_byte_index = 0
+    data_start = start + control_byte_count
+
+    for tag, values_per_entry, mask, end_flag in tags:
+        if end_flag == 0x01:
+            control_byte_index += 1
+            continue
+        value = ord(data[start + control_byte_index]) & mask
+        if value != 0:
+            if value == mask:
+                if count_set_bits(mask) > 1:
+                    # If all bits of masked value are set and the mask has
+                    # more than one bit, a variable width value will follow
+                    # after the control bytes which defines the length of
+                    # bytes (NOT the value count!) which will contain the
+                    # corresponding variable width values.
+                    value, consumed = decint(data[data_start:])
+                    data_start += consumed
+                    ptags.append((tag, None, value, values_per_entry))
+                else:
+                    ptags.append((tag, 1, None, values_per_entry))
+            else:
+                # Shift bits to get the masked value.
+                while mask & 0x01 == 0:
+                    mask = mask >> 1
+                    value = value >> 1
+                ptags.append((tag, value, None, values_per_entry))
+    for tag, value_count, value_bytes, values_per_entry in ptags:
+        values = []
+        if value_count != None:
+            # Read value_count * values_per_entry variable width values.
+            for _ in xrange(value_count*values_per_entry):
+                byts, consumed = decint(data[data_start:])
+                data_start += consumed
+                values.append(byts)
+        else:
+            # Convert value_bytes to variable width values.
+            total_consumed = 0
+            while total_consumed < value_bytes:
+                # Does this work for values_per_entry != 1?
+                byts, consumed = decint(data[data_start:])
+                data_start += consumed
+                total_consumed += consumed
+                values.append(byts)
+            if total_consumed != value_bytes:
+                print ("Error: Should consume %s bytes, but consumed %s" %
+                        (value_bytes, total_consumed))
+        ans[tag] = values
+    # Test that all bytes have been processed if end is given.
+    if end is not None and data_start < end:
+        # The last entry might have some zero padding bytes, so complain only
+        # if non zero bytes are left.
+        rest = data[data_start:end]
+        if rest.replace(b'\0', b''):
+            print ("Warning: There are unprocessed index bytes left: %s" %
+                    format_bytes(rest))
+
+    return ans
+
+def read_index(sections, idx, codec):
+    table, cncx = OrderedDict(), CNCX([], codec)
+
+    data = sections[idx][0]
+
+    indx_header = parse_indx_header(data)
+    indx_count = indx_header['count']
+
+    if indx_header['ncncx'] > 0:
+        off = idx + indx_count + 1
+        cncx_records = [x[0] for x in sections[off:off+indx_header['ncncx']]]
+        cncx = CNCX(cncx_records, codec)
+
+    tag_section_start = indx_header['len']
+    control_byte_count, tags = parse_tag_section(data[tag_section_start:])
+
+    for i in xrange(idx + 1, idx + 1 + indx_count):
+        data = sections[i][0]
+        header = parse_indx_header(data)
+        idxt_pos = header['start']
+        entry_count = header['count']
+
+        # loop through to build up the IDXT position starts
+        idx_positions= []
+        for j in xrange(entry_count):
+            pos, = struct.unpack_from(b'>H', data, idxt_pos + 4 + (2 * j))
+            idx_positions.append(pos)
+        # The last entry ends before the IDXT tag (but there might be zero fill
+        # bytes we need to ignore!)
+        idx_positions.append(idxt_pos)
+
+        # For each entry in the IDXT build up the tag map and any associated
+        # text
+        for j in xrange(entry_count):
+            start, end = idx_positions[j:j+2]
+            text_length = ord(data[start])
+            text = data[start+1:start+1+text_length]
+            tag_map = get_tag_map(control_byte_count, tags, data,
+                    start+1+text_length, end)
+            table[text] = tag_map
+
+    return table, cncx
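This module leans on decint() and count_set_bits() from calibre.ebooks.mobi.utils, which are not part of this commit. For orientation, MOBI index records use forward-encoded variable-width integers: 7 bits per byte, most significant group first, with the high bit set on the final byte. A rough sketch of such a decoder (not calibre's actual implementation):

    def decint_sketch(raw):
        # Returns (value, bytes_consumed); raw is a byte string.
        val = 0
        for i, byte in enumerate(bytearray(raw)):
            val = (val << 7) | (byte & 0x7f)
            if byte & 0x80:  # the stop bit marks the last byte
                return val, i + 1
        raise ValueError('Not a valid variable-width integer')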
src/calibre/ebooks/mobi/reader/markup.py (new file, 307 lines)
@@ -0,0 +1,307 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__ = 'GPL v3'
+__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import re, os
+
+def update_internal_links(mobi8_reader):
+    # need to update all links that are internal which
+    # are based on positions within the xhtml files **BEFORE**
+    # cutting and pasting any pieces into the xhtml text files
+
+    # kindle:pos:fid:XXXX:off:YYYYYYYYYY (used for internal link within xhtml)
+    # XXXX is the offset in records into divtbl
+    # YYYYYYYYYYYY is a base32 number you add to the divtbl insertpos to get final position
+
+    mr = mobi8_reader
+
+    # pos:fid pattern
+    posfid_pattern = re.compile(br'''(<a.*?href=.*?>)''', re.IGNORECASE)
+    posfid_index_pattern = re.compile(br'''['"]kindle:pos:fid:([0-9|A-V]+):off:([0-9|A-V]+).*?["']''')
+
+    parts = []
+    for part in mr.parts:
+        srcpieces = posfid_pattern.split(part)
+        for j in xrange(1, len(srcpieces), 2):
+            tag = srcpieces[j]
+            if tag.startswith(b'<'):
+                for m in posfid_index_pattern.finditer(tag):
+                    posfid = m.group(1)
+                    offset = m.group(2)
+                    filename, idtag = mr.get_id_tag_by_pos_fid(posfid, offset)
+                    suffix = (b'#' + idtag) if idtag else b''
+                    replacement = filename.encode(mr.header.codec) + suffix
+                    tag = posfid_index_pattern.sub(replacement, tag, 1)
+                srcpieces[j] = tag
+        part = ''.join([x.decode(mr.header.codec) for x in srcpieces])
+        parts.append(part)
+
+    # All parts are now unicode and have no internal links
+    return parts
+
+def remove_kindlegen_markup(parts):
+
+    # we can safely remove all of the Kindlegen generated aid tags
+    find_tag_with_aid_pattern = re.compile(r'''(<[^>]*\said\s*=[^>]*>)''',
+            re.IGNORECASE)
+    within_tag_aid_position_pattern = re.compile(r'''\said\s*=['"][^'"]*['"]''')
+
+    for i in xrange(len(parts)):
+        part = parts[i]
+        srcpieces = find_tag_with_aid_pattern.split(part)
+        for j in range(len(srcpieces)):
+            tag = srcpieces[j]
+            if tag.startswith('<'):
+                for m in within_tag_aid_position_pattern.finditer(tag):
+                    replacement = ''
+                    tag = within_tag_aid_position_pattern.sub(replacement, tag,
+                            1)
+                srcpieces[j] = tag
+        part = "".join(srcpieces)
+        parts[i] = part
+
+    # we can safely remove all of the Kindlegen generated data-AmznPageBreak tags
+    find_tag_with_AmznPageBreak_pattern = re.compile(
+            r'''(<[^>]*\sdata-AmznPageBreak=[^>]*>)''', re.IGNORECASE)
+    within_tag_AmznPageBreak_position_pattern = re.compile(
+            r'''\sdata-AmznPageBreak=['"][^'"]*['"]''')
+
+    for i in xrange(len(parts)):
+        part = parts[i]
+        srcpieces = find_tag_with_AmznPageBreak_pattern.split(part)
+        for j in range(len(srcpieces)):
+            tag = srcpieces[j]
+            if tag.startswith('<'):
+                for m in within_tag_AmznPageBreak_position_pattern.finditer(tag):
+                    replacement = ''
+                    tag = within_tag_AmznPageBreak_position_pattern.sub(replacement, tag, 1)
+                srcpieces[j] = tag
+        part = "".join(srcpieces)
+        parts[i] = part
+
+def update_flow_links(mobi8_reader, resource_map, log):
+    # kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images)
+    # kindle:flow:XXXX?mime=YYYY/ZZZ (used for style sheets, svg images, etc)
+    # kindle:embed:XXXX (used for fonts)
+
+    mr = mobi8_reader
+    flows = []
+
+    img_pattern = re.compile(r'''(<[img\s|image\s][^>]*>)''', re.IGNORECASE)
+    img_index_pattern = re.compile(r'''['"]kindle:embed:([0-9|A-V]+)[^'"]*['"]''', re.IGNORECASE)
+
+    tag_pattern = re.compile(r'''(<[^>]*>)''')
+    flow_pattern = re.compile(r'''['"]kindle:flow:([0-9|A-V]+)\?mime=([^'"]+)['"]''', re.IGNORECASE)
+
+    url_pattern = re.compile(r'''(url\(.*?\))''', re.IGNORECASE)
+    url_img_index_pattern = re.compile(r'''kindle:embed:([0-9|A-V]+)\?mime=image/[^\)]*''', re.IGNORECASE)
+    font_index_pattern = re.compile(r'''kindle:embed:([0-9|A-V]+)''', re.IGNORECASE)
+    url_css_index_pattern = re.compile(r'''kindle:flow:([0-9|A-V]+)\?mime=text/css[^\)]*''', re.IGNORECASE)
+
+    for flow in mr.flows:
+        if flow is None: # 0th flow is None
+            flows.append(flow)
+            continue
+
+        if not isinstance(flow, unicode):
+            flow = flow.decode(mr.header.codec)
+
+        # links to raster image files from image tags
+        # image_pattern
+        srcpieces = img_pattern.split(flow)
+        for j in range(1, len(srcpieces), 2):
+            tag = srcpieces[j]
+            if tag.startswith('<im'):
+                for m in img_index_pattern.finditer(tag):
+                    num = int(m.group(1), 32)
+                    href = resource_map[num-1]
+                    if href:
+                        replacement = '"%s"'%('../'+ href)
+                        tag = img_index_pattern.sub(replacement, tag, 1)
+                    else:
+                        log.warn('Referenced image %s was not recognized '
+                                'as a valid image in %s' % (num, tag))
+                srcpieces[j] = tag
+        flow = "".join(srcpieces)
+
+        # replacements inside css url():
+        srcpieces = url_pattern.split(flow)
+        for j in range(1, len(srcpieces), 2):
+            tag = srcpieces[j]
+
+            # process links to raster image files
+            for m in url_img_index_pattern.finditer(tag):
+                num = int(m.group(1), 32)
+                href = resource_map[num-1]
+                if href:
+                    replacement = '"%s"'%('../'+ href)
+                    tag = url_img_index_pattern.sub(replacement, tag, 1)
+                else:
+                    log.warn('Referenced image %s was not recognized as a '
+                            'valid image in %s' % (num, tag))
+
+            # process links to fonts
+            for m in font_index_pattern.finditer(tag):
+                num = int(m.group(1), 32)
+                href = resource_map[num-1]
+                if href is None:
+                    log.warn('Referenced font %s was not recognized as a '
+                            'valid font in %s' % (num, tag))
+                else:
+                    replacement = '"%s"'%('../'+ href)
+                    tag = font_index_pattern.sub(replacement, tag, 1)
+
+            # process links to other css pieces
+            for m in url_css_index_pattern.finditer(tag):
+                num = int(m.group(1), 32)
+                fi = mr.flowinfo[num]
+                replacement = '"../' + fi.dir + '/' + fi.fname + '"'
+                tag = url_css_index_pattern.sub(replacement, tag, 1)
+
+            srcpieces[j] = tag
+        flow = "".join(srcpieces)
+
+        # flow pattern not inside url()
+        srcpieces = re.split(tag_pattern, flow)
+        for j in range(1, len(srcpieces), 2):
+            tag = srcpieces[j]
+            if tag.startswith('<'):
+                for m in re.finditer(flow_pattern, tag):
+                    num = int(m.group(1), 32)
+                    fi = mr.flowinfo[num]
+                    if fi.format == 'inline':
+                        flowtext = mr.flows[num]
+                        tag = flowtext
+                    else:
+                        replacement = '"../' + fi.dir + '/' + fi.fname + '"'
+                        tag = flow_pattern.sub(replacement, tag, 1)
+                srcpieces[j] = tag
+        flow = "".join(srcpieces)
+
+        flows.append(flow)
+
+    # All flows are now unicode and have links resolved
+    return flows
+
+def insert_flows_into_markup(parts, flows, mobi8_reader):
+    mr = mobi8_reader
+
+    # kindle:flow:XXXX?mime=YYYY/ZZZ (used for style sheets, svg images, etc)
+    tag_pattern = re.compile(r'''(<[^>]*>)''')
+    flow_pattern = re.compile(r'''['"]kindle:flow:([0-9|A-V]+)\?mime=([^'"]+)['"]''', re.IGNORECASE)
+    for i in xrange(len(parts)):
+        part = parts[i]
+
+        # flow pattern
+        srcpieces = tag_pattern.split(part)
+        for j in range(1, len(srcpieces),2):
+            tag = srcpieces[j]
+            if tag.startswith('<'):
+                for m in flow_pattern.finditer(tag):
+                    num = int(m.group(1), 32)
+                    fi = mr.flowinfo[num]
+                    if fi.format == 'inline':
+                        tag = flows[num]
+                    else:
+                        replacement = '"../' + fi.dir + '/' + fi.fname + '"'
+                        tag = flow_pattern.sub(replacement, tag, 1)
+                srcpieces[j] = tag
+        part = "".join(srcpieces)
+        # store away modified version
+        parts[i] = part
+
+def insert_images_into_markup(parts, resource_map, log):
+    # Handle any embedded raster images links in the xhtml text
+    # kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images)
+    img_pattern = re.compile(r'''(<[img\s|image\s][^>]*>)''', re.IGNORECASE)
+    img_index_pattern = re.compile(r'''['"]kindle:embed:([0-9|A-V]+)[^'"]*['"]''')
+    for i in xrange(len(parts)):
+        part = parts[i]
+        #[partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i]
+
+        # links to raster image files
+        # image_pattern
+        srcpieces = img_pattern.split(part)
+        for j in range(1, len(srcpieces), 2):
+            tag = srcpieces[j]
+            if tag.startswith('<im'):
+                for m in img_index_pattern.finditer(tag):
+                    num = int(m.group(1), 32)
+                    href = resource_map[num-1]
+                    if href:
+                        replacement = '"%s"'%('../' + href)
+                        tag = img_index_pattern.sub(replacement, tag, 1)
+                    else:
+                        log.warn('Referenced image %s was not recognized as '
+                                'a valid image in %s' % (num, tag))
+                srcpieces[j] = tag
+        part = "".join(srcpieces)
+        # store away modified version
+        parts[i] = part
+
+def upshift_markup(parts):
+    tag_pattern = re.compile(r'''(<(svg)[^>]*>)''', re.IGNORECASE)
+
+    for i in xrange(len(parts)):
+        part = parts[i]
+
+        # tag pattern
+        srcpieces = re.split(tag_pattern, part)
+        for j in range(1, len(srcpieces),2):
+            tag = srcpieces[j]
+            if tag[:4].lower() == '<svg':
+                tag = tag.replace('preserveaspectratio','preserveAspectRatio')
+                tag = tag.replace('viewbox','viewBox')
+            srcpieces[j] = tag
+        part = "".join(srcpieces)
+        # store away modified version
+        parts[i] = part
+
+def expand_mobi8_markup(mobi8_reader, resource_map, log):
+    # First update all internal links that are based on offsets
+    parts = update_internal_links(mobi8_reader)
+
+    # Remove pointless markup inserted by kindlegen
+    remove_kindlegen_markup(parts)
+
+    # Handle substitutions for the flows pieces first as they may
+    # be inlined into the xhtml text
+    flows = update_flow_links(mobi8_reader, resource_map, log)
+
+    # Insert inline flows into the markup
+    insert_flows_into_markup(parts, flows, mobi8_reader)
+
+    # Insert raster images into markup
+    insert_images_into_markup(parts, resource_map, log)
+
+    # Perform general markup cleanups
+    upshift_markup(parts)
+
+    # Update the parts and flows stored in the reader
+    mobi8_reader.parts = parts
+    mobi8_reader.flows = flows
+
+    # write out the parts and file flows
+    os.mkdir('text') # directory containing all parts
+    spine = []
+    for i, part in enumerate(parts):
+        pi = mobi8_reader.partinfo[i]
+        with open(os.path.join(pi.type, pi.filename), 'wb') as f:
+            f.write(part.encode('utf-8'))
+        spine.append(f.name)
+
+    for i, flow in enumerate(flows):
+        fi = mobi8_reader.flowinfo[i]
+        if fi.format == 'file':
+            if not os.path.exists(fi.dir):
+                os.mkdir(fi.dir)
+            with open(os.path.join(fi.dir, fi.fname), 'wb') as f:
+                f.write(flow.encode('utf-8'))
+
+    return spine
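The kindle:pos:fid, kindle:flow and kindle:embed URLs above encode numbers in base 32 with the digit set 0-9A-V, which is why every handler calls int(m.group(1), 32); for example int('000V', 32) == 31 and int('0010', 32) == 32. Note also the resource_map[num-1] lookups: kindle:embed indices are 1-based.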
@@ -1,10 +1,12 @@
-__license__ = 'GPL v3'
-__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
-'''
-Read data from .mobi files
-'''
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (absolute_import, print_function)
 
-import shutil, os, re, struct, textwrap, cStringIO, sys
+__license__ = 'GPL v3'
+__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import shutil, os, re, struct, textwrap, cStringIO
 
 try:
     from PIL import Image as PILImage
@@ -14,235 +16,22 @@ except ImportError:
 
 from lxml import html, etree
 
-from calibre import xml_entity_to_unicode, CurrentDir, entity_to_unicode, \
-        replace_entities
+from calibre import (xml_entity_to_unicode, entity_to_unicode)
 from calibre.utils.filenames import ascii_filename
-from calibre.utils.date import parse_date
 from calibre.utils.cleantext import clean_ascii_chars
-from calibre.ptempfile import TemporaryDirectory
 from calibre.ebooks import DRMError, unit_convert
 from calibre.ebooks.chardet import ENCODING_PATS
 from calibre.ebooks.mobi import MobiError
 from calibre.ebooks.mobi.huffcdic import HuffReader
-from calibre.ebooks.mobi.langcodes import main_language, sub_language, mobi2iana
 from calibre.ebooks.compression.palmdoc import decompress_doc
 from calibre.ebooks.metadata import MetaInformation
 from calibre.ebooks.metadata.opf2 import OPFCreator, OPF
 from calibre.ebooks.metadata.toc import TOC
+from calibre.ebooks.mobi.reader.headers import BookHeader
 
 class TopazError(ValueError):
     pass
 
-class EXTHHeader(object):
-
-    def __init__(self, raw, codec, title):
-        self.doctype = raw[:4]
-        self.length, self.num_items = struct.unpack('>LL', raw[4:12])
-        raw = raw[12:]
-        pos = 0
-        self.mi = MetaInformation(_('Unknown'), [_('Unknown')])
-        self.has_fake_cover = True
-        left = self.num_items
-
-        while left > 0:
-            left -= 1
-            id, size = struct.unpack('>LL', raw[pos:pos + 8])
-            content = raw[pos + 8:pos + size]
-            pos += size
-            if id >= 100 and id < 200:
-                self.process_metadata(id, content, codec)
-            elif id == 203:
-                self.has_fake_cover = bool(struct.unpack('>L', content)[0])
-            elif id == 201:
-                co, = struct.unpack('>L', content)
-                if co < 1e7:
-                    self.cover_offset = co
-            elif id == 202:
-                self.thumbnail_offset, = struct.unpack('>L', content)
-            elif id == 501:
-                # cdetype
-                pass
-            elif id == 502:
-                # last update time
-                pass
-            elif id == 503: # Long title
-                # Amazon seems to regard this as the definitive book title
-                # rather than the title from the PDB header. In fact when
-                # sending MOBI files through Amazon's email service if the
-                # title contains non ASCII chars or non filename safe chars
-                # they are messed up in the PDB header
-                try:
-                    title = content.decode(codec)
-                except:
-                    pass
-            #else:
-            #    print 'unknown record', id, repr(content)
-        if title:
-            self.mi.title = replace_entities(title)
-
-    def process_metadata(self, id, content, codec):
-        if id == 100:
-            if self.mi.authors == [_('Unknown')]:
-                self.mi.authors = []
-            au = content.decode(codec, 'ignore').strip()
-            self.mi.authors.append(au)
-            if re.match(r'\S+?\s*,\s+\S+', au.strip()):
-                self.mi.author_sort = au.strip()
-        elif id == 101:
-            self.mi.publisher = content.decode(codec, 'ignore').strip()
-        elif id == 103:
-            self.mi.comments = content.decode(codec, 'ignore')
-        elif id == 104:
-            self.mi.isbn = content.decode(codec, 'ignore').strip().replace('-', '')
-        elif id == 105:
-            if not self.mi.tags:
-                self.mi.tags = []
-            self.mi.tags.extend([x.strip() for x in content.decode(codec,
-                'ignore').split(';')])
-            self.mi.tags = list(set(self.mi.tags))
-        elif id == 106:
-            try:
-                self.mi.pubdate = parse_date(content, as_utc=False)
-            except:
-                pass
-        elif id == 108:
-            pass # Producer
-        elif id == 113:
-            pass # ASIN or UUID
-        #else:
-        #    print 'unhandled metadata record', id, repr(content)
-
-
-class BookHeader(object):
-
-    def __init__(self, raw, ident, user_encoding, log, try_extra_data_fix=False):
-        self.log = log
-        self.compression_type = raw[:2]
-        self.records, self.records_size = struct.unpack('>HH', raw[8:12])
-        self.encryption_type, = struct.unpack('>H', raw[12:14])
-        if ident == 'TEXTREAD':
-            self.codepage = 1252
-        if len(raw) <= 16:
-            self.codec = 'cp1252'
-            self.extra_flags = 0
-            self.title = _('Unknown')
-            self.language = 'ENGLISH'
-            self.sublanguage = 'NEUTRAL'
-            self.exth_flag, self.exth = 0, None
-            self.ancient = True
-            self.first_image_index = -1
-            self.mobi_version = 1
-        else:
-            self.ancient = False
-            self.doctype = raw[16:20]
-            self.length, self.type, self.codepage, self.unique_id, \
-                self.version = struct.unpack('>LLLLL', raw[20:40])
-
-            try:
-                self.codec = {
-                    1252: 'cp1252',
-                    65001: 'utf-8',
-                    }[self.codepage]
-            except (IndexError, KeyError):
-                self.codec = 'cp1252' if not user_encoding else user_encoding
-                log.warn('Unknown codepage %d. Assuming %s' % (self.codepage,
-                    self.codec))
-            # There exists some broken DRM removal tool that removes DRM but
-            # leaves the DRM fields in the header yielding a header size of
-            # 0xF8. The actual value of max_header_length should be 0xE8 but
-            # it's changed to accommodate this silly tool. Hopefully that will
-            # not break anything else.
-            max_header_length = 0xF8
-
-            if (ident == 'TEXTREAD' or self.length < 0xE4 or
-                    self.length > max_header_length or
-                    (try_extra_data_fix and self.length == 0xE4)):
-                self.extra_flags = 0
-            else:
-                self.extra_flags, = struct.unpack('>H', raw[0xF2:0xF4])
-
-            if self.compression_type == 'DH':
-                self.huff_offset, self.huff_number = struct.unpack('>LL', raw[0x70:0x78])
-
-            toff, tlen = struct.unpack('>II', raw[0x54:0x5c])
-            tend = toff + tlen
-            self.title = raw[toff:tend] if tend < len(raw) else _('Unknown')
-            langcode = struct.unpack('!L', raw[0x5C:0x60])[0]
-            langid = langcode & 0xFF
-            sublangid = (langcode >> 10) & 0xFF
-            self.language = main_language.get(langid, 'ENGLISH')
-            self.sublanguage = sub_language.get(sublangid, 'NEUTRAL')
-            self.mobi_version = struct.unpack('>I', raw[0x68:0x6c])[0]
-            self.first_image_index = struct.unpack('>L', raw[0x6c:0x6c + 4])[0]
-
-            self.exth_flag, = struct.unpack('>L', raw[0x80:0x84])
-            self.exth = None
-            if not isinstance(self.title, unicode):
-                self.title = self.title.decode(self.codec, 'replace')
-            if self.exth_flag & 0x40:
-                try:
-                    self.exth = EXTHHeader(raw[16 + self.length:], self.codec, self.title)
-                    self.exth.mi.uid = self.unique_id
-                    try:
-                        self.exth.mi.language = mobi2iana(langid, sublangid)
-                    except:
-                        self.log.exception('Unknown language code')
-                except:
-                    self.log.exception('Invalid EXTH header')
-                    self.exth_flag = 0
-
-
-class MetadataHeader(BookHeader):
-    def __init__(self, stream, log):
-        self.stream = stream
-        self.ident = self.identity()
-        self.num_sections = self.section_count()
-        if self.num_sections >= 2:
-            header = self.header()
-            BookHeader.__init__(self, header, self.ident, None, log)
-        else:
-            self.exth = None
-
-    def identity(self):
-        self.stream.seek(60)
-        ident = self.stream.read(8).upper()
-        if ident not in ['BOOKMOBI', 'TEXTREAD']:
-            raise MobiError('Unknown book type: %s' % ident)
-        return ident
-
-    def section_count(self):
-        self.stream.seek(76)
-        return struct.unpack('>H', self.stream.read(2))[0]
-
-    def section_offset(self, number):
-        self.stream.seek(78 + number * 8)
-        return struct.unpack('>LBBBB', self.stream.read(8))[0]
-
-    def header(self):
-        section_headers = []
-        # First section with the metadata
-        section_headers.append(self.section_offset(0))
-        # Second section used to get the lengh of the first
-        section_headers.append(self.section_offset(1))
-
-        end_off = section_headers[1]
-        off = section_headers[0]
-        self.stream.seek(off)
-        return self.stream.read(end_off - off)
-
-    def section_data(self, number):
-        start = self.section_offset(number)
-        if number == self.num_sections -1:
-            end = os.stat(self.stream.name).st_size
-        else:
-            end = self.section_offset(number + 1)
-        self.stream.seek(start)
-        try:
-            return self.stream.read(end - start)
-        except OverflowError:
-            return self.stream.read(os.stat(self.stream.name).st_size - start)
-
 
 class MobiReader(object):
     PAGE_BREAK_PAT = re.compile(
         r'<\s*/{0,1}\s*mbp:pagebreak((?:\s+[^/>]*){0,1})/{0,1}\s*>\s*(?:<\s*/{0,1}\s*mbp:pagebreak\s*/{0,1}\s*>)*',
@@ -312,15 +101,46 @@ class MobiReader(object):
             self.sections.append((section(i), self.section_headers[i]))
 
-        self.book_header = BookHeader(self.sections[0][0], self.ident,
+        self.book_header = bh = BookHeader(self.sections[0][0], self.ident,
             user_encoding, self.log, try_extra_data_fix=try_extra_data_fix)
         self.name = self.name.decode(self.book_header.codec, 'replace')
+        self.kf8_type = None
+        is_kf8 = self.book_header.mobi_version == 8
+        if is_kf8:
+            self.kf8_type = 'standalone'
+        else: # Check for joint mobi 6 and kf 8 file
+            KF8_BOUNDARY = b'BOUNDARY'
+            for i, x in enumerate(self.sections[:-1]):
+                sec = x[0]
+                if (len(sec) == len(KF8_BOUNDARY) and sec ==
+                        KF8_BOUNDARY):
+                    try:
+                        self.book_header = BookHeader(self.sections[i+1][0],
+                                self.ident, user_encoding, self.log)
+                        # The following are only correct in the Mobi 6
+                        # header not the Mobi 8 header
+                        for x in ('first_image_index',):
+                            setattr(self.book_header, x, getattr(bh, x))
+                        self.book_header.huff_offset += i + 1
+                        self.kf8_type = 'joint'
+                        self.kf8_boundary = i
+                    except:
+                        pass
+                    break
+
+    def check_for_drm(self):
+        if self.book_header.encryption_type != 0:
+            try:
+                name = self.book_header.exth.mi.title
+            except:
+                name = self.name
+            if not name:
+                name = self.name
+            raise DRMError(name)
+
     def extract_content(self, output_dir, parse_cache):
         output_dir = os.path.abspath(output_dir)
-        if self.book_header.encryption_type != 0:
-            raise DRMError(self.name)
+        self.check_for_drm()
 
         processed_records = self.extract_text()
         if self.debug is not None:
             parse_cache['calibre_raw_mobi_markup'] = self.mobi_html
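For a joint file the loop above implies this PalmDB record layout, with b = kf8_boundary:

    sections[0]        MOBI 6 book header
    sections[1..b-1]   MOBI 6 text and resource records
    sections[b]        the literal marker b'BOUNDARY'
    sections[b+1]      KF8 (MOBI 8) book header
    sections[b+2..]    KF8 text records, indices and resources

Only first_image_index is copied over from the MOBI 6 header because, per the comment, that field is not reliable in the KF8 header of a joint file.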
@@ -916,11 +736,12 @@ class MobiReader(object):
         trail_size = self.sizeof_trailing_entries(data)
         return data[:len(data)-trail_size]
 
-    def extract_text(self):
+    def extract_text(self, offset=1):
         self.log.debug('Extracting text...')
-        text_sections = [self.text_section(i) for i in range(1,
-            min(self.book_header.records + 1, len(self.sections)))]
-        processed_records = list(range(0, self.book_header.records + 1))
+        text_sections = [self.text_section(i) for i in xrange(offset,
+            min(self.book_header.records + offset, len(self.sections)))]
+        processed_records = list(range(offset-1, self.book_header.records +
+            offset))
 
         self.mobi_html = ''
 
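The new offset parameter is what lets the KF8 reader reuse this method: plain files keep the old behaviour with offset=1 (text in sections 1 through records), while mobi8.py calls extract_text(offset=kf8_boundary + 2) so the text records are taken from after the BOUNDARY marker. For example, with kf8_boundary == 150 and records == 200, the KF8 text comes from sections 152 through 351.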
@@ -1027,63 +848,6 @@ class MobiReader(object):
         self.image_names.append(os.path.basename(path))
         im.save(open(path, 'wb'), format='JPEG')
 
-def get_metadata(stream):
-    stream.seek(0)
-    try:
-        raw = stream.read(3)
-    except:
-        raw = ''
-    stream.seek(0)
-    if raw == 'TPZ':
-        from calibre.ebooks.metadata.topaz import get_metadata
-        return get_metadata(stream)
-    from calibre.utils.logging import Log
-    log = Log()
-    try:
-        mi = MetaInformation(os.path.basename(stream.name), [_('Unknown')])
-    except:
-        mi = MetaInformation(_('Unknown'), [_('Unknown')])
-    mh = MetadataHeader(stream, log)
-    if mh.title and mh.title != _('Unknown'):
-        mi.title = mh.title
-
-    if mh.exth is not None:
-        if mh.exth.mi is not None:
-            mi = mh.exth.mi
-    else:
-        size = sys.maxint
-        if hasattr(stream, 'seek') and hasattr(stream, 'tell'):
-            pos = stream.tell()
-            stream.seek(0, 2)
-            size = stream.tell()
-            stream.seek(pos)
-        if size < 4*1024*1024:
-            with TemporaryDirectory('_mobi_meta_reader') as tdir:
-                with CurrentDir(tdir):
-                    mr = MobiReader(stream, log)
-                    parse_cache = {}
-                    mr.extract_content(tdir, parse_cache)
-                    if mr.embedded_mi is not None:
-                        mi = mr.embedded_mi
-    if hasattr(mh.exth, 'cover_offset'):
-        cover_index = mh.first_image_index + mh.exth.cover_offset
-        data = mh.section_data(int(cover_index))
-    else:
-        try:
-            data = mh.section_data(mh.first_image_index)
-        except:
-            data = ''
-    buf = cStringIO.StringIO(data)
-    try:
-        im = PILImage.open(buf)
-    except:
-        log.exception('Failed to read MOBI cover')
-    else:
-        obuf = cStringIO.StringIO()
-        im.convert('RGB').save(obuf, format='JPEG')
-        mi.cover_data = ('jpg', obuf.getvalue())
-    return mi
-
 def test_mbp_regex():
     for raw, m in {
         '<mbp:pagebreak></mbp:pagebreak>':'',
src/calibre/ebooks/mobi/reader/mobi8.py (new file, 390 lines)
@@ -0,0 +1,390 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
        print_function)

__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import struct, re, os, zlib, imghdr
from collections import namedtuple
from itertools import repeat

from calibre.ebooks.mobi.reader.headers import NULL_INDEX
from calibre.ebooks.mobi.reader.index import read_index
from calibre.ebooks.mobi.reader.ncx import read_ncx, build_toc
from calibre.ebooks.mobi.reader.markup import expand_mobi8_markup
from calibre.ebooks.metadata.opf2 import Guide, OPFCreator

Part = namedtuple('Part',
    'num type filename start end aid')

Elem = namedtuple('Elem',
    'insert_pos toc_text file_number sequence_number start_pos '
    'length')

FlowInfo = namedtuple('FlowInfo',
    'type format dir fname')

class Mobi8Reader(object):

    def __init__(self, mobi6_reader, log):
        self.mobi6_reader, self.log = mobi6_reader, log
        self.header = mobi6_reader.book_header

    def __call__(self):
        self.mobi6_reader.check_for_drm()
        offset = 1
        res_end = len(self.mobi6_reader.sections)
        if self.mobi6_reader.kf8_type == 'joint':
            offset = self.mobi6_reader.kf8_boundary + 2
            res_end = self.mobi6_reader.kf8_boundary

        self.processed_records = self.mobi6_reader.extract_text(offset=offset)
        self.raw_ml = self.mobi6_reader.mobi_html
        with open('debug-raw.html', 'wb') as f:
            f.write(self.raw_ml)

        self.kf8_sections = self.mobi6_reader.sections[offset-1:]
        first_resource_index = self.header.first_image_index
        if first_resource_index in {-1, NULL_INDEX}:
            first_resource_index = self.header.records + 1
        self.resource_sections = \
                self.mobi6_reader.sections[first_resource_index:res_end]
        self.cover_offset = getattr(self.header.exth, 'cover_offset', None)

        self.read_indices()
        self.build_parts()
        guide = self.create_guide()
        ncx = self.create_ncx()
        resource_map = self.extract_resources()
        spine = self.expand_text(resource_map)
        return self.write_opf(guide, ncx, spine, resource_map)

    def read_indices(self):
        self.flow_table = (0, NULL_INDEX)

        if self.header.fdstidx != NULL_INDEX:
            header = self.kf8_sections[self.header.fdstidx][0]
            if header[:4] != b'FDST':
                raise ValueError('KF8 does not have a valid FDST record')
            num_sections, = struct.unpack_from(b'>L', header, 0x08)
            sections = header[0x0c:]
            self.flow_table = struct.unpack_from(b'>%dL' % (num_sections*2),
                    sections, 0)[::2] + (NULL_INDEX,)

        self.files = []
        if self.header.skelidx != NULL_INDEX:
            table = read_index(self.kf8_sections, self.header.skelidx,
                    self.header.codec)[0]
            File = namedtuple('File',
                'file_number name divtbl_count start_position length')

            for i, text in enumerate(table.iterkeys()):
                tag_map = table[text]
                self.files.append(File(i, text, tag_map[1][0],
                    tag_map[6][0], tag_map[6][1]))

        self.elems = []
        if self.header.dividx != NULL_INDEX:
            table, cncx = read_index(self.kf8_sections, self.header.dividx,
                    self.header.codec)
            for i, text in enumerate(table.iterkeys()):
                tag_map = table[text]
                toc_text = cncx[tag_map[2][0]]
                self.elems.append(Elem(int(text), toc_text, tag_map[3][0],
                    tag_map[4][0], tag_map[6][0], tag_map[6][1]))

        self.guide = []
        if self.header.othidx != NULL_INDEX:
            table, cncx = read_index(self.kf8_sections, self.header.othidx,
                    self.header.codec)
            Item = namedtuple('Item',
                'type title div_frag_num')

            for i, ref_type in enumerate(table.iterkeys()):
                tag_map = table[ref_type]
                # ref_type, ref_title, div/frag number
                title = cncx[tag_map[1][0]]
                fileno = None
                if 3 in tag_map.keys():
                    fileno = tag_map[3][0]
                if 6 in tag_map.keys():
                    fileno = tag_map[6][0]
                self.guide.append(Item(ref_type.decode(self.header.codec),
                    title, fileno))

    def build_parts(self):
        raw_ml = self.mobi6_reader.mobi_html
        self.flows = []
        self.flowinfo = []

        # now split the raw_ml into its flow pieces
        for j in xrange(0, len(self.flow_table)-1):
            start = self.flow_table[j]
            end = self.flow_table[j+1]
            if end == NULL_INDEX:
                end = len(raw_ml)
            self.flows.append(raw_ml[start:end])

        # the first piece represents the xhtml text
        text = self.flows[0]
        self.flows[0] = b''

        # walk the <skeleton> and <div> tables to build the original source
        # xhtml files *without* destroying any file position information
        # needed for later href processing, and create the final list of file
        # separation start:stop points etc. in partinfo
        self.parts = []
        self.partinfo = []
        divptr = 0
        baseptr = 0
        for skelnum, skelname, divcnt, skelpos, skellen in self.files:
            baseptr = skelpos + skellen
            skeleton = text[skelpos:baseptr]
            for i in xrange(divcnt):
                insertpos, idtext, filenum, seqnum, startpos, length = \
                                    self.elems[divptr]
                if i == 0:
                    aidtext = idtext[12:-2]
                    filename = 'part%04d.html' % filenum
                part = text[baseptr:baseptr + length]
                insertpos = insertpos - skelpos
                skeleton = skeleton[0:insertpos] + part + skeleton[insertpos:]
                baseptr = baseptr + length
                divptr += 1
            self.parts.append(skeleton)
            self.partinfo.append(Part(skelnum, 'text', filename, skelpos,
                baseptr, aidtext))

        # The primary css style sheet is typically stored next, followed by
        # any snippets of code that were previously inlined in the
        # original xhtml but have been stripped out and placed here.
        # This can include local CDATA snippets and svg sections.

        # The problem is that for most browsers and ereaders, you cannot
        # use <img src="imageXXXX.svg" /> to import any svg image that itself
        # properly uses an <image/> tag to import some raster image - it
        # should work according to the spec but does not for almost all
        # browsers and ereaders, and causes epub validation issues because
        # those raster images are in the manifest but not in the xhtml text -
        # since they are only referenced from an svg image

        # So we need to check the remaining flow pieces to see if they are css
        # or svg images. if svg images, we must check if they have an <image/>
        # and if so inline them into the xhtml text pieces.

        # there may be other sorts of pieces stored here but until we see one
        # in the wild to reverse engineer we won't be able to tell

        self.flowinfo.append(FlowInfo(None, None, None, None))
        svg_tag_pattern = re.compile(br'''(<svg[^>]*>)''', re.IGNORECASE)
        image_tag_pattern = re.compile(br'''(<image[^>]*>)''', re.IGNORECASE)
        for j in xrange(1, len(self.flows)):
            flowpart = self.flows[j]
            nstr = '%04d' % j
            m = svg_tag_pattern.search(flowpart)
            if m != None:
                # svg
                typ = 'svg'
                start = m.start()
                m2 = image_tag_pattern.search(flowpart)
                if m2 != None:
                    format = 'inline'
                    dir = None
                    fname = None
                    # strip off anything before <svg if inlining
                    flowpart = flowpart[start:]
                else:
                    format = 'file'
                    dir = "images"
                    fname = 'svgimg' + nstr + '.svg'
            else:
                # search for CDATA and if it exists inline it
                if flowpart.find('[CDATA[') >= 0:
                    typ = 'css'
                    flowpart = '<style type="text/css">\n' + flowpart + '\n</style>\n'
                    format = 'inline'
                    dir = None
                    fname = None
                else:
                    # css - assume a standalone css file
                    typ = 'css'
                    format = 'file'
                    dir = "styles"
                    fname = nstr + '.css'

            self.flows[j] = flowpart
            self.flowinfo.append(FlowInfo(typ, format, dir, fname))

    def get_file_info(self, pos):
        ''' Get information about the part (file) that exists at pos in
        the raw markup '''
        for part in self.partinfo:
            if pos >= part.start and pos < part.end:
                return part
        return Part(*repeat(None, len(Part._fields)))

    def get_id_tag_by_pos_fid(self, posfid, offset):
        # first convert kindle:pos:fid and offset info to a position in file
        row = int(posfid, 32)
        off = int(offset, 32)
        [insertpos, idtext, filenum, seqnm, startpos, length] = self.elems[row]
        pos = insertpos + off
        fname = self.get_file_info(pos).filename
        # an existing "id=" must exist in original xhtml otherwise it would not
        # have worked for linking. Amazon seems to have added its own
        # additional "aid=" inside tags whose contents seem to represent some
        # position information encoded into a Base32 name.

        # so find the closest "id=" before the position, by actually
        # searching in that file
        idtext = self.get_id_tag(pos)
        return fname, idtext

    def get_id_tag(self, pos):
        # find the correct tag by actually searching in the destination
        # textblock at position
        fi = self.get_file_info(pos)
        if fi.num is None and fi.start is None:
            raise ValueError('No file contains pos: %d'%pos)
        textblock = self.parts[fi.num]
        id_map = []
        npos = pos - fi.start
        # if npos is inside a tag then search all text before its end-of-tag
        # marker
        pgt = textblock.find(b'>', npos)
        plt = textblock.find(b'<', npos)
        if pgt < plt:
            npos = pgt + 1
        # find id links only inside of tags
        # inside any < > pair find all "id=" and return whatever is inside
        # the quotes
        id_pattern = re.compile(br'''<[^>]*\sid\s*=\s*['"]([^'"]*)['"][^>]*>''',
                re.IGNORECASE)
        for m in re.finditer(id_pattern, textblock):
            id_map.append((m.start(), m.group(1)))

        if not id_map:
            # Found no id in the textblock, link must be to top of file
            return b''
        # if npos is before the first id= inside a tag, return the first
        if npos < id_map[0][0]:
            return id_map[0][1]
        # if npos is after the last id= inside a tag, return the last
        if npos > id_map[-1][0]:
            return id_map[-1][1]
        # otherwise find the last id before npos
        for i, item in enumerate(id_map):
            if npos < item[0]:
                return id_map[i-1][1]
        return id_map[0][1]

    def create_guide(self):
        guide = Guide()
        for ref_type, ref_title, fileno in self.guide:
            elem = self.elems[fileno]
            fi = self.get_file_info(elem.insert_pos)
            idtext = self.get_id_tag(elem.insert_pos).decode(self.header.codec)
            linktgt = fi.filename
            if idtext:
                linktgt += b'#' + idtext
            g = Guide.Reference('%s/%s'%(fi.type, linktgt), os.getcwdu())
            g.title, g.type = ref_title, ref_type
            guide.append(g)

        so = self.header.exth.start_offset
        if so not in {None, NULL_INDEX}:
            fi = self.get_file_info(so)
            if fi.filename is not None:
                idtext = self.get_id_tag(so).decode(self.header.codec)
                linktgt = fi.filename
                if idtext:
                    linktgt += '#' + idtext
                g = Guide.Reference('%s/%s'%(fi.type, linktgt), os.getcwdu())
                g.title, g.type = 'start', 'text'
                guide.append(g)

        return guide

    def create_ncx(self):
        index_entries = read_ncx(self.kf8_sections, self.header.ncxidx,
                self.header.codec)

        # Add href and anchor info to the index entries
        for entry in index_entries:
            pos = entry['pos']
            fi = self.get_file_info(pos)
            if fi.filename is None:
                raise ValueError('Index entry has invalid pos: %d'%pos)
            idtag = self.get_id_tag(pos).decode(self.header.codec)
            entry['href'] = '%s/%s'%(fi.type, fi.filename)
            entry['idtag'] = idtag

        # Build the TOC object
        return build_toc(index_entries)

    def extract_resources(self):
        resource_map = []
        for x in ('fonts', 'images'):
            os.mkdir(x)

        for i, sec in enumerate(self.resource_sections):
            fname_idx = i+1
            data = sec[0]
            typ = data[:4]
            href = None
            if typ in {b'FLIS', b'FCIS', b'SRCS', b'\xe9\x8e\r\n',
                    b'RESC', b'BOUN', b'FDST', b'DATP', b'AUDI', b'VIDE'}:
                pass # Ignore these records
            elif typ == b'FONT':
                # fonts only exist in K8 ebooks
                # Format:
                # bytes  0 -  3: 'FONT'
                # bytes  4 -  7: ?? Expanded size in bytes ??
                # bytes  8 - 11: ?? number of files ??
                # bytes 12 - 15: ?? offset to start of compressed data ?? (typically 0x00000018 = 24)
                # bytes 16 - 23: ?? typically all 0x00 ?? Are these compression flags from zlib?
                # The compressed data begins with 2 bytes of header and has 4
                # bytes of checksum at the end
                data = data[26:-4]
                uncompressed_data = zlib.decompress(data, -15)
                hdr = uncompressed_data[0:4]
                ext = 'dat'
                if hdr == b'\0\1\0\0' or hdr == b'true' or hdr == b'ttcf':
                    ext = 'ttf'
                href = "fonts/%05d.%s" % (fname_idx, ext)
                with open(href.replace('/', os.sep), 'wb') as f:
                    f.write(uncompressed_data)
            else:
                imgtype = imghdr.what(None, data)
                if imgtype is None:
                    imgtype = 'unknown'
                href = 'images/%05d.%s'%(fname_idx, imgtype)
                with open(href.replace('/', os.sep), 'wb') as f:
                    f.write(data)

            resource_map.append(href)

        return resource_map

    def expand_text(self, resource_map):
        return expand_mobi8_markup(self, resource_map, self.log)

    def write_opf(self, guide, toc, spine, resource_map):
        mi = self.header.exth.mi
        if (self.cover_offset is not None and self.cover_offset <
                len(resource_map)):
            mi.cover = resource_map[self.cover_offset]

        opf = OPFCreator(os.getcwdu(), mi)
        opf.guide = guide
        opf.create_manifest_from_files_in([os.getcwdu()])
        opf.create_spine(spine)
        opf.set_toc(toc)

        with open('metadata.opf', 'wb') as of, open('toc.ncx', 'wb') as ncx:
            opf.render(of, ncx, 'toc.ncx')
        return 'metadata.opf'
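
The FONT record layout documented inline in `extract_resources()` is easy to exercise on its own. A self-contained sketch under the same assumptions the comments above make (payload begins at byte 26, after the 24-byte record header and a 2-byte zlib stream header; 4 checksum bytes trail it); `decode_font_record` is a hypothetical helper, not calibre API:

    import zlib

    def decode_font_record(data):
        # Unpack one FONT record as described in the comments above.
        if data[:4] != b'FONT':
            raise ValueError('Not a FONT record')
        # Skip the 24-byte record header plus the 2-byte zlib stream header,
        # drop the 4 trailing checksum bytes, inflate as raw deflate data.
        payload = zlib.decompress(data[26:-4], -15)
        # Sniff the font container from its first four bytes.
        ext = 'ttf' if payload[:4] in (b'\0\1\0\0', b'true', b'ttcf') else 'dat'
        return payload, ext
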
src/calibre/ebooks/mobi/reader/ncx.py (new file, 84 lines)
@@ -0,0 +1,84 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
        print_function)

__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import os

from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.mobi.utils import to_base
from calibre.ebooks.mobi.reader.headers import NULL_INDEX
from calibre.ebooks.mobi.reader.index import read_index

tag_fieldname_map = {
    1: ['pos',0],
    2: ['len',0],
    3: ['noffs',0],
    4: ['hlvl',0],
    5: ['koffs',0],
    6: ['pos_fid',0],
    21: ['parent',0],
    22: ['child1',0],
    23: ['childn',0]
}

def read_ncx(sections, index, codec):
    index_entries = []

    if index != NULL_INDEX:
        table, cncx = read_index(sections, index, codec)

        for num, x in enumerate(table.iteritems()):
            text, tag_map = x
            entry = {
                'name': text,
                'pos': -1,
                'len': 0,
                'noffs': -1,
                'text' : "Unknown Text",
                'hlvl' : -1,
                'kind' : "Unknown Kind",
                'pos_fid' : None,
                'parent' : -1,
                'child1' : -1,
                'childn' : -1,
                'num' : num
            }

            for tag in tag_fieldname_map.keys():
                fieldname, i = tag_fieldname_map[tag]
                if tag in tag_map:
                    fieldvalue = tag_map[tag][i]
                    if tag == 6:
                        fieldvalue = to_base(fieldvalue, base=32)
                    entry[fieldname] = fieldvalue
                    if tag == 3:
                        entry['text'] = cncx.get(fieldvalue, 'Unknown Text')
                    if tag == 5:
                        entry['kind'] = cncx.get(fieldvalue, 'Unknown Kind')
            index_entries.append(entry)

    return index_entries

def build_toc(index_entries):
    ans = TOC(base_path=os.getcwdu())
    levels = {x['hlvl'] for x in index_entries}
    num_map = {-1: ans}
    level_map = {l:[x for x in index_entries if x['hlvl'] == l] for l in
            levels}
    for lvl in sorted(levels):
        for item in level_map[lvl]:
            parent = num_map[item['parent']]
            child = parent.add_item(item['href'], item['idtag'], item['text'])
            num_map[item['num']] = child

    # Set play orders in depth first order
    for i, item in enumerate(ans.flat()):
        item.play_order = i

    return ans
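
To make the two halves concrete: `read_ncx()` flattens each index record's tag_map into a dict keyed by the field names above, and `build_toc()` then nests those dicts using `hlvl` (heading level) and `parent` (the entry number of the parent). A toy run with hand-written entries, all values invented and only the keys `build_toc` actually reads filled in:

    entries = [
        {'num': 0, 'hlvl': 0, 'parent': -1, 'text': 'Chapter 1',
         'href': 'text/part0000.html', 'idtag': 'ch1'},
        {'num': 1, 'hlvl': 1, 'parent': 0, 'text': 'Section 1.1',
         'href': 'text/part0000.html', 'idtag': 'sec1-1'},
    ]
    toc = build_toc(entries)
    # 'Section 1.1' becomes a child of 'Chapter 1'; play_order is then
    # assigned in depth-first order over the finished tree.
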
src/calibre/ebooks/mobi/utils.py
@@ -7,7 +7,7 @@ __license__ = 'GPL v3'
 __copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
-import struct
+import struct, string
 from collections import OrderedDict
 
 from calibre.utils.magick.draw import Image, save_cover_data_to, thumbnail
@@ -340,4 +340,26 @@ def detect_periodical(toc, log=None):
             return False
     return True
+
+def count_set_bits(num):
+    if num < 0:
+        num = -num
+    ans = 0
+    while num > 0:
+        ans += (num & 0b1)
+        num >>= 1
+    return ans
+
+def to_base(num, base=32):
+    digits = string.digits + string.ascii_uppercase
+    sign = 1 if num >= 0 else -1
+    if num == 0: return '0'
+    num *= sign
+    ans = []
+    while num:
+        ans.append(digits[(num % base)])
+        num //= base
+    if sign < 0:
+        ans.append('-')
+    ans.reverse()
+    return ''.join(ans)
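
`to_base()` emits the digit alphabet 0-9, A-V, which is exactly what Python's built-in `int(s, 32)` parses, so the base-32 `kindle:pos:fid` values encoded here round-trip through `get_id_tag_by_pos_fid()` in mobi8.py. A quick sanity check:

    >>> to_base(1000, base=32)   # 1000 == 31*32 + 8, i.e. 'V' then '8'
    'V8'
    >>> int('V8', 32)
    1000
    >>> count_set_bits(0b1011)
    3
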