Start refactoring of MobiWriter

commit 4805fa7c77 (parent 48929a4dbd)
@@ -184,7 +184,12 @@ class MOBIOutput(OutputFormatPlugin):
         mobimlizer(oeb, opts)
         self.check_for_periodical()
         write_page_breaks_after_item = input_plugin is not plugin_for_input_format('cbz')
-        from calibre.ebooks.mobi.writer import MobiWriter
+        from calibre.utils.config import tweaks
+        if tweaks.get('new_mobi_writer', False):
+            from calibre.ebooks.mobi.writer2.main import MobiWriter
+            MobiWriter
+        else:
+            from calibre.ebooks.mobi.writer import MobiWriter
         writer = MobiWriter(opts,
                         write_page_breaks_after_item=write_page_breaks_after_item)
         writer(oeb, output_path)
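
The new writer lives in calibre.ebooks.mobi.writer2 and is opt-in via a tweak,
so the stable writer in calibre.ebooks.mobi.writer stays the default (the bare
MobiWriter expression in the new branch reads like a placeholder while the
refactored class is stubbed in). A minimal sketch of opting in, assuming the
usual calibre tweaks mechanism in which user overrides from tweaks.py end up
in the calibre.utils.config.tweaks dict:

    from calibre.utils.config import tweaks
    # Any missing or falsey value keeps the old MobiWriter code path
    tweaks['new_mobi_writer'] = True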

src/calibre/ebooks/mobi/writer2/__init__.py (new file, 15 lines)
@@ -0,0 +1,15 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
                        print_function)

__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'


UNCOMPRESSED = 1
PALMDOC = 2
HUFFDIC = 17480

PALM_MAX_IMAGE_SIZE = 63 * 1024

src/calibre/ebooks/mobi/writer2/main.py (new file, 579 lines)
@@ -0,0 +1,579 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
                        print_function)

__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import re, random, time
from cStringIO import StringIO
from struct import pack

from calibre.ebooks import normalize
from calibre.ebooks.oeb.base import OEB_RASTER_IMAGES
from calibre.ebooks.mobi.writer2.serializer import Serializer
from calibre.ebooks.compression.palmdoc import compress_doc
from calibre.utils.magick.draw import Image, save_cover_data_to, thumbnail
from calibre.ebooks.mobi.langcodes import iana2mobi
from calibre.utils.filenames import ascii_filename
from calibre.ebooks.mobi.writer2 import PALMDOC, UNCOMPRESSED

EXTH_CODES = {
    'creator': 100,
    'publisher': 101,
    'description': 103,
    'identifier': 104,
    'subject': 105,
    'pubdate': 106,
    'date': 106,
    'review': 107,
    'contributor': 108,
    'rights': 109,
    'type': 111,
    'source': 112,
    'title': 503,
    }

# Disabled as I don't care about uncrossable breaks
WRITE_UNCROSSABLE_BREAKS = False

RECORD_SIZE = 0x1000 # 4096

IMAGE_MAX_SIZE = 10 * 1024 * 1024
MAX_THUMB_SIZE = 16 * 1024
MAX_THUMB_DIMEN = (180, 240)

# Almost like the one for MS LIT, but not quite.
DECINT_FORWARD = 0
DECINT_BACKWARD = 1

def decint(value, direction):
    '''
    Some parts of the Mobipocket format encode data as variable-width integers.
    These integers are represented big-endian with 7 bits per byte in bits 1-7.
    They may be either forward-encoded, in which case only the LSB has bit 8 set,
    or backward-encoded, in which case only the MSB has bit 8 set.
    For example, the number 0x11111 would be represented forward-encoded as:

        0x04 0x22 0x91

    And backward-encoded as:

        0x84 0x22 0x11

    This function encodes the integer ``value`` as a variable width integer and
    returns the bytestring corresponding to it.
    '''
    # Encode vwi
    byts = bytearray()
    while True:
        b = value & 0x7f
        value >>= 7
        byts.append(b)
        if value == 0:
            break
    if direction == DECINT_FORWARD:
        byts[0] |= 0x80
    elif direction == DECINT_BACKWARD:
        byts[-1] |= 0x80
    # Bytes are accumulated LSB first, so reverse them to get the big-endian
    # ordering the docstring describes
    byts.reverse()
    return bytes(byts)

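# Editorial sanity check of the examples in the docstring above (not part of
# the original file): the continuation marker lands on the LSB for forward
# encoding and on the MSB for backward encoding.
assert decint(0x11111, DECINT_FORWARD) == b'\x04\x22\x91'
assert decint(0x11111, DECINT_BACKWARD) == b'\x84\x22\x11'
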
def rescale_image(data, maxsizeb=IMAGE_MAX_SIZE, dimen=None):
    '''
    Convert the image, setting all transparent pixels to white and changing
    the format to JPEG. Ensure the resultant image has a byte size less than
    maxsizeb.

    If dimen is not None, generate a thumbnail of width=dimen, height=dimen

    Returns the image as a bytestring
    '''
    if dimen is not None:
        data = thumbnail(data, width=dimen, height=dimen,
                compression_quality=90)[-1]
    else:
        # Replace transparent pixels with white pixels and convert to JPEG
        data = save_cover_data_to(data, 'img.jpg', return_data=True)
    if len(data) <= maxsizeb:
        return data
    orig_data = data
    img = Image()
    quality = 95

    img.load(data)
    while len(data) >= maxsizeb and quality >= 10:
        quality -= 5
        img.set_compression_quality(quality)
        data = img.export('jpg')
    if len(data) <= maxsizeb:
        return data
    orig_data = data

    scale = 0.9
    while len(data) >= maxsizeb and scale >= 0.05:
        img = Image()
        img.load(orig_data)
        w, h = img.size
        img.size = (int(scale*w), int(scale*h))
        img.set_compression_quality(quality)
        data = img.export('jpg')
        scale -= 0.05
    return data

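# Editorial usage sketch (not part of the original file): this is how the
# writer below uses rescale_image. Full-size images are only recompressed;
# thumbnails are additionally scaled down to a fixed dimension:
#
#   cover_jpeg = rescale_image(item.data)
#   thumb_jpeg = rescale_image(item.data, dimen=MAX_THUMB_DIMEN,
#                              maxsizeb=MAX_THUMB_SIZE)
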
class MobiWriter(object):
    COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+')

    def __init__(self, opts, write_page_breaks_after_item=True):
        self.opts = opts
        self.write_page_breaks_after_item = write_page_breaks_after_item
        self.compression = UNCOMPRESSED if opts.dont_compress else PALMDOC
        self.prefer_author_sort = opts.prefer_author_sort

    def __call__(self, oeb, path_or_stream):
        if hasattr(path_or_stream, 'write'):
            return self.dump_stream(oeb, path_or_stream)
        with open(path_or_stream, 'w+b') as stream:
            return self.dump_stream(oeb, stream)

    def write(self, *args):
        for datum in args:
            self.stream.write(datum)

    def tell(self):
        return self.stream.tell()

    def dump_stream(self, oeb, stream):
        self.oeb = oeb
        self.stream = stream
        self.records = [None]
        self.generate_content()
        self.generate_record0()
        self.write_header()
        self.write_content()

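    # Editorial note (not part of the original file): dump_stream is the whole
    # pipeline. self.records[0] is reserved for the combined PalmDOC/MOBI/EXTH
    # header, which generate_record0() can only build after the text and image
    # records exist, since it needs their counts and offsets.
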
    def generate_content(self):
        self.map_image_names()
        self.generate_text()
        # Image records come after text records
        self.generate_images()

    def map_image_names(self):
        '''
        Map image names to record indices, ensuring that the masthead image if
        present has index number 1.
        '''
        index = 1
        self.images = images = {}
        mh_href = None

        if 'masthead' in self.oeb.guide:
            mh_href = self.oeb.guide['masthead'].href
            images[mh_href] = 1
            index += 1

        for item in self.oeb.manifest.values():
            if item.media_type in OEB_RASTER_IMAGES:
                if item.href == mh_href: continue
                images[item.href] = index
                index += 1

    def generate_images(self):
        self.oeb.logger.info('Serializing images...')
        images = [(index, href) for href, index in self.images.iteritems()]
        images.sort()
        self.first_image_record = None
        for _, href in images:
            item = self.oeb.manifest.hrefs[href]
            try:
                data = rescale_image(item.data)
            except:
                self.oeb.logger.warn('Bad image file %r' % item.href)
                continue
            finally:
                item.unload_data_from_memory()
            self.records.append(data)
            if self.first_image_record is None:
                self.first_image_record = len(self.records) - 1

    def generate_text(self):
        self.oeb.logger.info('Serializing markup content...')
        serializer = Serializer(self.oeb, self.images,
                write_page_breaks_after_item=self.write_page_breaks_after_item)
        text = serializer()
        breaks = serializer.breaks
        self.anchor_offset_kindle = serializer.anchor_offset_kindle
        self.id_offsets = serializer.id_offsets
        self.content_length = len(text)
        self.text_length = len(text)
        text = StringIO(text)
        buf = []
        nrecords = 0
        offset = 0

        if self.compression != UNCOMPRESSED:
            self.oeb.logger.info(' Compressing markup content...')
        data, overlap = self.read_text_record(text)

        while len(data) > 0:
            if self.compression == PALMDOC:
                data = compress_doc(data)
            record = StringIO()
            record.write(data)

            # Write information about the multibyte character overlap, if any
            record.write(overlap)
            record.write(pack('>B', len(overlap)))

            # Write information about uncrossable breaks (non linear items in
            # the spine)
            if WRITE_UNCROSSABLE_BREAKS:
                nextra = 0
                pbreak = 0
                running = offset

                # Write information about every uncrossable break that occurs in
                # the next record.
                while breaks and (breaks[0] - offset) < RECORD_SIZE:
                    pbreak = (breaks.pop(0) - running) >> 3
                    encoded = decint(pbreak, DECINT_FORWARD)
                    record.write(encoded)
                    running += pbreak << 3
                    nextra += len(encoded)
                lsize = 1
                while True:
                    size = decint(nextra + lsize, DECINT_BACKWARD)
                    if len(size) == lsize:
                        break
                    lsize += 1
                record.write(size)

            # Only snapshot the record once all trailing data has been
            # written into it
            self.records.append(record.getvalue())
            buf.append(self.records[-1])
            nrecords += 1
            offset += RECORD_SIZE
            data, overlap = self.read_text_record(text)

        self.text_nrecords = nrecords + 1

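    # Editorial sketch of the layout of each text record built above (not in
    # the original file):
    #
    #   [<= RECORD_SIZE bytes of (optionally PalmDoc-compressed) text]
    #   [overlap: bytes completing a UTF-8 character split at the boundary]
    #   [1 byte: len(overlap)]
    #   [optional uncrossable-break trailing entries, when enabled]
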
    def read_text_record(self, text):
        '''
        Return a Palmdoc record of size RECORD_SIZE from the text file object.
        In case the record ends in the middle of a multibyte character return
        the overlap as well.

        Returns data, overlap: where both are byte strings. overlap is the
        extra bytes needed to complete the truncated multibyte character.
        '''
        opos = text.tell()
        text.seek(0, 2)
        # npos is the position of the next record
        npos = min((opos + RECORD_SIZE, text.tell()))
        # Number of bytes from the next record needed to complete the last
        # character in this record
        extra = 0

        last = b''
        while not last.decode('utf-8', 'ignore'):
            # last contains no valid utf-8 characters
            size = len(last) + 1
            text.seek(npos - size)
            last = text.read(size)

        # last now has one valid utf-8 char and possibly some bytes that belong
        # to a truncated char

        try:
            last.decode('utf-8', 'strict')
        except UnicodeDecodeError:
            # There are some truncated bytes in last
            prev = len(last)
            while True:
                text.seek(npos - prev)
                last = text.read(len(last) + 1)
                try:
                    last.decode('utf-8')
                except UnicodeDecodeError:
                    pass
                else:
                    break
            extra = len(last) - prev

        text.seek(opos)
        data = text.read(RECORD_SIZE)
        overlap = text.read(extra)
        text.seek(npos)

        return data, overlap

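    # Editorial example (not in the original file): if the record boundary
    # falls between the two bytes of U+00E9 ('é', encoded as b'\xc3\xa9'),
    # this method returns data ending in b'\xc3' and overlap == b'\xa9'; the
    # caller appends the overlap plus a length byte so readers can reassemble
    # the character.
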
    def generate_end_records(self):
        self.flis_number = len(self.records)
        self.records.append(b'\xE9\x8E\x0D\x0A')

    def generate_record0(self): # {{{
        metadata = self.oeb.metadata
        exth = self.build_exth()
        last_content_record = len(self.records) - 1

        self.generate_end_records()

        record0 = StringIO()
        # The PalmDOC Header
        record0.write(pack('>HHIHHHH', self.compression, 0,
            self.text_length,
            self.text_nrecords-1, RECORD_SIZE, 0, 0)) # 0 - 15 (0x0 - 0xf)
        uid = random.randint(0, 0xffffffff)
        title = normalize(unicode(metadata.title[0])).encode('utf-8')
        # The MOBI Header

        # 0x0 - 0x3
        record0.write(b'MOBI')

        # 0x4 - 0x7 : Length of header
        # 0x8 - 0xb : MOBI type
        #   type    meaning
        #   0x002   MOBI book (chapter - chapter navigation)
        #   0x101   News - Hierarchical navigation with sections and articles
        #   0x102   News feed - Flat navigation
        #   0x103   News magazine - same as 0x101
        # 0xC - 0xF   : Text encoding (65001 is utf-8)
        # 0x10 - 0x13 : UID
        # 0x14 - 0x17 : Generator version

        record0.write(pack('>IIIII',
            0xe8, 0x002, 65001, uid, 6))

        # 0x18 - 0x1f : Unknown
        record0.write(b'\xff' * 8)

        # 0x20 - 0x23 : Secondary index record
        record0.write(pack('>I', 0xffffffff))

        # 0x24 - 0x3f : Unknown
        record0.write(b'\xff' * 28)

        # 0x40 - 0x43 : Offset of first non-text record
        record0.write(pack('>I',
            self.text_nrecords + 1))

        # 0x44 - 0x4b : title offset, title length
        record0.write(pack('>II',
            0xe8 + 16 + len(exth), len(title)))

        # 0x4c - 0x4f : Language specifier
        record0.write(iana2mobi(
            str(metadata.language[0])))

        # 0x50 - 0x57 : Unknown
        record0.write(b'\0' * 8)

        # 0x58 - 0x5b : Format version
        # 0x5c - 0x5f : First image record number
        record0.write(pack('>II',
            6, self.first_image_record if self.first_image_record else 0))

        # 0x60 - 0x63 : First HUFF/CDIC record number
        # 0x64 - 0x67 : Number of HUFF/CDIC records
        # 0x68 - 0x6b : First DATP record number
        # 0x6c - 0x6f : Number of DATP records
        record0.write(b'\0' * 16)

        # 0x70 - 0x73 : EXTH flags
        record0.write(pack('>I', 0x50))

        # 0x74 - 0x93 : Unknown
        record0.write(b'\0' * 32)

        # 0x94 - 0x97 : DRM offset
        # 0x98 - 0x9b : DRM count
        # 0x9c - 0x9f : DRM size
        # 0xa0 - 0xa3 : DRM flags
        record0.write(pack('>IIII',
            0xffffffff, 0xffffffff, 0, 0))

        # 0xa4 - 0xaf : Unknown
        record0.write(b'\0'*12)

        # 0xb0 - 0xb1 : First content record number
        # 0xb2 - 0xb3 : last content record number
        # (Includes Image, DATP, HUFF, DRM)
        record0.write(pack('>HH', 1, last_content_record))

        # 0xb4 - 0xb7 : Unknown
        record0.write(b'\0\0\0\x01')

        # 0xb8 - 0xbb : FCIS record number
        record0.write(pack('>I', 0xffffffff))

        # 0xbc - 0xbf : Unknown (FCIS record count?)
        record0.write(pack('>I', 0xffffffff))

        # 0xc0 - 0xc3 : FLIS record number
        record0.write(pack('>I', 0xffffffff))

        # 0xc4 - 0xc7 : Unknown (FLIS record count?)
        record0.write(pack('>I', 1))

        # 0xc8 - 0xcf : Unknown
        record0.write(b'\0'*8)

        # 0xd0 - 0xdf : Unknown
        record0.write(pack('>IIII', 0xffffffff, 0, 0xffffffff, 0xffffffff))

        # 0xe0 - 0xe3 : Extra record data
        # Extra record data flags:
        #   - 0x1: <extra multibyte bytes><size> (?)
        #   - 0x2: <TBS indexing description of this HTML record><size> GR
        #   - 0x4: <uncrossable breaks><size>
        # GR: Use 7 for indexed files, 5 for unindexed
        # Setting bit 2 (0x4) disables <guide><reference type="start">
        # functionality

        trailingDataFlags = 1
        if WRITE_UNCROSSABLE_BREAKS:
            trailingDataFlags |= 4
        record0.write(pack('>I', trailingDataFlags))

        # 0xe4 - 0xe7 : Primary index record
        record0.write(pack('>I', 0xffffffff))

        record0.write(exth)
        record0.write(title)
        record0 = record0.getvalue()
        # Add some buffer so that Amazon can add encryption information if this
        # MOBI is submitted for publication
        record0 += (b'\0' * (1024*8))
        self.records[0] = record0
    # }}}

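    # Editorial summary of record 0 as assembled above (not in the original
    # file):
    #
    #   16 bytes   PalmDOC header (compression, text length, record count)
    #   0xe8 bytes MOBI header ('MOBI', type, encoding, uid, offsets, flags)
    #   EXTH block (built by build_exth below)
    #   title bytes, at offset 0xe8 + 16 + len(exth) as written above
    #   8KB of zero padding reserved for Amazon encryption information
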
    def build_exth(self): # {{{
        oeb = self.oeb
        exth = StringIO()
        nrecs = 0
        for term in oeb.metadata:
            if term not in EXTH_CODES: continue
            code = EXTH_CODES[term]
            items = oeb.metadata[term]
            if term == 'creator':
                if self.prefer_author_sort:
                    creators = [normalize(unicode(c.file_as or c)) for c in items]
                else:
                    creators = [normalize(unicode(c)) for c in items]
                items = ['; '.join(creators)]
            for item in items:
                data = self.COLLAPSE_RE.sub(' ', normalize(unicode(item)))
                if term == 'identifier':
                    if data.lower().startswith('urn:isbn:'):
                        data = data[9:]
                    elif item.scheme.lower() == 'isbn':
                        pass
                    else:
                        continue
                data = data.encode('utf-8')
                exth.write(pack('>II', code, len(data) + 8))
                exth.write(data)
                nrecs += 1
            if term == 'rights':
                try:
                    rights = normalize(unicode(oeb.metadata.rights[0])).encode('utf-8')
                except:
                    rights = b'Unknown'
                exth.write(pack('>II', EXTH_CODES['rights'], len(rights) + 8))
                exth.write(rights)
                nrecs += 1

        # Write UUID as ASIN
        uuid = None
        from calibre.ebooks.oeb.base import OPF
        for x in oeb.metadata['identifier']:
            if (x.get(OPF('scheme'), None).lower() == 'uuid' or
                    unicode(x).startswith('urn:uuid:')):
                uuid = unicode(x).split(':')[-1]
                break
        if uuid is None:
            from uuid import uuid4
            uuid = str(uuid4())

        if isinstance(uuid, unicode):
            uuid = uuid.encode('utf-8')
        exth.write(pack('>II', 113, len(uuid) + 8))
        exth.write(uuid)
        nrecs += 1

        # Write cdetype
        if not self.opts.mobi_periodical:
            data = b'EBOK'
            exth.write(pack('>II', 501, len(data)+8))
            exth.write(data)
            nrecs += 1

        # Add a publication date entry
        datestr = None
        if oeb.metadata['date'] != []:
            datestr = str(oeb.metadata['date'][0])
        elif oeb.metadata['timestamp'] != []:
            datestr = str(oeb.metadata['timestamp'][0])

        if datestr is not None:
            exth.write(pack('>II', EXTH_CODES['pubdate'], len(datestr) + 8))
            exth.write(datestr)
            nrecs += 1
        else:
            raise NotImplementedError("missing date or timestamp needed for mobi_periodical")

        if (oeb.metadata.cover and
                unicode(oeb.metadata.cover[0]) in oeb.manifest.ids):
            id = unicode(oeb.metadata.cover[0])
            item = oeb.manifest.ids[id]
            href = item.href
            if href in self.images:
                index = self.images[href] - 1
                exth.write(pack('>III', 0xc9, 0x0c, index))
                exth.write(pack('>III', 0xcb, 0x0c, 0))
                nrecs += 2
                index = self.add_thumbnail(item)
                if index is not None:
                    exth.write(pack('>III', 0xca, 0x0c, index - 1))
                    nrecs += 1

        exth = exth.getvalue()
        trail = len(exth) % 4
        pad = b'\0' * (4 - trail) # Always pad w/ at least 1 byte
        exth = [b'EXTH', pack('>II', len(exth) + 12, nrecs), exth, pad]
        return b''.join(exth)
    # }}}

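    # Editorial sketch of the EXTH wire format produced above (not in the
    # original file). Each record is <code:u32><length:u32><data>, where
    # length includes the 8-byte record header, e.g. for a creator field:
    #
    #   pack('>II', EXTH_CODES['creator'], len(b'Kovid Goyal') + 8) + b'Kovid Goyal'
    #
    # The block as a whole is 'EXTH' + total length + record count + records,
    # padded to a four-byte boundary.
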
    def add_thumbnail(self, item):
        try:
            data = rescale_image(item.data, dimen=MAX_THUMB_DIMEN,
                maxsizeb=MAX_THUMB_SIZE)
        except IOError:
            self.oeb.logger.warn('Bad image file %r' % item.href)
            return None
        manifest = self.oeb.manifest
        id, href = manifest.generate('thumbnail', 'thumbnail.jpeg')
        manifest.add(id, href, 'image/jpeg', data=data)
        index = len(self.images) + 1
        self.images[href] = index
        self.records.append(data)
        return index

    def write_header(self):
        title = ascii_filename(unicode(self.oeb.metadata.title[0]))
        # The PDB name field is 32 bytes, including the terminating NUL
        title = title[:31]
        title = title + (b'\0' * (32 - len(title)))
        now = int(time.time())
        nrecords = len(self.records)
        self.write(title, pack('>HHIIIIII', 0, 0, now, now, 0, 0, 0, 0),
                b'BOOK', b'MOBI', pack('>IIH', nrecords, 0, nrecords))
        offset = self.tell() + (8 * nrecords) + 2
        for i, record in enumerate(self.records):
            self.write(pack('>I', offset), b'\0', pack('>I', 2*i)[1:])
            offset += len(record)
        self.write(b'\0\0')

    def write_content(self):
        for record in self.records:
            self.write(record)

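    # Editorial summary of the PalmDB container written above (not in the
    # original file):
    #
    #   32 bytes  database name (the NUL-padded title)
    #   28 bytes  attributes, version, timestamps and list ids
    #    8 bytes  type + creator: 'BOOK' 'MOBI'
    #   10 bytes  unique id seed, next record list id, record count
    #    8 bytes  per record: u32 offset, 1 flag byte, 3-byte uid (2*i)
    #    2 bytes  zero padding, followed by the records back to back
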

src/calibre/ebooks/mobi/writer2/serializer.py (new file, 246 lines)
@@ -0,0 +1,246 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
                        print_function)

__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

from calibre.ebooks.oeb.base import (OEB_DOCS, XHTML, XHTML_NS, XML_NS,
        namespace, prefixname, urlnormalize)
from calibre.ebooks.mobi.mobiml import MBP_NS

from collections import defaultdict
from urlparse import urldefrag
from cStringIO import StringIO


class Serializer(object):
    NSRMAP = {'': None, XML_NS: 'xml', XHTML_NS: '', MBP_NS: 'mbp'}

    def __init__(self, oeb, images, write_page_breaks_after_item=True):
        '''
        Write all the HTML markup in oeb into a single in memory buffer
        containing a single html document with links replaced by offsets into
        the buffer.

        :param oeb: OEBBook object that encapsulates the document to be
                    processed.

        :param images: Mapping of image hrefs (urlnormalized) to image record
                       indices.

        :param write_page_breaks_after_item: If True a MOBIpocket pagebreak tag
                                             is written after every element of
                                             the spine in ``oeb``.
        '''
        self.oeb = oeb
        self.images = images
        self.logger = oeb.logger
        self.write_page_breaks_after_item = write_page_breaks_after_item

        # Mapping of hrefs (urlnormalized) to the offset in the buffer where
        # the resource pointed to by the href lives. Used at the end to fill in
        # the correct values into all filepos="..." links.
        self.id_offsets = {}

        # Mapping of hrefs (urlnormalized) to a list of offsets into the buffer
        # where filepos="..." elements are written corresponding to links that
        # point to the href. This is used at the end to fill in the correct
        # values.
        self.href_offsets = defaultdict(list)

        # List of offsets in the buffer of non linear items in the spine. These
        # become uncrossable breaks in the MOBI
        self.breaks = []

    def __call__(self):
        '''
        Return the document serialized as a single UTF-8 encoded bytestring.
        '''
        buf = self.buf = StringIO()
        buf.write(b'<html>')
        self.serialize_head()
        self.serialize_body()
        buf.write(b'</html>')
        self.fixup_links()
        return buf.getvalue()

    def serialize_head(self):
        buf = self.buf
        buf.write(b'<head>')
        if len(self.oeb.guide) > 0:
            self.serialize_guide()
        buf.write(b'</head>')

    def serialize_guide(self):
        '''
        The Kindle decides where to open a book based on the presence of
        an item in the guide that looks like
        <reference type="text" title="Start" href="chapter-one.xhtml"/>

        Similarly an item with type="toc" controls where the Goto Table of
        Contents operation on the Kindle goes.
        '''

        buf = self.buf
        hrefs = self.oeb.manifest.hrefs
        buf.write(b'<guide>')
        for ref in self.oeb.guide.values():
            path = urldefrag(ref.href)[0]
            if path not in hrefs or hrefs[path].media_type not in OEB_DOCS:
                continue

            buf.write(b'<reference type="')
            if ref.type.startswith('other.'):
                self.serialize_text(ref.type.replace('other.',''), quot=True)
            else:
                self.serialize_text(ref.type, quot=True)
            buf.write(b'" ')
            if ref.title is not None:
                buf.write(b'title="')
                self.serialize_text(ref.title, quot=True)
                buf.write(b'" ')
            self.serialize_href(ref.href)
            # Space required or won't work, I kid you not
            buf.write(b' />')

        buf.write(b'</guide>')

    def serialize_href(self, href, base=None):
        '''
        Serialize the href attribute of an <a> or <reference> tag. It is
        serialized as filepos="0000000000" and a pointer to its location is
        stored in self.href_offsets so that the correct value can be filled in
        at the end.
        '''
        hrefs = self.oeb.manifest.hrefs
        path, frag = urldefrag(urlnormalize(href))
        if path and base:
            path = base.abshref(path)
        if path and path not in hrefs:
            return False
        buf = self.buf
        item = hrefs[path] if path else None
        if item and item.spine_position is None:
            return False
        path = item.href if item else base.href
        href = '#'.join((path, frag)) if frag else path
        buf.write(b'filepos=')
        self.href_offsets[href].append(buf.tell())
        buf.write(b'0000000000')
        return True

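    # Editorial example (not in the original file): a link to 'c1.html#top'
    # is written out as filepos=0000000000, and the position of that
    # placeholder is remembered in href_offsets. Once the whole buffer is
    # built, fixup_links() rewrites it as e.g. filepos=0000421337, the
    # 10-digit decimal offset recorded for 'c1.html#top' in id_offsets.
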
    def serialize_body(self):
        '''
        Serialize all items in the spine of the document. Non linear items are
        moved to the end.
        '''
        buf = self.buf
        self.anchor_offset = buf.tell()
        buf.write(b'<body>')
        self.anchor_offset_kindle = buf.tell()
        spine = [item for item in self.oeb.spine if item.linear]
        spine.extend([item for item in self.oeb.spine if not item.linear])
        for item in spine:
            self.serialize_item(item)
        buf.write(b'</body>')

    def serialize_item(self, item):
        '''
        Serialize an individual item from the spine of the input document.
        The offset of this item is stored in self.id_offsets
        '''
        buf = self.buf
        if not item.linear:
            self.breaks.append(buf.tell() - 1)
        self.id_offsets[urlnormalize(item.href)] = buf.tell()
        # Kindle periodical articles are contained in a <div> tag
        buf.write(b'<div>')
        for elem in item.data.find(XHTML('body')):
            self.serialize_elem(elem, item)
        # Kindle periodical article end marker
        buf.write(b'<div></div>')
        if self.write_page_breaks_after_item:
            buf.write(b'<mbp:pagebreak/>')
        buf.write(b'</div>')
        self.anchor_offset = None

    def serialize_elem(self, elem, item, nsrmap=NSRMAP):
        buf = self.buf
        if not isinstance(elem.tag, basestring) \
           or namespace(elem.tag) not in nsrmap:
            return
        tag = prefixname(elem.tag, nsrmap)
        # Previous layers take care of @name
        id_ = elem.attrib.pop('id', None)
        if id_:
            href = '#'.join((item.href, id_))
            offset = self.anchor_offset or buf.tell()
            self.id_offsets[urlnormalize(href)] = offset
        if self.anchor_offset is not None and \
           tag == 'a' and not elem.attrib and \
           not len(elem) and not elem.text:
            return
        self.anchor_offset = buf.tell()
        buf.write(b'<')
        buf.write(tag.encode('utf-8'))
        if elem.attrib:
            for attr, val in elem.attrib.items():
                if namespace(attr) not in nsrmap:
                    continue
                attr = prefixname(attr, nsrmap)
                buf.write(b' ')
                if attr == 'href':
                    if self.serialize_href(val, item):
                        continue
                elif attr == 'src':
                    href = urlnormalize(item.abshref(val))
                    if href in self.images:
                        index = self.images[href]
                        buf.write(b'recindex="%05d"' % index)
                        continue
                buf.write(attr.encode('utf-8'))
                buf.write(b'="')
                self.serialize_text(val, quot=True)
                buf.write(b'"')
        buf.write(b'>')
        if elem.text or len(elem) > 0:
            if elem.text:
                self.anchor_offset = None
                self.serialize_text(elem.text)
            for child in elem:
                self.serialize_elem(child, item)
                if child.tail:
                    self.anchor_offset = None
                    self.serialize_text(child.tail)
        buf.write(b'</%s>' % tag.encode('utf-8'))

    def serialize_text(self, text, quot=False):
        text = text.replace('&', '&amp;')
        text = text.replace('<', '&lt;')
        text = text.replace('>', '&gt;')
        text = text.replace(u'\u00AD', '') # Soft-hyphen
        if quot:
            text = text.replace('"', '&quot;')
        self.buf.write(text.encode('utf-8'))

    def fixup_links(self):
        '''
        Fill in the correct values for all filepos="..." links with the offsets
        of the linked to content (as stored in id_offsets).
        '''
        buf = self.buf
        id_offsets = self.id_offsets
        for href, hoffs in self.href_offsets.items():
            # Iterate over all filepos items
            if href not in id_offsets:
                self.logger.warn('Hyperlink target %r not found' % href)
                # Link to the top of the document, better than just ignoring
                href, _ = urldefrag(href)
            if href in self.id_offsets:
                ioff = self.id_offsets[href]
                for hoff in hoffs:
                    buf.seek(hoff)
                    buf.write(b'%010d' % ioff)
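
# Editorial usage sketch (not part of the original file); this mirrors how
# MobiWriter.generate_text() in main.py drives the class above:
#
#   serializer = Serializer(oeb, images)
#   raw = serializer()            # single UTF-8 bytestring, links fixed up
#   breaks = serializer.breaks    # offsets of non-linear spine items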