MOBI Output: Command line support for converting to MOBI via the command any2mobi (thanks to Marshall T. Vandegrift)

This commit is contained in:
Kovid Goyal 2009-01-17 12:23:27 -08:00
commit 329fd4866f
21 changed files with 2655 additions and 312 deletions

View File

@ -166,7 +166,7 @@ if __name__ == '__main__':
metadata_sqlite = 'library/metadata_sqlite.sql',
jquery = 'gui2/viewer/jquery.js',
jquery_scrollTo = 'gui2/viewer/jquery_scrollTo.js',
html_css = 'ebooks/lit/html.css',
html_css = 'ebooks/oeb/html.css',
)
DEST = os.path.join('src', APPNAME, 'resources.py')

View File

@ -798,8 +798,9 @@ class Processor(Parser):
if face is not None:
faces = []
for face in face.split(','):
if ' ' in face:
face = "%s" % face
face = face.strip()
if ' ' in face and not (face[0] == face[-1] == '"'):
face = '"%s"' % face.replace('"', r'\"')
faces.append(face)
for generic in ('serif', 'sans-serif', 'monospace'):
if generic in faces:

View File

@ -15,7 +15,7 @@ from lxml import etree
from calibre.ebooks.lit import LitError
from calibre.ebooks.lit.maps import OPF_MAP, HTML_MAP
import calibre.ebooks.lit.mssha1 as mssha1
from calibre.ebooks.lit.oeb import urlnormalize
from calibre.ebooks.oeb.base import urlnormalize
from calibre.ebooks import DRMError
from calibre import plugins
lzx, lxzerror = plugins['lzx']

View File

@ -23,14 +23,20 @@ from urllib import unquote as urlunquote
from lxml import etree
from calibre.ebooks.lit.reader import DirectoryEntry
import calibre.ebooks.lit.maps as maps
from calibre.ebooks.lit.oeb import OEB_DOCS, OEB_STYLES, OEB_CSS_MIME, \
from calibre.ebooks.oeb.base import OEB_DOCS, OEB_STYLES, OEB_CSS_MIME, \
CSS_MIME, OPF_MIME, XML_NS, XML
from calibre.ebooks.lit.oeb import namespace, barename, urlnormalize, xpath
from calibre.ebooks.lit.oeb import FauxLogger, OEBBook
from calibre.ebooks.lit.stylizer import Stylizer
from calibre.ebooks.oeb.base import namespace, barename, prefixname, \
urlnormalize, xpath
from calibre.ebooks.oeb.base import Logger, OEBBook
from calibre.ebooks.oeb.profile import Context
from calibre.ebooks.oeb.stylizer import Stylizer
from calibre.ebooks.oeb.transforms.flatcss import CSSFlattener
from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer
from calibre.ebooks.oeb.transforms.trimmanifest import ManifestTrimmer
from calibre.ebooks.oeb.transforms.htmltoc import HTMLTOCAdder
from calibre.ebooks.oeb.transforms.manglecase import CaseMangler
from calibre.ebooks.lit.lzx import Compressor
import calibre
from calibre import LoggingInterface
from calibre import plugins
msdes, msdeserror = plugins['msdes']
import calibre.ebooks.lit.mssha1 as mssha1
@ -116,12 +122,6 @@ LZXC_CONTROL = \
COLLAPSE = re.compile(r'[ \t\r\n\v]+')
def prefixname(name, nsrmap):
prefix = nsrmap[namespace(name)]
if not prefix:
return barename(name)
return ':'.join((prefix, barename(name)))
def decint(value):
bytes = []
while True:
@ -143,9 +143,9 @@ def warn(x):
class ReBinary(object):
NSRMAP = {'': None, XML_NS: 'xml'}
def __init__(self, root, item, oeb, map=HTML_MAP, logger=FauxLogger()):
def __init__(self, root, path, oeb, map=HTML_MAP):
self.item = item
self.logger = logger
self.logger = oeb.logger
self.manifest = oeb.manifest
self.tags, self.tattrs = map
self.buf = StringIO()
@ -300,10 +300,9 @@ def preserve(function):
return wrapper
class LitWriter(object):
def __init__(self, oeb, logger=FauxLogger()):
self._oeb = oeb
self._logger = logger
self._litize_oeb()
def __init__(self):
# Wow, no options
pass
def _litize_oeb(self):
oeb = self._oeb
@ -312,32 +311,27 @@ class LitWriter(object):
if oeb.metadata.cover:
id = str(oeb.metadata.cover[0])
cover = oeb.manifest[id]
elif MS_COVER_TYPE in oeb.guide:
href = oeb.guide[MS_COVER_TYPE].href
cover = oeb.manifest.hrefs[href]
elif 'cover' in oeb.guide:
href = oeb.guide['cover'].href
cover = oeb.manifest.hrefs[href]
else:
html = oeb.spine[0].data
imgs = xpath(html, '//img[position()=1]')
href = imgs[0].get('src') if imgs else None
cover = oeb.manifest.hrefs[href] if href else None
if cover:
if not oeb.metadata.cover:
oeb.metadata.add('cover', cover.id)
for type, title in ALL_MS_COVER_TYPES:
if type not in oeb.guide:
oeb.guide.add(type, title, cover.href)
else:
self._logger.log_warn('No suitable cover image found.')
self._logger.warn('No suitable cover image found.')
def dump(self, stream):
def dump(self, oeb, path):
if hasattr(path, 'write'):
return self._dump_stream(oeb, path)
with open(path, 'w+b') as stream:
return self._dump_stream(oeb, stream)
def _dump_stream(self, oeb, stream):
self._oeb = oeb
self._logger = oeb.logger
self._stream = stream
self._sections = [StringIO() for i in xrange(4)]
self._directory = []
self._meta = None
self._dump()
self._litize_oeb()
self._write_content()
def _write(self, *data):
for datum in data:
@ -351,7 +345,7 @@ class LitWriter(object):
def _tell(self):
return self._stream.tell()
def _dump(self):
def _write_content(self):
# Build content sections
self._build_sections()
@ -480,8 +474,7 @@ class LitWriter(object):
secnum = 0
if not isinstance(data, basestring):
self._add_folder(name)
rebin = ReBinary(data, item, self._oeb, map=HTML_MAP,
logger=self._logger)
rebin = ReBinary(data, item, self._oeb, map=HTML_MAP)
self._add_file(name + '/ahc', rebin.ahc, 0)
self._add_file(name + '/aht', rebin.aht, 0)
item.page_breaks = rebin.page_breaks
@ -560,8 +553,7 @@ class LitWriter(object):
meta.attrib['ms--minimum_level'] = '0'
meta.attrib['ms--attr5'] = '1'
meta.attrib['ms--guid'] = '{%s}' % str(uuid.uuid4()).upper()
rebin = ReBinary(meta, None, self._oeb, map=OPF_MAP,
logger=self._logger)
rebin = ReBinary(meta, None, self._oeb, map=OPF_MAP)
meta = rebin.content
self._meta = meta
self._add_file('/meta', meta)
@ -721,23 +713,35 @@ def option_parser():
'-o', '--output', default=None,
help=_('Output file. Default is derived from input filename.'))
parser.add_option(
'--verbose', default=False, action='store_true',
'-v', '--verbose', default=0, action='count',
help=_('Useful for debugging.'))
return parser
def oeb2lit(opts, opfpath):
logger = LoggingInterface(logging.getLogger('oeb2lit'))
def oeb2lit(opts, inpath):
logger = Logger(logging.getLogger('oeb2lit'))
logger.setup_cli_handler(opts.verbose)
litpath = opts.output
if litpath is None:
litpath = os.path.basename(opfpath)
litpath = os.path.splitext(litpath)[0] + '.lit'
litpath = os.path.abspath(litpath)
lit = LitWriter(OEBBook(opfpath, logger=logger), logger=logger)
with open(litpath, 'wb') as f:
lit.dump(f)
run_plugins_on_postprocess(litpath, 'lit')
logger.log_info(_('Output written to ')+litpath)
outpath = opts.output
if outpath is None:
outpath = os.path.basename(inpath)
outpath = os.path.splitext(outpath)[0] + '.lit'
outpath = os.path.abspath(outpath)
context = Context('Firefox', 'MSReader')
oeb = OEBBook(inpath, logger=logger)
tocadder = HTMLTOCAdder()
tocadder.transform(oeb, context)
mangler = CaseMangler()
mangler.transform(oeb, context)
fbase = context.dest.fbase
flattener = CSSFlattener(fbase=fbase, unfloat=True, untable=True)
flattener.transform(oeb, context)
rasterizer = SVGRasterizer()
rasterizer.transform(oeb, context)
trimmer = ManifestTrimmer()
trimmer.transform(oeb, context)
lit = LitWriter()
lit.dump(oeb, outpath)
run_plugins_on_postprocess(outpath, 'lit')
logger.info(_('Output written to ') + outpath)
def main(argv=sys.argv):
@ -746,8 +750,8 @@ def main(argv=sys.argv):
if len(args) != 1:
parser.print_help()
return 1
opfpath = args[0]
oeb2lit(opts, opfpath)
inpath = args[0]
oeb2lit(opts, inpath)
return 0
if __name__ == '__main__':

View File

@ -0,0 +1,67 @@
'''
Convert any ebook format to Mobipocket.
'''
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net ' \
'and Marshall T. Vandegrift <llasram@gmail.com>'
__docformat__ = 'restructuredtext en'
import sys, os, glob, logging
from calibre.ebooks.epub.from_any import any2epub, formats, USAGE
from calibre.ebooks.epub import config as common_config
from calibre.ptempfile import TemporaryDirectory
from calibre.ebooks.mobi.writer import oeb2mobi, add_mobi_options
def config(defaults=None):
    """Build the configuration object via ``common_config`` under the name 'mobi'."""
    settings = dict(defaults=defaults, name='mobi')
    return common_config(**settings)
def option_parser(usage=USAGE):
    """Return the command line parser for any2mobi.

    ``usage`` is a template filled in with the output format name and the
    result of ``formats()``; the Mobipocket options are then added to the
    parser obtained from ``config()``.
    """
    usage_text = usage % ('Mobipocket', formats())
    mobi_parser = config().option_parser(usage=usage_text)
    add_mobi_options(mobi_parser)
    return mobi_parser
def any2mobi(opts, path):
    """Convert the file at ``path`` to Mobipocket in two stages.

    The input is first run through ``any2epub`` (without building the final
    EPUB) to obtain an exploded OEB directory, whose OPF is then handed to
    ``oeb2mobi``.

    :param opts: parsed options; ``opts.output`` is filled in (derived from
        ``path``) when it is None and is always made absolute.
        ``opts.profile`` and ``opts.dont_split_on_page_breaks`` are
        overridden for the intermediate conversion.
    :param path: input file; it must have an extension.
    :raises ValueError: if ``path`` has no file extension.
    """
    if not os.path.splitext(path)[1]:
        raise ValueError('Unknown file type: '+path)

    if opts.output is None:
        opts.output = os.path.splitext(os.path.basename(path))[0]+'.mobi'
    opts.output = os.path.abspath(opts.output)
    orig_output = opts.output

    with TemporaryDirectory('_any2mobi') as tdir:
        oebdir = os.path.join(tdir, 'oeb')
        os.mkdir(oebdir)
        opts.profile = 'None'
        opts.dont_split_on_page_breaks = True
        orig_bfs = opts.base_font_size2
        # The intermediate stage writes to a throw-away location and must not
        # rescale fonts.
        opts.output = os.path.join(tdir, 'dummy.epub')
        opts.base_font_size2 = 0
        try:
            any2epub(opts, path, create_epub=False, oeb_cover=True,
                     extract_to=oebdir)
        finally:
            # Restore the caller's values even when the first stage fails, so
            # opts never keeps a path inside the deleted temporary directory.
            opts.base_font_size2 = orig_bfs
            opts.output = orig_output
        opf = glob.glob(os.path.join(oebdir, '*.opf'))[0]
        logging.getLogger('html2epub').info(_('Creating Mobipocket file from EPUB...'))
        oeb2mobi(opts, opf)
def main(args=sys.argv):
parser = option_parser()
opts, args = parser.parse_args(args)
if len(args) < 2:
parser.print_help()
print 'No input file specified.'
return 1
any2mobi(opts, args[1])
return 0
if __name__ == '__main__':
sys.exit(main())

View File

@ -3,6 +3,8 @@ __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
from struct import pack
main_language = {
0 : "NEUTRAL",
54 : "AFRIKAANS",
@ -155,5 +157,170 @@ sub_language = {
2 : "SWEDISH_FINLAND",
1 : "UZBEK_LATIN",
2 : "UZBEK_CYRILLIC",
}
}
IANA_MOBI = \
{None: {None: (0, 0)},
'af': {None: (54, 0)},
'ar': {None: (1, 0),
'AE': (1, 56),
'BH': (1, 60),
'DZ': (1, 20),
'EG': (1, 12),
'JO': (1, 44),
'KW': (1, 52),
'LB': (1, 48),
'MA': (1, 24),
'OM': (1, 32),
'QA': (1, 64),
'SA': (1, 4),
'SY': (1, 40),
'TN': (1, 28),
'YE': (1, 36)},
'as': {None: (77, 0)},
'az': {None: (44, 0)},
'be': {None: (35, 0)},
'bg': {None: (2, 0)},
'bn': {None: (69, 0)},
'ca': {None: (3, 0)},
'cs': {None: (5, 0)},
'da': {None: (6, 0)},
'de': {None: (7, 0),
'AT': (7, 12),
'CH': (7, 8),
'LI': (7, 20),
'LU': (7, 16)},
'el': {None: (8, 0)},
'en': {None: (9, 0),
'AU': (9, 12),
'BZ': (9, 40),
'CA': (9, 16),
'GB': (9, 8),
'IE': (9, 24),
'JM': (9, 32),
'NZ': (9, 20),
'PH': (9, 52),
'TT': (9, 44),
'US': (9, 4),
'ZA': (9, 28),
'ZW': (9, 48)},
'es': {None: (10, 0),
'AR': (10, 44),
'BO': (10, 64),
'CL': (10, 52),
'CO': (10, 36),
'CR': (10, 20),
'DO': (10, 28),
'EC': (10, 48),
'ES': (10, 4),
'GT': (10, 16),
'HN': (10, 72),
'MX': (10, 8),
'NI': (10, 76),
'PA': (10, 24),
'PE': (10, 40),
'PR': (10, 80),
'PY': (10, 60),
'SV': (10, 68),
'UY': (10, 56),
'VE': (10, 32)},
'et': {None: (37, 0)},
'eu': {None: (45, 0)},
'fa': {None: (41, 0)},
'fi': {None: (11, 0)},
'fo': {None: (56, 0)},
'fr': {None: (12, 0),
'BE': (12, 8),
'CA': (12, 12),
'CH': (12, 16),
'FR': (12, 4),
'LU': (12, 20),
'MC': (12, 24)},
'gu': {None: (71, 0)},
'he': {None: (13, 0)},
'hi': {None: (57, 0)},
'hr': {None: (26, 0)},
'hu': {None: (14, 0)},
'hy': {None: (43, 0)},
'id': {None: (33, 0)},
'is': {None: (15, 0)},
'it': {None: (16, 0),
'CH': (16, 8),
'IT': (16, 4)},
'ja': {None: (17, 0)},
'ka': {None: (55, 0)},
'kk': {None: (63, 0)},
'kn': {None: (75, 0)},
'ko': {None: (18, 0)},
'kok': {None: (87, 0)},
'lt': {None: (39, 0)},
'lv': {None: (38, 0)},
'mk': {None: (47, 0)},
'ml': {None: (76, 0)},
'mr': {None: (78, 0)},
'ms': {None: (62, 0)},
'mt': {None: (58, 0)},
'ne': {None: (97, 0)},
'nl': {None: (19, 0),
'BE': (19, 8)},
'no': {None: (20, 0)},
'or': {None: (72, 0)},
'pa': {None: (70, 0)},
'pl': {None: (21, 0)},
'pt': {None: (22, 0),
'BR': (22, 4),
'PT': (22, 8)},
'rm': {None: (23, 0)},
'ro': {None: (24, 0)},
'ru': {None: (25, 0)},
'sa': {None: (79, 0)},
'se': {None: (59, 0)},
'sk': {None: (27, 0)},
'sl': {None: (36, 0)},
'sq': {None: (28, 0)},
'sr': {None: (26, 12),
'RS': (26, 12)},
'st': {None: (48, 0)},
'sv': {None: (29, 0),
'FI': (29, 8)},
'sw': {None: (65, 0)},
'ta': {None: (73, 0)},
'te': {None: (74, 0)},
'th': {None: (30, 0)},
'tn': {None: (50, 0)},
'tr': {None: (31, 0)},
'ts': {None: (49, 0)},
'tt': {None: (68, 0)},
'uk': {None: (34, 0)},
'ur': {None: (32, 0)},
'uz': {None: (67, 0),
'UZ': (67, 8)},
'vi': {None: (42, 0)},
'wen': {None: (46, 0)},
'xh': {None: (52, 0)},
'zh': {None: (4, 0),
'CN': (4, 8),
'HK': (4, 12),
'SG': (4, 16),
'TW': (4, 4)},
'zu': {None: (53, 0)}}
def iana2mobi(icode):
    """Return the packed Mobipocket language code for an IANA language tag.

    :param icode: tag such as ``'en'`` or ``'en-US'`` with ``-`` separated
        subtags. ``None`` or an empty string yields the neutral code.
    :return: four bytes packed as ``'>HBB'``: zero, the sub-language code,
        then the main language code.

    The first subtag that names a language in ``IANA_MOBI`` selects the
    language; the first later subtag naming one of its regions (tried as
    given, title-cased, then upper-cased) selects the sub-language.
    """
    subtags = list(icode.split('-')) if icode else []
    langdict = IANA_MOBI[None]
    while subtags:
        lang = subtags.pop(0).lower()
        if lang in IANA_MOBI:
            langdict = IANA_MOBI[lang]
            break
    # Default to the language's region-less entry.
    mcode = langdict[None]
    for subtag in subtags:
        variants = (subtag, subtag.title(), subtag.upper())
        matched = [v for v in variants if v in langdict]
        if matched:
            mcode = langdict[matched[0]]
            break
    return pack('>HBB', 0, mcode[1], mcode[0])

View File

@ -0,0 +1,379 @@
'''
Transform XHTML/OPS-ish content into Mobipocket HTML 3.2.
'''
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
import sys
import os
import copy
import re
from lxml import etree
from calibre.ebooks.oeb.base import namespace, barename
from calibre.ebooks.oeb.base import XHTML, XHTML_NS
from calibre.ebooks.oeb.stylizer import Stylizer
from calibre.ebooks.oeb.transforms.flatcss import KeyMapper
MBP_NS = 'http://mobipocket.com/ns/mbp'


def MBP(name):
    """Return ``name`` qualified with the Mobipocket namespace as ``{uri}name``."""
    parts = (MBP_NS, name)
    return '{%s}%s' % parts
MOBI_NSMAP = {None: XHTML_NS, 'mbp': MBP_NS}
HEADER_TAGS = set(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
NESTABLE_TAGS = set(['ol', 'ul', 'li', 'table', 'tr', 'td', 'th'])
TABLE_TAGS = set(['table', 'tr', 'td', 'th'])
SPECIAL_TAGS = set(['hr', 'br'])
CONTENT_TAGS = set(['img', 'hr', 'br'])
PAGE_BREAKS = set(['always', 'odd', 'even'])
COLLAPSE = re.compile(r'[ \t\r\n\v]+')
def asfloat(value):
if not isinstance(value, (int, long, float)):
return 0.0
return float(value)
class BlockState(object):
    """Mutable block-level bookkeeping shared while one document is converted."""

    def __init__(self, body):
        # Output containers: the target body element and the stack of
        # currently open nestable elements.
        self.body = body
        self.nested = []
        # Elements currently receiving content (paragraph, inline, link).
        self.para = None
        self.inline = None
        self.anchor = None
        # Formatting state used for the most recent inline run.
        self.istate = None
        # Vertical space accumulated for the next block.
        self.vmargin = 0.
        self.vpadding = 0.
        # Flags: a page break is pending / some content has been emitted.
        self.pbreak = False
        self.content = False
class FormatState(object):
    """Inline formatting state for one element during conversion.

    Two states compare equal when they would produce the same inline markup;
    only the attributes named in ``_INLINE_KEYS`` take part in the comparison.
    Comparing with ``None`` or any non-``FormatState`` object is safe and
    reports "not equal" instead of raising ``AttributeError``.
    """

    # Attributes that decide whether a new inline wrapper is needed.
    _INLINE_KEYS = ('fsize', 'italic', 'bold', 'href', 'valign',
                    'preserve', 'family')

    def __init__(self):
        self.left = 0.
        self.halign = 'auto'
        self.indent = 0.
        self.fsize = 3
        self.ids = set()
        self.valign = 'baseline'
        self.italic = False
        self.bold = False
        self.preserve = False
        self.family = 'serif'
        self.href = None
        self.list_num = 0
        self.attrib = {}

    def _inline_key(self):
        """Return the tuple of attribute values used for comparison."""
        return tuple(getattr(self, name) for name in self._INLINE_KEYS)

    def __eq__(self, other):
        if not isinstance(other, FormatState):
            return NotImplemented
        return self._inline_key() == other._inline_key()

    def __ne__(self, other):
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result
class MobiMLizer(object):
def transform(self, oeb, context):
    """Rewrite every spine document of ``oeb`` as Mobipocket markup, in place.

    ``context.dest`` supplies the output profile (``fnums``, ``fbase``).
    """
    oeb.logger.info('Converting XHTML to Mobipocket markup...')
    self.oeb = oeb
    profile = context.dest
    self.profile = profile
    # Invert the profile's font table so it can be looked up by its values.
    inverted = dict((value, key) for key, value in profile.fnums.items())
    self.fnums = inverted
    self.fmap = KeyMapper(profile.fbase, profile.fbase, inverted.keys())
    self.remove_html_cover()
    self.mobimlize_spine()
def remove_html_cover(self):
oeb = self.oeb
if not oeb.metadata.cover \
or 'cover' not in oeb.guide:
return
href = oeb.guide['cover'].href
del oeb.guide['cover']
item = oeb.manifest.hrefs[href]
oeb.manifest.remove(item)
def mobimlize_spine(self):
    """Replace the data of each spine item with a freshly built html/body tree."""
    for item in self.oeb.spine:
        stylizer = Stylizer(item.data, item.href, self.oeb, self.profile)
        old_body = item.data.find(XHTML('body'))
        new_root = etree.Element(XHTML('html'), nsmap=MOBI_NSMAP)
        new_body = etree.SubElement(new_root, XHTML('body'))
        block_state = BlockState(new_body)
        format_stack = [FormatState()]
        self.mobimlize_elem(old_body, stylizer, block_state, format_stack)
        item.data = new_root
def mobimlize_font(self, ptsize):
return self.fnums[self.fmap[ptsize]]
def mobimlize_measure(self, ptsize):
if isinstance(ptsize, basestring):
return ptsize
embase = self.profile.fbase
if round(ptsize) < embase:
return "%dpt" % int(round(ptsize))
return "%dem" % int(round(ptsize / embase))
def preize_text(self, text):
text = unicode(text).replace(u' ', u'\xa0')
text = text.replace('\r\n', '\n')
text = text.replace('\r', '\n')
lines = text.split('\n')
result = lines[:1]
for line in lines[1:]:
result.append(etree.Element(XHTML('br')))
if line:
result.append(line)
return result
# Append one piece of content -- the text `text` and/or the element named by
# `tag` -- to the output tree, opening a new block wrapper first when no
# paragraph is currently open. `bstate` is the shared BlockState and
# `istates` the stack of FormatState objects (the last entry is current).
# NOTE(review): this copy of the file has lost its indentation, so the
# nesting of the statements below cannot be confirmed from here.
def mobimlize_content(self, tag, text, bstate, istates):
bstate.content = True
istate = istates[-1]
para = bstate.para
# A bare <hr>/<br> goes into the open paragraph, or straight into the body.
if tag in SPECIAL_TAGS and not text:
para = para if para is not None else bstate.body
elif para is None:
body = bstate.body
# Emit a pending page break before starting the new block.
if bstate.pbreak:
etree.SubElement(body, MBP('pagebreak'))
bstate.pbreak = False
# Flush collected ids as empty <a id="..."> anchors, then forget them.
if istate.ids:
for id in istate.ids:
etree.SubElement(body, XHTML('a'), attrib={'id': id})
istate.ids.clear()
bstate.istate = None
bstate.anchor = None
# New blocks attach to the innermost open nestable element, if any.
parent = bstate.nested[-1] if bstate.nested else bstate.body
indent = istate.indent
left = istate.left
# A negative indent smaller than the left offset is absorbed into it.
if indent < 0 and abs(indent) < left:
left += indent
indent = 0
# Any other non-zero indent below fbase is raised to fbase, keeping its sign.
elif indent != 0 and abs(indent) < self.profile.fbase:
indent = (indent / abs(indent)) * self.profile.fbase
if tag in NESTABLE_TAGS:
para = wrapper = etree.SubElement(parent, XHTML(tag))
bstate.nested.append(para)
# List items are numbered through the counter on the parent's state.
if tag == 'li' and len(istates) > 1:
istates[-2].list_num += 1
para.attrib['value'] = str(istates[-2].list_num)
elif left > 0 and indent >= 0:
# Left offset is approximated with nested blockquotes: one wrapper plus
# up to 10 more, based on the offset measured in units of fbase.
para = wrapper = etree.SubElement(parent, XHTML('blockquote'))
# NOTE(review): redundant, para was already bound to wrapper just above.
para = wrapper
emleft = int(round(left / self.profile.fbase)) - 1
emleft = min((emleft, 10))
while emleft > 0:
para = etree.SubElement(para, XHTML('blockquote'))
emleft -= 1
else:
para = wrapper = etree.SubElement(parent, XHTML('p'))
bstate.inline = bstate.para = para
# Consume the vertical space accumulated since the previous block.
vspace = bstate.vpadding + bstate.vmargin
bstate.vpadding = bstate.vmargin = 0
if tag not in TABLE_TAGS:
wrapper.attrib['height'] = self.mobimlize_measure(vspace)
para.attrib['width'] = self.mobimlize_measure(indent)
elif tag == 'table' and vspace > 0:
# For tables the space is emitted as <br> elements inserted into the body,
# one per fbase of vertical space.
body = bstate.body
vspace = int(round(vspace / self.profile.fbase))
index = max((0, len(body) - 1))
while vspace > 0:
body.insert(index, etree.Element(XHTML('br')))
vspace -= 1
if istate.halign != 'auto':
para.attrib['align'] = istate.halign
pstate = bstate.istate
if tag in CONTENT_TAGS:
# img/hr/br are emitted as elements and reset the inline run.
bstate.inline = para
pstate = bstate.istate = None
etree.SubElement(para, XHTML(tag), attrib=istate.attrib)
elif tag in TABLE_TAGS:
para.attrib['valign'] = 'top'
if not text:
return
# Build a new chain of inline wrappers when the formatting changed.
if not pstate or istate != pstate:
inline = para
valign = istate.valign
fsize = istate.fsize
href = istate.href
if not href:
bstate.anchor = None
elif pstate and pstate.href == href:
# Same link as the previous run: keep writing inside the existing <a>.
inline = bstate.anchor
else:
inline = etree.SubElement(inline, XHTML('a'), href=href)
bstate.anchor = inline
# sup/sub take precedence over an explicit font size (3 is the default size).
if valign == 'super':
inline = etree.SubElement(inline, XHTML('sup'))
elif valign == 'sub':
inline = etree.SubElement(inline, XHTML('sub'))
elif fsize != 3:
inline = etree.SubElement(inline, XHTML('font'),
size=str(fsize))
if istate.family == 'monospace':
inline = etree.SubElement(inline, XHTML('tt'))
if istate.italic:
inline = etree.SubElement(inline, XHTML('i'))
if istate.bold:
inline = etree.SubElement(inline, XHTML('b'))
bstate.inline = inline
bstate.istate = istate
inline = bstate.inline
content = self.preize_text(text) if istate.preserve else [text]
# Strings extend .text of an empty inline element or the tail of its last
# child; non-string items (the <br> elements from preize_text) are appended.
for item in content:
if isinstance(item, basestring):
if len(inline) == 0:
inline.text = (inline.text or '') + item
else:
last = inline[-1]
last.tail = (last.tail or '') + item
else:
inline.append(item)
# Convert one source element and, recursively, its children: derive a new
# FormatState from the element's computed style, push it on `istates`, emit
# the element's content through mobimlize_content, then pop the state.
# NOTE(review): this copy of the file has lost its indentation, so the
# nesting of the statements below cannot be confirmed from here.
def mobimlize_elem(self, elem, stylizer, bstate, istates):
# Skip nodes whose tag is not a string and anything outside the XHTML namespace.
if not isinstance(elem.tag, basestring) \
or namespace(elem.tag) != XHTML_NS:
return
style = stylizer.style(elem)
# <mbp:frame-set/> does not exist lalalala
if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
or style['visibility'] == 'hidden':
return
tag = barename(elem.tag)
# Shallow copy: the `ids` set and `attrib` dict stay shared with the parent state.
istate = copy.copy(istates[-1])
istate.list_num = 0
istates.append(istate)
left = 0
display = style['display']
# Block-level means: not inline*, not floated, and not <br>.
isblock = not display.startswith('inline')
isblock = isblock and style['float'] == 'none'
isblock = isblock and tag != 'br'
if isblock:
bstate.para = None
istate.halign = style['text-align']
istate.indent = style['text-indent']
# Auto margins on both sides are rendered as centring.
if style['margin-left'] == 'auto' \
and style['margin-right'] == 'auto':
istate.halign = 'center'
margin = asfloat(style['margin-left'])
padding = asfloat(style['padding-left'])
if tag != 'body':
left = margin + padding
istate.left += left
# The pending margin becomes the larger of the old and new values; padding
# is added on top and absorbs the pending margin.
vmargin = asfloat(style['margin-top'])
bstate.vmargin = max((bstate.vmargin, vmargin))
vpadding = asfloat(style['padding-top'])
if vpadding > 0:
bstate.vpadding += bstate.vmargin
bstate.vmargin = 0
bstate.vpadding += vpadding
elif not istate.href:
# Non-block elements outside links: horizontal margin/padding is
# approximated with non-breaking spaces.
# NOTE(review): divides by style['font-size']; assumes it is a non-zero number.
margin = asfloat(style['margin-left'])
padding = asfloat(style['padding-left'])
lspace = margin + padding
if lspace > 0:
spaces = int(round((lspace * 3) / style['font-size']))
elem.text = (u'\xa0' * spaces) + (elem.text or '')
margin = asfloat(style['margin-right'])
padding = asfloat(style['padding-right'])
rspace = margin + padding
if rspace > 0:
spaces = int(round((rspace * 3) / style['font-size']))
if len(elem) == 0:
elem.text = (elem.text or '') + (u'\xa0' * spaces)
else:
last = elem[-1]
# NOTE(review): this appends inside the last child (.text); trailing space
# after the child would be .tail -- confirm which is intended.
last.text = (last.text or '') + (u'\xa0' * spaces)
# Page breaks are only recorded once some content has been emitted.
if bstate.content and style['page-break-before'] in PAGE_BREAKS:
bstate.pbreak = True
istate.fsize = self.mobimlize_font(style['font-size'])
istate.italic = True if style['font-style'] == 'italic' else False
weight = style['font-weight']
istate.bold = weight in ('bold', 'bolder') or asfloat(weight) > 400
istate.preserve = (style['white-space'] in ('pre', 'pre-wrap'))
# Substring test on the font-family value; 'monospace' is checked first.
if 'monospace' in style['font-family']:
istate.family = 'monospace'
elif 'sans-serif' in style['font-family']:
istate.family = 'sans-serif'
else:
istate.family = 'serif'
valign = style['vertical-align']
if valign in ('super', 'text-top') or asfloat(valign) > 0:
istate.valign = 'super'
elif valign == 'sub' or asfloat(valign) < 0:
istate.valign = 'sub'
else:
istate.valign = 'baseline'
# Both id and name attributes are collected as anchor targets.
if 'id' in elem.attrib:
istate.ids.add(elem.attrib['id'])
if 'name' in elem.attrib:
istate.ids.add(elem.attrib['name'])
if tag == 'a' and 'href' in elem.attrib:
istate.href = elem.attrib['href']
istate.attrib.clear()
if tag == 'img' and 'src' in elem.attrib:
istate.attrib['src'] = elem.attrib['src']
istate.attrib['align'] = 'baseline'
# A dimension equal to the profile's own width/height becomes '100%';
# other values are expressed in ems of fbase.
for prop in ('width', 'height'):
if style[prop] != 'auto':
value = style[prop]
if value == getattr(self.profile, prop):
result = '100%'
else:
ems = int(round(value / self.profile.fbase))
result = "%dem" % ems
istate.attrib[prop] = result
elif tag == 'hr' and asfloat(style['width']) > 0:
# Rule width as a percentage of the profile width.
# NOTE(review): under Python 2 this truncates if both operands are ints -- confirm types.
prop = style['width'] / self.profile.width
istate.attrib['width'] = "%d%%" % int(round(prop * 100))
# Elements styled as table parts are emitted as the matching table tags.
elif display == 'table':
tag = 'table'
elif display == 'table-row':
tag = 'tr'
elif display == 'table-cell':
tag = 'td'
text = None
if elem.text:
if istate.preserve:
text = elem.text
# Whitespace-only text before child elements is dropped.
elif len(elem) > 0 and elem.text.isspace():
text = None
else:
text = COLLAPSE.sub(' ', elem.text)
if text or tag in CONTENT_TAGS or tag in NESTABLE_TAGS:
self.mobimlize_content(tag, text, bstate, istates)
for child in elem:
self.mobimlize_elem(child, stylizer, bstate, istates)
tail = None
if child.tail:
if istate.preserve:
tail = child.tail
# Whitespace-only tails are dropped while no paragraph is open.
elif bstate.para is None and child.tail.isspace():
tail = None
else:
tail = COLLAPSE.sub(' ', child.tail)
if tail:
self.mobimlize_content(tag, tail, bstate, istates)
if bstate.content and style['page-break-after'] in PAGE_BREAKS:
bstate.pbreak = True
if isblock:
para = bstate.para
# A paragraph holding only one non-breaking space is replaced by <br>.
if para is not None and para.text == u'\xa0':
para.getparent().replace(para, etree.Element(XHTML('br')))
bstate.para = None
bstate.istate = None
# Bottom margin/padding accumulate the same way as the top ones above.
vmargin = asfloat(style['margin-bottom'])
bstate.vmargin = max((bstate.vmargin, vmargin))
vpadding = asfloat(style['padding-bottom'])
if vpadding > 0:
bstate.vpadding += bstate.vmargin
bstate.vmargin = 0
bstate.vpadding += vpadding
if tag in NESTABLE_TAGS and bstate.nested:
bstate.nested.pop()
istates.pop()

View File

@ -2,7 +2,11 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net> ' \
'and Marshall T. Vandegrift <llasram@gmail.com>'
from cStringIO import StringIO
from struct import pack
COUNT_BITS = 3
@ -31,4 +35,54 @@ def decompress_doc(data):
res.append(res[j - di+k])
return ''.join([chr(i) for i in res])
def compress_doc(data):
    """Compress the byte string ``data`` and return the compressed byte string.

    Encodings produced:

    * a repeat of 3-10 bytes seen at most 2047 bytes earlier is written as a
      big-endian 16-bit code ``0x8000 | distance << 3 | (length - 3)``;
    * a space followed by a byte in 0x40-0x7f is written as that byte with
      the high bit set;
    * byte 0 and bytes 0x09-0x7f are written literally;
    * any other byte starts a run of up to 8 such bytes, written as a length
      byte followed by the raw bytes.
    """
    out = StringIO()
    i = 0
    ldata = len(data)
    while i < ldata:
        if i > 10 and (ldata - i) > 10:
            chunk = ''
            match = -1
            # The distance field can only reach 2047 bytes back, so restrict
            # the search to that window instead of scanning the whole prefix
            # and discarding hits that are too far away.
            window = max(0, i - 2047)
            for j in xrange(10, 2, -1):
                chunk = data[i:i+j]
                try:
                    match = data.rindex(chunk, window, i)
                except ValueError:
                    continue
                break
            if match >= 0:
                n = len(chunk)
                m = i - match
                code = 0x8000 + ((m << 3) & 0x3ff8) + (n - 3)
                out.write(pack('>H', code))
                i += n
                continue
        ch = data[i]
        och = ord(ch)
        i += 1
        if ch == ' ' and (i + 1) < ldata:
            onch = ord(data[i])
            if onch >= 0x40 and onch < 0x80:
                out.write(pack('>B', onch ^ 0x80))
                i += 1
                continue
        if och == 0 or (och > 8 and och < 0x80):
            out.write(ch)
        else:
            # Collect up to 8 consecutive bytes that cannot be written
            # literally and emit them behind a length byte.
            j = i
            binseq = [ch]
            while j < ldata and len(binseq) < 8:
                ch = data[j]
                och = ord(ch)
                if och == 0 or (och > 8 and och < 0x80):
                    break
                binseq.append(ch)
                j += 1
            out.write(pack('>B', len(binseq)))
            out.write(''.join(binseq))
            i += len(binseq) - 1
    return out.getvalue()

View File

@ -0,0 +1,583 @@
'''
Write content to Mobipocket books.
'''
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
import sys
import os
from struct import pack
import functools
import time
import random
from cStringIO import StringIO
import re
from itertools import izip, count
from collections import defaultdict
from urlparse import urldefrag
import logging
from lxml import etree
from PIL import Image
from calibre.ebooks.oeb.base import XML_NS, XHTML, XHTML_NS, OEB_DOCS, \
OEB_RASTER_IMAGES
from calibre.ebooks.oeb.base import xpath, barename, namespace, prefixname
from calibre.ebooks.oeb.base import Logger, OEBBook
from calibre.ebooks.oeb.profile import Context
from calibre.ebooks.oeb.transforms.flatcss import CSSFlattener
from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer
from calibre.ebooks.oeb.transforms.trimmanifest import ManifestTrimmer
from calibre.ebooks.oeb.transforms.htmltoc import HTMLTOCAdder
from calibre.ebooks.oeb.transforms.manglecase import CaseMangler
from calibre.ebooks.mobi.palmdoc import compress_doc
from calibre.ebooks.mobi.langcodes import iana2mobi
from calibre.ebooks.mobi.mobiml import MBP_NS, MBP, MobiMLizer
from calibre.customize.ui import run_plugins_on_postprocess
from calibre.utils.config import OptionParser
from optparse import OptionGroup
# TODO:
# - Allow override CSS (?)
# - Generate index records
# - Optionally rasterize tables
EXTH_CODES = {
'creator': 100,
'publisher': 101,
'description': 103,
'identifier': 104,
'subject': 105,
'date': 106,
'review': 107,
'contributor': 108,
'rights': 109,
'type': 111,
'source': 112,
'title': 503,
}
RECORD_SIZE = 0x1000
UNCOMPRESSED = 1
PALMDOC = 2
HUFFDIC = 17480
PALM_MAX_IMAGE_SIZE = 63 * 1024
OTHER_MAX_IMAGE_SIZE = 10 * 1024 * 1024
MAX_THUMB_SIZE = 16 * 1024
MAX_THUMB_DIMEN = (180, 240)
def encode(data):
    """Return ``data`` encoded as UTF-8."""
    encoded = data.encode('utf-8')
    return encoded
# Almost like the one for MS LIT, but not quite.
DECINT_FORWARD = 0
DECINT_BACKWARD = 1


def decint(value, direction):
    """Encode a non-negative integer as a variable-width byte string.

    The value is split into 7-bit groups, written most significant group
    first. ``direction`` chooses which byte carries the 0x80 marker bit:
    ``DECINT_FORWARD`` sets it on the last output byte, ``DECINT_BACKWARD``
    on the first. Any other direction leaves all bytes unmarked.

    :raises ValueError: if ``value`` is negative. Shifting a negative number
        right never reaches zero, so it would otherwise loop forever.
    """
    if value < 0:
        raise ValueError('decint() cannot encode negative value: %r' % (value,))
    # Groups are collected least significant first and reversed on output.
    septets = []
    while True:
        septets.append(value & 0x7f)
        value >>= 7
        if value == 0:
            break
    if direction == DECINT_FORWARD:
        septets[0] |= 0x80
    elif direction == DECINT_BACKWARD:
        septets[-1] |= 0x80
    return ''.join(chr(b) for b in reversed(septets))
class Serializer(object):
NSRMAP = {'': None, XML_NS: 'xml', XHTML_NS: '', MBP_NS: 'mbp'}
def __init__(self, oeb, images):
self.oeb = oeb
self.images = images
self.id_offsets = {}
self.href_offsets = defaultdict(list)
self.breaks = []
buffer = self.buffer = StringIO()
buffer.write('<html>')
self.serialize_head()
self.serialize_body()
buffer.write('</html>')
self.fixup_links()
self.text = buffer.getvalue()
def serialize_head(self):
buffer = self.buffer
buffer.write('<head>')
if len(self.oeb.guide) > 0:
self.serialize_guide()
buffer.write('</head>')
def serialize_guide(self):
    """Write the ``<guide>`` element with one ``<reference>`` per document.

    Guide entries are skipped when their target is missing from the manifest
    (previously a ``KeyError``) or is not one of the ``OEB_DOCS`` media types.
    """
    buffer = self.buffer
    hrefs = self.oeb.manifest.hrefs
    buffer.write('<guide>')
    for ref in self.oeb.guide.values():
        path = urldefrag(ref.href)[0]
        if path not in hrefs or hrefs[path].media_type not in OEB_DOCS:
            continue
        buffer.write('<reference type="')
        self.serialize_text(ref.type, quot=True)
        buffer.write('" ')
        if ref.title is not None:
            buffer.write('title="')
            self.serialize_text(ref.title, quot=True)
            buffer.write('" ')
        self.serialize_href(ref.href)
        # Space required or won't work, I kid you not
        buffer.write(' />')
    buffer.write('</guide>')
def serialize_href(self, href, base=None):
hrefs = self.oeb.manifest.hrefs
path, frag = urldefrag(href)
if path and base:
path = base.abshref(path)
if path and path not in hrefs:
return False
buffer = self.buffer
item = hrefs[path] if path else None
if item and item.spine_position is None:
return False
id = item.id if item else base.id
href = '#'.join((id, frag)) if frag else id
buffer.write('filepos=')
self.href_offsets[href].append(buffer.tell())
buffer.write('0000000000')
return True
def serialize_body(self):
buffer = self.buffer
buffer.write('<body>')
# CybookG3 'Start Reading' link
if 'text' in self.oeb.guide:
href = self.oeb.guide['text'].href
buffer.write('<a ')
self.serialize_href(href)
buffer.write(' />')
spine = [item for item in self.oeb.spine if item.linear]
spine.extend([item for item in self.oeb.spine if not item.linear])
for item in spine:
self.serialize_item(item)
buffer.write('</body>')
def serialize_item(self, item):
    """Write the body children of one spine item, followed by a page break.

    The item's starting offset is recorded in ``self.id_offsets``; for a
    non-linear item the preceding byte offset is also added to ``self.breaks``.
    """
    out = self.buffer
    if not item.linear:
        self.breaks.append(out.tell() - 1)
    self.id_offsets[item.id] = out.tell()
    body = item.data.find(XHTML('body'))
    for child in body:
        self.serialize_elem(child, item)
    out.write('<mbp:pagebreak/>')
def serialize_elem(self, elem, item, nsrmap=NSRMAP):
    """Write ``elem`` (from spine item ``item``) and its subtree to the buffer.

    * Nodes whose tag is not a string, or whose namespace is not in
      ``nsrmap``, are skipped.
    * ``name``/``id`` attributes are recorded in ``self.id_offsets`` and
      removed from ``elem`` (the input tree is modified).
    * ``href`` becomes a ``filepos`` placeholder when it resolves to a spine
      item; ``src`` becomes ``recindex`` when it names a known image.
      Otherwise the attribute is written unchanged.

    The image lookup tests ``self.images`` itself, so a ``src`` that points
    at a manifest item which is not a mapped image no longer raises
    ``KeyError``.
    """
    buffer = self.buffer
    if not isinstance(elem.tag, basestring) \
       or namespace(elem.tag) not in nsrmap:
        return
    tag = prefixname(elem.tag, nsrmap)
    for attr in ('name', 'id'):
        if attr in elem.attrib:
            id = '#'.join((item.id, elem.attrib[attr]))
            self.id_offsets[id] = buffer.tell()
            del elem.attrib[attr]
    # An anchor left with no attributes and no content has served its purpose.
    if tag == 'a' and not elem.attrib \
       and not len(elem) and not elem.text:
        return
    buffer.write('<')
    buffer.write(tag)
    if elem.attrib:
        for attr, val in elem.attrib.items():
            if namespace(attr) not in nsrmap:
                continue
            attr = prefixname(attr, nsrmap)
            buffer.write(' ')
            if attr == 'href':
                if self.serialize_href(val, item):
                    continue
            elif attr == 'src':
                href = item.abshref(val)
                if href in self.images:
                    index = self.images[href]
                    buffer.write('recindex="%05d"' % index)
                    continue
            buffer.write(attr)
            buffer.write('="')
            self.serialize_text(val, quot=True)
            buffer.write('"')
    if elem.text or len(elem) > 0:
        buffer.write('>')
        if elem.text:
            self.serialize_text(elem.text)
        for child in elem:
            self.serialize_elem(child, item)
            if child.tail:
                self.serialize_text(child.tail)
        buffer.write('</%s>' % tag)
    else:
        buffer.write('/>')
def serialize_text(self, text, quot=False):
    """Escape ``&``, ``<`` and ``>`` (and ``"`` when ``quot``) and write the text."""
    replacements = [('&', '&amp;'), ('<', '&lt;'), ('>', '&gt;')]
    if quot:
        replacements.append(('"', '&quot;'))
    # '&' must stay first so the entities produced below are not re-escaped.
    for raw, entity in replacements:
        text = text.replace(raw, entity)
    self.buffer.write(encode(text))
def fixup_links(self):
buffer = self.buffer
for id, hoffs in self.href_offsets.items():
ioff = self.id_offsets[id]
for hoff in hoffs:
buffer.seek(hoff)
buffer.write('%010d' % ioff)
class MobiWriter(object):
COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+')
def __init__(self, compression=None, imagemax=None):
    """Store the writer settings; falsy arguments select the defaults.

    The defaults are ``UNCOMPRESSED`` and ``OTHER_MAX_IMAGE_SIZE``.
    """
    if not compression:
        compression = UNCOMPRESSED
    if not imagemax:
        imagemax = OTHER_MAX_IMAGE_SIZE
    self._compression = compression
    self._imagemax = imagemax
def dump(self, oeb, path):
if hasattr(path, 'write'):
return self._dump_stream(oeb, path)
with open(path, 'w+b') as stream:
return self._dump_stream(oeb, stream)
def _write(self, *data):
for datum in data:
self._stream.write(datum)
def _tell(self):
return self._stream.tell()
def _dump_stream(self, oeb, stream):
    """Generate all records for *oeb* and write them to *stream*.

    Record slot 0 is reserved with ``None`` up front; it is filled in by
    ``_generate_record0`` once the content records exist.
    """
    self._oeb = oeb
    self._stream = stream
    self._records = [None]
    # The stages depend on each other's results, so the order is fixed.
    stages = ('_generate_content', '_generate_record0',
              '_write_header', '_write_content')
    for stage in stages:
        getattr(self, stage)()
def _generate_content(self):
    """Build the content records: image index map, then text, then images."""
    for step in ('_map_image_names', '_generate_text', '_generate_images'):
        getattr(self, step)()
def _map_image_names(self):
    """Assign a 1-based index to every raster image in the manifest.

    The mapping from item href to index is stored in ``self._images``.
    """
    self._images = images = {}
    raster_hrefs = (item.href for item in self._oeb.manifest.values()
                    if item.media_type in OEB_RASTER_IMAGES)
    for index, href in izip(count(1), raster_hrefs):
        images[href] = index
# Read the next text record from the seekable byte stream `text`, starting
# at its current position.
#
# Returns a `(data, overlap)` pair: `data` is up to RECORD_SIZE bytes read
# from the starting position, and `overlap` is the bytes that immediately
# follow `data` and are needed to finish a UTF-8 sequence that the record
# boundary would otherwise cut in half (empty when the boundary falls
# between characters).  On return the stream is positioned at the record
# boundary, not after the overlap, so the next call starts there.
def _read_text_record(self, text):
pos = text.tell()
# Seek to the end to learn the stream length, then clamp the record
# boundary to it.
text.seek(0, 2)
npos = min((pos + RECORD_SIZE, text.tell()))
last = ''
# Grow a window backwards from the boundary, one byte per pass, until it
# holds at least one byte sequence that decodes to a character.
while not last.decode('utf-8', 'ignore'):
size = len(last) + 1
text.seek(npos - size)
last = text.read(size)
extra = 0
try:
# A strict decode fails when the window ends inside a multi-byte sequence.
last.decode('utf-8')
except UnicodeDecodeError:
prev = len(last)
# Re-read from the same start, extending the window one byte past its
# previous end each pass, until it decodes cleanly.
while True:
text.seek(npos - prev)
last = text.read(len(last) + 1)
try:
last.decode('utf-8')
except UnicodeDecodeError:
pass
else:
break
# Number of bytes that had to be borrowed from beyond the boundary.
extra = len(last) - prev
text.seek(pos)
data = text.read(RECORD_SIZE)
overlap = text.read(extra)
# Rewind to the boundary: the overlap bytes are read again as the start of
# the next record.
text.seek(npos)
return data, overlap
# Serialize the book's markup and split it into text records appended to
# `self._records`.
#
# Sets `self._text_length` (length of the serialized text) and
# `self._text_nrecords` (number of text records produced).  Each record is
# laid out as: record data (PalmDOC-compressed when that compression is
# selected), the overlap bytes from `_read_text_record`, one byte holding the
# overlap length, the encoded break entries, and finally a backward-encoded
# size covering the break entries plus the size field itself.
def _generate_text(self):
self._oeb.logger.info('Serializing markup content...')
serializer = Serializer(self._oeb, self._images)
breaks = serializer.breaks
text = serializer.text
self._text_length = len(text)
text = StringIO(text)
nrecords = 0
offset = 0
if self._compression != UNCOMPRESSED:
self._oeb.logger.info('Compressing markup content...')
data, overlap = self._read_text_record(text)
# `_read_text_record` returns empty data once the stream is exhausted.
while len(data) > 0:
if self._compression == PALMDOC:
data = compress_doc(data)
record = StringIO()
record.write(data)
record.write(overlap)
record.write(pack('>B', len(overlap)))
nextra = 0
pbreak = 0
running = offset
# Consume every break that falls before the end of this record.  Note that
# this pops entries from the serializer's own `breaks` list.
while breaks and (breaks[0] - offset) < RECORD_SIZE:
# Each break is stored as a delta from the previous one, divided by 8.
pbreak = (breaks.pop(0) - running) >> 3
encoded = decint(pbreak, DECINT_FORWARD)
record.write(encoded)
running += pbreak << 3
nextra += len(encoded)
# The trailing size counts its own bytes, so search for the smallest field
# width whose encoding of (break bytes + width) is exactly that wide.
lsize = 1
while True:
size = decint(nextra + lsize, DECINT_BACKWARD)
if len(size) == lsize:
break
lsize += 1
record.write(size)
self._records.append(record.getvalue())
nrecords += 1
offset += RECORD_SIZE
data, overlap = self._read_text_record(text)
self._text_nrecords = nrecords
def _rescale_image(self, data, maxsizeb, dimen=None):
    """Return image *data* re-encoded so that it fits in *maxsizeb* bytes.

    data     -- the encoded source image, as a byte string.
    maxsizeb -- the maximum acceptable size of the result in bytes.
    dimen    -- optional ``(width, height)`` bound; when given the image is
                first shrunk in place to fit within it.

    Images that are neither JPEG nor GIF are converted: to GIF when they
    cover at most 40000 pixels, to JPEG otherwise.  If the result is still
    too large, progressively lower JPEG qualities are tried, and after that
    progressively smaller scaled copies at quality 0.  If nothing fits, the
    last (smallest-scale) attempt is returned even though it exceeds
    *maxsizeb*.
    """
    image = Image.open(StringIO(data))
    fmt = image.format
    changed = False
    if image.format not in ('JPEG', 'GIF'):
        width, height = image.size
        area = width * height
        fmt = 'GIF' if area <= 40000 else 'JPEG'
        changed = True
    if dimen is not None:
        image.thumbnail(dimen, Image.ANTIALIAS)
        changed = True
    if changed:
        data = StringIO()
        image.save(data, fmt)
        data = data.getvalue()
    if len(data) <= maxsizeb:
        return data
    image = image.convert('RGBA')
    # First attempt: keep the dimensions and lower the JPEG quality.
    for quality in xrange(95, -1, -1):
        data = StringIO()
        image.save(data, 'JPEG', quality=quality)
        data = data.getvalue()
        if len(data) <= maxsizeb:
            return data
    # Second attempt: shrink the image in 1% steps at the lowest quality.
    width, height = image.size
    for scale in xrange(99, 0, -1):
        scale = scale / 100.
        data = StringIO()
        scaled = image.copy()
        # Both dimensions must be integers; the height was previously left
        # as a float.
        size = (int(width * scale), int(height * scale))
        scaled.thumbnail(size, Image.ANTIALIAS)
        scaled.save(data, 'JPEG', quality=0)
        data = data.getvalue()
        if len(data) <= maxsizeb:
            return data
    # Well, we tried?
    return data
def _generate_images(self):
    """Append one record per mapped image, in ascending image-index order.

    Each image is passed through ``_rescale_image`` with the configured
    maximum size before being stored.
    """
    # Routine progress message: logged at info level, matching the
    # equivalent message in _generate_text.
    self._oeb.logger.info('Serializing images...')
    ordered = sorted((index, href) for href, index in self._images.items())
    for _, href in ordered:
        item = self._oeb.manifest.hrefs[href]
        data = self._rescale_image(item.data, self._imagemax)
        self._records.append(data)
# Build record 0 and store it in `self._records[0]`.
#
# The record consists of a 16-byte leading header (compression, text length,
# text record count, record size), the 'MOBI' header, the EXTH block from
# `_build_exth`, and the book title, NUL-padded to 2452 bytes.  No padding is
# added when the content is already that long or longer.
def _generate_record0(self):
metadata = self._oeb.metadata
# NOTE: _build_exth may append a thumbnail record as a side effect.
exth = self._build_exth()
record0 = StringIO()
record0.write(pack('>HHIHHHH', self._compression, 0,
self._text_length, self._text_nrecords, RECORD_SIZE, 0, 0))
uid = random.randint(0, 0xffffffff)
title = str(metadata.title[0])
record0.write('MOBI')
# 0xe8 is presumably the MOBI header length (it is reused in the title
# offset below) and 65001 presumably the UTF-8 code page -- TODO confirm.
record0.write(pack('>IIIII', 0xe8, 2, 65001, uid, 6))
record0.write('\xff' * 40)
# Index of the first record after the text records (record 0 precedes them).
record0.write(pack('>I', self._text_nrecords + 1))
# Title offset and length; the title is written right after the EXTH block.
record0.write(pack('>II', 0xe8 + 16 + len(exth), len(title)))
record0.write(iana2mobi(str(metadata.language[0])))
record0.write('\0' * 8)
record0.write(pack('>II', 6, self._text_nrecords + 1))
record0.write('\0' * 16)
record0.write(pack('>I', 0x50))
record0.write('\0' * 32)
record0.write(pack('>IIII', 0xffffffff, 0xffffffff, 0, 0))
# The '5' is a bitmask of extra record data at the end:
# - 0x1: <extra multibyte bytes><size> (?)
# - 0x4: <uncrossable breaks><size>
# Of course, the formats aren't quite the same.
# TODO: Identify and document the remaining fields in this block.
record0.write(pack('>IIIIIIIIIIIIIIIII',
0, 0, 0, 0xffffffff, 0, 0xffffffff, 0, 0xffffffff, 0, 0xffffffff,
0, 0xffffffff, 0, 0xffffffff, 0xffffffff, 5, 0xffffffff))
record0.write(exth)
record0.write(title)
record0 = record0.getvalue()
self._records[0] = record0 + ('\0' * (2452 - len(record0)))
# Build and return the EXTH block as a byte string.
#
# Every metadata term that has an entry in EXTH_CODES contributes one entry
# per item: the code, the entry length (payload + 8 header bytes) and the
# UTF-8 payload with runs of whitespace collapsed to a single space.  When
# the book has a cover, three further 12-byte entries are added.  The result
# is 'EXTH', the block length (unpadded payload + 12) and the entry count,
# followed by the entries and NUL padding up to a multiple of four bytes.
#
# Side effect: when a cover exists, `_add_thumbnail` appends a thumbnail
# record and a new manifest item.
def _build_exth(self):
oeb = self._oeb
exth = StringIO()
nrecs = 0
for term in oeb.metadata:
if term not in EXTH_CODES: continue
code = EXTH_CODES[term]
for item in oeb.metadata[term]:
data = self.COLLAPSE_RE.sub(' ', unicode(item))
data = data.encode('utf-8')
exth.write(pack('>II', code, len(data) + 8))
exth.write(data)
nrecs += 1
if oeb.metadata.cover:
# The cover metadata item holds the manifest id of the cover image.
id = str(oeb.metadata.cover[0])
item = oeb.manifest.ids[id]
href = item.href
# Image indices in self._images are 1-based; the stored value is 0-based.
index = self._images[href] - 1
# 0xc9 / 0xcb / 0xca: presumably cover offset, a flag and thumbnail
# offset -- TODO confirm against a format reference.
exth.write(pack('>III', 0xc9, 0x0c, index))
exth.write(pack('>III', 0xcb, 0x0c, 0))
index = self._add_thumbnail(item) - 1
exth.write(pack('>III', 0xca, 0x0c, index))
nrecs += 3
exth = exth.getvalue()
trail = len(exth) % 4
pad = '' if not trail else '\0' * (4 - trail)
# The length field is computed before padding, so it excludes the pad bytes.
exth = ['EXTH', pack('>II', len(exth) + 12, nrecs), exth, pad]
return ''.join(exth)
def _add_thumbnail(self, item):
    """Create a thumbnail of *item* and register it as a new image record.

    The thumbnail is added to the manifest under a freshly generated id and
    href, mapped to the next image index in ``self._images`` and appended to
    ``self._records``.  Returns that (1-based) image index.
    """
    thumb = self._rescale_image(item.data, MAX_THUMB_SIZE, MAX_THUMB_DIMEN)
    manifest = self._oeb.manifest
    thumb_id, thumb_href = manifest.generate('thumbnail', 'thumbnail.jpeg')
    manifest.add(thumb_id, thumb_href, 'image/jpeg', data=thumb)
    position = len(self._images) + 1
    self._images[thumb_href] = position
    self._records.append(thumb)
    return position
def _write_header(self):
    """Write the database header and the record offset table.

    The database name is the book title with every run of characters other
    than ASCII letters, digits and '-' replaced by '_', cut to 32 characters
    and NUL-padded to exactly 32.  One 8-byte table entry is written per
    record: its absolute offset, a zero byte and a 3-byte record number.
    """
    raw_title = str(self._oeb.metadata.title[0])
    name = re.sub('[^-A-Za-z0-9]+', '_', raw_title)[:32].ljust(32, '\0')
    now = int(time.time())
    nrecords = len(self._records)
    self._write(name, pack('>HHIIIIII', 0, 0, now, now, 0, 0, 0, 0),
                'BOOK', 'MOBI', pack('>IIH', nrecords, 0, nrecords))
    # Record data starts after the offset table and its two trailing bytes.
    offset = self._tell() + (8 * nrecords) + 2
    for number, record in enumerate(self._records):
        self._write(pack('>I', offset), '\0', pack('>I', number)[1:])
        offset += len(record)
    self._write('\0\0')
def _write_content(self):
for record in self._records:
self._write(record)
def add_mobi_options(parser):
    """Add the 'Mobipocket' and 'Profiles' option groups to *parser*."""
    profiles = ', '.join(sorted(Context.PROFILES.keys()))

    mobi = OptionGroup(parser, _('Mobipocket'),
                       _('Mobipocket-specific options.'))
    mobi.add_option(
        '-c', '--compress', default=False, action='store_true',
        help=_('Compress file text using PalmDOC compression.'))
    mobi.add_option(
        '-r', '--rescale-images', default=False, action='store_true',
        help=_('Modify images to meet Palm device size limitations.'))
    parser.add_option_group(mobi)

    description = _('Device renderer profiles. '
        'Affects conversion of default font sizes and rasterization '
        'resolution. Valid profiles are: %s.') % profiles
    renderers = OptionGroup(parser, _('Profiles'), description)
    renderers.add_option(
        '--source-profile', default='Browser', metavar='PROFILE',
        help=_("Source renderer profile. Default is 'Browser'."))
    renderers.add_option(
        '--dest-profile', default='CybookG3', metavar='PROFILE',
        help=_("Destination renderer profile. Default is 'CybookG3'."))
    parser.add_option_group(renderers)
def option_parser():
    """Return the command-line option parser for the OEB-to-MOBI converter."""
    parser = OptionParser(usage=_('%prog [options] OPFFILE'))
    general = (
        (('-o', '--output'),
         dict(default=None,
              help=_('Output file. Default is derived from input filename.'))),
        (('-v', '--verbose'),
         dict(default=0, action='count',
              help=_('Useful for debugging.'))),
    )
    for flags, settings in general:
        parser.add_option(*flags, **settings)
    add_mobi_options(parser)
    return parser
# Convert the OPF-described book at `inpath` to a MOBI file.
#
# `opts` is the parsed option object from `option_parser`; it supplies the
# output path, verbosity, source/destination profile names and the
# compress / rescale-images switches.
#
# Returns 1 when either profile name is unknown.  On success the function
# falls off the end and so returns None.
def oeb2mobi(opts, inpath):
logger = Logger(logging.getLogger('oeb2mobi'))
logger.setup_cli_handler(opts.verbose)
outpath = opts.output
if outpath is None:
# Default output: the input file's base name with a '.mobi' extension,
# relative to the current directory.
outpath = os.path.basename(inpath)
outpath = os.path.splitext(outpath)[0] + '.mobi'
source = opts.source_profile
if source not in Context.PROFILES:
logger.error(_('Unknown source profile %r') % source)
return 1
dest = opts.dest_profile
if dest not in Context.PROFILES:
logger.error(_('Unknown destination profile %r') % dest)
return 1
compression = PALMDOC if opts.compress else UNCOMPRESSED
# None makes MobiWriter fall back to its default image size limit.
imagemax = PALM_MAX_IMAGE_SIZE if opts.rescale_images else None
context = Context(source, dest)
oeb = OEBBook(inpath, logger=logger)
# The transforms below modify `oeb` in place and run in this fixed order.
tocadder = HTMLTOCAdder()
tocadder.transform(oeb, context)
mangler = CaseMangler()
mangler.transform(oeb, context)
fbase = context.dest.fbase
fkey = context.dest.fnums.values()
flattener = CSSFlattener(
fbase=fbase, fkey=fkey, unfloat=True, untable=True)
flattener.transform(oeb, context)
rasterizer = SVGRasterizer()
rasterizer.transform(oeb, context)
trimmer = ManifestTrimmer()
trimmer.transform(oeb, context)
mobimlizer = MobiMLizer()
mobimlizer.transform(oeb, context)
writer = MobiWriter(compression=compression, imagemax=imagemax)
writer.dump(oeb, outpath)
run_plugins_on_postprocess(outpath, 'mobi')
logger.info(_('Output written to ') + outpath)
def main(argv=sys.argv):
    """Command-line entry point.

    Expects exactly one positional argument, the input OPF path.  With any
    other count the help text is printed and 1 is returned; otherwise the
    result of ``oeb2mobi`` is returned.
    """
    parser = option_parser()
    opts, args = parser.parse_args(argv[1:])
    if len(args) == 1:
        return oeb2mobi(opts, args[0])
    parser.print_help()
    return 1
if __name__ == '__main__':
sys.exit(main())

View File

@ -0,0 +1,2 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'

View File

@ -14,10 +14,14 @@ from itertools import izip, count
from urlparse import urldefrag, urlparse, urlunparse
from urllib import unquote as urlunquote
import logging
import re
import htmlentitydefs
import uuid
import copy
from lxml import etree
from calibre import LoggingInterface
XML_PARSER = etree.XMLParser(recover=True, resolve_entities=False)
XML_PARSER = etree.XMLParser(recover=True)
XML_NS = 'http://www.w3.org/XML/1998/namespace'
XHTML_NS = 'http://www.w3.org/1999/xhtml'
OPF1_NS = 'http://openebook.org/namespaces/oeb-package/1.0/'
@ -28,25 +32,48 @@ DC11_NS = 'http://purl.org/dc/elements/1.1/'
XSI_NS = 'http://www.w3.org/2001/XMLSchema-instance'
DCTERMS_NS = 'http://purl.org/dc/terms/'
NCX_NS = 'http://www.daisy.org/z3986/2005/ncx/'
SVG_NS = 'http://www.w3.org/2000/svg'
XLINK_NS = 'http://www.w3.org/1999/xlink'
XPNSMAP = {'h': XHTML_NS, 'o1': OPF1_NS, 'o2': OPF2_NS,
'd09': DC09_NS, 'd10': DC10_NS, 'd11': DC11_NS,
'xsi': XSI_NS, 'dt': DCTERMS_NS, 'ncx': NCX_NS}
'xsi': XSI_NS, 'dt': DCTERMS_NS, 'ncx': NCX_NS,
'svg': SVG_NS, 'xl': XLINK_NS}
def XML(name): return '{%s}%s' % (XML_NS, name)
def XHTML(name): return '{%s}%s' % (XHTML_NS, name)
def OPF(name): return '{%s}%s' % (OPF2_NS, name)
def DC(name): return '{%s}%s' % (DC11_NS, name)
def NCX(name): return '{%s}%s' % (NCX_NS, name)
def SVG(name): return '{%s}%s' % (SVG_NS, name)
def XLINK(name): return '{%s}%s' % (XLINK_NS, name)
EPUB_MIME = 'application/epub+zip'
XHTML_MIME = 'application/xhtml+xml'
CSS_MIME = 'text/css'
NCX_MIME = 'application/x-dtbncx+xml'
OPF_MIME = 'application/oebps-package+xml'
OEB_DOC_MIME = 'text/x-oeb1-document'
OEB_CSS_MIME = 'text/x-oeb1-css'
OPENTYPE_MIME = 'font/opentype'
GIF_MIME = 'image/gif'
JPEG_MIME = 'image/jpeg'
PNG_MIME = 'image/png'
SVG_MIME = 'image/svg+xml'
OEB_STYLES = set([CSS_MIME, OEB_CSS_MIME, 'text/x-oeb-css'])
OEB_DOCS = set([XHTML_MIME, 'text/html', OEB_DOC_MIME, 'text/x-oeb-document'])
OEB_RASTER_IMAGES = set([GIF_MIME, JPEG_MIME, PNG_MIME])
OEB_IMAGES = set([GIF_MIME, JPEG_MIME, PNG_MIME, SVG_MIME])
MS_COVER_TYPE = 'other.ms-coverimage-standard'
recode = lambda s: s.decode('iso-8859-1').encode('ascii', 'xmlcharrefreplace')
ENTITYDEFS = dict((k, recode(v)) for k, v in htmlentitydefs.entitydefs.items())
del ENTITYDEFS['lt']
del ENTITYDEFS['gt']
del ENTITYDEFS['quot']
del ENTITYDEFS['amp']
del recode
def element(parent, *args, **kwargs):
@ -64,10 +91,23 @@ def barename(name):
return name.split('}', 1)[1]
return name
def prefixname(name, nsrmap):
    """Return *name* in ``prefix:local`` form.

    *nsrmap* maps namespace URIs to prefixes.  When the prefix mapped to the
    namespace of *name* is empty or ``None``, only the local name is
    returned.  Raises ``KeyError`` if the namespace is not in *nsrmap*.
    """
    prefix = nsrmap[namespace(name)]
    local = barename(name)
    if prefix:
        return ':'.join((prefix, local))
    return local
def xpath(elem, expr):
return elem.xpath(expr, namespaces=XPNSMAP)
URL_UNSAFE = r"""`!@#$%^&*[](){}?+=;:'",<>\| """
def xml2str(root):
    """Serialize the element tree *root* to UTF-8 with an XML declaration."""
    serialized = etree.tostring(root, xml_declaration=True, encoding='utf-8')
    return serialized
ASCII_CHARS = set(chr(x) for x in xrange(128))
URL_SAFE = set(u'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
u'abcdefghijklmnopqrstuvwxyz'
u'0123456789' u'_.-/~')
URL_UNSAFE = ASCII_CHARS - URL_SAFE
def urlquote(href):
result = []
for char in href:
@ -84,12 +124,20 @@ def urlnormalize(href):
return urlunparse(parts)
class OEBError(Exception):
pass
class FauxLogger(object):
    """Minimal stand-in logger that prints every message.

    Any attribute that is not otherwise found resolves to the logger itself,
    so calls such as ``logger.warn(msg)`` or ``logger.info(msg)`` all end up
    in ``__call__``.
    """

    def __getattr__(self, name):
        return self

    def __call__(self, message):
        print(message)
class Logger(LoggingInterface, object):
    """``LoggingInterface`` whose ``log_*`` methods are reachable by short name.

    An attribute that is not found normally is looked up again with a
    ``log_`` prefix, so ``logger.warn`` resolves to ``logger.log_warn``.
    """

    def __getattr__(self, name):
        prefixed = 'log_' + name
        return object.__getattribute__(self, prefixed)
class AbstractContainer(object):
def read_xml(self, path):
@ -108,25 +156,45 @@ class DirContainer(AbstractContainer):
def write(self, path, data):
path = os.path.join(self.rootdir, path)
dir = os.path.dirname(path)
if not os.path.isdir(dir):
os.makedirs(dir)
with open(urlunquote(path), 'wb') as f:
return f.write(data)
def exists(self, path):
path = os.path.join(self.rootdir, path)
return os.path.isfile(path)
return os.path.isfile(urlunquote(path))
class DirWriter(object):
    """Write a book out as a directory of files."""

    def __init__(self, version=2.0):
        # Metadata version to emit: OPF 2 when equal to 2, OPF 1 otherwise.
        self.version = version

    def dump(self, oeb, path):
        """Write every manifest item and the package metadata under *path*.

        The directory *path* is created if it does not exist; its parent
        must already exist.
        """
        if not os.path.isdir(path):
            os.mkdir(path)
        output = DirContainer(path)
        for item in oeb.manifest.values():
            output.write(item.href, str(item))
        if self.version == 2:
            metadata = oeb.to_opf2()
        else:
            metadata = oeb.to_opf1()
        for href, data in metadata.values():
            output.write(href, xml2str(data))
class Metadata(object):
TERMS = set(['contributor', 'coverage', 'creator', 'date', 'description',
'format', 'identifier', 'language', 'publisher', 'relation',
'rights', 'source', 'subject', 'title', 'type'])
ATTRS = set(['role', 'file-as', 'scheme'])
OPF1_NSMAP = {'dc': DC11_NS, 'oebpackage': OPF1_NS}
OPF2_NSMAP = {'opf': OPF2_NS, 'dc': DC11_NS, 'dcterms': DCTERMS_NS,
'xsi': XSI_NS}
class Item(object):
def __init__(self, term, value, fq_attrib={}):
self.fq_attrib = dict(fq_attrib)
def __init__(self, term, value, fq_attrib={}, **kwargs):
self.fq_attrib = fq_attrib = dict(fq_attrib)
fq_attrib.update(kwargs)
if term == OPF('meta') and not value:
term = self.fq_attrib.pop('name')
value = self.fq_attrib.pop('content')
@ -136,7 +204,12 @@ class Metadata(object):
self.value = value
self.attrib = attrib = {}
for fq_attr in fq_attrib:
attr = barename(fq_attr)
if fq_attr in Metadata.ATTRS:
attr = fq_attr
fq_attr = OPF2(fq_attr)
fq_attrib[fq_attr] = fq_attrib.pop(attr)
else:
attr = barename(fq_attr)
attrib[attr] = fq_attrib[fq_attr]
def __getattr__(self, name):
@ -153,7 +226,7 @@ class Metadata(object):
% (barename(self.term), self.value, self.attrib)
def __str__(self):
return str(self.value)
return unicode(self.value).encode('ascii', 'xmlcharrefreplace')
def __unicode__(self):
return unicode(self.value)
@ -183,8 +256,8 @@ class Metadata(object):
self.oeb = oeb
self.items = defaultdict(list)
def add(self, term, value, attrib={}):
item = self.Item(term, value, attrib)
def add(self, term, value, attrib={}, **kwargs):
item = self.Item(term, value, attrib, **kwargs)
items = self.items[barename(item.term)]
items.append(item)
return item
@ -225,7 +298,11 @@ class Metadata(object):
class Manifest(object):
class Item(object):
def __init__(self, id, href, media_type, fallback=None, loader=str):
ENTITY_RE = re.compile(r'&([a-zA-Z_:][a-zA-Z0-9.-_:]+);')
NUM_RE = re.compile('^(.*)([0-9][0-9.]*)(?=[.]|$)')
def __init__(self, id, href, media_type,
fallback=None, loader=str, data=None):
self.id = id
self.href = self.path = urlnormalize(href)
self.media_type = media_type
@ -233,26 +310,32 @@ class Manifest(object):
self.spine_position = None
self.linear = True
self._loader = loader
self._data = None
self._data = data
def __repr__(self):
return 'Item(id=%r, href=%r, media_type=%r)' \
% (self.id, self.href, self.media_type)
def _force_xhtml(self, data):
repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0))
data = self.ENTITY_RE.sub(repl, data)
data = etree.fromstring(data, parser=XML_PARSER)
if namespace(data.tag) != XHTML_NS:
data.attrib['xmlns'] = XHTML_NS
data = etree.tostring(data)
data = etree.fromstring(data, parser=XML_PARSER)
return data
def data():
def fget(self):
if self._data:
if self._data is not None:
return self._data
data = self._loader(self.href)
if self.media_type == XHTML_MIME:
data = etree.fromstring(data, parser=XML_PARSER)
if namespace(data.tag) != XHTML_NS:
data.attrib['xmlns'] = XHTML_NS
data = etree.tostring(data)
data = etree.fromstring(data, parser=XML_PARSER)
elif self.media_type.startswith('application/') \
and self.media_type.endswith('+xml'):
if self.media_type in OEB_DOCS:
data = self._force_xhtml(data)
elif self.media_type[-4:] in ('+xml', '/xml'):
data = etree.fromstring(data, parser=XML_PARSER)
self._data = data
return data
def fset(self, value):
self._data = value
@ -260,13 +343,49 @@ class Manifest(object):
self._data = None
return property(fget, fset, fdel)
data = data()
def __str__(self):
data = self.data
if isinstance(data, etree._Element):
return xml2str(data)
return str(data)
def __eq__(self, other):
return id(self) == id(other)
def __ne__(self, other):
return not self.__eq__(other)
def __cmp__(self, other):
result = cmp(self.spine_position, other.spine_position)
if result != 0:
return result
return cmp(self.id, other.id)
smatch = self.NUM_RE.search(self.href)
sref = smatch.group(1) if smatch else self.href
snum = float(smatch.group(2)) if smatch else 0.0
skey = (sref, snum, self.id)
omatch = self.NUM_RE.search(other.href)
oref = omatch.group(1) if omatch else other.href
onum = float(omatch.group(2)) if omatch else 0.0
okey = (oref, onum, other.id)
return cmp(skey, okey)
def relhref(self, href):
    """Return *href* rewritten relative to the directory of this item.

    When this item's own href contains no '/', *href* is returned unchanged.
    Otherwise the leading path components shared with this item's directory
    are dropped and one '..' is prepended for each remaining component of
    that directory.  A fragment on *href* is preserved.
    """
    if '/' not in self.href:
        return href
    base = os.path.dirname(self.href).split('/')
    path, frag = urldefrag(href)
    target = path.split('/')
    # Count the leading components common to both paths.
    shared = 0
    limit = min(len(base), len(target))
    while shared < limit and base[shared] == target[shared]:
        shared += 1
    parts = (['..'] * (len(base) - shared)) + target[shared:]
    result = '/'.join(parts)
    if frag:
        result = '#'.join((result, frag))
    return result
def abshref(self, href):
if '/' not in self.href:
return href
@ -277,42 +396,60 @@ class Manifest(object):
def __init__(self, oeb):
self.oeb = oeb
self.items = {}
self.ids = {}
self.hrefs = {}
def add(self, id, href, media_type, fallback=None):
def add(self, id, href, media_type, fallback=None, loader=None, data=None):
loader = loader or self.oeb.container.read
item = self.Item(
id, href, media_type, fallback, self.oeb.container.read)
self.items[item.id] = item
id, href, media_type, fallback, loader, data)
self.ids[item.id] = item
self.hrefs[item.href] = item
return item
def remove(self, id):
href = self.items[id].href
del self.items[id]
del self.hrefs[href]
def remove(self, item):
if item in self.ids:
item = self.ids[item]
del self.ids[item.id]
del self.hrefs[item.href]
if item in self.oeb.spine:
self.oeb.spine.remove(item)
def generate(self, id, href):
    """Return an ``(id, href)`` pair that is unused in this manifest.

    *href* is normalized first.  If *id* is already a key of ``self.ids``, a
    counter starting at 1 is appended to it until it is not; if the href is
    already a key of ``self.hrefs``, a counter starting at 1 is inserted
    before its extension until it is not.
    """
    href = urlnormalize(href)
    if id in self.ids:
        stem = id
        for number in count(1):
            id = stem + str(number)
            if id not in self.ids:
                break
    if href in self.hrefs:
        stem, ext = os.path.splitext(href)
        for number in count(1):
            href = stem + str(number) + ext
            if href not in self.hrefs:
                break
    return id, href
def __iter__(self):
for id in self.items:
for id in self.ids:
yield id
def __getitem__(self, id):
return self.items[id]
return self.ids[id]
def values(self):
for item in self.items.values():
for item in self.ids.values():
yield item
def items(self):
for id, item in self.refs.items():
yield id, items
for id, item in self.ids.items():
yield id, item
def __contains__(self, key):
return key in self.items
return key in self.ids
def to_opf1(self, parent=None):
elem = element(parent, 'manifest')
for item in self.items.values():
for item in self.ids.values():
media_type = item.media_type
if media_type == XHTML_MIME:
media_type = OEB_DOC_MIME
@ -327,7 +464,7 @@ class Manifest(object):
def to_opf2(self, parent=None):
elem = element(parent, OPF('manifest'))
for item in self.items.values():
for item in self.ids.values():
attrib = {'id': item.id, 'href': item.href,
'media-type': item.media_type}
if item.fallback:
@ -341,18 +478,35 @@ class Spine(object):
self.oeb = oeb
self.items = []
def add(self, item, linear):
def _linear(self, linear):
if isinstance(linear, StringTypes):
linear = linear.lower()
if linear is None or linear in ('yes', 'true'):
linear = True
elif linear in ('no', 'false'):
linear = False
item.linear = linear
return linear
def add(self, item, linear=None):
item.linear = self._linear(linear)
item.spine_position = len(self.items)
self.items.append(item)
return item
def insert(self, index, item, linear):
    """Insert *item* into the spine at *index* and return it.

    The item's ``linear`` flag is normalized through ``_linear``, and the
    ``spine_position`` of the item and of every item after it is updated.
    """
    item.linear = self._linear(linear)
    item.spine_position = index
    self.items.insert(index, item)
    # Renumber from the insertion point to the end of the spine.
    position = index
    total = len(self.items)
    while position < total:
        self.items[position].spine_position = position
        position += 1
    return item
def remove(self, item):
    """Remove *item* from the spine and renumber the items that followed it.

    The item is located through its ``spine_position`` attribute.
    """
    index = item.spine_position
    self.items.pop(index)
    position = index
    total = len(self.items)
    while position < total:
        self.items[position].spine_position = position
        position += 1
def __iter__(self):
for item in self.items:
yield item
@ -385,46 +539,81 @@ class Spine(object):
class Guide(object):
class Reference(object):
_TYPES_TITLES = [('cover', 'Cover'), ('title-page', 'Title Page'),
('toc', 'Table of Contents'), ('index', 'Index'),
('glossary', 'Glossary'), ('acknowledgements', 'Acknowledgements'),
('bibliography', 'Bibliography'), ('colophon', 'Colophon'),
('copyright-page', 'Copyright'), ('dedication', 'Dedication'),
('epigraph', 'Epigraph'), ('foreword', 'Foreword'),
('loi', 'List of Illustrations'), ('lot', 'List of Tables'),
('notes', 'Notes'), ('preface', 'Preface'),
('text', 'Main Text')]
TYPES = set(t for t, _ in _TYPES_TITLES)
TITLES = dict(_TYPES_TITLES)
ORDER = dict((t, i) for (t, _), i in izip(_TYPES_TITLES, count(0)))
def __init__(self, type, title, href):
if type.lower() in self.TYPES:
type = type.lower()
elif type not in self.TYPES and \
not type.startswith('other.'):
type = 'other.' + type
if not title:
title = self.TITLES.get(type, None)
self.type = type
self.title = title
self.href = urlnormalize(href)
def __repr__(self):
return 'Reference(type=%r, title=%r, href=%r)' \
% (self.type, self.title, self.href)
def _order():
def fget(self):
return self.ORDER.get(self.type, self.type)
return property(fget=fget)
_order = _order()
def __cmp__(self, other):
if not isinstance(other, Guide.Reference):
return NotImplemented
return cmp(self._order, other._order)
def __init__(self, oeb):
self.oeb = oeb
self.refs = {}
def add(self, type, title, href):
ref = self.Reference(type, title, href)
self.refs[type] = ref
return ref
def by_type(self, type):
return self.ref_types[type]
def iterkeys(self):
for type in self.refs:
yield type
__iter__ = iterkeys
def values(self):
for ref in self.refs.values():
yield ref
values = list(self.refs.values())
values.sort()
return values
def items(self):
for type, ref in self.refs.items():
yield type, ref
def __getitem__(self, index):
return self.refs[index]
def __getitem__(self, key):
return self.refs[key]
def __delitem__(self, key):
del self.refs[key]
def __contains__(self, key):
return key in self.refs
def __len__(self):
return len(self.refs)
def to_opf1(self, parent=None):
elem = element(parent, 'guide')
for ref in self.refs.values():
@ -456,6 +645,12 @@ class TOC(object):
node = TOC(title, href, klass, id)
self.nodes.append(node)
return node
# Yield every node below this one, depth first: each direct child is
# yielded before its own descendants.  This node itself is not yielded.
def iterdescendants(self):
for node in self.nodes:
yield node
for child in node.iterdescendants():
yield child
def __iter__(self):
for node in self.nodes:
@ -463,6 +658,15 @@ class TOC(object):
def __getitem__(self, index):
return self.nodes[index]
def autolayer(self):
    """Nest consecutive entries that point into the same file.

    Walking the direct children in order, every node whose href (ignoring
    the fragment) matches that of the most recent node kept at this level is
    moved from ``self.nodes`` into that node's ``nodes`` list.
    """
    anchor = None
    for entry in tuple(self.nodes):
        same_file = anchor and \
            urldefrag(anchor.href)[0] == urldefrag(entry.href)[0]
        if not same_file:
            anchor = entry
            continue
        self.nodes.remove(entry)
        anchor.nodes.append(entry)
def depth(self, level=0):
if self.nodes:
@ -496,23 +700,33 @@ class TOC(object):
class OEBBook(object):
def __init__(self, opfpath, container=None, logger=FauxLogger()):
if not container:
def __init__(self, opfpath=None, container=None, logger=FauxLogger()):
if opfpath and not container:
container = DirContainer(os.path.dirname(opfpath))
opfpath = os.path.basename(opfpath)
self.container = container
self.logger = logger
opf = self._read_opf(opfpath)
self._all_from_opf(opf)
if opfpath or container:
opf = self._read_opf(opfpath)
self._all_from_opf(opf)
def _convert_opf1(self, opf):
# Seriously, seriously wrong
if namespace(opf.tag) == OPF1_NS:
opf.tag = barename(opf.tag)
for elem in opf.iterdescendants():
if isinstance(elem.tag, basestring) \
and namespace(elem.tag) == OPF1_NS:
elem.tag = barename(elem.tag)
attrib = dict(opf.attrib)
attrib['version'] = '2.0'
nroot = etree.Element(OPF('package'),
nsmap={None: OPF2_NS}, version="2.0", **dict(opf.attrib))
nsmap={None: OPF2_NS}, attrib=attrib)
metadata = etree.SubElement(nroot, OPF('metadata'),
nsmap={'opf': OPF2_NS, 'dc': DC11_NS,
'xsi': XSI_NS, 'dcterms': DCTERMS_NS})
for prefix in ('d11', 'd10', 'd09'):
elements = xpath(opf, 'metadata/dc-metadata/%s:*' % prefix)
elements = xpath(opf, 'metadata//%s:*' % prefix)
if elements: break
for element in elements:
if not element.text: continue
@ -524,7 +738,7 @@ class OEBBook(object):
element.attrib[nsname] = element.attrib[name]
del element.attrib[name]
metadata.append(element)
for element in opf.xpath('metadata/x-metadata/meta'):
for element in opf.xpath('metadata//meta'):
metadata.append(element)
for item in opf.xpath('manifest/item'):
media_type = item.attrib['media-type'].lower()
@ -541,30 +755,56 @@ class OEBBook(object):
def _read_opf(self, opfpath):
opf = self.container.read_xml(opfpath)
version = float(opf.get('version', 1.0))
if version < 2.0:
ns = namespace(opf.tag)
if ns not in ('', OPF1_NS, OPF2_NS):
raise OEBError('Invalid namespace %r for OPF document' % ns)
if ns != OPF2_NS or version < 2.0:
opf = self._convert_opf1(opf)
return opf
def _metadata_from_opf(self, opf):
uid = opf.attrib['unique-identifier']
self.metadata = metadata = Metadata(self)
for elem in xpath(opf, '/o2:package/o2:metadata/*'):
if elem.text or elem.attrib:
uid = opf.get('unique-identifier', 'calibre-uuid')
self.uid = None
self.metadata = metadata = Metadata(self)
ignored = (OPF('dc-metadata'), OPF('x-metadata'))
for elem in xpath(opf, '/o2:package/o2:metadata//*'):
if elem.tag not in ignored and (elem.text or elem.attrib):
metadata.add(elem.tag, elem.text, elem.attrib)
haveuuid = haveid = False
for ident in metadata.identifier:
if unicode(ident).startswith('urn:uuid:'):
haveuuid = True
if 'id' in ident.attrib:
haveid = True
if not haveuuid and haveid:
bookid = "urn:uuid:%s" % str(uuid.uuid4())
metadata.add('identifier', bookid, id='calibre-uuid')
for item in metadata.identifier:
if item.id == uid:
self.uid = item
break
else:
self.logger.log_warn(u'Unique-identifier %r not found.' % uid)
self.uid = metadata.identifier[0]
self.logger.warn(u'Unique-identifier %r not found.' % uid)
for ident in metadata.identifier:
if 'id' in ident.attrib:
self.uid = metadata.identifier[0]
break
if not metadata.language:
self.logger.warn(u'Language not specified.')
metadata.add('language', 'en')
if not metadata.creator:
self.logger.warn(u'Creator not specified.')
metadata.add('creator', 'Unknown')
if not metadata.title:
self.logger.warn(u'Title not specified.')
metadata.add('title', 'Unknown')
def _manifest_from_opf(self, opf):
self.manifest = manifest = Manifest(self)
for elem in xpath(opf, '/o2:package/o2:manifest/o2:item'):
href = elem.get('href')
if not self.container.exists(href):
self.logger.log_warn(u'Manifest item %r not found.' % href)
self.logger.warn(u'Manifest item %r not found.' % href)
continue
manifest.add(elem.get('id'), href, elem.get('media-type'),
elem.get('fallback'))
@ -574,7 +814,7 @@ class OEBBook(object):
for elem in xpath(opf, '/o2:package/o2:spine/o2:itemref'):
idref = elem.get('idref')
if idref not in self.manifest:
self.logger.log_warn(u'Spine item %r not found.' % idref)
self.logger.warn(u'Spine item %r not found.' % idref)
continue
item = self.manifest[idref]
spine.add(item, elem.get('linear'))
@ -593,7 +833,7 @@ class OEBBook(object):
href = elem.get('href')
path, frag = urldefrag(href)
if path not in self.manifest.hrefs:
self.logger.log_warn(u'Guide reference %r not found' % href)
self.logger.warn(u'Guide reference %r not found' % href)
continue
guide.add(elem.get('type'), elem.get('title'), href)
@ -695,6 +935,32 @@ class OEBBook(object):
if self._toc_from_tour(opf): return
if self._toc_from_html(opf): return
self._toc_from_spine(opf)
# Make sure the metadata names a cover image when one can be found.
#
# Candidates are tried in this order, and the first that applies wins:
#   1. the manifest item named by an existing 'cover' metadata entry;
#   2. the target of a guide reference of type MS_COVER_TYPE;
#   3. an <img> in the first spine document (its `src`);
#   4. an <object> in the first spine document (its `data`);
#   5. an inline <svg> in the first spine document, which is deep-copied
#      into a new SVG manifest item.
# If a cover was found and the metadata has no 'cover' entry yet, one is
# added holding the cover item's id.
#
# NOTE(review): `self.spine[0]` looks like it assumes a non-empty spine, and
# the `manifest.hrefs[...]` lookups raise KeyError for targets missing from
# the manifest -- confirm that callers guarantee both.
def _ensure_cover_image(self):
cover = None
spine0 = self.spine[0]
html = spine0.data
if self.metadata.cover:
id = str(self.metadata.cover[0])
cover = self.manifest.ids[id]
elif MS_COVER_TYPE in self.guide:
href = self.guide[MS_COVER_TYPE].href
cover = self.manifest.hrefs[href]
elif xpath(html, '//h:img[position()=1]'):
# Each query is evaluated twice: once for the test, once for the result.
img = xpath(html, '//h:img[position()=1]')[0]
href = spine0.abshref(img.get('src'))
cover = self.manifest.hrefs[href]
elif xpath(html, '//h:object[position()=1]'):
object = xpath(html, '//h:object[position()=1]')[0]
href = spine0.abshref(object.get('data'))
cover = self.manifest.hrefs[href]
elif xpath(html, '//svg:svg[position()=1]'):
# Copy the element so the new manifest item does not share the document's
# node.
svg = copy.deepcopy(xpath(html, '//svg:svg[position()=1]')[0])
href = os.path.splitext(spine0.href)[0] + '.svg'
id, href = self.manifest.generate(spine0.id, href)
cover = self.manifest.add(id, href, SVG_MIME, data=svg)
if cover and not self.metadata.cover:
self.metadata.add('cover', cover.id)
def _all_from_opf(self, opf):
self._metadata_from_opf(opf)
@ -702,6 +968,7 @@ class OEBBook(object):
self._spine_from_opf(opf)
self._guide_from_opf(opf)
self._toc_from_opf(opf)
self._ensure_cover_image()
def to_opf1(self):
package = etree.Element('package',

View File

@ -35,7 +35,8 @@
*
* ***** END LICENSE BLOCK ***** */
@namespace url(http://www.w3.org/1999/xhtml); /* set default namespace to HTML */
@namespace url(http://www.w3.org/1999/xhtml);
@namespace svg url(http://www.w3.org/2000/svg);
/* blocks */
@ -45,7 +46,6 @@ html, div, map, dt, isindex, form {
body {
display: block;
margin: 8px;
}
p, dl, multicol {
@ -59,7 +59,7 @@ dd {
blockquote {
display: block;
margin: 1em 40px;
margin: 1em;
}
address {
@ -74,7 +74,7 @@ center {
blockquote[type=cite] {
display: block;
margin: 1em 0px;
margin: 1em 0em;
border-color: blue;
border-width: thin;
}
@ -234,14 +234,6 @@ th {
/* inlines */
q:before {
content: open-quote;
}
q:after {
content: close-quote;
}
b, strong {
font-weight: bolder;
}
@ -392,22 +384,6 @@ spacer {
float: none ! important;
}
/* focusable content: anything w/ tabindex >=0 is focusable */
abbr:focus, acronym:focus, address:focus, applet:focus, b:focus,
base:focus, big:focus, blockquote:focus, br:focus, canvas:focus, caption:focus,
center:focus, cite:focus, code:focus, col:focus, colgroup:focus, dd:focus,
del:focus, dfn:focus, dir:focus, div:focus, dl:focus, dt:focus, em:focus,
fieldset:focus, font:focus, form:focus, h1:focus, h2:focus, h3:focus, h4:focus,
h5:focus, h6:focus, hr:focus, i:focus, img:focus, ins:focus,
kbd:focus, label:focus, legend:focus, li:focus, link:focus, menu:focus,
object:focus, ol:focus, p:focus, pre:focus, q:focus, s:focus, samp:focus,
small:focus, span:focus, strike:focus, strong:focus, sub:focus, sup:focus,
table:focus, tbody:focus, td:focus, tfoot:focus, th:focus, thead:focus,
tr:focus, tt:focus, u:focus, ul:focus, var:focus {
/* Don't specify the outline-color, we should always use initial value. */
outline: 1px dotted;
}
/* hidden elements */
area, base, basefont, head, meta, script, style, title,
noembed, param, link {
@ -424,3 +400,8 @@ br {
display: block;
}
/* Images, embedded object, and SVG size defaults */
img, object, svg|svg {
width: auto;
height: auto;
}

View File

@ -0,0 +1,75 @@
'''
Device profiles.
'''
__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
from itertools import izip
# Ordered font-size scale, smallest step first.  Each entry is
# (CSS absolute-size keyword, numeric size); Profile pairs the entries
# positionally with a per-device list of sizes and builds lookup tables
# keyed by the keyword and by the number.  A None in either position means
# the step has no keyword / no number and is left out of that table.
FONT_SIZES = [('xx-small', 1),
('x-small', None),
('small', 2),
('medium', 3),
('large', 4),
('x-large', 5),
('xx-large', 6),
(None, 7)]
class Profile(object):
    """Screen geometry and font-size scale of one device.

    ``width`` and ``height`` are given in pixels and stored converted to
    points (72 per inch) using ``dpi``.  ``fsizes`` is matched positionally
    against ``FONT_SIZES``; the resulting sizes are exposed as

    * ``fsizes`` -- list of ``(name, num, size)`` tuples,
    * ``fnames`` -- size by keyword, for entries that have a keyword,
    * ``fnums``  -- size by number, for entries that have a number.
    """

    def __init__(self, width, height, dpi, fbase, fsizes):
        self.width = (float(width) / dpi) * 72.
        self.height = (float(height) / dpi) * 72.
        self.dpi = float(dpi)
        self.fbase = float(fbase)
        self.fsizes = [(name, num, float(size))
                       for (name, num), size in izip(FONT_SIZES, fsizes)]
        by_name = {}
        by_num = {}
        for name, num, size in self.fsizes:
            if name:
                by_name[name] = size
            if num:
                by_num[num] = size
        self.fnames = by_name
        self.fnums = by_num
# Known device profiles, keyed by device name.  `width`/`height` are in
# pixels (Profile converts them to points using `dpi`), `fbase` is the
# device's base font size and `fsizes` supplies one size per FONT_SIZES
# entry, smallest first.
PROFILES = {
'PRS505':
Profile(width=584, height=754, dpi=168.451, fbase=12,
fsizes=[7.5, 9, 10, 12, 15.5, 20, 22, 24]),
'MSReader':
Profile(width=480, height=652, dpi=96, fbase=13,
fsizes=[10, 11, 13, 16, 18, 20, 22, 26]),
# Not really, but let's pretend
'Mobipocket':
Profile(width=600, height=800, dpi=96, fbase=18,
fsizes=[14, 14, 16, 18, 20, 22, 24, 26]),
# No clue on usable screen size; DPI should be good
'HanlinV3':
Profile(width=584, height=754, dpi=168.451, fbase=16,
fsizes=[12, 12, 14, 16, 18, 20, 22, 24]),
'CybookG3':
Profile(width=600, height=800, dpi=168.451, fbase=16,
fsizes=[12, 12, 14, 16, 18, 20, 22, 24]),
'Kindle':
Profile(width=525, height=640, dpi=168.451, fbase=16,
fsizes=[12, 12, 14, 16, 18, 20, 22, 24]),
# NOTE(review): this list has nine sizes but FONT_SIZES has eight entries;
# Profile zips the two, so the trailing 24 is silently ignored -- confirm
# which eight values are intended.
'Browser':
Profile(width=800, height=600, dpi=100.0, fbase=12,
fsizes=[5, 7, 9, 12, 13.5, 17, 20, 22, 24])
}
class Context(object):
    """Pair of source and destination profiles for a conversion.

    Either argument may be a key of ``PROFILES`` (it is then replaced by
    the corresponding profile) or any other object, which is stored
    unchanged.
    """
    PROFILES = PROFILES

    def __init__(self, source, dest):
        resolved_source = PROFILES[source] if source in PROFILES else source
        resolved_dest = PROFILES[dest] if dest in PROFILES else dest
        self.source = resolved_source
        self.dest = resolved_dest

View File

@ -16,16 +16,20 @@ import itertools
import types
import re
import copy
from itertools import izip
import cssutils
from cssutils.css import CSSStyleRule, CSSPageRule, CSSStyleDeclaration, \
CSSValueList, cssproperties
from lxml import etree
from calibre.ebooks.lit.oeb import XHTML_NS, CSS_MIME, OEB_STYLES
from calibre.ebooks.lit.oeb import barename, urlnormalize
from lxml.cssselect import css_to_xpath, ExpressionError
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, CSS_MIME, OEB_STYLES
from calibre.ebooks.oeb.base import XPNSMAP, xpath, barename, urlnormalize
from calibre.ebooks.oeb.profile import PROFILES
from calibre.resources import html_css
XHTML_CSS_NAMESPACE = '@namespace "%s";\n' % XHTML_NS
HTML_CSS_STYLESHEET = cssutils.parseString(html_css)
XHTML_CSS_NAMESPACE = "@namespace url(http://www.w3.org/1999/xhtml);\n"
HTML_CSS_STYLESHEET.namespaces['h'] = XHTML_NS
INHERITED = set(['azimuth', 'border-collapse', 'border-spacing',
'caption-side', 'color', 'cursor', 'direction', 'elevation',
@ -72,7 +76,7 @@ DEFAULTS = {'azimuth': 'center', 'background-attachment': 'scroll',
'50', 'right': 'auto', 'speak': 'normal', 'speak-header': 'once',
'speak-numeral': 'continuous', 'speak-punctuation': 'none',
'speech-rate': 'medium', 'stress': '50', 'table-layout': 'auto',
'text-align': 'left', 'text-decoration': 'none', 'text-indent':
'text-align': 'auto', 'text-decoration': 'none', 'text-indent':
0, 'text-transform': 'none', 'top': 'auto', 'unicode-bidi':
'normal', 'vertical-align': 'baseline', 'visibility': 'visible',
'voice-family': 'default', 'volume': 'medium', 'white-space':
@ -82,42 +86,30 @@ DEFAULTS = {'azimuth': 'center', 'background-attachment': 'scroll',
FONT_SIZE_NAMES = set(['xx-small', 'x-small', 'small', 'medium', 'large',
'x-large', 'xx-large'])
FONT_SIZE_LIST = [('xx-small', 1, 6.),
('x-small', None, 7.),
('small', 2, 8.),
('medium', 3, 9.),
('large', 4, 11.),
('x-large', 5, 13.),
('xx-large', 6, 15.),
(None, 7, 17.)]
FONT_SIZE_BY_NAME = {}
FONT_SIZE_BY_NUM = {}
for name, num, size in FONT_SIZE_LIST:
FONT_SIZE_BY_NAME[name] = size
FONT_SIZE_BY_NUM[num] = size
XPNSMAP = {'h': XHTML_NS,}
def xpath(elem, expr):
return elem.xpath(expr, namespaces=XPNSMAP)
class Page(object):
def __init__(self, width, height, dpi):
self.width = float(width)
self.height = float(height)
self.dpi = float(dpi)
class Profiles(object):
PRS500 = Page(584, 754, 168.451)
PRS505 = PRS500
class CSSSelector(etree.XPath):
    """XPath evaluator compiled from a CSS selector string.

    The selector is translated with ``css_to_xpath``; before translation
    the spaces around the ``>``, ``~`` and ``+`` combinators are removed,
    and afterwards prefixed ``name() = 'prefix:...`` tests are rewritten to
    ``local-name() = '...``.  The whitespace-normalised selector text is
    kept on ``self.css``.
    """
    MIN_SPACE_RE = re.compile(r' *([>~+]) *')
    LOCAL_NAME_RE = re.compile(r"(?<!local-)name[(][)] *= *'[^:]+:")

    def __init__(self, css, namespaces=XPNSMAP):
        compact = self.MIN_SPACE_RE.sub(r'\1', css)
        expression = css_to_xpath(compact)
        expression = self.LOCAL_NAME_RE.sub(r"local-name() = '", expression)
        etree.XPath.__init__(self, expression, namespaces=namespaces)
        self.css = compact

    def __repr__(self):
        ident = hex(abs(id(self)))[2:]
        return '<%s %s for %r>' % (self.__class__.__name__, ident, self.css)
class Stylizer(object):
STYLESHEETS = {}
def __init__(self, tree, path, oeb, page=Profiles.PRS505):
self.page = page
def __init__(self, tree, path, oeb, profile=PROFILES['PRS505']):
self.profile = profile
base = os.path.dirname(path)
basename = os.path.basename(path)
cssname = os.path.splitext(basename)[0] + '.css'
@ -126,12 +118,13 @@ class Stylizer(object):
parser = cssutils.CSSParser()
parser.setFetcher(lambda path: ('utf-8', oeb.container.read(path)))
for elem in head:
tag = barename(elem.tag)
if tag == 'style':
text = ''.join(elem.text)
if elem.tag == XHTML('style') and elem.text \
and elem.get('type', CSS_MIME) in OEB_STYLES:
text = XHTML_CSS_NAMESPACE + elem.text
stylesheet = parser.parseString(text, href=cssname)
stylesheet.namespaces['h'] = XHTML_NS
stylesheets.append(stylesheet)
elif tag == 'link' \
elif elem.tag == XHTML('link') and elem.get('href') \
and elem.get('rel', 'stylesheet') == 'stylesheet' \
and elem.get('type', CSS_MIME) in OEB_STYLES:
href = urlnormalize(elem.attrib['href'])
@ -143,11 +136,13 @@ class Stylizer(object):
data = XHTML_CSS_NAMESPACE
data += oeb.manifest.hrefs[path].data
stylesheet = parser.parseString(data, href=path)
stylesheet.namespaces['h'] = XHTML_NS
self.STYLESHEETS[path] = stylesheet
stylesheets.append(stylesheet)
rules = []
index = 0
self.stylesheets = set()
self.page_rule = {}
for stylesheet in stylesheets:
href = stylesheet.href
self.stylesheets.add(href)
@ -157,7 +152,16 @@ class Stylizer(object):
rules.sort()
self.rules = rules
self._styles = {}
for _, _, cssdict, text, _ in rules:
try:
selector = CSSSelector(text)
except ExpressionError, e:
continue
for elem in selector(tree):
self.style(elem)._update_cssdict(cssdict)
for elem in xpath(tree, '//h:*[@style]'):
self.style(elem)._apply_style_attr()
def flatten_rule(self, rule, href, index):
results = []
if isinstance(rule, CSSStyleRule):
@ -169,9 +173,9 @@ class Stylizer(object):
results.append((specificity, selector, style, text, href))
elif isinstance(rule, CSSPageRule):
style = self.flatten_style(rule.style)
results.append(((0, 0, 0, 0), [], style, '@page', href))
self.page_rule.update(style)
return results
def flatten_style(self, cssstyle):
style = {}
for prop in cssstyle:
@ -186,7 +190,7 @@ class Stylizer(object):
size = style['font-size']
if size == 'normal': size = 'medium'
if size in FONT_SIZE_NAMES:
style['font-size'] = "%dpt" % FONT_SIZE_BY_NAME[size]
style['font-size'] = "%dpt" % self.profile.fnames[size]
return style
def _normalize_edge(self, cssvalue, name):
@ -233,9 +237,10 @@ class Stylizer(object):
return style
def style(self, element):
try: return self._styles[element]
except: pass
return Style(element, self)
try:
return self._styles[element]
except KeyError:
return Style(element, self)
def stylesheet(self, name, font_scale=None):
rules = []
@ -250,86 +255,43 @@ class Stylizer(object):
rules.append('%s {\n %s;\n}' % (selector, style))
return '\n'.join(rules)
class Style(object):
def __init__(self, element, stylizer):
self._element = element
self._page = stylizer.page
self._profile = stylizer.profile
self._stylizer = stylizer
self._style = self._assemble_style(element, stylizer)
self._style = {}
self._fontSize = None
self._width = None
self._height = None
self._lineHeight = None
stylizer._styles[element] = self
def _update_cssdict(self, cssdict):
self._style.update(cssdict)
def _assemble_style(self, element, stylizer):
result = {}
rules = stylizer.rules
for _, selector, style, _, _ in rules:
if self._selects_element(element, selector):
result.update(style)
try:
style = CSSStyleDeclaration(element.attrib['style'])
result.update(stylizer.flatten_style(style))
except KeyError:
pass
return result
def _selects_element(self, element, selector):
def _selects_element(element, items, index):
if index == -1:
return True
item = items[index]
if item.type == 'universal':
pass
elif item.type == 'type-selector':
name1 = ("{%s}%s" % item.value).lower()
name2 = element.tag.lower()
if name1 != name2:
return False
elif item.type == 'id':
name1 = item.value[1:]
name2 = element.get('id', '')
if name1 != name2:
return False
elif item.type == 'class':
name = item.value[1:].lower()
classes = element.get('class', '').lower().split()
if name not in classes:
return False
elif item.type == 'child':
parent = element.getparent()
if parent is None:
return False
element = parent
elif item.type == 'descendant':
element = element.getparent()
while element is not None:
if _selects_element(element, items, index - 1):
return True
element = element.getparent()
return False
elif item.type == 'pseudo-class':
if item.value == ':first-child':
e = element.getprevious()
if e is not None:
return False
else:
return False
elif item.type == 'pseudo-element':
return False
else:
return False
return _selects_element(element, items, index - 1)
return _selects_element(element, selector, len(selector) - 1)
def _apply_style_attr(self):
attrib = self._element.attrib
if 'style' in attrib:
style = CSSStyleDeclaration(attrib['style'])
self._style.update(self._stylizer.flatten_style(style))
def _has_parent(self):
parent = self._element.getparent()
return (parent is not None) \
and (parent in self._stylizer._styles)
return (self._element.getparent() is not None)
def _get_parent(self):
elem = self._element.getparent()
if elem is None:
return None
return self._stylizer.style(elem)
# Look up a CSS property by its CSS name, e.g. style['font-size'].
#
# The name is first converted to its DOM-style attribute name with
# cssproperties._toDOMname().  If this object has an attribute of that name
# (such as the fontSize / width / height properties), that attribute's value
# is returned.  Otherwise the value found by _get() is passed through
# _unit_convert() and returned.
def __getitem__(self, name):
domname = cssproperties._toDOMname(name)
if hasattr(self, domname):
return getattr(self, domname)
return self._unit_convert(self._get(name))
def _get(self, name):
result = None
if name in self._style:
@ -337,8 +299,8 @@ class Style(object):
if (result == 'inherit'
or (result is None and name in INHERITED
and self._has_parent())):
styles = self._stylizer._styles
result = styles[self._element.getparent()]._get(name)
stylizer = self._stylizer
result = stylizer.style(self._element.getparent())._get(name)
if result is None:
result = DEFAULTS[name]
return result
@ -359,9 +321,9 @@ class Style(object):
unit = m.group(2)
if unit == '%':
base = base or self.width
result = (value/100.0) * base
result = (value / 100.0) * base
elif unit == 'px':
result = value * 72.0 / self._page.dpi
result = value * 72.0 / self._profile.dpi
elif unit == 'in':
result = value * 72.0
elif unit == 'pt':
@ -379,22 +341,22 @@ class Style(object):
@property
def fontSize(self):
def normalize_fontsize(value, base=None):
def normalize_fontsize(value, base):
result = None
factor = None
if value == 'inherit':
value = 'medium'
value = base
if value in FONT_SIZE_NAMES:
result = FONT_SIZE_BY_NAME[value]
result = self._profile.fnames[value]
elif value == 'smaller':
factor = 1.0/1.2
for _, _, size in FONT_SIZE_LIST:
for _, _, size in self._profile.fsizes:
if base <= size: break
factor = None
result = size
elif value == 'larger':
factor = 1.2
for _, _, size in reversed(FONT_SIZE_LIST):
for _, _, size in reversed(self._profile.fsizes):
if base >= size: break
factor = None
result = size
@ -405,40 +367,108 @@ class Style(object):
if factor:
result = factor * base
return result
result = None
if self._has_parent():
styles = self._stylizer._styles
base = styles[self._element.getparent()].fontSize
else:
base = normalize_fontsize(DEFAULTS['font-size'])
if 'font-size' in self._style:
size = self._style['font-size']
result = normalize_fontsize(size, base)
else:
result = base
self.__dict__['fontSize'] = result
return result
if self._fontSize is None:
result = None
parent = self._get_parent()
if parent is not None:
base = parent.fontSize
else:
base = self._profile.fbase
if 'font-size' in self._style:
size = self._style['font-size']
result = normalize_fontsize(size, base)
else:
result = base
self._fontSize = result
return self._fontSize
@property
def width(self):
result = None
base = None
if self._has_parent():
styles = self._stylizer._styles
base = styles[self._element.getparent()].width
else:
base = self._page.width
if 'width' in self._style:
width = self._style['width']
if width == 'auto':
if self._width is None:
width = None
base = None
parent = self._get_parent()
if parent is not None:
base = parent.width
else:
base = self._profile.width
if 'width' is self._element.attrib:
width = self._element.attrib['width']
elif 'width' in self._style:
width = self._style['width']
if not width or width == 'auto':
result = base
else:
result = self._unit_convert(width, base=base)
else:
result = base
self.__dict__['width'] = result
return result
self._width = result
return self._width
@property
def height(self):
if self._height is None:
height = None
base = None
parent = self._get_parent()
if parent is not None:
base = parent.height
else:
base = self._profile.height
if 'height' is self._element.attrib:
height = self._element.attrib['height']
elif 'height' in self._style:
height = self._style['height']
if not height or height == 'auto':
result = base
else:
result = self._unit_convert(height, base=base)
self._height = result
return self._height
@property
def lineHeight(self):
if self._lineHeight is None:
result = None
parent = self._getparent()
if 'line-height' in self._style:
lineh = self._style['line-height']
try:
float(lineh)
except ValueError:
result = self._unit_convert(lineh, base=self.fontSize)
else:
result = float(lineh) * self.fontSize
elif parent is not None:
# TODO: proper inheritance
result = parent.lineHeight
else:
result = 1.2 * self.fontSize
self._lineHeight = result
return self._lineHeight
@property
def marginTop(self):
return self._unit_convert(
self._get('margin-top'), base=self.height)
@property
def marginBottom(self):
return self._unit_convert(
self._get('margin-bottom'), base=self.height)
@property
def paddingTop(self):
return self._unit_convert(
self._get('padding-top'), base=self.height)
@property
def paddingBottom(self):
return self._unit_convert(
self._get('padding-bottom'), base=self.height)
def __str__(self):
items = self._style.items()
items.sort()
return '; '.join("%s: %s" % (key, val) for key, val in items)
def cssdict(self):
return dict(self._style)

View File

@ -0,0 +1,270 @@
'''
CSS flattening transform.
'''
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
import sys
import os
import re
import operator
import math
from itertools import chain
from collections import defaultdict
from lxml import etree
from calibre.ebooks.oeb.base import XHTML, XHTML_NS
from calibre.ebooks.oeb.base import CSS_MIME, OEB_STYLES
from calibre.ebooks.oeb.base import namespace, barename
from calibre.ebooks.oeb.base import OEBBook
from calibre.ebooks.oeb.stylizer import Stylizer
# Runs of whitespace; each run is replaced by a single space before text
# length is counted in CSSFlattener.baseline_node().
COLLAPSE = re.compile(r'[ \t\r\n\v]+')
# Trailing run of digits and hyphens; stripped from an element's first class
# name to obtain the stem used for generated class names.
STRIPNUM = re.compile(r'[-0-9]+$')
class KeyMapper(object):
    """Map source font sizes onto a fixed key of destination sizes.

    Each size is reduced to a signed "relation" to its base size (see
    ``relate``).  A source size maps to the destination key entry whose
    relation to ``dbase`` is closest to the source size's relation to
    ``sbase``.  Results are memoised in ``cache``.
    """

    def __init__(self, sbase, dbase, dkey):
        self.sbase = float(sbase)
        self.dprop = [(self.relate(x, dbase), float(x)) for x in dkey]
        self.cache = {}

    @staticmethod
    def relate(size, base):
        """Return a signed logarithmic measure of how far ``size`` is from ``base``.

        Sizes within 0.1 of the base give 0; smaller sizes give the negated
        logarithm, larger sizes the logarithm itself.
        """
        size, base = float(size), float(base)
        if abs(size - base) < 0.1:
            return 0
        if size < base:
            sign, endp = -1, 0
        else:
            sign, endp = 1, 36
        diff = (abs(base - size) * 3) + ((36 - size) / 100)
        logb = abs(base - endp)
        return sign * math.log(diff, logb)

    def __getitem__(self, ssize):
        try:
            return self.cache[ssize]
        except KeyError:
            dsize = self.map(ssize)
            self.cache[ssize] = dsize
            return dsize

    def map(self, ssize):
        """Return the destination size whose relation best matches ``ssize``."""
        prop = self.relate(ssize, self.sbase)
        nearest = min((abs(prop - p), s) for p, s in self.dprop)
        return nearest[1]
class ScaleMapper(object):
    """Map font sizes by multiplying with the ratio ``dbase / sbase``."""

    def __init__(self, sbase, dbase):
        self.dscale = float(dbase) / float(sbase)

    def __getitem__(self, ssize):
        return ssize * self.dscale
class NullMapper(object):
    """Identity font-size mapper: every size maps to itself."""

    def __init__(self):
        pass

    def __getitem__(self, ssize):
        return ssize
def FontMapper(sbase=None, dbase=None, dkey=None):
    """Return the font-size mapper matching the information supplied.

    With both base sizes and a destination key a ``KeyMapper`` is returned;
    with only the two base sizes a ``ScaleMapper``; otherwise a
    ``NullMapper``.
    """
    if not (sbase and dbase):
        return NullMapper()
    if dkey:
        return KeyMapper(sbase, dbase, dkey)
    return ScaleMapper(sbase, dbase)
class CSSFlattener(object):
    """Transform that flattens a book's CSS into one generated stylesheet.

    Every XHTML element in the spine ends up with at most one generated
    ``class`` whose rule holds all of the element's declarations; inline
    ``style`` attributes and some presentational attributes are folded in.
    Font sizes can optionally be remapped towards a destination base size.
    """

    def __init__(self, fbase=None, fkey=None, lineh=None, unfloat=False,
                 untable=False):
        """Store the transform options.

        fbase   -- destination base font size; no remapping when falsy
        fkey    -- destination font-size key passed on to FontMapper
        lineh   -- destination line height; when set, vertical margins and
                   padding are snapped to it and a default ``line-height``
                   is emitted
        unfloat -- drop ``display`` from floated elements (see flatten_node)
        untable -- rewrite ``display: table*`` to ``inline`` / ``block``
        """
        self.fbase = fbase
        self.fkey = fkey
        self.lineh = lineh
        self.unfloat = unfloat
        self.untable = untable

    def transform(self, oeb, context):
        """Run the transform on ``oeb`` using the profiles in ``context``."""
        oeb.logger.info('Flattening CSS and remapping font sizes...')
        self.oeb = oeb
        self.context = context
        self.stylize_spine()
        # The source base size is only needed when sizes are remapped.
        self.sbase = self.baseline_spine() if self.fbase else None
        self.fmap = FontMapper(self.sbase, self.fbase, self.fkey)
        self.flatten_spine()

    def stylize_spine(self):
        """Build a Stylizer for every spine item, using the source profile."""
        self.stylizers = {}
        profile = self.context.source
        for item in self.oeb.spine:
            html = item.data
            stylizer = Stylizer(html, item.href, self.oeb, profile)
            self.stylizers[item] = stylizer

    def baseline_node(self, node, stylizer, sizes, csize):
        """Add the text length under ``node`` to ``sizes``, keyed by font size.

        The ``csize`` argument is immediately replaced by the node's own
        computed font size; text and the tails of children are counted
        after whitespace runs are collapsed.
        """
        csize = stylizer.style(node)['font-size']
        if node.text:
            sizes[csize] += len(COLLAPSE.sub(' ', node.text))
        for child in node:
            self.baseline_node(child, stylizer, sizes, csize)
            if child.tail:
                sizes[csize] += len(COLLAPSE.sub(' ', child.tail))

    def baseline_spine(self):
        """Return the font size that carries the most text in the spine.

        Falls back to the source profile's base size when the spine holds
        no text at all.
        """
        sizes = defaultdict(float)
        for item in self.oeb.spine:
            html = item.data
            stylizer = self.stylizers[item]
            body = html.find(XHTML('body'))
            fsize = self.context.source.fbase
            self.baseline_node(body, stylizer, sizes, fsize)
        if sizes:
            sbase = max(sizes.items(), key=operator.itemgetter(1))[0]
        else:
            # max() of an empty sequence raises ValueError; a book without
            # any text has no dominant size, so use the profile's base.
            sbase = self.context.source.fbase
        self.oeb.logger.info(
            "Source base font size is %0.05fpt" % sbase)
        return sbase

    def clean_edges(self, cssdict, style, fsize):
        """Snap top/bottom margins and padding to multiples of the line height.

        Only properties present in ``cssdict`` and not given as percentages
        are touched.  Zero stays as it is, values up to 1.26 times the
        source base size become one destination line, larger values are
        rounded to a whole number of lines.  Results are written as ``em``
        relative to ``fsize``.
        """
        slineh = self.sbase * 1.26
        dlineh = self.lineh
        for kind in ('margin', 'padding'):
            for edge in ('bottom', 'top'):
                property = "%s-%s" % (kind, edge)
                if property not in cssdict:
                    continue
                if '%' in cssdict[property]:
                    continue
                value = style[property]
                if value == 0:
                    continue
                elif value <= slineh:
                    cssdict[property] = "%0.5fem" % (dlineh / fsize)
                else:
                    value = round(value / slineh) * dlineh
                    cssdict[property] = "%0.5fem" % (value / fsize)

    def flatten_node(self, node, stylizer, names, styles, psize, left=0):
        """Flatten ``node`` and, recursively, its children.

        ``names`` counts how often each class-name stem has been used,
        ``styles`` maps declaration text to the generated class name,
        ``psize`` is the parent's (mapped) font size and ``left`` the
        accumulated left margin.  Non-XHTML nodes are skipped together
        with their subtree.
        """
        if not isinstance(node.tag, basestring) \
           or namespace(node.tag) != XHTML_NS:
            return
        tag = barename(node.tag)
        style = stylizer.style(node)
        cssdict = style.cssdict()
        # Fold presentational attributes into the declarations.
        if 'align' in node.attrib:
            cssdict['text-align'] = node.attrib['align']
            del node.attrib['align']
        if node.tag == XHTML('font'):
            node.tag = XHTML('span')
            if 'size' in node.attrib:
                size = node.attrib['size'].strip()
                if size:
                    fnums = self.context.source.fnums
                    if size[0] in ('+', '-'):
                        # Oh, the warcrimes
                        cssdict['font-size'] = fnums[3+int(size)]
                    else:
                        cssdict['font-size'] = fnums[int(size)]
                del node.attrib['size']
        if 'color' in node.attrib:
            cssdict['color'] = node.attrib['color']
            del node.attrib['color']
        if 'bgcolor' in node.attrib:
            cssdict['background-color'] = node.attrib['bgcolor']
            del node.attrib['bgcolor']
        if cssdict:
            if 'font-size' in cssdict:
                fsize = self.fmap[style['font-size']]
                cssdict['font-size'] = "%0.5fem" % (fsize / psize)
                psize = fsize
            if self.lineh and self.fbase and tag != 'body':
                self.clean_edges(cssdict, style, psize)
            margin = style['margin-left']
            left += margin if isinstance(margin, float) else 0
            if (left + style['text-indent']) < 0:
                percent = (margin - style['text-indent']) / style['width']
                cssdict['margin-left'] = "%d%%" % (percent * 100)
                left -= style['text-indent']
            if 'display' in cssdict and cssdict['display'] == 'in-line':
                cssdict['display'] = 'inline'
            if self.unfloat and 'float' in cssdict \
               and cssdict.get('display', 'none') != 'none':
                del cssdict['display']
            if self.untable and 'display' in cssdict \
               and cssdict['display'].startswith('table'):
                display = cssdict['display']
                if display == 'table-cell':
                    cssdict['display'] = 'inline'
                else:
                    cssdict['display'] = 'block'
            if 'vertical-align' in cssdict \
               and cssdict['vertical-align'] == 'sup':
                cssdict['vertical-align'] = 'super'
        if self.lineh and 'line-height' not in cssdict:
            lineh = self.lineh / psize
            cssdict['line-height'] = "%0.5fem" % lineh
        if cssdict:
            items = sorted(cssdict.items())
            css = u';\n'.join(u'%s: %s' % (key, val) for key, val in items)
            # An empty or whitespace-only class attribute yields no names;
            # fall back to the default stem instead of indexing an empty
            # list.
            classes = node.get('class', 'calibre').split()
            klass = STRIPNUM.sub('', classes[0] if classes else 'calibre')
            if css in styles:
                match = styles[css]
            else:
                match = klass + str(names[klass] or '')
                styles[css] = match
                names[klass] += 1
            node.attrib['class'] = match
        elif 'class' in node.attrib:
            del node.attrib['class']
        if 'style' in node.attrib:
            del node.attrib['style']
        for child in node:
            self.flatten_node(child, stylizer, names, styles, psize, left)

    def flatten_head(self, item, stylizer, href):
        """Replace the stylesheet references in ``item``'s ``<head>``.

        Existing stylesheet ``<link>`` and ``<style>`` elements are removed
        and a single link to ``href`` is added; any ``@page`` declarations
        collected by the stylizer are re-emitted in a ``<style>`` element.
        """
        html = item.data
        head = html.find(XHTML('head'))
        # Iterate over a snapshot: children are removed inside the loop and
        # the traversal should not depend on how the element iterator copes
        # with concurrent removal.
        for node in list(head):
            if node.tag == XHTML('link') \
               and node.get('rel', 'stylesheet') == 'stylesheet' \
               and node.get('type', CSS_MIME) in OEB_STYLES:
                head.remove(node)
            elif node.tag == XHTML('style') \
                 and node.get('type', CSS_MIME) in OEB_STYLES:
                head.remove(node)
        href = item.relhref(href)
        etree.SubElement(head, XHTML('link'),
            rel='stylesheet', type=CSS_MIME, href=href)
        if stylizer.page_rule:
            items = sorted(stylizer.page_rule.items())
            css = '; '.join("%s: %s" % (key, val) for key, val in items)
            style = etree.SubElement(head, XHTML('style'), type=CSS_MIME)
            style.text = "@page { %s; }" % css

    def replace_css(self, css):
        """Replace all stylesheets in the manifest by one holding ``css``.

        Returns the href of the new stylesheet.
        """
        manifest = self.oeb.manifest
        id, href = manifest.generate('css', 'stylesheet.css')
        # Snapshot the items first: entries are removed while looping.
        for item in list(manifest.values()):
            if item.media_type in OEB_STYLES:
                manifest.remove(item)
        item = manifest.add(id, href, CSS_MIME, data=css)
        return href

    def flatten_spine(self):
        """Flatten every spine item and install the generated stylesheet."""
        names = defaultdict(int)
        styles = {}
        for item in self.oeb.spine:
            html = item.data
            stylizer = self.stylizers[item]
            body = html.find(XHTML('body'))
            fsize = self.context.dest.fbase
            self.flatten_node(body, stylizer, names, styles, fsize)
        # ``styles`` maps declaration text -> class name; emit by class name.
        items = [(key, val) for (val, key) in styles.items()]
        items.sort()
        css = ''.join(".%s {\n%s;\n}\n\n" % (key, val) for key, val in items)
        href = self.replace_css(css)
        for item in self.oeb.spine:
            stylizer = self.stylizers[item]
            self.flatten_head(item, stylizer, href)

View File

@ -0,0 +1,87 @@
'''
HTML-TOC-adding transform.
'''
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
import sys
import os
from lxml import etree
from calibre.ebooks.oeb.base import XML, XHTML, XHTML_NS
from calibre.ebooks.oeb.base import XHTML_MIME, CSS_MIME
from calibre.ebooks.oeb.base import element
# Stylesheet text for each supported table-of-contents layout, keyed by the
# style name accepted by HTMLTOCAdder.  The selectors target the classes that
# HTMLTOCAdder puts on the generated heading and entry blocks.
STYLE_CSS = {
'nested': """
.calibre_toc_header {
text-align: center;
}
.calibre_toc_block {
margin-left: 1.2em;
text-indent: -1.2em;
}
.calibre_toc_block .calibre_toc_block {
margin-left: 2.4em;
}
.calibre_toc_block .calibre_toc_block .calibre_toc_block {
margin-left: 3.6em;
}
""",
'centered': """
.calibre_toc_header {
text-align: center;
}
.calibre_toc_block {
text-align: center;
}
body > .calibre_toc_block {
margin-top: 1.2em;
}
"""
}
class HTMLTOCAdder(object):
    """Transform that adds an XHTML table-of-contents page to a book.

    Nothing is done when the guide already has a ``toc`` entry.
    """

    def __init__(self, style='nested'):
        self.style = style

    def transform(self, oeb, context):
        """Generate the TOC page and its stylesheet and register both.

        The stylesheet and the page are added to the manifest, the page is
        appended to the spine as non-linear and referenced from the guide
        as ``toc``.  An unknown style name is logged as an error and
        replaced by ``'nested'``.
        """
        if 'toc' in oeb.guide:
            return
        oeb.logger.info('Generating in-line TOC...')
        style = self.style
        if style not in STYLE_CSS:
            oeb.logger.error('Unknown TOC style %r' % style)
            style = 'nested'
        manifest = oeb.manifest
        css_id, css_href = manifest.generate('tocstyle', 'tocstyle.css')
        manifest.add(css_id, css_href, CSS_MIME, data=STYLE_CSS[style])
        language = str(oeb.metadata.language[0])
        # Document skeleton: <html><head>...</head><body>...</body></html>
        contents = element(None, XHTML('html'), nsmap={None: XHTML_NS},
                           attrib={XML('lang'): language})
        head = element(contents, XHTML('head'))
        element(head, XHTML('title')).text = 'Table of Contents'
        element(head, XHTML('link'), rel='stylesheet', type=CSS_MIME,
                href=css_href)
        body = element(contents, XHTML('body'),
                       attrib={'class': 'calibre_toc'})
        heading = element(body, XHTML('h1'),
                          attrib={'class': 'calibre_toc_header'})
        heading.text = 'Table of Contents'
        self.add_toc_level(body, oeb.toc)
        toc_id, toc_href = manifest.generate('contents', 'contents.xhtml')
        toc_item = manifest.add(toc_id, toc_href, XHTML_MIME, data=contents)
        oeb.spine.add(toc_item, linear=False)
        oeb.guide.add('toc', 'Table of Contents', toc_href)

    def add_toc_level(self, elem, toc):
        """Append one block per entry of ``toc`` to ``elem``, recursing into sub-entries."""
        block_attrib = {'class': 'calibre_toc_block'}
        for node in toc:
            block = element(elem, XHTML('div'), attrib=block_attrib)
            link_attrib = {'href': node.href, 'class': 'calibre_toc_line'}
            link = element(block, XHTML('a'), attrib=link_attrib)
            link.text = node.title
            self.add_toc_level(block, node)

View File

@ -0,0 +1,112 @@
'''
CSS case-mangling transform.
'''
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
import sys
import os
import re
import operator
import math
from itertools import chain
from collections import defaultdict
from lxml import etree
from calibre.ebooks.oeb.base import XHTML, XHTML_NS
from calibre.ebooks.oeb.base import CSS_MIME
from calibre.ebooks.oeb.base import namespace
from calibre.ebooks.oeb.stylizer import Stylizer
# Stylesheet that CaseMangler adds to the manifest and links from every spine
# item.  Its single rule styles the <span class="calibre_lowercase"> wrappers
# created by CaseMangler.smallcaps_elem().
CASE_MANGLER_CSS = """
.calibre_lowercase {
font-variant: normal;
font-size: 0.65em;
}
"""
# 'text-transform' values that CaseMangler applies directly to the text.
TEXT_TRANSFORMS = set(['capitalize', 'uppercase', 'lowercase'])
class CaseMangler(object):
    """Transform that bakes CSS case changes into the document text.

    ``text-transform`` values listed in ``TEXT_TRANSFORMS`` are applied to
    the text itself, and ``font-variant: small-caps`` is emulated by
    upper-casing the non-uppercase runs inside ``calibre_lowercase`` spans.
    """

    def transform(self, oeb, context):
        """Run the transform on ``oeb`` using the source profile of ``context``."""
        oeb.logger.info('Applying case-transforming CSS...')
        self.oeb = oeb
        self.profile = context.source
        self.mangle_spine()

    def mangle_spine(self):
        """Add the helper stylesheet and process the body of every spine item."""
        manifest = self.oeb.manifest
        id, href = manifest.generate('manglecase', 'manglecase.css')
        manifest.add(id, href, CSS_MIME, data=CASE_MANGLER_CSS)
        for item in self.oeb.spine:
            html = item.data
            relhref = item.relhref(href)
            head = html.find(XHTML('head'))
            etree.SubElement(head, XHTML('link'),
                             rel='stylesheet', href=relhref, type=CSS_MIME)
            stylizer = Stylizer(html, item.href, self.oeb, self.profile)
            self.mangle_elem(html.find(XHTML('body')), stylizer)

    def text_transform(self, transform, text):
        """Return ``text`` with the named ``text-transform`` applied; unknown names leave it unchanged."""
        method_names = {'capitalize': 'title',
                        'uppercase': 'upper',
                        'lowercase': 'lower'}
        method_name = method_names.get(transform)
        if method_name is None:
            return text
        return getattr(text, method_name)()

    def split_text(self, text):
        """Split ``text`` into alternating runs of uppercase and other characters."""
        current_upper = text[0].isupper()
        runs = []
        run = ''
        for char in text:
            if char.isupper() != current_upper:
                current_upper = not current_upper
                runs.append(run)
                run = char
            else:
                run += char
        runs.append(run)
        return runs

    def smallcaps_elem(self, elem, attr):
        """Emulate small caps for ``elem.text`` or ``elem.tail`` (chosen by ``attr``).

        Uppercase runs stay plain text; every other run is upper-cased and
        wrapped in a ``<span class="calibre_lowercase">``.
        """
        runs = self.split_text(getattr(elem, attr))
        setattr(elem, attr, None)
        # ``anchor`` is the node after which the next piece is attached;
        # None means "at the very start of elem's content".
        anchor = elem if attr == 'tail' else None
        span_attrib = {'class': 'calibre_lowercase'}
        for run in runs:
            if run.isupper():
                if anchor is None:
                    elem.text = run
                else:
                    anchor.tail = run
                continue
            span = etree.Element(XHTML('span'), attrib=span_attrib)
            span.text = run.upper()
            if anchor is None:
                elem.insert(0, span)
            else:
                # addnext() moves the tail for some reason
                saved_tail = anchor.tail
                anchor.addnext(span)
                anchor.tail = saved_tail
                span.tail = None
            anchor = span

    def mangle_elem(self, elem, stylizer):
        """Apply case changes to ``elem``'s text and child tails, then recurse."""
        if not isinstance(elem.tag, basestring) or \
           namespace(elem.tag) != XHTML_NS:
            return
        # Snapshot the children before small-caps spans are inserted.
        children = list(elem)
        style = stylizer.style(elem)
        transform = style['text-transform']
        variant = style['font-variant']
        recase = transform in TEXT_TRANSFORMS
        smallcaps = variant == 'small-caps'
        if elem.text:
            if recase:
                elem.text = self.text_transform(transform, elem.text)
            if smallcaps:
                self.smallcaps_elem(elem, 'text')
        for child in children:
            self.mangle_elem(child, stylizer)
            if not child.tail:
                continue
            if recase:
                child.tail = self.text_transform(transform, child.tail)
            if smallcaps:
                self.smallcaps_elem(child, 'tail')

View File

@ -0,0 +1,190 @@
'''
SVG rasterization transform.
'''
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
import sys
import os
from urlparse import urldefrag
import base64
from lxml import etree
from PyQt4.QtCore import Qt
from PyQt4.QtCore import QByteArray
from PyQt4.QtCore import QBuffer
from PyQt4.QtCore import QIODevice
from PyQt4.QtGui import QColor
from PyQt4.QtGui import QImage
from PyQt4.QtGui import QPainter
from PyQt4.QtSvg import QSvgRenderer
from PyQt4.QtGui import QApplication
from calibre.ebooks.oeb.base import XHTML_NS, XHTML, SVG_NS, SVG, XLINK
from calibre.ebooks.oeb.base import SVG_MIME, PNG_MIME, JPEG_MIME
from calibre.ebooks.oeb.base import xml2str, xpath, namespace, barename
from calibre.ebooks.oeb.stylizer import Stylizer
# Qualified tag names of the XHTML <img> and <object> elements.
# NOTE(review): not referenced in the visible part of this module -- confirm
# whether it is still needed.
IMAGE_TAGS = set([XHTML('img'), XHTML('object')])
# Attributes that survive when SVGRasterizer.rasterize_external() turns an
# element into an <img>; every other attribute is deleted.
KEEP_ATTRS = set(['class', 'style', 'width', 'height', 'align'])
class SVGRasterizer(object):
def __init__(self):
if QApplication.instance() is None:
QApplication([])
def transform(self, oeb, context):
oeb.logger.info('Rasterizing SVG images...')
self.oeb = oeb
self.profile = context.dest
self.images = {}
self.dataize_manifest()
self.rasterize_spine()
self.rasterize_cover()
def rasterize_svg(self, elem, width=0, height=0, format='PNG'):
data = QByteArray(xml2str(elem))
svg = QSvgRenderer(data)
size = svg.defaultSize()
if size.width() == 100 and size.height() == 100 \
and 'viewBox' in elem.attrib:
box = [float(x) for x in elem.attrib['viewBox'].split()]
size.setWidth(box[2] - box[0])
size.setHeight(box[3] - box[1])
if width or height:
size.scale(width, height, Qt.KeepAspectRatio)
logger = self.oeb.logger
logger.info('Rasterizing %r to %dx%d'
% (elem, size.width(), size.height()))
image = QImage(size, QImage.Format_ARGB32_Premultiplied)
image.fill(QColor("white").rgb())
painter = QPainter(image)
svg.render(painter)
painter.end()
array = QByteArray()
buffer = QBuffer(array)
buffer.open(QIODevice.WriteOnly)
image.save(buffer, format)
return str(array)
def dataize_manifest(self):
for item in self.oeb.manifest.values():
if item.media_type == SVG_MIME:
self.dataize_svg(item)
def dataize_svg(self, item, svg=None):
if svg is None:
svg = item.data
hrefs = self.oeb.manifest.hrefs
for elem in xpath(svg, '//svg:*[@xl:href]'):
href = elem.attrib[XLINK('href')]
path, frag = urldefrag(href)
if not path:
continue
abshref = item.abshref(path)
if abshref not in hrefs:
continue
linkee = hrefs[abshref]
data = base64.encodestring(str(linkee))
data = "data:%s;base64,%s" % (linkee.media_type, data)
elem.attrib[XLINK('href')] = data
return svg
def rasterize_spine(self):
for item in self.oeb.spine:
html = item.data
stylizer = Stylizer(html, item.href, self.oeb, self.profile)
self.rasterize_item(item, stylizer)
def rasterize_item(self, item, stylizer):
html = item.data
hrefs = self.oeb.manifest.hrefs
for elem in xpath(html, '//h:img'):
src = elem.get('src', None)
image = hrefs.get(item.abshref(src), None) if src else None
if image and image.media_type == SVG_MIME:
style = stylizer.style(elem)
self.rasterize_external(elem, style, item, image)
for elem in xpath(html, '//h:object[@type="%s"]' % SVG_MIME):
data = elem.get('data', None)
image = hrefs.get(item.abshref(data), None) if data else None
if image and image.media_type == SVG_MIME:
style = stylizer.style(elem)
self.rasterize_external(elem, style, item, image)
for elem in xpath(html, '//svg:svg'):
style = stylizer.style(elem)
self.rasterize_inline(elem, style, item)
def rasterize_inline(self, elem, style, item):
width = style['width']
height = style['height']
width = (width / 72) * self.profile.dpi
height = (height / 72) * self.profile.dpi
elem = self.dataize_svg(item, elem)
data = self.rasterize_svg(elem, width, height)
manifest = self.oeb.manifest
href = os.path.splitext(item.href)[0] + '.png'
id, href = manifest.generate(item.id, href)
manifest.add(id, href, PNG_MIME, data=data)
img = etree.Element(XHTML('img'), src=item.relhref(href))
elem.getparent().replace(elem, img)
for prop in ('width', 'height'):
if prop in elem.attrib:
img.attrib[prop] = elem.attrib[prop]
def rasterize_external(self, elem, style, item, svgitem):
    """Turn ``elem`` into an ``<img>`` showing a PNG rendering of ``svgitem``.

    The SVG is rendered with Qt at its default size scaled, keeping the
    aspect ratio, to fit the size taken from ``style``.  Renderings are
    cached in ``self.images`` keyed by ``(href, width, height)`` so the
    same SVG at the same pixel size is rasterized and added to the
    manifest only once.

    ``elem`` is rewritten in place: its tag becomes ``img``, attributes not
    listed in ``KEEP_ATTRS`` are dropped, ``src`` is pointed at the PNG,
    any text content becomes the ``alt`` attribute and all children are
    removed.

    :param elem: the referencing element (an ``<img>`` or ``<object>``).
    :param style: computed style of ``elem``; its ``width`` and ``height``
        are scaled by ``profile.dpi / 72`` to get the bounding size.
    :param item: manifest item of the document containing ``elem``.
    :param svgitem: manifest item of the referenced SVG.
    """
    width = style['width']
    height = style['height']
    width = (width / 72) * self.profile.dpi
    height = (height / 72) * self.profile.dpi
    data = QByteArray(str(svgitem))
    svg = QSvgRenderer(data)
    size = svg.defaultSize()
    size.scale(width, height, Qt.KeepAspectRatio)
    key = (svgitem.href, size.width(), size.height())
    if key in self.images:
        href = self.images[key]
    else:
        logger = self.oeb.logger
        logger.info('Rasterizing %r to %dx%d'
                    % (svgitem.href, size.width(), size.height()))
        image = QImage(size, QImage.Format_ARGB32_Premultiplied)
        image.fill(QColor("white").rgb())
        painter = QPainter(image)
        svg.render(painter)
        painter.end()
        array = QByteArray()
        buffer = QBuffer(array)
        buffer.open(QIODevice.WriteOnly)
        image.save(buffer, 'PNG')
        data = str(array)
        manifest = self.oeb.manifest
        href = os.path.splitext(svgitem.href)[0] + '.png'
        png_id, href = manifest.generate(svgitem.id, href)
        manifest.add(png_id, href, PNG_MIME, data=data)
        self.images[key] = href
    elem.tag = XHTML('img')
    # Walk snapshots below: both loops delete from the very collection they
    # are visiting, which is only safe if the iterator happens to tolerate
    # concurrent modification.
    for attr in list(elem.attrib):
        if attr not in KEEP_ATTRS:
            del elem.attrib[attr]
    elem.attrib['src'] = item.relhref(href)
    if elem.text:
        elem.attrib['alt'] = elem.text
        elem.text = None
    for child in list(elem):
        elem.remove(child)
def rasterize_cover(self):
    """Replace an SVG cover with a PNG rendering sized from the profile.

    Does nothing when the book declares no cover, when the declared cover
    id is not present in the manifest, or when the cover is not SVG.
    Otherwise the cover is rendered by ``self.rasterize_svg``, the PNG is
    added to the manifest under a generated id/href, and the first cover
    metadata entry is pointed at the new id.
    """
    covers = self.oeb.metadata.cover
    if not covers:
        return
    manifest = self.oeb.manifest
    cover_id = str(covers[0])
    if cover_id not in manifest.ids:
        # A cover entry naming a missing manifest item used to raise
        # KeyError here and abort the whole transform; there is simply
        # nothing to rasterize in that case.
        self.oeb.logger.info(
            'Cover %r not found in manifest, not rasterizing' % cover_id)
        return
    cover = manifest.ids[cover_id]
    if cover.media_type != SVG_MIME:
        return
    # NOTE(review): under Python 2 `/ 72` floor-divides if the profile
    # dimensions are ints -- confirm they are floats or that this is meant.
    width = (self.profile.width / 72) * self.profile.dpi
    height = (self.profile.height / 72) * self.profile.dpi
    data = self.rasterize_svg(cover.data, width, height)
    href = os.path.splitext(cover.href)[0] + '.png'
    png_id, href = manifest.generate(cover.id, href)
    manifest.add(png_id, href, PNG_MIME, data=data)
    covers[0].value = png_id

View File

@ -0,0 +1,68 @@
'''
OPF manifest trimming transform.
'''
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
import sys
import os
from itertools import chain
from urlparse import urldefrag
from lxml import etree
import cssutils
from calibre.ebooks.oeb.base import XPNSMAP, CSS_MIME, OEB_DOCS
# Compiled XPath queries, each yielding the link attribute values found in a
# parsed document.
LINK_SELECTORS = [
    etree.XPath(expr, namespaces=XPNSMAP)
    for expr in ('//h:link/@href', '//h:img/@src', '//h:object/@data',
                 '//*/@xl:href')
]
class ManifestTrimmer(object):
    """Transform that removes manifest items nothing in the book refers to."""

    def transform(self, oeb, context):
        """Drop every manifest item that is not reachable from the book.

        The starting set is: items named (by href or by id) in any metadata
        value, items targeted by guide references, and all spine items.
        Links found inside those items are then followed repeatedly until
        no new manifest items turn up.  Whatever was never reached is
        removed from ``oeb.manifest``, with one log line per removal.

        :param oeb: the book to trim; modified in place.
        :param context: unused here.
        """
        oeb.logger.info('Trimming unused files from manifest...')
        manifest = oeb.manifest
        used = set()
        for term in oeb.metadata:
            for item in oeb.metadata[term]:
                if item.value in manifest.hrefs:
                    used.add(manifest.hrefs[item.value])
                elif item.value in manifest.ids:
                    used.add(manifest.ids[item.value])
        for ref in oeb.guide.values():
            path, _ = urldefrag(ref.href)
            if path in manifest.hrefs:
                used.add(manifest.hrefs[path])
        # TOC items are required to be in the spine
        for item in oeb.spine:
            used.add(item)
        unchecked = used
        while unchecked:
            new = set()
            for item in unchecked:
                for found in self._linked_items(oeb, item):
                    if found not in used:
                        new.add(found)
            # Only grow `used` once the pass is over: on the first pass
            # `unchecked` is the very same set object.
            used.update(new)
            unchecked = new
        # Iterate over a snapshot since entries are removed along the way.
        for item in list(manifest.values()):
            if item not in used:
                oeb.logger.info('Trimming %r from manifest' % item.href)
                manifest.remove(item)

    def _linked_items(self, oeb, item):
        """Return the set of manifest items that ``item`` links to directly.

        OEB documents and other XML items are searched with
        ``LINK_SELECTORS``; CSS items have their ``url()`` references
        visited through ``cssutils.replaceUrls``.  Items of any other media
        type yield an empty set.
        """
        hrefs = oeb.manifest.hrefs
        linked = set()

        def note(link):
            # Manifest hrefs carry no fragment, so "font.svg#Face" has to be
            # looked up as "font.svg"; a bare "#frag" points back into the
            # item itself and is of no interest.
            path, _ = urldefrag(link)
            if not path:
                return
            abshref = item.abshref(path)
            if abshref in hrefs:
                linked.add(hrefs[abshref])

        media_type = item.media_type
        if media_type in OEB_DOCS or media_type[-4:] in ('/xml', '+xml'):
            for selector in LINK_SELECTORS:
                for link in selector(item.data):
                    note(link)
        elif media_type == CSS_MIME:
            def replacer(uri):
                # replaceUrls() is used purely as a visitor: record the
                # target and hand the URI back unchanged.
                note(uri)
                return uri
            sheet = cssutils.parseString(item.data, href=item.href)
            cssutils.replaceUrls(sheet, replacer)
        return linked

View File

@ -48,12 +48,14 @@ entry_points = {
'any2lrf = calibre.ebooks.lrf.any.convert_from:main',
'any2epub = calibre.ebooks.epub.from_any:main',
'any2lit = calibre.ebooks.lit.from_any:main',
'any2mobi = calibre.ebooks.mobi.from_any:main',
'lrf2lrs = calibre.ebooks.lrf.lrfparser:main',
'lrs2lrf = calibre.ebooks.lrf.lrs.convert_from:main',
'pdfreflow = calibre.ebooks.lrf.pdf.reflow:main',
'isbndb = calibre.ebooks.metadata.isbndb:main',
'librarything = calibre.ebooks.metadata.library_thing:main',
'mobi2oeb = calibre.ebooks.mobi.reader:main',
'oeb2mobi = calibre.ebooks.mobi.writer:main',
'lrf2html = calibre.ebooks.lrf.html.convert_to:main',
'lit2oeb = calibre.ebooks.lit.reader:main',
'oeb2lit = calibre.ebooks.lit.writer:main',
@ -190,6 +192,8 @@ def setup_completion(fatal_errors):
from calibre.ebooks.epub.from_any import option_parser as any2epub
from calibre.ebooks.lit.from_any import option_parser as any2lit
from calibre.ebooks.epub.from_comic import option_parser as comic2epub
from calibre.ebooks.mobi.from_any import option_parser as any2mobi
from calibre.ebooks.mobi.writer import option_parser as oeb2mobi
from calibre.gui2.main import option_parser as guiop
any_formats = ['epub', 'htm', 'html', 'xhtml', 'xhtm', 'rar', 'zip',
'txt', 'lit', 'rtf', 'pdf', 'prc', 'mobi', 'fb2', 'odt']
@ -214,6 +218,8 @@ def setup_completion(fatal_errors):
f.write(opts_and_exts('calibre', guiop, any_formats))
f.write(opts_and_exts('any2epub', any2epub, any_formats))
f.write(opts_and_exts('any2lit', any2lit, any_formats))
f.write(opts_and_exts('any2mobi', any2mobi, any_formats))
f.write(opts_and_exts('oeb2mobi', oeb2mobi, ['mobi', 'prc']))
f.write(opts_and_exts('lrf2lrs', lrf2lrsop, ['lrf']))
f.write(opts_and_exts('lrf-meta', metaop, ['lrf']))
f.write(opts_and_exts('rtf-meta', metaop, ['rtf']))
@ -230,7 +236,7 @@ def setup_completion(fatal_errors):
f.write(opts_and_exts('lit2oeb', lit2oeb, ['lit']))
f.write(opts_and_exts('comic2lrf', comicop, ['cbz', 'cbr']))
f.write(opts_and_exts('comic2epub', comic2epub, ['cbz', 'cbr']))
f.write(opts_and_exts('comic2pdf', comic2epub, ['cbz', 'cbr']))
f.write(opts_and_exts('comic2pdf', comic2epub, ['cbz', 'cbr']))
f.write(opts_and_words('feeds2disk', feeds2disk, feed_titles))
f.write(opts_and_words('feeds2lrf', feeds2lrf, feed_titles))
f.write(opts_and_words('feeds2lrf', feeds2epub, feed_titles))