From f740d20f32e9ca2fbedcb2bcff5e7e4d9b5dfcd4 Mon Sep 17 00:00:00 2001 From: "Marshall T. Vandegrift" Date: Sun, 7 Dec 2008 23:53:14 -0500 Subject: [PATCH] Adding initial LitWriter and oeb2lit code. --- setup.py | 1 + src/calibre/ebooks/lit/html.css | 420 ++++++++++++++++++ src/calibre/ebooks/lit/lzxcomp.py | 176 ++++++++ src/calibre/ebooks/lit/oeb.py | 690 +++++++++++++++++++++++++++++ src/calibre/ebooks/lit/split.py | 149 +++++++ src/calibre/ebooks/lit/stylizer.py | 435 ++++++++++++++++++ src/calibre/ebooks/lit/writer.py | 655 +++++++++++++++++++++++++++ src/calibre/linux.py | 1 + 8 files changed, 2527 insertions(+) create mode 100644 src/calibre/ebooks/lit/html.css create mode 100644 src/calibre/ebooks/lit/lzxcomp.py create mode 100644 src/calibre/ebooks/lit/oeb.py create mode 100644 src/calibre/ebooks/lit/split.py create mode 100644 src/calibre/ebooks/lit/stylizer.py create mode 100644 src/calibre/ebooks/lit/writer.py diff --git a/setup.py b/setup.py index 37d54c4317..aa72b46f00 100644 --- a/setup.py +++ b/setup.py @@ -146,6 +146,7 @@ if __name__ == '__main__': metadata_sqlite = 'library/metadata_sqlite.sql', jquery = 'gui2/viewer/jquery.js', jquery_scrollTo = 'gui2/viewer/jquery_scrollTo.js', + html_css = 'ebooks/lit/html.css', ) DEST = os.path.join('src', APPNAME, 'resources.py') diff --git a/src/calibre/ebooks/lit/html.css b/src/calibre/ebooks/lit/html.css new file mode 100644 index 0000000000..5b75ea6649 --- /dev/null +++ b/src/calibre/ebooks/lit/html.css @@ -0,0 +1,420 @@ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. 
See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is mozilla.org code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * Blake Ross + * + * Alternatively, the contents of this file may be used under the terms of + * either of the GNU General Public License Version 2 or later (the "GPL"), + * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. 
+ * + * ***** END LICENSE BLOCK ***** */ + +@namespace url(http://www.w3.org/1999/xhtml); /* set default namespace to HTML */ + +/* blocks */ + +html, div, map, dt, isindex, form { + display: block; +} + +body { + display: block; + margin: 8px; +} + +p, dl, multicol { + display: block; + margin: 1em 0; +} + +dd { + display: block; +} + +blockquote { + display: block; + margin: 1em 40px; +} + +address { + display: block; + font-style: italic; +} + +center { + display: block; + text-align: center; +} + +blockquote[type=cite] { + display: block; + margin: 1em 0px; + border-color: blue; + border-width: thin; +} + +span[_moz_quote=true] { + color: blue; +} + +pre[_moz_quote=true] { + color: blue; +} + +h1 { + display: block; + font-size: 2em; + font-weight: bold; + margin: .67em 0; +} + +h2 { + display: block; + font-size: 1.5em; + font-weight: bold; + margin: .83em 0; +} + +h3 { + display: block; + font-size: 1.17em; + font-weight: bold; + margin: 1em 0; +} + +h4 { + display: block; + font-weight: bold; + margin: 1.33em 0; +} + +h5 { + display: block; + font-size: 0.83em; + font-weight: bold; + margin: 1.67em 0; +} + +h6 { + display: block; + font-size: 0.67em; + font-weight: bold; + margin: 2.33em 0; +} + +listing { + display: block; + font-family: monospace; + font-size: medium; + white-space: pre; + margin: 1em 0; +} + +xmp, pre, plaintext { + display: block; + font-family: monospace; + white-space: pre; + margin: 1em 0; +} + +/* tables */ + +table { + display: table; + border-spacing: 2px; + border-collapse: separate; + margin-top: 0; + margin-bottom: 0; + text-indent: 0; +} + +table[align="left"] { + float: left; +} + +table[align="right"] { + float: right; +} + +table[rules]:not([rules="none"]) { + border-collapse: collapse; +} + +/* caption inherits from table not table-outer */ +caption { + display: table-caption; + text-align: center; +} + +table[align="center"] > caption { + margin-left: auto; + margin-right: auto; +} + +table[align="center"] > 
caption[align="left"] { + margin-right: 0; +} + +table[align="center"] > caption[align="right"] { + margin-left: 0; +} + +tr { + display: table-row; + vertical-align: inherit; +} + +col { + display: table-column; +} + +colgroup { + display: table-column-group; +} + +tbody { + display: table-row-group; + vertical-align: middle; +} + +thead { + display: table-header-group; + vertical-align: middle; +} + +tfoot { + display: table-footer-group; + vertical-align: middle; +} + +/* for XHTML tables without tbody */ +table > tr { + vertical-align: middle; +} + +td { + display: table-cell; + vertical-align: inherit; + text-align: inherit; + padding: 1px; +} + +th { + display: table-cell; + vertical-align: inherit; + font-weight: bold; + padding: 1px; +} + +/* inlines */ + +q:before { + content: open-quote; +} + +q:after { + content: close-quote; +} + +b, strong { + font-weight: bolder; +} + +i, cite, em, var, dfn { + font-style: italic; +} + +tt, code, kbd, samp { + font-family: monospace; +} + +u, ins { + text-decoration: underline; +} + +s, strike, del { + text-decoration: line-through; +} + +blink { + text-decoration: blink; +} + +big { + font-size: larger; +} + +small { + font-size: smaller; +} + +sub { + vertical-align: sub; + font-size: smaller; + line-height: normal; +} + +sup { + vertical-align: super; + font-size: smaller; + line-height: normal; +} + +nobr { + white-space: nowrap; +} + +/* titles */ +abbr[title], acronym[title] { + border-bottom: dotted 1px; +} + +/* lists */ + +ul, menu, dir { + display: block; + list-style-type: disc; + margin: 1em 0; +} + +ol { + display: block; + list-style-type: decimal; + margin: 1em 0; +} + +li { + display: list-item; +} + +/* nested lists have no top/bottom margins */ +ul ul, ul ol, ul dir, ul menu, ul dl, +ol ul, ol ol, ol dir, ol menu, ol dl, +dir ul, dir ol, dir dir, dir menu, dir dl, +menu ul, menu ol, menu dir, menu menu, menu dl, +dl ul, dl ol, dl dir, dl menu, dl dl { + margin-top: 0; + margin-bottom: 0; +} + +/* 2 
deep unordered lists use a circle */ +ol ul, ul ul, menu ul, dir ul, +ol menu, ul menu, menu menu, dir menu, +ol dir, ul dir, menu dir, dir dir { + list-style-type: circle; +} + +/* 3 deep (or more) unordered lists use a square */ +ol ol ul, ol ul ul, ol menu ul, ol dir ul, +ol ol menu, ol ul menu, ol menu menu, ol dir menu, +ol ol dir, ol ul dir, ol menu dir, ol dir dir, +ul ol ul, ul ul ul, ul menu ul, ul dir ul, +ul ol menu, ul ul menu, ul menu menu, ul dir menu, +ul ol dir, ul ul dir, ul menu dir, ul dir dir, +menu ol ul, menu ul ul, menu menu ul, menu dir ul, +menu ol menu, menu ul menu, menu menu menu, menu dir menu, +menu ol dir, menu ul dir, menu menu dir, menu dir dir, +dir ol ul, dir ul ul, dir menu ul, dir dir ul, +dir ol menu, dir ul menu, dir menu menu, dir dir menu, +dir ol dir, dir ul dir, dir menu dir, dir dir dir { + list-style-type: square; +} + + +/* leafs */ + +/*
noshade and color attributes are handled completely by + * the nsHTMLHRElement attribute mapping code + */ +hr { + display: block; + height: 2px; + border: 1px inset; + margin: 0.5em auto 0.5em auto; + color: gray; +} + +hr[size="1"] { + border-style: solid none none none; +} + +img[usemap], object[usemap] { + color: blue; +} + +frameset { + display: block ! important; + position: static ! important; + float: none ! important; + border: none ! important; +} + +frame { + border: none ! important; +} + +iframe { + border: 2px inset; +} + +noframes { + display: none; +} + +spacer { + position: static ! important; + float: none ! important; +} + +/* focusable content: anything w/ tabindex >=0 is focusable */ +abbr:focus, acronym:focus, address:focus, applet:focus, b:focus, +base:focus, big:focus, blockquote:focus, br:focus, canvas:focus, caption:focus, +center:focus, cite:focus, code:focus, col:focus, colgroup:focus, dd:focus, +del:focus, dfn:focus, dir:focus, div:focus, dl:focus, dt:focus, em:focus, +fieldset:focus, font:focus, form:focus, h1:focus, h2:focus, h3:focus, h4:focus, +h5:focus, h6:focus, hr:focus, i:focus, img:focus, ins:focus, +kbd:focus, label:focus, legend:focus, li:focus, link:focus, menu:focus, +object:focus, ol:focus, p:focus, pre:focus, q:focus, s:focus, samp:focus, +small:focus, span:focus, strike:focus, strong:focus, sub:focus, sup:focus, +table:focus, tbody:focus, td:focus, tfoot:focus, th:focus, thead:focus, +tr:focus, tt:focus, u:focus, ul:focus, var:focus { + /* Don't specify the outline-color, we should always use initial value. 
class lzx_results(Structure):
    # ctypes mirror of the C ``struct lzx_results``; a pointer to it is the
    # second argument of lzx_finish() (see the prototype comment below).
    # Both fields are C ``long`` values.
    _fields_ = [('len_compressed_output', c_long),
                ('len_uncompressed_input', c_long)]
class Compressor(object):
    """Incremental LZX compressor backed by the native liblzxcomp library.

    Data passed to compress() is buffered and compressed one block of
    ``1 << wbits`` bytes at a time.  The native code pulls its input and
    pushes its output through the ctypes callbacks created in __init__.

    Instances are context managers; leaving the ``with`` block calls close().

    NOTE(review): the callback wrappers stored on the instance refer back to
    the instance, so it is part of a reference cycle.  Prefer an explicit
    close() (or ``with``) over relying on __del__.
    """

    def __init__(self, wbits, reset=True):
        """Set up the native compressor.

        wbits -- window-size code handed to lzx_init(); the block size used
                 by this object is ``1 << wbits`` bytes.
        reset -- if true, lzx_reset() is called after every compressed block.

        Raises LzxError if lzx_init() returns a non-zero status.
        """
        # Defined first so that close()/__del__ never hit a missing attribute
        # if a later construction step raises.
        self._lzx = None
        self._reset = reset
        self._blocksize = 1 << wbits
        self._buffered = 0
        self._input = StringIO()
        self._output = StringIO()
        self._flushing = False
        self._rtable = []
        # Replace the bound methods with their C-callable wrappers and keep
        # them on the instance so they stay referenced while the library may
        # still call them.
        self._get_bytes = lzx_get_bytes_t(self._get_bytes)
        self._at_eof = lzx_at_eof_t(self._at_eof)
        self._put_bytes = lzx_put_bytes_t(self._put_bytes)
        self._mark_frame = lzx_mark_frame_t(self._mark_frame)
        self._lzx = POINTER(lzx_data)()
        self._results = lzx_results()
        rv = lzx_init(self._lzx, wbits, self._get_bytes, c_voidp(),
                      self._at_eof, self._put_bytes, c_voidp(),
                      self._mark_frame, c_voidp())
        if rv != 0:
            raise LzxError("lzx_init() failed with %d" % rv)

    def _add_input(self, data):
        """Append *data* to the input buffer and rewind it for reading."""
        self._input.seek(0, 2)
        self._input.write(data)
        self._input.seek(0)
        self._buffered += len(data)

    def _reset_input(self):
        """Drop consumed input, keeping only the bytes not yet read."""
        data = self._input.read()
        self._input.seek(0)
        self._input.truncate()
        self._input.write(data)
        self._input.seek(0)

    def _reset_output(self):
        """Return everything written to the output buffer and empty it."""
        data = self._output.getvalue()
        self._output.seek(0)
        self._output.truncate()
        return data

    def _reset_rtable(self):
        """Return a copy of the recorded frame table and clear it."""
        rtable = list(self._rtable)
        del self._rtable[:]
        return rtable

    def _get_bytes(self, arg, n, buf):
        """Callback: copy up to *n* buffered input bytes into *buf*."""
        data = self._input.read(n)
        memmove(buf, data, len(data))
        self._buffered -= len(data)
        return len(data)

    def _put_bytes(self, arg, n, buf):
        """Callback: collect *n* compressed bytes from *buf*."""
        self._output.write(string_at(buf, n))
        return n

    def _at_eof(self, arg):
        """Callback: report EOF only while flushing with no input left."""
        if self._flushing and self._buffered == 0:
            return 1
        return 0

    def _mark_frame(self, arg, uncomp, comp):
        """Callback: record one (uncomp, comp) pair reported by the library."""
        self._rtable.append((uncomp, comp))
        return

    def _compress_block(self):
        """Compress one block; raise LzxError on failure or if closed."""
        # After close() the handle is gone; passing it on would hand a NULL
        # pointer to the native library.
        if not self._lzx:
            raise LzxError("compressor is closed")
        rv = lzx_compress_block(self._lzx, self._blocksize, 1)
        if rv != 0:
            raise LzxError("lzx_compress_block() failed with %d" % rv)
        if self._reset:
            lzx_reset(self._lzx)

    def _collect(self):
        """Compact the input buffer and return the pending (data, rtable)."""
        self._reset_input()
        data = self._reset_output()
        rtable = self._reset_rtable()
        return (data, rtable)

    def compress(self, data, flush=False):
        """Buffer *data* and compress every complete block.

        With ``flush=True`` a trailing partial block is compressed as well.
        Returns ``(compressed_bytes, rtable)`` where *rtable* is the list of
        (uncomp, comp) pairs recorded since the previous call.
        """
        self._add_input(data)
        self._flushing = flush
        while self._buffered >= self._blocksize:
            self._compress_block()
        if self._buffered > 0 and flush:
            self._compress_block()
        return self._collect()

    def flush(self):
        """Compress any remaining buffered input; same return as compress()."""
        self._flushing = True
        if self._buffered > 0:
            self._compress_block()
        return self._collect()

    def close(self):
        """Release the native compressor.  Safe to call more than once."""
        if getattr(self, '_lzx', None):
            lzx_finish(self._lzx, self._results)
        self._lzx = None

    def __enter__(self):
        return self

    def __exit__(self, *exc_info):
        self.close()

    def __del__(self):
        self.close()
class Metadata(object):
    """The metadata section of an OEB/OPF package.

    Items are grouped in ``self.items`` (a ``defaultdict(list)``) under the
    bare, namespace-free name of their term.  They can be read with item
    access (``metadata['title']``) or attribute access (``metadata.title``).
    """
    TERMS = set(['contributor', 'coverage', 'creator', 'date', 'description',
                 'format', 'identifier', 'language', 'publisher', 'relation',
                 'rights', 'source', 'subject', 'title', 'type'])
    OPF1_NSMAP = {'dc': DC11_NS, 'oebpackage': OPF1_NS}
    OPF2_NSMAP = {'opf': OPF2_NS, 'dc': DC11_NS, 'dcterms': DCTERMS_NS,
                  'xsi': XSI_NS}

    class Item(object):
        """One metadata entry: a term, its value and its attributes."""

        def __init__(self, term, value, fq_attrib=None):
            """Create an item.

            term      -- element tag, or a bare Dublin Core term name from
                         Metadata.TERMS (which is then qualified with DC()).
            value     -- text value.  For an OPF ``meta`` element without
                         text, term and value are taken from its ``name``
                         and ``content`` attributes instead.
            fq_attrib -- mapping of (possibly namespace-qualified) attribute
                         names to values; the caller's mapping is not
                         modified.

            Raises KeyError if a text-less ``meta`` lacks ``name`` or
            ``content``.
            """
            # Private copy: avoids a shared mutable default and lets the
            # ``meta`` branch pop keys without touching the caller's mapping.
            fq_attrib = dict(fq_attrib or {})
            if term == OPF('meta') and not value:
                term = fq_attrib.pop('name')
                value = fq_attrib.pop('content')
            elif term in Metadata.TERMS and not namespace(term):
                term = DC(term)
            self.term = term
            self.value = value
            self.fq_attrib = fq_attrib
            # Same attributes keyed by their bare (namespace-free) names.
            self.attrib = dict((barename(name), val)
                               for name, val in fq_attrib.items())

        def __getattr__(self, name):
            """Expose attributes as ``item.file_as`` for ``file-as`` etc."""
            name = name.replace('_', '-')
            try:
                # Go through __dict__ so a missing ``attrib`` (object not yet
                # initialised) raises AttributeError instead of recursing.
                return self.__dict__['attrib'][name]
            except KeyError:
                raise AttributeError(
                    '%r object has no attribute %r' \
                        % (self.__class__.__name__, name))

        def __repr__(self):
            return 'Item(term=%r, value=%r, attrib=%r)' \
                % (barename(self.term), self.value, self.attrib)

        def __str__(self):
            return str(self.value)

        def __unicode__(self):
            return unicode(self.value)

        def to_opf1(self, dcmeta=None, xmeta=None):
            """Serialise as an OPF 1.x element.

            Dublin Core items become a title-cased DC element under *dcmeta*;
            everything else becomes a ``meta`` element under *xmeta*.
            Returns the new element.
            """
            if namespace(self.term) == DC11_NS:
                name = DC(barename(self.term).title())
                elem = element(dcmeta, name, attrib=self.attrib)
                elem.text = self.value
            else:
                elem = element(xmeta, 'meta', attrib=self.attrib)
                elem.attrib['name'] = self.term
                elem.attrib['content'] = self.value
            return elem

        def to_opf2(self, parent=None):
            """Serialise as an OPF 2.0 element under *parent*; return it."""
            if namespace(self.term) == DC11_NS:
                elem = element(parent, self.term, attrib=self.fq_attrib)
                elem.text = self.value
            else:
                elem = element(parent, OPF('meta'), attrib=self.fq_attrib)
                elem.attrib['name'] = self.term
                elem.attrib['content'] = self.value
            return elem

    def __init__(self, oeb):
        self.oeb = oeb
        self.items = defaultdict(list)

    def add(self, term, value, attrib):
        """Create an Item, file it under its bare term name and return it."""
        item = self.Item(term, value, attrib)
        # Key on the *resolved* term: a <meta name="x" content="y"/> element
        # is filed under "x", not "meta".  to_opf1() depends on this when it
        # looks for an existing 'ms-chaptertour' entry.
        items = self.items[barename(item.term)]
        items.append(item)
        return item

    def iterkeys(self):
        for key in self.items:
            yield key
    __iter__ = iterkeys

    def __getitem__(self, key):
        return self.items[key]

    def __contains__(self, key):
        return key in self.items

    def __getattr__(self, term):
        # Attribute-style access to the item lists, e.g. metadata.identifier.
        return self.items[term]

    def to_opf1(self, parent=None):
        """Build the OPF 1.x ``metadata`` element (dc-metadata + x-metadata).

        An ``ms-chaptertour`` meta is added when none is present.
        """
        elem = element(parent, 'metadata')
        dcmeta = element(elem, 'dc-metadata', nsmap=self.OPF1_NSMAP)
        xmeta = element(elem, 'x-metadata')
        for term in self.items:
            for item in self.items[term]:
                item.to_opf1(dcmeta, xmeta)
        if 'ms-chaptertour' not in self.items:
            chaptertour = self.Item('ms-chaptertour', 'chaptertour')
            chaptertour.to_opf1(dcmeta, xmeta)
        return elem

    def to_opf2(self, parent=None):
        """Build the OPF 2.0 ``metadata`` element."""
        # Was ``self.NSMAP``, which does not exist on this class.
        elem = element(parent, OPF('metadata'), nsmap=self.OPF2_NSMAP)
        for term in self.items:
            for item in self.items[term]:
                item.to_opf2(elem)
        return elem
self.oeb.container.read) + self.items[id] = item + self.hrefs[href] = item + return item + + def remove(self, id): + href = self.items[id].href + del self.items[id] + del self.hrefs[href] + + def __iter__(self): + for id in self.items: + yield id + + def __getitem__(self, id): + return self.items[id] + + def values(self): + for item in self.items.values(): + yield item + + def items(self): + for id, item in self.refs.items(): + yield id, items + + def __contains__(self, key): + return id in self.items + + def to_opf1(self, parent=None): + elem = element(parent, 'manifest') + for item in self.items.values(): + attrib = {'id': item.id, 'href': item.href, + 'media-type': item.media_type} + element(elem, 'item', attrib=attrib) + return elem + + def to_opf2(self, parent=None): + elem = element(parent, OPF('manifest')) + for item in self.items.values(): + attrib = {'id': item.id, 'href': item.href, + 'media-type': item.media_type} + element(elem, OPF('item'), attrib=attrib) + return elem + + +class Spine(object): + def __init__(self, oeb): + self.oeb = oeb + self.items = [] + + def add(self, item, linear): + if isinstance(linear, StringTypes): + linear = linear.lower() + if linear is None or linear in ('yes', 'true'): + linear = True + elif linear in ('no', 'false'): + linear = False + item.linear = linear + item.spine_position = len(self.items) + self.items.append(item) + return item + + def __iter__(self): + for item in self.items: + yield item + + def __getitem__(self, index): + return self.items[index] + + def __len__(self): + return len(self.items) + + def __contains__(self, item): + return (item in self.items) + + def to_opf1(self, parent=None): + elem = element(parent, 'spine') + for item in self.items: + if item.linear: + element(elem, 'itemref', attrib={'idref': item.id}) + return elem + + def to_opf2(self, parent=None): + elem = element(parent, OPF('spine')) + for item in self.items: + attrib = {'idref': item.id} + if not item.linear: + attrib['linear'] = 'no' 
class Guide(object):
    """The OPF ``guide``: a mapping from reference type to Reference."""

    class Reference(object):
        """One guide entry: its type, optional title and target href."""

        def __init__(self, type, title, href):
            self.type = type
            self.title = title
            self.href = href

        def __repr__(self):
            return 'Reference(type=%r, title=%r, href=%r)' \
                % (self.type, self.title, self.href)

    def __init__(self, oeb):
        self.oeb = oeb
        # type -> Reference; a later add() with the same type replaces it.
        self.refs = {}

    def add(self, type, title, href):
        """Create a Reference, store it under *type* and return it."""
        ref = self.Reference(type, title, href)
        self.refs[type] = ref
        return ref

    def by_type(self, type):
        """Return the Reference stored under *type* (KeyError if absent)."""
        # Was ``self.ref_types``, an attribute this class never defines.
        return self.refs[type]

    def iterkeys(self):
        for type in self.refs:
            yield type
    __iter__ = iterkeys

    def values(self):
        for ref in self.refs.values():
            yield ref

    def items(self):
        for type, ref in self.refs.items():
            yield type, ref

    def __getitem__(self, index):
        return self.refs[index]

    def __contains__(self, key):
        return key in self.refs

    def _to_opf(self, parent, guide_tag, reference_tag):
        """Shared serialiser: one reference element per stored Reference."""
        elem = element(parent, guide_tag)
        for ref in self.refs.values():
            attrib = {'type': ref.type, 'href': ref.href}
            # The title is optional and omitted when empty.
            if ref.title:
                attrib['title'] = ref.title
            element(elem, reference_tag, attrib=attrib)
        return elem

    def to_opf1(self, parent=None):
        """Build the un-namespaced OPF 1.x ``guide`` element."""
        return self._to_opf(parent, 'guide', 'reference')

    def to_opf2(self, parent=None):
        """Build the OPF 2.0 ``guide`` element."""
        return self._to_opf(parent, OPF('guide'), OPF('reference'))
def to_ncx(self, parent, playorder=None, depth=1):
    """Append one NCX ``navPoint`` per child node to *parent*, recursively.

    parent    -- element that receives the navPoints (the navMap at the
                 top level, the enclosing navPoint below that).
    playorder -- one-element list used as a counter shared by the whole
                 recursion; created on the outermost call.
    depth     -- current nesting level, starting at 1.

    Returns *parent*.
    """
    if not playorder:
        playorder = [0]
    for node in self.nodes:
        playorder[0] += 1
        point = etree.SubElement(parent,
            NCX('navPoint'), attrib={'playOrder': str(playorder[0])})
        # Each navPoint carries its *own* class/id.  The previous code read
        # them from ``self`` (the parent), so top-level points never got
        # theirs and nested points inherited their parent's.
        if node.klass:
            point.attrib['class'] = node.klass
        if node.id:
            point.attrib['id'] = node.id
        label = etree.SubElement(point, NCX('navLabel'))
        etree.SubElement(label, NCX('text')).text = node.title
        # Top-level entries link to the whole document (fragment dropped).
        href = node.href if depth > 1 else node.href.split('#', 1)[0]
        etree.SubElement(point, NCX('content'), attrib={'src': href})
        node.to_ncx(point, playorder, depth+1)
    return parent
def _metadata_from_opf(self, opf):
    """Populate ``self.metadata`` from the OPF 2.0 tree *opf*.

    Also sets ``self.uid`` to the identifier item whose ``id`` attribute
    matches the package's ``unique-identifier``.  As before, ``self.uid``
    is left unset when no identifier matches.

    Raises KeyError if the package has no ``unique-identifier`` attribute.
    """
    uid = opf.attrib['unique-identifier']
    self.metadata = metadata = Metadata(self)
    for elem in xpath(opf, '/o2:package/o2:metadata/*'):
        metadata.add(elem.tag, elem.text, elem.attrib)
    for item in metadata.identifier:
        # Identifiers without an ``id`` attribute are legal; a plain
        # ``item.id`` raises AttributeError for them and aborted the scan
        # before the real unique identifier was reached.
        if getattr(item, 'id', None) == uid:
            self.uid = item
            break
def _toc_from_html(self, opf):
    """Build ``self.toc`` from the links in the guide's HTML ``toc`` page.

    Returns False, leaving ``self.toc`` untouched, when the guide has no
    ``toc`` reference or the referenced document is not in the manifest.
    The caller then tries the next TOC source.  Returns True otherwise.
    """
    if 'toc' not in self.guide:
        return False
    itempath, frag = urldefrag(self.guide['toc'].href)
    # A guide entry pointing outside the manifest used to raise KeyError
    # and abort loading; treat it as "no HTML TOC available" instead.
    item = self.manifest.hrefs.get(itempath)
    if item is None:
        return False
    self.toc = toc = Toc()
    html = item.data
    if frag:
        # Restrict the scan to the referenced element when it exists.
        elem = xpath(html, './/*[@id="%s"]' % frag)
        html = elem[0] if elem else html
    titles = defaultdict(list)
    order = []
    for anchor in xpath(html, './/h:a[@href]'):
        href = anchor.attrib['href']
        path, frag = urldefrag(href)
        if not path:
            # Fragment-only link: it points into the TOC document itself.
            href = '#'.join((itempath, frag))
        title = ' '.join(xpath(anchor, './/text()'))
        # Several anchors may share a target; keep first-seen order and
        # merge their texts into a single entry.
        if href not in titles:
            order.append(href)
        titles[href].append(title)
    for href in order:
        toc.add(' '.join(titles[href]), href)
    return True
item.linear: continue + toc.add(title, item.href) + return True + + def _toc_from_opf(self, opf): + if self._toc_from_ncx(opf): return + if self._toc_from_tour(opf): return + if self._toc_from_html(opf): return + self._toc_from_spine(opf) + + def _all_from_opf(self, opf): + self._metadata_from_opf(opf) + self._manifest_from_opf(opf) + self._spine_from_opf(opf) + self._guide_from_opf(opf) + self._toc_from_opf(opf) + + def to_opf1(self): + package = etree.Element('package', + attrib={'unique-identifier': self.uid.id}) + metadata = self.metadata.to_opf1(package) + manifest = self.manifest.to_opf1(package) + spine = self.spine.to_opf1(package) + tours = element(package, 'tours') + tour = element(tours, 'tour', + attrib={'id': 'chaptertour', 'title': 'Chapter Tour'}) + self.toc.to_opf1(tour) + guide = self.guide.to_opf1(package) + return {OPF_MIME: ('content.opf', package)} + + def _generate_ncx_item(self): + id = 'ncx' + index = 0 + while id in self.manifest: + id = 'ncx' + str(index) + index = index + 1 + href = 'toc' + index = 0 + while (href + '.ncx') in self.manifest.hrefs: + href = 'toc' + str(index) + href += '.ncx' + return (id, href) + + def _to_ncx(self): + ncx = etree.Element(NCX('ncx'), attrib={'version': '2005-1'}, + nsmap={None: NCX_NS}) + head = etree.SubElement(ncx, NCX('head')) + etree.SubElement(head, NCX('meta'), + attrib={'name': 'dtb:uid', 'content': unicode(self.uid)}) + etree.SubElement(head, NCX('meta'), + attrib={'name': 'dtb:depth', 'content': str(self.toc.depth())}) + etree.SubElement(head, NCX('meta'), + attrib={'name': 'dtb:totalPageCount', 'content': '0'}) + etree.SubElement(head, NCX('meta'), + attrib={'name': 'dtb:maxPageNumber', 'content': '0'}) + title = etree.SubElement(ncx, NCX('docTitle')) + text = etree.SubElement(title, NCX('text')) + text.text = unicode(self.metadata.title[0]) + navmap = etree.SubElement(ncx, NCX('navMap')) + self.toc.to_ncx(navmap) + return ncx + + def to_opf2(self): + package = etree.Element(OPF('package'), + 
def main(argv=sys.argv):
    """Print the OPF 2.0 package and NCX documents for each path in argv[1:].

    Each argument is loaded as an Oeb and both generated XML documents are
    pretty-printed to standard output.  Always returns 0.
    """
    for arg in argv[1:]:
        oeb = Oeb(arg)
        # to_opf2() maps a MIME type to a (file name, root element) pair, so
        # the element has to be unpacked from each value; serialising the
        # pair itself would fail.
        for _name, doc in oeb.to_opf2().values():
            sys.stdout.write(etree.tostring(doc, pretty_print=True) + '\n')
    return 0
def _split(self, src, dstq, psize):
    """Copy `src` and, recursively, its children into the current output tree.

    src   -- source element being copied.
    dstq  -- stack of destination elements; the copy of `src` is appended
             to dstq[-1], and is itself pushed while its children are
             processed.
    psize -- font size of the parent element; the element's tail text is
             tallied under this size, its own text under its own size.

    A new output document is started before and/or after the element when
    new_page() says the element's style asks for a page break.  Along the
    way the id/name anchors are mapped to the output file they land in
    (self.idmap) and the number of characters seen per font size is
    accumulated in self.fonts.
    """
    style = self.stylizer.style(src)
    if self.new_page(style, 'before'):
        self.new_split(src, dstq)
    attrib = src.attrib
    # Looked up after the possible split so anchors point at the new file.
    name = self.names[-1]
    for aname in ('id', 'name'):
        if aname in attrib:
            self.idmap[attrib[aname]] = name
    text = self.COLLAPSE.sub(' ', src.text or '')
    # The tail is the text *following* the element; it must be read from
    # src.tail (reading src.text here counted the element text twice and
    # ignored the real tail).
    tail = self.COLLAPSE.sub(' ', src.tail or '')
    if text or tail or src.tag.lower() in self.CONTENT_TAGS:
        self.content = True
    size = style['font-size']
    self.fonts[size] += len(text)
    self.fonts[psize] += len(tail)
    new = self.dupsub(dstq[-1], src)
    if len(src) > 0:
        dstq.append(new)
        for child in src:
            self._split(child, dstq, size)
        dstq.pop()
    if self.new_page(style, 'after'):
        self.new_split(src, dstq)
def post_process_links(self, results, prefixes):
    """Re-target in-document fragment links after the document was split.

    results  -- iterable of (name, root) pairs, one per output document.
    prefixes -- mapping with key 'h' holding the XPath prefix ('' or 'h:')
                to put in front of the element name.

    Every <a href> whose target is a fragment of the original file (either
    '#id' or '<original basename>#id') is rewritten to point at the output
    file that now contains that id, as recorded in self.idmap.  Links to
    other files, links without a fragment, and fragments that are not in
    self.idmap are left unchanged.
    """
    basename = os.path.basename(self.path)
    query = '//%(h)sa[@href]' % prefixes
    for name, root in results:
        elements = root.xpath(query, namespaces=XPNSMAP)
        for element in elements:
            href = element.attrib['href']
            if '#' not in href:
                continue
            # Split on the first '#' only: a maxsplit of 2 can yield three
            # parts and break the two-name unpacking.
            fname, frag = href.split('#', 1)
            if fname not in ('', basename):
                continue
            # A dangling fragment must not abort the whole conversion.
            if frag not in self.idmap:
                continue
            element.attrib['href'] = '#'.join((self.idmap[frag], frag))
/usr/bin/python2.5 +# -*- encoding: utf-8 -*- + +from __future__ import with_statement +import sys +import os +import locale +import codecs +import itertools +import types +import re +import copy +import cssutils +from cssutils.css import CSSStyleRule, CSSPageRule, CSSStyleDeclaration, \ + CSSValueList, cssproperties +from lxml import etree +from calibre.ebooks.lit.oeb import XHTML_NS, CSS_MIME, OEB_STYLES, barename +from calibre.resources import html_css + +HTML_CSS_STYLESHEET = cssutils.parseString(html_css) +XHTML_CSS_NAMESPACE = "@namespace url(http://www.w3.org/1999/xhtml);\n" + +INHERITED = set(['azimuth', 'border-collapse', 'border-spacing', + 'caption-side', 'color', 'cursor', 'direction', 'elevation', + 'empty-cells', 'font-family', 'font-size', 'font-style', + 'font-variant', 'font-weight', 'letter-spacing', + 'line-height', 'list-style-image', 'list-style-position', + 'list-style-type', 'orphans', 'page-break-inside', + 'pitch-range', 'pitch', 'quotes', 'richness', 'speak-header', + 'speak-numeral', 'speak-punctuation', 'speak', 'speech-rate', + 'stress', 'text-align', 'text-indent', 'text-transform', + 'visibility', 'voice-family', 'volume', 'white-space', + 'widows', 'word-spacing']) + +DEFAULTS = {'azimuth': 'center', 'background-attachment': 'scroll', + 'background-color': 'transparent', 'background-image': 'none', + 'background-position': '0% 0%', 'background-repeat': 'repeat', + 'border-bottom-color': ':color', 'border-bottom-style': 'none', + 'border-bottom-width': 'medium', 'border-collapse': 'separate', + 'border-left-color': ':color', 'border-left-style': 'none', + 'border-left-width': 'medium', 'border-right-color': ':color', + 'border-right-style': 'none', 'border-right-width': 'medium', + 'border-spacing': 0, 'border-top-color': ':color', + 'border-top-style': 'none', 'border-top-width': 'medium', 'bottom': + 'auto', 'caption-side': 'top', 'clear': 'none', 'clip': 'auto', + 'color': 'black', 'content': 'normal', 'counter-increment': 'none', 
+ 'counter-reset': 'none', 'cue-after': 'none', 'cue-before': 'none', + 'cursor': 'auto', 'direction': 'ltr', 'display': 'inline', + 'elevation': 'level', 'empty-cells': 'show', 'float': 'none', + 'font-family': 'serif', 'font-size': 'medium', 'font-style': + 'normal', 'font-variant': 'normal', 'font-weight': 'normal', + 'height': 'auto', 'left': 'auto', 'letter-spacing': 'normal', + 'line-height': 'normal', 'list-style-image': 'none', + 'list-style-position': 'outside', 'list-style-type': 'disc', + 'margin-bottom': 0, 'margin-left': 0, 'margin-right': 0, + 'margin-top': 0, 'max-height': 'none', 'max-width': 'none', + 'min-height': 0, 'min-width': 0, 'orphans': '2', + 'outline-color': 'invert', 'outline-style': 'none', + 'outline-width': 'medium', 'overflow': 'visible', 'padding-bottom': + 0, 'padding-left': 0, 'padding-right': 0, 'padding-top': 0, + 'page-break-after': 'auto', 'page-break-before': 'auto', + 'page-break-inside': 'auto', 'pause-after': 0, 'pause-before': + 0, 'pitch': 'medium', 'pitch-range': '50', 'play-during': 'auto', + 'position': 'static', 'quotes': u"'“' '”' '‘' '’'", 'richness': + '50', 'right': 'auto', 'speak': 'normal', 'speak-header': 'once', + 'speak-numeral': 'continuous', 'speak-punctuation': 'none', + 'speech-rate': 'medium', 'stress': '50', 'table-layout': 'auto', + 'text-align': 'left', 'text-decoration': 'none', 'text-indent': + 0, 'text-transform': 'none', 'top': 'auto', 'unicode-bidi': + 'normal', 'vertical-align': 'baseline', 'visibility': 'visible', + 'voice-family': 'default', 'volume': 'medium', 'white-space': + 'normal', 'widows': '2', 'width': 'auto', 'word-spacing': 'normal', + 'z-index': 'auto'} + +FONT_SIZE_NAMES = set(['xx-small', 'x-small', 'small', 'medium', 'large', + 'x-large', 'xx-large']) + +FONT_SIZE_LIST = [('xx-small', 1, 6.), + ('x-small', None, 7.), + ('small', 2, 8.), + ('medium', 3, 9.), + ('large', 4, 11.), + ('x-large', 5, 13.), + ('xx-large', 6, 15.), + (None, 7, 17.)] + +FONT_SIZE_BY_NAME = {} 
def __init__(self, tree, path, oeb, page=Profiles.PRS505):
    """Collect and flatten every CSS rule that applies to one document.

    tree -- parsed XHTML document; its /html/head is scanned for <style>
            and <link> elements.
    path -- href of the document; linked stylesheet hrefs are resolved
            against its directory, and inline styles are attributed to
            '<document basename>.css'.
    oeb  -- container whose manifest.hrefs supplies linked stylesheet data.
    page -- page profile stored on self.page.

    The built-in HTML stylesheet always comes first.  Parsed linked
    stylesheets are cached on the class-level STYLESHEETS dict.  The result
    is self.rules (flattened rules, sorted), self.stylesheets (set of the
    hrefs seen) and an empty per-element cache self._styles.
    """
    self.page = page
    base = os.path.dirname(path)
    basename = os.path.basename(path)
    cssname = os.path.splitext(basename)[0] + '.css'
    stylesheets = [HTML_CSS_STYLESHEET]
    head = xpath(tree, '/h:html/h:head')[0]
    for elem in head:
        # Comments and processing instructions have a non-string tag in
        # lxml; they cannot be <style> or <link>, so skip them.
        if not isinstance(elem.tag, basestring):
            continue
        tag = barename(elem.tag)
        if tag == 'style':
            # An empty <style/> has text None; treat it as an empty sheet.
            text = elem.text or ''
            stylesheet = cssutils.parseString(text, href=cssname)
            stylesheets.append(stylesheet)
        elif tag == 'link' \
             and elem.get('rel', 'stylesheet') == 'stylesheet' \
             and elem.get('type', CSS_MIME) in OEB_STYLES:
            href = elem.get('href')
            if href is None:
                # A <link> without href references nothing.
                continue
            sheet_path = os.path.join(base, href)
            sheet_path = os.path.normpath(sheet_path).replace('\\', '/')
            if sheet_path in self.STYLESHEETS:
                stylesheet = self.STYLESHEETS[sheet_path]
            else:
                data = XHTML_CSS_NAMESPACE
                data += oeb.manifest.hrefs[sheet_path].data
                stylesheet = cssutils.parseString(data, href=sheet_path)
                self.STYLESHEETS[sheet_path] = stylesheet
            stylesheets.append(stylesheet)
    rules = []
    # Running rule counter; flatten_rule appends it to each selector's
    # specificity so that later rules sort after earlier ones.
    index = 0
    self.stylesheets = set()
    for stylesheet in stylesheets:
        href = stylesheet.href
        self.stylesheets.add(href)
        for rule in stylesheet.cssRules:
            rules.extend(self.flatten_rule(rule, href, index))
            index = index + 1
    rules.sort()
    self.rules = rules
    self._styles = {}
def _normalize_font(self, cssvalue):
    """Expand a `font` shorthand value into its six longhand properties.

    cssvalue -- the parsed value of the `font` property; its cssText is
                inspected, and unless that is 'inherit' it is iterated to
                obtain the individual components.

    Returns a dict with the keys font-style, font-variant, font-weight,
    font-size, line-height and font-family.  For 'inherit' every key maps
    to 'inherit'.  Otherwise the components are matched, in order, against
    the longhands in that fixed order (a component is accepted by the first
    remaining longhand whose validator in cssproperties.cssvalues accepts
    it); longhands that receive no component get their DEFAULTS entry.
    """
    composition = ('font-style', 'font-variant', 'font-weight',
                   'font-size', 'line-height', 'font-family')
    if cssvalue.cssText == 'inherit':
        return dict((key, 'inherit') for key in composition)
    style = {}
    primitives = [v.cssText for v in cssvalue]
    # Reversed so that pop() hands the components back in source order.
    primitives.reverse()
    if primitives:
        value = primitives.pop()
        for key in composition:
            if cssproperties.cssvalues[key](value):
                style[key] = value
                if not primitives:
                    break
                value = primitives.pop()
    for key in composition:
        if key not in style:
            style[key] = DEFAULTS[key]
    return style
def _unit_convert(self, value, base=None, font=None):
    """Convert a CSS length to a number of points.

    value -- a number (returned unchanged) or a string such as '12pt',
             '1.5em' or '50%'.
    base  -- reference length for percentages; self.width is used when it
             is not given (or is zero).
    font  -- font size for 'em'; self.fontSize is used when it is not
             given (or is zero).

    A string that parses as zero yields 0.0.  A string that is not a number
    followed by one of %, em, px, mm, cm, in, pt, pc (for example 'auto')
    is returned unchanged.
    """
    if isinstance(value, (int, long, float)):
        return value
    try:
        if float(value) == 0:
            return 0.0
    except (TypeError, ValueError):
        # Not a bare number; fall through to the unit parser.
        pass
    # At most one leading minus sign: '--5pt' is not a length.
    m = re.search(
        r"^(-?[0-9]*\.?[0-9]*)\s*(%|em|px|mm|cm|in|pt|pc)$", value)
    if m is None or not m.group(1):
        return value
    try:
        number = float(m.group(1))
    except ValueError:
        # The numeric part was only '-' or '.'; leave the value alone.
        return value
    unit = m.group(2)
    result = value
    if unit == '%':
        base = base or self.width
        result = (number / 100.0) * base
    elif unit == 'px':
        result = number * 72.0 / self._page.dpi
    elif unit == 'in':
        result = number * 72.0
    elif unit == 'pt':
        result = number
    elif unit == 'em':
        font = font or self.fontSize
        result = number * font
    elif unit == 'pc':
        result = number * 12.0
    elif unit == 'mm':
        # 25.4 mm per inch, 72 pt per inch.
        result = number * 72.0 / 25.4
    elif unit == 'cm':
        result = number * 72.0 / 2.54
    return result
def invert_tag_map(tag_map):
    """Invert a (tags, default attrs, per-tag attrs) map for writing.

    tag_map -- a triple (tags, dattrs, tattrs): `tags` is a sequence of tag
               names, `dattrs` a dict of default attributes, and `tattrs` a
               sequence holding, per tag, a dict of attributes or None.

    Returns (tags, tattrs) where `tags` maps each tag name to its index and
    `tattrs` is a list of dicts with keys and values swapped.  Every
    non-empty per-tag dict is additionally updated with the inverted
    default attributes, and slot 0 is replaced by the inverted defaults.
    The input structures are not modified.
    """
    tags, dattrs, tattrs = tag_map
    tag_index = dict((tag, index) for index, tag in enumerate(tags))
    default_attrs = dict((name, code) for code, name in dattrs.items())
    tag_attrs = [dict((name, code) for code, name in (attrs or {}).items())
                 for attrs in tattrs]
    for attrs in tag_attrs:
        if attrs:
            attrs.update(default_attrs)
    tag_attrs[0] = default_attrs
    return tag_index, tag_attrs
def write(self, *values):
    """Append each value to self.buf, UTF-8 encoded.

    Integers are written as the character with that code point; anything
    else is expected to provide encode() and is encoded as given.
    """
    for item in values:
        if not isinstance(item, (int, long)):
            encoded = item.encode('utf-8')
        else:
            encoded = unichr(item).encode('utf-8')
        self.buf.write(encoded)
def _add_folder(self, name, offset=0, size=0):
    """Record a folder entry in the directory listing.

    The name is given a trailing '/' if it lacks one; the entry is created
    with section 0 and the given offset and size.
    """
    folder = name if name.endswith('/') else name + '/'
    entry = DirectoryEntry(folder, 0, offset, size)
    self._directory.append(entry)
def _build_data(self):
    """Add every manifest item under '/data/<item id>'.

    Items whose data is a string are stored as a single file in section 0.
    Any other item is run through ReBinary and stored as a folder holding
    'ahc', 'aht' (both section 0) and 'content' (section 1); its
    page_breaks attribute is taken from the ReBinary result.  Each item's
    size attribute is set to the length of the data written for it.
    """
    self._add_folder('/data')
    for entry in self._oeb.manifest.values():
        target = '/data/' + entry.id
        payload = entry.data
        if isinstance(payload, basestring):
            section = 0
        else:
            self._add_folder(target)
            binary = ReBinary(payload, entry.href, self._oeb)
            for suffix, blob in (('/ahc', binary.ahc), ('/aht', binary.aht)):
                self._add_file(target + suffix, blob, 0)
            entry.page_breaks = binary.page_breaks
            payload = binary.content
            target += '/content'
            section = 1
        self._add_file(target, payload, section)
        entry.size = len(payload)
= entry.name + if (dcount % qrn) == 0: + quickref.append(dchunk.tell()) + dchunk.write(next) + dcount = dcount + 1 + ddata.append((dchunk.getvalue(), quickref, dcount, name)) + cidmax = len(ddata) - 1 + rdcount = 0 + dchunks = [] + dcounts = [] + ichunk = None + if len(ddata) > 1: + ichunk = StringIO() + for cid, (content, quickref, dcount, name) in izip(count(), ddata): + dchunk = StringIO() + prev = cid - 1 if cid > 0 else ULL_NEG1 + next = cid + 1 if cid < cidmax else ULL_NEG1 + rem = DCHUNK_SIZE - (len(content) + 50) + pad = rem - (len(quickref) * 2) + dchunk.write('AOLL') + dchunk.write(pack('