From f740d20f32e9ca2fbedcb2bcff5e7e4d9b5dfcd4 Mon Sep 17 00:00:00 2001 From: "Marshall T. Vandegrift" Date: Sun, 7 Dec 2008 23:53:14 -0500 Subject: [PATCH 01/15] Adding initial LitWriter and oeb2lit code. --- setup.py | 1 + src/calibre/ebooks/lit/html.css | 420 ++++++++++++++++++ src/calibre/ebooks/lit/lzxcomp.py | 176 ++++++++ src/calibre/ebooks/lit/oeb.py | 690 +++++++++++++++++++++++++++++ src/calibre/ebooks/lit/split.py | 149 +++++++ src/calibre/ebooks/lit/stylizer.py | 435 ++++++++++++++++++ src/calibre/ebooks/lit/writer.py | 655 +++++++++++++++++++++++++++ src/calibre/linux.py | 1 + 8 files changed, 2527 insertions(+) create mode 100644 src/calibre/ebooks/lit/html.css create mode 100644 src/calibre/ebooks/lit/lzxcomp.py create mode 100644 src/calibre/ebooks/lit/oeb.py create mode 100644 src/calibre/ebooks/lit/split.py create mode 100644 src/calibre/ebooks/lit/stylizer.py create mode 100644 src/calibre/ebooks/lit/writer.py diff --git a/setup.py b/setup.py index 37d54c4317..aa72b46f00 100644 --- a/setup.py +++ b/setup.py @@ -146,6 +146,7 @@ if __name__ == '__main__': metadata_sqlite = 'library/metadata_sqlite.sql', jquery = 'gui2/viewer/jquery.js', jquery_scrollTo = 'gui2/viewer/jquery_scrollTo.js', + html_css = 'ebooks/lit/html.css', ) DEST = os.path.join('src', APPNAME, 'resources.py') diff --git a/src/calibre/ebooks/lit/html.css b/src/calibre/ebooks/lit/html.css new file mode 100644 index 0000000000..5b75ea6649 --- /dev/null +++ b/src/calibre/ebooks/lit/html.css @@ -0,0 +1,420 @@ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is mozilla.org code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * Blake Ross + * + * Alternatively, the contents of this file may be used under the terms of + * either of the GNU General Public License Version 2 or later (the "GPL"), + * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. 
+ * + * ***** END LICENSE BLOCK ***** */ + +@namespace url(http://www.w3.org/1999/xhtml); /* set default namespace to HTML */ + +/* blocks */ + +html, div, map, dt, isindex, form { + display: block; +} + +body { + display: block; + margin: 8px; +} + +p, dl, multicol { + display: block; + margin: 1em 0; +} + +dd { + display: block; +} + +blockquote { + display: block; + margin: 1em 40px; +} + +address { + display: block; + font-style: italic; +} + +center { + display: block; + text-align: center; +} + +blockquote[type=cite] { + display: block; + margin: 1em 0px; + border-color: blue; + border-width: thin; +} + +span[_moz_quote=true] { + color: blue; +} + +pre[_moz_quote=true] { + color: blue; +} + +h1 { + display: block; + font-size: 2em; + font-weight: bold; + margin: .67em 0; +} + +h2 { + display: block; + font-size: 1.5em; + font-weight: bold; + margin: .83em 0; +} + +h3 { + display: block; + font-size: 1.17em; + font-weight: bold; + margin: 1em 0; +} + +h4 { + display: block; + font-weight: bold; + margin: 1.33em 0; +} + +h5 { + display: block; + font-size: 0.83em; + font-weight: bold; + margin: 1.67em 0; +} + +h6 { + display: block; + font-size: 0.67em; + font-weight: bold; + margin: 2.33em 0; +} + +listing { + display: block; + font-family: monospace; + font-size: medium; + white-space: pre; + margin: 1em 0; +} + +xmp, pre, plaintext { + display: block; + font-family: monospace; + white-space: pre; + margin: 1em 0; +} + +/* tables */ + +table { + display: table; + border-spacing: 2px; + border-collapse: separate; + margin-top: 0; + margin-bottom: 0; + text-indent: 0; +} + +table[align="left"] { + float: left; +} + +table[align="right"] { + float: right; +} + +table[rules]:not([rules="none"]) { + border-collapse: collapse; +} + +/* caption inherits from table not table-outer */ +caption { + display: table-caption; + text-align: center; +} + +table[align="center"] > caption { + margin-left: auto; + margin-right: auto; +} + +table[align="center"] > caption[align="left"] { + margin-right: 0; +} + +table[align="center"] > caption[align="right"] { + margin-left: 0; +} + +tr { + display: table-row; + vertical-align: inherit; +} + +col { + display: table-column; +} + +colgroup { + display: table-column-group; +} + +tbody { + display: table-row-group; + vertical-align: middle; +} + +thead { + display: table-header-group; + vertical-align: middle; +} + +tfoot { + display: table-footer-group; + vertical-align: middle; +} + +/* for XHTML tables without tbody */ +table > tr { + vertical-align: middle; +} + +td { + display: table-cell; + vertical-align: inherit; + text-align: inherit; + padding: 1px; +} + +th { + display: table-cell; + vertical-align: inherit; + font-weight: bold; + padding: 1px; +} + +/* inlines */ + +q:before { + content: open-quote; +} + +q:after { + content: close-quote; +} + +b, strong { + font-weight: bolder; +} + +i, cite, em, var, dfn { + font-style: italic; +} + +tt, code, kbd, samp { + font-family: monospace; +} + +u, ins { + text-decoration: underline; +} + +s, strike, del { + text-decoration: line-through; +} + +blink { + text-decoration: blink; +} + +big { + font-size: larger; +} + +small { + font-size: smaller; +} + +sub { + vertical-align: sub; + font-size: smaller; + line-height: normal; +} + +sup { + vertical-align: super; + font-size: smaller; + line-height: normal; +} + +nobr { + white-space: nowrap; +} + +/* titles */ +abbr[title], acronym[title] { + border-bottom: dotted 1px; +} + +/* lists */ + +ul, menu, dir { + display: block; + list-style-type: disc; + 
margin: 1em 0; +} + +ol { + display: block; + list-style-type: decimal; + margin: 1em 0; +} + +li { + display: list-item; +} + +/* nested lists have no top/bottom margins */ +ul ul, ul ol, ul dir, ul menu, ul dl, +ol ul, ol ol, ol dir, ol menu, ol dl, +dir ul, dir ol, dir dir, dir menu, dir dl, +menu ul, menu ol, menu dir, menu menu, menu dl, +dl ul, dl ol, dl dir, dl menu, dl dl { + margin-top: 0; + margin-bottom: 0; +} + +/* 2 deep unordered lists use a circle */ +ol ul, ul ul, menu ul, dir ul, +ol menu, ul menu, menu menu, dir menu, +ol dir, ul dir, menu dir, dir dir { + list-style-type: circle; +} + +/* 3 deep (or more) unordered lists use a square */ +ol ol ul, ol ul ul, ol menu ul, ol dir ul, +ol ol menu, ol ul menu, ol menu menu, ol dir menu, +ol ol dir, ol ul dir, ol menu dir, ol dir dir, +ul ol ul, ul ul ul, ul menu ul, ul dir ul, +ul ol menu, ul ul menu, ul menu menu, ul dir menu, +ul ol dir, ul ul dir, ul menu dir, ul dir dir, +menu ol ul, menu ul ul, menu menu ul, menu dir ul, +menu ol menu, menu ul menu, menu menu menu, menu dir menu, +menu ol dir, menu ul dir, menu menu dir, menu dir dir, +dir ol ul, dir ul ul, dir menu ul, dir dir ul, +dir ol menu, dir ul menu, dir menu menu, dir dir menu, +dir ol dir, dir ul dir, dir menu dir, dir dir dir { + list-style-type: square; +} + + +/* leafs */ + +/*
noshade and color attributes are handled completely by + * the nsHTMLHRElement attribute mapping code + */ +hr { + display: block; + height: 2px; + border: 1px inset; + margin: 0.5em auto 0.5em auto; + color: gray; +} + +hr[size="1"] { + border-style: solid none none none; +} + +img[usemap], object[usemap] { + color: blue; +} + +frameset { + display: block ! important; + position: static ! important; + float: none ! important; + border: none ! important; +} + +frame { + border: none ! important; +} + +iframe { + border: 2px inset; +} + +noframes { + display: none; +} + +spacer { + position: static ! important; + float: none ! important; +} + +/* focusable content: anything w/ tabindex >=0 is focusable */ +abbr:focus, acronym:focus, address:focus, applet:focus, b:focus, +base:focus, big:focus, blockquote:focus, br:focus, canvas:focus, caption:focus, +center:focus, cite:focus, code:focus, col:focus, colgroup:focus, dd:focus, +del:focus, dfn:focus, dir:focus, div:focus, dl:focus, dt:focus, em:focus, +fieldset:focus, font:focus, form:focus, h1:focus, h2:focus, h3:focus, h4:focus, +h5:focus, h6:focus, hr:focus, i:focus, img:focus, ins:focus, +kbd:focus, label:focus, legend:focus, li:focus, link:focus, menu:focus, +object:focus, ol:focus, p:focus, pre:focus, q:focus, s:focus, samp:focus, +small:focus, span:focus, strike:focus, strong:focus, sub:focus, sup:focus, +table:focus, tbody:focus, td:focus, tfoot:focus, th:focus, thead:focus, +tr:focus, tt:focus, u:focus, ul:focus, var:focus { + /* Don't specify the outline-color, we should always use initial value. */ + outline: 1px dotted; +} + +/* hidden elements */ +area, base, basefont, head, meta, script, style, title, +noembed, param { + display: none; +} + +/* Page breaks at body tags, to help out with LIT-generation */ +body { + page-break-before: always; +} diff --git a/src/calibre/ebooks/lit/lzxcomp.py b/src/calibre/ebooks/lit/lzxcomp.py new file mode 100644 index 0000000000..4f147a90a1 --- /dev/null +++ b/src/calibre/ebooks/lit/lzxcomp.py @@ -0,0 +1,176 @@ +from __future__ import with_statement +import sys +import os +from cStringIO import StringIO +from ctypes import * + +__all__ = ['Compressor'] + +liblzxcomp = cdll.LoadLibrary('liblzxcomp.so') + +class lzx_data(Structure): + pass + +lzx_get_bytes_t = CFUNCTYPE(c_int, c_voidp, c_int, c_voidp) +lzx_put_bytes_t = CFUNCTYPE(c_int, c_voidp, c_int, c_voidp) +lzx_mark_frame_t = CFUNCTYPE(None, c_voidp, c_uint32, c_uint32) +lzx_at_eof_t = CFUNCTYPE(c_int, c_voidp) + +class lzx_results(Structure): + _fields_ = [('len_compressed_output', c_long), + ('len_uncompressed_input', c_long)] + +# int lzx_init(struct lzx_data **lzxdp, int wsize_code, +# lzx_get_bytes_t get_bytes, void *get_bytes_arg, +# lzx_at_eof_t at_eof, +# lzx_put_bytes_t put_bytes, void *put_bytes_arg, +# lzx_mark_frame_t mark_frame, void *mark_frame_arg); +lzx_init = liblzxcomp.lzx_init +lzx_init.restype = c_int +lzx_init.argtypes = [POINTER(POINTER(lzx_data)), c_int, + lzx_get_bytes_t, c_voidp, + lzx_at_eof_t, + lzx_put_bytes_t, c_voidp, + lzx_mark_frame_t, c_voidp] + +# void lzx_reset(lzx_data *lzxd); +lzx_reset = liblzxcomp.lzx_reset +lzx_reset.restype = None +lzx_reset.argtypes = [POINTER(lzx_data)] + +# int lzx_compress_block(lzx_data *lzxd, int block_size, int subdivide); +lzx_compress_block = liblzxcomp.lzx_compress_block +lzx_compress_block.restype = c_int +lzx_compress_block.argtypes = [POINTER(lzx_data), c_int, c_int] + +# int lzx_finish(struct lzx_data *lzxd, struct lzx_results *lzxr); +lzx_finish = liblzxcomp.lzx_finish 
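# Illustrative sketch, independent of liblzxcomp: the binding pattern used
# above -- declare CFUNCTYPE prototypes, wrap Python callables in them, and
# keep references alive for as long as the C side may call back -- is the
# standard ctypes callback recipe.  Shown here against libc's qsort(),
# assuming a libc that ctypes.util.find_library('c') can locate:
import ctypes
import ctypes.util

libc = ctypes.CDLL(ctypes.util.find_library('c'))
# int (*compar)(const void *a, const void *b)
CMPFUNC = ctypes.CFUNCTYPE(ctypes.c_int,
                           ctypes.POINTER(ctypes.c_int),
                           ctypes.POINTER(ctypes.c_int))

def compare_ints(a, b):
    return a[0] - b[0]                  # dereference both pointers and compare

cmp_callback = CMPFUNC(compare_ints)    # keep a reference, as Compressor below
                                        # does for its get/put/eof callbacks
values = (ctypes.c_int * 5)(5, 1, 7, 33, 99)
libc.qsort(values, len(values), ctypes.sizeof(ctypes.c_int), cmp_callback)
assert list(values) == [1, 5, 7, 33, 99]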
+lzx_finish.restype = c_int +lzx_finish.argtypes = [POINTER(lzx_data), POINTER(lzx_results)] + + +class LzxError(Exception): + pass + + +class Compressor(object): + def __init__(self, wbits, reset=True): + self._reset = reset + self._blocksize = 1 << wbits + self._buffered = 0 + self._input = StringIO() + self._output = StringIO() + self._flushing = False + self._rtable = [] + self._get_bytes = lzx_get_bytes_t(self._get_bytes) + self._at_eof = lzx_at_eof_t(self._at_eof) + self._put_bytes = lzx_put_bytes_t(self._put_bytes) + self._mark_frame = lzx_mark_frame_t(self._mark_frame) + self._lzx = POINTER(lzx_data)() + self._results = lzx_results() + rv = lzx_init(self._lzx, wbits, self._get_bytes, c_voidp(), + self._at_eof, self._put_bytes, c_voidp(), + self._mark_frame, c_voidp()) + if rv != 0: + raise LzxError("lzx_init() failed with %d" % rv) + + def _add_input(self, data): + self._input.seek(0, 2) + self._input.write(data) + self._input.seek(0) + self._buffered += len(data) + + def _reset_input(self): + data = self._input.read() + self._input.seek(0) + self._input.truncate() + self._input.write(data) + self._input.seek(0) + + def _reset_output(self): + data = self._output.getvalue() + self._output.seek(0) + self._output.truncate() + return data + + def _reset_rtable(self): + rtable = list(self._rtable) + del self._rtable[:] + return rtable + + def _get_bytes(self, arg, n, buf): + data = self._input.read(n) + memmove(buf, data, len(data)) + self._buffered -= len(data) + return len(data) + + def _put_bytes(self, arg, n, buf): + self._output.write(string_at(buf, n)) + return n + + def _at_eof(self, arg): + if self._flushing and self._buffered == 0: + return 1 + return 0 + + def _mark_frame(self, arg, uncomp, comp): + self._rtable.append((uncomp, comp)) + return + + def _compress_block(self): + rv = lzx_compress_block(self._lzx, self._blocksize, 1) + if rv != 0: + raise LzxError("lzx_compress_block() failed with %d" % rv) + if self._reset: + lzx_reset(self._lzx) + + def compress(self, data, flush=False): + self._add_input(data) + self._flushing = flush + while self._buffered >= self._blocksize: + self._compress_block() + if self._buffered > 0 and flush: + self._compress_block() + self._reset_input() + data = self._reset_output() + rtable = self._reset_rtable() + return (data, rtable) + + def flush(self): + self._flushing = True + if self._buffered > 0: + self._compress_block() + self._reset_input() + data = self._reset_output() + rtable = self._reset_rtable() + return (data, rtable) + + def close(self): + if self._lzx: + lzx_finish(self._lzx, self._results) + self._lzx = None + pass + + def __enter__(self): + return self + + def __exit__(self, *exc_info): + self.close() + + def __del__(self): + self.close() + + +def main(argv=sys.argv): + wbits, inf, outf = argv[1:] + with open(inf, 'rb') as f: + data = f.read() + with Compressor(int(wbits)) as lzx: + data, rtable = lzx.compress(data, flush=True) + print rtable + with open(outf, 'wb') as f: + f.write(data) + return 0 + +if __name__ == '__main__': + sys.exit(main()) diff --git a/src/calibre/ebooks/lit/oeb.py b/src/calibre/ebooks/lit/oeb.py new file mode 100644 index 0000000000..a4ad927fed --- /dev/null +++ b/src/calibre/ebooks/lit/oeb.py @@ -0,0 +1,690 @@ +from __future__ import with_statement +import os +import sys +from collections import defaultdict +from types import StringTypes +from itertools import izip, count +from urlparse import urldefrag +from lxml import etree + +XML_PARSER = etree.XMLParser( + remove_blank_text=True, recover=True, 
resolve_entities=False) +XHTML_NS = 'http://www.w3.org/1999/xhtml' +OPF1_NS = 'http://openebook.org/namespaces/oeb-package/1.0/' +OPF2_NS = 'http://www.idpf.org/2007/opf' +DC09_NS = 'http://purl.org/metadata/dublin_core' +DC10_NS = 'http://purl.org/dc/elements/1.0/' +DC11_NS = 'http://purl.org/dc/elements/1.1/' +XSI_NS = 'http://www.w3.org/2001/XMLSchema-instance' +DCTERMS_NS = 'http://purl.org/dc/terms/' +NCX_NS = 'http://www.daisy.org/z3986/2005/ncx/' +XPNSMAP = {'h': XHTML_NS, 'o1': OPF1_NS, 'o2': OPF2_NS, + 'd09': DC09_NS, 'd10': DC10_NS, 'd11': DC11_NS, + 'xsi': XSI_NS, 'dt': DCTERMS_NS, 'ncx': NCX_NS} + +def XHTML(name): return '{%s}%s' % (XHTML_NS, name) +def OPF(name): return '{%s}%s' % (OPF2_NS, name) +def DC(name): return '{%s}%s' % (DC11_NS, name) +def NCX(name): return '{%s}%s' % (NCX_NS, name) + +XHTML_MIME = 'application/xhtml+xml' +CSS_MIME = 'text/css' +NCX_MIME = 'application/x-dtbncx+xml' +OPF_MIME = 'application/oebps-package+xml' + +OEB_STYLES = set([CSS_MIME, 'text/x-oeb1-css', 'text/x-oeb-css']) +OEB_DOCS = set([XHTML_MIME, 'text/html', 'text/x-oeb1-document', + 'text/x-oeb-document']) + + +def element(parent, *args, **kwargs): + if parent is not None: + return etree.SubElement(parent, *args, **kwargs) + return etree.Element(*args, **kwargs) + +def namespace(name): + if '}' in name: + return name.split('}', 1)[0][1:] + return '' + +def barename(name): + if '}' in name: + return name.split('}', 1)[1] + return name + +def xpath(elem, expr): + return elem.xpath(expr, namespaces=XPNSMAP) + + +class AbstractContainer(object): + def read_xml(self, path): + return etree.fromstring( + self.read(path), parser=XML_PARSER, + base_url=os.path.dirname(path)) + +class DirContainer(AbstractContainer): + def __init__(self, rootdir): + self.rootdir = rootdir + + def read(self, path): + path = os.path.join(self.rootdir, path) + with open(path, 'rb') as f: + return f.read() + + def write(self, path, data): + path = os.path.join(self.rootdir, path) + with open(path, 'wb') as f: + return f.write(data) + + +class Metadata(object): + TERMS = set(['contributor', 'coverage', 'creator', 'date', 'description', + 'format', 'identifier', 'language', 'publisher', 'relation', + 'rights', 'source', 'subject', 'title', 'type']) + OPF1_NSMAP = {'dc': DC11_NS, 'oebpackage': OPF1_NS} + OPF2_NSMAP = {'opf': OPF2_NS, 'dc': DC11_NS, 'dcterms': DCTERMS_NS, + 'xsi': XSI_NS} + + class Item(object): + def __init__(self, term, value, fq_attrib={}): + if term == OPF('meta') and not value: + fq_attrib = dict(fq_attrib) + term = fq_attrib.pop('name') + value = fq_attrib.pop('content') + elif term in Metadata.TERMS and not namespace(term): + term = DC(term) + self.term = term + self.value = value + self.fq_attrib = dict(fq_attrib) + self.attrib = attrib = {} + for fq_attr in fq_attrib: + attr = barename(fq_attr) + attrib[attr] = fq_attrib[fq_attr] + + def __getattr__(self, name): + name = name.replace('_', '-') + try: + return self.attrib[name] + except KeyError: + raise AttributeError( + '%r object has no attribute %r' \ + % (self.__class__.__name__, name)) + + def __repr__(self): + return 'Item(term=%r, value=%r, attrib=%r)' \ + % (barename(self.term), self.value, self.attrib) + + def __str__(self): + return str(self.value) + + def __unicode__(self): + return unicode(self.value) + + def to_opf1(self, dcmeta=None, xmeta=None): + if namespace(self.term) == DC11_NS: + name = DC(barename(self.term).title()) + elem = element(dcmeta, name, attrib=self.attrib) + elem.text = self.value + else: + elem = element(xmeta, 
'meta', attrib=self.attrib) + elem.attrib['name'] = self.term + elem.attrib['content'] = self.value + return elem + + def to_opf2(self, parent=None): + if namespace(self.term) == DC11_NS: + elem = element(parent, self.term, attrib=self.fq_attrib) + elem.text = self.value + else: + elem = element(parent, OPF('meta'), attrib=self.fq_attrib) + elem.attrib['name'] = self.term + elem.attrib['content'] = self.value + return elem + + def __init__(self, oeb): + self.oeb = oeb + self.items = defaultdict(list) + + def add(self, term, value, attrib): + item = self.Item(term, value, attrib) + items = self.items[barename(term)] + items.append(item) + return item + + def iterkeys(self): + for key in self.items: + yield key + __iter__ = iterkeys + + def __getitem__(self, key): + return self.items[key] + + def __contains__(self, key): + return key in self.items + + def __getattr__(self, term): + return self.items[term] + + def to_opf1(self, parent=None): + elem = element(parent, 'metadata') + dcmeta = element(elem, 'dc-metadata', nsmap=self.OPF1_NSMAP) + xmeta = element(elem, 'x-metadata') + for term in self.items: + for item in self.items[term]: + item.to_opf1(dcmeta, xmeta) + if 'ms-chaptertour' not in self.items: + chaptertour = self.Item('ms-chaptertour', 'chaptertour') + chaptertour.to_opf1(dcmeta, xmeta) + return elem + + def to_opf2(self, parent=None): + elem = element(parent, OPF('metadata'), nsmap=self.NSMAP) + for term in self.items: + for item in self.items[term]: + item.to_opf2(elem) + return elem + + +class Manifest(object): + class Item(object): + def __init__(self, id, href, media_type, loader=str): + self.id = id + self.href = self.path = href.replace('%20', ' ') + self.media_type = media_type + self.spine_position = None + self.linear = True + self._loader = loader + self._data = None + + def __repr__(self): + return 'Item(id=%r, href=%r, media_type=%r)' \ + % (self.id, self.href, self.media_type) + + def data(): + def fget(self): + if self._data: + return self._data + data = self._loader(self.href) + if self.media_type == XHTML_MIME: + data = etree.fromstring(data, parser=XML_PARSER) + if namespace(data.tag) != XHTML_NS: + data.attrib['xmlns'] = XHTML_NS + data = etree.tostring(data) + data = etree.fromstring(data, parser=XML_PARSER) + elif self.media_type.startswith('application/') \ + and self.media_type.endswith('+xml'): + data = etree.fromstring(data, parser=XML_PARSER) + return data + def fset(self, value): + self._data = value + def fdel(self): + self._data = None + return property(fget, fset, fdel) + data = data() + + def __cmp__(self, other): + result = cmp(self.spine_position, other.spine_position) + if result != 0: + return result + return cmp(self.id, other.id) + + def __init__(self, oeb): + self.oeb = oeb + self.items = {} + self.hrefs = {} + + def add(self, id, href, media_type): + item = self.Item(id, href, media_type, self.oeb.container.read) + self.items[id] = item + self.hrefs[href] = item + return item + + def remove(self, id): + href = self.items[id].href + del self.items[id] + del self.hrefs[href] + + def __iter__(self): + for id in self.items: + yield id + + def __getitem__(self, id): + return self.items[id] + + def values(self): + for item in self.items.values(): + yield item + + def items(self): + for id, item in self.refs.items(): + yield id, items + + def __contains__(self, key): + return id in self.items + + def to_opf1(self, parent=None): + elem = element(parent, 'manifest') + for item in self.items.values(): + attrib = {'id': item.id, 'href': item.href, + 
'media-type': item.media_type} + element(elem, 'item', attrib=attrib) + return elem + + def to_opf2(self, parent=None): + elem = element(parent, OPF('manifest')) + for item in self.items.values(): + attrib = {'id': item.id, 'href': item.href, + 'media-type': item.media_type} + element(elem, OPF('item'), attrib=attrib) + return elem + + +class Spine(object): + def __init__(self, oeb): + self.oeb = oeb + self.items = [] + + def add(self, item, linear): + if isinstance(linear, StringTypes): + linear = linear.lower() + if linear is None or linear in ('yes', 'true'): + linear = True + elif linear in ('no', 'false'): + linear = False + item.linear = linear + item.spine_position = len(self.items) + self.items.append(item) + return item + + def __iter__(self): + for item in self.items: + yield item + + def __getitem__(self, index): + return self.items[index] + + def __len__(self): + return len(self.items) + + def __contains__(self, item): + return (item in self.items) + + def to_opf1(self, parent=None): + elem = element(parent, 'spine') + for item in self.items: + if item.linear: + element(elem, 'itemref', attrib={'idref': item.id}) + return elem + + def to_opf2(self, parent=None): + elem = element(parent, OPF('spine')) + for item in self.items: + attrib = {'idref': item.id} + if not item.linear: + attrib['linear'] = 'no' + element(elem, OPF('itemref'), attrib=attrib) + return elem + + +class Guide(object): + class Reference(object): + def __init__(self, type, title, href): + self.type = type + self.title = title + self.href = href + + def __repr__(self): + return 'Reference(type=%r, title=%r, href=%r)' \ + % (self.type, self.title, self.href) + + def __init__(self, oeb): + self.oeb = oeb + self.refs = {} + + def add(self, type, title, href): + ref = self.Reference(type, title, href) + self.refs[type] = ref + return ref + + def by_type(self, type): + return self.ref_types[type] + + def iterkeys(self): + for type in self.refs: + yield type + __iter__ = iterkeys + + def values(self): + for ref in self.refs.values(): + yield ref + + def items(self): + for type, ref in self.refs.items(): + yield type, ref + + def __getitem__(self, index): + return self.refs[index] + + def __contains__(self, key): + return key in self.refs + + def to_opf1(self, parent=None): + elem = element(parent, 'guide') + for ref in self.refs.values(): + attrib = {'type': ref.type, 'href': ref.href} + if ref.title: + attrib['title'] = ref.title + element(elem, 'reference', attrib=attrib) + return elem + + def to_opf2(self, parent=None): + elem = element(parent, OPF('guide')) + for ref in self.refs.values(): + attrib = {'type': ref.type, 'href': ref.href} + if ref.title: + attrib['title'] = ref.title + element(elem, OPF('reference'), attrib=attrib) + return elem + + +class Toc(object): + def __init__(self, title=None, href=None, klass=None, id=None): + self.title = title + self.href = href + self.klass = klass + self.id = id + self.nodes = [] + + def add(self, title, href, klass=None, id=None): + node = Toc(title, href, klass, id) + self.nodes.append(node) + return node + + def __iter__(self): + for node in self.nodes: + yield node + + def __getitem__(self, index): + return self.nodes[index] + + def depth(self, level=0): + if self.nodes: + return self.nodes[0].depth(level+1) + return level + + def to_opf1(self, tour): + for node in self.nodes: + element(tour, 'site', + attrib={'title': node.title, 'href': node.href}) + node.to_opf1(tour) + return tour + + def to_ncx(self, parent, playorder=None, depth=1): + if not playorder: 
playorder = [0] + for node in self.nodes: + playorder[0] += 1 + point = etree.SubElement(parent, + NCX('navPoint'), attrib={'playOrder': str(playorder[0])}) + if self.klass: + point.attrib['class'] = self.klass + if self.id: + point.attrib['id'] = self.id + label = etree.SubElement(point, NCX('navLabel')) + etree.SubElement(label, NCX('text')).text = node.title + href = node.href if depth > 1 else node.href.split('#', 1)[0] + etree.SubElement(point, NCX('content'), attrib={'src': href}) + node.to_ncx(point, playorder, depth+1) + return parent + + +class Oeb(object): + def __init__(self, opfpath, container=None): + if not container: + container = DirContainer(os.path.dirname(opfpath)) + opfpath = os.path.basename(opfpath) + self.container = container + opf = self._read_opf(opfpath) + self._all_from_opf(opf) + + def _convert_opf1(self, opf): + nroot = etree.Element(OPF('package'), + nsmap={None: OPF2_NS}, version="2.0", **dict(opf.attrib)) + metadata = etree.SubElement(nroot, OPF('metadata'), + nsmap={'opf': OPF2_NS, 'dc': DC11_NS, + 'xsi': XSI_NS, 'dcterms': DCTERMS_NS}) + for prefix in ('d11', 'd10', 'd09'): + elements = xpath(opf, 'metadata/dc-metadata/%s:*' % prefix) + if elements: break + for element in elements: + if not element.text: continue + tag = barename(element.tag).lower() + element.tag = '{%s}%s' % (DC11_NS, tag) + for name in element.attrib: + if name in ('role', 'file-as', 'scheme'): + nsname = '{%s}%s' % (OPF2_NS, name) + element.attrib[nsname] = element.attrib[name] + del element.attrib[name] + metadata.append(element) + for element in opf.xpath('metadata/x-metadata/meta'): + metadata.append(element) + for item in opf.xpath('manifest/item'): + media_type = item.attrib['media-type'] + if media_type in OEB_DOCS: + media_type = XHTML_MIME + elif media_type in OEB_STYLES: + media_type = CSS_MIME + item.attrib['media-type'] = media_type + for tag in ('manifest', 'spine', 'tours', 'guide'): + for element in opf.xpath(tag): + nroot.append(element) + return etree.fromstring(etree.tostring(nroot), parser=XML_PARSER) + + def _read_opf(self, opfpath): + opf = self.container.read_xml(opfpath) + version = float(opf.get('version', 1.0)) + if version < 2.0: + opf = self._convert_opf1(opf) + return opf + + def _metadata_from_opf(self, opf): + uid = opf.attrib['unique-identifier'] + self.metadata = metadata = Metadata(self) + for elem in xpath(opf, '/o2:package/o2:metadata/*'): + metadata.add(elem.tag, elem.text, elem.attrib) + for item in metadata.identifier: + if item.id == uid: + self.uid = item + break + + def _manifest_from_opf(self, opf): + self.manifest = manifest = Manifest(self) + for elem in xpath(opf, '/o2:package/o2:manifest/o2:item'): + manifest.add(elem.get('id'), elem.get('href'), + elem.get('media-type')) + + def _spine_from_opf(self, opf): + self.spine = spine = Spine(self) + for elem in xpath(opf, '/o2:package/o2:spine/o2:itemref'): + item = self.manifest[elem.get('idref')] + spine.add(item, elem.get('linear')) + extras = [] + for item in self.manifest.values(): + if item.media_type == XHTML_MIME \ + and item not in spine: + extras.append(item) + extras.sort() + for item in extras: + spine.add(item, False) + + def _guide_from_opf(self, opf): + self.guide = guide = Guide(self) + for elem in xpath(opf, '/o2:package/o2:guide/o2:reference'): + guide.add(elem.get('type'), elem.get('title'), elem.get('href')) + + def _toc_from_navpoint(self, toc, navpoint): + children = xpath(navpoint, 'ncx:navPoint') + for child in children: + title = xpath(child, 
'ncx:navLabel/ncx:text/text()')[0] + href = xpath(child, 'ncx:content/@src')[0] + id = child.get('id') + klass = child.get('class') + node = toc.add(title, href, id=id, klass=klass) + self._toc_from_navpoint(node, child) + + def _toc_from_ncx(self, opf): + result = xpath(opf, '/o2:package/o2:spine/@toc') + if not result: + return False + id = result[0] + ncx = self.manifest[id].data + self.manifest.remove(id) + title = xpath(ncx, 'ncx:docTitle/ncx:text/text()')[0] + self.toc = toc = Toc(title) + navmaps = xpath(ncx, 'ncx:navMap') + for navmap in navmaps: + self._toc_from_navpoint(toc, navmap) + return True + + def _toc_from_tour(self, opf): + result = xpath(opf, '/o2:package/o2:tours/o2:tour') + if not result: + return False + tour = result[0] + self.toc = toc = Toc(tour.get('title')) + sites = xpath(tour, 'o2:site') + for site in sites: + toc.add(site.get('title'), site.get('href')) + return True + + def _toc_from_html(self, opf): + if 'toc' not in self.guide: + return False + self.toc = toc = Toc() + itempath, frag = urldefrag(self.guide['toc'].href) + item = self.manifest.hrefs[itempath] + html = item.data + if frag: + elem = xpath(html, './/*[@id="%s"]' % frag) + html = elem[0] if elem else html + titles = defaultdict(list) + order = [] + for anchor in xpath(html, './/h:a[@href]'): + href = anchor.attrib['href'] + path, frag = urldefrag(href) + if not path: + href = '#'.join((itempath, frag)) + title = ' '.join(xpath(anchor, './/text()')) + if href not in titles: + order.append(href) + titles[href].append(title) + for href in order: + toc.add(' '.join(titles[href]), href) + return True + + def _toc_from_spine(self, opf): + self.toc = toc = Toc() + titles = [] + headers = [] + for item in self.spine: + if not item.linear: continue + html = item.data + title = xpath(html, '/h:html/h:head/h:title/text()') + if title: titles.append(title[0]) + headers.append('(unlabled)') + for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'): + expr = '/h:html/h:body//h:%s[position()=1]/text()' % (tag,) + header = xpath(html, expr) + if header: + headers[-1] = header[0] + break + use = titles + if len(titles) > len(set(titles)): + use = headers + for title, item in izip(use, self.spine): + if not item.linear: continue + toc.add(title, item.href) + return True + + def _toc_from_opf(self, opf): + if self._toc_from_ncx(opf): return + if self._toc_from_tour(opf): return + if self._toc_from_html(opf): return + self._toc_from_spine(opf) + + def _all_from_opf(self, opf): + self._metadata_from_opf(opf) + self._manifest_from_opf(opf) + self._spine_from_opf(opf) + self._guide_from_opf(opf) + self._toc_from_opf(opf) + + def to_opf1(self): + package = etree.Element('package', + attrib={'unique-identifier': self.uid.id}) + metadata = self.metadata.to_opf1(package) + manifest = self.manifest.to_opf1(package) + spine = self.spine.to_opf1(package) + tours = element(package, 'tours') + tour = element(tours, 'tour', + attrib={'id': 'chaptertour', 'title': 'Chapter Tour'}) + self.toc.to_opf1(tour) + guide = self.guide.to_opf1(package) + return {OPF_MIME: ('content.opf', package)} + + def _generate_ncx_item(self): + id = 'ncx' + index = 0 + while id in self.manifest: + id = 'ncx' + str(index) + index = index + 1 + href = 'toc' + index = 0 + while (href + '.ncx') in self.manifest.hrefs: + href = 'toc' + str(index) + href += '.ncx' + return (id, href) + + def _to_ncx(self): + ncx = etree.Element(NCX('ncx'), attrib={'version': '2005-1'}, + nsmap={None: NCX_NS}) + head = etree.SubElement(ncx, NCX('head')) + etree.SubElement(head, 
NCX('meta'), + attrib={'name': 'dtb:uid', 'content': unicode(self.uid)}) + etree.SubElement(head, NCX('meta'), + attrib={'name': 'dtb:depth', 'content': str(self.toc.depth())}) + etree.SubElement(head, NCX('meta'), + attrib={'name': 'dtb:totalPageCount', 'content': '0'}) + etree.SubElement(head, NCX('meta'), + attrib={'name': 'dtb:maxPageNumber', 'content': '0'}) + title = etree.SubElement(ncx, NCX('docTitle')) + text = etree.SubElement(title, NCX('text')) + text.text = unicode(self.metadata.title[0]) + navmap = etree.SubElement(ncx, NCX('navMap')) + self.toc.to_ncx(navmap) + return ncx + + def to_opf2(self): + package = etree.Element(OPF('package'), + attrib={'version': '2.0', 'unique-identifier': self.uid.id}, + nsmap={None: OPF2_NS}) + metadata = self.metadata.to_opf2(package) + manifest = self.manifest.to_opf2(package) + id, href = self._generate_ncx_item() + etree.SubElement(manifest, OPF('item'), + attrib={'id': id, 'href': href, 'media-type': NCX_MIME}) + spine = self.spine.to_opf2(package) + spine.attrib['toc'] = id + guide = self.guide.to_opf2(package) + ncx = self._to_ncx() + return {OPF_MIME: ('content.opf', package), + NCX_MIME: (href, ncx)} + +def main(argv=sys.argv): + for arg in argv[1:]: + oeb = Oeb(arg) + for name, doc in oeb.to_opf2().items(): + print etree.tostring(doc, pretty_print=True) + return 0 + +if __name__ == '__main__': + sys.exit(main()) diff --git a/src/calibre/ebooks/lit/split.py b/src/calibre/ebooks/lit/split.py new file mode 100644 index 0000000000..2083f95016 --- /dev/null +++ b/src/calibre/ebooks/lit/split.py @@ -0,0 +1,149 @@ +#! /usr/bin/python + +from __future__ import with_statement +import sys +import os +import re +import types +import copy +import itertools +from collections import defaultdict +from lxml import etree +from stylizer import Page, Stylizer, Style + +XHTML_NS = 'http://www.w3.org/1999/xhtml' +XPNSMAP = {'h': XHTML_NS,} + +class Splitter(object): + XML_PARSER = etree.XMLParser(remove_blank_text=True) + COLLAPSE = re.compile(r'[ \n\r]+') + CONTENT_TAGS = set(['img', 'object', 'embed']) + for tag in list(CONTENT_TAGS): + CONTENT_TAGS.add('{%s}%s' % (XHTML_NS, tag)) + + def __init__(self, path): + with open(path, 'rb') as f: + self.tree = etree.parse(f, parser=self.XML_PARSER) + self.stylizer = Stylizer(self.tree, path) + self.path = path + self.basename = os.path.splitext( + os.path.basename(path))[0].lower() + self.splits = [] + self.names = [] + self.idmap = {} + self.fonts = defaultdict(int) + self.content = False + + def split(self): + tree = self.tree + for prefix in ('', 'h:'): + d = {'h': prefix} + roots = tree.xpath('/%(h)shtml' % d, namespaces=XPNSMAP) + if roots: break + self.root, = roots + self.head, = tree.xpath('/%(h)shtml/%(h)shead' % d, namespaces=XPNSMAP) + body, = tree.xpath('/%(h)shtml/%(h)sbody' % d, namespaces=XPNSMAP) + self._split(body, [self.new_root(str(self.basename))], 9.0) + results = zip(self.names, self.splits) + self.post_process_links(results, d) + return results + + def new_root(self, name): + nroot = self.dup(self.root) + nroot.append(copy.deepcopy(self.head)) + self.splits.append(nroot) + self.names.append(name + '.html') + return nroot + + def dup(self, e): + new = etree.Element(e.tag, nsmap=e.nsmap, **dict(e.attrib)) + new.text = e.text + new.tail = e.tail + return new + + def dupsub(self, p, e): + new = etree.SubElement(p, e.tag, nsmap=e.nsmap, **dict(e.attrib)) + new.text = e.text + new.tail = e.tail + return new + + def _split(self, src, dstq, psize): + style = self.stylizer.style(src) + if 
self.new_page(style, 'before'): + self.new_split(src, dstq) + attrib = src.attrib + name = self.names[-1] + for aname in ('id', 'name'): + if aname in attrib: + self.idmap[attrib[aname]] = name + text = self.COLLAPSE.sub(' ', src.text or '') + tail = self.COLLAPSE.sub(' ', src.text or '') + if text or tail or src.tag.lower() in self.CONTENT_TAGS: + self.content = True + size = style['font-size'] + self.fonts[size] += len(text) + self.fonts[psize] += len(tail) + new = self.dupsub(dstq[-1], src) + if len(src) > 0: + dstq.append(new) + for child in src: + self._split(child, dstq, size) + dstq.pop() + if self.new_page(style, 'after'): + self.new_split(src, dstq) + + def new_page(self, style, when): + if self.content \ + and (style['page-break-%s' % when] \ + in ('always', 'odd', 'even')): + return True + return False + + def new_split(self, src, dstq): + name = self.basename + attrib = src.attrib + if 'class' in attrib: + name = src.attrib['class'] + if ' ' in name: + name = name.split(' ', 2)[0] + if 'id' in attrib: + name = '%s-%s' % (name, attrib['id']) + name = name.lower().replace('_', '-') + if (name + '.html') in self.names: + name = '%s-%02d' % (name, len(self.names)) + prev = None + for i in xrange(len(dstq)): + new = self.new_root(name) if prev is None \ + else self.dupsub(prev, dstq[i]) + prev = dstq[i] = new + self.content = False + + def post_process_links(self, results, prefixes): + basename = os.path.basename(self.path) + query = '//%(h)sa[@href]' % prefixes + for name, root in results: + elements = root.xpath(query, namespaces=XPNSMAP) + for element in elements: + href = element.attrib['href'] + if '#' not in href: continue + fname, id = href.split('#', 2) + if fname in ('', basename): + href = '#'.join((self.idmap[id], id)) + element.attrib['href'] = href + +def main(): + def xml2str(root): + return etree.tostring(root, pretty_print=True, + encoding='utf-8', xml_declaration=True) + tree = None + path = sys.argv[1] + dest = sys.argv[2] + splitter = Splitter(path) + for name, root in splitter.split(): + print name + with open(os.path.join(dest, name), 'wb') as f: + f.write(xml2str(root)) + return 0 + +if __name__ == '__main__': + sys.exit(main()) diff --git a/src/calibre/ebooks/lit/stylizer.py b/src/calibre/ebooks/lit/stylizer.py new file mode 100644 index 0000000000..97b7e2d91d --- /dev/null +++ b/src/calibre/ebooks/lit/stylizer.py @@ -0,0 +1,435 @@ +#! 
/usr/bin/python2.5 +# -*- encoding: utf-8 -*- + +from __future__ import with_statement +import sys +import os +import locale +import codecs +import itertools +import types +import re +import copy +import cssutils +from cssutils.css import CSSStyleRule, CSSPageRule, CSSStyleDeclaration, \ + CSSValueList, cssproperties +from lxml import etree +from calibre.ebooks.lit.oeb import XHTML_NS, CSS_MIME, OEB_STYLES, barename +from calibre.resources import html_css + +HTML_CSS_STYLESHEET = cssutils.parseString(html_css) +XHTML_CSS_NAMESPACE = "@namespace url(http://www.w3.org/1999/xhtml);\n" + +INHERITED = set(['azimuth', 'border-collapse', 'border-spacing', + 'caption-side', 'color', 'cursor', 'direction', 'elevation', + 'empty-cells', 'font-family', 'font-size', 'font-style', + 'font-variant', 'font-weight', 'letter-spacing', + 'line-height', 'list-style-image', 'list-style-position', + 'list-style-type', 'orphans', 'page-break-inside', + 'pitch-range', 'pitch', 'quotes', 'richness', 'speak-header', + 'speak-numeral', 'speak-punctuation', 'speak', 'speech-rate', + 'stress', 'text-align', 'text-indent', 'text-transform', + 'visibility', 'voice-family', 'volume', 'white-space', + 'widows', 'word-spacing']) + +DEFAULTS = {'azimuth': 'center', 'background-attachment': 'scroll', + 'background-color': 'transparent', 'background-image': 'none', + 'background-position': '0% 0%', 'background-repeat': 'repeat', + 'border-bottom-color': ':color', 'border-bottom-style': 'none', + 'border-bottom-width': 'medium', 'border-collapse': 'separate', + 'border-left-color': ':color', 'border-left-style': 'none', + 'border-left-width': 'medium', 'border-right-color': ':color', + 'border-right-style': 'none', 'border-right-width': 'medium', + 'border-spacing': 0, 'border-top-color': ':color', + 'border-top-style': 'none', 'border-top-width': 'medium', 'bottom': + 'auto', 'caption-side': 'top', 'clear': 'none', 'clip': 'auto', + 'color': 'black', 'content': 'normal', 'counter-increment': 'none', + 'counter-reset': 'none', 'cue-after': 'none', 'cue-before': 'none', + 'cursor': 'auto', 'direction': 'ltr', 'display': 'inline', + 'elevation': 'level', 'empty-cells': 'show', 'float': 'none', + 'font-family': 'serif', 'font-size': 'medium', 'font-style': + 'normal', 'font-variant': 'normal', 'font-weight': 'normal', + 'height': 'auto', 'left': 'auto', 'letter-spacing': 'normal', + 'line-height': 'normal', 'list-style-image': 'none', + 'list-style-position': 'outside', 'list-style-type': 'disc', + 'margin-bottom': 0, 'margin-left': 0, 'margin-right': 0, + 'margin-top': 0, 'max-height': 'none', 'max-width': 'none', + 'min-height': 0, 'min-width': 0, 'orphans': '2', + 'outline-color': 'invert', 'outline-style': 'none', + 'outline-width': 'medium', 'overflow': 'visible', 'padding-bottom': + 0, 'padding-left': 0, 'padding-right': 0, 'padding-top': 0, + 'page-break-after': 'auto', 'page-break-before': 'auto', + 'page-break-inside': 'auto', 'pause-after': 0, 'pause-before': + 0, 'pitch': 'medium', 'pitch-range': '50', 'play-during': 'auto', + 'position': 'static', 'quotes': u"'“' '”' '‘' '’'", 'richness': + '50', 'right': 'auto', 'speak': 'normal', 'speak-header': 'once', + 'speak-numeral': 'continuous', 'speak-punctuation': 'none', + 'speech-rate': 'medium', 'stress': '50', 'table-layout': 'auto', + 'text-align': 'left', 'text-decoration': 'none', 'text-indent': + 0, 'text-transform': 'none', 'top': 'auto', 'unicode-bidi': + 'normal', 'vertical-align': 'baseline', 'visibility': 'visible', + 'voice-family': 'default', 'volume': 
'medium', 'white-space': + 'normal', 'widows': '2', 'width': 'auto', 'word-spacing': 'normal', + 'z-index': 'auto'} + +FONT_SIZE_NAMES = set(['xx-small', 'x-small', 'small', 'medium', 'large', + 'x-large', 'xx-large']) + +FONT_SIZE_LIST = [('xx-small', 1, 6.), + ('x-small', None, 7.), + ('small', 2, 8.), + ('medium', 3, 9.), + ('large', 4, 11.), + ('x-large', 5, 13.), + ('xx-large', 6, 15.), + (None, 7, 17.)] + +FONT_SIZE_BY_NAME = {} +FONT_SIZE_BY_NUM = {} +for name, num, size in FONT_SIZE_LIST: + FONT_SIZE_BY_NAME[name] = size + FONT_SIZE_BY_NUM[num] = size + +XPNSMAP = {'h': XHTML_NS,} +def xpath(elem, expr): + return elem.xpath(expr, namespaces=XPNSMAP) + + +class Page(object): + def __init__(self, width, height, dpi): + self.width = float(width) + self.height = float(height) + self.dpi = float(dpi) + +class Profiles(object): + PRS500 = Page(584, 754, 168.451) + PRS505 = PRS500 + + +class Stylizer(object): + STYLESHEETS = {} + + def __init__(self, tree, path, oeb, page=Profiles.PRS505): + self.page = page + base = os.path.dirname(path) + basename = os.path.basename(path) + cssname = os.path.splitext(basename)[0] + '.css' + stylesheets = [HTML_CSS_STYLESHEET] + head = xpath(tree, '/h:html/h:head')[0] + for elem in head: + tag = barename(elem.tag) + if tag == 'style': + text = ''.join(elem.text) + stylesheet = cssutils.parseString(text, href=cssname) + stylesheets.append(stylesheet) + elif tag == 'link' \ + and elem.get('rel', 'stylesheet') == 'stylesheet' \ + and elem.get('type', CSS_MIME) in OEB_STYLES: + href = elem.attrib['href'] + path = os.path.join(base, href) + path = os.path.normpath(path).replace('\\', '/') + if path in self.STYLESHEETS: + stylesheet = self.STYLESHEETS[path] + else: + data = XHTML_CSS_NAMESPACE + data += oeb.manifest.hrefs[path].data + stylesheet = cssutils.parseString(data, href=path) + self.STYLESHEETS[path] = stylesheet + stylesheets.append(stylesheet) + rules = [] + index = 0 + self.stylesheets = set() + for stylesheet in stylesheets: + href = stylesheet.href + self.stylesheets.add(href) + for rule in stylesheet.cssRules: + rules.extend(self.flatten_rule(rule, href, index)) + index = index + 1 + rules.sort() + self.rules = rules + self._styles = {} + + def flatten_rule(self, rule, href, index): + results = [] + if isinstance(rule, CSSStyleRule): + style = self.flatten_style(rule.style) + for selector in rule.selectorList: + specificity = selector.specificity + (index,) + text = selector.selectorText + selector = list(selector.seq) + results.append((specificity, selector, style, text, href)) + elif isinstance(rule, CSSPageRule): + style = self.flatten_style(rule.style) + results.append(((0, 0, 0, 0), [], style, '@page', href)) + return results + + def flatten_style(self, cssstyle): + style = {} + for prop in cssstyle: + name = prop.name + if name in ('margin', 'padding'): + style.update(self._normalize_edge(prop.cssValue, name)) + elif name == 'font': + style.update(self._normalize_font(prop.cssValue)) + else: + style[name] = prop.value + if 'font-size' in style: + size = style['font-size'] + if size == 'normal': size = 'medium' + if size in FONT_SIZE_NAMES: + style['font-size'] = "%dpt" % FONT_SIZE_BY_NAME[size] + return style + + def _normalize_edge(self, cssvalue, name): + style = {} + if isinstance(cssvalue, CSSValueList): + primitives = [v.cssText for v in cssvalue] + else: + primitives = [cssvalue.cssText] + if len(primitives) == 1: + value, = primitives + values = [value, value, value, value] + elif len(primitives) == 2: + vert, horiz = primitives + 
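# The surrounding branches implement the standard CSS box-shorthand rule: one
# value applies to all four edges, two values are vertical/horizontal, three
# are top/horizontal/bottom, and four are top/right/bottom/left.  The same
# mapping as a tiny standalone helper (illustrative only; the name is not
# used elsewhere in this module):
def expand_box_shorthand(name, parts):
    top, right, bottom, left = {
        1: lambda p: (p[0], p[0], p[0], p[0]),
        2: lambda p: (p[0], p[1], p[0], p[1]),
        3: lambda p: (p[0], p[1], p[2], p[1]),
        4: lambda p: (p[0], p[1], p[2], p[3]),
    }[min(len(parts), 4)](parts)
    return {name + '-top': top, name + '-right': right,
            name + '-bottom': bottom, name + '-left': left}

assert expand_box_shorthand('margin', ['1em', '2em']) == {
    'margin-top': '1em', 'margin-right': '2em',
    'margin-bottom': '1em', 'margin-left': '2em'}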
values = [vert, horiz, vert, horiz] + elif len(primitives) == 3: + top, horiz, bottom = primitives + values = [top, horiz, bottom, horiz] + else: + values = primitives[:4] + edges = ('top', 'right', 'bottom', 'left') + for edge, value in itertools.izip(edges, values): + style["%s-%s" % (name, edge)] = value + return style + + def _normalize_font(self, cssvalue): + composition = ('font-style', 'font-variant', 'font-weight', + 'font-size', 'line-height', 'font-family') + style = {} + if cssvalue.cssText == 'inherit': + for key in composition: + style[key] = 'inherit' + else: + primitives = [v.cssText for v in cssvalue] + primitites.reverse() + value = primitives.pop() + for key in composition: + if cssproperties.cssvalues[key](value): + style[key] = value + if not primitives: break + value = primitives.pop() + for key in composition: + if key not in style: + style[key] = DEFAULTS[key] + return style + + def style(self, element): + try: return self._styles[element] + except: pass + return Style(element, self) + + def stylesheet(self, name, font_scale=None): + rules = [] + for _, _, style, selector, href in self.rules: + if href != name: continue + if font_scale and 'font-size' in style and \ + style['font-size'].endswith('pt'): + style = copy.copy(style) + size = float(style['font-size'][:-2]) + style['font-size'] = "%.2fpt" % (size * font_scale) + style = ';\n '.join(': '.join(item) for item in style.items()) + rules.append('%s {\n %s;\n}' % (selector, style)) + return '\n'.join(rules) + +class Style(object): + def __init__(self, element, stylizer): + self._element = element + self._page = stylizer.page + self._stylizer = stylizer + self._style = self._assemble_style(element, stylizer) + stylizer._styles[element] = self + + def _assemble_style(self, element, stylizer): + result = {} + rules = stylizer.rules + for _, selector, style, _, _ in rules: + if self._selects_element(element, selector): + result.update(style) + try: + style = CSSStyleDeclaration(element.attrib['style']) + result.update(stylizer.flatten_style(style)) + except KeyError: + pass + return result + + def _selects_element(self, element, selector): + def _selects_element(element, items, index): + if index == -1: + return True + item = items[index] + if item.type == 'universal': + pass + elif item.type == 'type-selector': + name1 = ("{%s}%s" % item.value).lower() + name2 = element.tag.lower() + if name1 != name2: + return False + elif item.type == 'id': + name1 = item.value[1:].lower() + name2 = element.attrib.get('id', '').lower().split() + if name1 != name2: + return False + elif item.type == 'class': + name = item.value[1:].lower() + classes = element.attrib.get('class', '').lower().split() + if name not in classes: + return False + elif item.type == 'child': + parent = element.getparent() + if parent is None: + return False + element = parent + elif item.type == 'descendant': + element = element.getparent() + while element is not None: + if _selects_element(element, items, index - 1): + return True + element = element.getparent() + return False + elif item.type == 'pseudo-class': + if item.value == ':first-child': + e = element.getprevious() + if e is not None: + return False + else: + return False + elif item.type == 'pseudo-element': + return False + else: + return False + return _selects_element(element, items, index - 1) + return _selects_element(element, selector, len(selector) - 1) + + def _has_parent(self): + parent = self._element.getparent() + return (parent is not None) \ + and (parent in self._stylizer._styles) 
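# Cascade sketch: flatten_rule() tags each declaration block with its selector
# specificity plus a stylesheet index, the combined rule list is sorted once,
# and _assemble_style() above then simply update()s a dict while walking the
# matching rules in order, so the most specific / latest-declared rule wins.
# The same mechanism with plain tuples and no cssutils, assuming an element
# that all three selectors match (e.g. <p id="notice" class="warning">):
rules = [
    ((0, 0, 0, 1, 0), 'p',        {'color': 'black', 'margin': '1em'}),
    ((0, 0, 1, 0, 1), '.warning', {'color': 'red'}),
    ((0, 1, 0, 0, 2), '#notice',  {'color': 'blue'}),
]
rules.sort()
computed = {}
for _specificity, _selector, declarations in rules:
    computed.update(declarations)   # later in sort order == higher priority
assert computed == {'color': 'blue', 'margin': '1em'}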
+ + def __getitem__(self, name): + domname = cssproperties._toDOMname(name) + if hasattr(self, domname): + return getattr(self, domname) + return self._unit_convert(self._get(name)) + + def _get(self, name): + result = None + styles = self._stylizer._styles + if name in self._style: + result = self._style[name] + if (result == 'inherit' + or (result is None and name in INHERITED + and self._has_parent())): + result = styles[self._element.getparent()]._get(name) + if result is None: + result = DEFAULTS[name] + return result + + def _unit_convert(self, value, base=None, font=None): + if isinstance(value, (int, long, float)): + return value + try: + if float(value) == 0: + return 0.0 + except: + pass + result = value + m = re.search( + r"^(-*[0-9]*\.?[0-9]*)\s*(%|em|px|mm|cm|in|pt|pc)$", value) + if m is not None and m.group(1): + value = float(m.group(1)) + unit = m.group(2) + if unit == '%': + base = base or self.width + result = (value/100.0) * base + elif unit == 'px': + result = value * 72.0 / self._page.dpi + elif unit == 'in': + result = value * 72.0 + elif unit == 'pt': + result = value + elif unit == 'em': + font = font or self.fontSize + result = value * font + elif unit == 'pc': + result = value * 12.0 + elif unit == 'mm': + result = value * 0.04 + elif unit == 'cm': + result = value * 0.40 + return result + + @property + def fontSize(self): + def normalize_fontsize(value, base=None): + result = None + factor = None + if value == 'inherit': + value = 'medium' + if value in FONT_SIZE_NAMES: + result = FONT_SIZE_BY_NAME[value] + elif value == 'smaller': + factor = 1.0/1.2 + for _, _, size in FONT_SIZE_LIST: + if base <= size: break + factor = None + result = size + elif value == 'larger': + factor = 1.2 + for _, _, size in reversed(FONT_SIZE_LIST): + if base >= size: break + factor = None + result = size + else: + result = self._unit_convert(value, base=base, font=base) + if result < 0: + result = normalize_fontsize("smaller", base) + if factor: + result = factor * base + return result + result = None + if self._has_parent(): + styles = self._stylizer._styles + base = styles[self._element.getparent()].fontSize + else: + base = normalize_fontsize(DEFAULTS['font-size']) + if 'font-size' in self._style: + size = self._style['font-size'] + result = normalize_fontsize(size, base) + else: + result = base + self.__dict__['fontSize'] = result + return result + + @property + def width(self): + result = None + base = None + if self._has_parent(): + styles = self._stylizer._styles + base = styles[self._element.getparent()].width + else: + base = self._page.width + if 'width' in self._style: + width = self._style['width'] + if width == 'auto': + result = base + else: + result = self._unit_convert(width, base=base) + else: + result = base + self.__dict__['width'] = result + return result + + def __str__(self): + items = self._style.items() + return '; '.join("%s: %s" % (key, val) for key, val in items) diff --git a/src/calibre/ebooks/lit/writer.py b/src/calibre/ebooks/lit/writer.py new file mode 100644 index 0000000000..5ed3bdf8ec --- /dev/null +++ b/src/calibre/ebooks/lit/writer.py @@ -0,0 +1,655 @@ +from __future__ import with_statement +import sys +import os +from cStringIO import StringIO +from struct import pack, unpack +from itertools import izip, count +import time +import random +import re +import copy +import uuid +import functools +from lxml import etree +from calibre.ebooks.lit.reader import msguid, DirectoryEntry +import calibre.ebooks.lit.maps as maps +from calibre.ebooks.lit.oeb 
import CSS_MIME, OPF_MIME +from calibre.ebooks.lit.oeb import Oeb, namespace, barename +from calibre.ebooks.lit.stylizer import Stylizer +from calibre.ebooks.lit.lzxcomp import Compressor +import calibre +from calibre import plugins +msdes, msdeserror = plugins['msdes'] +import calibre.ebooks.lit.mssha1 as mssha1 + +__all__ = ['LitWriter'] + +def invert_tag_map(tag_map): + tags, dattrs, tattrs = tag_map + tags = dict((tags[i], i) for i in xrange(len(tags))) + dattrs = dict((v, k) for k, v in dattrs.items()) + tattrs = [dict((v, k) for k, v in (map or {}).items()) for map in tattrs] + for map in tattrs: + if map: map.update(dattrs) + tattrs[0] = dattrs + return tags, tattrs + +OPF_MAP = invert_tag_map(maps.OPF_MAP) +HTML_MAP = invert_tag_map(maps.HTML_MAP) + +LIT_MAGIC = 'ITOLITLS' + +LITFILE_GUID = "{0A9007C1-4076-11D3-8789-0000F8105754}" +PIECE3_GUID = "{0A9007C3-4076-11D3-8789-0000F8105754}" +PIECE4_GUID = "{0A9007C4-4076-11D3-8789-0000F8105754}" +DESENCRYPT_GUID = "{67F6E4A2-60BF-11D3-8540-00C04F58C3CF}" +LZXCOMPRESS_GUID = "{0A9007C6-4076-11D3-8789-0000F8105754}" + +def packguid(guid): + values = guid[1:9], guid[10:14], guid[15:19], \ + guid[20:22], guid[22:24], guid[25:27], guid[27:29], \ + guid[29:31], guid[31:33], guid[33:35], guid[35:37] + values = [int(value, 16) for value in values] + return pack(">= 7 + if bytes: + b |= 0x80 + bytes.append(chr(b)) + if value == 0: + break + return ''.join(reversed(bytes)) + +def randbytes(n): + return ''.join(chr(random.randint(0, 255)) for x in xrange(n)) + +class ReBinary(object): + def __init__(self, root, path, oeb, map=HTML_MAP): + self.dir = os.path.dirname(path) + self.manifest = oeb.manifest + self.tags, self.tattrs = map + self.buf = StringIO() + self.anchors = [] + self.page_breaks = [] + self.is_html = is_html = map is HTML_MAP + self.stylizer = Stylizer(root, path, oeb) if is_html else None + self.tree_to_binary(root) + self.content = self.buf.getvalue() + self.ahc = self.build_ahc() + self.aht = self.build_aht() + + def write(self, *values): + for value in values: + if isinstance(value, (int, long)): + value = unichr(value) + self.buf.write(value.encode('utf-8')) + + def tree_to_binary(self, elem, nsrmap={'': None}, parents=[], + inhead=False, preserve=False): + if not isinstance(elem.tag, basestring): + self.write(etree.tostring(elem)) + return + nsrmap = copy.copy(nsrmap) + attrib = dict(elem.attrib) + style = self.stylizer.style(elem) if self.stylizer else None + for key, value in elem.nsmap.items(): + if value not in nsrmap or nsrmap[value] != key: + xmlns = ('xmlns:' + key) if key else 'xmlns' + attrib[xmlns] = value + nsrmap[value] = key + tag = prefixname(elem.tag, nsrmap) + tag_offset = self.buf.tell() + if tag == 'head': + inhead = True + flags = FLAG_OPENING + if not elem.text and len(elem) == 0: + flags |= FLAG_CLOSING + if inhead: + flags |= FLAG_HEAD + if style and style['display'] in ('block', 'table'): + flags |= FLAG_BLOCK + self.write(0, flags) + tattrs = self.tattrs[0] + if tag in self.tags: + index = self.tags[tag] + self.write(index) + if self.tattrs[index]: + tattrs = self.tattrs[index] + else: + self.write(FLAG_CUSTOM, len(tag)+1, tag) + last_break = self.page_breaks[-1][0] if self.page_breaks else None + if style and last_break != tag_offset \ + and style['page-break-before'] not in ('avoid', 'auto'): + self.page_breaks.append((tag_offset, list(parents))) + for attr, value in attrib.items(): + attr = prefixname(attr, nsrmap) + if attr in ('href', 'src'): + path, hash, frag = value.partition('#') + path = 
os.path.join(self.dir, path) + path = os.path.normpath(path) + path = path.replace('\\', '/') + prefix = unichr(3) + if path in self.manifest.hrefs: + prefix = unichr(2) + value = self.manifest.hrefs[path].id + if hash and frag: + value = '#'.join((value, frag)) + value = prefix + value + elif attr in ('id', 'name'): + self.anchors.append((value, tag_offset)) + elif attr.startswith('ms--'): + attr = '%' + attr[4:] + if attr in tattrs: + self.write(tattrs[attr]) + else: + self.write(FLAG_CUSTOM, len(attr)+1, attr) + try: + self.write(ATTR_NUMBER, int(value)+1) + except ValueError: + self.write(len(value)+1, value) + self.write(0) + if elem.text: + text = elem.text + if style and style['white-space'] == 'pre': + preserve = True + if elem.get('xml:space') == 'preserve': + preserve = True + if not preserve: + text = COLLAPSE.sub(' ', text) + self.write(text) + parents.append(tag_offset) + for child in elem: + self.tree_to_binary(child, nsrmap, parents, inhead, preserve) + parents.pop() + if not flags & FLAG_CLOSING: + self.write(0, (flags & ~FLAG_OPENING) | FLAG_CLOSING, 0) + if elem.tail: + tail = elem.tail + if tag != 'pre': + tail = COLLAPSE.sub(' ', tail) + self.write(tail) + if style and style['page-break-after'] not in ('avoid', 'auto'): + self.page_breaks.append((self.buf.tell(), list(parents))) + + def build_ahc(self): + data = StringIO() + data.write(unichr(len(self.anchors)).encode('utf-8')) + for anchor, offset in self.anchors: + data.write(unichr(len(anchor)).encode('utf-8')) + data.write(anchor) + data.write(pack(' 0: + section = self._sections[secnum] + offset = section.tell() + section.write(data) + else: + offset = 0 + self._directory.append( + DirectoryEntry(name, secnum, offset, len(data))) + + def _add_folder(self, name, offset=0, size=0): + if not name.endswith('/'): + name += '/' + self._directory.append( + DirectoryEntry(name, 0, offset, size)) + + def _djoin(self, *names): + return '/'.join(names) + + def _build_sections(self): + self._add_folder('/', ROOT_OFFSET, ROOT_SIZE) + self._build_data() + self._build_manifest() + self._build_page_breaks() + self._build_meta() + self._build_drm_storage() + self._build_version() + self._build_namelist() + self._build_storage() + self._build_transforms() + + def _build_data(self): + self._add_folder('/data') + for item in self._oeb.manifest.values(): + name = '/data/' + item.id + data = item.data + secnum = 0 + if not isinstance(data, basestring): + self._add_folder(name) + rebin = ReBinary(data, item.href, self._oeb) + self._add_file(name + '/ahc', rebin.ahc, 0) + self._add_file(name + '/aht', rebin.aht, 0) + item.page_breaks = rebin.page_breaks + data = rebin.content + name = name + '/content' + secnum = 1 + self._add_file(name, data, secnum) + item.size = len(data) + + def _build_manifest(self): + states = ['linear', 'nonlinear', 'css', 'images'] + manifest = dict((state, []) for state in states) + for item in self._oeb.manifest.values(): + if item.spine_position is not None: + key = 'linear' if item.linear else 'nonlinear' + manifest[key].append(item) + elif item.media_type == CSS_MIME: + manifest['css'].append(item) + else: + manifest['images'].append(item) + data = StringIO() + data.write(pack(' 1: + pb3cur |= 0x2 + bits += 2 + if bits >= 8: + pb3.write(pack(' 0: + data = ("\000" * prepad) + data + prepad = 0 + postpad = 64 - (len(data) % 64) + if postpad < 64: + data = data + ("\000" * postpad) + hash.update(data) + digest = hash.digest() + key = [0] * 8 + for i in xrange(0, len(digest)): + key[i % 8] ^= ord(digest[i]) + 
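# Key-derivation sketch: _calculate_deskey() above hashes the relevant section
# data (with calibre's modified mssha1, padding the input to 64-byte blocks)
# and then folds the 20-byte digest into an 8-byte DES key by XOR-ing digest
# byte i into key[i % 8].  The fold step in isolation, with hashlib.sha1
# standing in for mssha1 purely for illustration:
import hashlib

def fold_digest(digest, keylen=8):
    key = [0] * keylen
    for i in xrange(len(digest)):
        key[i % keylen] ^= ord(digest[i])
    return ''.join(chr(x) for x in key)

assert len(fold_digest(hashlib.sha1('example').digest())) == 8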
return ''.join(chr(x) for x in key) + + def _build_dchunks(self): + ddata = [] + directory = list(self._directory) + directory.sort(cmp=lambda x, y: \ + cmp(x.name.lower(), y.name.lower())) + qrn = 1 + (1 << 2) + dchunk = StringIO() + dcount = 0 + quickref = [] + name = directory[0].name + for entry in directory: + next = ''.join([decint(len(entry.name)), entry.name, + decint(entry.section), decint(entry.offset), + decint(entry.size)]) + usedlen = dchunk.tell() + len(next) + (len(quickref) * 2) + 52 + if usedlen >= DCHUNK_SIZE: + ddata.append((dchunk.getvalue(), quickref, dcount, name)) + dchunk = StringIO() + dcount = 0 + quickref = [] + name = entry.name + if (dcount % qrn) == 0: + quickref.append(dchunk.tell()) + dchunk.write(next) + dcount = dcount + 1 + ddata.append((dchunk.getvalue(), quickref, dcount, name)) + cidmax = len(ddata) - 1 + rdcount = 0 + dchunks = [] + dcounts = [] + ichunk = None + if len(ddata) > 1: + ichunk = StringIO() + for cid, (content, quickref, dcount, name) in izip(count(), ddata): + dchunk = StringIO() + prev = cid - 1 if cid > 0 else ULL_NEG1 + next = cid + 1 if cid < cidmax else ULL_NEG1 + rem = DCHUNK_SIZE - (len(content) + 50) + pad = rem - (len(quickref) * 2) + dchunk.write('AOLL') + dchunk.write(pack(' Date: Tue, 9 Dec 2008 08:02:09 -0500 Subject: [PATCH 02/15] Unify handling of URIs/IRIs, storing in encoded, normalized form. --- src/calibre/ebooks/lit/oeb.py | 64 +++++++++++++++++++++++--------- src/calibre/ebooks/lit/reader.py | 12 +++--- src/calibre/ebooks/lit/writer.py | 16 ++++---- 3 files changed, 62 insertions(+), 30 deletions(-) diff --git a/src/calibre/ebooks/lit/oeb.py b/src/calibre/ebooks/lit/oeb.py index a4ad927fed..d3773a61f1 100644 --- a/src/calibre/ebooks/lit/oeb.py +++ b/src/calibre/ebooks/lit/oeb.py @@ -4,7 +4,8 @@ import sys from collections import defaultdict from types import StringTypes from itertools import izip, count -from urlparse import urldefrag +from urlparse import urldefrag, urlparse, urlunparse +from urllib import unquote as urlunquote from lxml import etree XML_PARSER = etree.XMLParser( @@ -55,6 +56,22 @@ def barename(name): def xpath(elem, expr): return elem.xpath(expr, namespaces=XPNSMAP) +URL_UNSAFE = r"""`!@#$%^&*[](){}?+=;:'",<>\| """ +def urlquote(href): + result = [] + for char in href: + if char in URL_UNSAFE: + char = "%%%02x" % ord(char) + result.append(char) + return ''.join(result) + +def urlnormalize(href): + parts = urlparse(href) + parts = (part.replace('\\', '/') for part in parts) + parts = (urlunquote(part) for part in parts) + parts = (urlquote(part) for part in parts) + return urlunparse(parts) + class AbstractContainer(object): def read_xml(self, path): @@ -68,12 +85,12 @@ class DirContainer(AbstractContainer): def read(self, path): path = os.path.join(self.rootdir, path) - with open(path, 'rb') as f: + with open(urlunquote(path), 'rb') as f: return f.read() def write(self, path, data): path = os.path.join(self.rootdir, path) - with open(path, 'wb') as f: + with open(urlunquote(path), 'wb') as f: return f.write(data) @@ -178,7 +195,7 @@ class Metadata(object): return elem def to_opf2(self, parent=None): - elem = element(parent, OPF('metadata'), nsmap=self.NSMAP) + elem = element(parent, OPF('metadata'), nsmap=self.OPF2_NSMAP) for term in self.items: for item in self.items[term]: item.to_opf2(elem) @@ -189,7 +206,7 @@ class Manifest(object): class Item(object): def __init__(self, id, href, media_type, loader=str): self.id = id - self.href = self.path = href.replace('%20', ' ') + self.href = self.path = 
urlnormalize(href) self.media_type = media_type self.spine_position = None self.linear = True @@ -235,8 +252,8 @@ class Manifest(object): def add(self, id, href, media_type): item = self.Item(id, href, media_type, self.oeb.container.read) - self.items[id] = item - self.hrefs[href] = item + self.items[item.id] = item + self.hrefs[item.href] = item return item def remove(self, id): @@ -331,7 +348,7 @@ class Guide(object): def __init__(self, type, title, href): self.type = type self.title = title - self.href = href + self.href = urlnormalize(href) def __repr__(self): return 'Reference(type=%r, title=%r, href=%r)' \ @@ -390,7 +407,7 @@ class Guide(object): class Toc(object): def __init__(self, title=None, href=None, klass=None, id=None): self.title = title - self.href = href + self.href = urlnormalize(href) if href else href self.klass = klass self.id = id self.nodes = [] @@ -414,8 +431,8 @@ class Toc(object): def to_opf1(self, tour): for node in self.nodes: - element(tour, 'site', - attrib={'title': node.title, 'href': node.href}) + element(tour, 'site', attrib={ + 'title': node.title, 'href': node.href}) node.to_opf1(tour) return tour @@ -431,8 +448,9 @@ class Toc(object): point.attrib['id'] = self.id label = etree.SubElement(point, NCX('navLabel')) etree.SubElement(label, NCX('text')).text = node.title - href = node.href if depth > 1 else node.href.split('#', 1)[0] - etree.SubElement(point, NCX('content'), attrib={'src': href}) + href = node.href if depth > 1 else urldefrag(node.href)[0] + child = etree.SubElement(point, + NCX('content'), attrib={'src': href}) node.to_ncx(point, playorder, depth+1) return parent @@ -490,7 +508,8 @@ class Oeb(object): uid = opf.attrib['unique-identifier'] self.metadata = metadata = Metadata(self) for elem in xpath(opf, '/o2:package/o2:metadata/*'): - metadata.add(elem.tag, elem.text, elem.attrib) + if elem.text or elem.attrib: + metadata.add(elem.tag, elem.text, elem.attrib) for item in metadata.identifier: if item.id == uid: self.uid = item @@ -524,7 +543,7 @@ class Oeb(object): def _toc_from_navpoint(self, toc, navpoint): children = xpath(navpoint, 'ncx:navPoint') for child in children: - title = xpath(child, 'ncx:navLabel/ncx:text/text()')[0] + title = ''.join(xpath(child, 'ncx:navLabel/ncx:text/text()')) href = xpath(child, 'ncx:content/@src')[0] id = child.get('id') klass = child.get('class') @@ -564,8 +583,13 @@ class Oeb(object): item = self.manifest.hrefs[itempath] html = item.data if frag: - elem = xpath(html, './/*[@id="%s"]' % frag) - html = elem[0] if elem else html + elems = xpath(html, './/*[@id="%s"]' % frag) + if not elems: + elems = xpath(html, './/*[@name="%s"]' % frag) + elem = elems[0] if elems else html + while elem != html and not xpath(elem, './/h:a[@href]'): + elem = elem.getparent() + html = elem titles = defaultdict(list) order = [] for anchor in xpath(html, './/h:a[@href]'): @@ -574,6 +598,7 @@ class Oeb(object): if not path: href = '#'.join((itempath, frag)) title = ' '.join(xpath(anchor, './/text()')) + href = urlnormalize(href) if href not in titles: order.append(href) titles[href].append(title) @@ -679,10 +704,13 @@ class Oeb(object): return {OPF_MIME: ('content.opf', package), NCX_MIME: (href, ncx)} + def main(argv=sys.argv): for arg in argv[1:]: oeb = Oeb(arg) - for name, doc in oeb.to_opf2().items(): + for name, doc in oeb.to_opf1().values(): + print etree.tostring(doc, pretty_print=True) + for name, doc in oeb.to_opf2().values(): print etree.tostring(doc, pretty_print=True) return 0 diff --git 
a/src/calibre/ebooks/lit/reader.py b/src/calibre/ebooks/lit/reader.py index 671e48ab76..c04a845d69 100644 --- a/src/calibre/ebooks/lit/reader.py +++ b/src/calibre/ebooks/lit/reader.py @@ -10,10 +10,12 @@ __copyright__ = '2008, Kovid Goyal ' \ import sys, struct, cStringIO, os import functools import re +from urlparse import urldefrag from lxml import etree from calibre.ebooks.lit import LitError from calibre.ebooks.lit.maps import OPF_MAP, HTML_MAP import calibre.ebooks.lit.mssha1 as mssha1 +from calibre.ebooks.lit.oeb import urlnormalize from calibre.ebooks import DRMError from calibre import plugins lzx, lxzerror = plugins['lzx'] @@ -322,12 +324,12 @@ class UnBinary(object): href += c count -= 1 if count == 0: - doc, m, frag = href[1:].partition('#') + doc, frag = urldefrag(href[1:]) path = self.item_path(doc) - if m and frag: - path += m + frag - self.buf.write((u'"%s"' % path).encode( - 'ascii', 'xmlcharrefreplace')) + if frag: + path = '#'.join((path, frag)) + path = urlnormalize(path) + self.buf.write((u'"%s"' % path).encode('utf-8')) state = 'get attr' return index diff --git a/src/calibre/ebooks/lit/writer.py b/src/calibre/ebooks/lit/writer.py index 5ed3bdf8ec..62c3877785 100644 --- a/src/calibre/ebooks/lit/writer.py +++ b/src/calibre/ebooks/lit/writer.py @@ -10,11 +10,14 @@ import re import copy import uuid import functools +from urlparse import urldefrag +from urllib import unquote as urlunquote from lxml import etree from calibre.ebooks.lit.reader import msguid, DirectoryEntry import calibre.ebooks.lit.maps as maps from calibre.ebooks.lit.oeb import CSS_MIME, OPF_MIME -from calibre.ebooks.lit.oeb import Oeb, namespace, barename +from calibre.ebooks.lit.oeb import namespace, barename, urlnormalize +from calibre.ebooks.lit.oeb import Oeb from calibre.ebooks.lit.stylizer import Stylizer from calibre.ebooks.lit.lzxcomp import Compressor import calibre @@ -173,15 +176,13 @@ class ReBinary(object): for attr, value in attrib.items(): attr = prefixname(attr, nsrmap) if attr in ('href', 'src'): - path, hash, frag = value.partition('#') - path = os.path.join(self.dir, path) - path = os.path.normpath(path) - path = path.replace('\\', '/') + value = urlnormalize(value) + path, frag = urldefrag(value) prefix = unichr(3) if path in self.manifest.hrefs: prefix = unichr(2) value = self.manifest.hrefs[path].id - if hash and frag: + if frag: value = '#'.join((value, frag)) value = prefix + value elif attr in ('id', 'name'): @@ -420,7 +421,8 @@ class LitWriter(object): items.sort() data.write(pack(' Date: Tue, 9 Dec 2008 08:54:43 -0500 Subject: [PATCH 03/15] Integrated LZX compression code. 
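This change also drops the old ctypes dependency on an external liblzxcomp.so:
the compressor now lives inside the calibre.plugins['lzx'] extension, which
exposes its entry points (_lzxc_init, _lzxc_reset, _lzxc_compress_block and
_lzxc_finish) as raw function-pointer addresses, and lzxcomp.py rebuilds
Python callables from those addresses with CFUNCTYPE prototypes. A minimal,
self-contained sketch of that binding pattern, using libc's strlen as a
stand-in for the plugin symbols (illustrative only, POSIX assumed):

    from ctypes import CDLL, CFUNCTYPE, cast, c_void_p, c_char_p, c_size_t
    from ctypes.util import find_library

    libc = CDLL(find_library('c'))            # stand-in for the compiled plugin
    addr = cast(libc.strlen, c_void_p).value  # function-pointer address as an int

    strlen_t = CFUNCTYPE(c_size_t, c_char_p)  # declare the C signature up front
    strlen = strlen_t(addr)                   # instantiate a callable at that address

    assert strlen('lzxcomp') == 7

The lzx_init_t/lzx_reset_t/lzx_compress_block_t/lzx_finish_t definitions below
follow the same three steps: declare the prototype, take the exported address,
instantiate the callable.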
--- setup.py | 4 +- src/calibre/ebooks/lit/lzxcomp.py | 34 +- src/calibre/utils/lzx/lzc.c | 389 +++++++ src/calibre/utils/lzx/lzc.h | 60 ++ src/calibre/utils/lzx/lzxc.c | 1259 +++++++++++++++++++++++ src/calibre/utils/lzx/lzxc.h | 57 + src/calibre/utils/lzx/lzxd.c | 2 +- src/calibre/utils/lzx/{lzx.h => lzxd.h} | 0 src/calibre/utils/lzx/lzxmodule.c | 16 +- 9 files changed, 1794 insertions(+), 27 deletions(-) create mode 100644 src/calibre/utils/lzx/lzc.c create mode 100644 src/calibre/utils/lzx/lzc.h create mode 100644 src/calibre/utils/lzx/lzxc.c create mode 100644 src/calibre/utils/lzx/lzxc.h rename src/calibre/utils/lzx/{lzx.h => lzxd.h} (100%) diff --git a/setup.py b/setup.py index aa72b46f00..0465795970 100644 --- a/setup.py +++ b/setup.py @@ -374,7 +374,9 @@ if __name__ == '__main__': ext_modules = [ Extension('calibre.plugins.lzx', sources=['src/calibre/utils/lzx/lzxmodule.c', - 'src/calibre/utils/lzx/lzxd.c'], + 'src/calibre/utils/lzx/lzxd.c', + 'src/calibre/utils/lzx/lzc.c', + 'src/calibre/utils/lzx/lzxc.c'], include_dirs=['src/calibre/utils/lzx']), Extension('calibre.plugins.msdes', diff --git a/src/calibre/ebooks/lit/lzxcomp.py b/src/calibre/ebooks/lit/lzxcomp.py index 4f147a90a1..1a3f944c89 100644 --- a/src/calibre/ebooks/lit/lzxcomp.py +++ b/src/calibre/ebooks/lit/lzxcomp.py @@ -3,11 +3,11 @@ import sys import os from cStringIO import StringIO from ctypes import * +from calibre import plugins +_lzx, LzxError = plugins['lzx'] __all__ = ['Compressor'] -liblzxcomp = cdll.LoadLibrary('liblzxcomp.so') - class lzx_data(Structure): pass @@ -25,32 +25,22 @@ class lzx_results(Structure): # lzx_at_eof_t at_eof, # lzx_put_bytes_t put_bytes, void *put_bytes_arg, # lzx_mark_frame_t mark_frame, void *mark_frame_arg); -lzx_init = liblzxcomp.lzx_init -lzx_init.restype = c_int -lzx_init.argtypes = [POINTER(POINTER(lzx_data)), c_int, - lzx_get_bytes_t, c_voidp, - lzx_at_eof_t, - lzx_put_bytes_t, c_voidp, - lzx_mark_frame_t, c_voidp] +lzx_init_t = CFUNCTYPE( + c_int, POINTER(POINTER(lzx_data)), c_int, lzx_get_bytes_t, c_voidp, + lzx_at_eof_t, lzx_put_bytes_t, c_voidp, lzx_mark_frame_t, c_voidp) +lzx_init = lzx_init_t(_lzx._lzxc_init) # void lzx_reset(lzx_data *lzxd); -lzx_reset = liblzxcomp.lzx_reset -lzx_reset.restype = None -lzx_reset.argtypes = [POINTER(lzx_data)] +lzx_reset_t = CFUNCTYPE(None, POINTER(lzx_data)) +lzx_reset = lzx_reset_t(_lzx._lzxc_reset) # int lzx_compress_block(lzx_data *lzxd, int block_size, int subdivide); -lzx_compress_block = liblzxcomp.lzx_compress_block -lzx_compress_block.restype = c_int -lzx_compress_block.argtypes = [POINTER(lzx_data), c_int, c_int] +lzx_compress_block_t = CFUNCTYPE(c_int, POINTER(lzx_data), c_int, c_int) +lzx_compress_block = lzx_compress_block_t(_lzx._lzxc_compress_block) # int lzx_finish(struct lzx_data *lzxd, struct lzx_results *lzxr); -lzx_finish = liblzxcomp.lzx_finish -lzx_finish.restype = c_int -lzx_finish.argtypes = [POINTER(lzx_data), POINTER(lzx_results)] - - -class LzxError(Exception): - pass +lzx_finish_t = CFUNCTYPE(c_int, POINTER(lzx_data), POINTER(lzx_results)) +lzx_finish = lzx_finish_t(_lzx._lzxc_finish) class Compressor(object): diff --git a/src/calibre/utils/lzx/lzc.c b/src/calibre/utils/lzx/lzc.c new file mode 100644 index 0000000000..4ce6f24227 --- /dev/null +++ b/src/calibre/utils/lzx/lzc.c @@ -0,0 +1,389 @@ +/* + File lz_nonslide.c, part of lzxcomp library + Copyright (C) 2002 Matthew T. 
Russotto + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; version 2.1 only + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +/* + * Document here + */ +#include +#include +#include +#include +#include +#ifdef DEBUG_PERF +#include +#include +#endif +#include + +#define MAX_MATCH 253 +#define MIN_MATCH 2 + +void lz_init(lz_info *lzi, int wsize, int max_dist, + int max_match, int min_match, + int frame_size, + get_chars_t get_chars, + output_match_t output_match, + output_literal_t output_literal, void *user_data) +{ + /* the reason for the separate max_dist value is LZX can't reach the + first three characters in its nominal window. But using a smaller + window results in inefficiency when dealing with reset intervals + which are the length of the nominal window */ + + lzi->wsize = wsize; + if (max_match > wsize) + lzi->max_match = wsize; + else + lzi->max_match = max_match; + + lzi->min_match = min_match; + if (lzi->min_match < 3) lzi->min_match = 3; + + lzi->max_dist = max_dist; + lzi->block_buf_size = wsize + lzi->max_dist; + lzi->block_buf = malloc(lzi->block_buf_size); + lzi->block_bufe = lzi->block_buf + lzi->block_buf_size; + assert(lzi->block_buf != NULL); + + lzi->cur_loc = 0; + lzi->block_loc = 0; + lzi->chars_in_buf = 0; + lzi->eofcount = 0; + lzi->get_chars = get_chars; + lzi->output_match = output_match; + lzi->output_literal = output_literal; + lzi->user_data = user_data; + lzi->frame_size = frame_size; + lzi->lentab = calloc(lzi->block_buf_size + 1, sizeof(int)); + lzi->prevtab = calloc(lzi->block_buf_size + 1, sizeof(u_char *)); + lzi->analysis_valid = 0; +} + +void lz_release(lz_info *lzi) +{ + free(lzi->block_buf); + free(lzi->lentab); + free(lzi->prevtab); +} + +void lz_reset(lz_info *lzi) +{ + int residual = lzi->chars_in_buf - lzi->block_loc; + memmove(lzi->block_buf, lzi->block_buf + lzi->block_loc, residual); + lzi->chars_in_buf = residual; + lzi->block_loc = 0; + lzi->analysis_valid = 0; +} + +#ifdef LZNONSLIDE_MAIN +typedef struct lz_user_data +{ + FILE *infile; + FILE *outfile; + int R0, R1, R2; +} lz_user_data; + +int tmp_get_chars(lz_info *lzi, int n, u_char *buf) +{ + lz_user_data *lzud = (lz_user_data *)lzi->user_data; + return fread(buf, 1, n, lzud->infile); +} + +int tmp_output_match(lz_info *lzi, int match_pos, int match_len) +{ + lz_user_data *lzud = (lz_user_data *)lzi->user_data; + int mod_match_loc; + + mod_match_loc = match_pos; + + fprintf(lzud->outfile, "(%d, %d)(%d)\n", match_pos, match_len, mod_match_loc); + return 0; +} + +void tmp_output_literal(lz_info *lzi, u_char ch) +{ + lz_user_data *lzud = (lz_user_data *)lzi->user_data; + fprintf(lzud->outfile, "'%c'", ch); +} + +int main(int argc, char *argv[]) +{ + int wsize = atoi(argv[1]); + lz_info lzi; + lz_user_data lzu = {stdin, stdout, 1, 1, 1}; + + lz_init(&lzi, wsize, wsize, MAX_MATCH, MIN_MATCH, 8192, tmp_get_chars, tmp_output_match, tmp_output_literal,&lzu); + lz_compress(&lzi); + return 0; +} +#endif + +__inline__ int lz_left_to_process(lz_info *lzi) 
+{ + return lzi->chars_in_buf - lzi->block_loc; +} + +static void +fill_blockbuf(lz_info *lzi, int maxchars) +{ + int toread; + u_char *readhere; + int nread; + + if (lzi->eofcount) return; + maxchars -= lz_left_to_process(lzi); + toread = lzi->block_buf_size - lzi->chars_in_buf; + if (toread > maxchars) toread = maxchars; + readhere = lzi->block_buf + lzi->chars_in_buf; + nread = lzi->get_chars(lzi, toread, readhere); + lzi->chars_in_buf += nread; + if (nread != toread) + lzi->eofcount++; +} + +static void lz_analyze_block(lz_info *lzi) +{ + int *lentab, *lenp; + u_char **prevtab, **prevp; + u_char *bbp, *bbe; + u_char *chartab[256]; + u_char *cursor; + int prevlen; + int ch; + int maxlen; + long wasinc; + int max_dist = lzi->max_dist; +#ifdef DEBUG_ANALYZE_BLOCK + static short n = 0; +#endif +#ifdef DEBUG_PERF + struct rusage innerloop; + struct timeval innertime, tmptime; + struct rusage outerloop; + struct timeval outertime; + struct rusage initialloop; + struct timeval initialtime; + struct rusage totalloop; + struct timeval totaltime; +#endif + +#ifdef DEBUG_ANALYZE_BLOCK + fprintf(stderr, "Analyzing block %d, cur_loc = %06x\n", n, lzi->cur_loc); +#endif + memset(chartab, 0, sizeof(chartab)); + prevtab = prevp = lzi->prevtab; + lentab = lenp = lzi->lentab; + memset(prevtab, 0, sizeof(*prevtab) * lzi->chars_in_buf); + memset(lentab, 0, sizeof(*lentab) * lzi->chars_in_buf); +#ifdef DEBUG_PERF + memset(&innertime, 0, sizeof(innertime)); + memset(&outertime, 0, sizeof(outertime)); + getrusage(RUSAGE_SELF, &initialloop); + totalloop = initialloop; +#endif + bbp = lzi->block_buf; + bbe = bbp + lzi->chars_in_buf; + while (bbp < bbe) { + if (chartab[ch = *bbp]) { + *prevp = chartab[ch]; + *lenp = 1; + } + chartab[ch] = bbp; + bbp++; + prevp++; + lenp++; + } +#ifdef DEBUG_PERF + initialtime = initialloop.ru_utime; + getrusage(RUSAGE_SELF, &initialloop); + timersub(&initialloop.ru_utime, &initialtime, &initialtime); +#endif + wasinc = 1; + for (maxlen = 1; wasinc && (maxlen < lzi->max_match); maxlen++) { +#ifdef DEBUG_PERF + getrusage(RUSAGE_SELF, &outerloop); +#endif + bbp = bbe - maxlen - 1; + lenp = lentab + lzi->chars_in_buf - maxlen - 1; + prevp = prevtab + lzi->chars_in_buf - maxlen - 1; + wasinc = 0; + while (bbp > lzi->block_buf) { + if (*lenp == maxlen) { +#ifdef DEBUG_PERF + getrusage(RUSAGE_SELF, &innerloop); +#endif + ch = bbp[maxlen]; + cursor = *prevp; + while(cursor && ((bbp - cursor) <= max_dist)) { + prevlen = *(cursor - lzi->block_buf + lentab); + if (cursor[maxlen] == ch) { + *prevp = cursor; + (*lenp)++; + wasinc++; + break; + } + if (prevlen != maxlen) break; + cursor = *(cursor - lzi->block_buf + prevtab); + } +#ifdef DEBUG_PERF + tmptime = innerloop.ru_utime; + getrusage(RUSAGE_SELF, &innerloop); + timersub(&innerloop.ru_utime, &tmptime, &tmptime); + timeradd(&tmptime, &innertime, &innertime); +#endif + } + bbp--; + prevp--; + lenp--; + } +#ifdef DEBUG_PERF + tmptime = outerloop.ru_utime; + getrusage(RUSAGE_SELF, &outerloop); + timersub(&outerloop.ru_utime, &tmptime, &tmptime); + timeradd(&tmptime, &outertime, &outertime); +#endif + // fprintf(stderr, "maxlen = %d, wasinc = %ld\n", maxlen, wasinc); + } +#ifdef DEBUG_PERF + totaltime = totalloop.ru_utime; + getrusage(RUSAGE_SELF, &totalloop); + timersub(&totalloop.ru_utime, &totaltime, &totaltime); + fprintf(stderr, "Time spend in initial loop = %f\n", initialtime.tv_sec + initialtime.tv_usec/(double)1E6); + fprintf(stderr, "Time spend in outer loop = %f\n", outertime.tv_sec + outertime.tv_usec/(double)1E6); + 
fprintf(stderr, "Time spend in inner loop = %f\n", innertime.tv_sec + innertime.tv_usec/(double)1E6); + fprintf(stderr, "Time spend in all loops = %f\n", totaltime.tv_sec + totaltime.tv_usec/(double)1E6); +#endif + lzi->analysis_valid = 1; +#ifdef DEBUG_ANALYZE_BLOCK + fprintf(stderr, "Done analyzing block %d, cur_loc = %06x\n", n++, lzi->cur_loc); +#endif +} + +void lz_stop_compressing(lz_info *lzi) +{ + lzi->stop = 1; + /* fprintf(stderr, "Stopping...\n");*/ +} + +int lz_compress(lz_info *lzi, int nchars) +{ + + u_char *bbp, *bbe; + int *lentab, *lenp; + u_char **prevtab, **prevp; + int len; + int holdback; + short trimmed; + + lzi->stop = 0; + while ((lz_left_to_process(lzi) || !lzi->eofcount) && !lzi->stop && nchars > 0) { +#if 1 + if (!lzi->analysis_valid || + (!lzi->eofcount && + ((lzi->chars_in_buf- lzi->block_loc) < nchars))) { + int residual = lzi->chars_in_buf - lzi->block_loc; + int bytes_to_move = lzi->max_dist + residual; + if (bytes_to_move > lzi->chars_in_buf) + bytes_to_move = lzi->chars_in_buf; +#ifdef DEBUG_ANALYZE_BLOCK + fprintf(stderr, "Moving %06x, chars_in_buf %06x, residual = %06x, nchars= %06x block_loc = %06x\n", bytes_to_move, lzi->chars_in_buf, residual, nchars, lzi->block_loc); +#endif + memmove(lzi->block_buf, lzi->block_buf + lzi->chars_in_buf - bytes_to_move, + bytes_to_move); + + lzi->block_loc = bytes_to_move - residual; + lzi->chars_in_buf = bytes_to_move; +#ifdef DEBUG_ANALYZE_BLOCK + fprintf(stderr, "New chars_in_buf %06x, new block_loc = %06x, eof = %1d\n", lzi->chars_in_buf, lzi->block_loc, lzi->eofcount); +#endif + fill_blockbuf(lzi, nchars); +#ifdef DEBUG_ANALYZE_BLOCK + fprintf(stderr, "Really new chars_in_buf %06x, new block_loc = %06x, eof = %1d\n", lzi->chars_in_buf, lzi->block_loc, lzi->eofcount); +#endif + lz_analyze_block(lzi); + } +#else + if (!lzi->analysis_valid || + (lzi->block_loc - lzi->chars_in_buf) == 0) { + lzi->block_loc = 0; + lzi->chars_in_buf = 0; + fill_blockbuf(lzi, nchars); + lz_analyze_block(lzi); + } +#endif + prevtab = prevp = lzi->prevtab + lzi->block_loc; + lentab = lenp = lzi->lentab + lzi->block_loc; + bbp = lzi->block_buf + lzi->block_loc; + holdback = lzi->max_match; + if (lzi->eofcount) holdback = 0; + if (lzi->chars_in_buf < (nchars + lzi->block_loc)) + bbe = lzi->block_buf + lzi->chars_in_buf - holdback; + else + bbe = bbp + nchars; + while ((bbp < bbe) && (!lzi->stop)) { + trimmed = 0; + len = *lenp; + if (lzi->frame_size && (len > (lzi->frame_size - lzi->cur_loc % lzi->frame_size))) { +#ifdef DEBUG_TRIMMING + fprintf(stderr, "Trim for framing: %06x %d %d\n", lzi->cur_loc,len, (lzi->frame_size - lzi->cur_loc % lzi->frame_size)); +#endif + trimmed = 1; + len = (lzi->frame_size - lzi->cur_loc % lzi->frame_size); + } + if (len > nchars) { +#ifdef DEBUG_TRIMMING + fprintf(stderr, "Trim for blocking: %06x %d %d\n", lzi->cur_loc,len, nchars); +#endif + trimmed = 1; + len = nchars; + } + if (len >= lzi->min_match) { +#ifdef LAZY + if ((bbp < bbe -1) && !trimmed && + ((lenp[1] > (len + 1)) /* || ((lenp[1] == len) && (prevp[1] > prevp[0])) */)) { + len = 1; + /* this is the lazy eval case */ + } + else +#endif + if (lzi->output_match(lzi, (*prevp - lzi->block_buf) - lzi->block_loc, + len) < 0) { + // fprintf(stderr, "Match rejected: %06x %d\n", lzi->cur_loc, len); + len = 1; /* match rejected */ + } + } + else + len = 1; + + if (len < lzi->min_match) { + assert(len == 1); + lzi->output_literal(lzi, *bbp); + } + // fprintf(stderr, "len = %3d, *lenp = %3d, cur_loc = %06x, block_loc = %06x\n", len, *lenp, lzi->cur_loc, 
lzi->block_loc); + bbp += len; + prevp += len; + lenp += len; + lzi->cur_loc += len; + lzi->block_loc += len; + assert(nchars >= len); + nchars -= len; + + } + } + return 0; +} diff --git a/src/calibre/utils/lzx/lzc.h b/src/calibre/utils/lzx/lzc.h new file mode 100644 index 0000000000..a721fede60 --- /dev/null +++ b/src/calibre/utils/lzx/lzc.h @@ -0,0 +1,60 @@ +/* + File lz_nonslide.h, part of lzxcomp library + Copyright (C) 2002 Matthew T. Russotto + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; version 2.1 only + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ +typedef struct lz_info lz_info; +typedef int (*get_chars_t)(lz_info *lzi, int n, u_char *buf); +typedef int (*output_match_t)(lz_info *lzi, int match_pos, int match_len); +typedef void (*output_literal_t)(lz_info *lzi, u_char ch); + +struct lz_info +{ + int wsize; /* window size in bytes */ + int max_match; /* size of longest match in bytes */ + int min_match; + u_char *block_buf; + u_char *block_bufe; + int block_buf_size; + int chars_in_buf; + int cur_loc; /* location within stream */ + int block_loc; + int frame_size; + int max_dist; + u_char **prevtab; + int *lentab; + short eofcount; + short stop; + short analysis_valid; + + get_chars_t get_chars; + output_match_t output_match; + output_literal_t output_literal; + void *user_data; +}; + +void lz_init(lz_info *lzi, int wsize, int max_dist, + int max_match, int min_match, + int frame_size, + get_chars_t get_chars, + output_match_t output_match, + output_literal_t output_literal, void *user_data); + +void lz_release(lz_info *lzi); + +void lz_reset(lz_info *lzi); +void lz_stop_compressing(lz_info *lzi); +int lz_left_to_process(lz_info *lzi); /* returns # chars read in but unprocessed */ +int lz_compress(lz_info *lzi, int nchars); diff --git a/src/calibre/utils/lzx/lzxc.c b/src/calibre/utils/lzx/lzxc.c new file mode 100644 index 0000000000..445cf92767 --- /dev/null +++ b/src/calibre/utils/lzx/lzxc.c @@ -0,0 +1,1259 @@ +/* + File lzx_layer.c, part of lzxcomp library + Copyright (C) 2002 Matthew T. Russotto + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; version 2.1 only + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ +#include +#include +#include +#include /* for memset on Linux */ +#include +#include + +#include +#include + +/* Force using (actually working) non-sliding version. 
*/ +#define NONSLIDE + +/* these named constants are from the Microsoft LZX documentation */ +#define MIN_MATCH 2 +#define MAX_MATCH 257 +#define NUM_CHARS 256 +#define NUM_PRIMARY_LENGTHS 7 +#define NUM_SECONDARY_LENGTHS 249 + +/* Debugging defines useful during development. All add diagnostic output + at various points in the system */ + +/*#define DEBUG_MATCHES *//* When matches come in from the LZ engine */ +/*#define DEBUG_MATCHES_2 *//* When matches are being output */ +/*#define DEBUG_HUFFMAN *//* When huffman trees are built */ +/*#define DEBUG_ENTROPY *//* In entropy calculation */ +/*#define DEBUG_LZ *//* Uncompressed input reconstructed from + LZ engine */ +/*#define DEBUG_BITBUF *//* Raw output to upper layer */ +/*#define DEBUG_EXTRA_BITS *//* Savings due to extra bits huffman tree */ +/*#define DEBUG_POSITION_SLOT_LOOKUP */ +/*#define DEBUG_TREE_COMPRESSION *//* During RLE compression of trees */ + +/* number of position slots given window_size-5 */ +/* as corrected by Caie */ +short num_position_slots[] = {30, 32, 34, 36, 38, 42, 50}; +unsigned long position_base[51]; +u_char extra_bits[52]; +double rloge2; + +typedef struct ih_elem { + int freq; + short sym; + short pathlength; + struct ih_elem *parent; + struct ih_elem *left; + struct ih_elem *right; +} ih_elem; + +typedef struct h_elem { + int freq; + short sym; + short pathlength; + struct ih_elem *parent; + unsigned short code; +} h_elem; + +typedef struct huff_entry { + short codelength; + unsigned short code; +} huff_entry; + +static int cmp_leaves(const void *in_a, const void *in_b) +{ + const struct h_elem *a = in_a; + const struct h_elem *b = in_b; + + if (!a->freq && b->freq) + return 1; + if (a->freq && !b->freq) + return -1; + + if (a->freq == b->freq) + return a->sym - b->sym; + + return a->freq - b->freq; +} + +static int +cmp_pathlengths(const void *in_a, const void *in_b) +{ + const struct h_elem *a = in_a; + const struct h_elem *b = in_b; + + if (a->pathlength == b->pathlength) +#if 0 + return a->sym - b->sym; +#else + /* see note on canonical pathlengths */ + return b->sym - a->sym; +#endif + return b->pathlength - a->pathlength; +} + +/* standard huffman building algorithm */ +static void +build_huffman_tree(int nelem, int max_code_length, int *freq, huff_entry *tree) +{ + h_elem *leaves = malloc(nelem * sizeof(h_elem)); + ih_elem *inodes; + ih_elem *next_inode; + ih_elem *cur_inode; + h_elem *cur_leaf; + int leaves_left; + int nleaves; + int pathlength; + unsigned short cur_code; + short codes_too_long = 0; + ih_elem *f1, *f2; + int i; + + for (i = 0; i < nelem; i++) { + leaves[i].freq = freq[i]; + leaves[i].sym = i; + leaves[i].pathlength = 0; + } + qsort(leaves, nelem, sizeof(h_elem), cmp_leaves); + for (leaves_left = 0; leaves_left < nelem; leaves_left++) { +#ifdef DEBUG_HUFFMAN + fprintf(stderr, "%3d: %3d '%c'\n", leaves_left, leaves[leaves_left].freq, + leaves[leaves_left].sym); +#endif + if (!leaves[leaves_left].freq) break; + } + nleaves = leaves_left; + + if (nleaves >= 2) { + inodes = malloc((nelem-1) * sizeof(ih_elem)); + do { + if (codes_too_long) { + for (leaves_left = 0; leaves_left < nelem; leaves_left++) { + if (!leaves[leaves_left].freq) break; + if (leaves[leaves_left].freq != 1) { + leaves[leaves_left].freq >>= 1; + codes_too_long = 0; + } + } + assert (!codes_too_long); + } + + cur_leaf = leaves; + next_inode = cur_inode = inodes; + + do { + f1 = f2 = NULL; + if (leaves_left && + ((cur_inode == next_inode) || + (cur_leaf->freq <= cur_inode->freq))) { + f1 = (ih_elem *)cur_leaf++; + 
leaves_left--; + } + else if (cur_inode != next_inode) { + f1 = cur_inode++; + } + + if (leaves_left && + ((cur_inode == next_inode) || + (cur_leaf->freq <= cur_inode->freq))) { + f2 = (ih_elem *)cur_leaf++; + leaves_left--; + } + else if (cur_inode != next_inode) { + f2 = cur_inode++; + } + +#ifdef DEBUG_HUFFMAN + fprintf(stderr, "%d %d\n", f1, f2); +#endif + if (f1 && f2) { + next_inode->freq = f1->freq + f2->freq; + next_inode->sym = -1; + next_inode->left = f1; + next_inode->right = f2; + next_inode->parent = NULL; + f1->parent = next_inode; + f2->parent = next_inode; + if (f1->pathlength > f2->pathlength) + next_inode->pathlength = f1->pathlength + 1; + else + next_inode->pathlength = f2->pathlength + 1; + if (next_inode->pathlength > max_code_length) { + codes_too_long = 1; + break; + } + next_inode++; + } + } + while (f1 && f2); + } + while (codes_too_long); + +#ifdef DEBUG_HUFFMAN + cur_inode = inodes; + while (cur_inode < next_inode) { + fprintf(stderr, "%d l: %3d%c r: %3d%c freq: %8d\n", + cur_inode - inodes, + (cur_inode->left->sym!=-1)?(((struct h_elem *)cur_inode->left)-leaves):(cur_inode->left-inodes), + (cur_inode->left->sym!=-1)?'l':'i', + (cur_inode->right->sym!=-1)?(((struct h_elem *)cur_inode->right)-leaves):(cur_inode->right-inodes), + (cur_inode->right->sym!=-1)?'l':'i', + (cur_inode->freq) + ); + cur_inode++; + } +#endif + + /* now traverse tree depth-first */ + cur_inode = next_inode - 1; + pathlength = 0; + cur_inode->pathlength = -1; + do { + /* precondition: at unmarked node*/ + if (cur_inode->sym == -1) /*&& (cur_inode->left)*/ { + /* left node of unmarked node is unmarked */ + cur_inode = cur_inode->left; + cur_inode->pathlength = -1; + pathlength++; + } + else { + /* mark node */ + cur_inode->pathlength = pathlength; +#if 0 + if (cur_inode->right) { + /* right node of previously unmarked node is unmarked */ + cur_inode = cur_inode->right; + cur_inode->pathlength = -1; + pathlength++; + } + else +#endif + { + + /* time to come up. Keep coming up until an unmarked node is reached */ + /* or the tree is exhausted */ + do { + cur_inode = cur_inode->parent; + pathlength--; + } + while (cur_inode && (cur_inode->pathlength != -1)); + if (cur_inode) { + /* found unmarked node; mark it and go right */ + cur_inode->pathlength = pathlength; + cur_inode = cur_inode->right; + cur_inode->pathlength = -1; + pathlength++; + /* would be complex if cur_inode could be null here. It can't */ + } + } + } + } + while (cur_inode); + +#ifdef DEBUG_HUFFMAN + cur_inode = inodes; + while (cur_inode < next_inode) { + fprintf(stderr, "%d l: %3d%c r: %3d%c freq: %8d pathlength %4d\n", + cur_inode - inodes, + (cur_inode->left->sym!=-1)?(((struct h_elem *)cur_inode->left)-leaves):(cur_inode->left-inodes), + (cur_inode->left->sym!=-1)?'l':'i', + (cur_inode->right->sym!=-1)?(((struct h_elem *)cur_inode->right)-leaves):(cur_inode->right-inodes), + (cur_inode->right->sym!=-1)?'l':'i', + (cur_inode->freq), + (cur_inode->pathlength) + ); + cur_inode++; + } +#endif + free(inodes); + + /* the pathlengths are already in order, so this sorts by symbol */ + qsort(leaves, nelem, sizeof(h_elem), cmp_pathlengths); + + /** + Microsoft's second condition on its canonical huffman codes is: + + For each level, starting at the deepest level of the tree and then + moving upwards, leaf nodes must start as far left as possible. An + alternative way of stating this constraint is that if any tree node + has children then all tree nodes to the left of it with the same path + length must also have children. 
+ + These 'alternatives' are not equivalent. The latter alternative gives + the common canonical code where the longest code is all zeros. The former + gives an opposite code where the longest code is all ones. Microsoft uses the + former alternative. + **/ + +#if 0 + pathlength = leaves[0].pathlength; + cur_code = 0; + for (i = 0; i < nleaves; i++) { + while (leaves[i].pathlength < pathlength) { + assert(!(cur_code & 1)); + cur_code >>= 1; + pathlength--; + } + leaves[i].code = cur_code; + cur_code++; + } +#else + pathlength = leaves[nleaves-1].pathlength; + assert(leaves[0].pathlength <= 16); /* this method cannot deal with bigger codes, though + the other canonical method can in some cases + (because it starts with zeros ) */ + cur_code = 0; + for (i = nleaves - 1; i >= 0; i--) { + while (leaves[i].pathlength > pathlength) { + cur_code <<= 1; + pathlength++; + } + leaves[i].code = cur_code; + cur_code++; + } +#endif + +#ifdef DEBUG_HUFFMAN + for (i = 0; i < nleaves; i++) { + char code[18]; + int j; + + cur_code = leaves[i].code; + code[leaves[i].pathlength] = 0; + for (j = leaves[i].pathlength-1; j >= 0; j--) { + if (cur_code & 1) code[j] = '1'; + else code[j] = '0'; + cur_code >>= 1; + } + fprintf(stderr, "%3d: %3d %3d %-16.16s '%c'\n", i, leaves[i].freq, leaves[i].pathlength, code, + leaves[i].sym); + } +#endif + } + else if (nleaves == 1) { + /* 0 symbols is OK (not according to doc, but according to Caie) */ + /* but if only one symbol is present, two symbols are required */ + nleaves = 2; + leaves[0].pathlength = leaves[1].pathlength = 1; + if (leaves[1].sym > leaves[0].sym) { + leaves[1].code = 1; + leaves[0].code = 0; + } + else { + leaves[0].code = 1; + leaves[1].code = 0; + } + } + + memset(tree, 0, nelem * sizeof(huff_entry)); + for (i = 0; i < nleaves; i++) { + tree[leaves[i].sym].codelength = leaves[i].pathlength; + tree[leaves[i].sym].code = leaves[i].code; + } + + free(leaves); +} + +/* from Stuart Caie's code -- I'm hoping this code is too small to encumber + this file. If not, you could rip it out and hard-code the tables */ + +static void lzx_init_static(void) +{ + int i, j; + + if (extra_bits[49]) return; + + rloge2 = 1.0/log(2); + for (i=0, j=0; i <= 50; i += 2) { + extra_bits[i] = extra_bits[i+1] = j; /* 0,0,0,0,1,1,2,2,3,3... */ + if ((i != 0) && (j < 17)) j++; /* 0,0,1,2,3,4...15,16,17,17,17,17... */ + } + + for (i=0, j=0; i <= 50; i++) { + position_base[i] = j; /* 0,1,2,3,4,6,8,12,16,24,32,... */ + j += 1 << extra_bits[i]; /* 1,1,1,1,2,2,4,4,8,8,16,16,32,32,... */ + } +} + +struct lzx_data +{ + void *in_arg; + void *out_arg; + void *mark_frame_arg; + lzx_get_bytes_t get_bytes; + lzx_at_eof_t at_eof; + lzx_put_bytes_t put_bytes; + lzx_mark_frame_t mark_frame; + struct lz_info *lzi; + /* a 'frame' is an 0x8000 byte thing. 
Called that because otherwise + I'd confuse myself overloading 'block' */ + int left_in_frame; + int left_in_block; + int R0, R1, R2; + int num_position_slots; + /* this is the LZX block size */ + int block_size; + int *main_freq_table; + int length_freq_table[NUM_SECONDARY_LENGTHS]; + int aligned_freq_table[LZX_ALIGNED_SIZE]; + uint32_t *block_codes; + uint32_t *block_codesp; + huff_entry *main_tree; + huff_entry length_tree[NUM_SECONDARY_LENGTHS]; + huff_entry aligned_tree[LZX_ALIGNED_SIZE]; + int main_tree_size; + uint16_t bit_buf; + int bits_in_buf; + double main_entropy; + double last_ratio; + uint8_t *prev_main_treelengths; + uint8_t prev_length_treelengths[NUM_SECONDARY_LENGTHS]; + uint32_t len_uncompressed_input; + uint32_t len_compressed_output; + short need_1bit_header; + short subdivide; /* 0 = don't subdivide, 1 = allowed, -1 = requested */ +}; + +static int +lzx_get_chars(lz_info *lzi, int n, u_char *buf) +{ + /* force lz compression to stop after every block */ + int chars_read; + int chars_pad; + + lzx_data *lzud = (lzx_data *)lzi->user_data; +#ifdef OLDFRAMING + if (lzud->subdivide < 0) return 0; + if (n > lzud->left_in_frame) + n = lzud->left_in_frame; + if (n > lzud->left_in_block) + n = lzud->left_in_block; +#endif + chars_read = lzud->get_bytes(lzud->in_arg, n, buf); +#ifdef OLDFRAMING + lzud->left_in_frame -= chars_read; + lzud->left_in_block -= chars_read; +#else + lzud->left_in_frame -= chars_read % LZX_FRAME_SIZE; + if (lzud->left_in_frame < 0) + lzud->left_in_frame += LZX_FRAME_SIZE; +#endif + if ((chars_read < n) && (lzud->left_in_frame)) { + chars_pad = n - chars_read; + if (chars_pad > lzud->left_in_frame) chars_pad = lzud->left_in_frame; + /* never emit a full frame of padding. This prevents silliness when + lzx_compress is called when at EOF but EOF not yet detected */ + if (chars_pad == LZX_FRAME_SIZE) chars_pad = 0; +#ifdef OLDFRAMING + if (chars_pad > lzud->left_in_block) chars_pad = lzud->left_in_block; +#endif + memset(buf + chars_read, 0, chars_pad); + lzud->left_in_frame -= chars_pad; +#ifdef OLDFRAMING + lzud->left_in_block -= chars_pad; +#endif + chars_read += chars_pad; + } + return chars_read; +} + +#ifdef NONSLIDE +static int find_match_at(lz_info *lzi, int loc, int match_len, int *match_locp) +{ + u_char *matchb; + u_char *nmatchb; + u_char *c1, *c2; + int j; + + if (-*match_locp == loc) return -1; + if (loc < match_len) return -1; + + matchb = lzi->block_buf + lzi->block_loc + *match_locp; + nmatchb = lzi->block_buf + lzi->block_loc - loc; + c1 = matchb; + c2 = nmatchb; + for (j = 0; j < match_len; j++) { + if (*c1++ != *c2++) break; + } + if (j == match_len) { +#ifdef DEBUG_MATCHES + fprintf(stderr, "match found %d, old = %d new = %d len = %d\n", lzi->cur_loc, -*match_locp, loc, match_len); +#endif + *match_locp = -loc; + return 0; + } + return -1; +} +#else +static int find_match_at(lz_info *lzi, int loc, int match_len, int *match_locp) +{ + u_char *matchb; + u_char *nmatchb; + u_char *c1, *c2; + int j; + + if (-*match_locp == loc) return -1; + if (loc < match_len) return -1; + + matchb = lzi->slide_bufp + *match_locp; + if (matchb < lzi->slide_buf) matchb += lzi->slide_buf_size; + nmatchb = lzi->slide_bufp - loc; + if (nmatchb < lzi->slide_buf) nmatchb += lzi->slide_buf_size; + c1 = matchb; + c2 = nmatchb; + for (j = 0; j < match_len; j++) { + if (*c1++ != *c2++) break; + if (c1 == lzi->slide_bufe) c1 = lzi->slide_buf; + if (c2 == lzi->slide_bufe) c2 = lzi->slide_buf; + } + if (j == match_len) { +#ifdef DEBUG_MATCHES + fprintf(stderr, "match 
found %d, old = %d new = %d len = %d\n", lzi->cur_loc, -*match_locp, loc, match_len); +#endif + *match_locp = -loc; + return 0; + } + return -1; +} +#endif +static void check_entropy(lzx_data *lzud, int main_index) +{ + /* entropy = - sum_alphabet P(x) * log2 P(x) */ + /* entropy = - sum_alphabet f(x)/N * log2 (f(x)/N) */ + /* entropy = - 1/N sum_alphabet f(x) * (log2 f(x) - log2 N) */ + /* entropy = - 1/N (sum_alphabet f(x) * log2 f(x)) - sum_alphabet f(x) log2 N */ + /* entropy = - 1/N (sum_alphabet f(x) * log2 f(x)) - log2 N sum_alphabet f(x) */ + /* entropy = - 1/N (sum_alphabet f(x) * log2 f(x)) - N * log2 N */ + + /* entropy = - 1/N ((sum_alphabet f(x) * log2 f(x) ) - N * log2 N) */ + /* entropy = - 1/N ((sum_alphabet f(x) * ln f(x) * 1/ln 2) - N * ln N * 1/ln 2) */ + /* entropy = 1/(N ln 2) (N * ln N - (sum_alphabet f(x) * ln f(x))) */ + /* entropy = 1/(N ln 2) (N * ln N + (sum_alphabet -f(x) * ln f(x))) */ + + /* entropy = 1/(N ln 2) ( sum_alphabet ln N * f(x) + (sum_alphabet -f(x) * ln f(x))) */ + /* entropy = 1/(N ln 2) ( sum_alphabet ln N * f(x) + (-f(x) * ln f(x))) */ + /* entropy = -1/(N ln 2) ( sum_alphabet -ln N * f(x) + (f(x) * ln f(x))) */ + /* entropy = -1/(N ln 2) ( sum_alphabet f(x)(- ln N + ln f(x))) */ + /* entropy = -1/(N ln 2) ( sum_alphabet f(x)(ln f(x)/N)) */ + /* entropy = -1/N ( sum_alphabet (1/(ln 2))f(x)(ln f(x)/N)) */ + /* entropy = -1/N ( sum_alphabet f(x)(log2 f(x)/N)) */ + /* entropy = - ( sum_alphabet f(x)/N(log2 f(x)/N)) */ + /* entropy = - ( sum_alphabet P(x)(log2 P(x))) */ + + + double freq; + double n_ln_n; + double rn_ln2; + double cur_ratio; + int n; + + /* delete old entropy accumulation */ + if (lzud->main_freq_table[main_index] != 1) { + freq = (double)lzud->main_freq_table[main_index]-1; + lzud->main_entropy += freq * log(freq); + } + /* add new entropy accumulation */ + freq = (double)lzud->main_freq_table[main_index]; + lzud->main_entropy -= freq * log(freq); + n = lzud->block_codesp - lzud->block_codes; + + if (((n & 0xFFF) == 0) && (lzud->left_in_block >= 0x1000)) { + n_ln_n = (double)n * log((double)n); + rn_ln2 = rloge2 / (double)n; + cur_ratio = (n * rn_ln2 *(n_ln_n + lzud->main_entropy) + 24 + 3 * 80 + NUM_CHARS + (lzud->main_tree_size-NUM_CHARS)*3 + NUM_SECONDARY_LENGTHS ) / (double)n; +#ifdef DEBUG_ENTROPY + fprintf(stderr, "n = %d\n", n); + fprintf(stderr, "main entropy = %f\n", rn_ln2 *(n_ln_n + lzud->main_entropy) ); + fprintf(stderr, "compression ratio (raw) = %f\n", 100.0 * rn_ln2 *(n_ln_n + lzud->main_entropy) /9.0 ); + fprintf(stderr, "compression ratio (ovh) = %f\n", 100.0 * cur_ratio/9.0); +#endif + if (cur_ratio > lzud->last_ratio) { +#ifdef DEBUG_ENTROPY + fprintf(stderr, "resetting huffman tables at %d\n", n); +#endif + lzud->subdivide = -1; + lz_stop_compressing(lzud->lzi); + } + lzud->last_ratio = cur_ratio; + } +} + +static int +lzx_output_match(lz_info *lzi, int match_pos, int match_len) +{ + lzx_data *lzud = (lzx_data *)lzi->user_data; + uint32_t formatted_offset; + uint32_t position_footer; + uint8_t length_footer; + uint8_t length_header; + uint16_t len_pos_header; + int position_slot; + short btdt; + +#ifdef DEBUG_LZ + { + int i; + int pos; + for (i = 0; i < match_len; i++) { + +#ifdef NONSLIDE + pos = match_pos + lzi->block_loc + i; + fprintf(stderr, "%c", lzi->block_buf[pos]); +#else + pos = match_pos + lzi->front_offset + i; + if (pos > lzi->slide_buf_size) + pos -= lzi->slide_buf_size; + fprintf(stderr, "%c", lzi->slide_buf[pos]); +#endif + } + } +#endif + position_footer = 0; + btdt = 0; + testforr: + if 
(match_pos == -lzud->R0) { + match_pos = 0; + formatted_offset = 0; + position_slot = 0; + } + else if (match_pos == -lzud->R1) { + lzud->R1 = lzud->R0; + lzud->R0 = -match_pos; + match_pos = 1; + formatted_offset = 1; + position_slot = 1; + } + else if (match_pos == -lzud->R2) { + lzud->R2 = lzud->R0; + lzud->R0 = -match_pos; + match_pos = 2; + formatted_offset = 2; + position_slot = 2; + } + else { + if (!btdt) { + btdt = 1; + if (find_match_at(lzi, lzud->R0, match_len, &match_pos) == 0) + goto testforr; + if (find_match_at(lzi, lzud->R1, match_len, &match_pos) == 0) + goto testforr; + if (find_match_at(lzi, lzud->R2, match_len, &match_pos) == 0) + goto testforr; + } + + formatted_offset = -match_pos + 2; + + if ((match_len < 3) || + ((formatted_offset >= 64) && (match_len < 4)) || + ((formatted_offset >= 2048) && (match_len < 5)) || + ((formatted_offset >= 65536) && (match_len < 6))) { + /* reject matches where extra_bits will likely be bigger than just outputting + literals. The numbers are basically derived through guessing + and trial and error */ + return -1; /* reject the match */ + } + + lzud->R2 = lzud->R1; + lzud->R1 = lzud->R0; + lzud->R0 = -match_pos; + + /* calculate position base using binary search of table; if log2 can be + done in hardware, approximation might work; + trunc(log2(formatted_offset*formatted_offset)) gets either the proper + position slot or the next one, except for slots 0, 1, and 39-49 + + Slots 0-1 are handled by the R0-R1 procedures + + Slots 36-49 (formatted_offset >= 262144) can be found by + (formatted_offset/131072) + 34 == + (formatted_offset >> 17) + 34; + */ + if (formatted_offset >= 262144) { + position_slot = (formatted_offset >> 17) + 34; + } + else { + int left, right, mid; + + left = 3; + right = lzud->num_position_slots - 1; + position_slot = -1; + while (left <= right) { + mid = (left + right)/2; + if ((position_base[mid] <= formatted_offset) && + position_base[mid+1] > formatted_offset) { + position_slot = mid; + break; + } +#if 0 + fprintf(stderr, "BEFORE: %06x %06x %06x %06x\n", + position_base[left], position_base[mid], + formatted_offset, position_base[right]); +#endif + if (formatted_offset > position_base[mid]) + /* too low */ + left = mid + 1; + else /* too high */ + right = mid; +#if 0 + fprintf(stderr, "AFTER : %06x %06x %06x %06x\n", + position_base[left], position_base[mid], + formatted_offset, position_base[right]); +#endif + } +#ifdef DEBUG_POSITION_SLOT_LOOKUP + if (position_slot < 0) { + fprintf(stderr, "lmr npr: %d %d %d %d\n", left, mid, right, lzud->num_position_slots); + fprintf(stderr, "AFTER : %07d %07d %07d %07d\n", + position_base[left], position_base[mid], + formatted_offset, position_base[right]); + fprintf(stderr, "(%d, %d, %d, %d, %d)\n", match_pos, match_len, formatted_offset, position_slot, position_footer); + } +#endif + assert(position_slot >= 0); + /* FIXME precalc extra_mask table */ + } + position_footer = ((1UL << extra_bits[position_slot]) - 1) & formatted_offset; + } +#ifdef DEBUG_MATCHES +#ifdef NONSLIDE + fprintf(stderr, "(%08x, %d, %d, %d, %d, %d)\n", lzud->lzi->cur_loc , match_pos, match_len, formatted_offset, position_slot, position_footer); +#else + fprintf(stderr, "(%08x, %d, %d, %d, %d, %d)\n", lzud->lzi->cur_loc - lzud->lzi->chars_in_match , match_pos, match_len, formatted_offset, position_slot, position_footer); +#endif +#endif + /* match length = 8 bits */ + /* position_slot = 6 bits */ + /* position_footer = 17 bits */ + /* total = 31 bits */ + /* plus one to say whether it's a literal or 
not */ + *lzud->block_codesp++ = 0x80000000 | /* bit 31 in intelligent bit ordering */ + (position_slot << 25) | /* bits 30-25 */ + (position_footer << 8) | /* bits 8-24 */ + (match_len - MIN_MATCH); /* bits 0-7 */ + + if (match_len < (NUM_PRIMARY_LENGTHS + MIN_MATCH)) { + length_header = match_len - MIN_MATCH; + /* length_footer = 255; */ /* not necessary */ + } + else { + length_header = NUM_PRIMARY_LENGTHS; + length_footer = match_len - (NUM_PRIMARY_LENGTHS + MIN_MATCH); + lzud->length_freq_table[length_footer]++; + } + len_pos_header = (position_slot << 3) | length_header; + lzud->main_freq_table[len_pos_header + NUM_CHARS]++; + if (extra_bits[position_slot] >= 3) { + lzud->aligned_freq_table[position_footer & 7]++; + } +#ifndef OLDFRAMING + lzud->left_in_block -= match_len; +#endif + if (lzud->subdivide) + check_entropy(lzud, len_pos_header + NUM_CHARS); + return 0; /* accept the match */ +} + +static void +lzx_output_literal(lz_info *lzi, u_char ch) +{ + lzx_data *lzud = (lzx_data *)lzi->user_data; + +#ifndef OLDFRAMING + lzud->left_in_block--; +#endif + *lzud->block_codesp++ = ch; +#ifdef DEBUG_LZ + fprintf(stderr, "%c", ch); +#endif + lzud->main_freq_table[ch]++; + if (lzud->subdivide) + check_entropy(lzud, ch); +} + +static void lzx_write_bits(lzx_data *lzxd, int nbits, uint32_t bits) +{ + int cur_bits; + int shift_bits; + int rshift_bits; + uint16_t mask_bits; + +#ifdef DEBUG_BITBUF + fprintf(stderr, "WB: %2d %08x\n", nbits, bits); +#endif + cur_bits = lzxd->bits_in_buf; + while ((cur_bits + nbits) >= 16) { + shift_bits = 16 - cur_bits; + rshift_bits = nbits - shift_bits; + if (shift_bits == 16) { + lzxd->bit_buf = (bits>>rshift_bits) & 0xFFFF; + } + else { + mask_bits = (1U << shift_bits) - 1; + lzxd->bit_buf <<= shift_bits; + lzxd->bit_buf |= (bits>>rshift_bits) & mask_bits; + } +#ifdef DEBUG_BITBUF + fprintf(stderr, "WBB: %04x\n", lzxd->bit_buf); +#endif +#ifdef LZX_BIG_ENDIAN + lzxd->bit_buf = ((lzxd->bit_buf & 0xFF)<<8) | (lzxd->bit_buf >> 8); +#endif + lzxd->put_bytes(lzxd->out_arg, sizeof(lzxd->bit_buf), &lzxd->bit_buf); + lzxd->len_compressed_output += sizeof(lzxd->bit_buf); + lzxd->bit_buf = 0; + nbits -= shift_bits; + cur_bits = 0; + } + /* (cur_bits + nbits) < 16. If nbits = 0, we're done. 
+ otherwise move bits in */ + shift_bits = nbits; + mask_bits = (1U << shift_bits) - 1; + lzxd->bit_buf <<= shift_bits; + lzxd->bit_buf |= bits & mask_bits; + cur_bits += nbits; + +#ifdef DEBUG_BITBUF + fprintf(stderr, "OBB: %2d %04x\n", cur_bits, lzxd->bit_buf); +#endif + lzxd->bits_in_buf = cur_bits; +} + +static void lzx_align_output(lzx_data *lzxd) +{ + if (lzxd->bits_in_buf) { + lzx_write_bits(lzxd, 16 - lzxd->bits_in_buf, 0); + } + if (lzxd->mark_frame) + lzxd->mark_frame(lzxd->mark_frame_arg, lzxd->len_uncompressed_input, lzxd->len_compressed_output); +} + +static void +lzx_write_compressed_literals(lzx_data *lzxd, int block_type) +{ + uint32_t *cursor = lzxd->block_codes; + uint32_t *endp = lzxd->block_codesp; + uint16_t position_slot; + uint32_t position_footer; + uint32_t match_len_m2; /* match length minus 2, which is MIN_MATCH */ + uint32_t verbatim_bits; + uint32_t block_code; + uint16_t length_header; + uint16_t length_footer; + uint16_t len_pos_header; + huff_entry *huffe; + int frame_count = (lzxd->len_uncompressed_input % LZX_FRAME_SIZE); + + lzxd->len_uncompressed_input -= frame_count; /* will be added back in later */ + while (cursor < endp) { + block_code = *cursor++; + if (block_code & 0x80000000) { + /* + * 0x80000000 | bit 31 in intelligent bit ordering + * (position_slot << 25) | bits 30-25 + * (position_footer << 8) | bits 8-24 + * (match_len - MIN_MATCH); bits 0-7 + * + */ + + match_len_m2 = block_code & 0xFF; /* 8 bits */ + position_footer = (block_code >> 8)& 0x1FFFF; /* 17 bits */ + position_slot = (block_code >> 25) & 0x3F; /* 6 bits */ + +#ifdef DEBUG_MATCHES_2 + fprintf(stderr, "%08x, %3d %2d %d\n", lzxd->len_uncompressed_input + frame_count, match_len_m2, position_slot, position_footer); +#endif + if (match_len_m2 < NUM_PRIMARY_LENGTHS) { + length_header = match_len_m2; + length_footer = 255; /* personal encoding for NULL */ + } + else { + length_header = NUM_PRIMARY_LENGTHS; + length_footer = match_len_m2 - NUM_PRIMARY_LENGTHS; + } + len_pos_header = (position_slot << 3) | length_header; + huffe = &lzxd->main_tree[len_pos_header+NUM_CHARS]; + lzx_write_bits(lzxd, huffe->codelength, huffe->code); + if (length_footer != 255) { + huffe = &lzxd->length_tree[length_footer]; + lzx_write_bits(lzxd, huffe->codelength, huffe->code); + } + if ((block_type == LZX_ALIGNED_OFFSET_BLOCK) && (extra_bits[position_slot] >= 3)) { + /* aligned offset block and code */ + verbatim_bits = position_footer >> 3; + lzx_write_bits(lzxd, extra_bits[position_slot] - 3, verbatim_bits); + huffe = &lzxd->aligned_tree[position_footer&7]; + lzx_write_bits(lzxd, huffe->codelength, huffe->code); + } + else { + verbatim_bits = position_footer; + lzx_write_bits(lzxd, extra_bits[position_slot], verbatim_bits); + } + frame_count += match_len_m2 + 2; + } + else { + /* literal */ + assert(block_code < NUM_CHARS); + huffe = &lzxd->main_tree[block_code]; + lzx_write_bits(lzxd, huffe->codelength, huffe->code); + frame_count++; + } + if (frame_count == LZX_FRAME_SIZE) { + lzxd->len_uncompressed_input += frame_count; + lzx_align_output(lzxd); + frame_count = 0; + } +#ifdef DEBUG_MATCHES_2 + if (frame_count > LZX_FRAME_SIZE) { + fprintf(stderr, "uncomp_len = %x, frame_count = %x, block_code = %08x, match_len_m2 = %d", lzxd->len_uncompressed_input, frame_count, block_code, match_len_m2); + } +#endif + assert (frame_count < LZX_FRAME_SIZE); + } + lzxd->len_uncompressed_input += frame_count; +} + +static int +lzx_write_compressed_tree(struct lzx_data *lzxd, + struct huff_entry *tree, uint8_t *prevlengths, 
+ int treesize) +{ + u_char *codes; + u_char *runs; + int freqs[LZX_PRETREE_SIZE]; + int cur_run; + int last_len; + huff_entry pretree[20]; + u_char *codep; + u_char *codee; + u_char *runp; + int excess; + int i; + int cur_code; + + codep = codes = malloc(treesize*sizeof(char)); + runp = runs = malloc(treesize*sizeof(char)); + memset(freqs, 0, sizeof(freqs)); + cur_run = 1; + last_len = tree[0].codelength; + for (i = 1; i <= treesize; i++) { + if ((i == treesize) || (tree[i].codelength != last_len)) { + if (last_len == 0) { + while (cur_run >= 20) { + excess = cur_run - 20; + if (excess > 31) excess = 31; + *codep++ = 18; + *runp++ = excess; + cur_run -= excess + 20; + freqs[18]++; + } + while (cur_run >= 4) { + excess = cur_run - 4; + if (excess > 15) excess = 15; + *codep++ = 17; + *runp++ = excess; + cur_run -= excess + 4; + freqs[17]++; + } + while (cur_run > 0) { + *codep = prevlengths[i - cur_run]; + freqs[*codep++]++; + *runp++ = 0; /* not necessary */ + cur_run--; + } + } + else { + while (cur_run >= 4) { + if (cur_run == 4) excess = 0; + else excess = 1; + *codep++ = 19; + *runp++ = excess; + freqs[19]++; + /* right, MS lies again. Code is NOT + prev_len + len (mod 17), it's prev_len - len (mod 17)*/ + *codep = prevlengths[i-cur_run] - last_len; + if (*codep > 16) *codep += 17; + freqs[*codep++]++; + *runp++ = 0; /* not necessary */ + cur_run -= excess+4; + } + while (cur_run > 0) { + *codep = prevlengths[i-cur_run] - last_len; + if (*codep > 16) *codep += 17; + *runp++ = 0; /* not necessary */ + cur_run--; + freqs[*codep++]++; + } + } + if (i != treesize) + last_len = tree[i].codelength; + cur_run = 0; + } + cur_run++; + } + codee = codep; +#ifdef DEBUG_TREE_COMPRESSION + *codep++ = 255; + *runp++ = 255; + fprintf(stderr, "num: len code run\n"); + for (i = 0; i < treesize; i++) { + fprintf(stderr, "%3d: %2d %2d %2d\n", i, tree[i].codelength, codes[i], runs[i]); + } +#endif + /* now create the huffman table and write out the pretree */ + build_huffman_tree(LZX_PRETREE_SIZE, 16, freqs, pretree); + for (i = 0; i < LZX_PRETREE_SIZE; i++) { + lzx_write_bits(lzxd, 4, pretree[i].codelength); + } + codep = codes; + runp = runs; + cur_run = 0; + while (codep < codee) { + cur_code = *codep++; + lzx_write_bits(lzxd, pretree[cur_code].codelength, pretree[cur_code].code); + if (cur_code == 17) { + cur_run += *runp + 4; + lzx_write_bits(lzxd, 4, *runp); + } + else if (cur_code == 18) { + cur_run += *runp + 20; + lzx_write_bits(lzxd, 5, *runp); + } + else if (cur_code == 19) { + cur_run += *runp + 4; + lzx_write_bits(lzxd, 1, *runp); + cur_code = *codep++; + lzx_write_bits(lzxd, pretree[cur_code].codelength, pretree[cur_code].code); + runp++; + } + else { + cur_run++; + } + runp++; + } + free(codes); + free(runs); + return 0; +} + +void +lzx_reset(lzx_data *lzxd) +{ + lzxd->need_1bit_header = 1; + lzxd->R0 = lzxd->R1 = lzxd->R2 = 1; + memset(lzxd->prev_main_treelengths, 0, lzxd->main_tree_size * sizeof(uint8_t)); + memset(lzxd->prev_length_treelengths, 0, NUM_SECONDARY_LENGTHS * sizeof(uint8_t)); + lz_reset(lzxd->lzi); +} + +int lzx_compress_block(lzx_data *lzxd, int block_size, int subdivide) +{ + int i; + uint32_t written_sofar = 0; + int block_type; + long uncomp_bits; + long comp_bits; + long comp_bits_ovh; + long uncomp_length; + + if ((lzxd->block_size != block_size) || (lzxd->block_codes == NULL)) { + if (lzxd->block_codes != NULL) free(lzxd->block_codes); + lzxd->block_size = block_size; + lzxd->block_codes = malloc(block_size * sizeof(uint32_t)); + } + lzxd->subdivide = subdivide?1:0; 
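  /*
   * The assignments below (re)initialise the per-block state: block_codes
   * holds one 32-bit code per emitted literal or match (the block_size
   * allocation above is an upper bound of one slot per input byte),
   * left_in_block counts down the uncompressed bytes still owed to this
   * block, left_in_frame tracks the 0x8000-byte LZX frame boundary, and
   * main_entropy / last_ratio seed the running estimate that check_entropy()
   * consults when deciding whether to stop early and subdivide the block
   * with fresh huffman tables.
   */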
+ + lzxd->left_in_block = block_size; + lzxd->left_in_frame = LZX_FRAME_SIZE; + lzxd->main_entropy = 0.0; + lzxd->last_ratio = 9999999.0; + lzxd->block_codesp = lzxd->block_codes; + + memset(lzxd->length_freq_table, 0, NUM_SECONDARY_LENGTHS * sizeof(int)); + memset(lzxd->main_freq_table, 0, lzxd->main_tree_size * sizeof(int)); + memset(lzxd->aligned_freq_table, 0, LZX_ALIGNED_SIZE * sizeof(int)); + do { + lz_compress(lzxd->lzi, lzxd->left_in_block); + if (lzxd->left_in_frame == 0) + lzxd->left_in_frame = LZX_FRAME_SIZE; + + if ((lzxd->subdivide<0) || !lzxd->left_in_block || + (!lz_left_to_process(lzxd->lzi) && lzxd->at_eof(lzxd->in_arg))) { + /* now one block is LZ-analyzed. */ + /* time to write it out */ + uncomp_length = lzxd->block_size - lzxd->left_in_block - written_sofar; + /* uncomp_length will sometimes be 0 when input length is + an exact multiple of frame size */ + if (uncomp_length == 0) + continue; + if (lzxd->subdivide < 0) { +#ifdef DEBUG_ENTROPY + fprintf(stderr, "subdivided\n"); +#endif + lzxd->subdivide = 1; + } + + if (lzxd->need_1bit_header) { + /* one bit Intel preprocessing header */ + /* always 0 because this implementation doesn't do Intel preprocessing */ + lzx_write_bits(lzxd, 1, 0); + lzxd->need_1bit_header = 0; + } + + /* handle extra bits */ + uncomp_bits = comp_bits = 0; + build_huffman_tree(LZX_ALIGNED_SIZE, 7, lzxd->aligned_freq_table, lzxd->aligned_tree); + for (i = 0; i < LZX_ALIGNED_SIZE; i++) { + uncomp_bits += lzxd->aligned_freq_table[i]* 3; + comp_bits += lzxd->aligned_freq_table[i]* lzxd->aligned_tree[i].codelength; + } + comp_bits_ovh = comp_bits + LZX_ALIGNED_SIZE * 3; + if (comp_bits_ovh < uncomp_bits) + block_type = LZX_ALIGNED_OFFSET_BLOCK; + else + block_type = LZX_VERBATIM_BLOCK; + +#ifdef DEBUG_EXTRA_BITS + fprintf(stderr, "Extra bits uncompressed: %5d compressed: %5d compressed w/overhead %5d gain/loss %5d\n", uncomp_bits, comp_bits, comp_bits_ovh, uncomp_bits - comp_bits_ovh); +#endif + + /* block type */ + lzx_write_bits(lzxd, 3, block_type); + /* uncompressed length */ + lzx_write_bits(lzxd, 24, uncomp_length); + + written_sofar = lzxd->block_size - lzxd->left_in_block; + + /* now write out the aligned offset trees if present */ + if (block_type == LZX_ALIGNED_OFFSET_BLOCK) { + for (i = 0; i < LZX_ALIGNED_SIZE; i++) { + lzx_write_bits(lzxd, 3, lzxd->aligned_tree[i].codelength); + } + } + /* end extra bits */ + build_huffman_tree(lzxd->main_tree_size, LZX_MAX_CODE_LENGTH, + lzxd->main_freq_table, lzxd->main_tree); + build_huffman_tree(NUM_SECONDARY_LENGTHS, 16, + lzxd->length_freq_table, lzxd->length_tree); + + + + /* now write the pre-tree and tree for main 1 */ + lzx_write_compressed_tree(lzxd, lzxd->main_tree, lzxd->prev_main_treelengths, NUM_CHARS); + + /* now write the pre-tree and tree for main 2*/ + lzx_write_compressed_tree(lzxd, lzxd->main_tree + NUM_CHARS, + lzxd->prev_main_treelengths + NUM_CHARS, + lzxd->main_tree_size - NUM_CHARS); + + /* now write the pre tree and tree for length */ + lzx_write_compressed_tree(lzxd, lzxd->length_tree, lzxd->prev_length_treelengths, + NUM_SECONDARY_LENGTHS); + + /* now write literals */ + lzx_write_compressed_literals(lzxd, block_type); + + /* copy treelengths somewhere safe to do delta compression */ + for (i = 0; i < lzxd->main_tree_size; i++) { + lzxd->prev_main_treelengths[i] = lzxd->main_tree[i].codelength; + } + for (i = 0; i < NUM_SECONDARY_LENGTHS; i++) { + lzxd->prev_length_treelengths[i] = lzxd->length_tree[i].codelength; + } + lzxd->main_entropy = 0.0; + lzxd->last_ratio = 9999999.0; 
+ lzxd->block_codesp = lzxd->block_codes; + + memset(lzxd->length_freq_table, 0, NUM_SECONDARY_LENGTHS * sizeof(int)); + memset(lzxd->main_freq_table, 0, lzxd->main_tree_size * sizeof(int)); + memset(lzxd->aligned_freq_table, 0, LZX_ALIGNED_SIZE * sizeof(int)); + } + } + while (lzxd->left_in_block && (lz_left_to_process(lzxd->lzi) || !lzxd->at_eof(lzxd->in_arg))); + return 0; +} + +int lzx_init(struct lzx_data **lzxdp, int wsize_code, + lzx_get_bytes_t get_bytes, void *get_bytes_arg, + lzx_at_eof_t at_eof, + lzx_put_bytes_t put_bytes, void *put_bytes_arg, + lzx_mark_frame_t mark_frame, void *mark_frame_arg) +{ + int wsize; + struct lzx_data *lzxd; + + if ((wsize_code < 15) || (wsize_code > 21)) { + return -1; + } + lzx_init_static(); + + *lzxdp = lzxd = malloc(sizeof(*lzxd)); + if (lzxd == 0) + return -2; + + lzxd->in_arg = get_bytes_arg; + lzxd->out_arg = put_bytes_arg; + lzxd->mark_frame_arg = mark_frame_arg; + lzxd->get_bytes = get_bytes; + lzxd->put_bytes = put_bytes; + lzxd->at_eof = at_eof; + lzxd->mark_frame = mark_frame; + + wsize = 1 << (wsize_code); + + lzxd->bits_in_buf = 0; + lzxd->block_size = 0; + lzxd->block_codes = NULL; + lzxd->num_position_slots = num_position_slots[wsize_code-15]; + lzxd->main_tree_size = (NUM_CHARS + 8 * lzxd->num_position_slots); + + lzxd->main_freq_table = malloc(sizeof(int) * lzxd->main_tree_size); + lzxd->main_tree = malloc(sizeof(huff_entry)* lzxd->main_tree_size); + lzxd->prev_main_treelengths = malloc(sizeof(uint8_t)*lzxd->main_tree_size); + + lzxd->lzi = malloc(sizeof (*lzxd->lzi)); + /* the -3 prevents matches at wsize, wsize-1, wsize-2, all of which are illegal */ + lz_init(lzxd->lzi, wsize, wsize - 3, MAX_MATCH, MIN_MATCH, LZX_FRAME_SIZE, + lzx_get_chars, lzx_output_match, lzx_output_literal,lzxd); + lzxd->len_uncompressed_input = 0; + lzxd->len_compressed_output = 0; + lzx_reset(lzxd); + return 0; +} + +int lzx_finish(struct lzx_data *lzxd, struct lzx_results *lzxr) +{ + /* lzx_align_output(lzxd); Not needed as long as frame padding is in place */ + if (lzxr) { + lzxr->len_compressed_output = lzxd->len_compressed_output; + lzxr->len_uncompressed_input = lzxd->len_uncompressed_input; + } + lz_release(lzxd->lzi); + free(lzxd->lzi); + free(lzxd->prev_main_treelengths); + free(lzxd->main_tree); + free(lzxd->main_freq_table); + if (lzxd->block_codes) { + free(lzxd->block_codes); + } + free(lzxd); + return 0; +} + diff --git a/src/calibre/utils/lzx/lzxc.h b/src/calibre/utils/lzx/lzxc.h new file mode 100644 index 0000000000..32cb1f721a --- /dev/null +++ b/src/calibre/utils/lzx/lzxc.h @@ -0,0 +1,57 @@ +/* + File lzx_compress.h, part of lzxcomp library + Copyright (C) 2002 Matthew T. Russotto + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; version 2.1 only + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#if BYTE_ORDER == BIG_ENDIAN +# define LZX_BIG_ENDIAN +#endif + +/* the names of these constants are specific to this library */ +#define LZX_MAX_CODE_LENGTH 16 +#define LZX_FRAME_SIZE 32768 +#define LZX_PRETREE_SIZE 20 +#define LZX_ALIGNED_BITS 3 +#define LZX_ALIGNED_SIZE 8 + +#define LZX_VERBATIM_BLOCK 1 +#define LZX_ALIGNED_OFFSET_BLOCK 2 + +typedef struct lzx_data lzx_data; +typedef int (*lzx_get_bytes_t)(void *arg, int n, void *buf); +typedef int (*lzx_put_bytes_t)(void *arg, int n, void *buf); +typedef void (*lzx_mark_frame_t)(void *arg, uint32_t uncomp, uint32_t comp); +typedef int (*lzx_at_eof_t)(void *arg); + +typedef struct lzx_results +{ + /* add more here? Error codes, # blocks, # frames, etc? */ + long len_compressed_output; + long len_uncompressed_input; +} lzx_results; + +int lzx_init(struct lzx_data **lzxdp, int wsize_code, + lzx_get_bytes_t get_bytes, void *get_bytes_arg, + lzx_at_eof_t at_eof, + lzx_put_bytes_t put_bytes, void *put_bytes_arg, + lzx_mark_frame_t mark_frame, void *mark_frame_arg); + +void lzx_reset(lzx_data *lzxd); + +int lzx_compress_block(lzx_data *lzxd, int block_size, int subdivide); + +int lzx_finish(struct lzx_data *lzxd, struct lzx_results *lzxr); + diff --git a/src/calibre/utils/lzx/lzxd.c b/src/calibre/utils/lzx/lzxd.c index 337af441fd..e683a9ec23 100644 --- a/src/calibre/utils/lzx/lzxd.c +++ b/src/calibre/utils/lzx/lzxd.c @@ -18,7 +18,7 @@ #include #include -#include +#include /* Microsoft's LZX document and their implementation of the * com.ms.util.cab Java package do not concur. diff --git a/src/calibre/utils/lzx/lzx.h b/src/calibre/utils/lzx/lzxd.h similarity index 100% rename from src/calibre/utils/lzx/lzx.h rename to src/calibre/utils/lzx/lzxd.h diff --git a/src/calibre/utils/lzx/lzxmodule.c b/src/calibre/utils/lzx/lzxmodule.c index c45bb22c95..2f72b58ae7 100644 --- a/src/calibre/utils/lzx/lzxmodule.c +++ b/src/calibre/utils/lzx/lzxmodule.c @@ -4,14 +4,15 @@ * Python module C glue code. */ - #include #include -#include +#include +#include static char lzx_doc[] = -"Provide basic LZX decompression using the code from libmspack."; + "Provide basic LZX compression and decompression using the code from\n" + "liblzxcomp and libmspack respectively."; static PyObject *LzxError = NULL; @@ -214,6 +215,15 @@ initlzx(void) LzxError = PyErr_NewException("lzx.LzxError", NULL, NULL); Py_INCREF(LzxError); PyModule_AddObject(m, "LzxError", LzxError); + + PyModule_AddObject(m, "_lzxc_init", + Py_BuildValue("k", (unsigned long)lzx_init)); + PyModule_AddObject(m, "_lzxc_reset", + Py_BuildValue("k", (unsigned long)lzx_reset)); + PyModule_AddObject(m, "_lzxc_compress_block", + Py_BuildValue("k", (unsigned long)lzx_compress_block)); + PyModule_AddObject(m, "_lzxc_finish", + Py_BuildValue("k", (unsigned long)lzx_finish)); return; } From 210ad8d20ada2e13b927af9fbe43e4911cd31fe3 Mon Sep 17 00:00:00 2001 From: "Marshall T. Vandegrift" Date: Wed, 10 Dec 2008 00:56:10 -0500 Subject: [PATCH 04/15] Implement "ugly-printing" for LIT markup. 
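
The point of "ugly-printing" is to stop depending on the parser stripping blank
text (remove_blank_text is dropped from the lxml XMLParser below) and instead
collapse insignificant whitespace at serialization time: runs of whitespace
become a single space unless the computed CSS 'white-space' is pre/pre-wrap or
an xml:space attribute overrides it, and whitespace-only tails between
block-level siblings are discarded. A minimal sketch of the collapsing rule
(illustrative only; the helper name and the exact COLLAPSE pattern are
assumptions, and the real logic lives in ReBinary.tree_to_binary in the hunk
below):

    import re

    # Assumed pattern for a run of whitespace; the writer's COLLAPSE regex
    # is defined elsewhere in writer.py and is not shown in this patch.
    COLLAPSE = re.compile(r'[ \t\r\n\v]+')

    def collapse(text, css_white_space='normal', xml_space=None):
        """Collapse whitespace runs unless CSS or xml:space says to preserve."""
        preserve = css_white_space in ('pre', 'pre-wrap')
        if xml_space == 'preserve':
            preserve = True
        elif xml_space == 'normal':
            preserve = False
        return text if preserve else COLLAPSE.sub(' ', text)

    # collapse('foo \t\n bar') == 'foo bar'
    # collapse('foo \t\n bar', css_white_space='pre') is returned unchanged
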
--- src/calibre/ebooks/lit/html.css | 8 ++++- src/calibre/ebooks/lit/oeb.py | 5 +-- src/calibre/ebooks/lit/reader.py | 2 +- src/calibre/ebooks/lit/stylizer.py | 11 +++--- src/calibre/ebooks/lit/writer.py | 56 +++++++++++++++++++++--------- 5 files changed, 56 insertions(+), 26 deletions(-) diff --git a/src/calibre/ebooks/lit/html.css b/src/calibre/ebooks/lit/html.css index 5b75ea6649..9401b19cf2 100644 --- a/src/calibre/ebooks/lit/html.css +++ b/src/calibre/ebooks/lit/html.css @@ -410,7 +410,7 @@ tr:focus, tt:focus, u:focus, ul:focus, var:focus { /* hidden elements */ area, base, basefont, head, meta, script, style, title, -noembed, param { +noembed, param, link { display: none; } @@ -418,3 +418,9 @@ noembed, param { body { page-break-before: always; } + +/* Explicit line-breaks are blocks, sure... */ +br { + display: block; +} + diff --git a/src/calibre/ebooks/lit/oeb.py b/src/calibre/ebooks/lit/oeb.py index d3773a61f1..ae2e6136b7 100644 --- a/src/calibre/ebooks/lit/oeb.py +++ b/src/calibre/ebooks/lit/oeb.py @@ -8,8 +8,8 @@ from urlparse import urldefrag, urlparse, urlunparse from urllib import unquote as urlunquote from lxml import etree -XML_PARSER = etree.XMLParser( - remove_blank_text=True, recover=True, resolve_entities=False) +XML_PARSER = etree.XMLParser(recover=True, resolve_entities=False) +XML_NS = 'http://www.w3.org/XML/1998/namespace' XHTML_NS = 'http://www.w3.org/1999/xhtml' OPF1_NS = 'http://openebook.org/namespaces/oeb-package/1.0/' OPF2_NS = 'http://www.idpf.org/2007/opf' @@ -23,6 +23,7 @@ XPNSMAP = {'h': XHTML_NS, 'o1': OPF1_NS, 'o2': OPF2_NS, 'd09': DC09_NS, 'd10': DC10_NS, 'd11': DC11_NS, 'xsi': XSI_NS, 'dt': DCTERMS_NS, 'ncx': NCX_NS} +def XML(name): return '{%s}%s' % (XML_NS, name) def XHTML(name): return '{%s}%s' % (XHTML_NS, name) def OPF(name): return '{%s}%s' % (OPF2_NS, name) def DC(name): return '{%s}%s' % (DC11_NS, name) diff --git a/src/calibre/ebooks/lit/reader.py b/src/calibre/ebooks/lit/reader.py index c04a845d69..71e5b081b8 100644 --- a/src/calibre/ebooks/lit/reader.py +++ b/src/calibre/ebooks/lit/reader.py @@ -387,7 +387,7 @@ def preserve(function): class LitReader(object): PIECE_SIZE = 16 XML_PARSER = etree.XMLParser( - remove_blank_text=True, resolve_entities=False) + recover=True, resolve_entities=False) def magic(): @preserve diff --git a/src/calibre/ebooks/lit/stylizer.py b/src/calibre/ebooks/lit/stylizer.py index 97b7e2d91d..1986f6a2ed 100644 --- a/src/calibre/ebooks/lit/stylizer.py +++ b/src/calibre/ebooks/lit/stylizer.py @@ -14,7 +14,8 @@ import cssutils from cssutils.css import CSSStyleRule, CSSPageRule, CSSStyleDeclaration, \ CSSValueList, cssproperties from lxml import etree -from calibre.ebooks.lit.oeb import XHTML_NS, CSS_MIME, OEB_STYLES, barename +from calibre.ebooks.lit.oeb import XHTML_NS, CSS_MIME, OEB_STYLES +from calibre.ebooks.lit.oeb import barename, urlnormalize from calibre.resources import html_css HTML_CSS_STYLESHEET = cssutils.parseString(html_css) @@ -125,7 +126,7 @@ class Stylizer(object): elif tag == 'link' \ and elem.get('rel', 'stylesheet') == 'stylesheet' \ and elem.get('type', CSS_MIME) in OEB_STYLES: - href = elem.attrib['href'] + href = urlnormalize(elem.attrib['href']) path = os.path.join(base, href) path = os.path.normpath(path).replace('\\', '/') if path in self.STYLESHEETS: @@ -275,13 +276,13 @@ class Style(object): if name1 != name2: return False elif item.type == 'id': - name1 = item.value[1:].lower() - name2 = element.attrib.get('id', '').lower().split() + name1 = item.value[1:] + name2 = element.get('id', '') if 
name1 != name2: return False elif item.type == 'class': name = item.value[1:].lower() - classes = element.attrib.get('class', '').lower().split() + classes = element.get('class', '').lower().split() if name not in classes: return False elif item.type == 'child': diff --git a/src/calibre/ebooks/lit/writer.py b/src/calibre/ebooks/lit/writer.py index 62c3877785..e1b6b645d0 100644 --- a/src/calibre/ebooks/lit/writer.py +++ b/src/calibre/ebooks/lit/writer.py @@ -3,7 +3,7 @@ import sys import os from cStringIO import StringIO from struct import pack, unpack -from itertools import izip, count +from itertools import izip, count, chain import time import random import re @@ -15,7 +15,7 @@ from urllib import unquote as urlunquote from lxml import etree from calibre.ebooks.lit.reader import msguid, DirectoryEntry import calibre.ebooks.lit.maps as maps -from calibre.ebooks.lit.oeb import CSS_MIME, OPF_MIME +from calibre.ebooks.lit.oeb import CSS_MIME, OPF_MIME, XML_NS, XML from calibre.ebooks.lit.oeb import namespace, barename, urlnormalize from calibre.ebooks.lit.oeb import Oeb from calibre.ebooks.lit.stylizer import Stylizer @@ -116,6 +116,8 @@ def randbytes(n): return ''.join(chr(random.randint(0, 255)) for x in xrange(n)) class ReBinary(object): + NSRMAP = {'': None, XML_NS: 'xml'} + def __init__(self, root, path, oeb, map=HTML_MAP): self.dir = os.path.dirname(path) self.manifest = oeb.manifest @@ -135,8 +137,11 @@ class ReBinary(object): if isinstance(value, (int, long)): value = unichr(value) self.buf.write(value.encode('utf-8')) - - def tree_to_binary(self, elem, nsrmap={'': None}, parents=[], + + def is_block(self, style): + return style['display'] not in ('inline', 'inline-block') + + def tree_to_binary(self, elem, nsrmap=NSRMAP, parents=[], inhead=False, preserve=False): if not isinstance(elem.tag, basestring): self.write(etree.tostring(elem)) @@ -158,7 +163,7 @@ class ReBinary(object): flags |= FLAG_CLOSING if inhead: flags |= FLAG_HEAD - if style and style['display'] in ('block', 'table'): + if style and self.is_block(style): flags |= FLAG_BLOCK self.write(0, flags) tattrs = self.tattrs[0] @@ -198,24 +203,41 @@ class ReBinary(object): except ValueError: self.write(len(value)+1, value) self.write(0) + old_preserve = preserve + if style: + preserve = (style['white-space'] in ('pre', 'pre-wrap')) + xml_space = elem.get(XML('space')) + if xml_space == 'preserve': + preserve = True + elif xml_space == 'normal': + preserve = False if elem.text: - text = elem.text - if style and style['white-space'] == 'pre': - preserve = True - if elem.get('xml:space') == 'preserve': - preserve = True - if not preserve: - text = COLLAPSE.sub(' ', text) - self.write(text) + if preserve: + self.write(elem.text) + elif len(elem) > 0 or not elem.text.isspace(): + self.write(COLLAPSE.sub(' ', elem.text)) parents.append(tag_offset) - for child in elem: - self.tree_to_binary(child, nsrmap, parents, inhead, preserve) + child = cstyle = nstyle = None + for next in chain(elem, [None]): + if self.stylizer: + nstyle = self.stylizer.style(next) \ + if (next is not None) else None + if child is not None: + if not preserve \ + and (inhead or not nstyle + or self.is_block(cstyle) + or self.is_block(nstyle)) \ + and child.tail and child.tail.isspace(): + child.tail = None + self.tree_to_binary(child, nsrmap, parents, inhead, preserve) + child, cstyle = next, nstyle parents.pop() + preserve = old_preserve if not flags & FLAG_CLOSING: self.write(0, (flags & ~FLAG_OPENING) | FLAG_CLOSING, 0) - if elem.tail: + if elem.tail and tag 
!= 'html': tail = elem.tail - if tag != 'pre': + if not preserve: tail = COLLAPSE.sub(' ', tail) self.write(tail) if style and style['page-break-after'] not in ('avoid', 'auto'): From 475a5eb899ddc6a8a5ce1f63d00c2cdaf0fa7387 Mon Sep 17 00:00:00 2001 From: "Marshall T. Vandegrift" Date: Wed, 10 Dec 2008 08:29:55 -0500 Subject: [PATCH 05/15] Fix bracket-fixup error. --- src/calibre/ebooks/lit/reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/lit/reader.py b/src/calibre/ebooks/lit/reader.py index 71e5b081b8..c4f854ae10 100644 --- a/src/calibre/ebooks/lit/reader.py +++ b/src/calibre/ebooks/lit/reader.py @@ -112,7 +112,7 @@ class UnBinary(object): AMPERSAND_RE = re.compile( r'&(?!(?:#[0-9]+|#x[0-9a-fA-F]+|[a-zA-Z_:][a-zA-Z0-9.-_:]+);)') OPEN_ANGLE_RE = re.compile(r'<<(?![!]--)') - CLOSE_ANGLE_RE = re.compile(r'(?