diff --git a/setup.py b/setup.py index 523b2b1509..457c51342d 100644 --- a/setup.py +++ b/setup.py @@ -146,6 +146,7 @@ if __name__ == '__main__': metadata_sqlite = 'library/metadata_sqlite.sql', jquery = 'gui2/viewer/jquery.js', jquery_scrollTo = 'gui2/viewer/jquery_scrollTo.js', + html_css = 'ebooks/lit/html.css', ) DEST = os.path.join('src', APPNAME, 'resources.py') @@ -373,7 +374,10 @@ if __name__ == '__main__': ext_modules = [ Extension('calibre.plugins.lzx', sources=['src/calibre/utils/lzx/lzxmodule.c', - 'src/calibre/utils/lzx/lzxd.c'], + 'src/calibre/utils/lzx/compressor.c', + 'src/calibre/utils/lzx/lzxd.c', + 'src/calibre/utils/lzx/lzc.c', + 'src/calibre/utils/lzx/lzxc.c'], include_dirs=['src/calibre/utils/lzx']), Extension('calibre.plugins.msdes', diff --git a/src/calibre/ebooks/epub/__init__.py b/src/calibre/ebooks/epub/__init__.py index 7bd6eeab50..f1a60ab646 100644 --- a/src/calibre/ebooks/epub/__init__.py +++ b/src/calibre/ebooks/epub/__init__.py @@ -88,10 +88,10 @@ def initialize_container(path_to_container, opf_name='metadata.opf'): zf.writestr('META-INF/container.xml', CONTAINER) return zf -def config(defaults=None): +def config(defaults=None, name='epub'): desc = _('Options to control the conversion to EPUB') if defaults is None: - c = Config('epub', desc) + c = Config(name, desc) else: c = StringConfig(defaults, desc) diff --git a/src/calibre/ebooks/epub/from_any.py b/src/calibre/ebooks/epub/from_any.py index b5c1f48937..6340180562 100644 --- a/src/calibre/ebooks/epub/from_any.py +++ b/src/calibre/ebooks/epub/from_any.py @@ -148,14 +148,14 @@ def config(defaults=None): def formats(): return ['html', 'rar', 'zip', 'oebzip']+list(MAP.keys()) -def option_parser(): - - return config().option_parser(usage=_('''\ +USAGE = _('''\ %%prog [options] filename -Convert any of a large number of ebook formats to an epub file. Supported formats are: %s -''')%formats() -) +Convert any of a large number of ebook formats to a %s file. 
Supported formats are: %s +''') + +def option_parser(usage=USAGE): + return config().option_parser(usage=usage%('EPUB', formats())) def main(args=sys.argv): parser = option_parser() diff --git a/src/calibre/ebooks/epub/from_html.py b/src/calibre/ebooks/epub/from_html.py index 66a3cebbae..3552a1bf70 100644 --- a/src/calibre/ebooks/epub/from_html.py +++ b/src/calibre/ebooks/epub/from_html.py @@ -64,7 +64,8 @@ def check(opf_path, pretty_print): ''' Find a remove all invalid links in the HTML files ''' - print '\tChecking files for bad links...' + logger = logging.getLogger('html2epub') + logger.info('\tChecking files for bad links...') pathtoopf = os.path.abspath(opf_path) with CurrentDir(os.path.dirname(pathtoopf)): opf = OPF(open(pathtoopf, 'rb'), os.path.dirname(pathtoopf)) diff --git a/src/calibre/ebooks/lit/from_any.py b/src/calibre/ebooks/lit/from_any.py new file mode 100644 index 0000000000..75cfc01bc2 --- /dev/null +++ b/src/calibre/ebooks/lit/from_any.py @@ -0,0 +1,59 @@ +from __future__ import with_statement +__license__ = 'GPL v3' +__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' +__docformat__ = 'restructuredtext en' + +''' +Convert any ebook format to LIT. 
+''' + +import sys, os, glob, logging + +from calibre.ebooks.epub.from_any import any2epub, formats, USAGE +from calibre.ebooks.epub import config as common_config +from calibre.ptempfile import TemporaryDirectory +from calibre.ebooks.lit.writer import oeb2lit + +def config(defaults=None): + c = common_config(defaults=defaults, name='lit') + return c + +def option_parser(usage=USAGE): + return config().option_parser(usage=usage%('LIT', formats())) + +def any2lit(opts, path): + ext = os.path.splitext(path)[1] + if not ext: + raise ValueError('Unknown file type: '+path) + ext = ext.lower()[1:] + + if opts.output is None: + opts.output = os.path.splitext(os.path.basename(path))[0]+'.lit' + + opts.output = os.path.abspath(opts.output) + orig_output = opts.output + + with TemporaryDirectory('_any2lit') as tdir: + oebdir = os.path.join(tdir, 'oeb') + os.mkdir(oebdir) + opts.output = os.path.join(tdir, 'dummy.epub') + opts.extract_to = oebdir + any2epub(opts, path) + opf = glob.glob(os.path.join(oebdir, '*.opf'))[0] + opts.output = orig_output + logging.getLogger('html2epub').info(_('Creating LIT file from EPUB...')) + oeb2lit(opts, opf) + + +def main(args=sys.argv): + parser = option_parser() + opts, args = parser.parse_args(args) + if len(args) < 2: + parser.print_help() + print 'No input file specified.' + return 1 + any2lit(opts, args[1]) + return 0 + +if __name__ == '__main__': + sys.exit(main()) \ No newline at end of file diff --git a/src/calibre/ebooks/lit/html.css b/src/calibre/ebooks/lit/html.css new file mode 100644 index 0000000000..9401b19cf2 --- /dev/null +++ b/src/calibre/ebooks/lit/html.css @@ -0,0 +1,426 @@ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is mozilla.org code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * Blake Ross + * + * Alternatively, the contents of this file may be used under the terms of + * either of the GNU General Public License Version 2 or later (the "GPL"), + * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. 
+ * + * ***** END LICENSE BLOCK ***** */ + +@namespace url(http://www.w3.org/1999/xhtml); /* set default namespace to HTML */ + +/* blocks */ + +html, div, map, dt, isindex, form { + display: block; +} + +body { + display: block; + margin: 8px; +} + +p, dl, multicol { + display: block; + margin: 1em 0; +} + +dd { + display: block; +} + +blockquote { + display: block; + margin: 1em 40px; +} + +address { + display: block; + font-style: italic; +} + +center { + display: block; + text-align: center; +} + +blockquote[type=cite] { + display: block; + margin: 1em 0px; + border-color: blue; + border-width: thin; +} + +span[_moz_quote=true] { + color: blue; +} + +pre[_moz_quote=true] { + color: blue; +} + +h1 { + display: block; + font-size: 2em; + font-weight: bold; + margin: .67em 0; +} + +h2 { + display: block; + font-size: 1.5em; + font-weight: bold; + margin: .83em 0; +} + +h3 { + display: block; + font-size: 1.17em; + font-weight: bold; + margin: 1em 0; +} + +h4 { + display: block; + font-weight: bold; + margin: 1.33em 0; +} + +h5 { + display: block; + font-size: 0.83em; + font-weight: bold; + margin: 1.67em 0; +} + +h6 { + display: block; + font-size: 0.67em; + font-weight: bold; + margin: 2.33em 0; +} + +listing { + display: block; + font-family: monospace; + font-size: medium; + white-space: pre; + margin: 1em 0; +} + +xmp, pre, plaintext { + display: block; + font-family: monospace; + white-space: pre; + margin: 1em 0; +} + +/* tables */ + +table { + display: table; + border-spacing: 2px; + border-collapse: separate; + margin-top: 0; + margin-bottom: 0; + text-indent: 0; +} + +table[align="left"] { + float: left; +} + +table[align="right"] { + float: right; +} + +table[rules]:not([rules="none"]) { + border-collapse: collapse; +} + +/* caption inherits from table not table-outer */ +caption { + display: table-caption; + text-align: center; +} + +table[align="center"] > caption { + margin-left: auto; + margin-right: auto; +} + +table[align="center"] > 
caption[align="left"] { + margin-right: 0; +} + +table[align="center"] > caption[align="right"] { + margin-left: 0; +} + +tr { + display: table-row; + vertical-align: inherit; +} + +col { + display: table-column; +} + +colgroup { + display: table-column-group; +} + +tbody { + display: table-row-group; + vertical-align: middle; +} + +thead { + display: table-header-group; + vertical-align: middle; +} + +tfoot { + display: table-footer-group; + vertical-align: middle; +} + +/* for XHTML tables without tbody */ +table > tr { + vertical-align: middle; +} + +td { + display: table-cell; + vertical-align: inherit; + text-align: inherit; + padding: 1px; +} + +th { + display: table-cell; + vertical-align: inherit; + font-weight: bold; + padding: 1px; +} + +/* inlines */ + +q:before { + content: open-quote; +} + +q:after { + content: close-quote; +} + +b, strong { + font-weight: bolder; +} + +i, cite, em, var, dfn { + font-style: italic; +} + +tt, code, kbd, samp { + font-family: monospace; +} + +u, ins { + text-decoration: underline; +} + +s, strike, del { + text-decoration: line-through; +} + +blink { + text-decoration: blink; +} + +big { + font-size: larger; +} + +small { + font-size: smaller; +} + +sub { + vertical-align: sub; + font-size: smaller; + line-height: normal; +} + +sup { + vertical-align: super; + font-size: smaller; + line-height: normal; +} + +nobr { + white-space: nowrap; +} + +/* titles */ +abbr[title], acronym[title] { + border-bottom: dotted 1px; +} + +/* lists */ + +ul, menu, dir { + display: block; + list-style-type: disc; + margin: 1em 0; +} + +ol { + display: block; + list-style-type: decimal; + margin: 1em 0; +} + +li { + display: list-item; +} + +/* nested lists have no top/bottom margins */ +ul ul, ul ol, ul dir, ul menu, ul dl, +ol ul, ol ol, ol dir, ol menu, ol dl, +dir ul, dir ol, dir dir, dir menu, dir dl, +menu ul, menu ol, menu dir, menu menu, menu dl, +dl ul, dl ol, dl dir, dl menu, dl dl { + margin-top: 0; + margin-bottom: 0; +} + +/* 2 
deep unordered lists use a circle */ +ol ul, ul ul, menu ul, dir ul, +ol menu, ul menu, menu menu, dir menu, +ol dir, ul dir, menu dir, dir dir { + list-style-type: circle; +} + +/* 3 deep (or more) unordered lists use a square */ +ol ol ul, ol ul ul, ol menu ul, ol dir ul, +ol ol menu, ol ul menu, ol menu menu, ol dir menu, +ol ol dir, ol ul dir, ol menu dir, ol dir dir, +ul ol ul, ul ul ul, ul menu ul, ul dir ul, +ul ol menu, ul ul menu, ul menu menu, ul dir menu, +ul ol dir, ul ul dir, ul menu dir, ul dir dir, +menu ol ul, menu ul ul, menu menu ul, menu dir ul, +menu ol menu, menu ul menu, menu menu menu, menu dir menu, +menu ol dir, menu ul dir, menu menu dir, menu dir dir, +dir ol ul, dir ul ul, dir menu ul, dir dir ul, +dir ol menu, dir ul menu, dir menu menu, dir dir menu, +dir ol dir, dir ul dir, dir menu dir, dir dir dir { + list-style-type: square; +} + + +/* leafs */ + +/*
noshade and color attributes are handled completely by + * the nsHTMLHRElement attribute mapping code + */ +hr { + display: block; + height: 2px; + border: 1px inset; + margin: 0.5em auto 0.5em auto; + color: gray; +} + +hr[size="1"] { + border-style: solid none none none; +} + +img[usemap], object[usemap] { + color: blue; +} + +frameset { + display: block ! important; + position: static ! important; + float: none ! important; + border: none ! important; +} + +frame { + border: none ! important; +} + +iframe { + border: 2px inset; +} + +noframes { + display: none; +} + +spacer { + position: static ! important; + float: none ! important; +} + +/* focusable content: anything w/ tabindex >=0 is focusable */ +abbr:focus, acronym:focus, address:focus, applet:focus, b:focus, +base:focus, big:focus, blockquote:focus, br:focus, canvas:focus, caption:focus, +center:focus, cite:focus, code:focus, col:focus, colgroup:focus, dd:focus, +del:focus, dfn:focus, dir:focus, div:focus, dl:focus, dt:focus, em:focus, +fieldset:focus, font:focus, form:focus, h1:focus, h2:focus, h3:focus, h4:focus, +h5:focus, h6:focus, hr:focus, i:focus, img:focus, ins:focus, +kbd:focus, label:focus, legend:focus, li:focus, link:focus, menu:focus, +object:focus, ol:focus, p:focus, pre:focus, q:focus, s:focus, samp:focus, +small:focus, span:focus, strike:focus, strong:focus, sub:focus, sup:focus, +table:focus, tbody:focus, td:focus, tfoot:focus, th:focus, thead:focus, +tr:focus, tt:focus, u:focus, ul:focus, var:focus { + /* Don't specify the outline-color, we should always use initial value. */ + outline: 1px dotted; +} + +/* hidden elements */ +area, base, basefont, head, meta, script, style, title, +noembed, param, link { + display: none; +} + +/* Page breaks at body tags, to help out with LIT-generation */ +body { + page-break-before: always; +} + +/* Explicit line-breaks are blocks, sure... 
*/ +br { + display: block; +} + diff --git a/src/calibre/ebooks/lit/lzx.py b/src/calibre/ebooks/lit/lzx.py new file mode 100644 index 0000000000..f91f3871b7 --- /dev/null +++ b/src/calibre/ebooks/lit/lzx.py @@ -0,0 +1,27 @@ +''' +LZX compression/decompression wrapper. +''' +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2008, Marshall T. Vandegrift ' + +import sys +from calibre import plugins +_lzx, LZXError = plugins['lzx'] + +__all__ = ['Compressor', 'Decompressor', 'LZXError'] + +Compressor = _lzx.Compressor + +class Decompressor(object): + def __init__(self, wbits): + self.wbits = wbits + self.blocksize = 1 << wbits + _lzx.init(wbits) + + def decompress(self, data, outlen): + return _lzx.decompress(data, outlen) + + def reset(self): + return _lzx.reset() diff --git a/src/calibre/ebooks/lit/mssha1.py b/src/calibre/ebooks/lit/mssha1.py index 1708c8dd8b..29261e7313 100644 --- a/src/calibre/ebooks/lit/mssha1.py +++ b/src/calibre/ebooks/lit/mssha1.py @@ -4,6 +4,9 @@ Modified version of SHA-1 used in Microsoft LIT files. Adapted from the PyPy pure-Python SHA-1 implementation. """ +__license__ = 'GPL v3' +__copyright__ = '2008, Marshall T. Vandegrift ' + import struct, copy # ====================================================================== diff --git a/src/calibre/ebooks/lit/oeb.py b/src/calibre/ebooks/lit/oeb.py new file mode 100644 index 0000000000..6378c99219 --- /dev/null +++ b/src/calibre/ebooks/lit/oeb.py @@ -0,0 +1,737 @@ +''' +Basic support for manipulating OEB 1.x/2.0 content and metadata. +''' +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2008, Marshall T. 
Vandegrift ' + +import os +import sys +from collections import defaultdict +from types import StringTypes +from itertools import izip, count +from urlparse import urldefrag, urlparse, urlunparse +from urllib import unquote as urlunquote +from lxml import etree + +XML_PARSER = etree.XMLParser(recover=True, resolve_entities=False) +XML_NS = 'http://www.w3.org/XML/1998/namespace' +XHTML_NS = 'http://www.w3.org/1999/xhtml' +OPF1_NS = 'http://openebook.org/namespaces/oeb-package/1.0/' +OPF2_NS = 'http://www.idpf.org/2007/opf' +DC09_NS = 'http://purl.org/metadata/dublin_core' +DC10_NS = 'http://purl.org/dc/elements/1.0/' +DC11_NS = 'http://purl.org/dc/elements/1.1/' +XSI_NS = 'http://www.w3.org/2001/XMLSchema-instance' +DCTERMS_NS = 'http://purl.org/dc/terms/' +NCX_NS = 'http://www.daisy.org/z3986/2005/ncx/' +XPNSMAP = {'h': XHTML_NS, 'o1': OPF1_NS, 'o2': OPF2_NS, + 'd09': DC09_NS, 'd10': DC10_NS, 'd11': DC11_NS, + 'xsi': XSI_NS, 'dt': DCTERMS_NS, 'ncx': NCX_NS} + +def XML(name): return '{%s}%s' % (XML_NS, name) +def XHTML(name): return '{%s}%s' % (XHTML_NS, name) +def OPF(name): return '{%s}%s' % (OPF2_NS, name) +def DC(name): return '{%s}%s' % (DC11_NS, name) +def NCX(name): return '{%s}%s' % (NCX_NS, name) + +XHTML_MIME = 'application/xhtml+xml' +CSS_MIME = 'text/css' +NCX_MIME = 'application/x-dtbncx+xml' +OPF_MIME = 'application/oebps-package+xml' +OEB_DOC_MIME = 'text/x-oeb1-document' +OEB_CSS_MIME = 'text/x-oeb1-css' + +OEB_STYLES = set([CSS_MIME, OEB_CSS_MIME, 'text/x-oeb-css']) +OEB_DOCS = set([XHTML_MIME, 'text/html', OEB_DOC_MIME, 'text/x-oeb-document']) + + +def element(parent, *args, **kwargs): + if parent is not None: + return etree.SubElement(parent, *args, **kwargs) + return etree.Element(*args, **kwargs) + +def namespace(name): + if '}' in name: + return name.split('}', 1)[0][1:] + return '' + +def barename(name): + if '}' in name: + return name.split('}', 1)[1] + return name + +def xpath(elem, expr): + return elem.xpath(expr, namespaces=XPNSMAP) + 
# Characters that urlquote() percent-encodes.
URL_UNSAFE = r"""`!@#$%^&*[](){}?+=;:'",<>\| """


def urlquote(href):
    '''
    Percent-encode every character of `href` that occurs in URL_UNSAFE.

    Each such character is replaced by ``%`` followed by the lower-case,
    zero-padded hexadecimal value of ``ord(char)``; every other character is
    copied through unchanged.  Returns the re-joined string.
    '''
    result = []
    for char in href:
        if char in URL_UNSAFE:
            char = "%%%02x" % ord(char)
        result.append(char)
    return ''.join(result)


def urlnormalize(href):
    '''
    Return `href` in the normalised form used as a key throughout this module.

    The URL is split with urlparse(); in every component backslashes become
    forward slashes, existing percent-escapes are decoded and the result is
    re-encoded with urlquote(); the components are then reassembled with
    urlunparse().
    '''
    parts = urlparse(href)
    parts = (part.replace('\\', '/') for part in parts)
    parts = (urlunquote(part) for part in parts)
    parts = (urlquote(part) for part in parts)
    return urlunparse(parts)


class AbstractContainer(object):
    '''
    Base class for containers; subclasses supply ``read(path)``.
    '''

    def read_xml(self, path):
        '''
        Read `path` via ``self.read`` and parse the bytes with XML_PARSER.

        The directory part of `path` is handed to lxml as the base URL.
        '''
        return etree.fromstring(
            self.read(path), parser=XML_PARSER,
            base_url=os.path.dirname(path))


class DirContainer(AbstractContainer):
    '''
    Container backed by a directory on the local filesystem.
    '''

    def __init__(self, rootdir):
        self.rootdir = rootdir

    def _fspath(self, path):
        '''
        Map a container-relative, possibly percent-encoded `path` to a
        filesystem path below ``self.rootdir``.
        '''
        # Only the container-relative part is URL-encoded.  Unquoting the
        # joined path (as was done previously) would also rewrite any literal
        # '%xx' sequence that happens to occur in rootdir itself.
        return os.path.join(self.rootdir, urlunquote(path))

    def read(self, path):
        '''Return the raw contents of the file at `path`.'''
        with open(self._fspath(path), 'rb') as f:
            return f.read()

    def write(self, path, data):
        '''Write `data` to the file at `path`; returns what ``f.write`` returns.'''
        with open(self._fspath(path), 'wb') as f:
            return f.write(data)


class Metadata(object):
    '''
    Collection of metadata items, grouped in lists keyed by the bare
    (namespace-less) term name.
    '''
    # Bare names that Item.__init__ promotes into the DC 1.1 namespace.
    TERMS = set(['contributor', 'coverage', 'creator', 'date', 'description',
                 'format', 'identifier', 'language', 'publisher', 'relation',
                 'rights', 'source', 'subject', 'title', 'type'])
    OPF1_NSMAP = {'dc': DC11_NS, 'oebpackage': OPF1_NS}
    OPF2_NSMAP = {'opf': OPF2_NS, 'dc': DC11_NS, 'dcterms': DCTERMS_NS,
                  'xsi': XSI_NS}

    class Item(object):
        '''
        A single metadata entry: a term, its value and its attributes.

        ``fq_attrib`` keeps the attribute names as given (possibly
        namespace-qualified); ``attrib`` holds the same values keyed by bare
        name and is what attribute-style access (``item.file_as``) consults.
        '''

        def __init__(self, term, value, fq_attrib=None):
            if fq_attrib is None:
                fq_attrib = {}
            self.fq_attrib = dict(fq_attrib)
            if term == OPF('meta') and not value:
                # <meta name="..." content="..."/>: the real term and value
                # live in the attributes.
                term = self.fq_attrib.pop('name')
                value = self.fq_attrib.pop('content')
            elif term in Metadata.TERMS and not namespace(term):
                term = DC(term)
            self.term = term
            self.value = value
            self.attrib = attrib = {}
            # NOTE: built from the mapping as passed in, so for <meta> items
            # it still carries 'name'/'content' even though those were popped
            # from self.fq_attrib above.
            for fq_attr in fq_attrib:
                attr = barename(fq_attr)
                attrib[attr] = fq_attrib[fq_attr]

        def __getattr__(self, name):
            '''
            Expose entries of ``self.attrib`` as attributes, with underscores
            in `name` standing for hyphens.  Raises AttributeError when there
            is no such entry.
            '''
            if name == 'attrib':
                # Only reached when self.attrib has not been set yet; looking
                # it up below would recurse without bound.
                raise AttributeError(name)
            name = name.replace('_', '-')
            try:
                return self.attrib[name]
            except KeyError:
                raise AttributeError(
                    '%r object has no attribute %r' \
                        % (self.__class__.__name__, name))

        def __repr__(self):
            return 'Item(term=%r, value=%r, attrib=%r)' \
                % (barename(self.term), self.value, self.attrib)

        def __str__(self):
            return str(self.value)

        def __unicode__(self):
            return unicode(self.value)

        def to_opf1(self, dcmeta=None, xmeta=None):
            '''
            Serialise as an OPF 1.x element and return it.

            Terms in the DC 1.1 namespace become a title-cased DC element
            under `dcmeta`; everything else becomes a ``meta`` element with
            ``name``/``content`` attributes under `xmeta`.
            '''
            if namespace(self.term) == DC11_NS:
                name = DC(barename(self.term).title())
                elem = element(dcmeta, name, attrib=self.attrib)
                elem.text = self.value
            else:
                elem = element(xmeta, 'meta', attrib=self.attrib)
                elem.attrib['name'] = self.term
                elem.attrib['content'] = self.value
            return elem

        def to_opf2(self, parent=None):
            '''
            Serialise as an OPF 2.0 element under `parent` and return it.

            Terms in the DC 1.1 namespace keep their own tag; everything else
            becomes an OPF ``meta`` element with ``name``/``content``.
            '''
            if namespace(self.term) == DC11_NS:
                elem = element(parent, self.term, attrib=self.fq_attrib)
                elem.text = self.value
            else:
                elem = element(parent, OPF('meta'), attrib=self.fq_attrib)
                elem.attrib['name'] = self.term
                elem.attrib['content'] = self.value
            return elem

    def __init__(self, oeb):
        self.oeb = oeb
        self.items = defaultdict(list)

    def add(self, term, value, attrib=None):
        '''
        Create an Item from the arguments, file it under the bare name of its
        (possibly rewritten) term and return it.
        '''
        item = self.Item(term, value, attrib)
        items = self.items[barename(item.term)]
        items.append(item)
        return item

    def iterkeys(self):
        for key in self.items:
            yield key
    __iter__ = iterkeys

    def __getitem__(self, key):
        return self.items[key]

    def __contains__(self, key):
        return key in self.items

    def __getattr__(self, term):
        '''
        Attribute-style access to the list of items for `term`
        (``metadata.title`` is ``metadata['title']``).
        '''
        # 'items' only reaches __getattr__ when it has not been assigned yet,
        # in which case the lookup below would recurse forever.  Private and
        # dunder probes (copy, pickle, hasattr checks) must fail normally
        # rather than receive an empty list and leave a junk key behind.
        if term == 'items' or term.startswith('_'):
            raise AttributeError(term)
        return self.items[term]

    def to_opf1(self, parent=None):
        '''
        Build and return the OPF 1.x ``metadata`` element (with its
        ``dc-metadata`` and ``x-metadata`` children) under `parent`.

        An ``ms-chaptertour`` entry is emitted if none is present.
        '''
        elem = element(parent, 'metadata')
        dcmeta = element(elem, 'dc-metadata', nsmap=self.OPF1_NSMAP)
        xmeta = element(elem, 'x-metadata')
        for term in self.items:
            for item in self.items[term]:
                item.to_opf1(dcmeta, xmeta)
        if 'ms-chaptertour' not in self.items:
            chaptertour = self.Item('ms-chaptertour', 'chaptertour')
            chaptertour.to_opf1(dcmeta, xmeta)
        return elem

    def to_opf2(self, parent=None):
        '''
        Build and return the OPF 2.0 ``metadata`` element under `parent`.
        '''
        elem = element(parent, OPF('metadata'), nsmap=self.OPF2_NSMAP)
        for term in self.items:
            for item in self.items[term]:
                item.to_opf2(elem)
        return elem

+class Manifest(object): + class Item(object): + def __init__(self, id, href, media_type, fallback=None, loader=str): + self.id = id + self.href = self.path = urlnormalize(href) + self.media_type = media_type + self.fallback = fallback + self.spine_position = None + self.linear = True + self._loader = loader + self._data = None + + def __repr__(self): + return 'Item(id=%r, href=%r, media_type=%r)' \ + % (self.id, self.href, self.media_type) + + def data(): + def fget(self): + if self._data: + return self._data + data = self._loader(self.href) + if self.media_type == XHTML_MIME: + data = etree.fromstring(data, parser=XML_PARSER) + if namespace(data.tag) != XHTML_NS: + data.attrib['xmlns'] = XHTML_NS + data = etree.tostring(data) + data = etree.fromstring(data, parser=XML_PARSER) + elif self.media_type.startswith('application/') \ + and self.media_type.endswith('+xml'): + data = etree.fromstring(data, parser=XML_PARSER) + return data + def fset(self, value): + self._data = value + def fdel(self): + self._data = None + return property(fget, fset, fdel) + data = data() + + def __cmp__(self, other): + result = cmp(self.spine_position, other.spine_position) + if result != 0: + return result + return cmp(self.id, other.id) + + def __init__(self, oeb): + self.oeb = oeb + self.items = {} + self.hrefs = {} + + def add(self, id, href, media_type, fallback=None): + item = self.Item( + id, href, media_type, fallback, self.oeb.container.read) + self.items[item.id] = item + self.hrefs[item.href] = item + return item + + def remove(self, id): + href = self.items[id].href + del self.items[id] + del self.hrefs[href] + + def __iter__(self): + for id in self.items: + yield id + + def __getitem__(self, id): + return self.items[id] + + def values(self): + for item in self.items.values(): + yield item + + def items(self): + for id, item in self.refs.items(): + yield id, items + + def __contains__(self, key): + return id in self.items + + def to_opf1(self, parent=None): + elem = 
element(parent, 'manifest') + for item in self.items.values(): + media_type = item.media_type + if media_type == XHTML_MIME: + media_type = OEB_DOC_MIME + elif media_type == CSS_MIME: + media_type = OEB_CSS_MIME + attrib = {'id': item.id, 'href': item.href, + 'media-type': media_type} + if item.fallback: + attrib['fallback'] = item.fallback + element(elem, 'item', attrib=attrib) + return elem + + def to_opf2(self, parent=None): + elem = element(parent, OPF('manifest')) + for item in self.items.values(): + attrib = {'id': item.id, 'href': item.href, + 'media-type': item.media_type} + if item.fallback: + attrib['fallback'] = item.fallback + element(elem, OPF('item'), attrib=attrib) + return elem + + +class Spine(object): + def __init__(self, oeb): + self.oeb = oeb + self.items = [] + + def add(self, item, linear): + if isinstance(linear, StringTypes): + linear = linear.lower() + if linear is None or linear in ('yes', 'true'): + linear = True + elif linear in ('no', 'false'): + linear = False + item.linear = linear + item.spine_position = len(self.items) + self.items.append(item) + return item + + def __iter__(self): + for item in self.items: + yield item + + def __getitem__(self, index): + return self.items[index] + + def __len__(self): + return len(self.items) + + def __contains__(self, item): + return (item in self.items) + + def to_opf1(self, parent=None): + elem = element(parent, 'spine') + for item in self.items: + if item.linear: + element(elem, 'itemref', attrib={'idref': item.id}) + return elem + + def to_opf2(self, parent=None): + elem = element(parent, OPF('spine')) + for item in self.items: + attrib = {'idref': item.id} + if not item.linear: + attrib['linear'] = 'no' + element(elem, OPF('itemref'), attrib=attrib) + return elem + + +class Guide(object): + class Reference(object): + def __init__(self, type, title, href): + self.type = type + self.title = title + self.href = urlnormalize(href) + + def __repr__(self): + return 'Reference(type=%r, title=%r, 
href=%r)' \ + % (self.type, self.title, self.href) + + def __init__(self, oeb): + self.oeb = oeb + self.refs = {} + + def add(self, type, title, href): + ref = self.Reference(type, title, href) + self.refs[type] = ref + return ref + + def by_type(self, type): + return self.ref_types[type] + + def iterkeys(self): + for type in self.refs: + yield type + __iter__ = iterkeys + + def values(self): + for ref in self.refs.values(): + yield ref + + def items(self): + for type, ref in self.refs.items(): + yield type, ref + + def __getitem__(self, index): + return self.refs[index] + + def __contains__(self, key): + return key in self.refs + + def to_opf1(self, parent=None): + elem = element(parent, 'guide') + for ref in self.refs.values(): + attrib = {'type': ref.type, 'href': ref.href} + if ref.title: + attrib['title'] = ref.title + element(elem, 'reference', attrib=attrib) + return elem + + def to_opf2(self, parent=None): + elem = element(parent, OPF('guide')) + for ref in self.refs.values(): + attrib = {'type': ref.type, 'href': ref.href} + if ref.title: + attrib['title'] = ref.title + element(elem, OPF('reference'), attrib=attrib) + return elem + + +class TOC(object): + def __init__(self, title=None, href=None, klass=None, id=None): + self.title = title + self.href = urlnormalize(href) if href else href + self.klass = klass + self.id = id + self.nodes = [] + + def add(self, title, href, klass=None, id=None): + node = TOC(title, href, klass, id) + self.nodes.append(node) + return node + + def __iter__(self): + for node in self.nodes: + yield node + + def __getitem__(self, index): + return self.nodes[index] + + def depth(self, level=0): + if self.nodes: + return self.nodes[0].depth(level+1) + return level + + def to_opf1(self, tour): + for node in self.nodes: + element(tour, 'site', attrib={ + 'title': node.title, 'href': node.href}) + node.to_opf1(tour) + return tour + + def to_ncx(self, parent, playorder=None, depth=1): + if not playorder: playorder = [0] + for node in 
self.nodes: + playorder[0] += 1 + point = etree.SubElement(parent, + NCX('navPoint'), attrib={'playOrder': str(playorder[0])}) + if self.klass: + point.attrib['class'] = node.klass + if self.id: + point.attrib['id'] = node.id + label = etree.SubElement(point, NCX('navLabel')) + etree.SubElement(label, NCX('text')).text = node.title + href = node.href if depth > 1 else urldefrag(node.href)[0] + child = etree.SubElement(point, + NCX('content'), attrib={'src': href}) + node.to_ncx(point, playorder, depth+1) + return parent + + +class OEBBook(object): + def __init__(self, opfpath, container=None): + if not container: + container = DirContainer(os.path.dirname(opfpath)) + opfpath = os.path.basename(opfpath) + self.container = container + opf = self._read_opf(opfpath) + self._all_from_opf(opf) + + def _convert_opf1(self, opf): + nroot = etree.Element(OPF('package'), + nsmap={None: OPF2_NS}, version="2.0", **dict(opf.attrib)) + metadata = etree.SubElement(nroot, OPF('metadata'), + nsmap={'opf': OPF2_NS, 'dc': DC11_NS, + 'xsi': XSI_NS, 'dcterms': DCTERMS_NS}) + for prefix in ('d11', 'd10', 'd09'): + elements = xpath(opf, 'metadata/dc-metadata/%s:*' % prefix) + if elements: break + for element in elements: + if not element.text: continue + tag = barename(element.tag).lower() + element.tag = '{%s}%s' % (DC11_NS, tag) + for name in element.attrib: + if name in ('role', 'file-as', 'scheme'): + nsname = '{%s}%s' % (OPF2_NS, name) + element.attrib[nsname] = element.attrib[name] + del element.attrib[name] + metadata.append(element) + for element in opf.xpath('metadata/x-metadata/meta'): + metadata.append(element) + for item in opf.xpath('manifest/item'): + media_type = item.attrib['media-type'].lower() + if media_type in OEB_DOCS: + media_type = XHTML_MIME + elif media_type in OEB_STYLES: + media_type = CSS_MIME + item.attrib['media-type'] = media_type + for tag in ('manifest', 'spine', 'tours', 'guide'): + for element in opf.xpath(tag): + nroot.append(element) + return 
etree.fromstring(etree.tostring(nroot), parser=XML_PARSER) + + def _read_opf(self, opfpath): + opf = self.container.read_xml(opfpath) + version = float(opf.get('version', 1.0)) + if version < 2.0: + opf = self._convert_opf1(opf) + return opf + + def _metadata_from_opf(self, opf): + uid = opf.attrib['unique-identifier'] + self.metadata = metadata = Metadata(self) + for elem in xpath(opf, '/o2:package/o2:metadata/*'): + if elem.text or elem.attrib: + metadata.add(elem.tag, elem.text, elem.attrib) + for item in metadata.identifier: + if item.id == uid: + self.uid = item + break + + def _manifest_from_opf(self, opf): + self.manifest = manifest = Manifest(self) + for elem in xpath(opf, '/o2:package/o2:manifest/o2:item'): + manifest.add(elem.get('id'), elem.get('href'), + elem.get('media-type'), elem.get('fallback')) + + def _spine_from_opf(self, opf): + self.spine = spine = Spine(self) + for elem in xpath(opf, '/o2:package/o2:spine/o2:itemref'): + item = self.manifest[elem.get('idref')] + spine.add(item, elem.get('linear')) + extras = [] + for item in self.manifest.values(): + if item.media_type == XHTML_MIME \ + and item not in spine: + extras.append(item) + extras.sort() + for item in extras: + spine.add(item, False) + + def _guide_from_opf(self, opf): + self.guide = guide = Guide(self) + for elem in xpath(opf, '/o2:package/o2:guide/o2:reference'): + guide.add(elem.get('type'), elem.get('title'), elem.get('href')) + + def _toc_from_navpoint(self, toc, navpoint): + children = xpath(navpoint, 'ncx:navPoint') + for child in children: + title = ''.join(xpath(child, 'ncx:navLabel/ncx:text/text()')) + href = xpath(child, 'ncx:content/@src')[0] + id = child.get('id') + klass = child.get('class') + node = toc.add(title, href, id=id, klass=klass) + self._toc_from_navpoint(node, child) + + def _toc_from_ncx(self, opf): + result = xpath(opf, '/o2:package/o2:spine/@toc') + if not result: + return False + id = result[0] + ncx = self.manifest[id].data + self.manifest.remove(id) + 
title = xpath(ncx, 'ncx:docTitle/ncx:text/text()')[0] + self.toc = toc = TOC(title) + navmaps = xpath(ncx, 'ncx:navMap') + for navmap in navmaps: + self._toc_from_navpoint(toc, navmap) + return True + + def _toc_from_tour(self, opf): + result = xpath(opf, '/o2:package/o2:tours/o2:tour') + if not result: + return False + tour = result[0] + self.toc = toc = TOC(tour.get('title')) + sites = xpath(tour, 'o2:site') + for site in sites: + toc.add(site.get('title'), site.get('href')) + return True + + def _toc_from_html(self, opf): + if 'toc' not in self.guide: + return False + self.toc = toc = TOC() + itempath, frag = urldefrag(self.guide['toc'].href) + item = self.manifest.hrefs[itempath] + html = item.data + if frag: + elems = xpath(html, './/*[@id="%s"]' % frag) + if not elems: + elems = xpath(html, './/*[@name="%s"]' % frag) + elem = elems[0] if elems else html + while elem != html and not xpath(elem, './/h:a[@href]'): + elem = elem.getparent() + html = elem + titles = defaultdict(list) + order = [] + for anchor in xpath(html, './/h:a[@href]'): + href = anchor.attrib['href'] + path, frag = urldefrag(href) + if not path: + href = '#'.join((itempath, frag)) + title = ' '.join(xpath(anchor, './/text()')) + href = urlnormalize(href) + if href not in titles: + order.append(href) + titles[href].append(title) + for href in order: + toc.add(' '.join(titles[href]), href) + return True + + def _toc_from_spine(self, opf): + self.toc = toc = TOC() + titles = [] + headers = [] + for item in self.spine: + if not item.linear: continue + html = item.data + title = xpath(html, '/h:html/h:head/h:title/text()') + if title: titles.append(title[0]) + headers.append('(unlabled)') + for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'): + expr = '/h:html/h:body//h:%s[position()=1]/text()' % (tag,) + header = xpath(html, expr) + if header: + headers[-1] = header[0] + break + use = titles + if len(titles) > len(set(titles)): + use = headers + for title, item in izip(use, self.spine): + if not 
item.linear: continue + toc.add(title, item.href) + return True + + def _toc_from_opf(self, opf): + if self._toc_from_ncx(opf): return + if self._toc_from_tour(opf): return + if self._toc_from_html(opf): return + self._toc_from_spine(opf) + + def _all_from_opf(self, opf): + self._metadata_from_opf(opf) + self._manifest_from_opf(opf) + self._spine_from_opf(opf) + self._guide_from_opf(opf) + self._toc_from_opf(opf) + + def to_opf1(self): + package = etree.Element('package', + attrib={'unique-identifier': self.uid.id}) + metadata = self.metadata.to_opf1(package) + manifest = self.manifest.to_opf1(package) + spine = self.spine.to_opf1(package) + tours = element(package, 'tours') + tour = element(tours, 'tour', + attrib={'id': 'chaptertour', 'title': 'Chapter Tour'}) + self.toc.to_opf1(tour) + guide = self.guide.to_opf1(package) + return {OPF_MIME: ('content.opf', package)} + + def _generate_ncx_item(self): + id = 'ncx' + index = 0 + while id in self.manifest: + id = 'ncx' + str(index) + index = index + 1 + href = 'toc' + index = 0 + while (href + '.ncx') in self.manifest.hrefs: + href = 'toc' + str(index) + href += '.ncx' + return (id, href) + + def _to_ncx(self): + ncx = etree.Element(NCX('ncx'), attrib={'version': '2005-1'}, + nsmap={None: NCX_NS}) + head = etree.SubElement(ncx, NCX('head')) + etree.SubElement(head, NCX('meta'), + attrib={'name': 'dtb:uid', 'content': unicode(self.uid)}) + etree.SubElement(head, NCX('meta'), + attrib={'name': 'dtb:depth', 'content': str(self.toc.depth())}) + etree.SubElement(head, NCX('meta'), + attrib={'name': 'dtb:totalPageCount', 'content': '0'}) + etree.SubElement(head, NCX('meta'), + attrib={'name': 'dtb:maxPageNumber', 'content': '0'}) + title = etree.SubElement(ncx, NCX('docTitle')) + text = etree.SubElement(title, NCX('text')) + text.text = unicode(self.metadata.title[0]) + navmap = etree.SubElement(ncx, NCX('navMap')) + self.toc.to_ncx(navmap) + return ncx + + def to_opf2(self): + package = etree.Element(OPF('package'), + 
attrib={'version': '2.0', 'unique-identifier': self.uid.id}, + nsmap={None: OPF2_NS}) + metadata = self.metadata.to_opf2(package) + manifest = self.manifest.to_opf2(package) + id, href = self._generate_ncx_item() + etree.SubElement(manifest, OPF('item'), + attrib={'id': id, 'href': href, 'media-type': NCX_MIME}) + spine = self.spine.to_opf2(package) + spine.attrib['toc'] = id + guide = self.guide.to_opf2(package) + ncx = self._to_ncx() + return {OPF_MIME: ('content.opf', package), + NCX_MIME: (href, ncx)} + + +def main(argv=sys.argv): + for arg in argv[1:]: + oeb = OEBBook(arg) + for name, doc in oeb.to_opf1().values(): + print etree.tostring(doc, pretty_print=True) + for name, doc in oeb.to_opf2().values(): + print etree.tostring(doc, pretty_print=True) + return 0 + +if __name__ == '__main__': + sys.exit(main()) diff --git a/src/calibre/ebooks/lit/reader.py b/src/calibre/ebooks/lit/reader.py index 671e48ab76..90df14e2c0 100644 --- a/src/calibre/ebooks/lit/reader.py +++ b/src/calibre/ebooks/lit/reader.py @@ -10,10 +10,12 @@ __copyright__ = '2008, Kovid Goyal ' \ import sys, struct, cStringIO, os import functools import re +from urlparse import urldefrag from lxml import etree from calibre.ebooks.lit import LitError from calibre.ebooks.lit.maps import OPF_MAP, HTML_MAP import calibre.ebooks.lit.mssha1 as mssha1 +from calibre.ebooks.lit.oeb import urlnormalize from calibre.ebooks import DRMError from calibre import plugins lzx, lxzerror = plugins['lzx'] @@ -110,7 +112,7 @@ class UnBinary(object): AMPERSAND_RE = re.compile( r'&(?!(?:#[0-9]+|#x[0-9a-fA-F]+|[a-zA-Z_:][a-zA-Z0-9.-_:]+);)') OPEN_ANGLE_RE = re.compile(r'<<(?![!]--)') - CLOSE_ANGLE_RE = re.compile(r'(?