diff --git a/src/calibre/ebooks/lit/html.css b/src/calibre/ebooks/lit/html.css index 5b75ea6649..9401b19cf2 100644 --- a/src/calibre/ebooks/lit/html.css +++ b/src/calibre/ebooks/lit/html.css @@ -410,7 +410,7 @@ tr:focus, tt:focus, u:focus, ul:focus, var:focus { /* hidden elements */ area, base, basefont, head, meta, script, style, title, -noembed, param { +noembed, param, link { display: none; } @@ -418,3 +418,9 @@ noembed, param { body { page-break-before: always; } + +/* Explicit line-breaks are blocks, sure... */ +br { + display: block; +} + diff --git a/src/calibre/ebooks/lit/oeb.py b/src/calibre/ebooks/lit/oeb.py index d3773a61f1..ae2e6136b7 100644 --- a/src/calibre/ebooks/lit/oeb.py +++ b/src/calibre/ebooks/lit/oeb.py @@ -8,8 +8,8 @@ from urlparse import urldefrag, urlparse, urlunparse from urllib import unquote as urlunquote from lxml import etree -XML_PARSER = etree.XMLParser( - remove_blank_text=True, recover=True, resolve_entities=False) +XML_PARSER = etree.XMLParser(recover=True, resolve_entities=False) +XML_NS = 'http://www.w3.org/XML/1998/namespace' XHTML_NS = 'http://www.w3.org/1999/xhtml' OPF1_NS = 'http://openebook.org/namespaces/oeb-package/1.0/' OPF2_NS = 'http://www.idpf.org/2007/opf' @@ -23,6 +23,7 @@ XPNSMAP = {'h': XHTML_NS, 'o1': OPF1_NS, 'o2': OPF2_NS, 'd09': DC09_NS, 'd10': DC10_NS, 'd11': DC11_NS, 'xsi': XSI_NS, 'dt': DCTERMS_NS, 'ncx': NCX_NS} +def XML(name): return '{%s}%s' % (XML_NS, name) def XHTML(name): return '{%s}%s' % (XHTML_NS, name) def OPF(name): return '{%s}%s' % (OPF2_NS, name) def DC(name): return '{%s}%s' % (DC11_NS, name) diff --git a/src/calibre/ebooks/lit/reader.py b/src/calibre/ebooks/lit/reader.py index c04a845d69..71e5b081b8 100644 --- a/src/calibre/ebooks/lit/reader.py +++ b/src/calibre/ebooks/lit/reader.py @@ -387,7 +387,7 @@ def preserve(function): class LitReader(object): PIECE_SIZE = 16 XML_PARSER = etree.XMLParser( - remove_blank_text=True, resolve_entities=False) + recover=True, resolve_entities=False) def magic(): @preserve diff --git a/src/calibre/ebooks/lit/stylizer.py b/src/calibre/ebooks/lit/stylizer.py index 97b7e2d91d..1986f6a2ed 100644 --- a/src/calibre/ebooks/lit/stylizer.py +++ b/src/calibre/ebooks/lit/stylizer.py @@ -14,7 +14,8 @@ import cssutils from cssutils.css import CSSStyleRule, CSSPageRule, CSSStyleDeclaration, \ CSSValueList, cssproperties from lxml import etree -from calibre.ebooks.lit.oeb import XHTML_NS, CSS_MIME, OEB_STYLES, barename +from calibre.ebooks.lit.oeb import XHTML_NS, CSS_MIME, OEB_STYLES +from calibre.ebooks.lit.oeb import barename, urlnormalize from calibre.resources import html_css HTML_CSS_STYLESHEET = cssutils.parseString(html_css) @@ -125,7 +126,7 @@ class Stylizer(object): elif tag == 'link' \ and elem.get('rel', 'stylesheet') == 'stylesheet' \ and elem.get('type', CSS_MIME) in OEB_STYLES: - href = elem.attrib['href'] + href = urlnormalize(elem.attrib['href']) path = os.path.join(base, href) path = os.path.normpath(path).replace('\\', '/') if path in self.STYLESHEETS: @@ -275,13 +276,13 @@ class Style(object): if name1 != name2: return False elif item.type == 'id': - name1 = item.value[1:].lower() - name2 = element.attrib.get('id', '').lower().split() + name1 = item.value[1:] + name2 = element.get('id', '') if name1 != name2: return False elif item.type == 'class': name = item.value[1:].lower() - classes = element.attrib.get('class', '').lower().split() + classes = element.get('class', '').lower().split() if name not in classes: return False elif item.type == 'child': diff --git a/src/calibre/ebooks/lit/writer.py b/src/calibre/ebooks/lit/writer.py index 62c3877785..e1b6b645d0 100644 --- a/src/calibre/ebooks/lit/writer.py +++ b/src/calibre/ebooks/lit/writer.py @@ -3,7 +3,7 @@ import sys import os from cStringIO import StringIO from struct import pack, unpack -from itertools import izip, count +from itertools import izip, count, chain import time import random import re @@ -15,7 +15,7 @@ from urllib import unquote as urlunquote from lxml import etree from calibre.ebooks.lit.reader import msguid, DirectoryEntry import calibre.ebooks.lit.maps as maps -from calibre.ebooks.lit.oeb import CSS_MIME, OPF_MIME +from calibre.ebooks.lit.oeb import CSS_MIME, OPF_MIME, XML_NS, XML from calibre.ebooks.lit.oeb import namespace, barename, urlnormalize from calibre.ebooks.lit.oeb import Oeb from calibre.ebooks.lit.stylizer import Stylizer @@ -116,6 +116,8 @@ def randbytes(n): return ''.join(chr(random.randint(0, 255)) for x in xrange(n)) class ReBinary(object): + NSRMAP = {'': None, XML_NS: 'xml'} + def __init__(self, root, path, oeb, map=HTML_MAP): self.dir = os.path.dirname(path) self.manifest = oeb.manifest @@ -135,8 +137,11 @@ class ReBinary(object): if isinstance(value, (int, long)): value = unichr(value) self.buf.write(value.encode('utf-8')) - - def tree_to_binary(self, elem, nsrmap={'': None}, parents=[], + + def is_block(self, style): + return style['display'] not in ('inline', 'inline-block') + + def tree_to_binary(self, elem, nsrmap=NSRMAP, parents=[], inhead=False, preserve=False): if not isinstance(elem.tag, basestring): self.write(etree.tostring(elem)) @@ -158,7 +163,7 @@ class ReBinary(object): flags |= FLAG_CLOSING if inhead: flags |= FLAG_HEAD - if style and style['display'] in ('block', 'table'): + if style and self.is_block(style): flags |= FLAG_BLOCK self.write(0, flags) tattrs = self.tattrs[0] @@ -198,24 +203,41 @@ class ReBinary(object): except ValueError: self.write(len(value)+1, value) self.write(0) + old_preserve = preserve + if style: + preserve = (style['white-space'] in ('pre', 'pre-wrap')) + xml_space = elem.get(XML('space')) + if xml_space == 'preserve': + preserve = True + elif xml_space == 'normal': + preserve = False if elem.text: - text = elem.text - if style and style['white-space'] == 'pre': - preserve = True - if elem.get('xml:space') == 'preserve': - preserve = True - if not preserve: - text = COLLAPSE.sub(' ', text) - self.write(text) + if preserve: + self.write(elem.text) + elif len(elem) > 0 or not elem.text.isspace(): + self.write(COLLAPSE.sub(' ', elem.text)) parents.append(tag_offset) - for child in elem: - self.tree_to_binary(child, nsrmap, parents, inhead, preserve) + child = cstyle = nstyle = None + for next in chain(elem, [None]): + if self.stylizer: + nstyle = self.stylizer.style(next) \ + if (next is not None) else None + if child is not None: + if not preserve \ + and (inhead or not nstyle + or self.is_block(cstyle) + or self.is_block(nstyle)) \ + and child.tail and child.tail.isspace(): + child.tail = None + self.tree_to_binary(child, nsrmap, parents, inhead, preserve) + child, cstyle = next, nstyle parents.pop() + preserve = old_preserve if not flags & FLAG_CLOSING: self.write(0, (flags & ~FLAG_OPENING) | FLAG_CLOSING, 0) - if elem.tail: + if elem.tail and tag != 'html': tail = elem.tail - if tag != 'pre': + if not preserve: tail = COLLAPSE.sub(' ', tail) self.write(tail) if style and style['page-break-after'] not in ('avoid', 'auto'):