diff --git a/src/calibre/ebooks/lit/html.css b/src/calibre/ebooks/lit/html.css
index 5b75ea6649..9401b19cf2 100644
--- a/src/calibre/ebooks/lit/html.css
+++ b/src/calibre/ebooks/lit/html.css
@@ -410,7 +410,7 @@ tr:focus, tt:focus, u:focus, ul:focus, var:focus {
/* hidden elements */
area, base, basefont, head, meta, script, style, title,
-noembed, param {
+noembed, param, link {
display: none;
}
@@ -418,3 +418,9 @@ noembed, param {
body {
page-break-before: always;
}
+
+/* Treat explicit line breaks (br elements) as block-level. */
+br {
+ display: block;
+}
+
diff --git a/src/calibre/ebooks/lit/oeb.py b/src/calibre/ebooks/lit/oeb.py
index d3773a61f1..ae2e6136b7 100644
--- a/src/calibre/ebooks/lit/oeb.py
+++ b/src/calibre/ebooks/lit/oeb.py
@@ -8,8 +8,8 @@ from urlparse import urldefrag, urlparse, urlunparse
from urllib import unquote as urlunquote
from lxml import etree
-XML_PARSER = etree.XMLParser(
- remove_blank_text=True, recover=True, resolve_entities=False)
+XML_PARSER = etree.XMLParser(recover=True, resolve_entities=False)
+XML_NS = 'http://www.w3.org/XML/1998/namespace'
XHTML_NS = 'http://www.w3.org/1999/xhtml'
OPF1_NS = 'http://openebook.org/namespaces/oeb-package/1.0/'
OPF2_NS = 'http://www.idpf.org/2007/opf'
@@ -23,6 +23,7 @@ XPNSMAP = {'h': XHTML_NS, 'o1': OPF1_NS, 'o2': OPF2_NS,
'd09': DC09_NS, 'd10': DC10_NS, 'd11': DC11_NS,
'xsi': XSI_NS, 'dt': DCTERMS_NS, 'ncx': NCX_NS}
+def XML(name): return '{%s}%s' % (XML_NS, name)
def XHTML(name): return '{%s}%s' % (XHTML_NS, name)
def OPF(name): return '{%s}%s' % (OPF2_NS, name)
def DC(name): return '{%s}%s' % (DC11_NS, name)
diff --git a/src/calibre/ebooks/lit/reader.py b/src/calibre/ebooks/lit/reader.py
index c04a845d69..71e5b081b8 100644
--- a/src/calibre/ebooks/lit/reader.py
+++ b/src/calibre/ebooks/lit/reader.py
@@ -387,7 +387,7 @@ def preserve(function):
class LitReader(object):
PIECE_SIZE = 16
XML_PARSER = etree.XMLParser(
- remove_blank_text=True, resolve_entities=False)
+ recover=True, resolve_entities=False)
def magic():
@preserve
diff --git a/src/calibre/ebooks/lit/stylizer.py b/src/calibre/ebooks/lit/stylizer.py
index 97b7e2d91d..1986f6a2ed 100644
--- a/src/calibre/ebooks/lit/stylizer.py
+++ b/src/calibre/ebooks/lit/stylizer.py
@@ -14,7 +14,8 @@ import cssutils
from cssutils.css import CSSStyleRule, CSSPageRule, CSSStyleDeclaration, \
CSSValueList, cssproperties
from lxml import etree
-from calibre.ebooks.lit.oeb import XHTML_NS, CSS_MIME, OEB_STYLES, barename
+from calibre.ebooks.lit.oeb import XHTML_NS, CSS_MIME, OEB_STYLES
+from calibre.ebooks.lit.oeb import barename, urlnormalize
from calibre.resources import html_css
HTML_CSS_STYLESHEET = cssutils.parseString(html_css)
@@ -125,7 +126,7 @@ class Stylizer(object):
elif tag == 'link' \
and elem.get('rel', 'stylesheet') == 'stylesheet' \
and elem.get('type', CSS_MIME) in OEB_STYLES:
- href = elem.attrib['href']
+ href = urlnormalize(elem.attrib['href'])
path = os.path.join(base, href)
path = os.path.normpath(path).replace('\\', '/')
if path in self.STYLESHEETS:
@@ -275,13 +276,13 @@ class Style(object):
if name1 != name2:
return False
elif item.type == 'id':
- name1 = item.value[1:].lower()
- name2 = element.attrib.get('id', '').lower().split()
+ name1 = item.value[1:]
+ name2 = element.get('id', '')
if name1 != name2:
return False
elif item.type == 'class':
name = item.value[1:].lower()
- classes = element.attrib.get('class', '').lower().split()
+ classes = element.get('class', '').lower().split()
if name not in classes:
return False
elif item.type == 'child':
diff --git a/src/calibre/ebooks/lit/writer.py b/src/calibre/ebooks/lit/writer.py
index 62c3877785..e1b6b645d0 100644
--- a/src/calibre/ebooks/lit/writer.py
+++ b/src/calibre/ebooks/lit/writer.py
@@ -3,7 +3,7 @@ import sys
import os
from cStringIO import StringIO
from struct import pack, unpack
-from itertools import izip, count
+from itertools import izip, count, chain
import time
import random
import re
@@ -15,7 +15,7 @@ from urllib import unquote as urlunquote
from lxml import etree
from calibre.ebooks.lit.reader import msguid, DirectoryEntry
import calibre.ebooks.lit.maps as maps
-from calibre.ebooks.lit.oeb import CSS_MIME, OPF_MIME
+from calibre.ebooks.lit.oeb import CSS_MIME, OPF_MIME, XML_NS, XML
from calibre.ebooks.lit.oeb import namespace, barename, urlnormalize
from calibre.ebooks.lit.oeb import Oeb
from calibre.ebooks.lit.stylizer import Stylizer
@@ -116,6 +116,8 @@ def randbytes(n):
return ''.join(chr(random.randint(0, 255)) for x in xrange(n))
class ReBinary(object):
+ NSRMAP = {'': None, XML_NS: 'xml'}
+
def __init__(self, root, path, oeb, map=HTML_MAP):
self.dir = os.path.dirname(path)
self.manifest = oeb.manifest
@@ -135,8 +137,11 @@ class ReBinary(object):
if isinstance(value, (int, long)):
value = unichr(value)
self.buf.write(value.encode('utf-8'))
-
- def tree_to_binary(self, elem, nsrmap={'': None}, parents=[],
+
+ def is_block(self, style):
+ return style['display'] not in ('inline', 'inline-block')  # every display value other than these two counts as block-level
+
+ def tree_to_binary(self, elem, nsrmap=NSRMAP, parents=[],
inhead=False, preserve=False):
if not isinstance(elem.tag, basestring):
self.write(etree.tostring(elem))
@@ -158,7 +163,7 @@ class ReBinary(object):
flags |= FLAG_CLOSING
if inhead:
flags |= FLAG_HEAD
- if style and style['display'] in ('block', 'table'):
+ if style and self.is_block(style):
flags |= FLAG_BLOCK
self.write(0, flags)
tattrs = self.tattrs[0]
@@ -198,24 +203,41 @@ class ReBinary(object):
except ValueError:
self.write(len(value)+1, value)
self.write(0)
+ old_preserve = preserve  # restored after the children so this element's tail uses the caller's mode
+ if style:
+ preserve = (style['white-space'] in ('pre', 'pre-wrap'))
+ xml_space = elem.get(XML('space'))  # an explicit xml:space attribute overrides the CSS-derived mode
+ if xml_space == 'preserve':
+ preserve = True
+ elif xml_space in ('default', 'normal'):  # 'default' is the XML 1.0 value; 'normal' kept for leniency
+ preserve = False
if elem.text:
- text = elem.text
- if style and style['white-space'] == 'pre':
- preserve = True
- if elem.get('xml:space') == 'preserve':
- preserve = True
- if not preserve:
- text = COLLAPSE.sub(' ', text)
- self.write(text)
+ if preserve:
+ self.write(elem.text)
+ elif len(elem) > 0 or not elem.text.isspace():
+ self.write(COLLAPSE.sub(' ', elem.text))
parents.append(tag_offset)
- for child in elem:
- self.tree_to_binary(child, nsrmap, parents, inhead, preserve)
+ child = cstyle = nstyle = None
+ for next in chain(elem, [None]):
+ if self.stylizer:
+ nstyle = self.stylizer.style(next) \
+ if (next is not None) else None
+ if child is not None:
+ if not preserve \
+ and (inhead or not nstyle
+ or self.is_block(cstyle)
+ or self.is_block(nstyle)) \
+ and child.tail and child.tail.isspace():
+ child.tail = None
+ self.tree_to_binary(child, nsrmap, parents, inhead, preserve)
+ child, cstyle = next, nstyle
parents.pop()
+ preserve = old_preserve
if not flags & FLAG_CLOSING:
self.write(0, (flags & ~FLAG_OPENING) | FLAG_CLOSING, 0)
- if elem.tail:
+ if elem.tail and tag != 'html':
tail = elem.tail
- if tag != 'pre':
+ if not preserve:
tail = COLLAPSE.sub(' ', tail)
self.write(tail)
if style and style['page-break-after'] not in ('avoid', 'auto'):