Implement "ugly-printing" for LIT markup.

2025-12-25 14:27:21 -05:00 · 2008-12-10 00:56:10 -05:00 · 2008-12-10 00:56:10 -05:00 · 210ad8d20a
commit 210ad8d20a
parent 946b91f767
5 changed files with 56 additions and 26 deletions
--- a/src/calibre/ebooks/lit/html.css
+++ b/src/calibre/ebooks/lit/html.css
@@ -410,7 +410,7 @@ tr:focus, tt:focus, u:focus, ul:focus, var:focus {

 /* hidden elements */
 area, base, basefont, head, meta, script, style, title,
-noembed, param {
+noembed, param, link {
   display: none;
 }

@@ -418,3 +418,9 @@ noembed, param {
 body {
  page-break-before: always;
 }
+
+/* Explicit line-breaks are blocks, sure... */
+br {
+  display: block;
+}
+
--- a/src/calibre/ebooks/lit/oeb.py
+++ b/src/calibre/ebooks/lit/oeb.py
@@ -8,8 +8,8 @@ from urlparse import urldefrag, urlparse, urlunparse
 from urllib import unquote as urlunquote
 from lxml import etree

-XML_PARSER = etree.XMLParser(
-    remove_blank_text=True, recover=True, resolve_entities=False)
+XML_PARSER = etree.XMLParser(recover=True, resolve_entities=False)
+XML_NS = 'http://www.w3.org/XML/1998/namespace'
 XHTML_NS = 'http://www.w3.org/1999/xhtml'
 OPF1_NS = 'http://openebook.org/namespaces/oeb-package/1.0/'
 OPF2_NS = 'http://www.idpf.org/2007/opf'
@@ -23,6 +23,7 @@ XPNSMAP = {'h': XHTML_NS, 'o1': OPF1_NS, 'o2': OPF2_NS,
           'd09': DC09_NS, 'd10': DC10_NS, 'd11': DC11_NS,
           'xsi': XSI_NS, 'dt': DCTERMS_NS, 'ncx': NCX_NS}

+def XML(name): return '{%s}%s' % (XML_NS, name)
 def XHTML(name): return '{%s}%s' % (XHTML_NS, name)
 def OPF(name): return '{%s}%s' % (OPF2_NS, name)
 def DC(name): return '{%s}%s' % (DC11_NS, name)
--- a/src/calibre/ebooks/lit/reader.py
+++ b/src/calibre/ebooks/lit/reader.py
@@ -387,7 +387,7 @@ def preserve(function):
 class LitReader(object):
    PIECE_SIZE = 16
    XML_PARSER = etree.XMLParser(
-        remove_blank_text=True, resolve_entities=False)
+        recover=True, resolve_entities=False)

    def magic():
        @preserve
--- a/src/calibre/ebooks/lit/stylizer.py
+++ b/src/calibre/ebooks/lit/stylizer.py
@@ -14,7 +14,8 @@ import cssutils
 from cssutils.css import CSSStyleRule, CSSPageRule, CSSStyleDeclaration, \
    CSSValueList, cssproperties
 from lxml import etree
-from calibre.ebooks.lit.oeb import XHTML_NS, CSS_MIME, OEB_STYLES, barename
+from calibre.ebooks.lit.oeb import XHTML_NS, CSS_MIME, OEB_STYLES
+from calibre.ebooks.lit.oeb import barename, urlnormalize
 from calibre.resources import html_css

 HTML_CSS_STYLESHEET = cssutils.parseString(html_css)
@@ -125,7 +126,7 @@ class Stylizer(object):
            elif tag == 'link' \
                 and elem.get('rel', 'stylesheet') == 'stylesheet' \
                 and elem.get('type', CSS_MIME) in OEB_STYLES:
-                href = elem.attrib['href']
+                href = urlnormalize(elem.attrib['href'])
                path = os.path.join(base, href)
                path = os.path.normpath(path).replace('\\', '/')
                if path in self.STYLESHEETS:
@@ -275,13 +276,13 @@ class Style(object):
                if name1 != name2:
                    return False
            elif item.type == 'id':
-                name1 = item.value[1:].lower()
-                name2 = element.attrib.get('id', '').lower().split()
+                name1 = item.value[1:]
+                name2 = element.get('id', '')
                if name1 != name2:
                    return False
            elif item.type == 'class':
                name = item.value[1:].lower()
-                classes = element.attrib.get('class', '').lower().split()
+                classes = element.get('class', '').lower().split()
                if name not in classes:
                    return False
            elif item.type == 'child':
--- a/src/calibre/ebooks/lit/writer.py
+++ b/src/calibre/ebooks/lit/writer.py
@@ -3,7 +3,7 @@ import sys
 import os
 from cStringIO import StringIO
 from struct import pack, unpack
-from itertools import izip, count
+from itertools import izip, count, chain
 import time
 import random
 import re
@@ -15,7 +15,7 @@ from urllib import unquote as urlunquote
 from lxml import etree
 from calibre.ebooks.lit.reader import msguid, DirectoryEntry
 import calibre.ebooks.lit.maps as maps
-from calibre.ebooks.lit.oeb import CSS_MIME, OPF_MIME
+from calibre.ebooks.lit.oeb import CSS_MIME, OPF_MIME, XML_NS, XML
 from calibre.ebooks.lit.oeb import namespace, barename, urlnormalize
 from calibre.ebooks.lit.oeb import Oeb
 from calibre.ebooks.lit.stylizer import Stylizer
@@ -116,6 +116,8 @@ def randbytes(n):
    return ''.join(chr(random.randint(0, 255)) for x in xrange(n))

 class ReBinary(object):
+    NSRMAP = {'': None, XML_NS: 'xml'}
+    
    def __init__(self, root, path, oeb, map=HTML_MAP):
        self.dir = os.path.dirname(path)
        self.manifest = oeb.manifest
@@ -135,8 +137,11 @@ class ReBinary(object):
            if isinstance(value, (int, long)):
                value = unichr(value)
            self.buf.write(value.encode('utf-8'))
-        
-    def tree_to_binary(self, elem, nsrmap={'': None}, parents=[],
+
+    def is_block(self, style):
+        return style['display'] not in ('inline', 'inline-block')
+            
+    def tree_to_binary(self, elem, nsrmap=NSRMAP, parents=[],
                       inhead=False, preserve=False):
        if not isinstance(elem.tag, basestring):
            self.write(etree.tostring(elem))
@@ -158,7 +163,7 @@ class ReBinary(object):
            flags |= FLAG_CLOSING
        if inhead:
            flags |= FLAG_HEAD
-        if style and style['display'] in ('block', 'table'):
+        if style and self.is_block(style):
            flags |= FLAG_BLOCK
        self.write(0, flags)
        tattrs = self.tattrs[0]
@@ -198,24 +203,41 @@ class ReBinary(object):
            except ValueError:
                self.write(len(value)+1, value)
        self.write(0)
+        old_preserve = preserve
+        if style:
+            preserve = (style['white-space'] in ('pre', 'pre-wrap'))
+        xml_space = elem.get(XML('space'))
+        if xml_space == 'preserve':
+            preserve = True
+        elif xml_space == 'normal':
+            preserve = False
        if elem.text:
-            text = elem.text
-            if style and style['white-space'] == 'pre':
-                preserve = True
-            if elem.get('xml:space') == 'preserve':
-                preserve = True
-            if not preserve:
-                text = COLLAPSE.sub(' ', text)
-            self.write(text)
+            if preserve:
+                self.write(elem.text)
+            elif len(elem) > 0 or not elem.text.isspace():
+                self.write(COLLAPSE.sub(' ', elem.text))
        parents.append(tag_offset)
-        for child in elem:
-            self.tree_to_binary(child, nsrmap, parents, inhead, preserve)
+        child = cstyle = nstyle = None
+        for next in chain(elem, [None]):
+            if self.stylizer:
+                nstyle = self.stylizer.style(next) \
+                    if (next is not None) else None
+            if child is not None:
+                if not preserve \
+                   and (inhead or not nstyle
+                        or self.is_block(cstyle)
+                        or self.is_block(nstyle)) \
+                   and child.tail and child.tail.isspace():
+                    child.tail = None
+                self.tree_to_binary(child, nsrmap, parents, inhead, preserve)
+            child, cstyle = next, nstyle
        parents.pop()
+        preserve = old_preserve
        if not flags & FLAG_CLOSING:
            self.write(0, (flags & ~FLAG_OPENING) | FLAG_CLOSING, 0)
-        if elem.tail:
+        if elem.tail and tag != 'html':
            tail = elem.tail
-            if tag != 'pre':
+            if not preserve:
                tail = COLLAPSE.sub(' ', tail)
            self.write(tail)
        if style and style['page-break-after'] not in ('avoid', 'auto'):