From 29486d653e262f4174bcfb0a1189e6490166fd68 Mon Sep 17 00:00:00 2001
From: "Marshall T. Vandegrift"
Date: Wed, 18 Mar 2009 19:51:35 -0400
Subject: [PATCH] Convert OEBBook to store cssutils-parsed CSS.
---
src/calibre/ebooks/lit/writer.py | 4 +-
src/calibre/ebooks/oeb/base.py | 58 ++++++++++++++-----
src/calibre/ebooks/oeb/factory.py | 7 ++-
src/calibre/ebooks/oeb/reader.py | 2 +-
src/calibre/ebooks/oeb/stylizer.py | 26 ++++-----
.../ebooks/oeb/transforms/trimmanifest.py | 2 +-
src/calibre/ebooks/oeb/writer.py | 2 +-
7 files changed, 64 insertions(+), 37 deletions(-)
diff --git a/src/calibre/ebooks/lit/writer.py b/src/calibre/ebooks/lit/writer.py
index bebba8938b..73216057b5 100644
--- a/src/calibre/ebooks/lit/writer.py
+++ b/src/calibre/ebooks/lit/writer.py
@@ -27,7 +27,7 @@ from calibre.ebooks.oeb.base import OEB_DOCS, XHTML_MIME, OEB_STYLES, \
CSS_MIME, OPF_MIME, XML_NS, XML
from calibre.ebooks.oeb.base import namespace, barename, prefixname, \
urlnormalize, xpath
-from calibre.ebooks.oeb.base import Logger, OEBBook
+from calibre.ebooks.oeb.base import OEBBook
from calibre.ebooks.oeb.profile import Context
from calibre.ebooks.oeb.stylizer import Stylizer
from calibre.ebooks.oeb.transforms.flatcss import CSSFlattener
@@ -732,7 +732,7 @@ def option_parser():
return parser
def oeb2lit(opts, inpath):
- logger = Logger(logging.getLogger('oeb2lit'))
+ logger = logging.getLogger('oeb2lit')
logger.setup_cli_handler(opts.verbose)
outpath = opts.output
if outpath is None:
diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py
index 59ce1f7b95..1e91fbe17d 100644
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@@ -13,8 +13,11 @@ from collections import defaultdict
from itertools import count
from urlparse import urldefrag, urlparse, urlunparse
from urllib import unquote as urlunquote
+import logging
from lxml import etree, html
import calibre
+from cssutils import CSSParser
+from cssutils.css import CSSStyleSheet
from calibre.translations.dynamic import translate
from calibre.ebooks.chardet import xml_to_unicode
from calibre.ebooks.oeb.entitydefs import ENTITYDEFS
@@ -99,6 +102,8 @@ PNG_MIME = types_map['.png']
SVG_MIME = types_map['.svg']
BINARY_MIME = 'application/octet-stream'
+XHTML_CSS_NAMESPACE = u'@namespace "%s";\n' % XHTML_NS
+
OEB_STYLES = set([CSS_MIME, OEB_CSS_MIME, 'text/x-oeb-css'])
OEB_DOCS = set([XHTML_MIME, 'text/html', OEB_DOC_MIME,
'text/x-oeb-document'])
@@ -565,7 +570,7 @@ class Manifest(object):
return 'Item(id=%r, href=%r, media_type=%r)' \
% (self.id, self.href, self.media_type)
- def _force_xhtml(self, data):
+ def _parse_xhtml(self, data):
# Convert to Unicode and normalize line endings
data = self.oeb.decode(data)
data = XMLDECL_RE.sub('', data)
@@ -645,6 +650,27 @@ class Manifest(object):
'File %r missing element' % self.href)
etree.SubElement(data, XHTML('body'))
return data
+
+ def _parse_css(self, data):  # turn raw stylesheet data into the object returned by CSSParser.parseString
+ data = self.oeb.decode(data)  # convert to Unicode, the same first step _parse_xhtml performs
+ data = XHTML_CSS_NAMESPACE + data  # prepend the '@namespace "<XHTML_NS>";' declaration line
+ parser = CSSParser(log=self.oeb.logger, loglevel=logging.WARNING,  # parser messages go to the book's logger
+ fetcher=self._fetch_css)  # _fetch_css supplies the text for paths the parser asks for
+ data = parser.parseString(data, href=self.href)  # parse, passing this item's href along
+ data.namespaces['h'] = XHTML_NS  # register prefix 'h' for the XHTML namespace on the parsed sheet
+ return data  # the parsed sheet, not a string
+
+ def _fetch_css(self, path):  # fetcher handed to CSSParser in _parse_css; returns an (encoding, text) pair
+ hrefs = self.oeb.manifest.hrefs  # manifest mapping from href to item
+ if path not in hrefs:  # requested path is not a manifest entry
+ self.oeb.logger.warn('CSS import of missing file %r' % path)
+ return (None, None)  # nothing to supply; the problem was logged above
+ item = hrefs[path]
+ if item.media_type not in OEB_STYLES:  # only stylesheet media types are accepted
+ self.oeb.logger.warn('CSS import of non-CSS file %r' % path)
+ return (None, None)  # same empty result as for a missing path
+ data = item.data.cssText  # item.data goes through the data property, which parses CSS items via _parse_css
+ return ('utf-8', data)  # encoding label paired with the cssText
@dynamic_property
def data(self):
@@ -661,15 +687,19 @@ class Manifest(object):
special parsing.
"""
def fget(self):
- if self._data is not None:
- return self._data
- data = self._loader(self.href)
- if self.media_type in OEB_DOCS:
- data = self._force_xhtml(data)
+ data = self._data
+ if data is None:
+ if self._loader is None:
+ return None
+ data = self._loader(self.href)
+ if not isinstance(data, basestring):
+ pass # already parsed
+ elif self.media_type in OEB_DOCS:
+ data = self._parse_xhtml(data)
elif self.media_type[-4:] in ('+xml', '/xml'):
data = etree.fromstring(data)
elif self.media_type in OEB_STYLES:
- data = self.oeb.decode(data)
+ data = self._parse_css(data)
self._data = data
return data
def fset(self, value):
@@ -677,7 +707,7 @@ class Manifest(object):
def fdel(self):
self._data = None
return property(fget, fset, fdel, doc=doc)
-
+
def __str__(self):
data = self.data
if isinstance(data, etree._Element):
@@ -726,7 +756,7 @@ class Manifest(object):
if frag:
relhref = '#'.join((relhref, frag))
return relhref
-
+
def abshref(self, href):
"""Convert the URL provided in :param:`href` from a reference
relative to this manifest item to a book-absolute reference.
@@ -748,7 +778,7 @@ class Manifest(object):
self.items = set()
self.ids = {}
self.hrefs = {}
-
+
def add(self, id, href, media_type, fallback=None, loader=None, data=None):
"""Add a new item to the book manifest.
@@ -765,7 +795,7 @@ class Manifest(object):
self.ids[item.id] = item
self.hrefs[item.href] = item
return item
-
+
def remove(self, item):
"""Removes :param:`item` from the manifest."""
if item in self.ids:
@@ -775,7 +805,7 @@ class Manifest(object):
self.items.remove(item)
if item in self.oeb.spine:
self.oeb.spine.remove(item)
-
+
def generate(self, id=None, href=None):
"""Generate a new unique identifier and/or internal path for use in
creating a new manifest item, using the provided :param:`id` and/or
@@ -803,13 +833,13 @@ class Manifest(object):
def __iter__(self):
for item in self.items:
yield item
-
+
def values(self):
return list(self.items)
def __contains__(self, item):
return item in self.items
-
+
def to_opf1(self, parent=None):
elem = element(parent, 'manifest')
for item in self.items:
diff --git a/src/calibre/ebooks/oeb/factory.py b/src/calibre/ebooks/oeb/factory.py
index 684451044b..8add71d20d 100644
--- a/src/calibre/ebooks/oeb/factory.py
+++ b/src/calibre/ebooks/oeb/factory.py
@@ -8,6 +8,7 @@ __copyright__ = '2008, Marshall T. Vandegrift '
import sys, os, logging
from itertools import chain
+import calibre
from calibre.ebooks.oeb.base import OEBError
from calibre.ebooks.oeb.reader import OEBReader
from calibre.ebooks.oeb.writer import OEBWriter
@@ -15,7 +16,7 @@ from calibre.ebooks.lit.reader import LitReader
from calibre.ebooks.lit.writer import LitWriter
from calibre.ebooks.mobi.reader import MobiReader
from calibre.ebooks.mobi.writer import MobiWriter
-from calibre.ebooks.oeb.base import Logger, OEBBook
+from calibre.ebooks.oeb.base import OEBBook
from calibre.ebooks.oeb.profile import Context
from calibre.utils.config import Config
@@ -77,8 +78,8 @@ def main(argv=sys.argv):
if len(args) != 0:
parser.print_help()
return 1
- logger = Logger(logging.getLogger('ebook-convert'))
- logger.setup_cli_handler(opts.verbose)
+ logger = logging.getLogger('ebook-convert')
+ calibre.setup_cli_handlers(logger, logging.DEBUG)
encoding = opts.encoding
pretty_print = opts.pretty_print
oeb = OEBBook(encoding=encoding, pretty_print=pretty_print, logger=logger)
diff --git a/src/calibre/ebooks/oeb/reader.py b/src/calibre/ebooks/oeb/reader.py
index dbafa5afac..c62540e15a 100644
--- a/src/calibre/ebooks/oeb/reader.py
+++ b/src/calibre/ebooks/oeb/reader.py
@@ -181,7 +181,7 @@ class OEBReader(object):
if not scheme and href not in known:
new.add(href)
elif item.media_type in OEB_STYLES:
- for match in CSSURL_RE.finditer(item.data):
+ for match in CSSURL_RE.finditer(item.data.cssText):
href, _ = urldefrag(match.group('url'))
href = item.abshref(urlnormalize(href))
scheme = urlparse(href).scheme
diff --git a/src/calibre/ebooks/oeb/stylizer.py b/src/calibre/ebooks/oeb/stylizer.py
index 3b5c3e19d0..8bc82883e3 100644
--- a/src/calibre/ebooks/oeb/stylizer.py
+++ b/src/calibre/ebooks/oeb/stylizer.py
@@ -115,8 +115,7 @@ class Stylizer(object):
cssname = os.path.splitext(basename)[0] + '.css'
stylesheets = [HTML_CSS_STYLESHEET]
head = xpath(tree, '/h:html/h:head')[0]
- parser = cssutils.CSSParser()
- parser.setFetcher(self._fetch_css_file)
+ parser = cssutils.CSSParser(fetcher=self._fetch_css_file)
for elem in head:
if elem.tag == XHTML('style') and elem.text \
and elem.get('type', CSS_MIME) in OEB_STYLES:
@@ -135,14 +134,7 @@ class Stylizer(object):
'Stylesheet %r referenced by file %r not in manifest' %
(path, item.href))
continue
- if sitem in self.STYLESHEETS:
- stylesheet = self.STYLESHEETS[sitem]
- else:
- data = self._fetch_css_file(path)[1]
- stylesheet = parser.parseString(data, href=path)
- stylesheet.namespaces['h'] = XHTML_NS
- self.STYLESHEETS[sitem] = stylesheet
- stylesheets.append(stylesheet)
+ stylesheets.append(sitem.data)
rules = []
index = 0
self.stylesheets = set()
@@ -159,9 +151,9 @@ class Stylizer(object):
for _, _, cssdict, text, _ in rules:
try:
selector = CSSSelector(text)
- except (AssertionError, ExpressionError, etree.XPathSyntaxError,\
- NameError, # gets thrown on OS X instead of SelectorSyntaxError
- SelectorSyntaxError):
+ except (AssertionError, ExpressionError, etree.XPathSyntaxError,
+ NameError, # thrown on OS X instead of SelectorSyntaxError
+ SelectorSyntaxError):
continue
for elem in selector(tree):
self.style(elem)._update_cssdict(cssdict)
@@ -171,9 +163,13 @@ class Stylizer(object):
def _fetch_css_file(self, path):
hrefs = self.oeb.manifest.hrefs
if path not in hrefs:
+ self.logger.warn('CSS import of missing file %r' % path)
return (None, None)
- data = hrefs[path].data
- data = XHTML_CSS_NAMESPACE + data
+ item = hrefs[path]
+ if item.media_type not in OEB_STYLES:
+ self.logger.warn('CSS import of non-CSS file %r' % path)
+ return (None, None)
+ data = item.data.cssText
return ('utf-8', data)
def flatten_rule(self, rule, href, index):
diff --git a/src/calibre/ebooks/oeb/transforms/trimmanifest.py b/src/calibre/ebooks/oeb/transforms/trimmanifest.py
index c731800999..119ebcc73d 100644
--- a/src/calibre/ebooks/oeb/transforms/trimmanifest.py
+++ b/src/calibre/ebooks/oeb/transforms/trimmanifest.py
@@ -53,7 +53,7 @@ class ManifestTrimmer(object):
if found not in used:
new.add(found)
elif item.media_type == CSS_MIME:
- for match in CSSURL_RE.finditer(item.data):
+ for match in CSSURL_RE.finditer(item.data.cssText):
href = match.group('url')
href = item.abshref(urlnormalize(href))
if href in oeb.manifest.hrefs:
diff --git a/src/calibre/ebooks/oeb/writer.py b/src/calibre/ebooks/oeb/writer.py
index 235965b50f..8789d03470 100644
--- a/src/calibre/ebooks/oeb/writer.py
+++ b/src/calibre/ebooks/oeb/writer.py
@@ -8,7 +8,7 @@ __copyright__ = '2008, Marshall T. Vandegrift '
import sys, os, logging
from calibre.ebooks.oeb.base import OPF_MIME, xml2str
-from calibre.ebooks.oeb.base import Logger, DirContainer, OEBBook
+from calibre.ebooks.oeb.base import DirContainer, OEBBook
__all__ = ['OEBWriter']