Convert OEBBook to store cssutils-parsed CSS.

This commit is contained in:
Marshall T. Vandegrift 2009-03-18 19:51:35 -04:00
parent 57aebdff7d
commit 29486d653e
7 changed files with 64 additions and 37 deletions

View File

@ -27,7 +27,7 @@ from calibre.ebooks.oeb.base import OEB_DOCS, XHTML_MIME, OEB_STYLES, \
CSS_MIME, OPF_MIME, XML_NS, XML
from calibre.ebooks.oeb.base import namespace, barename, prefixname, \
urlnormalize, xpath
from calibre.ebooks.oeb.base import Logger, OEBBook
from calibre.ebooks.oeb.base import OEBBook
from calibre.ebooks.oeb.profile import Context
from calibre.ebooks.oeb.stylizer import Stylizer
from calibre.ebooks.oeb.transforms.flatcss import CSSFlattener
@ -732,7 +732,7 @@ def option_parser():
return parser
def oeb2lit(opts, inpath):
logger = Logger(logging.getLogger('oeb2lit'))
logger = logging.getLogger('oeb2lit')
logger.setup_cli_handler(opts.verbose)
outpath = opts.output
if outpath is None:

View File

@ -13,8 +13,11 @@ from collections import defaultdict
from itertools import count
from urlparse import urldefrag, urlparse, urlunparse
from urllib import unquote as urlunquote
import logging
from lxml import etree, html
import calibre
from cssutils import CSSParser
from cssutils.css import CSSStyleSheet
from calibre.translations.dynamic import translate
from calibre.ebooks.chardet import xml_to_unicode
from calibre.ebooks.oeb.entitydefs import ENTITYDEFS
@ -99,6 +102,8 @@ PNG_MIME = types_map['.png']
SVG_MIME = types_map['.svg']
BINARY_MIME = 'application/octet-stream'
XHTML_CSS_NAMESPACE = u'@namespace "%s";\n' % XHTML_NS
OEB_STYLES = set([CSS_MIME, OEB_CSS_MIME, 'text/x-oeb-css'])
OEB_DOCS = set([XHTML_MIME, 'text/html', OEB_DOC_MIME,
'text/x-oeb-document'])
@ -565,7 +570,7 @@ class Manifest(object):
return 'Item(id=%r, href=%r, media_type=%r)' \
% (self.id, self.href, self.media_type)
def _force_xhtml(self, data):
def _parse_xhtml(self, data):
# Convert to Unicode and normalize line endings
data = self.oeb.decode(data)
data = XMLDECL_RE.sub('', data)
@ -646,6 +651,27 @@ class Manifest(object):
etree.SubElement(data, XHTML('body'))
return data
def _parse_css(self, data):
    """Parse raw stylesheet data into a cssutils stylesheet object.

    The input is decoded with ``self.oeb.decode`` and prefixed with
    ``XHTML_CSS_NAMESPACE`` before parsing.  The parser logs through
    ``self.oeb.logger`` at WARNING level and is given ``self._fetch_css``
    as its fetcher.  The sheet is parsed with this item's href, and the
    ``h`` prefix is bound to ``XHTML_NS`` on the result, which is returned.
    """
    text = XHTML_CSS_NAMESPACE + self.oeb.decode(data)
    css_parser = CSSParser(log=self.oeb.logger, loglevel=logging.WARNING,
                           fetcher=self._fetch_css)
    sheet = css_parser.parseString(text, href=self.href)
    sheet.namespaces['h'] = XHTML_NS
    return sheet
def _fetch_css(self, path):
    """Fetcher callback handed to the CSS parser by ``_parse_css``.

    Looks ``path`` up in the book manifest's ``hrefs`` mapping.  Returns
    ``('utf-8', css_text)``, where ``css_text`` is the ``cssText`` of the
    manifest item's data, when the item exists and its media type is in
    ``OEB_STYLES``.  Otherwise logs a warning through ``self.oeb.logger``
    and returns ``(None, None)``.
    """
    manifest_hrefs = self.oeb.manifest.hrefs
    if path in manifest_hrefs:
        entry = manifest_hrefs[path]
        if entry.media_type in OEB_STYLES:
            return ('utf-8', entry.data.cssText)
        problem = 'CSS import of non-CSS file %r'
    else:
        problem = 'CSS import of missing file %r'
    # Both failure modes share the same outcome: warn and report nothing.
    self.oeb.logger.warn(problem % path)
    return (None, None)
@dynamic_property
def data(self):
doc = """Provides MIME type sensitive access to the manifest
@ -661,15 +687,19 @@ class Manifest(object):
special parsing.
"""
def fget(self):
if self._data is not None:
return self._data
data = self._loader(self.href)
if self.media_type in OEB_DOCS:
data = self._force_xhtml(data)
data = self._data
if data is None:
if self._loader is None:
return None
data = self._loader(self.href)
if not isinstance(data, basestring):
pass # already parsed
elif self.media_type in OEB_DOCS:
data = self._parse_xhtml(data)
elif self.media_type[-4:] in ('+xml', '/xml'):
data = etree.fromstring(data)
elif self.media_type in OEB_STYLES:
data = self.oeb.decode(data)
data = self._parse_css(data)
self._data = data
return data
def fset(self, value):

View File

@ -8,6 +8,7 @@ __copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
import sys, os, logging
from itertools import chain
import calibre
from calibre.ebooks.oeb.base import OEBError
from calibre.ebooks.oeb.reader import OEBReader
from calibre.ebooks.oeb.writer import OEBWriter
@ -15,7 +16,7 @@ from calibre.ebooks.lit.reader import LitReader
from calibre.ebooks.lit.writer import LitWriter
from calibre.ebooks.mobi.reader import MobiReader
from calibre.ebooks.mobi.writer import MobiWriter
from calibre.ebooks.oeb.base import Logger, OEBBook
from calibre.ebooks.oeb.base import OEBBook
from calibre.ebooks.oeb.profile import Context
from calibre.utils.config import Config
@ -77,8 +78,8 @@ def main(argv=sys.argv):
if len(args) != 0:
parser.print_help()
return 1
logger = Logger(logging.getLogger('ebook-convert'))
logger.setup_cli_handler(opts.verbose)
logger = logging.getLogger('ebook-convert')
calibre.setup_cli_handlers(logger, logging.DEBUG)
encoding = opts.encoding
pretty_print = opts.pretty_print
oeb = OEBBook(encoding=encoding, pretty_print=pretty_print, logger=logger)

View File

@ -181,7 +181,7 @@ class OEBReader(object):
if not scheme and href not in known:
new.add(href)
elif item.media_type in OEB_STYLES:
for match in CSSURL_RE.finditer(item.data):
for match in CSSURL_RE.finditer(item.data.cssText):
href, _ = urldefrag(match.group('url'))
href = item.abshref(urlnormalize(href))
scheme = urlparse(href).scheme

View File

@ -115,8 +115,7 @@ class Stylizer(object):
cssname = os.path.splitext(basename)[0] + '.css'
stylesheets = [HTML_CSS_STYLESHEET]
head = xpath(tree, '/h:html/h:head')[0]
parser = cssutils.CSSParser()
parser.setFetcher(self._fetch_css_file)
parser = cssutils.CSSParser(fetcher=self._fetch_css_file)
for elem in head:
if elem.tag == XHTML('style') and elem.text \
and elem.get('type', CSS_MIME) in OEB_STYLES:
@ -135,14 +134,7 @@ class Stylizer(object):
'Stylesheet %r referenced by file %r not in manifest' %
(path, item.href))
continue
if sitem in self.STYLESHEETS:
stylesheet = self.STYLESHEETS[sitem]
else:
data = self._fetch_css_file(path)[1]
stylesheet = parser.parseString(data, href=path)
stylesheet.namespaces['h'] = XHTML_NS
self.STYLESHEETS[sitem] = stylesheet
stylesheets.append(stylesheet)
stylesheets.append(sitem.data)
rules = []
index = 0
self.stylesheets = set()
@ -159,9 +151,9 @@ class Stylizer(object):
for _, _, cssdict, text, _ in rules:
try:
selector = CSSSelector(text)
except (AssertionError, ExpressionError, etree.XPathSyntaxError,\
NameError, # gets thrown on OS X instead of SelectorSyntaxError
SelectorSyntaxError):
except (AssertionError, ExpressionError, etree.XPathSyntaxError,
NameError, # thrown on OS X instead of SelectorSyntaxError
SelectorSyntaxError):
continue
for elem in selector(tree):
self.style(elem)._update_cssdict(cssdict)
@ -171,9 +163,13 @@ class Stylizer(object):
def _fetch_css_file(self, path):
hrefs = self.oeb.manifest.hrefs
if path not in hrefs:
self.logger.warn('CSS import of missing file %r' % path)
return (None, None)
data = hrefs[path].data
data = XHTML_CSS_NAMESPACE + data
item = hrefs[path]
if item.media_type not in OEB_STYLES:
self.logger.warn('CSS import of non-CSS file %r' % path)
return (None, None)
data = item.data.cssText
return ('utf-8', data)
def flatten_rule(self, rule, href, index):

View File

@ -53,7 +53,7 @@ class ManifestTrimmer(object):
if found not in used:
new.add(found)
elif item.media_type == CSS_MIME:
for match in CSSURL_RE.finditer(item.data):
for match in CSSURL_RE.finditer(item.data.cssText):
href = match.group('url')
href = item.abshref(urlnormalize(href))
if href in oeb.manifest.hrefs:

View File

@ -8,7 +8,7 @@ __copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
import sys, os, logging
from calibre.ebooks.oeb.base import OPF_MIME, xml2str
from calibre.ebooks.oeb.base import Logger, DirContainer, OEBBook
from calibre.ebooks.oeb.base import DirContainer, OEBBook
__all__ = ['OEBWriter']