From 0a1c9f9919b7e0642913166f6d4918a2a4e302aa Mon Sep 17 00:00:00 2001 From: "Marshall T. Vandegrift" Date: Sun, 8 Mar 2009 14:03:23 -0400 Subject: [PATCH 1/2] Clean up merge artifacts. --- src/calibre/ebooks/lit/reader.py | 36 +++++++++++--------------------- 1 file changed, 12 insertions(+), 24 deletions(-) diff --git a/src/calibre/ebooks/lit/reader.py b/src/calibre/ebooks/lit/reader.py index 1ac68f3866..f32a65e010 100644 --- a/src/calibre/ebooks/lit/reader.py +++ b/src/calibre/ebooks/lit/reader.py @@ -129,8 +129,6 @@ class UnBinary(object): self.tag_map, self.attr_map, self.tag_to_attr_map = map self.is_html = map is HTML_MAP self.tag_atoms, self.attr_atoms = atoms - self.opf = map is OPF_MAP - self.bin = bin self.dir = os.path.dirname(path) buf = StringIO() self.binary_to_text(bin, buf) @@ -210,7 +208,8 @@ class UnBinary(object): continue if flags & FLAG_ATOM: if not self.tag_atoms or tag not in self.tag_atoms: - raise LitError("atom tag %d not in atom tag list" % tag) + raise LitError( + "atom tag %d not in atom tag list" % tag) tag_name = self.tag_atoms[tag] current_map = self.attr_atoms elif tag < len(self.tag_map): @@ -295,7 +294,7 @@ class UnBinary(object): c = '&quot;' elif c == '<': c = '&lt;' - self.buf.write(c.encode('ascii', 'xmlcharrefreplace')) + buf.write(c.encode('ascii', 'xmlcharrefreplace')) count -= 1 if count == 0: if not in_censorship: @@ -841,24 +840,7 @@ class LitFile(object): if len(attrs) != nentries: self._warn("damaged or invalid atoms attributes table") return (tags, attrs) - - def get_entry_content(self, entry, pretty_print=False): - if 'spine' in entry.state: - name = '/'.join(('/data', entry.internal, 'content')) - path = entry.path - raw = self.get_file(name) - decl, map = (OPF_DECL, OPF_MAP) \ - if name == '/meta' else (HTML_DECL, HTML_MAP) - atoms = self.get_atoms(entry) - content = decl + unicode(UnBinary(raw, path, self.manifest, map, atoms)) - if pretty_print: - content = self._pretty_print(content) - content = content.encode('utf-8') 
- else: - internal = '/'.join(('/data', entry.internal)) - content = self._litfile.get_file(internal) - return content - + class LitContainer(object): """Simple Container-interface, read-only accessor for LIT files.""" @@ -879,9 +861,15 @@ class LitContainer(object): elif 'spine' in entry.state: internal = '/'.join(('/data', entry.internal, 'content')) raw = self._litfile.get_file(internal) - unbin = UnBinary(raw, name, self._litfile.manifest, HTML_MAP) + manifest = self._litfile.manifest + atoms = self._litfile.get_atoms(entry) + unbin = UnBinary(raw, name, manifest, HTML_MAP, atoms) content = HTML_DECL + str(unbin) - + else: + internal = '/'.join(('/data', entry.internal)) + content = self._litfile.get_file(internal) + return content + def _read_meta(self): path = 'content.opf' raw = self._litfile.get_file('/meta') From 29486d653e262f4174bcfb0a1189e6490166fd68 Mon Sep 17 00:00:00 2001 From: "Marshall T. Vandegrift" Date: Wed, 18 Mar 2009 19:51:35 -0400 Subject: [PATCH 2/2] Convert OEBBook to store cssutils-parsed CSS. 
--- src/calibre/ebooks/lit/writer.py | 4 +- src/calibre/ebooks/oeb/base.py | 58 ++++++++++++++----- src/calibre/ebooks/oeb/factory.py | 7 ++- src/calibre/ebooks/oeb/reader.py | 2 +- src/calibre/ebooks/oeb/stylizer.py | 26 ++++----- .../ebooks/oeb/transforms/trimmanifest.py | 2 +- src/calibre/ebooks/oeb/writer.py | 2 +- 7 files changed, 64 insertions(+), 37 deletions(-) diff --git a/src/calibre/ebooks/lit/writer.py b/src/calibre/ebooks/lit/writer.py index bebba8938b..73216057b5 100644 --- a/src/calibre/ebooks/lit/writer.py +++ b/src/calibre/ebooks/lit/writer.py @@ -27,7 +27,7 @@ from calibre.ebooks.oeb.base import OEB_DOCS, XHTML_MIME, OEB_STYLES, \ CSS_MIME, OPF_MIME, XML_NS, XML from calibre.ebooks.oeb.base import namespace, barename, prefixname, \ urlnormalize, xpath -from calibre.ebooks.oeb.base import Logger, OEBBook +from calibre.ebooks.oeb.base import OEBBook from calibre.ebooks.oeb.profile import Context from calibre.ebooks.oeb.stylizer import Stylizer from calibre.ebooks.oeb.transforms.flatcss import CSSFlattener @@ -732,7 +732,7 @@ def option_parser(): return parser def oeb2lit(opts, inpath): - logger = Logger(logging.getLogger('oeb2lit')) + logger = logging.getLogger('oeb2lit') logger.setup_cli_handler(opts.verbose) outpath = opts.output if outpath is None: diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index 59ce1f7b95..1e91fbe17d 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -13,8 +13,11 @@ from collections import defaultdict from itertools import count from urlparse import urldefrag, urlparse, urlunparse from urllib import unquote as urlunquote +import logging from lxml import etree, html import calibre +from cssutils import CSSParser +from cssutils.css import CSSStyleSheet from calibre.translations.dynamic import translate from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.oeb.entitydefs import ENTITYDEFS @@ -99,6 +102,8 @@ PNG_MIME = types_map['.png'] SVG_MIME = 
types_map['.svg'] BINARY_MIME = 'application/octet-stream' +XHTML_CSS_NAMESPACE = u'@namespace "%s";\n' % XHTML_NS + OEB_STYLES = set([CSS_MIME, OEB_CSS_MIME, 'text/x-oeb-css']) OEB_DOCS = set([XHTML_MIME, 'text/html', OEB_DOC_MIME, 'text/x-oeb-document']) @@ -565,7 +570,7 @@ class Manifest(object): return 'Item(id=%r, href=%r, media_type=%r)' \ % (self.id, self.href, self.media_type) - def _force_xhtml(self, data): + def _parse_xhtml(self, data): # Convert to Unicode and normalize line endings data = self.oeb.decode(data) data = XMLDECL_RE.sub('', data) @@ -645,6 +650,27 @@ class Manifest(object): 'File %r missing <body/> element' % self.href) etree.SubElement(data, XHTML('body')) return data + + def _parse_css(self, data): + data = self.oeb.decode(data) + data = XHTML_CSS_NAMESPACE + data + parser = CSSParser(log=self.oeb.logger, loglevel=logging.WARNING, + fetcher=self._fetch_css) + data = parser.parseString(data, href=self.href) + data.namespaces['h'] = XHTML_NS + return data + + def _fetch_css(self, path): + hrefs = self.oeb.manifest.hrefs + if path not in hrefs: + self.oeb.logger.warn('CSS import of missing file %r' % path) + return (None, None) + item = hrefs[path] + if item.media_type not in OEB_STYLES: + self.oeb.logger.warn('CSS import of non-CSS file %r' % path) + return (None, None) + data = item.data.cssText + return ('utf-8', data) @dynamic_property def data(self): @@ -661,15 +687,19 @@ class Manifest(object): special parsing. 
""" def fget(self): - if self._data is not None: - return self._data - data = self._loader(self.href) - if self.media_type in OEB_DOCS: - data = self._force_xhtml(data) + data = self._data + if data is None: + if self._loader is None: + return None + data = self._loader(self.href) + if not isinstance(data, basestring): + pass # already parsed + elif self.media_type in OEB_DOCS: + data = self._parse_xhtml(data) elif self.media_type[-4:] in ('+xml', '/xml'): data = etree.fromstring(data) elif self.media_type in OEB_STYLES: - data = self.oeb.decode(data) + data = self._parse_css(data) self._data = data return data def fset(self, value): @@ -677,7 +707,7 @@ class Manifest(object): def fdel(self): self._data = None return property(fget, fset, fdel, doc=doc) - + def __str__(self): data = self.data if isinstance(data, etree._Element): @@ -726,7 +756,7 @@ class Manifest(object): if frag: relhref = '#'.join((relhref, frag)) return relhref - + def abshref(self, href): """Convert the URL provided in :param:`href` from a reference relative to this manifest item to a book-absolute reference. @@ -748,7 +778,7 @@ class Manifest(object): self.items = set() self.ids = {} self.hrefs = {} - + def add(self, id, href, media_type, fallback=None, loader=None, data=None): """Add a new item to the book manifest. 
@@ -765,7 +795,7 @@ class Manifest(object): self.ids[item.id] = item self.hrefs[item.href] = item return item - + def remove(self, item): """Removes :param:`item` from the manifest.""" if item in self.ids: @@ -775,7 +805,7 @@ class Manifest(object): self.items.remove(item) if item in self.oeb.spine: self.oeb.spine.remove(item) - + def generate(self, id=None, href=None): """Generate a new unique identifier and/or internal path for use in creating a new manifest item, using the provided :param:`id` and/or @@ -803,13 +833,13 @@ class Manifest(object): def __iter__(self): for item in self.items: yield item - + def values(self): return list(self.items) def __contains__(self, item): return item in self.items - + def to_opf1(self, parent=None): elem = element(parent, 'manifest') for item in self.items: diff --git a/src/calibre/ebooks/oeb/factory.py b/src/calibre/ebooks/oeb/factory.py index 684451044b..8add71d20d 100644 --- a/src/calibre/ebooks/oeb/factory.py +++ b/src/calibre/ebooks/oeb/factory.py @@ -8,6 +8,7 @@ __copyright__ = '2008, Marshall T. 
Vandegrift ' import sys, os, logging from itertools import chain +import calibre from calibre.ebooks.oeb.base import OEBError from calibre.ebooks.oeb.reader import OEBReader from calibre.ebooks.oeb.writer import OEBWriter @@ -15,7 +16,7 @@ from calibre.ebooks.lit.reader import LitReader from calibre.ebooks.lit.writer import LitWriter from calibre.ebooks.mobi.reader import MobiReader from calibre.ebooks.mobi.writer import MobiWriter -from calibre.ebooks.oeb.base import Logger, OEBBook +from calibre.ebooks.oeb.base import OEBBook from calibre.ebooks.oeb.profile import Context from calibre.utils.config import Config @@ -77,8 +78,8 @@ def main(argv=sys.argv): if len(args) != 0: parser.print_help() return 1 - logger = Logger(logging.getLogger('ebook-convert')) - logger.setup_cli_handler(opts.verbose) + logger = logging.getLogger('ebook-convert') + calibre.setup_cli_handlers(logger, logging.DEBUG) encoding = opts.encoding pretty_print = opts.pretty_print oeb = OEBBook(encoding=encoding, pretty_print=pretty_print, logger=logger) diff --git a/src/calibre/ebooks/oeb/reader.py b/src/calibre/ebooks/oeb/reader.py index dbafa5afac..c62540e15a 100644 --- a/src/calibre/ebooks/oeb/reader.py +++ b/src/calibre/ebooks/oeb/reader.py @@ -181,7 +181,7 @@ class OEBReader(object): if not scheme and href not in known: new.add(href) elif item.media_type in OEB_STYLES: - for match in CSSURL_RE.finditer(item.data): + for match in CSSURL_RE.finditer(item.data.cssText): href, _ = urldefrag(match.group('url')) href = item.abshref(urlnormalize(href)) scheme = urlparse(href).scheme diff --git a/src/calibre/ebooks/oeb/stylizer.py b/src/calibre/ebooks/oeb/stylizer.py index 3b5c3e19d0..8bc82883e3 100644 --- a/src/calibre/ebooks/oeb/stylizer.py +++ b/src/calibre/ebooks/oeb/stylizer.py @@ -115,8 +115,7 @@ class Stylizer(object): cssname = os.path.splitext(basename)[0] + '.css' stylesheets = [HTML_CSS_STYLESHEET] head = xpath(tree, '/h:html/h:head')[0] - parser = cssutils.CSSParser() - 
parser.setFetcher(self._fetch_css_file) + parser = cssutils.CSSParser(fetcher=self._fetch_css_file) for elem in head: if elem.tag == XHTML('style') and elem.text \ and elem.get('type', CSS_MIME) in OEB_STYLES: @@ -135,14 +134,7 @@ class Stylizer(object): 'Stylesheet %r referenced by file %r not in manifest' % (path, item.href)) continue - if sitem in self.STYLESHEETS: - stylesheet = self.STYLESHEETS[sitem] - else: - data = self._fetch_css_file(path)[1] - stylesheet = parser.parseString(data, href=path) - stylesheet.namespaces['h'] = XHTML_NS - self.STYLESHEETS[sitem] = stylesheet - stylesheets.append(stylesheet) + stylesheets.append(sitem.data) rules = [] index = 0 self.stylesheets = set() @@ -159,9 +151,9 @@ class Stylizer(object): for _, _, cssdict, text, _ in rules: try: selector = CSSSelector(text) - except (AssertionError, ExpressionError, etree.XPathSyntaxError,\ - NameError, # gets thrown on OS X instead of SelectorSyntaxError - SelectorSyntaxError): + except (AssertionError, ExpressionError, etree.XPathSyntaxError, + NameError, # thrown on OS X instead of SelectorSyntaxError + SelectorSyntaxError): continue for elem in selector(tree): self.style(elem)._update_cssdict(cssdict) @@ -171,9 +163,13 @@ class Stylizer(object): def _fetch_css_file(self, path): hrefs = self.oeb.manifest.hrefs if path not in hrefs: + self.logger.warn('CSS import of missing file %r' % path) return (None, None) - data = hrefs[path].data - data = XHTML_CSS_NAMESPACE + data + item = hrefs[path] + if item.media_type not in OEB_STYLES: + self.logger.warn('CSS import of non-CSS file %r' % path) + return (None, None) + data = item.data.cssText return ('utf-8', data) def flatten_rule(self, rule, href, index): diff --git a/src/calibre/ebooks/oeb/transforms/trimmanifest.py b/src/calibre/ebooks/oeb/transforms/trimmanifest.py index c731800999..119ebcc73d 100644 --- a/src/calibre/ebooks/oeb/transforms/trimmanifest.py +++ b/src/calibre/ebooks/oeb/transforms/trimmanifest.py @@ -53,7 +53,7 @@ class 
ManifestTrimmer(object): if found not in used: new.add(found) elif item.media_type == CSS_MIME: - for match in CSSURL_RE.finditer(item.data): + for match in CSSURL_RE.finditer(item.data.cssText): href = match.group('url') href = item.abshref(urlnormalize(href)) if href in oeb.manifest.hrefs: diff --git a/src/calibre/ebooks/oeb/writer.py b/src/calibre/ebooks/oeb/writer.py index 235965b50f..8789d03470 100644 --- a/src/calibre/ebooks/oeb/writer.py +++ b/src/calibre/ebooks/oeb/writer.py @@ -8,7 +8,7 @@ __copyright__ = '2008, Marshall T. Vandegrift ' import sys, os, logging from calibre.ebooks.oeb.base import OPF_MIME, xml2str -from calibre.ebooks.oeb.base import Logger, DirContainer, OEBBook +from calibre.ebooks.oeb.base import DirContainer, OEBBook __all__ = ['OEBWriter']