This commit is contained in:
Kovid Goyal 2009-03-19 19:12:07 -07:00
commit d7257ad5f2
8 changed files with 76 additions and 61 deletions

View File

@ -129,8 +129,6 @@ class UnBinary(object):
self.tag_map, self.attr_map, self.tag_to_attr_map = map self.tag_map, self.attr_map, self.tag_to_attr_map = map
self.is_html = map is HTML_MAP self.is_html = map is HTML_MAP
self.tag_atoms, self.attr_atoms = atoms self.tag_atoms, self.attr_atoms = atoms
self.opf = map is OPF_MAP
self.bin = bin
self.dir = os.path.dirname(path) self.dir = os.path.dirname(path)
buf = StringIO() buf = StringIO()
self.binary_to_text(bin, buf) self.binary_to_text(bin, buf)
@ -210,7 +208,8 @@ class UnBinary(object):
continue continue
if flags & FLAG_ATOM: if flags & FLAG_ATOM:
if not self.tag_atoms or tag not in self.tag_atoms: if not self.tag_atoms or tag not in self.tag_atoms:
raise LitError("atom tag %d not in atom tag list" % tag) raise LitError(
"atom tag %d not in atom tag list" % tag)
tag_name = self.tag_atoms[tag] tag_name = self.tag_atoms[tag]
current_map = self.attr_atoms current_map = self.attr_atoms
elif tag < len(self.tag_map): elif tag < len(self.tag_map):
@ -295,7 +294,7 @@ class UnBinary(object):
c = '&quot;' c = '&quot;'
elif c == '<': elif c == '<':
c = '&lt;' c = '&lt;'
self.buf.write(c.encode('ascii', 'xmlcharrefreplace')) buf.write(c.encode('ascii', 'xmlcharrefreplace'))
count -= 1 count -= 1
if count == 0: if count == 0:
if not in_censorship: if not in_censorship:
@ -841,24 +840,7 @@ class LitFile(object):
if len(attrs) != nentries: if len(attrs) != nentries:
self._warn("damaged or invalid atoms attributes table") self._warn("damaged or invalid atoms attributes table")
return (tags, attrs) return (tags, attrs)
def get_entry_content(self, entry, pretty_print=False):
    """Return the stored content for a manifest entry.

    Entries whose ``state`` contains ``'spine'`` are read from
    ``/data/<internal>/content``, expanded with ``UnBinary`` and returned
    as UTF-8 encoded markup prefixed with ``HTML_DECL``.  Every other
    entry is returned exactly as read from ``/data/<internal>``.

    :param entry: manifest entry; its ``state``, ``internal`` and ``path``
        attributes are read.
    :param pretty_print: when true, spine markup is passed through
        ``self._pretty_print`` before being encoded.
    """
    if 'spine' not in entry.state:
        # NOTE(review): the original called self._litfile.get_file() here,
        # while every other access in this method goes through
        # self.get_file(); use the same accessor for both branches.
        return self.get_file('/'.join(('/data', entry.internal)))
    name = '/'.join(('/data', entry.internal, 'content'))
    raw = self.get_file(name)
    # `name` always starts with '/data/', so the former `name == '/meta'`
    # test could never select the OPF declaration/map; the HTML pair is the
    # only one that was ever used.
    atoms = self.get_atoms(entry)
    unbin = UnBinary(raw, entry.path, self.manifest, HTML_MAP, atoms)
    content = HTML_DECL + unicode(unbin)
    if pretty_print:
        content = self._pretty_print(content)
    return content.encode('utf-8')
class LitContainer(object): class LitContainer(object):
"""Simple Container-interface, read-only accessor for LIT files.""" """Simple Container-interface, read-only accessor for LIT files."""
@ -879,9 +861,15 @@ class LitContainer(object):
elif 'spine' in entry.state: elif 'spine' in entry.state:
internal = '/'.join(('/data', entry.internal, 'content')) internal = '/'.join(('/data', entry.internal, 'content'))
raw = self._litfile.get_file(internal) raw = self._litfile.get_file(internal)
unbin = UnBinary(raw, name, self._litfile.manifest, HTML_MAP) manifest = self._litfile.manifest
atoms = self._litfile.get_atoms(entry)
unbin = UnBinary(raw, name, manifest, HTML_MAP, atoms)
content = HTML_DECL + str(unbin) content = HTML_DECL + str(unbin)
else:
internal = '/'.join(('/data', entry.internal))
content = self._litfile.get_file(internal)
return content
def _read_meta(self): def _read_meta(self):
path = 'content.opf' path = 'content.opf'
raw = self._litfile.get_file('/meta') raw = self._litfile.get_file('/meta')

View File

@ -27,7 +27,7 @@ from calibre.ebooks.oeb.base import OEB_DOCS, XHTML_MIME, OEB_STYLES, \
CSS_MIME, OPF_MIME, XML_NS, XML CSS_MIME, OPF_MIME, XML_NS, XML
from calibre.ebooks.oeb.base import namespace, barename, prefixname, \ from calibre.ebooks.oeb.base import namespace, barename, prefixname, \
urlnormalize, xpath urlnormalize, xpath
from calibre.ebooks.oeb.base import Logger, OEBBook from calibre.ebooks.oeb.base import OEBBook
from calibre.ebooks.oeb.profile import Context from calibre.ebooks.oeb.profile import Context
from calibre.ebooks.oeb.stylizer import Stylizer from calibre.ebooks.oeb.stylizer import Stylizer
from calibre.ebooks.oeb.transforms.flatcss import CSSFlattener from calibre.ebooks.oeb.transforms.flatcss import CSSFlattener
@ -732,7 +732,7 @@ def option_parser():
return parser return parser
def oeb2lit(opts, inpath): def oeb2lit(opts, inpath):
logger = Logger(logging.getLogger('oeb2lit')) logger = logging.getLogger('oeb2lit')
logger.setup_cli_handler(opts.verbose) logger.setup_cli_handler(opts.verbose)
outpath = opts.output outpath = opts.output
if outpath is None: if outpath is None:

View File

@ -13,8 +13,11 @@ from collections import defaultdict
from itertools import count from itertools import count
from urlparse import urldefrag, urlparse, urlunparse from urlparse import urldefrag, urlparse, urlunparse
from urllib import unquote as urlunquote from urllib import unquote as urlunquote
import logging
from lxml import etree, html from lxml import etree, html
import calibre import calibre
from cssutils import CSSParser
from cssutils.css import CSSStyleSheet
from calibre.translations.dynamic import translate from calibre.translations.dynamic import translate
from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.chardet import xml_to_unicode
from calibre.ebooks.oeb.entitydefs import ENTITYDEFS from calibre.ebooks.oeb.entitydefs import ENTITYDEFS
@ -99,6 +102,8 @@ PNG_MIME = types_map['.png']
SVG_MIME = types_map['.svg'] SVG_MIME = types_map['.svg']
BINARY_MIME = 'application/octet-stream' BINARY_MIME = 'application/octet-stream'
XHTML_CSS_NAMESPACE = u'@namespace "%s";\n' % XHTML_NS
OEB_STYLES = set([CSS_MIME, OEB_CSS_MIME, 'text/x-oeb-css']) OEB_STYLES = set([CSS_MIME, OEB_CSS_MIME, 'text/x-oeb-css'])
OEB_DOCS = set([XHTML_MIME, 'text/html', OEB_DOC_MIME, OEB_DOCS = set([XHTML_MIME, 'text/html', OEB_DOC_MIME,
'text/x-oeb-document']) 'text/x-oeb-document'])
@ -565,7 +570,7 @@ class Manifest(object):
return 'Item(id=%r, href=%r, media_type=%r)' \ return 'Item(id=%r, href=%r, media_type=%r)' \
% (self.id, self.href, self.media_type) % (self.id, self.href, self.media_type)
def _force_xhtml(self, data): def _parse_xhtml(self, data):
# Convert to Unicode and normalize line endings # Convert to Unicode and normalize line endings
data = self.oeb.decode(data) data = self.oeb.decode(data)
data = XMLDECL_RE.sub('', data) data = XMLDECL_RE.sub('', data)
@ -645,6 +650,27 @@ class Manifest(object):
'File %r missing <body/> element' % self.href) 'File %r missing <body/> element' % self.href)
etree.SubElement(data, XHTML('body')) etree.SubElement(data, XHTML('body'))
return data return data
def _parse_css(self, data):
    """Parse raw stylesheet data for this manifest item.

    The data is decoded with ``self.oeb.decode`` and prefixed with
    ``XHTML_CSS_NAMESPACE`` before being handed to a ``CSSParser`` that
    logs to ``self.oeb.logger`` at WARNING level and uses
    ``self._fetch_css`` as its fetcher.  The ``h`` prefix is bound to
    ``XHTML_NS`` on the parsed result.

    :param data: raw stylesheet data as produced by the item's loader.
    :return: the object returned by ``CSSParser.parseString``.
    """
    text = XHTML_CSS_NAMESPACE + self.oeb.decode(data)
    parser = CSSParser(log=self.oeb.logger, loglevel=logging.WARNING,
                       fetcher=self._fetch_css)
    sheet = parser.parseString(text, href=self.href)
    sheet.namespaces['h'] = XHTML_NS
    return sheet
def _fetch_css(self, path):
    """Look up a stylesheet in the book manifest by href.

    Passed to ``CSSParser`` as its ``fetcher`` callback.

    :param path: href to look up in ``self.oeb.manifest.hrefs``.
    :return: ``('utf-8', css_text)`` when ``path`` names a manifest item
        whose media type is in ``OEB_STYLES``; ``(None, None)`` (after
        logging a warning) when the item is missing or is not CSS.
    """
    hrefs = self.oeb.manifest.hrefs
    if path not in hrefs:
        # Wrap the argument in a tuple so a tuple-valued path cannot be
        # mistaken for a sequence of format arguments.
        self.oeb.logger.warn('CSS import of missing file %r' % (path,))
        return (None, None)
    item = hrefs[path]
    if item.media_type not in OEB_STYLES:
        self.oeb.logger.warn('CSS import of non-CSS file %r' % (path,))
        return (None, None)
    # item.data is the parsed stylesheet; hand its serialized text back.
    return ('utf-8', item.data.cssText)
@dynamic_property @dynamic_property
def data(self): def data(self):
@ -661,15 +687,19 @@ class Manifest(object):
special parsing. special parsing.
""" """
def fget(self): def fget(self):
if self._data is not None: data = self._data
return self._data if data is None:
data = self._loader(self.href) if self._loader is None:
if self.media_type in OEB_DOCS: return None
data = self._force_xhtml(data) data = self._loader(self.href)
if not isinstance(data, basestring):
pass # already parsed
elif self.media_type in OEB_DOCS:
data = self._parse_xhtml(data)
elif self.media_type[-4:] in ('+xml', '/xml'): elif self.media_type[-4:] in ('+xml', '/xml'):
data = etree.fromstring(data) data = etree.fromstring(data)
elif self.media_type in OEB_STYLES: elif self.media_type in OEB_STYLES:
data = self.oeb.decode(data) data = self._parse_css(data)
self._data = data self._data = data
return data return data
def fset(self, value): def fset(self, value):
@ -677,7 +707,7 @@ class Manifest(object):
def fdel(self): def fdel(self):
self._data = None self._data = None
return property(fget, fset, fdel, doc=doc) return property(fget, fset, fdel, doc=doc)
def __str__(self): def __str__(self):
data = self.data data = self.data
if isinstance(data, etree._Element): if isinstance(data, etree._Element):
@ -726,7 +756,7 @@ class Manifest(object):
if frag: if frag:
relhref = '#'.join((relhref, frag)) relhref = '#'.join((relhref, frag))
return relhref return relhref
def abshref(self, href): def abshref(self, href):
"""Convert the URL provided in :param:`href` from a reference """Convert the URL provided in :param:`href` from a reference
relative to this manifest item to a book-absolute reference. relative to this manifest item to a book-absolute reference.
@ -748,7 +778,7 @@ class Manifest(object):
self.items = set() self.items = set()
self.ids = {} self.ids = {}
self.hrefs = {} self.hrefs = {}
def add(self, id, href, media_type, fallback=None, loader=None, data=None): def add(self, id, href, media_type, fallback=None, loader=None, data=None):
"""Add a new item to the book manifest. """Add a new item to the book manifest.
@ -765,7 +795,7 @@ class Manifest(object):
self.ids[item.id] = item self.ids[item.id] = item
self.hrefs[item.href] = item self.hrefs[item.href] = item
return item return item
def remove(self, item): def remove(self, item):
"""Removes :param:`item` from the manifest.""" """Removes :param:`item` from the manifest."""
if item in self.ids: if item in self.ids:
@ -775,7 +805,7 @@ class Manifest(object):
self.items.remove(item) self.items.remove(item)
if item in self.oeb.spine: if item in self.oeb.spine:
self.oeb.spine.remove(item) self.oeb.spine.remove(item)
def generate(self, id=None, href=None): def generate(self, id=None, href=None):
"""Generate a new unique identifier and/or internal path for use in """Generate a new unique identifier and/or internal path for use in
creating a new manifest item, using the provided :param:`id` and/or creating a new manifest item, using the provided :param:`id` and/or
@ -803,13 +833,13 @@ class Manifest(object):
def __iter__(self): def __iter__(self):
for item in self.items: for item in self.items:
yield item yield item
def values(self): def values(self):
return list(self.items) return list(self.items)
def __contains__(self, item): def __contains__(self, item):
return item in self.items return item in self.items
def to_opf1(self, parent=None): def to_opf1(self, parent=None):
elem = element(parent, 'manifest') elem = element(parent, 'manifest')
for item in self.items: for item in self.items:

View File

@ -8,6 +8,7 @@ __copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
import sys, os, logging import sys, os, logging
from itertools import chain from itertools import chain
import calibre
from calibre.ebooks.oeb.base import OEBError from calibre.ebooks.oeb.base import OEBError
from calibre.ebooks.oeb.reader import OEBReader from calibre.ebooks.oeb.reader import OEBReader
from calibre.ebooks.oeb.writer import OEBWriter from calibre.ebooks.oeb.writer import OEBWriter
@ -15,7 +16,7 @@ from calibre.ebooks.lit.reader import LitReader
from calibre.ebooks.lit.writer import LitWriter from calibre.ebooks.lit.writer import LitWriter
from calibre.ebooks.mobi.reader import MobiReader from calibre.ebooks.mobi.reader import MobiReader
from calibre.ebooks.mobi.writer import MobiWriter from calibre.ebooks.mobi.writer import MobiWriter
from calibre.ebooks.oeb.base import Logger, OEBBook from calibre.ebooks.oeb.base import OEBBook
from calibre.ebooks.oeb.profile import Context from calibre.ebooks.oeb.profile import Context
from calibre.utils.config import Config from calibre.utils.config import Config
@ -77,8 +78,8 @@ def main(argv=sys.argv):
if len(args) != 0: if len(args) != 0:
parser.print_help() parser.print_help()
return 1 return 1
logger = Logger(logging.getLogger('ebook-convert')) logger = logging.getLogger('ebook-convert')
logger.setup_cli_handler(opts.verbose) calibre.setup_cli_handlers(logger, logging.DEBUG)
encoding = opts.encoding encoding = opts.encoding
pretty_print = opts.pretty_print pretty_print = opts.pretty_print
oeb = OEBBook(encoding=encoding, pretty_print=pretty_print, logger=logger) oeb = OEBBook(encoding=encoding, pretty_print=pretty_print, logger=logger)

View File

@ -181,7 +181,7 @@ class OEBReader(object):
if not scheme and href not in known: if not scheme and href not in known:
new.add(href) new.add(href)
elif item.media_type in OEB_STYLES: elif item.media_type in OEB_STYLES:
for match in CSSURL_RE.finditer(item.data): for match in CSSURL_RE.finditer(item.data.cssText):
href, _ = urldefrag(match.group('url')) href, _ = urldefrag(match.group('url'))
href = item.abshref(urlnormalize(href)) href = item.abshref(urlnormalize(href))
scheme = urlparse(href).scheme scheme = urlparse(href).scheme

View File

@ -115,8 +115,7 @@ class Stylizer(object):
cssname = os.path.splitext(basename)[0] + '.css' cssname = os.path.splitext(basename)[0] + '.css'
stylesheets = [HTML_CSS_STYLESHEET] stylesheets = [HTML_CSS_STYLESHEET]
head = xpath(tree, '/h:html/h:head')[0] head = xpath(tree, '/h:html/h:head')[0]
parser = cssutils.CSSParser() parser = cssutils.CSSParser(fetcher=self._fetch_css_file)
parser.setFetcher(self._fetch_css_file)
for elem in head: for elem in head:
if elem.tag == XHTML('style') and elem.text \ if elem.tag == XHTML('style') and elem.text \
and elem.get('type', CSS_MIME) in OEB_STYLES: and elem.get('type', CSS_MIME) in OEB_STYLES:
@ -135,14 +134,7 @@ class Stylizer(object):
'Stylesheet %r referenced by file %r not in manifest' % 'Stylesheet %r referenced by file %r not in manifest' %
(path, item.href)) (path, item.href))
continue continue
if sitem in self.STYLESHEETS: stylesheets.append(sitem.data)
stylesheet = self.STYLESHEETS[sitem]
else:
data = self._fetch_css_file(path)[1]
stylesheet = parser.parseString(data, href=path)
stylesheet.namespaces['h'] = XHTML_NS
self.STYLESHEETS[sitem] = stylesheet
stylesheets.append(stylesheet)
rules = [] rules = []
index = 0 index = 0
self.stylesheets = set() self.stylesheets = set()
@ -159,9 +151,9 @@ class Stylizer(object):
for _, _, cssdict, text, _ in rules: for _, _, cssdict, text, _ in rules:
try: try:
selector = CSSSelector(text) selector = CSSSelector(text)
except (AssertionError, ExpressionError, etree.XPathSyntaxError,\ except (AssertionError, ExpressionError, etree.XPathSyntaxError,
NameError, # gets thrown on OS X instead of SelectorSyntaxError NameError, # thrown on OS X instead of SelectorSyntaxError
SelectorSyntaxError): SelectorSyntaxError):
continue continue
for elem in selector(tree): for elem in selector(tree):
self.style(elem)._update_cssdict(cssdict) self.style(elem)._update_cssdict(cssdict)
@ -171,9 +163,13 @@ class Stylizer(object):
def _fetch_css_file(self, path): def _fetch_css_file(self, path):
hrefs = self.oeb.manifest.hrefs hrefs = self.oeb.manifest.hrefs
if path not in hrefs: if path not in hrefs:
self.logger.warn('CSS import of missing file %r' % path)
return (None, None) return (None, None)
data = hrefs[path].data item = hrefs[path]
data = XHTML_CSS_NAMESPACE + data if item.media_type not in OEB_STYLES:
self.logger.warn('CSS import of non-CSS file %r' % path)
return (None, None)
data = item.data.cssText
return ('utf-8', data) return ('utf-8', data)
def flatten_rule(self, rule, href, index): def flatten_rule(self, rule, href, index):

View File

@ -53,7 +53,7 @@ class ManifestTrimmer(object):
if found not in used: if found not in used:
new.add(found) new.add(found)
elif item.media_type == CSS_MIME: elif item.media_type == CSS_MIME:
for match in CSSURL_RE.finditer(item.data): for match in CSSURL_RE.finditer(item.data.cssText):
href = match.group('url') href = match.group('url')
href = item.abshref(urlnormalize(href)) href = item.abshref(urlnormalize(href))
if href in oeb.manifest.hrefs: if href in oeb.manifest.hrefs:

View File

@ -8,7 +8,7 @@ __copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
import sys, os, logging import sys, os, logging
from calibre.ebooks.oeb.base import OPF_MIME, xml2str from calibre.ebooks.oeb.base import OPF_MIME, xml2str
from calibre.ebooks.oeb.base import Logger, DirContainer, OEBBook from calibre.ebooks.oeb.base import DirContainer, OEBBook
__all__ = ['OEBWriter'] __all__ = ['OEBWriter']