def decode(self, data):
    """Return `data` as a unicode string, guessing its encoding if needed.

    Unicode input is returned unchanged.  Byte strings are tried, in
    order, as strict UTF-8, as UTF-16 (only when the data starts with a
    UTF-16 byte-order mark) and as `self.encoding` when that is not None.
    If none of those succeed, the bytes are handed to `xml_to_unicode`
    and the first element of its result is returned.
    """
    import codecs

    if isinstance(data, unicode):
        return data
    candidates = ['utf-8']
    # UTF-16 decodes almost any even-length byte string without error, so
    # trying it unconditionally turns e.g. Latin-1 text into garbage and
    # keeps the user-specified encoding from ever being reached.  Only
    # attempt it when the data announces itself with a BOM.
    if data[:2] in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE):
        candidates.append('utf-16')
    if self.encoding is not None:
        candidates.append(self.encoding)
    for encoding in candidates:
        try:
            return data.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            # LookupError: `self.encoding` names a codec Python does not
            # know; fall through to the next candidate / the detector.
            continue
    data, _ = xml_to_unicode(data)
    return data
b/src/calibre/ebooks/oeb/stylizer.py index 03a1fade10..ae42e063b7 100644 --- a/src/calibre/ebooks/oeb/stylizer.py +++ b/src/calibre/ebooks/oeb/stylizer.py @@ -109,6 +109,7 @@ class Stylizer(object): STYLESHEETS = {} def __init__(self, tree, path, oeb, profile=PROFILES['PRS505']): + self.oeb = oeb self.profile = profile self.logger = oeb.logger item = oeb.manifest.hrefs[path] @@ -117,7 +118,7 @@ class Stylizer(object): stylesheets = [HTML_CSS_STYLESHEET] head = xpath(tree, '/h:html/h:head')[0] parser = cssutils.CSSParser() - parser.setFetcher(lambda path: ('utf-8', oeb.container.read(path))) + parser.setFetcher(self._fetch_css_file) for elem in head: if elem.tag == XHTML('style') and elem.text \ and elem.get('type', CSS_MIME) in OEB_STYLES: @@ -138,8 +139,7 @@ class Stylizer(object): if path in self.STYLESHEETS: stylesheet = self.STYLESHEETS[path] else: - data = XHTML_CSS_NAMESPACE - data += oeb.manifest.hrefs[path].data + data = self._fetch_css_file(path)[1] stylesheet = parser.parseString(data, href=path) stylesheet.namespaces['h'] = XHTML_NS self.STYLESHEETS[path] = stylesheet @@ -167,6 +167,15 @@ class Stylizer(object): for elem in xpath(tree, '//h:*[@style]'): self.style(elem)._apply_style_attr() + def _fetch_css_file(self, path): + hrefs = self.oeb.manifest.hrefs + if path not in hrefs: + return (None, None) + data = hrefs[path].data + data = self.oeb.decode(data) + data = XHTML_CSS_NAMESPACE + data + return (None, data) + def flatten_rule(self, rule, href, index): results = [] if isinstance(rule, CSSStyleRule):