Fix #1746. Improve handling of encoding.

2025-07-07 18:24:30 -04:00 · 2009-02-01 21:20:41 -05:00 · 2009-02-01 21:20:41 -05:00 · 6fbf78aa7f
commit 6fbf78aa7f
parent 2f3680e563
2 changed files with 32 additions and 6 deletions
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@ -24,6 +24,7 @@ import calibre
 from calibre import LoggingInterface
 from calibre.translations.dynamic import translate
 from calibre.startup import get_lang
 from calibre.ebooks.chardet import xml_to_unicode
 from calibre.ebooks.oeb.entitydefs import ENTITYDEFS
 from calibre.ebooks.metadata.epub import CoverRenderer
 from calibre.ptempfile import TemporaryDirectory
@ -87,6 +88,7 @@ ENTITY_RE = re.compile(r'&([a-zA-Z_:][a-zA-Z0-9.-_:]+);')
 COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+')
 QNAME_RE = re.compile(r'^[{][^{}]+[}][^{}]+$')
 PREFIXNAME_RE = re.compile(r'^[^:]+[:][^:]+')
 XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
 def element(parent, *args, **kwargs):
    if parent is not None:
@ -447,9 +449,10 @@ class Manifest(object):
                % (self.id, self.href, self.media_type)
        def _force_xhtml(self, data):
-            # Possibly decode in user-specified encoding
+            # Convert to Unicode and normalize line endings
-            if self.oeb.encoding is not None:
+            data = self.oeb.decode(data)
-                data = data.decode(self.oeb.encoding, 'replace')
+            data = XMLDECL_RE.sub('', data)
            data = data.replace('\r\n', '\n').replace('\r', '\n')
            # Handle broken XHTML w/ SVG (ugh)
            if 'svg:' in data and SVG_NS not in data:
                data = data.replace(
@ -1381,6 +1384,20 @@ class OEBBook(object):
        lang = lang.split('-', 1)[0].lower()
        return translate(lang, text)
    def decode(self, data):
        """Return `data` as a unicode string.

        Unicode input is returned unchanged.  Byte strings are tried, in
        order, as UTF-8, as UTF-16 (only when they begin with a UTF-16
        byte order mark) and in `self.encoding` when that is not None.
        The first encoding that decodes without a `UnicodeDecodeError`
        wins.  If none does, the text returned by `xml_to_unicode` is
        used instead.
        """
        import codecs
        if isinstance(data, unicode):
            return data
        encodings = ['utf-8']
        # Without a BOM the 'utf-16' codec accepts most even-length byte
        # strings, so trying it unconditionally would turn legacy 8-bit
        # text into garbage and keep the fallbacks below from ever running.
        if data.startswith((codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE)):
            encodings.append('utf-16')
        if self.encoding is not None:
            encodings.append(self.encoding)
        for encoding in encodings:
            try:
                return data.decode(encoding)
            except UnicodeDecodeError:
                pass
        # Nothing decoded cleanly; fall back to xml_to_unicode.
        data, _ = xml_to_unicode(data)
        return data
    def to_opf1(self):
        package = etree.Element('package',
            attrib={'unique-identifier': self.uid.id})
--- a/src/calibre/ebooks/oeb/stylizer.py
+++ b/src/calibre/ebooks/oeb/stylizer.py
@ -109,6 +109,7 @@ class Stylizer(object):
    STYLESHEETS = {}
    def __init__(self, tree, path, oeb, profile=PROFILES['PRS505']):
        self.oeb = oeb
        self.profile = profile
        self.logger = oeb.logger
        item = oeb.manifest.hrefs[path]
@ -117,7 +118,7 @@ class Stylizer(object):
        stylesheets = [HTML_CSS_STYLESHEET]
        head = xpath(tree, '/h:html/h:head')[0]
        parser = cssutils.CSSParser()
-        parser.setFetcher(lambda path: ('utf-8', oeb.container.read(path)))
+        parser.setFetcher(self._fetch_css_file)
        for elem in head:
            if elem.tag == XHTML('style') and elem.text \
               and elem.get('type', CSS_MIME) in OEB_STYLES:
@ -138,8 +139,7 @@ class Stylizer(object):
                if path in self.STYLESHEETS:
                    stylesheet = self.STYLESHEETS[path]
                else:
-                    data = XHTML_CSS_NAMESPACE
+                    data = self._fetch_css_file(path)[1]
                    data += oeb.manifest.hrefs[path].data
                    stylesheet = parser.parseString(data, href=path)
                    stylesheet.namespaces['h'] = XHTML_NS
                    self.STYLESHEETS[path] = stylesheet
@ -167,6 +167,15 @@ class Stylizer(object):
        for elem in xpath(tree, '//h:*[@style]'):
            self.style(elem)._apply_style_attr()
    def _fetch_css_file(self, path):
        """Look up the stylesheet stored under `path` in the manifest.

        Returns a 2-tuple whose first item is always None.  The second
        item is the manifest item's data, decoded through
        `self.oeb.decode` and prefixed with `XHTML_CSS_NAMESPACE`, or
        None when `path` is not present in the manifest hrefs.
        """
        manifest_items = self.oeb.manifest.hrefs
        if path in manifest_items:
            raw = manifest_items[path].data
            text = self.oeb.decode(raw)
            return (None, XHTML_CSS_NAMESPACE + text)
        # Unknown href: nothing to hand back.
        return (None, None)
    def flatten_rule(self, rule, href, index):
        results = []
        if isinstance(rule, CSSStyleRule):