Use same decoding logic for OPF as for (X)HTML.

2025-07-09 03:04:10 -04:00 · 2009-02-02 11:46:52 -05:00 · 2009-02-02 11:46:52 -05:00 · cecfa09b95
commit cecfa09b95
parent 7b2064221e
1 changed file with 7 additions and 4 deletions
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@@ -1009,13 +1009,16 @@ class OEBBook(object):
        return nroot
    
    def _read_opf(self, opfpath):
-        opf = self.container.read(opfpath)
+        data = self.container.read(opfpath)
+        data = self.decode(data)
+        data = XMLDECL_RE.sub('', data)
+        data = data.replace('\r\n', '\n').replace('\r', '\n')
        try:
-            opf = etree.fromstring(opf)
+            opf = etree.fromstring(data)
        except etree.XMLSyntaxError:
            repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0))
-            opf = ENTITY_RE.sub(repl, opf)
-            opf = etree.fromstring(opf)
+            data = ENTITY_RE.sub(repl, data)
+            opf = etree.fromstring(data)
            self.logger.warn('OPF contains invalid HTML named entities')
        ns = namespace(opf.tag)
        if ns not in ('', OPF1_NS, OPF2_NS):