From cecfa09b959052a05b6081bdb25ec24fe240bd6e Mon Sep 17 00:00:00 2001
From: "Marshall T. Vandegrift"
Date: Mon, 2 Feb 2009 11:46:52 -0500
Subject: [PATCH] Use same decoding logic for OPF as for (X)HTML.

---
 src/calibre/ebooks/oeb/base.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py
index d17973b88e..4642f36336 100644
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@@ -1009,13 +1009,16 @@ class OEBBook(object):
         return nroot
 
     def _read_opf(self, opfpath):
-        opf = self.container.read(opfpath)
+        data = self.container.read(opfpath)
+        data = self.decode(data)
+        data = XMLDECL_RE.sub('', data)
+        data = data.replace('\r\n', '\n').replace('\r', '\n')
         try:
-            opf = etree.fromstring(opf)
+            opf = etree.fromstring(data)
         except etree.XMLSyntaxError:
             repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0))
-            opf = ENTITY_RE.sub(repl, opf)
-            opf = etree.fromstring(opf)
+            data = ENTITY_RE.sub(repl, data)
+            opf = etree.fromstring(data)
             self.logger.warn('OPF contains invalid HTML named entities')
         ns = namespace(opf.tag)
         if ns not in ('', OPF1_NS, OPF2_NS):