diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index 282bb03a30..61a41443bc 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -26,6 +26,7 @@ from calibre.ebooks.oeb.entitydefs import ENTITYDEFS XML_NS = 'http://www.w3.org/XML/1998/namespace' XHTML_NS = 'http://www.w3.org/1999/xhtml' +OEB_DOC_NS = 'http://openebook.org/namespaces/oeb-document/1.0/' OPF1_NS = 'http://openebook.org/namespaces/oeb-package/1.0/' OPF2_NS = 'http://www.idpf.org/2007/opf' DC09_NS = 'http://purl.org/metadata/dublin_core' @@ -334,8 +335,6 @@ class Manifest(object): # Possibly decode in user-specified encoding if self.oeb.encoding is not None: data = data.decode(self.oeb.encoding, 'replace') - # Force to UNIX line encodings - data = data.replace('\r\n', '\n').replace('\r', '\n') # Handle broken XHTML w/ SVG (ugh) if 'svg:' in data and SVG_NS not in data: data = data.replace( @@ -358,10 +357,26 @@ class Manifest(object): data = etree.tostring(data, encoding=unicode) data = etree.fromstring(data) # Force into the XHTML namespace - if namespace(data.tag) != XHTML_NS: + if barename(data.tag) != 'html': + raise OEBError( + 'File %r does not appear to be (X)HTML' % self.href) + elif not namespace(data.tag): data.attrib['xmlns'] = XHTML_NS data = etree.tostring(data, encoding=unicode) data = etree.fromstring(data) + elif namespace(data.tag) != XHTML_NS: + # OEB_DOC_NS, but possibly others + ns = namespace(data.tag) + attrib = dict(data.attrib) + nroot = etree.Element(XHTML('html'), + nsmap={None: XHTML_NS}, attrib=attrib) + for elem in data.iterdescendants(): + if isinstance(elem.tag, basestring) and \ + namespace(elem.tag) == ns: + elem.tag = XHTML(barename(elem.tag)) + for elem in data: + nroot.append(elem) + data = nroot # Remove any encoding-specifying elements for meta in self.META_XP(data): meta.getparent().remove(meta) @@ -824,7 +839,6 @@ class OEBBook(object): def _read_opf(self, opfpath): opf = self.container.read(opfpath) - opf = opf.replace('\r\n', '\n').replace('\r', '\n') try: opf = etree.fromstring(opf) except etree.XMLSyntaxError: