From d0be2b91a99b29ea54c5796342d20b03e4cb6b1a Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 17 Jun 2009 12:40:59 -0700 Subject: [PATCH] Fix #2630 (lxml.etree.XMLSyntaxError: error parsing attribute name, line 5, column 1340) --- src/calibre/ebooks/oeb/reader.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/calibre/ebooks/oeb/reader.py b/src/calibre/ebooks/oeb/reader.py index 09807ea636..89ef6b1ba3 100644 --- a/src/calibre/ebooks/oeb/reader.py +++ b/src/calibre/ebooks/oeb/reader.py @@ -6,7 +6,7 @@ from __future__ import with_statement __license__ = 'GPL v3' __copyright__ = '2008, Marshall T. Vandegrift ' -import sys, os, uuid, copy +import sys, os, uuid, copy, re from itertools import izip from urlparse import urldefrag, urlparse from urllib import unquote as urlunquote @@ -107,8 +107,14 @@ class OEBReader(object): except etree.XMLSyntaxError: repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0)) data = ENTITY_RE.sub(repl, data) - opf = etree.fromstring(data) - self.logger.warn('OPF contains invalid HTML named entities') + try: + opf = etree.fromstring(data) + self.logger.warn('OPF contains invalid HTML named entities') + except etree.XMLSyntaxError: + data = re.sub(r'(?is)<tours>.+</tours>', '', data) + self.logger.warn('OPF contains invalid tours section') + opf = etree.fromstring(data) + ns = namespace(opf.tag) if ns not in ('', OPF1_NS, OPF2_NS): raise OEBError('Invalid namespace %r for OPF document' % ns)