From 8fb38c386245c836bd2f08d7b3f9d62f8326bd2d Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 29 Oct 2010 12:17:54 -0600 Subject: [PATCH] FB2 Input: Make parsing of malformed FB2 files a little more robust --- resources/templates/fb2.xsl | 6 +++--- src/calibre/ebooks/fb2/input.py | 12 ++++++++---- src/calibre/ebooks/metadata/fb2.py | 3 ++- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/resources/templates/fb2.xsl b/resources/templates/fb2.xsl index ed2d8359db..753bd97373 100644 --- a/resources/templates/fb2.xsl +++ b/resources/templates/fb2.xsl @@ -178,13 +178,13 @@ - TOC_ + TOC_ - + @@ -194,7 +194,7 @@ - + diff --git a/src/calibre/ebooks/fb2/input.py b/src/calibre/ebooks/fb2/input.py index e541de46f7..2b08a716cc 100644 --- a/src/calibre/ebooks/fb2/input.py +++ b/src/calibre/ebooks/fb2/input.py @@ -40,14 +40,18 @@ class FB2Input(InputFormatPlugin): accelerators): from calibre.ebooks.metadata.opf2 import OPFCreator from calibre.ebooks.metadata.meta import get_metadata - from calibre.ebooks.oeb.base import XLINK_NS, XHTML_NS + from calibre.ebooks.oeb.base import XLINK_NS, XHTML_NS, RECOVER_PARSER NAMESPACES = {'f':FB2NS, 'l':XLINK_NS} log.debug('Parsing XML...') - raw = stream.read() + raw = stream.read().replace('\0', '') try: - doc = etree.fromstring(raw.replace('\0', '')) + doc = etree.fromstring(raw) except etree.XMLSyntaxError: - doc = etree.fromstring(raw.replace('& ', '&')) + try: + doc = etree.fromstring(raw, parser=RECOVER_PARSER) + except: + doc = etree.fromstring(raw.replace('& ', '&'), + parser=RECOVER_PARSER) stylesheets = doc.xpath('//*[local-name() = "stylesheet" and @type="text/css"]') css = '' for s in stylesheets: diff --git a/src/calibre/ebooks/metadata/fb2.py b/src/calibre/ebooks/metadata/fb2.py index 6e0d56dfa0..3636b89df4 100644 --- a/src/calibre/ebooks/metadata/fb2.py +++ b/src/calibre/ebooks/metadata/fb2.py @@ -22,7 +22,8 @@ def get_metadata(stream): 'xlink':XLINK_NS}) tostring = lambda x : etree.tostring(x, method='text', encoding=unicode).strip() - root = etree.fromstring(stream.read()) + parser = etree.XMLParser(recover=True, no_network=True) + root = etree.fromstring(stream.read(), parser=parser) authors, author_sort = [], None for au in XPath('//fb2:author')(root): fname = lname = author = None