From 868fa550ee4c420620be9fdf4ab13269eae742a4 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 7 Jan 2011 10:58:20 -0700 Subject: [PATCH] When parsing XML if the XML starts with a UTF-8 BOM decode as UTF-8. FB2 Input: Handle entities --- src/calibre/ebooks/chardet/__init__.py | 8 +++++--- src/calibre/ebooks/fb2/input.py | 6 +++++- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/calibre/ebooks/chardet/__init__.py b/src/calibre/ebooks/chardet/__init__.py index dd279c6559..f9bca3c8d4 100644 --- a/src/calibre/ebooks/chardet/__init__.py +++ b/src/calibre/ebooks/chardet/__init__.py @@ -18,7 +18,7 @@ __version__ = "1.0" -import re +import re, codecs def detect(aBuf): import calibre.ebooks.chardet.universaldetector as universaldetector @@ -83,9 +83,11 @@ def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False, if not raw: return u'', encoding if not isinstance(raw, unicode): - if raw.startswith('\xff\xfe'): + if raw.startswith(codecs.BOM_UTF8): + raw, encoding = raw.decode('utf-8')[1:], 'utf-8' + elif raw.startswith(codecs.BOM_UTF16_LE): raw, encoding = raw.decode('utf-16-le')[1:], 'utf-16-le' - elif raw.startswith('\xfe\xff'): + elif raw.startswith(codecs.BOM_UTF16_BE): raw, encoding = raw.decode('utf-16-be')[1:], 'utf-16-be' if not isinstance(raw, unicode): for pat in ENCODING_PATS: diff --git a/src/calibre/ebooks/fb2/input.py b/src/calibre/ebooks/fb2/input.py index 1f9a3ffe95..b019873d39 100644 --- a/src/calibre/ebooks/fb2/input.py +++ b/src/calibre/ebooks/fb2/input.py @@ -46,15 +46,19 @@ class FB2Input(InputFormatPlugin): log.debug('Parsing XML...') raw = stream.read().replace('\0', '') raw = xml_to_unicode(raw, strip_encoding_pats=True, - assume_utf8=True)[0] + assume_utf8=True, resolve_entities=True)[0] try: doc = etree.fromstring(raw) except etree.XMLSyntaxError: try: doc = etree.fromstring(raw, parser=RECOVER_PARSER) + if doc is None: + raise Exception('parse failed') except: doc = etree.fromstring(raw.replace('& ', '&amp;'), 
parser=RECOVER_PARSER) + if doc is None: + raise ValueError('The FB2 file is not valid XML') stylesheets = doc.xpath('//*[local-name() = "stylesheet" and @type="text/css"]') css = '' for s in stylesheets: