When parsing XML, if the XML starts with a UTF-8 BOM, decode it as UTF-8. FB2 Input: Handle entities

This commit is contained in:
Kovid Goyal 2011-01-07 10:58:20 -07:00
parent d5848cdb2d
commit 868fa550ee
2 changed files with 10 additions and 4 deletions

View File

@ -18,7 +18,7 @@
__version__ = "1.0"
import re
import re, codecs
def detect(aBuf):
import calibre.ebooks.chardet.universaldetector as universaldetector
@ -83,9 +83,11 @@ def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
if not raw:
return u'', encoding
if not isinstance(raw, unicode):
if raw.startswith('\xff\xfe'):
if raw.startswith(codecs.BOM_UTF8):
raw, encoding = raw.decode('utf-8')[1:], 'utf-8'
elif raw.startswith(codecs.BOM_UTF16_LE):
raw, encoding = raw.decode('utf-16-le')[1:], 'utf-16-le'
elif raw.startswith('\xfe\xff'):
elif raw.startswith(codecs.BOM_UTF16_BE):
raw, encoding = raw.decode('utf-16-be')[1:], 'utf-16-be'
if not isinstance(raw, unicode):
for pat in ENCODING_PATS:

View File

@ -46,15 +46,19 @@ class FB2Input(InputFormatPlugin):
log.debug('Parsing XML...')
raw = stream.read().replace('\0', '')
raw = xml_to_unicode(raw, strip_encoding_pats=True,
assume_utf8=True)[0]
assume_utf8=True, resolve_entities=True)[0]
try:
doc = etree.fromstring(raw)
except etree.XMLSyntaxError:
try:
doc = etree.fromstring(raw, parser=RECOVER_PARSER)
if doc is None:
raise Exception('parse failed')
except:
doc = etree.fromstring(raw.replace('& ', '&'),
parser=RECOVER_PARSER)
if doc is None:
raise ValueError('The FB2 file is not valid XML')
stylesheets = doc.xpath('//*[local-name() = "stylesheet" and @type="text/css"]')
css = ''
for s in stylesheets: