mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
When parsing XML, if the XML starts with a UTF-8 BOM, decode it as UTF-8. FB2 Input: Handle entities
This commit is contained in:
parent
d5848cdb2d
commit
868fa550ee
@@ -18,7 +18,7 @@
|
||||
|
||||
__version__ = "1.0"
|
||||
|
||||
import re
|
||||
import re, codecs
|
||||
|
||||
def detect(aBuf):
|
||||
import calibre.ebooks.chardet.universaldetector as universaldetector
|
||||
@@ -83,9 +83,11 @@ def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
|
||||
if not raw:
|
||||
return u'', encoding
|
||||
if not isinstance(raw, unicode):
|
||||
if raw.startswith('\xff\xfe'):
|
||||
if raw.startswith(codecs.BOM_UTF8):
|
||||
raw, encoding = raw.decode('utf-8')[1:], 'utf-8'
|
||||
elif raw.startswith(codecs.BOM_UTF16_LE):
|
||||
raw, encoding = raw.decode('utf-16-le')[1:], 'utf-16-le'
|
||||
elif raw.startswith('\xfe\xff'):
|
||||
elif raw.startswith(codecs.BOM_UTF16_BE):
|
||||
raw, encoding = raw.decode('utf-16-be')[1:], 'utf-16-be'
|
||||
if not isinstance(raw, unicode):
|
||||
for pat in ENCODING_PATS:
|
||||
|
@@ -46,15 +46,19 @@ class FB2Input(InputFormatPlugin):
|
||||
log.debug('Parsing XML...')
|
||||
raw = stream.read().replace('\0', '')
|
||||
raw = xml_to_unicode(raw, strip_encoding_pats=True,
|
||||
assume_utf8=True)[0]
|
||||
assume_utf8=True, resolve_entities=True)[0]
|
||||
try:
|
||||
doc = etree.fromstring(raw)
|
||||
except etree.XMLSyntaxError:
|
||||
try:
|
||||
doc = etree.fromstring(raw, parser=RECOVER_PARSER)
|
||||
if doc is None:
|
||||
raise Exception('parse failed')
|
||||
except:
|
||||
doc = etree.fromstring(raw.replace('& ', '&'),
|
||||
parser=RECOVER_PARSER)
|
||||
if doc is None:
|
||||
raise ValueError('The FB2 file is not valid XML')
|
||||
stylesheets = doc.xpath('//*[local-name() = "stylesheet" and @type="text/css"]')
|
||||
css = ''
|
||||
for s in stylesheets:
|
||||
|
Loading…
x
Reference in New Issue
Block a user