mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
When parsing XML, if the XML starts with a UTF-8 BOM, decode it as UTF-8. FB2 Input: Handle entities.
This commit is contained in:
parent
d5848cdb2d
commit
868fa550ee
@ -18,7 +18,7 @@
|
|||||||
|
|
||||||
__version__ = "1.0"
|
__version__ = "1.0"
|
||||||
|
|
||||||
import re
|
import re, codecs
|
||||||
|
|
||||||
def detect(aBuf):
|
def detect(aBuf):
|
||||||
import calibre.ebooks.chardet.universaldetector as universaldetector
|
import calibre.ebooks.chardet.universaldetector as universaldetector
|
||||||
@ -83,9 +83,11 @@ def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
|
|||||||
if not raw:
|
if not raw:
|
||||||
return u'', encoding
|
return u'', encoding
|
||||||
if not isinstance(raw, unicode):
|
if not isinstance(raw, unicode):
|
||||||
if raw.startswith('\xff\xfe'):
|
if raw.startswith(codecs.BOM_UTF8):
|
||||||
|
raw, encoding = raw.decode('utf-8')[1:], 'utf-8'
|
||||||
|
elif raw.startswith(codecs.BOM_UTF16_LE):
|
||||||
raw, encoding = raw.decode('utf-16-le')[1:], 'utf-16-le'
|
raw, encoding = raw.decode('utf-16-le')[1:], 'utf-16-le'
|
||||||
elif raw.startswith('\xfe\xff'):
|
elif raw.startswith(codecs.BOM_UTF16_BE):
|
||||||
raw, encoding = raw.decode('utf-16-be')[1:], 'utf-16-be'
|
raw, encoding = raw.decode('utf-16-be')[1:], 'utf-16-be'
|
||||||
if not isinstance(raw, unicode):
|
if not isinstance(raw, unicode):
|
||||||
for pat in ENCODING_PATS:
|
for pat in ENCODING_PATS:
|
||||||
|
@ -46,15 +46,19 @@ class FB2Input(InputFormatPlugin):
|
|||||||
log.debug('Parsing XML...')
|
log.debug('Parsing XML...')
|
||||||
raw = stream.read().replace('\0', '')
|
raw = stream.read().replace('\0', '')
|
||||||
raw = xml_to_unicode(raw, strip_encoding_pats=True,
|
raw = xml_to_unicode(raw, strip_encoding_pats=True,
|
||||||
assume_utf8=True)[0]
|
assume_utf8=True, resolve_entities=True)[0]
|
||||||
try:
|
try:
|
||||||
doc = etree.fromstring(raw)
|
doc = etree.fromstring(raw)
|
||||||
except etree.XMLSyntaxError:
|
except etree.XMLSyntaxError:
|
||||||
try:
|
try:
|
||||||
doc = etree.fromstring(raw, parser=RECOVER_PARSER)
|
doc = etree.fromstring(raw, parser=RECOVER_PARSER)
|
||||||
|
if doc is None:
|
||||||
|
raise Exception('parse failed')
|
||||||
except:
|
except:
|
||||||
doc = etree.fromstring(raw.replace('& ', '&amp;'),
|
doc = etree.fromstring(raw.replace('& ', '&amp;'),
|
||||||
parser=RECOVER_PARSER)
|
parser=RECOVER_PARSER)
|
||||||
|
if doc is None:
|
||||||
|
raise ValueError('The FB2 file is not valid XML')
|
||||||
stylesheets = doc.xpath('//*[local-name() = "stylesheet" and @type="text/css"]')
|
stylesheets = doc.xpath('//*[local-name() = "stylesheet" and @type="text/css"]')
|
||||||
css = ''
|
css = ''
|
||||||
for s in stylesheets:
|
for s in stylesheets:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user