Clean XML of invalid chars before parsing OPF

This commit is contained in:
Kovid Goyal 2020-01-20 15:42:46 +05:30
parent ebbefb5de3
commit 0b4ae4a23c
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@@ -3,17 +3,17 @@
# License: GPLv3 Copyright: 2016, Kovid Goyal <kovid at kovidgoyal.net>
from __future__ import absolute_import, division, print_function, unicode_literals
from collections import namedtuple
from polyglot.builtins import map
from collections import namedtuple
from calibre.ebooks.chardet import xml_to_unicode
from calibre.ebooks.oeb.base import OPF
from calibre.ebooks.oeb.polish.utils import guess_type
from calibre.spell import parse_lang_code
from calibre.utils.cleantext import clean_xml_chars
from calibre.utils.localization import lang_as_iso639_1
from calibre.utils.xml_parse import safe_xml_fromstring
from polyglot.builtins import filter
from polyglot.builtins import filter, map
OPFVersion = namedtuple('OPFVersion', 'major minor patch')
@@ -43,7 +43,7 @@ def parse_opf(stream_or_path):
raise ValueError('Empty file: '+getattr(stream, 'name', 'stream'))
raw, encoding = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True, assume_utf8=True)
raw = raw[raw.find('<'):]
root = safe_xml_fromstring(raw)
root = safe_xml_fromstring(clean_xml_chars(raw))
if root is None:
raise ValueError('Not an OPF file')
return root