Clean XML of invalid chars before parsing OPF

2025-07-09 03:04:10 -04:00 · 2020-01-20 15:42:46 +05:30 · 2020-01-20 15:42:46 +05:30 · 0b4ae4a23c
commit 0b4ae4a23c
parent ebbefb5de3
1 changed files with 4 additions and 4 deletions
--- a/src/calibre/ebooks/metadata/utils.py
+++ b/src/calibre/ebooks/metadata/utils.py
@@ -3,17 +3,17 @@
 # License: GPLv3 Copyright: 2016, Kovid Goyal <kovid at kovidgoyal.net>

 from __future__ import absolute_import, division, print_function, unicode_literals
-from collections import namedtuple
-from polyglot.builtins import map

+from collections import namedtuple

 from calibre.ebooks.chardet import xml_to_unicode
 from calibre.ebooks.oeb.base import OPF
 from calibre.ebooks.oeb.polish.utils import guess_type
 from calibre.spell import parse_lang_code
+from calibre.utils.cleantext import clean_xml_chars
 from calibre.utils.localization import lang_as_iso639_1
 from calibre.utils.xml_parse import safe_xml_fromstring
-from polyglot.builtins import filter
+from polyglot.builtins import filter, map

 OPFVersion = namedtuple('OPFVersion', 'major minor patch')

@@ -43,7 +43,7 @@ def parse_opf(stream_or_path):
        raise ValueError('Empty file: '+getattr(stream, 'name', 'stream'))
    raw, encoding = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True, assume_utf8=True)
    raw = raw[raw.find('<'):]
-    root = safe_xml_fromstring(raw)
+    root = safe_xml_fromstring(clean_xml_chars(raw))
    if root is None:
        raise ValueError('Not an OPF file')
    return root