diff --git a/src/calibre/utils/unicode-test.opf b/src/calibre/utils/unicode-test.opf new file mode 100644 index 0000000000..dbe2e0282f --- /dev/null +++ b/src/calibre/utils/unicode-test.opf @@ -0,0 +1,9 @@ +fanficfare-uid:www.wattpad.com-u5sosgoofs-s91590087𝗜𝗡𝗦𝗧𝗔𝗚𝗥𝗔𝗠 ༄ 𝙽𝙰𝙻𝚄 (Ch 1-3)5sosgoofsFanFicFare [https://github.com/JimmXinu/FanFicFare]en2016-11-292020-07-202019-05-12𝐧𝐚𝐭𝐬𝐮_𝐝𝐫𝐚𝐠𝐨𝐧: 𝚠𝚊𝚗𝚗𝚊 𝚑𝚘𝚘𝚔 𝚞𝚙 ? + +𝐥𝐮𝐜𝐲𝐡𝐞𝐚𝐫𝐭: @𝐧𝐚𝐭𝐬𝐮_𝐝𝐫𝐚𝐠𝐨𝐧 𝚐𝚎𝚝 𝚘𝚏𝚏 𝚖𝚢 𝚙𝚊𝚐𝚎 + + +𝘪𝘯 𝘸𝘩𝘪𝘤𝘩 𝘵𝘸𝘰 𝘵𝘦𝘦𝘯𝘢𝘨𝘦𝘳𝘴 𝘮𝘦𝘦𝘵 𝘵𝘩𝘳𝘰𝘶𝘨𝘩 𝘪𝘯𝘴𝘵𝘢𝘨𝘳𝘢𝘮 + + + 𝐟𝐚𝐢𝐫𝐲 𝐭𝐚𝐢𝐥 | 𝐬𝐨𝐜𝐢𝐚𝐥 𝐦𝐞𝐝𝐢𝐚 | 𝐚𝐧𝐢𝐦𝐞juvialocksergrayfullbusterfairytaillevymcgardenerzascarletgajeelredfoxanimehighschoolnaluIn-Progressinstagramgruviawattys2017wendymarvellromancenatsudragneellucyheartfiliafanfictionFanfictionFanFictiongajevywww.wattpad.comhttps://www.wattpad.com/story/91590087https://www.wattpad.com/story/91590087 diff --git a/src/calibre/utils/xml_parse.py b/src/calibre/utils/xml_parse.py index 549b04ba03..ab47061812 100644 --- a/src/calibre/utils/xml_parse.py +++ b/src/calibre/utils/xml_parse.py @@ -17,14 +17,24 @@ class Resolver(etree.Resolver): return self.resolve_string('', context) -def create_parser(recover): - parser = etree.XMLParser(recover=recover, no_network=True) +def create_parser(recover, encoding=None): + parser = etree.XMLParser(recover=recover, no_network=True, encoding=encoding) parser.resolvers.add(Resolver()) return parser def safe_xml_fromstring(string_or_bytes, recover=True): - return fs(string_or_bytes, parser=create_parser(recover)) + ans = fs(string_or_bytes, parser=create_parser(recover)) + if ans is None and recover: + # this happens on windows where if string_or_bytes is unicode and + # contains non-BMP chars lxml chokes + if not isinstance(string_or_bytes, bytes): + string_or_bytes = string_or_bytes.encode('utf-8') + ans = fs(string_or_bytes, parser=create_parser(True, encoding='utf-8')) + if ans is not None: + return ans + ans = fs(string_or_bytes, parser=create_parser(False)) + return ans def find_tests(): @@ -57,6 +67,13 @@ def find_tests(): got = getattr(safe_xml_fromstring(templ.format(id=eid, val=val)), 'text', None) self.assertEqual(got, expected) + def test_lxml_unicode_parsing(self): + from calibre.ebooks.chardet import xml_to_unicode + with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'unicode-test.opf'), 'rb') as f: + raw = f.read() + text = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True, assume_utf8=True)[0] + self.assertIsNotNone(safe_xml_fromstring(text)) + return unittest.defaultTestLoader.loadTestsFromTestCase(TestXMLParse)