py3: Workaround for lxml choking while parsing unicode strings with non-BMP characters on Windows

This commit is contained in:
Kovid Goyal 2020-08-18 09:53:25 +05:30
parent 8fbf98aac2
commit 6e155612b4
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 29 additions and 3 deletions

View File

@@ -0,0 +1,9 @@
<?xml version="1.0" encoding="utf-8"?><package unique-identifier="fanficfare-uid" version="2.0" xmlns="http://www.idpf.org/2007/opf"><metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf"><dc:identifier id="fanficfare-uid">fanficfare-uid:www.wattpad.com-u5sosgoofs-s91590087</dc:identifier><dc:title id="id">𝗜𝗡𝗦𝗧𝗔𝗚𝗥𝗔𝗠𝙽𝙰𝙻𝚄 (Ch 1-3)</dc:title><dc:creator opf:role="aut">5sosgoofs</dc:creator><dc:contributor id="id-2">FanFicFare [https://github.com/JimmXinu/FanFicFare]</dc:contributor><dc:language>en</dc:language><dc:date opf:event="publication">2016-11-29</dc:date><dc:date opf:event="creation">2020-07-20</dc:date><dc:date opf:event="modification">2019-05-12</dc:date><meta content="2019-05-12T10:57:02" name="calibre:timestamp"/><dc:description>𝐧𝐚𝐭𝐬𝐮_𝐝𝐫𝐚𝐠𝐨𝐧: 𝚠𝚊𝚗𝚗𝚊 𝚑𝚘𝚘𝚔 𝚞𝚙 ?
𝐥𝐮𝐜𝐲𝐡𝐞𝐚𝐫𝐭: @𝐧𝐚𝐭𝐬𝐮_𝐝𝐫𝐚𝐠𝐨𝐧 𝚐𝚎𝚝 𝚘𝚏𝚏 𝚖𝚢 𝚙𝚊𝚐𝚎
𝘪𝘯 𝘸𝘩𝘪𝘤𝘩 𝘵𝘸𝘰 𝘵𝘦𝘦𝘯𝘢𝘨𝘦𝘳𝘴 𝘮𝘦𝘦𝘵 𝘵𝘩𝘳𝘰𝘶𝘨𝘩 𝘪𝘯𝘴𝘵𝘢𝘨𝘳𝘢𝘮
𝐟𝐚𝐢𝐫𝐲 𝐭𝐚𝐢𝐥 | 𝐬𝐨𝐜𝐢𝐚𝐥 𝐦𝐞𝐝𝐢𝐚 | 𝐚𝐧𝐢𝐦𝐞</dc:description><dc:subject>juvialockser</dc:subject><dc:subject>grayfullbuster</dc:subject><dc:subject>fairytail</dc:subject><dc:subject>levymcgarden</dc:subject><dc:subject>erzascarlet</dc:subject><dc:subject>gajeelredfox</dc:subject><dc:subject>anime</dc:subject><dc:subject>highschool</dc:subject><dc:subject>nalu</dc:subject><dc:subject>In-Progress</dc:subject><dc:subject>instagram</dc:subject><dc:subject>gruvia</dc:subject><dc:subject>wattys2017</dc:subject><dc:subject>wendymarvell</dc:subject><dc:subject>romance</dc:subject><dc:subject>natsudragneel</dc:subject><dc:subject>lucyheartfilia</dc:subject><dc:subject>fanfiction</dc:subject><dc:subject>Fanfiction</dc:subject><dc:subject>FanFiction</dc:subject><dc:subject>gajevy</dc:subject><dc:publisher>www.wattpad.com</dc:publisher><dc:identifier opf:scheme="URL">https://www.wattpad.com/story/91590087</dc:identifier><dc:source>https://www.wattpad.com/story/91590087</dc:source><meta name="cover" content="image0000"/></metadata><manifest><item href="toc.ncx" id="ncx" media-type="application/x-dtbncx+xml"/><item href="OEBPS/images/cover.jpg" id="image0000" media-type="image/jpeg"/><item href="OEBPS/images/ffdl-1.jpg" id="image0001" media-type="image/jpeg"/><item href="OEBPS/images/ffdl-2.jpg" id="image0002" media-type="image/jpeg"/><item href="OEBPS/stylesheet.css" id="style" media-type="text/css"/><item href="OEBPS/cover.xhtml" id="cover" media-type="application/xhtml+xml"/><item href="OEBPS/title_page.xhtml" id="title_page" media-type="application/xhtml+xml"/><item href="OEBPS/log_page.xhtml" id="log_page" media-type="application/xhtml+xml"/><item href="OEBPS/file0001.xhtml" id="file0001" media-type="application/xhtml+xml"/><item href="OEBPS/file0002.xhtml" id="file0002" media-type="application/xhtml+xml"/><item href="OEBPS/file0003.xhtml" id="file0003" media-type="application/xhtml+xml"/></manifest><spine toc="ncx"><itemref idref="cover" linear="yes"/><itemref idref="title_page" 
linear="yes"/><itemref idref="log_page" linear="yes"/><itemref idref="file0001" linear="yes"/><itemref idref="file0002" linear="yes"/><itemref idref="file0003" linear="yes"/></spine><guide><reference href="OEBPS/cover.xhtml" title="Cover" type="cover"/></guide></package>

View File

@@ -17,14 +17,24 @@ class Resolver(etree.Resolver):
return self.resolve_string('', context)
def create_parser(recover, encoding=None):
    """Build an lxml ``XMLParser`` with network access disabled.

    ``recover`` and ``encoding`` are forwarded unchanged to
    ``etree.XMLParser``. ``encoding`` defaults to ``None`` so that existing
    one-argument calls, ``create_parser(recover)``, keep working.

    A ``Resolver`` instance is registered on the parser before it is
    returned.
    """
    parser = etree.XMLParser(recover=recover, no_network=True, encoding=encoding)
    parser.resolvers.add(Resolver())
    return parser
def safe_xml_fromstring(string_or_bytes, recover=True):
    """Parse ``string_or_bytes`` using a parser built by ``create_parser``.

    The first attempt uses ``create_parser(recover)``. Its result is returned
    as-is unless it is ``None`` while ``recover`` is true. In that case:

    * non-``bytes`` input is encoded to UTF-8 and parsed again with a
      recovering parser whose encoding is forced to ``'utf-8'``; a
      non-``None`` result from that attempt is returned immediately;
    * otherwise the (possibly re-encoded) input is parsed one more time with
      a non-recovering parser and that result is returned.
    """
    ans = fs(string_or_bytes, parser=create_parser(recover))
    if ans is None and recover:
        # This happens on Windows: if string_or_bytes is unicode and
        # contains non-BMP chars, lxml chokes.
        if not isinstance(string_or_bytes, bytes):
            string_or_bytes = string_or_bytes.encode('utf-8')
            ans = fs(string_or_bytes, parser=create_parser(True, encoding='utf-8'))
            if ans is not None:
                return ans
        # Final attempt without recovery.
        # NOTE(review): presumably this is meant to surface the real parse
        # error instead of silently returning None -- confirm.
        ans = fs(string_or_bytes, parser=create_parser(False))
    return ans
def find_tests():
@@ -57,6 +67,13 @@ def find_tests():
got = getattr(safe_xml_fromstring(templ.format(id=eid, val=val)), 'text', None)
self.assertEqual(got, expected)
def test_lxml_unicode_parsing(self):
    # Regression check: the unicode-test.opf fixture stored next to this
    # module, after being passed through xml_to_unicode, must parse to a
    # non-None result.
    from calibre.ebooks.chardet import xml_to_unicode
    here = os.path.dirname(os.path.abspath(__file__))
    fixture = os.path.join(here, 'unicode-test.opf')
    with open(fixture, 'rb') as f:
        raw = f.read()
    converted = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True, assume_utf8=True)
    # Only the first element of the returned sequence is used here.
    text = converted[0]
    self.assertIsNotNone(safe_xml_fromstring(text))
return unittest.defaultTestLoader.loadTestsFromTestCase(TestXMLParse)