diff --git a/src/calibre/ebooks/oeb/polish/parsing.py b/src/calibre/ebooks/oeb/polish/parsing.py index 174ff7d9e0..3c99c4915e 100644 --- a/src/calibre/ebooks/oeb/polish/parsing.py +++ b/src/calibre/ebooks/oeb/polish/parsing.py @@ -16,6 +16,7 @@ from html5lib.treebuilders._base import TreeBuilder as BaseTreeBuilder from html5lib.ihatexml import InfosetFilter, DataLossWarning from html5lib.html5parser import HTMLParser +from calibre import xml_replace_entities from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.oeb.parse_utils import fix_self_closing_cdata_tags from calibre.utils.cleantext import clean_xml_chars @@ -355,8 +356,8 @@ class TreeBuilder(BaseTreeBuilder): def parse(raw, decoder=None, log=None): if isinstance(raw, bytes): raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw) - # TODO: Replace entities? raw = fix_self_closing_cdata_tags(raw) # TODO: Handle this in the parser + raw = xml_replace_entities(raw) while True: try: parser = HTMLParser(tree=TreeBuilder) @@ -375,7 +376,7 @@ def parse(raw, decoder=None, log=None): if __name__ == '__main__': from lxml import etree - root = parse(''.format(html=namespaces['html'])) + root = parse('

 ') print (etree.tostring(root)) print() diff --git a/src/calibre/ebooks/oeb/polish/tests/parsing.py b/src/calibre/ebooks/oeb/polish/tests/parsing.py index ed83d0342c..01fb5ec9e9 100644 --- a/src/calibre/ebooks/oeb/polish/tests/parsing.py +++ b/src/calibre/ebooks/oeb/polish/tests/parsing.py @@ -117,7 +117,13 @@ def case_insensitive_element_names(test, parse_function): err = 'case sensitive parsing, parsed markup:\n' + etree.tostring(root) test.assertEqual(len(XPath('//h:p')(root)), 1, err) -basic_checks = (nonvoid_cdata_elements, namespaces, space_characters, case_insensitive_element_names) +def entities(test, parse_function): + markup = '

 '

' + root = parse_function(markup) + err = 'Entities not handled, parsed markup:\n' + etree.tostring(root) + test.assertEqual('\xa0\'', root.xpath('//*[local-name()="p"]')[0].text, err) + +basic_checks = (nonvoid_cdata_elements, namespaces, space_characters, case_insensitive_element_names, entities) class ParsingTests(BaseTest):