diff --git a/src/calibre/ebooks/oeb/polish/parsing.py b/src/calibre/ebooks/oeb/polish/parsing.py
index 174ff7d9e0..3c99c4915e 100644
--- a/src/calibre/ebooks/oeb/polish/parsing.py
+++ b/src/calibre/ebooks/oeb/polish/parsing.py
@@ -16,6 +16,7 @@ from html5lib.treebuilders._base import TreeBuilder as BaseTreeBuilder
from html5lib.ihatexml import InfosetFilter, DataLossWarning
from html5lib.html5parser import HTMLParser
+from calibre import xml_replace_entities
from calibre.ebooks.chardet import xml_to_unicode
from calibre.ebooks.oeb.parse_utils import fix_self_closing_cdata_tags
from calibre.utils.cleantext import clean_xml_chars
@@ -355,8 +356,8 @@ class TreeBuilder(BaseTreeBuilder):
 def parse(raw, decoder=None, log=None):
     if isinstance(raw, bytes):
         raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
-    # TODO: Replace entities?
     raw = fix_self_closing_cdata_tags(raw) # TODO: Handle this in the parser
+    raw = xml_replace_entities(raw)
     while True:
         try:
             parser = HTMLParser(tree=TreeBuilder)
@@ -375,7 +376,7 @@ def parse(raw, decoder=None, log=None):
if __name__ == '__main__':
from lxml import etree
- root = parse('
')
     print (etree.tostring(root))
     print()
diff --git a/src/calibre/ebooks/oeb/polish/tests/parsing.py b/src/calibre/ebooks/oeb/polish/tests/parsing.py
index ed83d0342c..01fb5ec9e9 100644
--- a/src/calibre/ebooks/oeb/polish/tests/parsing.py
+++ b/src/calibre/ebooks/oeb/polish/tests/parsing.py
@@ -117,7 +117,13 @@ def case_insensitive_element_names(test, parse_function):
     err = 'case sensitive parsing, parsed markup:\n' + etree.tostring(root)
     test.assertEqual(len(XPath('//h:p')(root)), 1, err)
 
-basic_checks = (nonvoid_cdata_elements, namespaces, space_characters, case_insensitive_element_names)
+def entities(test, parse_function):
+    markup = '<html><p>&nbsp;&apos;</p></html>'
+    root = parse_function(markup)
+    err = 'Entities not handled, parsed markup:\n' + etree.tostring(root)
+    test.assertEqual('\xa0\'', root.xpath('//*[local-name()="p"]')[0].text, err)
+
+basic_checks = (nonvoid_cdata_elements, namespaces, space_characters, case_insensitive_element_names, entities)
 
 class ParsingTests(BaseTest):