Replace entities

This commit is contained in:
Kovid Goyal 2013-10-25 15:58:37 +05:30
parent 03a39f15d1
commit 002895ff42
2 changed files with 10 additions and 3 deletions

View File

@@ -16,6 +16,7 @@ from html5lib.treebuilders._base import TreeBuilder as BaseTreeBuilder
from html5lib.ihatexml import InfosetFilter, DataLossWarning from html5lib.ihatexml import InfosetFilter, DataLossWarning
from html5lib.html5parser import HTMLParser from html5lib.html5parser import HTMLParser
from calibre import xml_replace_entities
from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.chardet import xml_to_unicode
from calibre.ebooks.oeb.parse_utils import fix_self_closing_cdata_tags from calibre.ebooks.oeb.parse_utils import fix_self_closing_cdata_tags
from calibre.utils.cleantext import clean_xml_chars from calibre.utils.cleantext import clean_xml_chars
@@ -355,8 +356,8 @@ class TreeBuilder(BaseTreeBuilder):
def parse(raw, decoder=None, log=None): def parse(raw, decoder=None, log=None):
if isinstance(raw, bytes): if isinstance(raw, bytes):
raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw) raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
# TODO: Replace entities?
raw = fix_self_closing_cdata_tags(raw) # TODO: Handle this in the parser raw = fix_self_closing_cdata_tags(raw) # TODO: Handle this in the parser
raw = xml_replace_entities(raw)
while True: while True:
try: try:
parser = HTMLParser(tree=TreeBuilder) parser = HTMLParser(tree=TreeBuilder)
@@ -375,7 +376,7 @@ def parse(raw, decoder=None, log=None):
if __name__ == '__main__': if __name__ == '__main__':
from lxml import etree from lxml import etree
root = parse('<html:html xmlns:html="{html}" id="a"><html:p><html:p></html:html>'.format(html=namespaces['html'])) root = parse('<html><p>&nbsp;')
print (etree.tostring(root)) print (etree.tostring(root))
print() print()

View File

@@ -117,7 +117,13 @@ def case_insensitive_element_names(test, parse_function):
err = 'case sensitive parsing, parsed markup:\n' + etree.tostring(root) err = 'case sensitive parsing, parsed markup:\n' + etree.tostring(root)
test.assertEqual(len(XPath('//h:p')(root)), 1, err) test.assertEqual(len(XPath('//h:p')(root)), 1, err)
basic_checks = (nonvoid_cdata_elements, namespaces, space_characters, case_insensitive_element_names) def entities(test, parse_function):
markup = '<html><p>&nbsp;&apos;</p>'
root = parse_function(markup)
err = 'Entities not handled, parsed markup:\n' + etree.tostring(root)
test.assertEqual('\xa0\'', root.xpath('//*[local-name()="p"]')[0].text, err)
basic_checks = (nonvoid_cdata_elements, namespaces, space_characters, case_insensitive_element_names, entities)
class ParsingTests(BaseTest): class ParsingTests(BaseTest):