mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Replace entities
This commit is contained in:
parent
03a39f15d1
commit
002895ff42
@ -16,6 +16,7 @@ from html5lib.treebuilders._base import TreeBuilder as BaseTreeBuilder
|
|||||||
from html5lib.ihatexml import InfosetFilter, DataLossWarning
|
from html5lib.ihatexml import InfosetFilter, DataLossWarning
|
||||||
from html5lib.html5parser import HTMLParser
|
from html5lib.html5parser import HTMLParser
|
||||||
|
|
||||||
|
from calibre import xml_replace_entities
|
||||||
from calibre.ebooks.chardet import xml_to_unicode
|
from calibre.ebooks.chardet import xml_to_unicode
|
||||||
from calibre.ebooks.oeb.parse_utils import fix_self_closing_cdata_tags
|
from calibre.ebooks.oeb.parse_utils import fix_self_closing_cdata_tags
|
||||||
from calibre.utils.cleantext import clean_xml_chars
|
from calibre.utils.cleantext import clean_xml_chars
|
||||||
@ -355,8 +356,8 @@ class TreeBuilder(BaseTreeBuilder):
|
|||||||
def parse(raw, decoder=None, log=None):
|
def parse(raw, decoder=None, log=None):
|
||||||
if isinstance(raw, bytes):
|
if isinstance(raw, bytes):
|
||||||
raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
|
raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
|
||||||
# TODO: Replace entities?
|
|
||||||
raw = fix_self_closing_cdata_tags(raw) # TODO: Handle this in the parser
|
raw = fix_self_closing_cdata_tags(raw) # TODO: Handle this in the parser
|
||||||
|
raw = xml_replace_entities(raw)
|
||||||
while True:
|
while True:
|
||||||
try:
|
try:
|
||||||
parser = HTMLParser(tree=TreeBuilder)
|
parser = HTMLParser(tree=TreeBuilder)
|
||||||
@ -375,7 +376,7 @@ def parse(raw, decoder=None, log=None):
|
|||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
root = parse('<html:html xmlns:html="{html}" id="a"><html:p><html:p></html:html>'.format(html=namespaces['html']))
|
root = parse('<html><p>&nbsp;')
|
||||||
print (etree.tostring(root))
|
print (etree.tostring(root))
|
||||||
print()
|
print()
|
||||||
|
|
||||||
|
@ -117,7 +117,13 @@ def case_insensitive_element_names(test, parse_function):
|
|||||||
err = 'case sensitive parsing, parsed markup:\n' + etree.tostring(root)
|
err = 'case sensitive parsing, parsed markup:\n' + etree.tostring(root)
|
||||||
test.assertEqual(len(XPath('//h:p')(root)), 1, err)
|
test.assertEqual(len(XPath('//h:p')(root)), 1, err)
|
||||||
|
|
||||||
basic_checks = (nonvoid_cdata_elements, namespaces, space_characters, case_insensitive_element_names)
|
def entities(test, parse_function):
|
||||||
|
markup = '<html><p>&nbsp;&apos;</p>'
|
||||||
|
root = parse_function(markup)
|
||||||
|
err = 'Entities not handled, parsed markup:\n' + etree.tostring(root)
|
||||||
|
test.assertEqual('\xa0\'', root.xpath('//*[local-name()="p"]')[0].text, err)
|
||||||
|
|
||||||
|
basic_checks = (nonvoid_cdata_elements, namespaces, space_characters, case_insensitive_element_names, entities)
|
||||||
|
|
||||||
class ParsingTests(BaseTest):
|
class ParsingTests(BaseTest):
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user