Implement namespace-less parsing

This commit is contained in:
Kovid Goyal 2013-10-25 18:02:42 +05:30
parent ea7930ee83
commit 2c6d5985a5

View File

@ -24,6 +24,7 @@ from calibre.utils.cleantext import clean_xml_chars
infoset_filter = InfosetFilter() infoset_filter = InfosetFilter()
to_xml_name = infoset_filter.toXmlName to_xml_name = infoset_filter.toXmlName
known_namespaces = {namespaces[k]:k for k in ('mathml', 'svg')} known_namespaces = {namespaces[k]:k for k in ('mathml', 'svg')}
html_ns = namespaces['html']
class NamespacedHTMLPresent(ValueError): class NamespacedHTMLPresent(ValueError):
@ -32,11 +33,6 @@ class NamespacedHTMLPresent(ValueError):
self.prefix = prefix self.prefix = prefix
# Nodes {{{ # Nodes {{{
def create_lxml_context():
    '''
    Build the lxml parser that is used as the factory context for tree nodes.

    The parser is created with network access disabled, and its class lookup
    is set so that element nodes are instantiated as Element and comment
    nodes as Comment.
    '''
    node_classes = ElementDefaultClassLookup(element=Element, comment=Comment)
    context = XMLParser(no_network=True)
    context.set_element_class_lookup(node_classes)
    return context
def ElementFactory(name, namespace=None, context=None): def ElementFactory(name, namespace=None, context=None):
context = context or create_lxml_context() context = context or create_lxml_context()
ns = namespace or namespaces['html'] ns = namespace or namespaces['html']
@ -45,9 +41,6 @@ def ElementFactory(name, namespace=None, context=None):
except ValueError: except ValueError:
return context.makeelement('{%s}%s' % (ns, to_xml_name(name)), nsmap={None:ns}) return context.makeelement('{%s}%s' % (ns, to_xml_name(name)), nsmap={None:ns})
def CommentFactory(text):
    '''
    Create a Comment node from ``text``, rewritten so that it is legal
    content for an XML comment.

    XML forbids ``--`` anywhere inside a comment and forbids a comment whose
    text ends in ``-``. Every pair of adjacent dashes is therefore separated
    by a space, and a space is appended after a trailing dash.

    :param text: the raw comment text
    :return: a Comment node holding the sanitized text
    '''
    # str.replace() is non-overlapping, so a single pass turns '---' into
    # '- --', which still contains '--'. Repeat until no pair is left.
    while '--' in text:
        text = text.replace('--', '- -')
    # A trailing dash would serialize as '--->' which is not well-formed.
    if text.endswith('-'):
        text += ' '
    return Comment(text)
class Element(ElementBase): class Element(ElementBase):
''' Implements the interface required by the html5lib tree builders (see ''' Implements the interface required by the html5lib tree builders (see
@ -146,6 +139,24 @@ class Element(ElementBase):
for child in self: for child in self:
new_parent.append(child) new_parent.append(child)
class NoNameSpaceElement(Element):

    '''
    Element subclass for namespace-free trees: ``name`` is read from and
    written to the raw tag, ``namespace`` is always None, and ``nameTuple``
    pairs the tag with the HTML namespace.
    '''

    @property
    def nameTuple(self):
        # The tag itself carries no namespace, so the HTML namespace is
        # supplied explicitly as the first member of the tuple.
        return html_ns, self.tag

    @property
    def namespace(self):
        return None

    @dynamic_property
    def name(self):
        def read_tag(self):
            return self.tag

        def write_tag(self, val):
            self.tag = val

        return property(fget=read_tag, fset=write_tag)
class Comment(CommentBase): class Comment(CommentBase):
@dynamic_property @dynamic_property
@ -214,6 +225,12 @@ class DocType(object):
def __init__(self, name, public_id, system_id): def __init__(self, name, public_id, system_id):
self.text = self.name = name self.text = self.name = name
self.public_id, self.system_id = public_id, system_id self.public_id, self.system_id = public_id, system_id
def create_lxml_context(element=Element):
    '''
    Build the lxml parser that is used as the factory context for tree nodes.

    :param element: class the parser instantiates for element nodes; comment
        nodes are always instantiated as Comment
    :return: an XMLParser created with network access disabled
    '''
    node_classes = ElementDefaultClassLookup(element=element, comment=Comment)
    context = XMLParser(no_network=True)
    context.set_element_class_lookup(node_classes)
    return context
# }}} # }}}
def process_attribs(attrs, nsmap): def process_attribs(attrs, nsmap):
@ -262,15 +279,13 @@ def process_attribs(attrs, nsmap):
class TreeBuilder(BaseTreeBuilder): class TreeBuilder(BaseTreeBuilder):
elementClass = ElementFactory elementClass = ElementFactory
commentClass = Comment
documentClass = Document documentClass = Document
doctypeClass = DocType doctypeClass = DocType
def __init__(self, namespaceHTMLElements=True): def __init__(self, namespaceHTMLElements=True):
BaseTreeBuilder.__init__(self, True) BaseTreeBuilder.__init__(self, namespaceHTMLElements)
self.lxml_context = create_lxml_context() self.lxml_context = create_lxml_context()
self.elementClass = partial(ElementFactory, context=self.lxml_context) self.elementClass = partial(ElementFactory, context=self.lxml_context)
self.seen_extra_html = False
def getDocument(self): def getDocument(self):
return self.document.root return self.document.root
@ -288,10 +303,10 @@ class TreeBuilder(BaseTreeBuilder):
nsmap = nsmap or {} nsmap = nsmap or {}
attribs = process_attribs(token['data'], nsmap) attribs = process_attribs(token['data'], nsmap)
name = token["name"] name = token["name"]
if name.endswith(':html'):
raise NamespacedHTMLPresent(name.rpartition(':')[0])
namespace = token.get("namespace", self.defaultNamespace) namespace = token.get("namespace", self.defaultNamespace)
if ':' in name: if ':' in name:
if name.endswith(':html'):
raise NamespacedHTMLPresent(name.rpartition(':')[0])
prefix, name = name.partition(':')[0::2] prefix, name = name.partition(':')[0::2]
namespace = nsmap.get(prefix, namespace) namespace = nsmap.get(prefix, namespace)
try: try:
@ -353,14 +368,56 @@ class TreeBuilder(BaseTreeBuilder):
for child in html: for child in html:
newroot.append(copy.copy(child)) newroot.append(copy.copy(child))
def parse(raw, decoder=None, log=None): def insertComment(self, token, parent=None):
if parent is None:
parent = self.openElements[-1]
parent.appendChild(Comment(token["data"].replace('--', '- -')))
def process_namespace_free_attribs(attrs):
    '''
    Return a new dict of attributes with namespace prefixes removed.

    Attributes whose name contains no ``:`` are copied unchanged. For a
    prefixed attribute the text after the first ``:`` becomes the name,
    subject to these rules:

      * ``xmlns:*`` declarations are dropped
      * a prefixed attribute never overrides an un-prefixed attribute of
        the same name
      * when several prefixed attributes share a local name, the one whose
        full name sorts first wins, so the result does not depend on
        dict/set iteration order
      * an attribute with an empty local name (for example ``foo:``) is
        dropped, since an empty string is not a valid attribute name

    :param attrs: mapping of attribute name to value; it is not modified
    :return: dict mapping namespace-free attribute names to values
    '''
    # items() rather than iteritems() so the function runs on Python 2 and 3
    attribs = {k: v for k, v in attrs.items() if ':' not in k}
    for k in sorted(k for k in attrs if ':' in k):
        prefix, name = k.partition(':')[0::2]
        if prefix != 'xmlns' and name and name not in attribs:
            attribs[name] = attrs[k]
    return attribs
class NoNamespaceTreeBuilder(TreeBuilder):

    '''
    Tree builder variant that creates NoNameSpaceElement nodes and strips
    namespace prefixes from tag and attribute names.
    '''

    def __init__(self, namespaceHTMLElements=False):
        # BaseTreeBuilder.__init__ is called directly rather than
        # TreeBuilder.__init__; the context is then built here with
        # NoNameSpaceElement as the element class.
        BaseTreeBuilder.__init__(self, namespaceHTMLElements)
        context = create_lxml_context(element=NoNameSpaceElement)
        self.lxml_context = context
        self.elementClass = partial(ElementFactory, context=context)

    def createElement(self, token, nsmap=None):
        ''' Create an element for ``token`` using only the part of the tag
        name after the last ``:`` and namespace-free attributes. ``nsmap``
        is accepted but not used. '''
        local_name = token['name'].rpartition(':')[2]
        attributes = process_namespace_free_attribs(token['data'])
        factory = self.lxml_context.makeelement
        try:
            element = factory(local_name, attrib=attributes)
        except ValueError:
            # Retry with the tag and attribute names passed through to_xml_name
            safe_attributes = {to_xml_name(key):value for key, value in attributes.iteritems()}
            element = factory(to_xml_name(local_name), attrib=safe_attributes)
        return element

    def apply_html_attributes(self, attrs):
        ''' Copy namespace-free versions of ``attrs`` onto the first open
        element, leaving attributes that are already set untouched. '''
        if attrs:
            root = self.openElements[0]
            for key, value in process_namespace_free_attribs(attrs).iteritems():
                if key in root.attrib:
                    continue
                try:
                    root.set(key, value)
                except ValueError:
                    root.set(to_xml_name(key), value)
def parse(raw, decoder=None, log=None, discard_namespaces=False):
if isinstance(raw, bytes): if isinstance(raw, bytes):
raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw) raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
raw = fix_self_closing_cdata_tags(raw) # TODO: Handle this in the parser raw = fix_self_closing_cdata_tags(raw) # TODO: Handle this in the parser
raw = xml_replace_entities(raw) raw = xml_replace_entities(raw)
builder = NoNamespaceTreeBuilder if discard_namespaces else TreeBuilder
while True: while True:
try: try:
parser = HTMLParser(tree=TreeBuilder) parser = HTMLParser(tree=builder, namespaceHTMLElements=not discard_namespaces)
with warnings.catch_warnings(): with warnings.catch_warnings():
warnings.simplefilter('ignore', category=DataLossWarning) warnings.simplefilter('ignore', category=DataLossWarning)
parser.parse(raw, parseMeta=False, useChardet=False) parser.parse(raw, parseMeta=False, useChardet=False)
@ -369,14 +426,15 @@ def parse(raw, decoder=None, log=None):
continue continue
break break
root = parser.tree.getDocument() root = parser.tree.getDocument()
if root.tag != '{%s}%s' % (namespaces['html'], 'html') or root.prefix: if (discard_namespaces and root.tag != 'html') or (
not discard_namespaces and (root.tag != '{%s}%s' % (namespaces['html'], 'html') or root.prefix)):
raise ValueError('Failed to parse correctly, root has tag: %s and prefix: %s' % (root.tag, root.prefix)) raise ValueError('Failed to parse correctly, root has tag: %s and prefix: %s' % (root.tag, root.prefix))
return root return root
if __name__ == '__main__': if __name__ == '__main__':
from lxml import etree from lxml import etree
root = parse('<html><p>&nbsp;') root = parse('<html><p>&nbsp;<b>b', discard_namespaces=True)
print (etree.tostring(root)) print (etree.tostring(root, encoding='utf-8'))
print() print()