Reformatted from a whitespace-collapsed copy of the patch. The "index" blob
ids are those of the original patch and do not describe the post-images of
the hunks that were amended during review.

diff --git a/src/calibre/ebooks/oeb/polish/parsing.py b/src/calibre/ebooks/oeb/polish/parsing.py
index dda22db9f3..e6a56a2591 100644
--- a/src/calibre/ebooks/oeb/polish/parsing.py
+++ b/src/calibre/ebooks/oeb/polish/parsing.py
@@ -367,6 +367,24 @@ class TreeBuilder(BaseTreeBuilder):
             for child in html:
                 newroot.append(copy.copy(child))
 
+    def apply_body_attributes(self, attrs):
+        ''' Copy the attributes of a repeated <body> start tag onto the
+        existing body element (self.openElements[1]). Attributes that are
+        already set on the body are left untouched. '''
+        if not attrs:
+            return
+        body = self.openElements[1]
+        nsmap = body.nsmap.copy()
+        attribs = process_attribs(attrs, nsmap)
+        for k, v in attribs.iteritems():
+            if k not in body.attrib:
+                try:
+                    body.set(k, v)
+                except ValueError:
+                    # Name rejected by set(), retry with its to_xml_name() form
+                    body.set(to_xml_name(k), v)
+        # We ignore xmlns attributes on non-first tags
+
     def insertComment(self, token, parent=None):
         if parent is None:
             parent = self.openElements[-1]
@@ -425,6 +443,21 @@ class NoNamespaceTreeBuilder(TreeBuilder):
                 except ValueError:
                     html.set(to_xml_name(k), v)
 
+    def apply_body_attributes(self, attrs):
+        ''' Copy the attributes of a repeated <body> start tag onto the
+        existing body element (self.openElements[1]), after converting them
+        with process_namespace_free_attribs(). Existing attributes win. '''
+        if not attrs:
+            return
+        body = self.openElements[1]
+        attribs = process_namespace_free_attribs(attrs)
+        for k, v in attribs.iteritems():
+            if k not in body.attrib:
+                try:
+                    body.set(k, v)
+                except ValueError:
+                    body.set(to_xml_name(k), v)
+
 # Input Stream {{{
 _regex_cache = {}
 
diff --git a/src/calibre/ebooks/oeb/polish/tests/parsing.py b/src/calibre/ebooks/oeb/polish/tests/parsing.py
index a70df254f0..ec65fe6d51 100644
--- a/src/calibre/ebooks/oeb/polish/tests/parsing.py
+++ b/src/calibre/ebooks/oeb/polish/tests/parsing.py
@@ -126,7 +126,20 @@ def entities(test, parse_function):
     err = 'Entities not handled, parsed markup:\n' + etree.tostring(root)
     test.assertEqual('\xa0\'', root.xpath('//*[local-name()="p"]')[0].text, err)
 
-basic_checks = (nonvoid_cdata_elements, namespaces, space_characters, case_insensitive_element_names, entities)
+def multiple_html_and_body(test, parse_function):
+    # Repeated <html>/<body> start tags must not create extra elements, their
+    # attributes are expected to end up on the single html/body element.
+    # NOTE(review): the markup literal was stripped from the copy of the patch
+    # under review and is reconstructed from the assertions below, confirm it.
+    markup = '<html id="1"><body id="2"><p><html lang="en"><body lang="de"></p>'
+    root = parse_function(markup)
+    err = 'multiple html and body not handled, parsed markup:\n' + etree.tostring(root)
+    test.assertEqual(len(XPath('//h:html')(root)), 1, err)
+    test.assertEqual(len(XPath('//h:body')(root)), 1, err)
+    test.assertEqual(len(XPath('//h:html[@id and @lang]')(root)), 1, err)
+    test.assertEqual(len(XPath('//h:body[@id and @lang]')(root)), 1, err)
+
+basic_checks = (nonvoid_cdata_elements, namespaces, space_characters, case_insensitive_element_names, entities, multiple_html_and_body)
 
 class ParsingTests(BaseTest):
 
diff --git a/src/html5lib/html5parser.py b/src/html5lib/html5parser.py
index 936a99c20e..49d290e37c 100644
--- a/src/html5lib/html5parser.py
+++ b/src/html5lib/html5parser.py
@@ -1019,9 +1019,7 @@ def getPhases(debug):
                 assert self.parser.innerHTML
             else:
                 self.parser.framesetOK = False
-                for attr, value in token["data"].items():
-                    if attr not in self.tree.openElements[1].attributes:
-                        self.tree.openElements[1].attributes[attr] = value
+                self.tree.apply_body_attributes(token['data'])
 
         def startTagFrameset(self, token):
             self.parser.parseError("unexpected-start-tag", {"name": "frameset"})
diff --git a/src/html5lib/treebuilders/_base.py b/src/html5lib/treebuilders/_base.py
index f426823c1f..250cc06dfc 100644
--- a/src/html5lib/treebuilders/_base.py
+++ b/src/html5lib/treebuilders/_base.py
@@ -274,6 +274,13 @@ class TreeBuilder(object):
             if attr not in self.openElements[0].attributes:
                 self.openElements[0].attributes[attr] = value
 
+    def apply_body_attributes(self, attrs):
+        """Copy attrs onto the body element (self.openElements[1]),
+        leaving attributes it already has untouched."""
+        for attr, value in attrs.items():
+            if attr not in self.openElements[1].attributes:
+                self.openElements[1].attributes[attr] = value
+
     def _getInsertFromTable(self):
         return self._insertFromTable
 