Fix parsing of self-closed non-void CDATA elements and preserve namespaces

* oeb/base.py: add title, script and style to self_closing_bad_tags.
* oeb/parse_utils.py (html5_parse): rewrite self-closed title/style/script/
  textarea tags (case-insensitively) as an explicit open + close pair before
  handing the markup to html5lib; when resolving a prefix on a tag or an
  attribute, fall back to remapped_namespaces, i.e. the recovered
  declarations whose prefix is absent from the cloned element's nsmap.
* oeb/polish/tests/parsing.py: new tests exercising both behaviours.

diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py
index e0e7a665cd..647c52c581 100644
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@@ -88,7 +88,7 @@ self_closing_bad_tags = {'a', 'abbr', 'address', 'article', 'aside', 'audio', 'b
 'label', 'legend', 'li', 'map', 'mark', 'meter', 'nav', 'ol', 'output', 'p',
 'pre', 'progress', 'q', 'rp', 'rt', 'samp', 'section', 'select', 'small',
 'span', 'strong', 'sub', 'summary', 'sup', 'textarea', 'time', 'ul', 'var',
-'video'}
+'video', 'title', 'script', 'style'}
 
 _self_closing_pat = re.compile(
     r'<(?P<tag>%s)(?=[\s/])(?P<arg>[^>]*)/>'%('|'.join(self_closing_bad_tags)),
diff --git a/src/calibre/ebooks/oeb/parse_utils.py b/src/calibre/ebooks/oeb/parse_utils.py
index dd4ee11cc6..7ee02a27ac 100644
--- a/src/calibre/ebooks/oeb/parse_utils.py
+++ b/src/calibre/ebooks/oeb/parse_utils.py
@@ -83,7 +83,7 @@ def node_depth(node):
 def html5_parse(data, max_nesting_depth=100):
     import html5lib
     # html5lib bug: http://code.google.com/p/html5lib/issues/detail?id=195
-    data = re.sub(r'<\s*title\s*[^>]*/\s*>', '', data)
+    data = re.sub(r'<\s*(title|style|script|textarea)\s*[^>]*/\s*>', r'<\1></\1>', data, flags=re.I)
 
     data = html5lib.parse(data, treebuilder='lxml').getroot()
 
@@ -116,6 +116,7 @@ def html5_parse(data, max_nesting_depth=100):
                     prefix = x[11:]
                 namespaces[prefix] = val
 
+        remapped_namespaces = {}
         if namespaces:
             # Some destroyed namespace declarations were found
             p = elem.getparent()
@@ -127,6 +128,7 @@ def html5_parse(data, max_nesting_depth=100):
                 p.remove(elem)
                 elem = clone_element(elem, nsmap=namespaces)
                 p.insert(idx, elem)
+                remapped_namespaces = {ns:namespaces[ns] for ns in set(namespaces) - set(elem.nsmap)}
 
         b = barename(elem.tag)
         idx = b.find('U0003A')
@@ -135,6 +137,8 @@ def html5_parse(data, max_nesting_depth=100):
             ns = elem.nsmap.get(prefix, None)
             if ns is None:
                 ns = non_html5_namespaces.get(prefix, None)
+            if ns is None:
+                ns = remapped_namespaces.get(prefix, None)
             if ns is not None:
                 elem.tag = '{%s}%s'%(ns, tag)
 
@@ -145,6 +149,8 @@ def html5_parse(data, max_nesting_depth=100):
                 ns = elem.nsmap.get(prefix, None)
                 if ns is None:
                     ns = non_html5_namespaces.get(prefix, None)
+                if ns is None:
+                    ns = remapped_namespaces.get(prefix, None)
                 if ns is not None:
                     elem.attrib['{%s}%s'%(ns, tag)] = elem.attrib.pop(b)
 
diff --git a/src/calibre/ebooks/oeb/polish/tests/parsing.py b/src/calibre/ebooks/oeb/polish/tests/parsing.py
new file mode 100644
index 0000000000..ba47735018
--- /dev/null
+++ b/src/calibre/ebooks/oeb/polish/tests/parsing.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python
+# vim:fileencoding=utf-8
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal '
+
+from lxml import etree
+
+from calibre.ebooks.oeb.polish.tests.base import BaseTest
+from calibre.ebooks.oeb.base import XPath, XHTML_NS, SVG_NS, XLINK_NS
+from calibre.ebooks.oeb.parse_utils import html5_parse
+
+def nonvoid_cdata_elements(test, parse_function):
+    ''' If self closed version of non-void cdata elements like <title/> are
+    present, the HTML5 parsing algorithm treats all following data as CDATA '''
+    markup = '''
+    <html> <head><{0}/></head> <body id="test"> </html>
+    '''
+    for tag in ('title', 'style', 'script', 'textarea'):
+        for x in (tag, tag.upper(), '\n' + tag, tag + ' id="xxx" '):
+            root = parse_function(markup.format(x))
+            test.assertEqual(
+                len(XPath('//h:body[@id="test"]')(root)), 1,
+                'Incorrect parsing for <%s/>, parsed markup:\n' % x + etree.tostring(root))
+
+def namespaces(test, parse_function):
+    ae = test.assertEqual
+    markup = ''' <html xmlns="{xhtml}"><head><body id="test"></html> '''.format(xhtml=XHTML_NS)
+    root = parse_function(markup)
+    ae(
+        len(XPath('//h:body[@id="test"]')(root)), 1,
+        'Incorrect parsing, parsed markup:\n' + etree.tostring(root))
+
+    markup = '''
+    <html xmlns="{xhtml}"><head><body id="test">
+    <svg:svg xmlns:svg="{svg}"><svg:image xmlns:xlink="{xlink}" xlink:href="xxx"/></svg:svg>
+    '''.format(xhtml=XHTML_NS, svg=SVG_NS, xlink=XLINK_NS)
+    root = parse_function(markup)
+    err = 'Incorrect parsing, parsed markup:\n' + etree.tostring(root)
+    ae(len(XPath('//h:body[@id="test"]')(root)), 1, err)
+    ae(len(XPath('//svg:svg')(root)), 1, err)
+    ae(len(XPath('//svg:image[@xl:href]')(root)), 1, err)
+
+    markup = '''
+    <html xmlns="{xhtml}"><head><body id="test">
+    <svg xmlns="{svg}" xmlns:xlink="{xlink}" ><image xlink:href="xxx"/></svg>
+    '''.format(xhtml=XHTML_NS, svg=SVG_NS, xlink=XLINK_NS)
+    root = parse_function(markup)
+    err = 'Incorrect parsing, parsed markup:\n' + etree.tostring(root)
+    ae(len(XPath('//h:body[@id="test"]')(root)), 1, err)
+    ae(len(XPath('//svg:svg')(root)), 1, err)
+    ae(len(XPath('//svg:image[@xl:href]')(root)), 1, err)
+
+    markup = '<html><body><svg><image xlink:href="xxx"></svg>'
+    root = parse_function(markup)
+    err = 'Namespaces not created, parsed markup:\n' + etree.tostring(root)
+    ae(len(XPath('//svg:svg')(root)), 1, err)
+    ae(len(XPath('//svg:image[@xl:href]')(root)), 1, err)
+
+    markup = '<html><body><ns1:tag1 xmlns:ns1="NS"><ns2:tag2 xmlns:ns2="NS" ns1:id="test"/><ns1:tag3 xmlns:ns1="NS2" ns1:id="test"/></ns1:tag1>'
+    root = parse_function(markup)
+    err = 'Arbitrary namespaces not preserved, parsed markup:\n' + etree.tostring(root)
+    def xpath(expr):
+        return etree.XPath(expr, namespaces={'ns1':'NS', 'ns2':'NS2'})(root)
+    ae(len(xpath('//ns1:tag1')), 1, err)
+    ae(len(xpath('//ns1:tag2')), 1, err)
+    ae(len(xpath('//ns2:tag3')), 1, err)
+    ae(len(xpath('//ns1:tag2[@ns1:id="test"]')), 1, err)
+    ae(len(xpath('//ns2:tag3[@ns2:id="test"]')), 1, err)
+
+all_checks = (nonvoid_cdata_elements, namespaces)
+
+class ParsingTests(BaseTest):
+
+    def test_conversion_parser(self):
+        ' Test parsing with the parser used for conversion '
+        for test in all_checks:
+            test(self, html5_parse)