mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Conversion pipeline: HTML5 parsing: Fix handling of XML namespaces. Fixes regression in 0.8.30 that caused some articles in some news downloads to appear blank when viewed in Adobe Digital Editions based readers
This commit is contained in:
parent
8c1ebd1f40
commit
2515bd0f4e
@ -17,6 +17,7 @@ from calibre.ebooks.chardet import xml_to_unicode, strip_encoding_declarations
|
|||||||
|
|
||||||
RECOVER_PARSER = etree.XMLParser(recover=True, no_network=True)
|
RECOVER_PARSER = etree.XMLParser(recover=True, no_network=True)
|
||||||
XHTML_NS = 'http://www.w3.org/1999/xhtml'
|
XHTML_NS = 'http://www.w3.org/1999/xhtml'
|
||||||
|
XMLNS_NS = 'http://www.w3.org/2000/xmlns/'
|
||||||
|
|
||||||
class NotHTML(Exception):
|
class NotHTML(Exception):
|
||||||
|
|
||||||
def barename(name):
    '''Return the local part of a Clark-notation name.

    For example '{http://www.w3.org/1999/xhtml}body' -> 'body'.
    A name with no namespace component is returned unchanged.
    '''
    # rpartition returns the text after the last '}', or the whole
    # string when there is no '}' at all
    return name.rpartition('}')[-1]
||||||
def namespace(name):
    '''Return the namespace URI of a Clark-notation name.

    For example '{http://www.w3.org/1999/xhtml}body' ->
    'http://www.w3.org/1999/xhtml'. Returns the empty string for a
    name with no namespace component.
    '''
    if '}' in name:
        # Text before the last '}' is '{uri'; strip the leading '{'
        return name.rpartition('}')[0][1:]
    return ''
||||||
def XHTML(name):
    '''Return *name* qualified with the XHTML namespace in Clark
    notation, e.g. XHTML('body') -> '{http://www.w3.org/1999/xhtml}body'.
    '''
    return '{%s}%s' % (XHTML_NS, name)
@ -60,26 +59,86 @@ def merge_multiple_html_heads_and_bodies(root, log=None):
|
|||||||
log.warn('Merging multiple <head> and <body> sections')
|
log.warn('Merging multiple <head> and <body> sections')
|
||||||
return root
|
return root
|
||||||
|
|
||||||
def clone_element(elem, nsmap=None, in_context=True):
    '''Return a copy of *elem* rebuilt with the namespace map *nsmap*.

    lxml does not allow the nsmap of an existing element to be changed,
    so the only way to alter an element's namespace declarations is to
    build a replacement element. The children of *elem* are moved (not
    copied) into the clone via extend().

    :param elem: the lxml element to clone
    :param nsmap: prefix -> namespace-URI mapping for the new element
        (defaults to an empty mapping)
    :param in_context: when True, create the clone through the root
        element of elem's tree so it belongs to the same document;
        when False, create a free-standing element with etree.Element
    '''
    # Avoid a mutable default argument; the default is a fresh empty map
    if nsmap is None:
        nsmap = {}
    if in_context:
        maker = elem.getroottree().getroot().makeelement
    else:
        maker = etree.Element
    nelem = maker(elem.tag, attrib=elem.attrib, nsmap=nsmap)
    nelem.text, nelem.tail = elem.text, elem.tail
    # extend() reparents the children from elem into the clone
    nelem.extend(elem)
    return nelem
||||||
|
def html5_parse(data):
    '''Parse *data* with html5lib and return an lxml root element with
    its namespace information reconstituted.

    html5lib's lxml treebuilder mangles namespaces: prefixed names come
    back with ':' replaced by the literal text 'U0003A', and namespace
    declarations survive only as ordinary attributes. This function
    repairs that damage so the resulting tree uses proper Clark-notation
    tags and attributes with XHTML as the default namespace, then drops
    namespace declarations that are never actually used.
    '''
    import html5lib
    data = html5lib.parse(data, treebuilder='lxml').getroot()

    # Set lang correctly: xml:lang arrives mangled as 'xmlU0003Alang'
    xl = data.attrib.pop('xmlU0003Alang', None)
    if xl is not None and 'lang' not in data.attrib:
        data.attrib['lang'] = xl

    # html5lib has the most inelegant handling of namespaces I have ever seen
    # Try to reconstitute destroyed namespace info
    xmlns_declaration = '{%s}'%XMLNS_NS
    non_html5_namespaces = {}
    seen_namespaces = set()
    for elem in tuple(data.iter()):
        elem.attrib.pop('xmlns', None)
        namespaces = {}
        for x in tuple(elem.attrib):
            if x.startswith('xmlnsU') or x.startswith(xmlns_declaration):
                # A namespace declaration
                val = elem.attrib.pop(x)
                if x.startswith('xmlnsU0003A'):
                    # len('xmlnsU0003A') == 11: the rest is the prefix
                    prefix = x[11:]
                    namespaces[prefix] = val

        if namespaces:
            # Some destroyed namespace declarations were found
            p = elem.getparent()
            if p is None:
                # We handle the root node later
                non_html5_namespaces = namespaces
            else:
                # Swap elem for a clone carrying the recovered nsmap,
                # keeping its position among its siblings
                idx = p.index(elem)
                p.remove(elem)
                elem = clone_element(elem, nsmap=namespaces)
                p.insert(idx, elem)

        # Un-mangle a prefixed tag name, e.g. 'svgU0003Apath'
        b = barename(elem.tag)
        idx = b.find('U0003A')
        if idx > -1:
            # len('U0003A') == 6: skip the mangled colon
            prefix, tag = b[:idx], b[idx+6:]
            ns = elem.nsmap.get(prefix, None)
            if ns is None:
                ns = non_html5_namespaces.get(prefix, None)
            if ns is not None:
                elem.tag = '{%s}%s'%(ns, tag)

        # Un-mangle prefixed attribute names the same way
        for b in tuple(elem.attrib):
            idx = b.find('U0003A')
            if idx > -1:
                prefix, tag = b[:idx], b[idx+6:]
                ns = elem.nsmap.get(prefix, None)
                if ns is None:
                    ns = non_html5_namespaces.get(prefix, None)
                if ns is not None:
                    elem.attrib['{%s}%s'%(ns, tag)] = elem.attrib.pop(b)

        seen_namespaces |= set(elem.nsmap.itervalues())

    # Rebuild the root nsmap: html5lib's standard namespaces (with XHTML
    # promoted to the default namespace) plus any recovered ones
    nsmap = dict(html5lib.constants.namespaces)
    nsmap[None] = nsmap.pop('html')
    non_html5_namespaces.update(nsmap)
    nsmap = non_html5_namespaces

    data = clone_element(data, nsmap=nsmap, in_context=False)

    # Remove unused namespace declarations
    fnsmap = {k:v for k, v in nsmap.iteritems() if v in seen_namespaces and
            v != XMLNS_NS}
    return clone_element(data, nsmap=fnsmap, in_context=False)
||||||
def _html4_parse(data, prefer_soup=False):
|
def _html4_parse(data, prefer_soup=False):
|
||||||
if prefer_soup:
|
if prefer_soup:
|
||||||
@ -177,7 +236,7 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
|
|||||||
except etree.XMLSyntaxError:
|
except etree.XMLSyntaxError:
|
||||||
log.debug('Parsing %s as HTML' % filename)
|
log.debug('Parsing %s as HTML' % filename)
|
||||||
try:
|
try:
|
||||||
data = _html5_parse(data)
|
data = html5_parse(data)
|
||||||
except:
|
except:
|
||||||
log.exception(
|
log.exception(
|
||||||
'HTML 5 parsing failed, falling back to older parsers')
|
'HTML 5 parsing failed, falling back to older parsers')
|
||||||
@ -261,6 +320,7 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
|
|||||||
nroot.append(elem)
|
nroot.append(elem)
|
||||||
data = nroot
|
data = nroot
|
||||||
|
|
||||||
|
|
||||||
data = merge_multiple_html_heads_and_bodies(data, log)
|
data = merge_multiple_html_heads_and_bodies(data, log)
|
||||||
# Ensure has a <head/>
|
# Ensure has a <head/>
|
||||||
head = xpath(data, '/h:html/h:head')
|
head = xpath(data, '/h:html/h:head')
|
||||||
|
Loading…
x
Reference in New Issue
Block a user